mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-10 05:56:42 +00:00
Add TCP timeout option
This commit is contained in:
parent
b846498030
commit
86ec78cae1
@ -20,7 +20,6 @@ var config struct {
|
||||
Retries int
|
||||
Workers int
|
||||
UserAgent string
|
||||
Timeout time.Duration
|
||||
Tasks int32
|
||||
CrawlStats time.Duration
|
||||
AllocStats time.Duration
|
||||
@ -38,6 +37,7 @@ const (
|
||||
ConfRetries = "crawl.retries"
|
||||
ConfWorkers = "crawl.connections"
|
||||
ConfUserAgent = "crawl.user-agent"
|
||||
ConfDialTimeout = "crawl.dial_timeout"
|
||||
ConfTimeout = "crawl.timeout"
|
||||
ConfCrawlStats = "output.crawl_stats"
|
||||
ConfAllocStats = "output.resource_stats"
|
||||
@ -51,7 +51,8 @@ func prepareConfig() {
|
||||
viper.SetDefault(ConfWorkers, 2)
|
||||
viper.SetDefault(ConfTasks, 3)
|
||||
viper.SetDefault(ConfUserAgent, "")
|
||||
viper.SetDefault(ConfTimeout, 10 * time.Second)
|
||||
viper.SetDefault(ConfDialTimeout, 10 * time.Second)
|
||||
viper.SetDefault(ConfTimeout, 30 * time.Second)
|
||||
viper.SetDefault(ConfCrawlStats, 3 * time.Second)
|
||||
viper.SetDefault(ConfAllocStats, 0)
|
||||
viper.SetDefault(ConfVerbose, false)
|
||||
@ -107,7 +108,9 @@ func readConfig() {
|
||||
|
||||
config.UserAgent = viper.GetString(ConfUserAgent)
|
||||
|
||||
config.Timeout = viper.GetDuration(ConfTimeout)
|
||||
setDialTimeout(viper.GetDuration(ConfDialTimeout))
|
||||
|
||||
setTimeout(viper.GetDuration(ConfTimeout))
|
||||
|
||||
config.CrawlStats = viper.GetDuration(ConfCrawlStats)
|
||||
|
||||
|
10
config.yml
10
config.yml
@ -52,9 +52,17 @@ crawl:
|
||||
# from the site before giving up
|
||||
retries: 5
|
||||
|
||||
# Time before discarding a failed connection attempt
|
||||
dial_timeout: 10s
|
||||
|
||||
# Time before discarding a network request
|
||||
timeout: 10s
|
||||
timeout: 30s
|
||||
|
||||
# Crawler User-Agent
|
||||
# If empty, no User-Agent header is sent.
|
||||
user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"
|
||||
|
||||
# Job buffer size (per task)
|
||||
# Higher values cause fewer disk writes
|
||||
# but require more memory.
|
||||
job_buffer: 5000
|
||||
|
16
crawl.go
16
crawl.go
@ -8,6 +8,7 @@ import (
|
||||
"github.com/valyala/fasthttp"
|
||||
"golang.org/x/crypto/blake2b"
|
||||
"golang.org/x/net/html"
|
||||
"net"
|
||||
"path"
|
||||
"strconv"
|
||||
"strings"
|
||||
@ -20,6 +21,17 @@ var client = fasthttp.Client {
|
||||
},
|
||||
}
|
||||
|
||||
func setDialTimeout(d time.Duration) {
|
||||
client.Dial = func(addr string) (net.Conn, error) {
|
||||
return fasthttp.DialTimeout(addr, d)
|
||||
}
|
||||
}
|
||||
|
||||
func setTimeout(d time.Duration) {
|
||||
client.ReadTimeout = d
|
||||
client.WriteTimeout = d / 2
|
||||
}
|
||||
|
||||
func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
|
||||
f.IsDir = true
|
||||
f.Name = path.Base(j.Uri.Path)
|
||||
@ -33,7 +45,7 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
|
||||
res := fasthttp.AcquireResponse()
|
||||
defer fasthttp.ReleaseResponse(res)
|
||||
|
||||
err = client.DoTimeout(req, res, config.Timeout)
|
||||
err = client.Do(req, res)
|
||||
fasthttp.ReleaseRequest(req)
|
||||
|
||||
if err != nil {
|
||||
@ -132,7 +144,7 @@ func GetFile(u fasturl.URL, f *File) (err error) {
|
||||
res.SkipBody = true
|
||||
defer fasthttp.ReleaseResponse(res)
|
||||
|
||||
err = client.DoTimeout(req, res, config.Timeout)
|
||||
err = client.Do(req, res)
|
||||
fasthttp.ReleaseRequest(req)
|
||||
|
||||
if err != nil {
|
||||
|
Loading…
x
Reference in New Issue
Block a user