diff --git a/config.go b/config.go index f58d9a3..2616672 100644 --- a/config.go +++ b/config.go @@ -20,7 +20,6 @@ var config struct { Retries int Workers int UserAgent string - Timeout time.Duration Tasks int32 CrawlStats time.Duration AllocStats time.Duration @@ -38,6 +37,7 @@ const ( ConfRetries = "crawl.retries" ConfWorkers = "crawl.connections" ConfUserAgent = "crawl.user-agent" + ConfDialTimeout = "crawl.dial_timeout" ConfTimeout = "crawl.timeout" ConfCrawlStats = "output.crawl_stats" ConfAllocStats = "output.resource_stats" @@ -51,7 +51,8 @@ func prepareConfig() { viper.SetDefault(ConfWorkers, 2) viper.SetDefault(ConfTasks, 3) viper.SetDefault(ConfUserAgent, "") - viper.SetDefault(ConfTimeout, 10 * time.Second) + viper.SetDefault(ConfDialTimeout, 10 * time.Second) + viper.SetDefault(ConfTimeout, 30 * time.Second) viper.SetDefault(ConfCrawlStats, 3 * time.Second) viper.SetDefault(ConfAllocStats, 0) viper.SetDefault(ConfVerbose, false) @@ -107,7 +108,9 @@ func readConfig() { config.UserAgent = viper.GetString(ConfUserAgent) - config.Timeout = viper.GetDuration(ConfTimeout) + setDialTimeout(viper.GetDuration(ConfDialTimeout)) + + setTimeout(viper.GetDuration(ConfTimeout)) config.CrawlStats = viper.GetDuration(ConfCrawlStats) diff --git a/config.yml b/config.yml index bb9aafe..10f7b0b 100644 --- a/config.yml +++ b/config.yml @@ -52,9 +52,17 @@ crawl: # from the site before giving up retries: 5 + # Time before discarding a failed connection attempt + dial_timeout: 10s + # Time before discarding a network request - timeout: 10s + timeout: 30s # Crawler User-Agent # If empty, no User-Agent header is sent. user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0" + + # Job buffer size (per task) + # Higher values cause fewer disk writes + # but require more memory. 
+ job_buffer: 5000 diff --git a/crawl.go b/crawl.go index 19ce529..fa599a1 100644 --- a/crawl.go +++ b/crawl.go @@ -8,6 +8,7 @@ import ( "github.com/valyala/fasthttp" "golang.org/x/crypto/blake2b" "golang.org/x/net/html" + "net" "path" "strconv" "strings" @@ -20,6 +21,17 @@ var client = fasthttp.Client { }, } +func setDialTimeout(d time.Duration) { + client.Dial = func(addr string) (net.Conn, error) { + return fasthttp.DialTimeout(addr, d) + } +} + +func setTimeout(d time.Duration) { + client.ReadTimeout = d + client.WriteTimeout = d / 2 +} + func GetDir(j *Job, f *File) (links []fasturl.URL, err error) { f.IsDir = true f.Name = path.Base(j.Uri.Path) @@ -33,7 +45,7 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) { res := fasthttp.AcquireResponse() defer fasthttp.ReleaseResponse(res) - err = client.DoTimeout(req, res, config.Timeout) + err = client.Do(req, res) fasthttp.ReleaseRequest(req) if err != nil { @@ -132,7 +144,7 @@ func GetFile(u fasturl.URL, f *File) (err error) { res.SkipBody = true defer fasthttp.ReleaseResponse(res) - err = client.DoTimeout(req, res, config.Timeout) + err = client.Do(req, res) fasthttp.ReleaseRequest(req) if err != nil {