Add TCP timeout option

This commit is contained in:
Richard Patel 2018-11-20 03:29:10 +01:00
parent b846498030
commit 86ec78cae1
No known key found for this signature in database
GPG Key ID: C268B2BBDA2ABECB
3 changed files with 29 additions and 6 deletions

View File

@ -20,7 +20,6 @@ var config struct {
Retries int Retries int
Workers int Workers int
UserAgent string UserAgent string
Timeout time.Duration
Tasks int32 Tasks int32
CrawlStats time.Duration CrawlStats time.Duration
AllocStats time.Duration AllocStats time.Duration
@ -38,6 +37,7 @@ const (
ConfRetries = "crawl.retries" ConfRetries = "crawl.retries"
ConfWorkers = "crawl.connections" ConfWorkers = "crawl.connections"
ConfUserAgent = "crawl.user-agent" ConfUserAgent = "crawl.user-agent"
ConfDialTimeout = "crawl.dial_timeout"
ConfTimeout = "crawl.timeout" ConfTimeout = "crawl.timeout"
ConfCrawlStats = "output.crawl_stats" ConfCrawlStats = "output.crawl_stats"
ConfAllocStats = "output.resource_stats" ConfAllocStats = "output.resource_stats"
@ -51,7 +51,8 @@ func prepareConfig() {
viper.SetDefault(ConfWorkers, 2) viper.SetDefault(ConfWorkers, 2)
viper.SetDefault(ConfTasks, 3) viper.SetDefault(ConfTasks, 3)
viper.SetDefault(ConfUserAgent, "") viper.SetDefault(ConfUserAgent, "")
viper.SetDefault(ConfTimeout, 10 * time.Second) viper.SetDefault(ConfDialTimeout, 10 * time.Second)
viper.SetDefault(ConfTimeout, 30 * time.Second)
viper.SetDefault(ConfCrawlStats, 3 * time.Second) viper.SetDefault(ConfCrawlStats, 3 * time.Second)
viper.SetDefault(ConfAllocStats, 0) viper.SetDefault(ConfAllocStats, 0)
viper.SetDefault(ConfVerbose, false) viper.SetDefault(ConfVerbose, false)
@ -107,7 +108,9 @@ func readConfig() {
config.UserAgent = viper.GetString(ConfUserAgent) config.UserAgent = viper.GetString(ConfUserAgent)
config.Timeout = viper.GetDuration(ConfTimeout) setDialTimeout(viper.GetDuration(ConfDialTimeout))
setTimeout(viper.GetDuration(ConfTimeout))
config.CrawlStats = viper.GetDuration(ConfCrawlStats) config.CrawlStats = viper.GetDuration(ConfCrawlStats)

View File

@ -52,9 +52,17 @@ crawl:
# from the site before giving up # from the site before giving up
retries: 5 retries: 5
# Time to wait for a connection attempt before giving up
dial_timeout: 10s
# Time before discarding a network request # Time before discarding a network request
timeout: 10s timeout: 30s
# Crawler User-Agent # Crawler User-Agent
# If empty, no User-Agent header is sent. # If empty, no User-Agent header is sent.
user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0" user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"
# Job buffer size (per task)
# Higher values cause fewer disk writes
# but require more memory.
job_buffer: 5000

View File

@ -8,6 +8,7 @@ import (
"github.com/valyala/fasthttp" "github.com/valyala/fasthttp"
"golang.org/x/crypto/blake2b" "golang.org/x/crypto/blake2b"
"golang.org/x/net/html" "golang.org/x/net/html"
"net"
"path" "path"
"strconv" "strconv"
"strings" "strings"
@ -20,6 +21,17 @@ var client = fasthttp.Client {
}, },
} }
func setDialTimeout(d time.Duration) {
client.Dial = func(addr string) (net.Conn, error) {
return fasthttp.DialTimeout(addr, d)
}
}
// setTimeout configures the I/O timeouts of the package-level client:
// the read timeout is set to d and the write timeout to half of d.
func setTimeout(d time.Duration) {
	readLimit, writeLimit := d, d/2
	client.ReadTimeout = readLimit
	client.WriteTimeout = writeLimit
}
func GetDir(j *Job, f *File) (links []fasturl.URL, err error) { func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
f.IsDir = true f.IsDir = true
f.Name = path.Base(j.Uri.Path) f.Name = path.Base(j.Uri.Path)
@ -33,7 +45,7 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
res := fasthttp.AcquireResponse() res := fasthttp.AcquireResponse()
defer fasthttp.ReleaseResponse(res) defer fasthttp.ReleaseResponse(res)
err = client.DoTimeout(req, res, config.Timeout) err = client.Do(req, res)
fasthttp.ReleaseRequest(req) fasthttp.ReleaseRequest(req)
if err != nil { if err != nil {
@ -132,7 +144,7 @@ func GetFile(u fasturl.URL, f *File) (err error) {
res.SkipBody = true res.SkipBody = true
defer fasthttp.ReleaseResponse(res) defer fasthttp.ReleaseResponse(res)
err = client.DoTimeout(req, res, config.Timeout) err = client.Do(req, res)
fasthttp.ReleaseRequest(req) fasthttp.ReleaseRequest(req)
if err != nil { if err != nil {