mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-18 01:46:43 +00:00
Add TCP timeout option
This commit is contained in:
parent
b846498030
commit
86ec78cae1
@ -20,7 +20,6 @@ var config struct {
|
|||||||
Retries int
|
Retries int
|
||||||
Workers int
|
Workers int
|
||||||
UserAgent string
|
UserAgent string
|
||||||
Timeout time.Duration
|
|
||||||
Tasks int32
|
Tasks int32
|
||||||
CrawlStats time.Duration
|
CrawlStats time.Duration
|
||||||
AllocStats time.Duration
|
AllocStats time.Duration
|
||||||
@ -38,6 +37,7 @@ const (
|
|||||||
ConfRetries = "crawl.retries"
|
ConfRetries = "crawl.retries"
|
||||||
ConfWorkers = "crawl.connections"
|
ConfWorkers = "crawl.connections"
|
||||||
ConfUserAgent = "crawl.user-agent"
|
ConfUserAgent = "crawl.user-agent"
|
||||||
|
ConfDialTimeout = "crawl.dial_timeout"
|
||||||
ConfTimeout = "crawl.timeout"
|
ConfTimeout = "crawl.timeout"
|
||||||
ConfCrawlStats = "output.crawl_stats"
|
ConfCrawlStats = "output.crawl_stats"
|
||||||
ConfAllocStats = "output.resource_stats"
|
ConfAllocStats = "output.resource_stats"
|
||||||
@ -51,7 +51,8 @@ func prepareConfig() {
|
|||||||
viper.SetDefault(ConfWorkers, 2)
|
viper.SetDefault(ConfWorkers, 2)
|
||||||
viper.SetDefault(ConfTasks, 3)
|
viper.SetDefault(ConfTasks, 3)
|
||||||
viper.SetDefault(ConfUserAgent, "")
|
viper.SetDefault(ConfUserAgent, "")
|
||||||
viper.SetDefault(ConfTimeout, 10 * time.Second)
|
viper.SetDefault(ConfDialTimeout, 10 * time.Second)
|
||||||
|
viper.SetDefault(ConfTimeout, 30 * time.Second)
|
||||||
viper.SetDefault(ConfCrawlStats, 3 * time.Second)
|
viper.SetDefault(ConfCrawlStats, 3 * time.Second)
|
||||||
viper.SetDefault(ConfAllocStats, 0)
|
viper.SetDefault(ConfAllocStats, 0)
|
||||||
viper.SetDefault(ConfVerbose, false)
|
viper.SetDefault(ConfVerbose, false)
|
||||||
@ -107,7 +108,9 @@ func readConfig() {
|
|||||||
|
|
||||||
config.UserAgent = viper.GetString(ConfUserAgent)
|
config.UserAgent = viper.GetString(ConfUserAgent)
|
||||||
|
|
||||||
config.Timeout = viper.GetDuration(ConfTimeout)
|
setDialTimeout(viper.GetDuration(ConfDialTimeout))
|
||||||
|
|
||||||
|
setTimeout(viper.GetDuration(ConfTimeout))
|
||||||
|
|
||||||
config.CrawlStats = viper.GetDuration(ConfCrawlStats)
|
config.CrawlStats = viper.GetDuration(ConfCrawlStats)
|
||||||
|
|
||||||
|
10
config.yml
10
config.yml
@ -52,9 +52,17 @@ crawl:
|
|||||||
# from the site before giving up
|
# from the site before giving up
|
||||||
retries: 5
|
retries: 5
|
||||||
|
|
||||||
|
# Time before discarding a failed connection attempt
|
||||||
|
dial_timeout: 10s
|
||||||
|
|
||||||
# Time before discarding a network request
|
# Time before discarding a network request
|
||||||
timeout: 10s
|
timeout: 30s
|
||||||
|
|
||||||
# Crawler User-Agent
|
# Crawler User-Agent
|
||||||
# If empty, no User-Agent header is sent.
|
# If empty, no User-Agent header is sent.
|
||||||
user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"
|
user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"
|
||||||
|
|
||||||
|
# Job buffer size (per task)
|
||||||
|
# Higher values cause fewer disk writes
|
||||||
|
# but require more memory.
|
||||||
|
job_buffer: 5000
|
||||||
|
16
crawl.go
16
crawl.go
@ -8,6 +8,7 @@ import (
|
|||||||
"github.com/valyala/fasthttp"
|
"github.com/valyala/fasthttp"
|
||||||
"golang.org/x/crypto/blake2b"
|
"golang.org/x/crypto/blake2b"
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
|
"net"
|
||||||
"path"
|
"path"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
@ -20,6 +21,17 @@ var client = fasthttp.Client {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func setDialTimeout(d time.Duration) {
|
||||||
|
client.Dial = func(addr string) (net.Conn, error) {
|
||||||
|
return fasthttp.DialTimeout(addr, d)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func setTimeout(d time.Duration) {
|
||||||
|
client.ReadTimeout = d
|
||||||
|
client.WriteTimeout = d / 2
|
||||||
|
}
|
||||||
|
|
||||||
func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
|
func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
|
||||||
f.IsDir = true
|
f.IsDir = true
|
||||||
f.Name = path.Base(j.Uri.Path)
|
f.Name = path.Base(j.Uri.Path)
|
||||||
@ -33,7 +45,7 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
|
|||||||
res := fasthttp.AcquireResponse()
|
res := fasthttp.AcquireResponse()
|
||||||
defer fasthttp.ReleaseResponse(res)
|
defer fasthttp.ReleaseResponse(res)
|
||||||
|
|
||||||
err = client.DoTimeout(req, res, config.Timeout)
|
err = client.Do(req, res)
|
||||||
fasthttp.ReleaseRequest(req)
|
fasthttp.ReleaseRequest(req)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -132,7 +144,7 @@ func GetFile(u fasturl.URL, f *File) (err error) {
|
|||||||
res.SkipBody = true
|
res.SkipBody = true
|
||||||
defer fasthttp.ReleaseResponse(res)
|
defer fasthttp.ReleaseResponse(res)
|
||||||
|
|
||||||
err = client.DoTimeout(req, res, config.Timeout)
|
err = client.Do(req, res)
|
||||||
fasthttp.ReleaseRequest(req)
|
fasthttp.ReleaseRequest(req)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user