Add User-Agent parameter

This commit is contained in:
Richard Patel 2018-11-18 14:24:04 +01:00
parent 6dbec8c789
commit a71157b4d8
No known key found for this signature in database
GPG Key ID: C268B2BBDA2ABECB
3 changed files with 14 additions and 0 deletions

View File

@@ -17,6 +17,7 @@ var config struct {
ChunkSize int64
Retries int
Workers int
UserAgent string
Timeout time.Duration
Tasks int32
CrawlStats time.Duration
@@ -34,6 +35,7 @@ const (
ConfTasks = "crawl.tasks"
ConfRetries = "crawl.retries"
ConfWorkers = "crawl.connections"
ConfUserAgent = "crawl.user-agent"
ConfTimeout = "crawl.timeout"
ConfCrawlStats = "output.crawl_stats"
ConfAllocStats = "output.resource_stats"
@@ -45,6 +47,7 @@ func prepareConfig() {
viper.SetDefault(ConfRetries, 5)
viper.SetDefault(ConfWorkers, 2)
viper.SetDefault(ConfTasks, 3)
viper.SetDefault(ConfUserAgent, "")
viper.SetDefault(ConfTimeout, 10 * time.Second)
viper.SetDefault(ConfCrawlStats, 3 * time.Second)
viper.SetDefault(ConfAllocStats, 0)
@@ -98,6 +101,8 @@ func readConfig() {
configOOB(ConfTasks, int(config.Tasks))
}
config.UserAgent = viper.GetString(ConfUserAgent)
config.Timeout = viper.GetDuration(ConfTimeout)
config.CrawlStats = viper.GetDuration(ConfCrawlStats)

View File

@@ -47,3 +47,6 @@ crawl:
# Time before discarding a network request
timeout: 10s
# User-Agent header sent with crawler requests (leave empty to keep the HTTP client's default)
user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"

View File

@@ -25,6 +25,9 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
f.Name = path.Base(j.Uri.Path)
req := fasthttp.AcquireRequest()
if config.UserAgent != "" {
req.Header.SetUserAgent(config.UserAgent)
}
req.SetRequestURI(j.UriStr)
res := fasthttp.AcquireResponse()
@@ -120,6 +123,9 @@ func GetFile(u fasturl.URL, f *File) (err error) {
req := fasthttp.AcquireRequest()
req.Header.SetMethod("HEAD")
if config.UserAgent != "" {
req.Header.SetUserAgent(config.UserAgent)
}
req.SetRequestURI(u.String())
res := fasthttp.AcquireResponse()