From a71157b4d855d00d59cd66c8bfbadd76ee1571f0 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Sun, 18 Nov 2018 14:24:04 +0100 Subject: [PATCH] Add User-Agent parameter --- config.go | 5 +++++ config.yml | 3 +++ crawl.go | 6 ++++++ 3 files changed, 14 insertions(+) diff --git a/config.go b/config.go index 379586f..e949a83 100644 --- a/config.go +++ b/config.go @@ -17,6 +17,7 @@ var config struct { ChunkSize int64 Retries int Workers int + UserAgent string Timeout time.Duration Tasks int32 CrawlStats time.Duration @@ -34,6 +35,7 @@ const ( ConfTasks = "crawl.tasks" ConfRetries = "crawl.retries" ConfWorkers = "crawl.connections" + ConfUserAgent = "crawl.user-agent" ConfTimeout = "crawl.timeout" ConfCrawlStats = "output.crawl_stats" ConfAllocStats = "output.resource_stats" @@ -45,6 +47,7 @@ func prepareConfig() { viper.SetDefault(ConfRetries, 5) viper.SetDefault(ConfWorkers, 2) viper.SetDefault(ConfTasks, 3) + viper.SetDefault(ConfUserAgent, "") viper.SetDefault(ConfTimeout, 10 * time.Second) viper.SetDefault(ConfCrawlStats, 3 * time.Second) viper.SetDefault(ConfAllocStats, 0) @@ -98,6 +101,8 @@ func readConfig() { configOOB(ConfTasks, int(config.Tasks)) } + config.UserAgent = viper.GetString(ConfUserAgent) + config.Timeout = viper.GetDuration(ConfTimeout) config.CrawlStats = viper.GetDuration(ConfCrawlStats) diff --git a/config.yml b/config.yml index 7b4385a..f59555e 100644 --- a/config.yml +++ b/config.yml @@ -47,3 +47,6 @@ crawl: # Time before discarding a network request timeout: 10s + + # Crawler User-Agent + user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0" diff --git a/crawl.go b/crawl.go index 8e2dad7..1bb4542 100644 --- a/crawl.go +++ b/crawl.go @@ -25,6 +25,9 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) { f.Name = path.Base(j.Uri.Path) req := fasthttp.AcquireRequest() + if config.UserAgent != "" { + req.Header.SetUserAgent(config.UserAgent) + } req.SetRequestURI(j.UriStr) res := fasthttp.AcquireResponse() @@ -120,6 +123,9 @@ func GetFile(u fasturl.URL, f *File) (err error) { req := fasthttp.AcquireRequest() req.Header.SetMethod("HEAD") + if config.UserAgent != "" { + req.Header.SetUserAgent(config.UserAgent) + } req.SetRequestURI(u.String()) res := fasthttp.AcquireResponse()