mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-16 00:46:43 +00:00
Add User-Agent parameter
This commit is contained in:
parent
6dbec8c789
commit
a71157b4d8
@ -17,6 +17,7 @@ var config struct {
|
||||
ChunkSize int64
|
||||
Retries int
|
||||
Workers int
|
||||
UserAgent string
|
||||
Timeout time.Duration
|
||||
Tasks int32
|
||||
CrawlStats time.Duration
|
||||
@ -34,6 +35,7 @@ const (
|
||||
ConfTasks = "crawl.tasks"
|
||||
ConfRetries = "crawl.retries"
|
||||
ConfWorkers = "crawl.connections"
|
||||
ConfUserAgent = "crawl.user-agent"
|
||||
ConfTimeout = "crawl.timeout"
|
||||
ConfCrawlStats = "output.crawl_stats"
|
||||
ConfAllocStats = "output.resource_stats"
|
||||
@ -45,6 +47,7 @@ func prepareConfig() {
|
||||
viper.SetDefault(ConfRetries, 5)
|
||||
viper.SetDefault(ConfWorkers, 2)
|
||||
viper.SetDefault(ConfTasks, 3)
|
||||
viper.SetDefault(ConfUserAgent, "")
|
||||
viper.SetDefault(ConfTimeout, 10 * time.Second)
|
||||
viper.SetDefault(ConfCrawlStats, 3 * time.Second)
|
||||
viper.SetDefault(ConfAllocStats, 0)
|
||||
@ -98,6 +101,8 @@ func readConfig() {
|
||||
configOOB(ConfTasks, int(config.Tasks))
|
||||
}
|
||||
|
||||
config.UserAgent = viper.GetString(ConfUserAgent)
|
||||
|
||||
config.Timeout = viper.GetDuration(ConfTimeout)
|
||||
|
||||
config.CrawlStats = viper.GetDuration(ConfCrawlStats)
|
||||
|
@ -47,3 +47,6 @@ crawl:
|
||||
|
||||
# Time before discarding a network request
|
||||
timeout: 10s
|
||||
|
||||
# User-Agent header the crawler sends with its requests; if left empty, no custom User-Agent is set
|
||||
user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"
|
||||
|
6
crawl.go
6
crawl.go
@ -25,6 +25,9 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
|
||||
f.Name = path.Base(j.Uri.Path)
|
||||
|
||||
req := fasthttp.AcquireRequest()
|
||||
if config.UserAgent != "" {
|
||||
req.Header.SetUserAgent(config.UserAgent)
|
||||
}
|
||||
req.SetRequestURI(j.UriStr)
|
||||
|
||||
res := fasthttp.AcquireResponse()
|
||||
@ -120,6 +123,9 @@ func GetFile(u fasturl.URL, f *File) (err error) {
|
||||
|
||||
req := fasthttp.AcquireRequest()
|
||||
req.Header.SetMethod("HEAD")
|
||||
if config.UserAgent != "" {
|
||||
req.Header.SetUserAgent(config.UserAgent)
|
||||
}
|
||||
req.SetRequestURI(u.String())
|
||||
|
||||
res := fasthttp.AcquireResponse()
|
||||
|
Loading…
x
Reference in New Issue
Block a user