mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-19 10:26:43 +00:00
Add User-Agent parameter
This commit is contained in:
parent
6dbec8c789
commit
a71157b4d8
@ -17,6 +17,7 @@ var config struct {
|
|||||||
ChunkSize int64
|
ChunkSize int64
|
||||||
Retries int
|
Retries int
|
||||||
Workers int
|
Workers int
|
||||||
|
UserAgent string
|
||||||
Timeout time.Duration
|
Timeout time.Duration
|
||||||
Tasks int32
|
Tasks int32
|
||||||
CrawlStats time.Duration
|
CrawlStats time.Duration
|
||||||
@ -34,6 +35,7 @@ const (
|
|||||||
ConfTasks = "crawl.tasks"
|
ConfTasks = "crawl.tasks"
|
||||||
ConfRetries = "crawl.retries"
|
ConfRetries = "crawl.retries"
|
||||||
ConfWorkers = "crawl.connections"
|
ConfWorkers = "crawl.connections"
|
||||||
|
ConfUserAgent = "crawl.user-agent"
|
||||||
ConfTimeout = "crawl.timeout"
|
ConfTimeout = "crawl.timeout"
|
||||||
ConfCrawlStats = "output.crawl_stats"
|
ConfCrawlStats = "output.crawl_stats"
|
||||||
ConfAllocStats = "output.resource_stats"
|
ConfAllocStats = "output.resource_stats"
|
||||||
@ -45,6 +47,7 @@ func prepareConfig() {
|
|||||||
viper.SetDefault(ConfRetries, 5)
|
viper.SetDefault(ConfRetries, 5)
|
||||||
viper.SetDefault(ConfWorkers, 2)
|
viper.SetDefault(ConfWorkers, 2)
|
||||||
viper.SetDefault(ConfTasks, 3)
|
viper.SetDefault(ConfTasks, 3)
|
||||||
|
viper.SetDefault(ConfUserAgent, "")
|
||||||
viper.SetDefault(ConfTimeout, 10 * time.Second)
|
viper.SetDefault(ConfTimeout, 10 * time.Second)
|
||||||
viper.SetDefault(ConfCrawlStats, 3 * time.Second)
|
viper.SetDefault(ConfCrawlStats, 3 * time.Second)
|
||||||
viper.SetDefault(ConfAllocStats, 0)
|
viper.SetDefault(ConfAllocStats, 0)
|
||||||
@ -98,6 +101,8 @@ func readConfig() {
|
|||||||
configOOB(ConfTasks, int(config.Tasks))
|
configOOB(ConfTasks, int(config.Tasks))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
config.UserAgent = viper.GetString(ConfUserAgent)
|
||||||
|
|
||||||
config.Timeout = viper.GetDuration(ConfTimeout)
|
config.Timeout = viper.GetDuration(ConfTimeout)
|
||||||
|
|
||||||
config.CrawlStats = viper.GetDuration(ConfCrawlStats)
|
config.CrawlStats = viper.GetDuration(ConfCrawlStats)
|
||||||
|
@ -47,3 +47,6 @@ crawl:
|
|||||||
|
|
||||||
# Time before discarding a network request
|
# Time before discarding a network request
|
||||||
timeout: 10s
|
timeout: 10s
|
||||||
|
|
||||||
|
# Crawler User-Agent
|
||||||
|
user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"
|
||||||
|
6
crawl.go
6
crawl.go
@ -25,6 +25,9 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
|
|||||||
f.Name = path.Base(j.Uri.Path)
|
f.Name = path.Base(j.Uri.Path)
|
||||||
|
|
||||||
req := fasthttp.AcquireRequest()
|
req := fasthttp.AcquireRequest()
|
||||||
|
if config.UserAgent != "" {
|
||||||
|
req.Header.SetUserAgent(config.UserAgent)
|
||||||
|
}
|
||||||
req.SetRequestURI(j.UriStr)
|
req.SetRequestURI(j.UriStr)
|
||||||
|
|
||||||
res := fasthttp.AcquireResponse()
|
res := fasthttp.AcquireResponse()
|
||||||
@ -120,6 +123,9 @@ func GetFile(u fasturl.URL, f *File) (err error) {
|
|||||||
|
|
||||||
req := fasthttp.AcquireRequest()
|
req := fasthttp.AcquireRequest()
|
||||||
req.Header.SetMethod("HEAD")
|
req.Header.SetMethod("HEAD")
|
||||||
|
if config.UserAgent != "" {
|
||||||
|
req.Header.SetUserAgent(config.UserAgent)
|
||||||
|
}
|
||||||
req.SetRequestURI(u.String())
|
req.SetRequestURI(u.String())
|
||||||
|
|
||||||
res := fasthttp.AcquireResponse()
|
res := fasthttp.AcquireResponse()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user