Mirror of https://github.com/terorie/od-database-crawler.git (synced 2025-04-19 10:26:43 +00:00)
Remove debug messages & don't use disk queue by default

Commit 24f0bd91f7 (parent 84c10e1981)
config.go (10 changed lines)

@@ -81,7 +81,7 @@ func prepareConfig() {
 	pf.Duration(ConfRecheck, 1*time.Second, "OD-DB: Poll interval for new jobs")
-	pf.Duration(ConfCooldown, 30*time.Second, "OD-DB: Time to wait after a server-side error")
+	pf.Duration(ConfCooldown, 1*time.Minute, "OD-DB: Time to wait after a server-side error")
 	pf.String(ConfChunkSize, "1 MB", "OD-DB: Result upload chunk size")

@@ -91,7 +91,7 @@ func prepareConfig() {
 	pf.Uint(ConfTasks, 25, "Crawler: Max concurrent tasks")
-	pf.Uint(ConfWorkers, 4, "Crawler: Connections per server")
+	pf.Uint(ConfWorkers, 1, "Crawler: Connections per server")
 	pf.Uint(ConfRetries, 5, "Crawler: Request retries")

@@ -101,11 +101,11 @@ func prepareConfig() {
 	pf.String(ConfUserAgent, "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0", "Crawler: User-Agent")
-	pf.Uint(ConfJobBufferSize, 5000, "Crawler: Task queue cache size")
+	pf.Int(ConfJobBufferSize, -1, "Crawler: Task queue cache size")
-	pf.Duration(ConfCrawlStats, time.Second, "Log: Crawl stats interval")
+	pf.Duration(ConfCrawlStats, 500*time.Second, "Log: Crawl stats interval")
-	pf.Duration(ConfAllocStats, 10*time.Second, "Log: Resource stats interval")
+	pf.Duration(ConfAllocStats, 500*time.Second, "Log: Resource stats interval")
 	pf.Bool(ConfVerbose, false, "Log: Print every listed dir")
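For readers unfamiliar with the pattern: each `pf.*` call above registers a flag with spf13/pflag, and the second argument is the default value that this commit is changing. A minimal standalone sketch of the same mechanism follows; the flag names and the `pf` flag set here are illustrative stand-ins, not the repo's actual `Conf*` constants.

package main

import (
	"fmt"
	"time"

	"github.com/spf13/pflag"
)

func main() {
	pf := pflag.NewFlagSet("od-database-crawler", pflag.ExitOnError)

	// Each call registers a flag and returns a pointer to its value;
	// the second argument is the default, as in the diff above.
	cooldown := pf.Duration("server.cooldown", 1*time.Minute,
		"OD-DB: Time to wait after a server-side error")
	workers := pf.Uint("crawl.connections", 1, "Crawler: Connections per server")
	jobBuffer := pf.Int("crawl.job_buffer", -1, "Crawler: Task queue cache size")

	pf.Parse(nil) // no command-line arguments, so the defaults stand
	fmt.Println(*cooldown, *workers, *jobBuffer) // 1m0s 1 -1
}

Note the type change from Uint to Int for the job buffer: the new sentinel default -1 ("keep everything in memory") needs a signed type.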
config.yml

@@ -29,10 +29,10 @@ server:
 # Log output settings
 output:
   # Crawl statistics
-  crawl_stats: 1s
+  crawl_stats: 1m

   # CPU/RAM/Job queue stats
-  resource_stats: 10s
+  resource_stats: 1m

   # More output? (Every listed dir)
   verbose: false

@@ -81,4 +81,4 @@ crawl:
   # in memory.
   # A negative value will cause all jobs
   # to be stored in memory. (Don't do this)
-  job_buffer: 5000
+  job_buffer: -1
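The comments preserved in this hunk spell out the contract: job_buffer caps how many queued jobs stay in memory before spilling to the disk queue, and a negative value keeps everything in memory, which is exactly what the new default -1 selects despite the "(Don't do this)" warning. A rough sketch of that contract is below; the repo's real BufferedQueue is not shown in this diff, so this is an assumption-laden illustration, not its implementation.

package main

import "fmt"

type Job struct{ UriStr string }

// bufferedQueue is a hypothetical stand-in for the crawler's BufferedQueue,
// capturing only the documented job_buffer contract.
type bufferedQueue struct {
	bufferSize int   // the job_buffer config value
	mem        []Job // in-memory buffer
	disk       []Job // stand-in for the serialized on-disk queue
}

func (q *bufferedQueue) Enqueue(j Job) {
	// Negative size: memory-only mode (the new default, -1).
	if q.bufferSize < 0 || len(q.mem) < q.bufferSize {
		q.mem = append(q.mem, j)
		return
	}
	q.disk = append(q.disk, j) // overflow would be serialized to disk
}

func main() {
	q := &bufferedQueue{bufferSize: -1}
	for i := 0; i < 10000; i++ {
		q.Enqueue(Job{UriStr: fmt.Sprintf("http://example.com/%d", i)})
	}
	fmt.Println(len(q.mem), len(q.disk)) // 10000 0: nothing touches disk
}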
worker.go (22 changed lines)

@@ -14,8 +14,8 @@ import (
 var globalWait sync.WaitGroup

 type WorkerContext struct {
 	OD            *OD
 	Queue         *BufferedQueue
 	lastRateLimit time.Time
 	numRateLimits int
 }
@@ -56,16 +56,16 @@ func (w *WorkerContext) step(results chan<- File, job Job) {

 	if !shouldRetry(err) {
 		atomic.AddUint64(&totalAborted, 1)
-		logrus.WithField("url", job.UriStr).
-			WithError(err).
-			Error("Giving up after failure")
+		//logrus.WithField("url", job.UriStr).
+		//	WithError(err).
+		//	Error("Giving up after failure")
 		return
 	}

 	if job.Fails > config.Retries {
 		atomic.AddUint64(&totalAborted, 1)
-		logrus.WithField("url", job.UriStr).
-			Errorf("Giving up after %d fails", job.Fails)
+		//logrus.WithField("url", job.UriStr).
+		//	Errorf("Giving up after %d fails", job.Fails)
 	} else {
 		atomic.AddUint64(&totalRetries, 1)
 		if err == ErrRateLimit {
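Commenting the calls out removes the noise but also loses the signal for anyone debugging a failed crawl. A gentler alternative, not what this commit does but a hypothetical sketch using logrus's level mechanism, would be demoting the messages to Debug so the existing verbose option could re-enable them:

package main

import (
	"errors"

	"github.com/sirupsen/logrus"
)

func main() {
	err := errors.New("connection reset by peer")

	// logrus defaults to InfoLevel, so a Debug-level message is dropped.
	logrus.WithField("url", "http://example.com/a/").
		WithError(err).
		Debug("Giving up after failure") // silent

	// Flipping the level (e.g. when verbose: true) brings it back.
	logrus.SetLevel(logrus.DebugLevel)
	logrus.WithField("url", "http://example.com/a/").
		WithError(err).
		Debug("Giving up after failure") // printed
}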
@@ -88,7 +88,9 @@ func (w *WorkerContext) step(results chan<- File, job Job) {
 }

 func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) {
-	if len(job.Uri.Path) == 0 { return }
+	if len(job.Uri.Path) == 0 {
+		return
+	}
 	if job.Uri.Path[len(job.Uri.Path)-1] == '/' {
 		// Load directory
 		links, err := GetDir(job, f)
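The guard being expanded here matters because the next line indexes job.Uri.Path[len(job.Uri.Path)-1], which would panic on an empty path; the edit itself only swaps the one-liner for gofmt-friendly braces. As an aside, the same directory-vs-file dispatch can be written without a manual length guard via strings.HasSuffix, which is safe on empty strings. This is a hypothetical simplification, not the repo's code:

package main

import (
	"fmt"
	"strings"
)

// classify mirrors DoJob's dispatch: a trailing slash means "list this
// directory", anything else means "record this file". HasSuffix returns
// false for "", so no explicit length check is required.
func classify(path string) string {
	if strings.HasSuffix(path, "/") {
		return "directory"
	}
	return "file"
}

func main() {
	fmt.Println(classify("/pub/linux/")) // directory
	fmt.Println(classify("/pub/a.iso"))  // file
	fmt.Println(classify(""))            // file, no panic
}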
@@ -159,10 +161,10 @@ func (w *WorkerContext) queueJob(job Job) {
 	w.OD.Wait.Add(1)

 	if w.numRateLimits > 0 {
-		if time.Since(w.lastRateLimit) > 5 * time.Second {
+		if time.Since(w.lastRateLimit) > 5*time.Second {
 			w.numRateLimits = 0
 		} else {
-			time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) *
+			time.Sleep(time.Duration(math.Sqrt(float64(50*w.numRateLimits))) *
 				100 * time.Millisecond)
 		}
 	}
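These two edits are gofmt spacing fixes only, but the sleep expression deserves a note: math.Sqrt(float64(50*n)) is truncated by the time.Duration conversion to a whole number before being scaled by 100ms, so the backoff grows with the square root of the rate-limit count. A quick check of the resulting delays:

package main

import (
	"fmt"
	"math"
	"time"
)

func main() {
	// Same expression as queueJob's sleep; the Duration conversion
	// truncates the square root to an integer before scaling.
	for n := 1; n <= 5; n++ {
		d := time.Duration(math.Sqrt(float64(50*n))) * 100 * time.Millisecond
		fmt.Printf("numRateLimits=%d -> sleep %v\n", n, d)
	}
	// Prints: 700ms, 1s, 1.2s, 1.4s, 1.5s
}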