From 24f0bd91f72c08810946a76b4817b85271ec98e1 Mon Sep 17 00:00:00 2001 From: simon987 Date: Sat, 6 Apr 2019 12:11:42 -0400 Subject: [PATCH] Remove debug messages & don't use disk queue by default --- config.go | 10 +++++----- config.yml | 6 +++--- worker.go | 22 ++++++++++++---------- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/config.go b/config.go index 28a122a..8e9db17 100644 --- a/config.go +++ b/config.go @@ -81,7 +81,7 @@ func prepareConfig() { pf.Duration(ConfRecheck, 1*time.Second, "OD-DB: Poll interval for new jobs") - pf.Duration(ConfCooldown, 30*time.Second, "OD-DB: Time to wait after a server-side error") + pf.Duration(ConfCooldown, 1*time.Minute, "OD-DB: Time to wait after a server-side error") pf.String(ConfChunkSize, "1 MB", "OD-DB: Result upload chunk size") @@ -91,7 +91,7 @@ func prepareConfig() { pf.Uint(ConfTasks, 25, "Crawler: Max concurrent tasks") - pf.Uint(ConfWorkers, 4, "Crawler: Connections per server") + pf.Uint(ConfWorkers, 1, "Crawler: Connections per server") pf.Uint(ConfRetries, 5, "Crawler: Request retries") @@ -101,11 +101,11 @@ func prepareConfig() { pf.String(ConfUserAgent, "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0", "Crawler: User-Agent") - pf.Uint(ConfJobBufferSize, 5000, "Crawler: Task queue cache size") + pf.Int(ConfJobBufferSize, -1, "Crawler: Task queue cache size") - pf.Duration(ConfCrawlStats, time.Second, "Log: Crawl stats interval") + pf.Duration(ConfCrawlStats, 500*time.Second, "Log: Crawl stats interval") - pf.Duration(ConfAllocStats, 10*time.Second, "Log: Resource stats interval") + pf.Duration(ConfAllocStats, 500*time.Second, "Log: Resource stats interval") pf.Bool(ConfVerbose, false, "Log: Print every listed dir") diff --git a/config.yml b/config.yml index 2aa8aa2..fd42948 100644 --- a/config.yml +++ b/config.yml @@ -29,10 +29,10 @@ server: # Log output settings output: # Crawl statistics - crawl_stats: 1s + crawl_stats: 1m # CPU/RAM/Job queue stats - resource_stats: 10s + resource_stats: 1m # More output? (Every listed dir) verbose: false @@ -81,4 +81,4 @@ crawl: # in memory. # A negative value will cause all jobs # to be stored in memory. (Don't do this) - job_buffer: 5000 + job_buffer: -1 diff --git a/worker.go b/worker.go index 118e28f..4f8ae6d 100644 --- a/worker.go +++ b/worker.go @@ -14,8 +14,8 @@ import ( var globalWait sync.WaitGroup type WorkerContext struct { - OD *OD - Queue *BufferedQueue + OD *OD + Queue *BufferedQueue lastRateLimit time.Time numRateLimits int } @@ -56,16 +56,16 @@ func (w *WorkerContext) step(results chan<- File, job Job) { if !shouldRetry(err) { atomic.AddUint64(&totalAborted, 1) - logrus.WithField("url", job.UriStr). - WithError(err). - Error("Giving up after failure") + //logrus.WithField("url", job.UriStr). + // WithError(err). + // Error("Giving up after failure") return } if job.Fails > config.Retries { atomic.AddUint64(&totalAborted, 1) - logrus.WithField("url", job.UriStr). - Errorf("Giving up after %d fails", job.Fails) + //logrus.WithField("url", job.UriStr). + // Errorf("Giving up after %d fails", job.Fails) } else { atomic.AddUint64(&totalRetries, 1) if err == ErrRateLimit { @@ -88,7 +88,9 @@ func (w *WorkerContext) step(results chan<- File, job Job) { } func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) { - if len(job.Uri.Path) == 0 { return } + if len(job.Uri.Path) == 0 { + return + } if job.Uri.Path[len(job.Uri.Path)-1] == '/' { // Load directory links, err := GetDir(job, f) @@ -159,10 +161,10 @@ func (w *WorkerContext) queueJob(job Job) { w.OD.Wait.Add(1) if w.numRateLimits > 0 { - if time.Since(w.lastRateLimit) > 5 * time.Second { + if time.Since(w.lastRateLimit) > 5*time.Second { w.numRateLimits = 0 } else { - time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) * + time.Sleep(time.Duration(math.Sqrt(float64(50*w.numRateLimits))) * 100 * time.Millisecond) } }