Remove debug messages & don't use disk queue by default

2025-04-24 12:55:51 +00:00 · 2019-04-06 12:11:42 -04:00 · 2019-04-06 12:11:42 -04:00 · 24f0bd91f7
commit 24f0bd91f7
parent 84c10e1981
3 changed files with 20 additions and 18 deletions
--- a/config.go
+++ b/config.go
@@ -81,7 +81,7 @@ func prepareConfig() {

 	pf.Duration(ConfRecheck, 1*time.Second, "OD-DB: Poll interval for new jobs")

-	pf.Duration(ConfCooldown, 30*time.Second, "OD-DB: Time to wait after a server-side error")
+	pf.Duration(ConfCooldown, 1*time.Minute, "OD-DB: Time to wait after a server-side error")

 	pf.String(ConfChunkSize, "1 MB", "OD-DB: Result upload chunk size")

@@ -91,7 +91,7 @@ func prepareConfig() {

 	pf.Uint(ConfTasks, 25, "Crawler: Max concurrent tasks")

-	pf.Uint(ConfWorkers, 4, "Crawler: Connections per server")
+	pf.Uint(ConfWorkers, 1, "Crawler: Connections per server")

 	pf.Uint(ConfRetries, 5, "Crawler: Request retries")

@@ -101,11 +101,11 @@ func prepareConfig() {

 	pf.String(ConfUserAgent, "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0", "Crawler: User-Agent")

-	pf.Uint(ConfJobBufferSize, 5000, "Crawler: Task queue cache size")
+	pf.Int(ConfJobBufferSize, -1, "Crawler: Task queue cache size")

-	pf.Duration(ConfCrawlStats, time.Second, "Log: Crawl stats interval")
+	pf.Duration(ConfCrawlStats, 500*time.Second, "Log: Crawl stats interval")

-	pf.Duration(ConfAllocStats, 10*time.Second, "Log: Resource stats interval")
+	pf.Duration(ConfAllocStats, 500*time.Second, "Log: Resource stats interval")

 	pf.Bool(ConfVerbose, false, "Log: Print every listed dir")

--- a/config.yml
+++ b/config.yml
@@ -29,10 +29,10 @@ server:
 # Log output settings
 output:
  # Crawl statistics
-  crawl_stats: 1s
+  crawl_stats: 1m

  # CPU/RAM/Job queue stats
-  resource_stats: 10s
+  resource_stats: 1m

  # More output? (Every listed dir)
  verbose: false
@@ -81,4 +81,4 @@ crawl:
  # in memory.
  # A negative value will cause all jobs
  # to be stored in memory. (Don't do this)
-  job_buffer: 5000
+  job_buffer: -1
--- a/worker.go
+++ b/worker.go
@@ -14,8 +14,8 @@ import (
 var globalWait sync.WaitGroup

 type WorkerContext struct {
-	OD *OD
-	Queue *BufferedQueue
+	OD            *OD
+	Queue         *BufferedQueue
 	lastRateLimit time.Time
 	numRateLimits int
 }
@@ -56,16 +56,16 @@ func (w *WorkerContext) step(results chan<- File, job Job) {

 		if !shouldRetry(err) {
 			atomic.AddUint64(&totalAborted, 1)
-			logrus.WithField("url", job.UriStr).
-				WithError(err).
-				Error("Giving up after failure")
+			//logrus.WithField("url", job.UriStr).
+			//	WithError(err).
+			//	Error("Giving up after failure")
 			return
 		}

 		if job.Fails > config.Retries {
 			atomic.AddUint64(&totalAborted, 1)
-			logrus.WithField("url", job.UriStr).
-				Errorf("Giving up after %d fails", job.Fails)
+			//logrus.WithField("url", job.UriStr).
+			//	Errorf("Giving up after %d fails", job.Fails)
 		} else {
 			atomic.AddUint64(&totalRetries, 1)
 			if err == ErrRateLimit {
@@ -88,7 +88,9 @@ func (w *WorkerContext) step(results chan<- File, job Job) {
 }

 func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) {
-	if len(job.Uri.Path) == 0 { return }
+	if len(job.Uri.Path) == 0 {
+		return
+	}
 	if job.Uri.Path[len(job.Uri.Path)-1] == '/' {
 		// Load directory
 		links, err := GetDir(job, f)
@@ -159,10 +161,10 @@ func (w *WorkerContext) queueJob(job Job) {
 	w.OD.Wait.Add(1)

 	if w.numRateLimits > 0 {
-		if time.Since(w.lastRateLimit) > 5 * time.Second {
+		if time.Since(w.lastRateLimit) > 5*time.Second {
 			w.numRateLimits = 0
 		} else {
-			time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) *
+			time.Sleep(time.Duration(math.Sqrt(float64(50*w.numRateLimits))) *
 				100 * time.Millisecond)
 		}
 	}