Remove debug messages & don't use disk queue by default

This commit is contained in:
simon987 2019-04-06 12:11:42 -04:00
parent 84c10e1981
commit 24f0bd91f7
3 changed files with 20 additions and 18 deletions

View File

@ -81,7 +81,7 @@ func prepareConfig() {
pf.Duration(ConfRecheck, 1*time.Second, "OD-DB: Poll interval for new jobs") pf.Duration(ConfRecheck, 1*time.Second, "OD-DB: Poll interval for new jobs")
pf.Duration(ConfCooldown, 30*time.Second, "OD-DB: Time to wait after a server-side error") pf.Duration(ConfCooldown, 1*time.Minute, "OD-DB: Time to wait after a server-side error")
pf.String(ConfChunkSize, "1 MB", "OD-DB: Result upload chunk size") pf.String(ConfChunkSize, "1 MB", "OD-DB: Result upload chunk size")
@ -91,7 +91,7 @@ func prepareConfig() {
pf.Uint(ConfTasks, 25, "Crawler: Max concurrent tasks") pf.Uint(ConfTasks, 25, "Crawler: Max concurrent tasks")
pf.Uint(ConfWorkers, 4, "Crawler: Connections per server") pf.Uint(ConfWorkers, 1, "Crawler: Connections per server")
pf.Uint(ConfRetries, 5, "Crawler: Request retries") pf.Uint(ConfRetries, 5, "Crawler: Request retries")
@ -101,11 +101,11 @@ func prepareConfig() {
pf.String(ConfUserAgent, "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0", "Crawler: User-Agent") pf.String(ConfUserAgent, "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0", "Crawler: User-Agent")
pf.Uint(ConfJobBufferSize, 5000, "Crawler: Task queue cache size") pf.Int(ConfJobBufferSize, -1, "Crawler: Task queue cache size")
pf.Duration(ConfCrawlStats, time.Second, "Log: Crawl stats interval") pf.Duration(ConfCrawlStats, 500*time.Second, "Log: Crawl stats interval")
pf.Duration(ConfAllocStats, 10*time.Second, "Log: Resource stats interval") pf.Duration(ConfAllocStats, 500*time.Second, "Log: Resource stats interval")
pf.Bool(ConfVerbose, false, "Log: Print every listed dir") pf.Bool(ConfVerbose, false, "Log: Print every listed dir")

View File

@ -29,10 +29,10 @@ server:
# Log output settings # Log output settings
output: output:
# Crawl statistics # Crawl statistics
crawl_stats: 1s crawl_stats: 1m
# CPU/RAM/Job queue stats # CPU/RAM/Job queue stats
resource_stats: 10s resource_stats: 1m
# More output? (Every listed dir) # More output? (Every listed dir)
verbose: false verbose: false
@ -81,4 +81,4 @@ crawl:
# in memory. # in memory.
# A negative value will cause all jobs # A negative value will cause all jobs
# to be stored in memory. (Don't do this) # to be stored in memory. (Don't do this)
job_buffer: 5000 job_buffer: -1

View File

@ -14,8 +14,8 @@ import (
var globalWait sync.WaitGroup var globalWait sync.WaitGroup
type WorkerContext struct { type WorkerContext struct {
OD *OD OD *OD
Queue *BufferedQueue Queue *BufferedQueue
lastRateLimit time.Time lastRateLimit time.Time
numRateLimits int numRateLimits int
} }
@ -56,16 +56,16 @@ func (w *WorkerContext) step(results chan<- File, job Job) {
if !shouldRetry(err) { if !shouldRetry(err) {
atomic.AddUint64(&totalAborted, 1) atomic.AddUint64(&totalAborted, 1)
logrus.WithField("url", job.UriStr). //logrus.WithField("url", job.UriStr).
WithError(err). // WithError(err).
Error("Giving up after failure") // Error("Giving up after failure")
return return
} }
if job.Fails > config.Retries { if job.Fails > config.Retries {
atomic.AddUint64(&totalAborted, 1) atomic.AddUint64(&totalAborted, 1)
logrus.WithField("url", job.UriStr). //logrus.WithField("url", job.UriStr).
Errorf("Giving up after %d fails", job.Fails) // Errorf("Giving up after %d fails", job.Fails)
} else { } else {
atomic.AddUint64(&totalRetries, 1) atomic.AddUint64(&totalRetries, 1)
if err == ErrRateLimit { if err == ErrRateLimit {
@ -88,7 +88,9 @@ func (w *WorkerContext) step(results chan<- File, job Job) {
} }
func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) { func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) {
if len(job.Uri.Path) == 0 { return } if len(job.Uri.Path) == 0 {
return
}
if job.Uri.Path[len(job.Uri.Path)-1] == '/' { if job.Uri.Path[len(job.Uri.Path)-1] == '/' {
// Load directory // Load directory
links, err := GetDir(job, f) links, err := GetDir(job, f)
@ -159,10 +161,10 @@ func (w *WorkerContext) queueJob(job Job) {
w.OD.Wait.Add(1) w.OD.Wait.Add(1)
if w.numRateLimits > 0 { if w.numRateLimits > 0 {
if time.Since(w.lastRateLimit) > 5 * time.Second { if time.Since(w.lastRateLimit) > 5*time.Second {
w.numRateLimits = 0 w.numRateLimits = 0
} else { } else {
time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) * time.Sleep(time.Duration(math.Sqrt(float64(50*w.numRateLimits))) *
100 * time.Millisecond) 100 * time.Millisecond)
} }
} }