Remove debug messages & don't use disk queue by default

simon987 2019-04-06 12:11:42 -04:00
parent 84c10e1981
commit 24f0bd91f7
3 changed files with 20 additions and 18 deletions

View File

@@ -81,7 +81,7 @@ func prepareConfig() {
 pf.Duration(ConfRecheck, 1*time.Second, "OD-DB: Poll interval for new jobs")
-pf.Duration(ConfCooldown, 30*time.Second, "OD-DB: Time to wait after a server-side error")
+pf.Duration(ConfCooldown, 1*time.Minute, "OD-DB: Time to wait after a server-side error")
 pf.String(ConfChunkSize, "1 MB", "OD-DB: Result upload chunk size")
@@ -91,7 +91,7 @@ func prepareConfig() {
 pf.Uint(ConfTasks, 25, "Crawler: Max concurrent tasks")
-pf.Uint(ConfWorkers, 4, "Crawler: Connections per server")
+pf.Uint(ConfWorkers, 1, "Crawler: Connections per server")
 pf.Uint(ConfRetries, 5, "Crawler: Request retries")
@@ -101,11 +101,11 @@ func prepareConfig() {
 pf.String(ConfUserAgent, "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0", "Crawler: User-Agent")
-pf.Uint(ConfJobBufferSize, 5000, "Crawler: Task queue cache size")
+pf.Int(ConfJobBufferSize, -1, "Crawler: Task queue cache size")
-pf.Duration(ConfCrawlStats, time.Second, "Log: Crawl stats interval")
+pf.Duration(ConfCrawlStats, 500*time.Second, "Log: Crawl stats interval")
-pf.Duration(ConfAllocStats, 10*time.Second, "Log: Resource stats interval")
+pf.Duration(ConfAllocStats, 500*time.Second, "Log: Resource stats interval")
 pf.Bool(ConfVerbose, false, "Log: Print every listed dir")
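For reference, a minimal standalone sketch of how pflag defaults like the ones changed above behave when nothing overrides them. The flag names and the separate FlagSet are illustrative only, not this project's actual wiring:

// Illustrative sketch only; flag names are hypothetical.
package main

import (
    "fmt"
    "time"

    "github.com/spf13/pflag"
)

func main() {
    pf := pflag.NewFlagSet("crawler", pflag.ExitOnError)
    pf.Duration("cooldown", 1*time.Minute, "OD-DB: Time to wait after a server-side error")
    pf.Uint("workers", 1, "Crawler: Connections per server")
    pf.Int("job_buffer", -1, "Crawler: Task queue cache size")

    // No command-line arguments given, so every flag keeps its default.
    _ = pf.Parse([]string{})

    cooldown, _ := pf.GetDuration("cooldown")
    workers, _ := pf.GetUint("workers")
    jobBuffer, _ := pf.GetInt("job_buffer")
    fmt.Println(cooldown, workers, jobBuffer) // 1m0s 1 -1
}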

View File

@@ -29,10 +29,10 @@ server:
 # Log output settings
 output:
   # Crawl statistics
-  crawl_stats: 1s
+  crawl_stats: 1m
   # CPU/RAM/Job queue stats
-  resource_stats: 10s
+  resource_stats: 1m
   # More output? (Every listed dir)
   verbose: false
@@ -81,4 +81,4 @@ crawl:
   # in memory.
   # A negative value will cause all jobs
   # to be stored in memory. (Don't do this)
-  job_buffer: 5000
+  job_buffer: -1
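The comment above spells out the job_buffer semantics that the new -1 default relies on. A hypothetical sketch of that behaviour (made-up names, not the crawler's actual BufferedQueue) could look like:

package main

import "fmt"

type job struct{ url string }

// newJobBuffer returns an in-memory job channel. A negative size means
// "keep everything in memory" (approximated here with a large fixed capacity),
// while a non-negative size caps the buffer, past which a real implementation
// could spill jobs to a disk-backed queue.
func newJobBuffer(size int) chan job {
    if size < 0 {
        return make(chan job, 1<<16)
    }
    return make(chan job, size)
}

func main() {
    q := newJobBuffer(-1) // the new default: memory only, no disk queue
    q <- job{url: "http://example.com/"}
    fmt.Println(len(q), cap(q)) // 1 65536
}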

View File

@@ -14,8 +14,8 @@ import (
 var globalWait sync.WaitGroup
 type WorkerContext struct {
-    OD *OD
-    Queue *BufferedQueue
+    OD *OD
+    Queue *BufferedQueue
     lastRateLimit time.Time
     numRateLimits int
 }
@@ -56,16 +56,16 @@ func (w *WorkerContext) step(results chan<- File, job Job) {
 if !shouldRetry(err) {
     atomic.AddUint64(&totalAborted, 1)
-    logrus.WithField("url", job.UriStr).
-        WithError(err).
-        Error("Giving up after failure")
+    //logrus.WithField("url", job.UriStr).
+    //    WithError(err).
+    //    Error("Giving up after failure")
     return
 }
 if job.Fails > config.Retries {
     atomic.AddUint64(&totalAborted, 1)
-    logrus.WithField("url", job.UriStr).
-        Errorf("Giving up after %d fails", job.Fails)
+    //logrus.WithField("url", job.UriStr).
+    //    Errorf("Giving up after %d fails", job.Fails)
 } else {
     atomic.AddUint64(&totalRetries, 1)
     if err == ErrRateLimit {
@@ -88,7 +88,9 @@ func (w *WorkerContext) step(results chan<- File, job Job) {
 }
 func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) {
-    if len(job.Uri.Path) == 0 { return }
+    if len(job.Uri.Path) == 0 {
+        return
+    }
     if job.Uri.Path[len(job.Uri.Path)-1] == '/' {
         // Load directory
         links, err := GetDir(job, f)
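The trailing-slash test above is what routes a job to the directory-listing branch. As a standalone illustration (the helper name is hypothetical, not part of the crawler):

// isDirListing mirrors the check in DoJob: a URL path ending in '/' is
// crawled as a directory listing, anything else is treated as a file.
func isDirListing(path string) bool {
    return len(path) > 0 && path[len(path)-1] == '/'
}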
@@ -159,10 +161,10 @@ func (w *WorkerContext) queueJob(job Job) {
 w.OD.Wait.Add(1)
 if w.numRateLimits > 0 {
-    if time.Since(w.lastRateLimit) > 5 * time.Second {
+    if time.Since(w.lastRateLimit) > 5*time.Second {
         w.numRateLimits = 0
     } else {
-        time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) *
+        time.Sleep(time.Duration(math.Sqrt(float64(50*w.numRateLimits))) *
             100 * time.Millisecond)
     }
 }
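The sleep above is a square-root backoff after repeated rate limits: roughly floor(sqrt(50*n)) * 100ms for the n-th recent rate limit. A small standalone program (not project code) showing the resulting pauses:

package main

import (
    "fmt"
    "math"
    "time"
)

func main() {
    // Same arithmetic as queueJob above: ~700ms, 1s, 1.2s, 1.4s, 1.5s for n = 1..5.
    for n := 1; n <= 5; n++ {
        sleep := time.Duration(math.Sqrt(float64(50*n))) * 100 * time.Millisecond
        fmt.Printf("n=%d -> %v\n", n, sleep)
    }
}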