mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-16 17:06:46 +00:00
Remove debug messages & don't use disk queue by default
This commit is contained in:
parent
84c10e1981
commit
24f0bd91f7
10
config.go
10
config.go
@ -81,7 +81,7 @@ func prepareConfig() {
|
||||
|
||||
pf.Duration(ConfRecheck, 1*time.Second, "OD-DB: Poll interval for new jobs")
|
||||
|
||||
pf.Duration(ConfCooldown, 30*time.Second, "OD-DB: Time to wait after a server-side error")
|
||||
pf.Duration(ConfCooldown, 1*time.Minute, "OD-DB: Time to wait after a server-side error")
|
||||
|
||||
pf.String(ConfChunkSize, "1 MB", "OD-DB: Result upload chunk size")
|
||||
|
||||
@ -91,7 +91,7 @@ func prepareConfig() {
|
||||
|
||||
pf.Uint(ConfTasks, 25, "Crawler: Max concurrent tasks")
|
||||
|
||||
pf.Uint(ConfWorkers, 4, "Crawler: Connections per server")
|
||||
pf.Uint(ConfWorkers, 1, "Crawler: Connections per server")
|
||||
|
||||
pf.Uint(ConfRetries, 5, "Crawler: Request retries")
|
||||
|
||||
@ -101,11 +101,11 @@ func prepareConfig() {
|
||||
|
||||
pf.String(ConfUserAgent, "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0", "Crawler: User-Agent")
|
||||
|
||||
pf.Uint(ConfJobBufferSize, 5000, "Crawler: Task queue cache size")
|
||||
pf.Int(ConfJobBufferSize, -1, "Crawler: Task queue cache size")
|
||||
|
||||
pf.Duration(ConfCrawlStats, time.Second, "Log: Crawl stats interval")
|
||||
pf.Duration(ConfCrawlStats, 500*time.Second, "Log: Crawl stats interval")
|
||||
|
||||
pf.Duration(ConfAllocStats, 10*time.Second, "Log: Resource stats interval")
|
||||
pf.Duration(ConfAllocStats, 500*time.Second, "Log: Resource stats interval")
|
||||
|
||||
pf.Bool(ConfVerbose, false, "Log: Print every listed dir")
|
||||
|
||||
|
@ -29,10 +29,10 @@ server:
|
||||
# Log output settings
|
||||
output:
|
||||
# Crawl statistics
|
||||
crawl_stats: 1s
|
||||
crawl_stats: 1m
|
||||
|
||||
# CPU/RAM/Job queue stats
|
||||
resource_stats: 10s
|
||||
resource_stats: 1m
|
||||
|
||||
# More output? (Every listed dir)
|
||||
verbose: false
|
||||
@ -81,4 +81,4 @@ crawl:
|
||||
# in memory.
|
||||
# A negative value will cause all jobs
|
||||
# to be stored in memory. (Don't do this)
|
||||
job_buffer: 5000
|
||||
job_buffer: -1
|
||||
|
22
worker.go
22
worker.go
@ -14,8 +14,8 @@ import (
|
||||
var globalWait sync.WaitGroup
|
||||
|
||||
type WorkerContext struct {
|
||||
OD *OD
|
||||
Queue *BufferedQueue
|
||||
OD *OD
|
||||
Queue *BufferedQueue
|
||||
lastRateLimit time.Time
|
||||
numRateLimits int
|
||||
}
|
||||
@ -56,16 +56,16 @@ func (w *WorkerContext) step(results chan<- File, job Job) {
|
||||
|
||||
if !shouldRetry(err) {
|
||||
atomic.AddUint64(&totalAborted, 1)
|
||||
logrus.WithField("url", job.UriStr).
|
||||
WithError(err).
|
||||
Error("Giving up after failure")
|
||||
//logrus.WithField("url", job.UriStr).
|
||||
// WithError(err).
|
||||
// Error("Giving up after failure")
|
||||
return
|
||||
}
|
||||
|
||||
if job.Fails > config.Retries {
|
||||
atomic.AddUint64(&totalAborted, 1)
|
||||
logrus.WithField("url", job.UriStr).
|
||||
Errorf("Giving up after %d fails", job.Fails)
|
||||
//logrus.WithField("url", job.UriStr).
|
||||
// Errorf("Giving up after %d fails", job.Fails)
|
||||
} else {
|
||||
atomic.AddUint64(&totalRetries, 1)
|
||||
if err == ErrRateLimit {
|
||||
@ -88,7 +88,9 @@ func (w *WorkerContext) step(results chan<- File, job Job) {
|
||||
}
|
||||
|
||||
func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) {
|
||||
if len(job.Uri.Path) == 0 { return }
|
||||
if len(job.Uri.Path) == 0 {
|
||||
return
|
||||
}
|
||||
if job.Uri.Path[len(job.Uri.Path)-1] == '/' {
|
||||
// Load directory
|
||||
links, err := GetDir(job, f)
|
||||
@ -159,10 +161,10 @@ func (w *WorkerContext) queueJob(job Job) {
|
||||
w.OD.Wait.Add(1)
|
||||
|
||||
if w.numRateLimits > 0 {
|
||||
if time.Since(w.lastRateLimit) > 5 * time.Second {
|
||||
if time.Since(w.lastRateLimit) > 5*time.Second {
|
||||
w.numRateLimits = 0
|
||||
} else {
|
||||
time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) *
|
||||
time.Sleep(time.Duration(math.Sqrt(float64(50*w.numRateLimits))) *
|
||||
100 * time.Millisecond)
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user