mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-19 18:36:43 +00:00
more stuff
This commit is contained in:
parent
4ea5f8a410
commit
faad19f121
@ -2,4 +2,5 @@ server_url: localhost:6969
|
|||||||
token: abc
|
token: abc
|
||||||
stats_interval: 1s
|
stats_interval: 1s
|
||||||
verbose: true
|
verbose: true
|
||||||
retries: 0
|
retries: 5
|
||||||
|
workers: 80
|
2
main.go
2
main.go
@ -14,7 +14,7 @@ func main() {
|
|||||||
remotes := make(chan *RemoteDir)
|
remotes := make(chan *RemoteDir)
|
||||||
go Schedule(c, remotes)
|
go Schedule(c, remotes)
|
||||||
|
|
||||||
u, _ := url.Parse("https://the-eye.eu/public/rom/")
|
u, _ := url.Parse("http://mine.terorie.com:420/")
|
||||||
remote := NewRemoteDir(*u)
|
remote := NewRemoteDir(*u)
|
||||||
|
|
||||||
globalWait.Add(1)
|
globalWait.Add(1)
|
||||||
|
@ -10,11 +10,15 @@ type Job struct {
|
|||||||
Uri url.URL
|
Uri url.URL
|
||||||
UriStr string
|
UriStr string
|
||||||
Fails int
|
Fails int
|
||||||
|
LastError error
|
||||||
}
|
}
|
||||||
|
|
||||||
func Schedule(c context.Context, remotes <-chan *RemoteDir) {
|
func Schedule(c context.Context, remotes <-chan *RemoteDir) {
|
||||||
in, out := makeJobBuffer()
|
in, out := makeJobBuffer()
|
||||||
wCtx := WorkerContext{ in, out }
|
wCtx := WorkerContext{
|
||||||
|
in: in,
|
||||||
|
out: out,
|
||||||
|
}
|
||||||
for i := 0; i < config.Workers; i++ {
|
for i := 0; i < config.Workers; i++ {
|
||||||
go wCtx.Worker()
|
go wCtx.Worker()
|
||||||
}
|
}
|
||||||
@ -28,7 +32,7 @@ func Schedule(c context.Context, remotes <-chan *RemoteDir) {
|
|||||||
|
|
||||||
case remote := <-remotes:
|
case remote := <-remotes:
|
||||||
// Enqueue initial job
|
// Enqueue initial job
|
||||||
queueJob(in, Job{
|
wCtx.queueJob(Job{
|
||||||
Remote: remote,
|
Remote: remote,
|
||||||
Uri: remote.BaseUri,
|
Uri: remote.BaseUri,
|
||||||
UriStr: remote.BaseUri.String(),
|
UriStr: remote.BaseUri.String(),
|
||||||
|
31
worker.go
31
worker.go
@ -2,9 +2,11 @@ package main
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
|
"math"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
var globalWait sync.WaitGroup
|
var globalWait sync.WaitGroup
|
||||||
@ -12,6 +14,8 @@ var globalWait sync.WaitGroup
|
|||||||
type WorkerContext struct {
|
type WorkerContext struct {
|
||||||
in chan<- Job
|
in chan<- Job
|
||||||
out <-chan Job
|
out <-chan Job
|
||||||
|
lastRateLimit time.Time
|
||||||
|
numRateLimits int
|
||||||
}
|
}
|
||||||
|
|
||||||
func (w WorkerContext) Worker() {
|
func (w WorkerContext) Worker() {
|
||||||
@ -21,7 +25,7 @@ func (w WorkerContext) Worker() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (w WorkerContext) step(job Job) {
|
func (w WorkerContext) step(job Job) {
|
||||||
defer finishJob(&job)
|
defer w.finishJob(&job)
|
||||||
|
|
||||||
var f File
|
var f File
|
||||||
|
|
||||||
@ -37,14 +41,18 @@ func (w WorkerContext) step(job Job) {
|
|||||||
Errorf("Giving up after %d fails", job.Fails)
|
Errorf("Giving up after %d fails", job.Fails)
|
||||||
} else {
|
} else {
|
||||||
atomic.AddUint64(&totalRetries, 1)
|
atomic.AddUint64(&totalRetries, 1)
|
||||||
queueJob(w.in, job)
|
if err == ErrRateLimit {
|
||||||
|
w.lastRateLimit = time.Now()
|
||||||
|
w.numRateLimits++
|
||||||
|
}
|
||||||
|
w.queueJob(job)
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
atomic.AddUint64(&totalDone, 1)
|
atomic.AddUint64(&totalDone, 1)
|
||||||
for _, job := range newJobs {
|
for _, job := range newJobs {
|
||||||
queueJob(w.in, job)
|
w.queueJob(job)
|
||||||
}
|
}
|
||||||
|
|
||||||
job.Remote.Files = append(job.Remote.Files, f)
|
job.Remote.Files = append(job.Remote.Files, f)
|
||||||
@ -86,13 +94,24 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
func queueJob(in chan<- Job, job Job) {
|
func (w WorkerContext) queueJob(job Job) {
|
||||||
job.Remote.Wait.Add(1)
|
job.Remote.Wait.Add(1)
|
||||||
globalWait.Add(1)
|
globalWait.Add(1)
|
||||||
in <- job
|
|
||||||
|
if w.numRateLimits > 0 {
|
||||||
|
if time.Since(w.lastRateLimit) > 5 * time.Second {
|
||||||
|
w.numRateLimits = 0
|
||||||
|
} else {
|
||||||
|
time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) *
|
||||||
|
100 * time.Millisecond)
|
||||||
|
w.in <- job
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
w.in <- job
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func finishJob(job *Job) {
|
func (w WorkerContext) finishJob(job *Job) {
|
||||||
job.Remote.Wait.Done()
|
job.Remote.Wait.Done()
|
||||||
globalWait.Done()
|
globalWait.Done()
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user