From ab5874129f7d7b76f924c426dbdef60d943dd7c7 Mon Sep 17 00:00:00 2001
From: Richard Patel
Date: Sun, 28 Oct 2018 03:47:29 +0100
Subject: [PATCH] Don't retry on 401/403

---
 crawl_http.go | 42 ++++++++++++++++++++++--------------------
 worker.go     |  5 +++++
 2 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/crawl_http.go b/crawl_http.go
index e0cf5db..30e63d7 100644
--- a/crawl_http.go
+++ b/crawl_http.go
@@ -18,6 +18,7 @@ import (
 var client fasthttp.Client
 
 var ErrRateLimit = errors.New("too many requests")
+var ErrForbidden = errors.New("access denied")
 
 type RemoteDir struct {
 	Wait sync.WaitGroup
@@ -48,16 +49,8 @@ func GetDir(j *Job, f *File) (links []url.URL, err error) {
 		return
 	}
 
-	switch res.StatusCode() {
-	case fasthttp.StatusOK:
-		break
-
-	case fasthttp.StatusTooManyRequests:
-		return nil, ErrRateLimit
-
-	default:
-		return nil, fmt.Errorf("got HTTP status %d", res.StatusCode())
-	}
+	err = checkStatusCode(res.StatusCode())
+	if err != nil { return }
 
 	body := res.Body()
 	doc := html.NewTokenizer(bytes.NewReader(body))
@@ -155,16 +148,8 @@ func GetFile(u url.URL, f *File) (err error) {
 
 	if err != nil { return }
 
-	switch res.StatusCode() {
-	case fasthttp.StatusOK:
-		break
-
-	case fasthttp.StatusTooManyRequests:
-		return ErrRateLimit
-
-	default:
-		return fmt.Errorf("got HTTP status %d", res.StatusCode())
-	}
+	err = checkStatusCode(res.StatusCode())
+	if err != nil { return }
 
 	// TODO Inefficient af
 	header := res.Header.Header()
@@ -233,6 +218,23 @@ func (f *File) applyHeader(k, v string) {
 	}
 }
 
+func checkStatusCode(status int) error {
+	switch status {
+	case fasthttp.StatusOK:
+		return nil
+
+	case fasthttp.StatusTooManyRequests:
+		return ErrRateLimit
+
+	case fasthttp.StatusForbidden,
+		fasthttp.StatusUnauthorized:
+		return ErrForbidden
+
+	default:
+		return fmt.Errorf("got HTTP status %d", status)
+	}
+}
+
 var urlBlackList = [...]string {
 	"",
 	" ",
diff --git a/worker.go b/worker.go
index 2fd2785..badee0e 100644
--- a/worker.go
+++ b/worker.go
@@ -35,6 +35,11 @@ func (w WorkerContext) step(job Job) {
 	if err != nil {
 		job.Fails++
 
+		if err == ErrForbidden {
+			// Don't attempt crawling again
+			return
+		}
+
 		if job.Fails > config.Retries {
 			atomic.AddUint64(&totalAborted, 1)
 			logrus.WithField("url", job.UriStr).
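
A quick way to exercise the new checkStatusCode helper is a table-driven
test. This is only a sketch, not part of the patch: it assumes the crawler
builds as package main and that a test file (the name checkstatus_test.go
is hypothetical) can sit alongside crawl_http.go.

package main

import (
	"testing"

	"github.com/valyala/fasthttp"
)

// TestCheckStatusCode checks the status-to-error mapping added above:
// 200 maps to nil, 429 stays retryable via ErrRateLimit, and 401/403
// map to ErrForbidden so the worker gives up immediately.
func TestCheckStatusCode(t *testing.T) {
	cases := []struct {
		status int
		want   error
	}{
		{fasthttp.StatusOK, nil},
		{fasthttp.StatusTooManyRequests, ErrRateLimit},
		{fasthttp.StatusUnauthorized, ErrForbidden},
		{fasthttp.StatusForbidden, ErrForbidden},
	}
	for _, c := range cases {
		if got := checkStatusCode(c.status); got != c.want {
			t.Errorf("checkStatusCode(%d) = %v, want %v", c.status, got, c.want)
		}
	}
}

Comparing with != works here because ErrRateLimit and ErrForbidden are
package-level sentinel errors; only the default branch allocates a fresh
error, and that case is left untested in this sketch.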