From 4ea5f8a41099039829499d6111c3ec8961c4ec5c Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Sun, 28 Oct 2018 03:22:25 +0100 Subject: [PATCH] Handle HTTP statuses --- crawl_http.go | 29 +++++++++++++++++++++++++---- worker.go | 5 ----- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/crawl_http.go b/crawl_http.go index 5e159fa..e0cf5db 100644 --- a/crawl_http.go +++ b/crawl_http.go @@ -2,6 +2,8 @@ package main import ( "bytes" + "errors" + "fmt" "github.com/sirupsen/logrus" "github.com/valyala/fasthttp" "golang.org/x/net/html" @@ -15,6 +17,7 @@ import ( ) var client fasthttp.Client +var ErrRateLimit = errors.New("too many requests") type RemoteDir struct { Wait sync.WaitGroup @@ -45,6 +48,17 @@ func GetDir(j *Job, f *File) (links []url.URL, err error) { return } + switch res.StatusCode() { + case fasthttp.StatusOK: + break + + case fasthttp.StatusTooManyRequests: + return nil, ErrRateLimit + + default: + return nil, fmt.Errorf("got HTTP status %d", res.StatusCode()) + } + body := res.Body() doc := html.NewTokenizer(bytes.NewReader(body)) @@ -119,10 +133,6 @@ func GetDir(j *Job, f *File) (links []url.URL, err error) { nextToken: } - if len(links) == 0 { - println(string(body)) - } - return } @@ -145,6 +155,17 @@ func GetFile(u url.URL, f *File) (err error) { if err != nil { return } + switch res.StatusCode() { + case fasthttp.StatusOK: + break + + case fasthttp.StatusTooManyRequests: + return ErrRateLimit + + default: + return fmt.Errorf("got HTTP status %d", res.StatusCode()) + } + // TODO Inefficient af header := res.Header.Header() f.ParseHeader(header) diff --git a/worker.go b/worker.go index 380ca98..481aee2 100644 --- a/worker.go +++ b/worker.go @@ -31,11 +31,6 @@ func (w WorkerContext) step(job Job) { if err != nil { job.Fails++ - logrus.WithFields(logrus.Fields{ - "error": err.Error(), - "url": job.UriStr, - }).Warningf("Crawl error: %s", err) - if job.Fails > config.Retries { atomic.AddUint64(&totalAborted, 1) logrus.WithField("url", job.UriStr).