Handle HTTP statuses

This commit is contained in:
Richard Patel 2018-10-28 03:22:25 +01:00
parent 1c33346f45
commit 4ea5f8a410
No known key found for this signature in database
GPG Key ID: C268B2BBDA2ABECB
2 changed files with 25 additions and 9 deletions

View File

@@ -2,6 +2,8 @@ package main
import ( import (
"bytes" "bytes"
"errors"
"fmt"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/valyala/fasthttp" "github.com/valyala/fasthttp"
"golang.org/x/net/html" "golang.org/x/net/html"
@@ -15,6 +17,7 @@ import (
) )
var client fasthttp.Client var client fasthttp.Client
var ErrRateLimit = errors.New("too many requests")
type RemoteDir struct { type RemoteDir struct {
Wait sync.WaitGroup Wait sync.WaitGroup
@@ -45,6 +48,17 @@ func GetDir(j *Job, f *File) (links []url.URL, err error) {
return return
} }
switch res.StatusCode() {
case fasthttp.StatusOK:
break
case fasthttp.StatusTooManyRequests:
return nil, ErrRateLimit
default:
return nil, fmt.Errorf("got HTTP status %d", res.StatusCode())
}
body := res.Body() body := res.Body()
doc := html.NewTokenizer(bytes.NewReader(body)) doc := html.NewTokenizer(bytes.NewReader(body))
@@ -119,10 +133,6 @@ func GetDir(j *Job, f *File) (links []url.URL, err error) {
nextToken: nextToken:
} }
if len(links) == 0 {
println(string(body))
}
return return
} }
@@ -145,6 +155,17 @@ func GetFile(u url.URL, f *File) (err error) {
if err != nil { return } if err != nil { return }
switch res.StatusCode() {
case fasthttp.StatusOK:
break
case fasthttp.StatusTooManyRequests:
return ErrRateLimit
default:
return fmt.Errorf("got HTTP status %d", res.StatusCode())
}
// TODO Inefficient af // TODO Inefficient af
header := res.Header.Header() header := res.Header.Header()
f.ParseHeader(header) f.ParseHeader(header)

View File

@@ -31,11 +31,6 @@ func (w WorkerContext) step(job Job) {
if err != nil { if err != nil {
job.Fails++ job.Fails++
logrus.WithFields(logrus.Fields{
"error": err.Error(),
"url": job.UriStr,
}).Warningf("Crawl error: %s", err)
if job.Fails > config.Retries { if job.Fails > config.Retries {
atomic.AddUint64(&totalAborted, 1) atomic.AddUint64(&totalAborted, 1)
logrus.WithField("url", job.UriStr). logrus.WithField("url", job.UriStr).