Don't retry on 401/403

This commit is contained in:
Richard Patel 2018-10-28 03:47:29 +01:00
parent faad19f121
commit ab5874129f
No known key found for this signature in database
GPG Key ID: C268B2BBDA2ABECB
2 changed files with 27 additions and 20 deletions

View File

@ -18,6 +18,7 @@ import (
// client is the package-level fasthttp client.
var client fasthttp.Client
// ErrRateLimit is returned by checkStatusCode when the server answers
// with HTTP 429 (Too Many Requests).
var ErrRateLimit = errors.New("too many requests")
// ErrForbidden is returned by checkStatusCode when the server answers
// with HTTP 401 (Unauthorized) or 403 (Forbidden). The worker step does
// not attempt to crawl a job again once it fails with this error.
var ErrForbidden = errors.New("access denied")
type RemoteDir struct {
Wait sync.WaitGroup
@ -48,16 +49,8 @@ func GetDir(j *Job, f *File) (links []url.URL, err error) {
return
}
switch res.StatusCode() {
case fasthttp.StatusOK:
break
case fasthttp.StatusTooManyRequests:
return nil, ErrRateLimit
default:
return nil, fmt.Errorf("got HTTP status %d", res.StatusCode())
}
err = checkStatusCode(res.StatusCode())
if err != nil { return }
body := res.Body()
doc := html.NewTokenizer(bytes.NewReader(body))
@ -155,16 +148,8 @@ func GetFile(u url.URL, f *File) (err error) {
if err != nil { return }
switch res.StatusCode() {
case fasthttp.StatusOK:
break
case fasthttp.StatusTooManyRequests:
return ErrRateLimit
default:
return fmt.Errorf("got HTTP status %d", res.StatusCode())
}
err = checkStatusCode(res.StatusCode())
if err != nil { return }
// TODO Inefficient af
header := res.Header.Header()
@ -233,6 +218,23 @@ func (f *File) applyHeader(k, v string) {
}
}
// checkStatusCode translates an HTTP response status into an error.
//
// It returns nil for 200 OK, ErrRateLimit for 429 Too Many Requests,
// ErrForbidden for 401 Unauthorized and 403 Forbidden, and a generic
// error carrying the numeric status for every other code.
func checkStatusCode(status int) error {
	// Success: nothing to report.
	if status == fasthttp.StatusOK {
		return nil
	}
	// Rate limiting gets its own sentinel so callers can tell it apart.
	if status == fasthttp.StatusTooManyRequests {
		return ErrRateLimit
	}
	// Both authentication and authorization failures map to ErrForbidden.
	if status == fasthttp.StatusUnauthorized || status == fasthttp.StatusForbidden {
		return ErrForbidden
	}
	// Any other status is reported verbatim.
	return fmt.Errorf("got HTTP status %d", status)
}
var urlBlackList = [...]string {
"",
" ",

View File

@ -35,6 +35,11 @@ func (w WorkerContext) step(job Job) {
if err != nil {
job.Fails++
if err == ErrForbidden {
// Don't attempt crawling again
return
}
if job.Fails > config.Retries {
atomic.AddUint64(&totalAborted, 1)
logrus.WithField("url", job.UriStr).