Unescape results & don't recrawl 404

This commit is contained in:
Richard Patel
2018-11-17 01:21:20 +01:00
parent 145d37f84a
commit f1687679ab
5 changed files with 80 additions and 15 deletions

View File

@@ -2,6 +2,7 @@ package main
import (
"github.com/sirupsen/logrus"
"github.com/valyala/fasthttp"
"math"
"sort"
"strings"
@@ -39,9 +40,16 @@ func (w WorkerContext) step(results chan<- File, job Job) {
if err != nil {
job.Fails++
if err == ErrForbidden {
// Don't attempt crawling again
return
if httpErr, ok := err.(HttpError); ok {
switch httpErr.code {
case
fasthttp.StatusUnauthorized,
fasthttp.StatusForbidden,
fasthttp.StatusNotFound:
return
case fasthttp.StatusTooManyRequests:
err = ErrRateLimit
}
}
if job.Fails > config.Retries {