diff --git a/crawl.go b/crawl.go
index d20f221..75a2cfb 100644
--- a/crawl.go
+++ b/crawl.go
@@ -2,7 +2,6 @@ package main
 
 import (
 	"bytes"
-	"fmt"
 	"github.com/terorie/oddb-go/ds/redblackhash"
 	"github.com/terorie/oddb-go/fasturl"
 	"github.com/valyala/fasthttp"
@@ -190,15 +189,7 @@ func checkStatusCode(status int) error {
 	switch status {
 	case fasthttp.StatusOK:
 		return nil
-
-	case fasthttp.StatusTooManyRequests:
-		return ErrRateLimit
-
-	case fasthttp.StatusForbidden,
-		fasthttp.StatusUnauthorized:
-		return ErrForbidden
-
 	default:
-		return fmt.Errorf("got HTTP status %d", status)
+		return &HttpError{status}
 	}
 }
diff --git a/errors.go b/errors.go
index 316a108..9ef0157 100644
--- a/errors.go
+++ b/errors.go
@@ -1,8 +1,19 @@
 package main
 
-import "errors"
+import (
+	"errors"
+	"fmt"
+)
 
 var ErrRateLimit = errors.New("too many requests")
-var ErrForbidden = errors.New("access denied")
 var ErrKnown = errors.New("already crawled")
 
+// HttpError carries the status code of a non-OK HTTP response.
+type HttpError struct {
+	code int
+}
+
+// Error implements the error interface.
+func (e HttpError) Error() string {
+	return fmt.Sprintf("http status %d", e.code)
+}
diff --git a/fasturl/url.go b/fasturl/url.go
index f03612b..6ec4e28 100644
--- a/fasturl/url.go
+++ b/fasturl/url.go
@@ -811,3 +811,59 @@ func validUserinfo(s string) bool {
 	}
 	return true
 }
+
+// PathUnescape decodes the %XX escape sequences in s.
+// If s contains a malformed escape, s is returned unchanged.
+func PathUnescape(s string) string {
+	newStr, err := pathUnescape(s)
+	if err != nil {
+		return s
+	}
+	return newStr
+}
+
+// pathUnescape replaces each %XX sequence in s with the byte it encodes.
+// A '+' is kept as-is (it is not turned into a space). It returns an
+// EscapeError holding the offending sequence if a '%' is not followed
+// by two hex digits (as judged by ishex).
+func pathUnescape(s string) (string, error) {
+	// Count %, check that they're well-formed.
+	n := 0
+	for i := 0; i < len(s); {
+		switch s[i] {
+		case '%':
+			n++
+			if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
+				s = s[i:]
+				if len(s) > 3 {
+					s = s[:3]
+				}
+				return "", EscapeError(s)
+			}
+			i += 3
+		default:
+			i++
+		}
+	}
+
+	if n == 0 {
+		return s, nil
+	}
+
+	// Every escape shrinks from three bytes to one.
+	t := make([]byte, len(s)-2*n)
+	j := 0
+	for i := 0; i < len(s); {
+		switch s[i] {
+		case '%':
+			t[j] = unhex(s[i+1])<<4 | unhex(s[i+2])
+			j++
+			i += 3
+		default:
+			t[j] = s[i]
+			j++
+			i++
+		}
+	}
+	return string(t), nil
+}
diff --git a/scheduler.go b/scheduler.go
index 8c0938d..470cb21 100644
--- a/scheduler.go
+++ b/scheduler.go
@@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"github.com/sirupsen/logrus"
+	"github.com/terorie/oddb-go/fasturl"
 	"os"
 	"path"
 	"sync/atomic"
@@ -91,6 +92,8 @@ func (t *Task) collect(results chan File) error {
 	defer f.Close()
 
 	for result := range results {
+		result.Path = fasturl.PathUnescape(result.Path)
+		result.Name = fasturl.PathUnescape(result.Name)
 		resJson, err := json.Marshal(result)
 		if err != nil { panic(err) }
 		_, err = f.Write(resJson)
diff --git a/worker.go b/worker.go
index 50d46d3..8066875 100644
--- a/worker.go
+++ b/worker.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"github.com/sirupsen/logrus"
+	"github.com/valyala/fasthttp"
 	"math"
 	"sort"
 	"strings"
@@ -39,9 +40,18 @@ func (w WorkerContext) step(results chan<- File, job Job) {
 	if err != nil {
 		job.Fails++
 
-		if err == ErrForbidden {
-			// Don't attempt crawling again
-			return
+		// checkStatusCode returns a *HttpError, so match the pointer type.
+		if httpErr, ok := err.(*HttpError); ok {
+			switch httpErr.code {
+			case
+				fasthttp.StatusUnauthorized,
+				fasthttp.StatusForbidden,
+				fasthttp.StatusNotFound:
+				// Don't attempt crawling again
+				return
+			case fasthttp.StatusTooManyRequests:
+				err = ErrRateLimit
+			}
 		}
 
 		if job.Fails > config.Retries {