Unescape results & don't recrawl 404

This commit is contained in:
Richard Patel 2018-11-17 01:21:20 +01:00
parent 145d37f84a
commit f1687679ab
No known key found for this signature in database
GPG Key ID: C268B2BBDA2ABECB
5 changed files with 80 additions and 15 deletions

View File

@ -2,7 +2,6 @@ package main
import ( import (
"bytes" "bytes"
"fmt"
"github.com/terorie/oddb-go/ds/redblackhash" "github.com/terorie/oddb-go/ds/redblackhash"
"github.com/terorie/oddb-go/fasturl" "github.com/terorie/oddb-go/fasturl"
"github.com/valyala/fasthttp" "github.com/valyala/fasthttp"
@ -190,15 +189,7 @@ func checkStatusCode(status int) error {
switch status { switch status {
case fasthttp.StatusOK: case fasthttp.StatusOK:
return nil return nil
case fasthttp.StatusTooManyRequests:
return ErrRateLimit
case fasthttp.StatusForbidden,
fasthttp.StatusUnauthorized:
return ErrForbidden
default: default:
return fmt.Errorf("got HTTP status %d", status) return &HttpError{status}
} }
} }

View File

@ -1,8 +1,17 @@
package main package main
import "errors" import (
"errors"
"fmt"
)
var ErrRateLimit = errors.New("too many requests") var ErrRateLimit = errors.New("too many requests")
var ErrForbidden = errors.New("access denied")
var ErrKnown = errors.New("already crawled") var ErrKnown = errors.New("already crawled")
// HttpError is an error value that records an HTTP status code.
//
// NOTE(review): Error has a value receiver, so both HttpError and *HttpError
// satisfy the error interface, but a type assertion on one form does not
// match the other. Producers and consumers of this error must agree on
// whether it travels as a value or as a pointer — confirm at the call sites.
type HttpError struct {
	// code is the HTTP status code being reported.
	code int
}

// Error implements the error interface, rendering the status code as
// "http status <code>".
func (h HttpError) Error() string {
	msg := fmt.Sprintf("http status %d", h.code)
	return msg
}

View File

@ -811,3 +811,57 @@ func validUserinfo(s string) bool {
} }
return true return true
} }
// PathUnescape returns s with its %XX escape sequences decoded by
// pathUnescape.
//
// It never fails: if pathUnescape reports an error (for example because s
// contains a malformed escape), the input s is returned unchanged and any
// partial result is discarded.
func PathUnescape(s string) string {
	unescaped, err := pathUnescape(s)
	if err != nil {
		// Best effort: keep the original text rather than surface the error.
		return s
	}
	return unescaped
}
// pathUnescape replaces every "%XX" sequence in s with the single byte
// built from its two hex digits (via unhex). All other bytes, including '+',
// are copied through unchanged.
//
// If a '%' is not followed by two bytes accepted by ishex, it returns an
// empty string and an EscapeError holding the offending sequence, cut to at
// most three bytes. When s contains no '%' at all, s itself is returned
// without allocating.
func pathUnescape(s string) (string, error) {
	// Pass 1: validate each escape and count them so the output can be
	// sized exactly.
	escapes := 0
	for pos := 0; pos < len(s); pos++ {
		if s[pos] != '%' {
			continue
		}
		escapes++
		if pos+2 >= len(s) || !ishex(s[pos+1]) || !ishex(s[pos+2]) {
			bad := s[pos:]
			if len(bad) > 3 {
				bad = bad[:3]
			}
			return "", EscapeError(bad)
		}
		// Step over the two hex digits; the loop increment steps over '%'.
		pos += 2
	}
	if escapes == 0 {
		return s, nil
	}

	// Pass 2: every escape shrinks from three bytes to one, so the result
	// is exactly len(s)-2*escapes bytes long.
	decoded := make([]byte, 0, len(s)-2*escapes)
	for pos := 0; pos < len(s); pos++ {
		c := s[pos]
		if c == '%' {
			// Escapes were validated above, so both digits are present.
			c = unhex(s[pos+1])<<4 | unhex(s[pos+2])
			pos += 2
		}
		decoded = append(decoded, c)
	}
	return string(decoded), nil
}

View File

@ -5,6 +5,7 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/terorie/oddb-go/fasturl"
"os" "os"
"path" "path"
"sync/atomic" "sync/atomic"
@ -91,6 +92,8 @@ func (t *Task) collect(results chan File) error {
defer f.Close() defer f.Close()
for result := range results { for result := range results {
result.Path = fasturl.PathUnescape(result.Path)
result.Name = fasturl.PathUnescape(result.Name)
resJson, err := json.Marshal(result) resJson, err := json.Marshal(result)
if err != nil { panic(err) } if err != nil { panic(err) }
_, err = f.Write(resJson) _, err = f.Write(resJson)

View File

@ -2,6 +2,7 @@ package main
import ( import (
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/valyala/fasthttp"
"math" "math"
"sort" "sort"
"strings" "strings"
@ -39,9 +40,16 @@ func (w WorkerContext) step(results chan<- File, job Job) {
if err != nil { if err != nil {
job.Fails++ job.Fails++
if err == ErrForbidden { if httpErr, ok := err.(HttpError); ok {
// Don't attempt crawling again switch httpErr.code {
case
fasthttp.StatusUnauthorized,
fasthttp.StatusForbidden,
fasthttp.StatusNotFound:
return return
case fasthttp.StatusTooManyRequests:
err = ErrRateLimit
}
} }
if job.Fails > config.Retries { if job.Fails > config.Retries {