Unescape results & don't recrawl 404

This commit is contained in:
Richard Patel 2018-11-17 01:21:20 +01:00
parent 145d37f84a
commit f1687679ab
No known key found for this signature in database
GPG Key ID: C268B2BBDA2ABECB
5 changed files with 80 additions and 15 deletions

View File

@ -2,7 +2,6 @@ package main
import (
"bytes"
"fmt"
"github.com/terorie/oddb-go/ds/redblackhash"
"github.com/terorie/oddb-go/fasturl"
"github.com/valyala/fasthttp"
@ -190,15 +189,7 @@ func checkStatusCode(status int) error {
switch status {
case fasthttp.StatusOK:
return nil
case fasthttp.StatusTooManyRequests:
return ErrRateLimit
case fasthttp.StatusForbidden,
fasthttp.StatusUnauthorized:
return ErrForbidden
default:
return fmt.Errorf("got HTTP status %d", status)
return &HttpError{status}
}
}

View File

@ -1,8 +1,17 @@
package main
import "errors"
import (
"errors"
"fmt"
)
var ErrRateLimit = errors.New("too many requests")
var ErrForbidden = errors.New("access denied")
var ErrKnown = errors.New("already crawled")
// HttpError is an error value that carries an HTTP status code.
//
// NOTE(review): checkStatusCode appears to return *HttpError while the
// worker's step function asserts err.(HttpError) by value; a pointer does
// not satisfy a value type assertion, so the two sites should agree —
// confirm against the full sources.
type HttpError struct {
	code int // HTTP status code this error was built from
}

// Error implements the error interface. The message has the form
// "http status <code>".
func (h HttpError) Error() string {
	msg := fmt.Sprintf("http status %d", h.code)
	return msg
}

View File

@ -811,3 +811,57 @@ func validUserinfo(s string) bool {
}
return true
}
// PathUnescape returns s with its percent-escapes decoded by pathUnescape.
// If decoding fails (pathUnescape reports an error for a malformed escape),
// the input string is returned unchanged instead of an error.
func PathUnescape(s string) string {
	newStr, err := pathUnescape(s)
	if err != nil {
		// Best effort: fall back to the raw string rather than failing.
		return s
	}
	return newStr
}
// pathUnescape decodes every "%XX" sequence in s into the byte it encodes.
// All other bytes, including '+', are copied through unchanged.
//
// If a '%' is not followed by two hex digits, it returns an empty string and
// an EscapeError holding the offending fragment (at most three bytes long).
// When s contains no '%' at all, s itself is returned without copying.
func pathUnescape(s string) (string, error) {
	// Pass 1: validate each escape sequence and count them.
	escapes := 0
	for pos := 0; pos < len(s); pos++ {
		if s[pos] != '%' {
			continue
		}
		escapes++
		if pos+2 >= len(s) || !ishex(s[pos+1]) || !ishex(s[pos+2]) {
			bad := s[pos:]
			if len(bad) > 3 {
				bad = bad[:3]
			}
			return "", EscapeError(bad)
		}
		// Skip the two hex digits; the loop increment steps past the rest.
		pos += 2
	}

	if escapes == 0 {
		return s, nil
	}

	// Pass 2: every escape shrinks three input bytes into one output byte.
	out := make([]byte, 0, len(s)-2*escapes)
	for pos := 0; pos < len(s); pos++ {
		c := s[pos]
		if c == '%' {
			c = unhex(s[pos+1])<<4 | unhex(s[pos+2])
			pos += 2
		}
		out = append(out, c)
	}
	return string(out), nil
}

View File

@ -5,6 +5,7 @@ import (
"encoding/json"
"fmt"
"github.com/sirupsen/logrus"
"github.com/terorie/oddb-go/fasturl"
"os"
"path"
"sync/atomic"
@ -91,6 +92,8 @@ func (t *Task) collect(results chan File) error {
defer f.Close()
for result := range results {
result.Path = fasturl.PathUnescape(result.Path)
result.Name = fasturl.PathUnescape(result.Name)
resJson, err := json.Marshal(result)
if err != nil { panic(err) }
_, err = f.Write(resJson)

View File

@ -2,6 +2,7 @@ package main
import (
"github.com/sirupsen/logrus"
"github.com/valyala/fasthttp"
"math"
"sort"
"strings"
@ -39,9 +40,16 @@ func (w WorkerContext) step(results chan<- File, job Job) {
if err != nil {
job.Fails++
if err == ErrForbidden {
// Don't attempt crawling again
return
if httpErr, ok := err.(HttpError); ok {
switch httpErr.code {
case
fasthttp.StatusUnauthorized,
fasthttp.StatusForbidden,
fasthttp.StatusNotFound:
return
case fasthttp.StatusTooManyRequests:
err = ErrRateLimit
}
}
if job.Fails > config.Retries {