mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-24 12:55:51 +00:00
Handle HTTP statuses
This commit is contained in:
parent
1c33346f45
commit
4ea5f8a410
@ -2,6 +2,8 @@ package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"github.com/sirupsen/logrus"
|
||||
"github.com/valyala/fasthttp"
|
||||
"golang.org/x/net/html"
|
||||
@ -15,6 +17,7 @@ import (
|
||||
)
|
||||
|
||||
var client fasthttp.Client
|
||||
// ErrRateLimit is the sentinel error returned by GetDir and GetFile when the
// server answers with HTTP 429 (fasthttp.StatusTooManyRequests).
var ErrRateLimit = errors.New("too many requests")
|
||||
|
||||
type RemoteDir struct {
|
||||
Wait sync.WaitGroup
|
||||
@ -45,6 +48,17 @@ func GetDir(j *Job, f *File) (links []url.URL, err error) {
|
||||
return
|
||||
}
|
||||
|
||||
switch res.StatusCode() {
|
||||
case fasthttp.StatusOK:
|
||||
break
|
||||
|
||||
case fasthttp.StatusTooManyRequests:
|
||||
return nil, ErrRateLimit
|
||||
|
||||
default:
|
||||
return nil, fmt.Errorf("got HTTP status %d", res.StatusCode())
|
||||
}
|
||||
|
||||
body := res.Body()
|
||||
doc := html.NewTokenizer(bytes.NewReader(body))
|
||||
|
||||
@ -119,10 +133,6 @@ func GetDir(j *Job, f *File) (links []url.URL, err error) {
|
||||
nextToken:
|
||||
}
|
||||
|
||||
if len(links) == 0 {
|
||||
println(string(body))
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
@ -145,6 +155,17 @@ func GetFile(u url.URL, f *File) (err error) {
|
||||
|
||||
if err != nil { return }
|
||||
|
||||
switch res.StatusCode() {
|
||||
case fasthttp.StatusOK:
|
||||
break
|
||||
|
||||
case fasthttp.StatusTooManyRequests:
|
||||
return ErrRateLimit
|
||||
|
||||
default:
|
||||
return fmt.Errorf("got HTTP status %d", res.StatusCode())
|
||||
}
|
||||
|
||||
// TODO: Inefficient: the full raw header is materialized via Header() just to be handed to ParseHeader.
|
||||
header := res.Header.Header()
|
||||
f.ParseHeader(header)
|
||||
|
@ -31,11 +31,6 @@ func (w WorkerContext) step(job Job) {
|
||||
if err != nil {
|
||||
job.Fails++
|
||||
|
||||
logrus.WithFields(logrus.Fields{
|
||||
"error": err.Error(),
|
||||
"url": job.UriStr,
|
||||
}).Warningf("Crawl error: %s", err)
|
||||
|
||||
if job.Fails > config.Retries {
|
||||
atomic.AddUint64(&totalAborted, 1)
|
||||
logrus.WithField("url", job.UriStr).
|
||||
|
Loading…
x
Reference in New Issue
Block a user