Fix ton of bugs

This commit is contained in:
Richard Patel 2018-11-17 04:18:22 +01:00
parent 0fe97a8058
commit d596882b40
No known key found for this signature in database
GPG Key ID: C268B2BBDA2ABECB
9 changed files with 146 additions and 93 deletions

View File

@ -165,22 +165,27 @@ func (f *File) applyContentLength(v string) {
f.Size = size f.Size = size
} }
// TODO Cleanup
// applyLastModified parses v and, on success, stores the result in f.MTime.
// In this side-by-side diff the left column is the previous code, which
// assigned the parsed time.Time to f.MTime directly; the right column parses
// into a local t and stores t.Unix() (Unix seconds).
//
// Layouts are tried in order: time.RFC1123, time.RFC850, then "2006-01-02"
// against the first 10 bytes of v. An empty v returns immediately. If no
// layout matches, f.MTime is left untouched and no error is reported.
//
// NOTE(review): v[:10] panics when v is non-empty but shorter than 10 bytes;
// a length check before slicing would avoid that.
// NOTE(review): presumably v is an HTTP Last-Modified header value; the
// caller is not visible here, so confirm.
func (f *File) applyLastModified(v string) { func (f *File) applyLastModified(v string) {
if v == "" { if v == "" {
return return
} }
var t time.Time
var err error var err error
f.MTime, err = time.Parse(time.RFC1123, v) t, err = time.Parse(time.RFC1123, v)
if err == nil { if err == nil {
f.MTime = t.Unix()
return return
} }
f.MTime, err = time.Parse(time.RFC850, v) t, err = time.Parse(time.RFC850, v)
if err == nil { if err == nil {
f.MTime = t.Unix()
return return
} }
// TODO Parse asctime // TODO Parse asctime
// Last resort: treat the first 10 bytes as a bare date.
f.MTime, err = time.Parse("2006-01-02", v[:10]) t, err = time.Parse("2006-01-02", v[:10])
if err == nil { if err == nil {
f.MTime = t.Unix()
return return
} }
} }

View File

@ -16,6 +16,7 @@ package redblackhash
import ( import (
"bytes" "bytes"
"fmt" "fmt"
"sync"
) )
const ( const (
@ -28,6 +29,7 @@ type Key [KeySize]byte
// Tree holds elements of the red-black tree // Tree holds elements of the red-black tree
type Tree struct { type Tree struct {
// Embedded mutex, present only on the new side of this diff.
// OD.LoadOrStoreKey takes this lock around its Get call; whether the tree's
// own methods lock it is not visible here.
sync.Mutex
// Root is the exported pointer to the root node.
Root *Node Root *Node
// size is unexported; presumably the number of stored elements - TODO confirm.
size int size int
} }

17
main.go
View File

@ -83,11 +83,7 @@ func cmdBase(clic *cli.Context) error {
time.Sleep(30 * time.Second) time.Sleep(30 * time.Second)
continue continue
} }
globalWait.Add(1) ScheduleTask(inRemotes, t, &baseUri)
inRemotes <- &OD {
Task: t,
BaseUri: baseUri,
}
} }
} }
@ -122,14 +118,11 @@ func cmdCrawler(clic *cli.Context) error {
ticker := time.NewTicker(3 * time.Second) ticker := time.NewTicker(3 * time.Second)
defer ticker.Stop() defer ticker.Stop()
globalWait.Add(1) task := Task {
inRemotes <- &OD { WebsiteId: 0,
Task: &Task{ Url: u.String(),
WebsiteId: 0,
Url: u.String(),
},
BaseUri: u,
} }
ScheduleTask(inRemotes, &task, &u)
// Wait for all jobs to finish // Wait for all jobs to finish
globalWait.Wait() globalWait.Wait()

View File

@ -7,6 +7,21 @@ import (
"time" "time"
) )
// Task identifies one crawl assignment: a website ID plus a URL string.
// ScheduleTask copies a Task by value into an OD. PushResult treats
// WebsiteId == 0 as "not a real result" and skips the push; cmdCrawler builds
// its ad-hoc task with WebsiteId 0.
type Task struct {
// WebsiteId is uint64 here; the definition removed by this commit used int.
WebsiteId uint64 `json:"website_id"`
// Url is stored as a plain string (cmdCrawler fills it from u.String()).
Url string `json:"url"`
}
// TaskResult is the per-task summary that ScheduleTask initialises, Watch
// completes, and Watch then passes to PushResult. uploadResult serialises a
// *TaskResult with json.Marshal, so the json tags below define the wire form.
type TaskResult struct {
// StatusCode is set by Watch to "empty", "directory listing failed" or "success".
StatusCode string `json:"status_code"`
// FileCount is incremented with atomic.AddUint64 in DoJob and read with
// atomic.LoadUint64 in Watch.
FileCount uint64 `json:"file_count"`
// ErrorCount is read atomically in Watch to tell "empty" apart from
// "directory listing failed". It is excluded from JSON. The code that
// increments it is not visible in this diff.
ErrorCount uint64 `json:"-"`
// StartTime is kept for the duration log field in Watch; excluded from JSON.
StartTime time.Time `json:"-"`
// StartTimeUnix and EndTimeUnix hold time.Now().Unix() values (Unix seconds),
// set in ScheduleTask and Watch respectively.
StartTimeUnix int64 `json:"start_time"`
EndTimeUnix int64 `json:"end_time"`
// WebsiteId is copied from the Task in ScheduleTask.
WebsiteId uint64 `json:"website_id"`
}
type Job struct { type Job struct {
OD *OD OD *OD
Uri fasturl.URL Uri fasturl.URL
@ -16,26 +31,25 @@ type Job struct {
} }
// OD bundles the state of one scheduled crawl. In this side-by-side diff the
// left column is the previous definition and the right column the new one.
//
// NOTE(review): the struct holds a sync.WaitGroup and, through Scanned, a
// mutex, so it should only be passed by pointer. ScheduleTask does send *OD.
type OD struct { type OD struct {
// Task changes from a pointer to a value; ScheduleTask stores a copy (*t).
Task *Task Task Task
// Result is new in this commit. It is filled in by ScheduleTask, DoJob
// (FileCount) and Watch.
Result TaskResult
// Wait is what Watch blocks on (o.Wait.Wait()) before finishing the task.
Wait sync.WaitGroup Wait sync.WaitGroup
// BaseUri is copied from *u in ScheduleTask and logged by Watch.
BaseUri fasturl.URL BaseUri fasturl.URL
// WCtx is the worker context; Watch closes WCtx.in once Wait returns.
WCtx WorkerContext WCtx WorkerContext
// Scanned is the key set consulted by LoadOrStoreKey.
Scanned redblackhash.Tree Scanned redblackhash.Tree
// Old side only: this field is removed. LoadOrStoreKey now locks the mutex
// embedded in Scanned instead.
lock sync.Mutex
} }
// File is one crawled entry as it is serialised to JSON.
type File struct { type File struct {
Name string `json:"name"` Name string `json:"name"`
// Size is assigned by applyContentLength.
Size int64 `json:"size"` Size int64 `json:"size"`
// MTime changes from time.Time (left) to int64 (right); applyLastModified
// now stores t.Unix(), i.e. Unix seconds.
MTime time.Time `json:"mtime"` MTime int64 `json:"mtime"`
Path string `json:"path"` Path string `json:"path"`
// IsDir is excluded from the JSON output.
IsDir bool `json:"-"` IsDir bool `json:"-"`
} }
func (o *OD) LoadOrStoreKey(k *redblackhash.Key) (exists bool) { func (o *OD) LoadOrStoreKey(k *redblackhash.Key) (exists bool) {
o.lock.Lock() o.Scanned.Lock()
defer o.lock.Unlock() defer o.Scanned.Unlock()
exists = o.Scanned.Get(k) exists = o.Scanned.Get(k)
if exists { return true } if exists { return true }

View File

@ -8,6 +8,7 @@ import (
"github.com/terorie/od-database-crawler/fasturl" "github.com/terorie/od-database-crawler/fasturl"
"os" "os"
"path" "path"
"sync"
"sync/atomic" "sync/atomic"
"time" "time"
) )
@ -55,28 +56,80 @@ func Schedule(c context.Context, remotes <-chan *OD) {
} }
} }
// The diff view fuses two different functions in this hunk. The left column
// is the previous body of (*OD).Watch. The right column is the new
// ScheduleTask, documented here.
//
// ScheduleTask registers one unit of work on globalWait, builds an OD from a
// copy of *t and *u, stamps Result with the task's WebsiteId and the current
// time (both as time.Time and as Unix seconds), and sends the OD on remotes.
// The send blocks until a receiver takes it, unless remotes is buffered.
//
// The matching globalWait.Done() is at the end of the new Watch.
//
// NOTE(review): t and u are dereferenced unconditionally; a nil pointer for
// either one panics.
func (r *OD) Watch(results chan File) { func ScheduleTask(remotes chan<- *OD, t *Task, u *fasturl.URL) {
go r.Task.Collect(results) globalWait.Add(1)
now := time.Now()
// Wait for all jobs on remote to finish od := &OD {
r.Wait.Wait() Task: *t,
close(r.WCtx.in) BaseUri: *u,
atomic.AddInt32(&activeTasks, -1) Result: TaskResult {
WebsiteId: t.WebsiteId,
logrus.WithField("url", r.BaseUri.String()). StartTime: now,
Info("Crawler finished") StartTimeUnix: now.Unix(),
},
globalWait.Done() }
remotes <- od
close(results)
} }
// The left column of the first line is the old Collect signature. Every
// other line in this block belongs to the new (*OD).Watch on the right.
//
// Watch supervises one crawl to completion:
//   1. starts Collect in a goroutine, handing it a locked mutex as a
//      "done" signal;
//   2. waits for all jobs (o.Wait), closes the worker input channel and
//      decrements activeTasks;
//   3. logs the finish with URL and duration;
//   4. records the end time and derives Result.StatusCode from the file and
//      error counters;
//   5. closes results so Collect terminates, then waits for Collect to
//      release the mutex;
//   6. calls PushResult and logs any error, then calls globalWait.Done().
//
// NOTE(review): fileLock is locked here and unlocked in the Collect
// goroutine. sync.Mutex permits unlocking from another goroutine, so this
// works, but a channel or sync.WaitGroup is the more conventional
// completion signal.
func (t *Task) Collect(results chan File) { func (o *OD) Watch(results chan File) {
// Wait for the file to be fully written
var fileLock sync.Mutex
fileLock.Lock()
go o.Task.Collect(results, &fileLock)
// Wait for all jobs on remote to finish
o.Wait.Wait()
close(o.WCtx.in)
atomic.AddInt32(&activeTasks, -1)
// Log finish
logrus.
WithField("url", o.BaseUri.String()).
WithField("duration", time.Since(o.Result.StartTime)).
Info("Crawler finished")
// Set status code
now := time.Now()
o.Result.EndTimeUnix = now.Unix()
fileCount := atomic.LoadUint64(&o.Result.FileCount)
if fileCount == 0 {
// No files at all: distinguish a genuinely empty listing from a failed one.
errorCount := atomic.LoadUint64(&o.Result.ErrorCount)
if errorCount == 0 {
o.Result.StatusCode = "empty"
} else {
o.Result.StatusCode = "directory listing failed"
}
} else {
o.Result.StatusCode = "success"
}
// Shut down Collect()
close(results)
// Wait for results to sync to file
// This second Lock blocks until Collect calls done.Unlock().
fileLock.Lock()
fileLock.Unlock()
// Upload results
err := PushResult(&o.Result)
if err != nil {
logrus.WithError(err).
Error("Failed uploading result")
}
// Mark job as completely done
globalWait.Done()
}
// Collect runs t.collect(results), logs any error it returns, and then
// unlocks done. The caller (Watch) locks done beforehand and blocks on a
// second Lock, so the Unlock here is what tells it that collecting finished.
// The first line shows only the new signature; the old one took no done
// parameter.
//
// NOTE(review): done.Unlock() is not deferred, so a panic inside collect
// would leave the waiting caller blocked.
func (t *Task) Collect(results chan File, done *sync.Mutex) {
err := t.collect(results) err := t.collect(results)
if err != nil { if err != nil {
logrus.WithError(err). logrus.WithError(err).
Error("Failed saving crawl results") Error("Failed saving crawl results")
} }
done.Unlock()
} }
func (t *Task) collect(results chan File) error { func (t *Task) collect(results chan File) error {

View File

@ -12,7 +12,6 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"strconv" "strconv"
"strings"
) )
const ( const (
@ -31,7 +30,7 @@ func FetchTask() (t *Task, err error) {
switch res.StatusCode { switch res.StatusCode {
case 200: case 200:
break break
case 500: case 404, 500:
return nil, nil return nil, nil
default: default:
return nil, fmt.Errorf("http %s", res.Status) return nil, fmt.Errorf("http %s", res.Status)
@ -45,6 +44,11 @@ func FetchTask() (t *Task, err error) {
} }
func PushResult(result *TaskResult) (err error) { func PushResult(result *TaskResult) (err error) {
if result.WebsiteId == 0 {
// Not a real result, don't push
return nil
}
filePath := filepath.Join( filePath := filepath.Join(
".", "crawled", ".", "crawled",
fmt.Sprintf("%d.json", result.WebsiteId)) fmt.Sprintf("%d.json", result.WebsiteId))
@ -83,34 +87,41 @@ func PushResult(result *TaskResult) (err error) {
return return
} }
// uploadChunks sends the contents of f to config.ServerUrl + "/task/upload"
// as a series of multipart/form-data POSTs. Each request carries the fields
// "token" and "website_id" plus a "file_list" part holding up to
// fileListChunkSize bytes read from f. Left column: previous code; right
// column: new code. The lines between serverClient.Do and the final log call
// are elided by the diff view, so response handling is not visible here.
//
// NOTE(review), new side: io.CopyN returns err == nil exactly when it copied
// the full chunk. The guard `if err != io.EOF { return err }` therefore
// returns nil after the first full chunk, before any request is sent. Only
// a short final read reaches the upload. The intended check is presumably
// `err != nil && err != io.EOF`.
// NOTE(review): the error from multi.CreateFormFile is overwritten by the
// CopyN result without being checked.
// NOTE(review), new side: req.Header.Set runs before the err check on
// http.NewRequest; if NewRequest fails, req is nil and this line panics.
// NOTE(review): the error returned by multi.Close() is ignored.
func uploadChunks(websiteId uint64, f *os.File) (err error) { func uploadChunks(websiteId uint64, f *os.File) error {
for iter := 1; iter > 0; iter++ { eof := false
for iter := 1; !eof; iter++ {
// TODO Stream with io.Pipe? // TODO Stream with io.Pipe?
var b bytes.Buffer var b bytes.Buffer
multi := multipart.NewWriter(&b) multi := multipart.NewWriter(&b)
// Set upload fields // Set upload fields
var err error
err = multi.WriteField("token", config.Token) err = multi.WriteField("token", config.Token)
if err != nil { return } if err != nil { return err }
err = multi.WriteField("website_id", fmt.Sprintf("%d", websiteId)) err = multi.WriteField("website_id", fmt.Sprintf("%d", websiteId))
if err != nil { return } if err != nil { return err }
// Copy chunk to file_list // Copy chunk to file_list
formFile, err := multi.CreateFormFile("file_list", "file_list") formFile, err := multi.CreateFormFile("file_list", "file_list")
_, err = io.CopyN(formFile, f, fileListChunkSize) var n int64
if err == io.EOF { n, err = io.CopyN(formFile, f, fileListChunkSize)
break if err != io.EOF {
} else if err == io.ErrUnexpectedEOF { return err
}
if n < fileListChunkSize {
err = nil err = nil
// Break at end of iteration // Break at end of iteration
iter = -420 eof = true
} }
// New side: the multipart writer is closed before the request is built.
multi.Close()
req, err := http.NewRequest( req, err := http.NewRequest(
http.MethodPost, http.MethodPost,
config.ServerUrl + "/task/upload", config.ServerUrl + "/task/upload",
&b) &b)
req.Header.Set("content-type", multi.FormDataContentType())
if err != nil { return err } if err != nil { return err }
res, err := serverClient.Do(req) res, err := serverClient.Do(req)
@ -125,49 +136,38 @@ func uploadChunks(websiteId uint64, f *os.File) (err error) {
logrus.Infof("Uploading file list part %d: %s", logrus.Infof("Uploading file list part %d: %s",
iter, res.Status) iter, res.Status)
} }
return return nil
} }
// uploadResult JSON-encodes result and POSTs it, together with config.Token,
// as form values "token" and "result" to config.ServerUrl + "/task/complete".
// It returns the transport error, or an error built from res.Status when the
// response is not 200 OK.
//
// Left column: the previous code built the request by hand with
// http.NewRequest and serverClient.Do. Right column: the new code uses
// serverClient.PostForm. The error text also changes: the old message said
// "failed to cancel task", the new one carries only the status.
//
// A json.Marshal failure panics rather than being returned.
func uploadResult(result *TaskResult) (err error) { func uploadResult(result *TaskResult) (err error) {
resultEnc, err := json.Marshal(result) resultEnc, err := json.Marshal(result)
if err != nil { panic(err) } if err != nil { panic(err) }
payload := url.Values { res, err := serverClient.PostForm(
"token": {config.Token},
"result": {string(resultEnc)},
}.Encode()
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/complete", config.ServerUrl + "/task/complete",
strings.NewReader(payload)) url.Values {
if err != nil { return } "token": {config.Token},
"result": {string(resultEnc)},
res, err := serverClient.Do(req) },
)
if err != nil { return } if err != nil { return }
// The body is closed immediately; only the status line is inspected.
res.Body.Close() res.Body.Close()
if res.StatusCode != http.StatusOK { if res.StatusCode != http.StatusOK {
return fmt.Errorf("failed to cancel task: %s", res.Status) return fmt.Errorf("%s", res.Status)
} }
return return
} }
func CancelTask(websiteId uint64) (err error) { func CancelTask(websiteId uint64) (err error) {
form := url.Values{ res, err := serverClient.PostForm(
"token": {config.Token},
"website_id": {strconv.FormatUint(websiteId, 10)},
}
encForm := form.Encode()
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/cancel", config.ServerUrl + "/task/cancel",
strings.NewReader(encForm)) url.Values{
if err != nil { return } "token": {config.Token},
"website_id": {strconv.FormatUint(websiteId, 10)},
res, err := serverClient.Do(req) },
)
if err != nil { return } if err != nil { return }
res.Body.Close() res.Body.Close()

View File

@ -57,7 +57,7 @@ func Stats(c context.Context) {
runtime.ReadMemStats(&mem) runtime.ReadMemStats(&mem)
logrus.WithFields(logrus.Fields{ logrus.WithFields(logrus.Fields{
"queue_count": totalBuffered, "queue_count": atomic.LoadInt64(&totalBuffered),
"heap": FormatByteCount(mem.Alloc), "heap": FormatByteCount(mem.Alloc),
"objects": mem.HeapObjects, "objects": mem.HeapObjects,
"num_gc": mem.NumGC, "num_gc": mem.NumGC,

View File

@ -1,16 +0,0 @@
package main
import "time"
// Task as it was defined in the file this commit deletes (the hunk header
// reads -1,16 +0,0). The replacement definition elsewhere in this diff
// declares WebsiteId as uint64 instead of int.
type Task struct {
WebsiteId int `json:"website_id"`
Url string `json:"url"`
}
// TaskResult as it was defined in the file this commit deletes. Compared with
// the replacement definition elsewhere in this diff: StatusCode was an int
// (now a string), and start_time/end_time were serialised from time.Time
// fields (now int64 Unix-second fields).
type TaskResult struct {
StatusCode int `json:"status_code"`
FileCount uint64 `json:"file_count"`
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
WebsiteId uint64 `json:"website_id"`
}

View File

@ -43,6 +43,7 @@ func (w WorkerContext) step(results chan<- File, job Job) {
if httpErr, ok := err.(HttpError); ok { if httpErr, ok := err.(HttpError); ok {
switch httpErr.code { switch httpErr.code {
case case
fasthttp.StatusFound,
fasthttp.StatusUnauthorized, fasthttp.StatusUnauthorized,
fasthttp.StatusForbidden, fasthttp.StatusForbidden,
fasthttp.StatusNotFound: fasthttp.StatusNotFound:
@ -137,6 +138,7 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
Error("Failed getting file") Error("Failed getting file")
return nil, err return nil, err
} }
atomic.AddUint64(&job.OD.Result.FileCount, 1)
} }
return return
} }