mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-18 18:06:45 +00:00
Fix file uploads
This commit is contained in:
parent
24ee6fcba2
commit
7b29da9340
29
scheduler.go
29
scheduler.go
@ -72,11 +72,22 @@ func ScheduleTask(remotes chan<- *OD, t *Task, u *fasturl.URL) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (o *OD) Watch(results chan File) {
|
func (o *OD) Watch(results chan File) {
|
||||||
|
filePath := path.Join("crawled", fmt.Sprintf("%d.json", o.Task.WebsiteId))
|
||||||
|
// Open crawl results file
|
||||||
|
f, err := os.OpenFile(
|
||||||
|
filePath,
|
||||||
|
os.O_CREATE | os.O_RDWR | os.O_TRUNC,
|
||||||
|
0644,
|
||||||
|
)
|
||||||
|
if err != nil { return }
|
||||||
|
defer f.Close()
|
||||||
|
defer os.Remove(filePath)
|
||||||
|
|
||||||
// Wait for the file to be fully written
|
// Wait for the file to be fully written
|
||||||
var fileLock sync.Mutex
|
var fileLock sync.Mutex
|
||||||
fileLock.Lock()
|
fileLock.Lock()
|
||||||
|
|
||||||
go o.Task.Collect(results, &fileLock)
|
go o.Task.Collect(results, f, &fileLock)
|
||||||
|
|
||||||
// Wait for all jobs on remote to finish
|
// Wait for all jobs on remote to finish
|
||||||
o.Wait.Wait()
|
o.Wait.Wait()
|
||||||
@ -113,7 +124,7 @@ func (o *OD) Watch(results chan File) {
|
|||||||
fileLock.Unlock()
|
fileLock.Unlock()
|
||||||
|
|
||||||
// Upload results
|
// Upload results
|
||||||
err := PushResult(&o.Result)
|
err = PushResult(&o.Result, f)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logrus.WithError(err).
|
logrus.WithError(err).
|
||||||
Error("Failed uploading result")
|
Error("Failed uploading result")
|
||||||
@ -123,8 +134,8 @@ func (o *OD) Watch(results chan File) {
|
|||||||
globalWait.Done()
|
globalWait.Done()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *Task) Collect(results chan File, done *sync.Mutex) {
|
func (t *Task) Collect(results chan File, f *os.File, done *sync.Mutex) {
|
||||||
err := t.collect(results)
|
err := t.collect(results, f)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logrus.WithError(err).
|
logrus.WithError(err).
|
||||||
Error("Failed saving crawl results")
|
Error("Failed saving crawl results")
|
||||||
@ -132,18 +143,10 @@ func (t *Task) Collect(results chan File, done *sync.Mutex) {
|
|||||||
done.Unlock()
|
done.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *Task) collect(results chan File) error {
|
func (t *Task) collect(results chan File, f *os.File) error {
|
||||||
err := os.MkdirAll("crawled", 0755)
|
err := os.MkdirAll("crawled", 0755)
|
||||||
if err != nil { return err }
|
if err != nil { return err }
|
||||||
|
|
||||||
f, err := os.OpenFile(
|
|
||||||
path.Join("crawled", fmt.Sprintf("%d.json", t.WebsiteId)),
|
|
||||||
os.O_CREATE | os.O_WRONLY | os.O_TRUNC,
|
|
||||||
0755,
|
|
||||||
)
|
|
||||||
if err != nil { return err }
|
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
for result := range results {
|
for result := range results {
|
||||||
result.Path = fasturl.PathUnescape(result.Path)
|
result.Path = fasturl.PathUnescape(result.Path)
|
||||||
result.Name = fasturl.PathUnescape(result.Name)
|
result.Name = fasturl.PathUnescape(result.Name)
|
||||||
|
18
server.go
18
server.go
@ -10,7 +10,6 @@ import (
|
|||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -43,26 +42,17 @@ func FetchTask() (t *Task, err error) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
func PushResult(result *TaskResult) (err error) {
|
func PushResult(result *TaskResult, f *os.File) (err error) {
|
||||||
if result.WebsiteId == 0 {
|
if result.WebsiteId == 0 {
|
||||||
// Not a real result, don't push
|
// Not a real result, don't push
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
filePath := filepath.Join(
|
// Rewind to the beginning of the file
|
||||||
".", "crawled",
|
_, err = f.Seek(0, 0)
|
||||||
fmt.Sprintf("%d.json", result.WebsiteId))
|
if err != nil {
|
||||||
|
|
||||||
defer os.Remove(filePath)
|
|
||||||
|
|
||||||
f, err := os.Open(filePath)
|
|
||||||
if os.IsNotExist(err) {
|
|
||||||
err = fmt.Errorf("cannot upload result: %s does not exist", filePath)
|
|
||||||
return
|
|
||||||
} else if err != nil {
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
err = uploadChunks(result.WebsiteId, f)
|
err = uploadChunks(result.WebsiteId, f)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user