34 Commits

Author SHA1 Message Date
Richard Patel
1625d6c888 Bump to v1.0.2 2018-11-18 18:53:57 +01:00
Richard Patel
03a487f393 Fix crawl loop 2018-11-18 18:45:06 +01:00
Richard Patel
ac8221b109 Retry /task/upload 2018-11-18 18:33:26 +01:00
Richard Patel
8ed2cf3b93 Bump to v1.0.1 2018-11-18 14:49:07 +01:00
Richard Patel
f3620262fc Add log file support 2018-11-18 14:46:52 +01:00
Richard Patel
dc4e4212a0 Add freebsd to release.sh 2018-11-18 14:38:18 +01:00
Richard Patel
6e6a4edd27 Ignore all HTTP errors 2018-11-18 14:25:06 +01:00
Richard Patel
a71157b4d8 Add User-Agent parameter 2018-11-18 14:24:04 +01:00
Richard Patel
6dbec8c789 Add release script 2018-11-18 02:36:22 +01:00
Richard Patel
605f6db5a5 Don't call /task/upload for websites with no results 2018-11-18 01:42:57 +01:00
Richard Patel
d593ba2d0b Bump to 1.0 2018-11-18 00:54:58 +01:00
Richard Patel
6793086c22 Ignore HTTPS errors 2018-11-18 00:37:30 +01:00
Richard Patel
4464f34779 Add recheck and timeout parameters 2018-11-18 00:29:29 +01:00
Richard Patel
339175220d Refactor uploading & chunk size parameter 2018-11-18 00:19:43 +01:00
Richard Patel
1e6687c519 Upload result ignoring errors 2018-11-17 15:04:20 +01:00
Richard Patel
8060556089 Fix: make crawled dir 2018-11-17 13:36:35 +01:00
Richard Patel
73ba848e17 Grammar 2018-11-17 13:35:29 +01:00
Richard Patel
115983f70e Silent HTTP errors 2018-11-17 13:22:46 +01:00
Richard Patel
9210996b4c Fix multiple part file upload 2018-11-17 12:52:24 +01:00
Richard Patel
7b29da9340 Fix file uploads 2018-11-17 12:47:16 +01:00
Richard Patel
24ee6fcba2 Quickfix: Revert FTP give back 2018-11-17 12:43:30 +01:00
Richard Patel
bfb18d62b2 mini fix 2018-11-17 05:27:09 +01:00
Richard Patel
f4054441ab Return FTP tasks 2018-11-17 05:07:52 +01:00
Richard Patel
f8d2bf386d Fix FTP error ignore 2018-11-17 04:57:19 +01:00
Richard Patel
f41198b00c Ignore FTP URLs 2018-11-17 04:50:59 +01:00
Richard Patel
7fdffff58f Update config.yml 2018-11-17 04:19:04 +01:00
Richard Patel
d596882b40 Fix ton of bugs 2018-11-17 04:18:22 +01:00
Richard Patel
0fe97a8058 Update README.md 2018-11-17 01:36:07 +01:00
Richard Patel
718f9d7fbc Rename project 2018-11-17 01:33:15 +01:00
Richard Patel
f1687679ab Unescape results & don't recrawl 404 2018-11-17 01:21:20 +01:00
Richard Patel
145d37f84a Fix wait, add back crawl command 2018-11-17 00:49:09 +01:00
Richard Patel
cc777bcaeb redblackhash: Use bytes.Compare 2018-11-16 21:17:39 +01:00
Simon
1e78cea7e7 Saved path should not contain file name 2018-11-16 13:58:12 -05:00
Richard Patel
3f85cf679b Getting tasks 2018-11-16 04:47:08 +01:00
18 changed files with 662 additions and 309 deletions

BIN
.github/stress.png vendored

Binary file not shown.

Before

Width:  |  Height:  |  Size: 369 KiB

2
.gitignore vendored
View File

@@ -1,3 +1,3 @@
/.idea/
.DS_Store
/oddb-go
/od-database-crawler

View File

@@ -1,12 +1,7 @@
# oddb Go crawler 🚀
# od-database Go crawler 🚀
> by terorie 2018 :P
* Crawler for [__OD-Database__](https://github.com/simon987/od-database)
* Crawls HTTP open directories (standard Web Server Listings)
* Gets name, path, size and modification time of all files
* Soon: Will work as a crawler for [OD-Database](https://github.com/simon987/od-database)!
Stress test crawling [pandoradir](https://github.com/terorie/pandoradir)
on an average laptop (~10K requests per second, 4 connections):
![image](.github/stress.png)
Memory usage is being optimized :P
* Lightweight and fast: __over 9000 requests per second__ on a standard laptop

View File

@@ -1,45 +1,64 @@
package main
import (
"bufio"
"fmt"
"github.com/sirupsen/logrus"
"github.com/spf13/viper"
"io"
"os"
"strings"
"time"
)
var config struct {
ServerUrl string
Token string
ServerTimeout time.Duration
Recheck time.Duration
ChunkSize int64
Retries int
Workers int
UserAgent string
Timeout time.Duration
Tasks int32
CrawlStats time.Duration
AllocStats time.Duration
Verbose bool
PrintHTTP bool
}
const (
ConfServerUrl = "server.url"
ConfToken = "server.token"
ConfServerTimeout = "server.timeout"
ConfRecheck = "server.recheck"
ConfChunkSize = "server.upload_chunk"
ConfTasks = "crawl.tasks"
ConfRetries = "crawl.retries"
ConfWorkers = "crawl.connections"
ConfUserAgent = "crawl.user-agent"
ConfTimeout = "crawl.timeout"
ConfCrawlStats = "output.crawl_stats"
ConfAllocStats = "output.resource_stats"
ConfVerbose = "output.verbose"
ConfPrintHTTP = "output.http"
ConfLogFile = "output.log"
)
func prepareConfig() {
viper.SetDefault(ConfRetries, 5)
viper.SetDefault(ConfWorkers, 2)
viper.SetDefault(ConfTasks, 3)
viper.SetDefault(ConfUserAgent, "")
viper.SetDefault(ConfTimeout, 10 * time.Second)
viper.SetDefault(ConfCrawlStats, 3 * time.Second)
viper.SetDefault(ConfAllocStats, 0)
viper.SetDefault(ConfVerbose, false)
viper.SetDefault(ConfPrintHTTP, false)
viper.SetDefault(ConfLogFile, "")
viper.SetDefault(ConfRecheck, 3 * time.Second)
viper.SetDefault(ConfChunkSize, "1 MB")
}
func readConfig() {
@@ -52,14 +71,24 @@ func readConfig() {
}
config.ServerUrl = viper.GetString(ConfServerUrl)
//if config.ServerUrl == "" {
// configMissing(ConfServerUrl)
//}
if config.ServerUrl == "" {
configMissing(ConfServerUrl)
}
config.ServerUrl = strings.TrimRight(config.ServerUrl, "/")
config.Token = viper.GetString(ConfToken)
//if config.Token == "" {
// configMissing(ConfToken)
//}
if config.Token == "" {
configMissing(ConfToken)
}
config.ServerTimeout = viper.GetDuration(ConfServerTimeout)
config.Recheck = viper.GetDuration(ConfRecheck)
config.ChunkSize = int64(viper.GetSizeInBytes(ConfChunkSize))
if config.ChunkSize < 100 {
configOOB(ConfChunkSize, config.ChunkSize)
}
config.Retries = viper.GetInt(ConfRetries)
if config.Retries < 0 {
@@ -76,6 +105,8 @@ func readConfig() {
configOOB(ConfTasks, int(config.Tasks))
}
config.UserAgent = viper.GetString(ConfUserAgent)
config.Timeout = viper.GetDuration(ConfTimeout)
config.CrawlStats = viper.GetDuration(ConfCrawlStats)
@@ -86,6 +117,19 @@ func readConfig() {
if config.Verbose {
logrus.SetLevel(logrus.DebugLevel)
}
if filePath := viper.GetString(ConfLogFile); filePath != "" {
f, err := os.OpenFile(filePath, os.O_CREATE | os.O_WRONLY | os.O_APPEND, 0644)
bufWriter := bufio.NewWriter(f)
if err != nil { panic(err) }
exitHooks.Add(func() {
bufWriter.Flush()
f.Close()
})
logrus.SetOutput(io.MultiWriter(os.Stdout, bufWriter))
}
config.PrintHTTP = viper.GetBool(ConfPrintHTTP)
}
func configMissing(key string) {
@@ -93,7 +137,7 @@ func configMissing(key string) {
os.Exit(1)
}
func configOOB(key string, v int) {
fmt.Fprintf(os.Stderr, "config: illegal value %d for %key!\n", v, key)
func configOOB(key string, v interface{}) {
fmt.Fprintf(os.Stderr, "config: illegal value %v for key %s!\n", v, key)
os.Exit(1)
}

View File

@@ -1,28 +1,60 @@
# OD-Database server settings
server:
# Connection URL
url: localhost:6969
url: http://od-db.mine.terorie.com/api
# Server auth token
token:
# Request timeout
timeout: 60s
# Recheck interval
# The crawler periodically asks the server
# for new jobs. Sets the minimum wait time
# between /task/get requests to the server.
recheck: 1s
# Upload chunk size
# If the value is too high, the upload fails.
upload_chunk: 1 MB
# Log output settings
output:
# Crawl statistics
crawl_stats: 1s
# CPU/RAM/Job queue stats
resource_stats: 1s
resource_stats: 10s
# More output? (Every listed dir)
verbose: false
# Print HTTP errors (Super spammy)
http: false
# Log file
# If empty, no log file is created.
log: crawler.log
# Crawler settings
crawl:
# Number of sites that can be
# processed at once
tasks: 3
# Number of sites that can be processed at once
tasks: 100
# Number of connections per site
connections: 2
# Please be careful with this setting!
# The crawler fires fast and more than
# ten connections can overwhelm a server.
connections: 10
# How often to retry getting data
# from the site before giving up
retries: 5
# Time before discarding a network request
timeout: 10s
# Crawler User-Agent
# If empty, no User-Agent header is sent.
user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"

View File

@@ -2,9 +2,9 @@ package main
import (
"bytes"
"fmt"
"github.com/terorie/oddb-go/ds/redblackhash"
"github.com/terorie/oddb-go/fasturl"
"crypto/tls"
"github.com/terorie/od-database-crawler/ds/redblackhash"
"github.com/terorie/od-database-crawler/fasturl"
"github.com/valyala/fasthttp"
"golang.org/x/crypto/blake2b"
"golang.org/x/net/html"
@@ -14,13 +14,20 @@ import (
"time"
)
var client fasthttp.Client
var client = fasthttp.Client {
TLSConfig: &tls.Config{
InsecureSkipVerify: true,
},
}
func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
f.IsDir = true
f.Name = path.Base(j.Uri.Path)
req := fasthttp.AcquireRequest()
if config.UserAgent != "" {
req.Header.SetUserAgent(config.UserAgent)
}
req.SetRequestURI(j.UriStr)
res := fasthttp.AcquireResponse()
@@ -29,16 +36,22 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
err = client.DoTimeout(req, res, config.Timeout)
fasthttp.ReleaseRequest(req)
if err != nil { return }
if err != nil {
return
}
err = checkStatusCode(res.StatusCode())
if err != nil { return }
if err != nil {
return
}
body := res.Body()
doc := html.NewTokenizer(bytes.NewReader(body))
var linkHref string
for {
err = nil
tokenType := doc.Next()
if tokenType == html.ErrorToken {
break
@@ -69,21 +82,23 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
linkHref = ""
if strings.LastIndexByte(href, '?') != -1 {
goto nextToken
continue
}
switch href {
case "", " ", ".", "..", "/":
goto nextToken
continue
}
if strings.Contains(href, "../") {
goto nextToken
continue
}
var link fasturl.URL
err = j.Uri.ParseRel(&link, href)
if err != nil { continue }
if err != nil {
continue
}
if link.Scheme != j.Uri.Scheme ||
link.Host != j.Uri.Host ||
@@ -95,8 +110,6 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
links = append(links, link)
}
}
nextToken:
}
return
@@ -106,10 +119,13 @@ func GetFile(u fasturl.URL, f *File) (err error) {
f.IsDir = false
u.Path = path.Clean(u.Path)
f.Name = path.Base(u.Path)
f.Path = strings.Trim(u.Path, "/")
f.Path = strings.Trim(path.Dir(u.Path), "/")
req := fasthttp.AcquireRequest()
req.Header.SetMethod("HEAD")
if config.UserAgent != "" {
req.Header.SetUserAgent(config.UserAgent)
}
req.SetRequestURI(u.String())
res := fasthttp.AcquireResponse()
@@ -119,10 +135,14 @@ func GetFile(u fasturl.URL, f *File) (err error) {
err = client.DoTimeout(req, res, config.Timeout)
fasthttp.ReleaseRequest(req)
if err != nil { return }
if err != nil {
return
}
err = checkStatusCode(res.StatusCode())
if err != nil { return }
if err != nil {
return
}
f.applyContentLength(string(res.Header.Peek("content-length")))
f.applyLastModified(string(res.Header.Peek("last-modified")))
@@ -143,38 +163,49 @@ func (f *File) HashDir(links []fasturl.URL) (o redblackhash.Key) {
}
func (f *File) applyContentLength(v string) {
if v == "" { return }
if v == "" {
return
}
size, err := strconv.ParseInt(v, 10, 64)
if err != nil { return }
if size < 0 { return }
if err != nil {
return
}
if size < 0 {
return
}
f.Size = size
}
// TODO Cleanup
func (f *File) applyLastModified(v string) {
if v == "" { return }
if v == "" {
return
}
var t time.Time
var err error
f.MTime, err = time.Parse(time.RFC1123, v)
if err == nil { return }
f.MTime, err = time.Parse(time.RFC850, v)
if err == nil { return }
t, err = time.Parse(time.RFC1123, v)
if err == nil {
f.MTime = t.Unix()
return
}
t, err = time.Parse(time.RFC850, v)
if err == nil {
f.MTime = t.Unix()
return
}
// TODO Parse asctime
f.MTime, err = time.Parse("2006-01-02", v[:10])
if err == nil { return }
t, err = time.Parse("2006-01-02", v[:10])
if err == nil {
f.MTime = t.Unix()
return
}
}
func checkStatusCode(status int) error {
switch status {
case fasthttp.StatusOK:
return nil
case fasthttp.StatusTooManyRequests:
return ErrRateLimit
case fasthttp.StatusForbidden,
fasthttp.StatusUnauthorized:
return ErrForbidden
default:
return fmt.Errorf("got HTTP status %d", status)
return &HttpError{status}
}
}

View File

@@ -14,7 +14,9 @@
package redblackhash
import (
"bytes"
"fmt"
"sync"
)
const (
@@ -27,6 +29,7 @@ type Key [KeySize]byte
// Tree holds elements of the red-black tree
type Tree struct {
sync.Mutex
Root *Node
size int
}
@@ -41,42 +44,7 @@ type Node struct {
}
func (k *Key) Compare(o *Key) int {
// TODO Assembly
/*for i := 0; i < KeySize / 8; i++ {
a := uint64(k[i+0] ) |
uint64(k[i+1] >> 8) |
uint64(k[i+2] >> 16) |
uint64(k[i+3] >> 24) |
uint64(k[i+4] >> 32) |
uint64(k[i+5] >> 40) |
uint64(k[i+6] >> 48) |
uint64(k[i+7] >> 56)
b := uint64(o[i+0] ) |
uint64(o[i+1] >> 8) |
uint64(o[i+2] >> 16) |
uint64(o[i+3] >> 24) |
uint64(o[i+4] >> 32) |
uint64(o[i+5] >> 40) |
uint64(o[i+6] >> 48) |
uint64(o[i+7] >> 56)
switch {
case a < b:
return -1
case a > b:
return 1
}
}*/
for i := 0; i < KeySize; i++ {
switch {
case k[i] < o[i]:
return -1
case k[i] > o[i]:
return 1
}
}
return 0
return bytes.Compare(k[:], o[:])
}
// Put inserts node into the tree.

View File

@@ -1,8 +1,17 @@
package main
import "errors"
import (
"errors"
"fmt"
)
var ErrRateLimit = errors.New("too many requests")
var ErrForbidden = errors.New("access denied")
var ErrKnown = errors.New("already crawled")
type HttpError struct {
code int
}
func (e HttpError) Error() string {
return fmt.Sprintf("http status %d", e.code)
}

View File

@@ -33,6 +33,8 @@ var Schemes = [SchemeCount]string {
"https",
}
var ErrUnknownScheme = errors.New("unknown protocol scheme")
// Error reports an error and the operation and URL that caused it.
type Error struct {
Op string
@@ -353,7 +355,7 @@ func getscheme(rawurl string) (scheme Scheme, path string, err error) {
case "https":
scheme = SchemeHTTPS
default:
return SchemeInvalid, "", errors.New("unknown protocol scheme")
return SchemeInvalid, "", ErrUnknownScheme
}
path = rawurl[i+1:]
@@ -811,3 +813,57 @@ func validUserinfo(s string) bool {
}
return true
}
func PathUnescape(s string) string {
newStr, err := pathUnescape(s)
if err != nil {
return s
} else {
return newStr
}
}
func pathUnescape(s string) (string, error) {
// Count %, check that they're well-formed.
n := 0
for i := 0; i < len(s); {
switch s[i] {
case '%':
n++
if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
s = s[i:]
if len(s) > 3 {
s = s[:3]
}
return "", EscapeError(s)
}
i += 3
default:
i++
}
}
if n == 0 {
return s, nil
}
t := make([]byte, len(s)-2*n)
j := 0
for i := 0; i < len(s); {
switch s[i] {
case '%':
t[j] = unhex(s[i+1])<<4 | unhex(s[i+2])
j++
i += 3
case '+':
t[j] = '+'
j++
i++
default:
t[j] = s[i]
j++
i++
}
}
return string(t), nil
}

157
main.go
View File

@@ -3,92 +3,141 @@ package main
import (
"context"
"github.com/sirupsen/logrus"
"github.com/terorie/oddb-go/fasturl"
"github.com/terorie/od-database-crawler/fasturl"
"github.com/urfave/cli"
"log"
"net/http"
_ "net/http/pprof"
"os"
"strings"
"sync/atomic"
"time"
)
var app = cli.App {
Name: "oddb-go",
Name: "od-database-crawler",
Usage: "OD-Database Go crawler",
Version: "0.2",
Version: "1.0.2",
BashComplete: cli.DefaultAppComplete,
Writer: os.Stdout,
Compiled: buildDate,
Action: cmdBase,
Commands: []cli.Command{
{
Name: "crawl",
Usage: "Crawl a list of URLs",
ArgsUsage: "[site, site, ...]",
ArgsUsage: "<site>",
Action: cmdCrawler,
},
},
After: func(i *cli.Context) error {
exitHooks.Execute()
return nil
},
}
var exitHooks Hooks
func init() {
prepareConfig()
}
func main() {
go func() {
log.Println(http.ListenAndServe("localhost:42069", nil))
}()
err := os.MkdirAll("crawled", 0755)
if err != nil {
panic(err)
}
readConfig()
app.Run(os.Args)
}
func cmdCrawler(clic *cli.Context) error {
readConfig()
if clic.NArg() == 0 {
cli.ShowCommandHelpAndExit(clic, "crawl", 1)
}
args := clic.Args()
remotes := make([]*OD, len(args))
for i, arg := range args {
// https://github.com/golang/go/issues/19779
if !strings.Contains(arg, "://") {
arg = "http://" + arg
}
var u fasturl.URL
err := u.Parse(arg)
if !strings.HasSuffix(u.Path, "/") {
u.Path += "/"
}
if err != nil { return err }
remotes[i] = &OD {
Task: &Task{
WebsiteId: 0,
Url: u.String(),
},
BaseUri: u,
}
}
c := context.Background()
func cmdBase(_ *cli.Context) error {
// TODO Graceful shutdown
appCtx := context.Background()
forceCtx := context.Background()
inRemotes := make(chan *OD)
go Schedule(c, inRemotes)
go Schedule(forceCtx, inRemotes)
for _, remote := range remotes {
globalWait.Add(1)
inRemotes <- remote
ticker := time.NewTicker(config.Recheck)
defer ticker.Stop()
for {
select {
case <-appCtx.Done():
return nil
case <-ticker.C:
t, err := FetchTask()
if err != nil {
logrus.WithError(err).
Error("Failed to get new task")
time.Sleep(30 * time.Second)
continue
}
if t == nil {
// No new task
if atomic.LoadInt32(&numActiveTasks) == 0 {
logrus.Info("Waiting …")
}
continue
}
var baseUri fasturl.URL
err = baseUri.Parse(t.Url)
if urlErr, ok := err.(*fasturl.Error); ok && urlErr.Err == fasturl.ErrUnknownScheme {
// Not an error
err = nil
// Give back task
//err2 := CancelTask(t.WebsiteId)
//if err2 != nil {
// logrus.Error(err2)
//}
continue
} else if err != nil {
logrus.WithError(err).
Error("Failed to get new task")
time.Sleep(30 * time.Second)
continue
}
ScheduleTask(inRemotes, t, &baseUri)
}
}
// Wait for all jobs to finish
globalWait.Wait()
logrus.Info("All dirs processed!")
return nil
}
var buildDate = time.Date(
2018, 11, 15,
23, 24, 0, 0,
time.UTC)
func cmdCrawler(clic *cli.Context) error {
if clic.NArg() != 1 {
cli.ShowCommandHelpAndExit(clic, "crawl", 1)
}
arg := clic.Args()[0]
// https://github.com/golang/go/issues/19779
if !strings.Contains(arg, "://") {
arg = "http://" + arg
}
var u fasturl.URL
err := u.Parse(arg)
if !strings.HasSuffix(u.Path, "/") {
u.Path += "/"
}
if err != nil { return err }
// TODO Graceful shutdown
forceCtx := context.Background()
inRemotes := make(chan *OD)
go Schedule(forceCtx, inRemotes)
ticker := time.NewTicker(3 * time.Second)
defer ticker.Stop()
task := Task {
WebsiteId: 0,
Url: u.String(),
}
ScheduleTask(inRemotes, &task, &u)
// Wait for all jobs to finish
globalWait.Wait()
return nil
}

View File

@@ -1,12 +1,27 @@
package main
import (
"github.com/terorie/oddb-go/ds/redblackhash"
"github.com/terorie/oddb-go/fasturl"
"github.com/terorie/od-database-crawler/ds/redblackhash"
"github.com/terorie/od-database-crawler/fasturl"
"sync"
"time"
)
type Task struct {
WebsiteId uint64 `json:"website_id"`
Url string `json:"url"`
}
type TaskResult struct {
StatusCode string `json:"status_code"`
FileCount uint64 `json:"file_count"`
ErrorCount uint64 `json:"-"`
StartTime time.Time `json:"-"`
StartTimeUnix int64 `json:"start_time"`
EndTimeUnix int64 `json:"end_time"`
WebsiteId uint64 `json:"website_id"`
}
type Job struct {
OD *OD
Uri fasturl.URL
@@ -16,26 +31,25 @@ type Job struct {
}
type OD struct {
Task *Task
Task Task
Result TaskResult
Wait sync.WaitGroup
BaseUri fasturl.URL
WCtx WorkerContext
Scanned redblackhash.Tree
lock sync.Mutex
}
type File struct {
Name string `json:"name"`
Size int64 `json:"size"`
MTime time.Time `json:"mtime"`
Path string `json:"path"`
IsDir bool `json:"-"`
Name string `json:"name"`
Size int64 `json:"size"`
MTime int64 `json:"mtime"`
Path string `json:"path"`
IsDir bool `json:"-"`
}
func (o *OD) LoadOrStoreKey(k *redblackhash.Key) (exists bool) {
o.lock.Lock()
defer o.lock.Unlock()
o.Scanned.Lock()
defer o.Scanned.Unlock()
exists = o.Scanned.Get(k)
if exists { return true }

25
release.sh Executable file
View File

@@ -0,0 +1,25 @@
#!/usr/bin/env bash
appname="od-database-crawler"
tag=$1
[ -z "$tag" ] && echo "Usage: build <version>" && exit 1
name=${appname}-${tag}-windows.exe
GOOS="windows" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name
name=${appname}-${tag}-linux
GOOS="linux" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name
name=${appname}-${tag}-mac
GOOS="darwin" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name
name=${appname}-${tag}-freebsd
GOOS="freebsd" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name

View File

@@ -5,12 +5,17 @@ import (
"encoding/json"
"fmt"
"github.com/sirupsen/logrus"
"github.com/terorie/od-database-crawler/fasturl"
"os"
"path"
"sync"
"sync/atomic"
"time"
)
var activeTasks int32
var activeTasksLock sync.Mutex
var activeTasks = make(map[uint64]bool)
var numActiveTasks int32
var totalBuffered int64
func Schedule(c context.Context, remotes <-chan *OD) {
@@ -30,7 +35,7 @@ func Schedule(c context.Context, remotes <-chan *OD) {
}
// Enqueue initial job
atomic.AddInt32(&activeTasks, 1)
atomic.AddInt32(&numActiveTasks, 1)
remote.WCtx.queueJob(Job{
OD: remote,
Uri: remote.BaseUri,
@@ -40,46 +45,149 @@ func Schedule(c context.Context, remotes <-chan *OD) {
// Upload result when ready
go remote.Watch(results)
// Sleep if max number of tasks are active
for atomic.LoadInt32(&numActiveTasks) > config.Tasks {
select {
case <-c.Done():
return
case <-time.After(time.Second):
continue
}
}
}
}
func (r *OD) Watch(results chan File) {
go r.Task.Collect(results)
func ScheduleTask(remotes chan<- *OD, t *Task, u *fasturl.URL) {
if !t.register() {
return
}
// Wait for all jobs on remote to finish
r.Wait.Wait()
close(r.WCtx.in)
atomic.AddInt32(&activeTasks, -1)
logrus.WithField("url", r.BaseUri.String()).
Info("Crawler finished")
globalWait.Done()
close(results)
globalWait.Add(1)
now := time.Now()
od := &OD {
Task: *t,
BaseUri: *u,
Result: TaskResult {
WebsiteId: t.WebsiteId,
StartTime: now,
StartTimeUnix: now.Unix(),
},
}
remotes <- od
}
func (t *Task) Collect(results chan File) {
err := t.collect(results)
func (t *Task) register() bool {
activeTasksLock.Lock()
defer activeTasksLock.Unlock()
if _, known := activeTasks[t.WebsiteId]; known {
return false
} else {
activeTasks[t.WebsiteId] = true
return true
}
}
func (t *Task) unregister() {
activeTasksLock.Lock()
delete(activeTasks, t.WebsiteId)
activeTasksLock.Unlock()
}
func (o *OD) Watch(results chan File) {
// Mark job as completely done
defer globalWait.Done()
defer o.Task.unregister()
filePath := path.Join("crawled", fmt.Sprintf("%d.json", o.Task.WebsiteId))
// Open crawl results file
f, err := os.OpenFile(
filePath,
os.O_CREATE | os.O_RDWR | os.O_TRUNC,
0644,
)
if err != nil {
logrus.WithError(err).
Error("Failed saving crawl results")
return
}
defer f.Close()
defer os.Remove(filePath)
// Listen for exit code of Collect()
collectErrC := make(chan error)
// Block until all results are written
// (closes results channel)
o.handleCollect(results, f, collectErrC)
// Exit code of Collect()
err = <-collectErrC
close(collectErrC)
if err != nil {
logrus.WithError(err).
Error("Failed saving crawl results")
return
}
// Upload results
err = PushResult(&o.Result, f)
if err != nil {
logrus.WithError(err).
Error("Failed uploading crawl results")
return
}
}
func (o *OD) handleCollect(results chan File, f *os.File, collectErrC chan error) {
// Begin collecting results
go o.Task.Collect(results, f, collectErrC)
defer close(results)
// Wait for all jobs on remote to finish
o.Wait.Wait()
close(o.WCtx.in)
atomic.AddInt32(&numActiveTasks, -1)
// Log finish
logrus.WithFields(logrus.Fields{
"id": o.Task.WebsiteId,
"url": o.BaseUri.String(),
"duration": time.Since(o.Result.StartTime),
}).Info("Crawler finished")
// Set status code
now := time.Now()
o.Result.EndTimeUnix = now.Unix()
fileCount := atomic.LoadUint64(&o.Result.FileCount)
if fileCount == 0 {
errorCount := atomic.LoadUint64(&o.Result.ErrorCount)
if errorCount == 0 {
o.Result.StatusCode = "empty"
} else {
o.Result.StatusCode = "directory listing failed"
}
} else {
o.Result.StatusCode = "success"
}
}
func (t *Task) Collect(results chan File, f *os.File, errC chan<- error) {
err := t.collect(results, f)
if err != nil {
logrus.WithError(err).
Error("Failed saving crawl results")
}
errC <- err
}
func (t *Task) collect(results chan File) error {
err := os.MkdirAll("crawled", 0755)
if err != nil { return err }
f, err := os.OpenFile(
path.Join("crawled", fmt.Sprintf("%d.json", t.WebsiteId)),
os.O_CREATE | os.O_WRONLY | os.O_TRUNC,
0755,
)
if err != nil { return err }
defer f.Close()
func (t *Task) collect(results chan File, f *os.File) error {
for result := range results {
result.Path = fasturl.PathUnescape(result.Path)
result.Name = fasturl.PathUnescape(result.Name)
resJson, err := json.Marshal(result)
if err != nil { panic(err) }
_, err = f.Write(resJson)

170
server.go
View File

@@ -10,34 +10,28 @@ import (
"net/http"
"net/url"
"os"
"path/filepath"
"strconv"
"strings"
"time"
)
const (
fileListChunkSize int64 = 5000000 // 5 mb
)
var serverClient = http.DefaultClient
var serverClient = http.Client {
Timeout: config.ServerTimeout,
}
func FetchTask() (t *Task, err error) {
escToken, _ := json.Marshal(config.Token)
payload := `{"token":` + string(escToken) + `}`
req, err := http.NewRequest(
http.MethodPost,
res, err := serverClient.PostForm(
config.ServerUrl + "/task/get",
strings.NewReader(payload))
if err != nil { return }
res, err := serverClient.Do(req)
url.Values{ "token": {config.Token} })
if err != nil { return }
defer res.Body.Close()
if res.StatusCode != 200 {
err = fmt.Errorf("http %s", res.Status)
return
switch res.StatusCode {
case 200:
break
case 404, 500:
return nil, nil
default:
return nil, fmt.Errorf("http %s", res.Status)
}
t = new(Task)
@@ -47,21 +41,17 @@ func FetchTask() (t *Task, err error) {
return
}
func PushResult(result *TaskResult) (err error) {
filePath := filepath.Join(
".", "crawled",
fmt.Sprintf("%d.json", result.WebsiteId))
func PushResult(result *TaskResult, f *os.File) (err error) {
if result.WebsiteId == 0 {
// Not a real result, don't push
return nil
}
defer os.Remove(filePath)
f, err := os.Open(filePath)
if os.IsNotExist(err) {
err = fmt.Errorf("cannot upload result: %s does not exist", filePath)
return
} else if err != nil {
// Rewind to the beginning of the file
_, err = f.Seek(0, 0)
if err != nil {
return
}
defer f.Close()
err = uploadChunks(result.WebsiteId, f)
if err != nil {
@@ -73,104 +63,110 @@ func PushResult(result *TaskResult) (err error) {
return
}
err = uploadResult(result)
if err != nil {
logrus.Errorf("Failed to upload result: %s", err)
err2 := CancelTask(result.WebsiteId)
if err2 != nil {
logrus.Error(err2)
}
return
}
// Upload result ignoring errors
uploadResult(result)
return
}
func uploadChunks(websiteId uint64, f *os.File) (err error) {
for iter := 1; iter > 0; iter++ {
func uploadChunks(websiteId uint64, f *os.File) error {
eof := false
for iter := 1; !eof; iter++ {
// TODO Stream with io.Pipe?
var b bytes.Buffer
multi := multipart.NewWriter(&b)
// Set upload fields
var err error
err = multi.WriteField("token", config.Token)
if err != nil { return }
if err != nil { return err }
err = multi.WriteField("website_id", fmt.Sprintf("%d", websiteId))
if err != nil { return }
if err != nil { return err }
// Copy chunk to file_list
formFile, err := multi.CreateFormFile("file_list", "file_list")
_, err = io.CopyN(formFile, f, fileListChunkSize)
if err == io.EOF {
break
} else if err == io.ErrUnexpectedEOF {
var n int64
n, err = io.CopyN(formFile, f, config.ChunkSize)
if err != io.EOF && err != nil {
return err
}
if n == 0 {
// Don't upload, no content
return nil
} else if n < config.ChunkSize {
err = nil
// Break at end of iteration
iter = -420
eof = true
}
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/upload",
&b)
if err != nil { return err }
multi.Close()
res, err := serverClient.Do(req)
if err != nil { return err }
res.Body.Close()
for retried := false; true; retried = true {
err = nil
if retried {
// Error occurred, retry upload
time.Sleep(5 * time.Second)
}
if res.StatusCode != http.StatusOK {
return fmt.Errorf("failed to upload list part %d: %s",
iter, res.Status)
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/upload",
&b)
req.Header.Set("content-type", multi.FormDataContentType())
if err != nil { continue }
res, err := serverClient.Do(req)
if err != nil { continue }
res.Body.Close()
if res.StatusCode != http.StatusOK {
logrus.WithField("status", res.Status).
WithField("part", iter).
Errorf("Upload failed")
continue
}
// Upload successful
break
}
logrus.Infof("Uploading file list part %d: %s",
iter, res.Status)
logrus.WithField("id", websiteId).
WithField("part", iter).
Infof("Uploaded files chunk")
}
return
return nil
}
func uploadResult(result *TaskResult) (err error) {
resultEnc, err := json.Marshal(result)
if err != nil { panic(err) }
payload := url.Values {
"token": {config.Token},
"result": {string(resultEnc)},
}.Encode()
req, err := http.NewRequest(
http.MethodPost,
res, err := serverClient.PostForm(
config.ServerUrl + "/task/complete",
strings.NewReader(payload))
if err != nil { return }
res, err := serverClient.Do(req)
url.Values {
"token": {config.Token},
"result": {string(resultEnc)},
},
)
if err != nil { return }
res.Body.Close()
if res.StatusCode != http.StatusOK {
return fmt.Errorf("failed to cancel task: %s", res.Status)
return HttpError{res.StatusCode}
}
return
}
func CancelTask(websiteId uint64) (err error) {
form := url.Values{
"token": {config.Token},
"website_id": {strconv.FormatUint(websiteId, 10)},
}
encForm := form.Encode()
req, err := http.NewRequest(
http.MethodPost,
res, err := serverClient.PostForm(
config.ServerUrl + "/task/cancel",
strings.NewReader(encForm))
if err != nil { return }
res, err := serverClient.Do(req)
url.Values{
"token": {config.Token},
"website_id": {strconv.FormatUint(websiteId, 10)},
},
)
if err != nil { return }
res.Body.Close()

View File

@@ -39,6 +39,10 @@ func Stats(c context.Context) {
perSecond = math.Round(perSecond)
perSecond /= 2
if perSecond <= 0 {
continue
}
logrus.WithFields(logrus.Fields{
"per_second": perSecond,
"done": atomic.LoadUint64(&totalDone),
@@ -53,7 +57,7 @@ func Stats(c context.Context) {
runtime.ReadMemStats(&mem)
logrus.WithFields(logrus.Fields{
"queue_count": totalBuffered,
"queue_count": atomic.LoadInt64(&totalBuffered),
"heap": FormatByteCount(mem.Alloc),
"objects": mem.HeapObjects,
"num_gc": mem.NumGC,

View File

@@ -1,16 +0,0 @@
package main
import "time"
type Task struct {
WebsiteId int `json:"website_id"`
Url string `json:"url"`
}
type TaskResult struct {
StatusCode int `json:"status_code"`
FileCount uint64 `json:"file_count"`
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
WebsiteId uint64 `json:"website_id"`
}

22
util.go
View File

@@ -1,6 +1,9 @@
package main
import "fmt"
import (
"fmt"
"sync"
)
// https://programming.guide/go/formatting-byte-size-to-human-readable-format.html
func FormatByteCount(b uint64) string {
@@ -16,3 +19,20 @@ func FormatByteCount(b uint64) string {
return fmt.Sprintf("%.1f %ciB", float64(b)/float64(div), "KMGTPE"[exp])
}
}
type Hooks struct {
m sync.Mutex
l []func()
}
func (h *Hooks) Add(hook func()) {
h.m.Lock()
h.l = append(h.l, hook)
h.m.Unlock()
}
func (h *Hooks) Execute() {
for _, hook := range h.l {
hook()
}
}

View File

@@ -2,6 +2,7 @@ package main
import (
"github.com/sirupsen/logrus"
"github.com/valyala/fasthttp"
"math"
"sort"
"strings"
@@ -39,9 +40,14 @@ func (w WorkerContext) step(results chan<- File, job Job) {
if err != nil {
job.Fails++
if err == ErrForbidden {
// Don't attempt crawling again
return
if httpErr, ok := err.(*HttpError); ok {
switch httpErr.code {
case fasthttp.StatusTooManyRequests:
err = ErrRateLimit
default:
// Don't retry HTTP error codes
return
}
}
if job.Fails > config.Retries {
@@ -75,9 +81,11 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
// Load directory
links, err := GetDir(job, f)
if err != nil {
logrus.WithError(err).
WithField("url", job.UriStr).
Error("Failed getting dir")
if !isErrSilent(err) {
logrus.WithError(err).
WithField("url", job.UriStr).
Error("Failed to crawl dir")
}
return nil, err
}
@@ -105,7 +113,6 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
}
lastLink = uriStr
job.OD.Wait.Add(1)
newJobs = append(newJobs, Job{
OD: job.OD,
Uri: link,
@@ -125,11 +132,14 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
// Load file
err := GetFile(job.Uri, f)
if err != nil {
logrus.WithError(err).
WithField("url", job.UriStr).
Error("Failed getting file")
if !isErrSilent(err) {
logrus.WithError(err).
WithField("url", job.UriStr).
Error("Failed to crawl file")
}
return nil, err
}
atomic.AddUint64(&job.OD.Result.FileCount, 1)
}
return
}
@@ -143,13 +153,21 @@ func (w WorkerContext) queueJob(job Job) {
} else {
time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) *
100 * time.Millisecond)
w.in <- job
}
} else {
w.in <- job
}
w.in <- job
}
func (w WorkerContext) finishJob(job *Job) {
job.OD.Wait.Done()
}
func isErrSilent(err error) bool {
if !config.PrintHTTP {
if _, ok := err.(*HttpError); ok {
return true
}
}
return false
}