74 Commits

Author SHA1 Message Date
Richard Patel
24d9d1fd42 Make resume work 2019-02-03 16:47:01 +01:00
Richard Patel
f3be76e001 resume len 2019-02-03 16:41:26 +01:00
Richard Patel
4ef4ab13a8 Fix sleeps 2019-02-03 16:34:29 +01:00
Richard Patel
25d0b0042c resume: Fix missing gob register 2019-02-03 16:32:01 +01:00
Richard Patel
ef7d17cad4 Fix too long sleep 2019-02-03 16:28:43 +01:00
Richard Patel
e919323169 Resume tests 2019-02-03 16:24:18 +01:00
Richard Patel
a3aebe4ef2 Pause file version 2019-02-03 16:08:42 +01:00
Richard Patel
acbfd78a5d Save marker 2019-02-03 16:06:52 +01:00
Richard Patel
fe1e7bf261 Save: queue dir if not yet exists 2019-02-03 16:01:15 +01:00
Richard Patel
c6d7fad8e8 Resume state saving 2019-02-03 15:54:02 +01:00
Richard Patel
0b20823ae1 Resume log messages 2019-02-03 15:09:49 +01:00
Richard Patel
8d68bf1bbc Open result files in append-mode 2019-02-03 15:06:52 +01:00
Richard Patel
a83eb0cfd7 Initial resume implementation 2019-02-03 15:02:07 +01:00
Richard Patel
b18b70f798 Fix segfault (thanks Pikami) 2019-02-03 14:00:17 +01:00
Richard Patel
9d5f549774 Better server User-Agent string 2019-02-03 12:23:21 +01:00
Richard Patel
5239af08f7 Bump version to v1.2.1 2019-02-03 03:36:39 +01:00
Richard Patel
46c0e0bd32 Smarter HTTP error handling 2019-02-03 03:35:09 +01:00
Richard Patel
0ca6deede8 Fix --config flag 2019-02-03 03:26:48 +01:00
Richard Patel
120c026983 Bump version to v1.2.0 2019-02-03 02:55:21 +01:00
Richard Patel
527e8895ec Support configuration without config file 2019-02-03 02:54:52 +01:00
Richard Patel
108fff0503 Add Travis CI badge 2019-02-03 02:09:06 +01:00
Richard Patel
e5746baa5b Switch to spf13/cobra
lul
2019-02-03 02:02:23 +01:00
Richard Patel
17ba5583c9 Add .travis.yml 2019-02-02 23:18:03 +01:00
Richard Patel
92a8c07f4a Add go.mod 2019-02-02 23:15:52 +01:00
Richard Patel
43f96c6988 Benchmark: Reference parser 2018-12-18 15:39:41 +01:00
Richard Patel
b244cdae80 Minor cleanup 2018-12-18 15:31:33 +01:00
Richard Patel
4b8275c7bf Add parser tests 2018-12-18 15:31:09 +01:00
Richard Patel
f90bf94a44 Bump version to v1.1.1 2018-11-27 22:11:57 +01:00
Richard Patel
e82768ff80 Wait time control in config 2018-11-27 22:11:57 +01:00
Richard Patel
b1bf59adef Add The Eye DB to README.md 2018-11-27 17:40:12 +01:00
Richard Patel
a2df2972f4 Bump the upload retry interval up to 30s 2018-11-20 04:13:20 +01:00
Richard Patel
3fc8837dd7 Add output files to .gitignore 2018-11-20 03:51:42 +01:00
Richard Patel
f9a0d6bffe Bump to v1.1.0 2018-11-20 03:46:36 +01:00
Richard Patel
4dbe2aef2b Add job buffer size parameter 2018-11-20 03:42:32 +01:00
Richard Patel
86ec78cae1 Add TCP timeout option 2018-11-20 03:29:10 +01:00
Richard Patel
b846498030 Delete URL queues after crawling 2018-11-20 03:05:43 +01:00
Richard Patel
4f3140a39f Fix queue_count in log 2018-11-20 02:49:03 +01:00
Richard Patel
85d2aac9d4 Performance patch 2018-11-20 02:33:50 +01:00
Richard Patel
b6c0a45900 Job queue disk offloading 2018-11-20 02:03:10 +01:00
Richard Patel
d332f06659 Limit retries to 10 2018-11-18 21:05:26 +01:00
Richard Patel
1625d6c888 Bump to v1.0.2 2018-11-18 18:53:57 +01:00
Richard Patel
03a487f393 Fix crawl loop 2018-11-18 18:45:06 +01:00
Richard Patel
ac8221b109 Retry /task/upload 2018-11-18 18:33:26 +01:00
Richard Patel
8ed2cf3b93 Bump to v1.0.1 2018-11-18 14:49:07 +01:00
Richard Patel
f3620262fc Add log file support 2018-11-18 14:46:52 +01:00
Richard Patel
dc4e4212a0 Add freebsd to release.sh 2018-11-18 14:38:18 +01:00
Richard Patel
6e6a4edd27 Ignore all HTTP errors 2018-11-18 14:25:06 +01:00
Richard Patel
a71157b4d8 Add User-Agent parameter 2018-11-18 14:24:04 +01:00
Richard Patel
6dbec8c789 Add release script 2018-11-18 02:36:22 +01:00
Richard Patel
605f6db5a5 Don't call /task/upload for websites with no results 2018-11-18 01:42:57 +01:00
Richard Patel
d593ba2d0b Bump to 1.0 2018-11-18 00:54:58 +01:00
Richard Patel
6793086c22 Ignore HTTPS errors 2018-11-18 00:37:30 +01:00
Richard Patel
4464f34779 Add recheck and timeout parameters 2018-11-18 00:29:29 +01:00
Richard Patel
339175220d Refactor uploading & chunk size parameter 2018-11-18 00:19:43 +01:00
Richard Patel
1e6687c519 Upload result ignoring errors 2018-11-17 15:04:20 +01:00
Richard Patel
8060556089 Fix: make crawled dir 2018-11-17 13:36:35 +01:00
Richard Patel
73ba848e17 Grammar 2018-11-17 13:35:29 +01:00
Richard Patel
115983f70e Silent HTTP errors 2018-11-17 13:22:46 +01:00
Richard Patel
9210996b4c Fix multiple part file upload 2018-11-17 12:52:24 +01:00
Richard Patel
7b29da9340 Fix file uploads 2018-11-17 12:47:16 +01:00
Richard Patel
24ee6fcba2 Quickfix: Revert FTP give back 2018-11-17 12:43:30 +01:00
Richard Patel
bfb18d62b2 mini fix 2018-11-17 05:27:09 +01:00
Richard Patel
f4054441ab Return FTP tasks 2018-11-17 05:07:52 +01:00
Richard Patel
f8d2bf386d Fix FTP error ignore 2018-11-17 04:57:19 +01:00
Richard Patel
f41198b00c Ignore FTP URLs 2018-11-17 04:50:59 +01:00
Richard Patel
7fdffff58f Update config.yml 2018-11-17 04:19:04 +01:00
Richard Patel
d596882b40 Fix ton of bugs 2018-11-17 04:18:22 +01:00
Richard Patel
0fe97a8058 Update README.md 2018-11-17 01:36:07 +01:00
Richard Patel
718f9d7fbc Rename project 2018-11-17 01:33:15 +01:00
Richard Patel
f1687679ab Unescape results & don't recrawl 404 2018-11-17 01:21:20 +01:00
Richard Patel
145d37f84a Fix wait, add back crawl command 2018-11-17 00:49:09 +01:00
Richard Patel
cc777bcaeb redblackhash: Use bytes.Compare 2018-11-16 21:17:39 +01:00
Simon
1e78cea7e7 Saved path should not contain file name 2018-11-16 13:58:12 -05:00
Richard Patel
3f85cf679b Getting tasks 2018-11-16 04:47:08 +01:00
29 changed files with 6759 additions and 445 deletions

BIN
.github/stress.png vendored

Binary file not shown.

Before

Width:  |  Height:  |  Size: 369 KiB

5
.gitignore vendored
View File

@@ -1,3 +1,6 @@
/.idea/ /.idea/
.DS_Store .DS_Store
/oddb-go /od-database-crawler
*.log
/queue/
/crawled/

5
.travis.yml Normal file
View File

@@ -0,0 +1,5 @@
language: go
go:
- "1.11.x"
- master

View File

@@ -1,12 +1,24 @@
# oddb Go crawler 🚀 # od-database Go crawler 🚀
[![Build Status](https://travis-ci.org/terorie/od-database-crawler.svg?branch=master)](https://travis-ci.org/terorie/od-database-crawler)
> by terorie 2018 :P > by terorie 2018 :P
* Crawler for [__OD-Database__](https://github.com/simon987/od-database)
* Crawls HTTP open directories (standard Web Server Listings) * Crawls HTTP open directories (standard Web Server Listings)
* Gets name, path, size and modification time of all files * Gets name, path, size and modification time of all files
* Soon: Will work as a crawler for [OD-Database](https://github.com/simon987/od-database)! * Lightweight and fast: __over 9000 requests per second__ on a standard laptop
Stress test crawling [pandoradir](https://github.com/terorie/pandoradir) https://od-db.the-eye.eu/
on an average laptop (~10K requests per second, 4 connections):
![image](.github/stress.png)
Memory usage is being optimized :P #### Usage
1. With Config File (if `config.yml` found in working dir)
- Download [default config](https://github.com/terorie/od-database-crawler/blob/master/config.yml)
- Set `server.url` and `server.token`
- Start with `./od-database-crawler server --config <file>`
2. With Flags or env
- Override config file if it exists
- `--help` for list of flags
- Every flag is available as an environment variable:
`--server.crawl_stats` ➡️ `OD_SERVER_CRAWL_STATS`
- Start with `./od-database-crawler server <flags>`

171
config.go
View File

@@ -1,65 +1,173 @@
package main package main
import ( import (
"bufio"
"fmt" "fmt"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/spf13/pflag"
"github.com/spf13/viper" "github.com/spf13/viper"
"io"
"os" "os"
"strings"
"time" "time"
) )
var config struct { var config struct {
ServerUrl string ServerUrl string
Token string Token string
ServerTimeout time.Duration
Recheck time.Duration
ChunkSize int64
Retries int Retries int
Workers int Workers int
Timeout time.Duration UserAgent string
Tasks int32 Tasks int32
CrawlStats time.Duration
AllocStats time.Duration
Verbose bool Verbose bool
PrintHTTP bool
JobBufferSize int
} }
var onlineMode bool
const ( const (
ConfServerUrl = "server.url" ConfServerUrl = "server.url"
ConfToken = "server.token" ConfToken = "server.token"
ConfServerTimeout = "server.timeout"
ConfRecheck = "server.recheck"
ConfCooldown = "server.cooldown"
ConfChunkSize = "server.upload_chunk"
ConfUploadRetries = "server.upload_retries"
ConfUploadRetryInterval = "server.upload_retry_interval"
ConfTasks = "crawl.tasks" ConfTasks = "crawl.tasks"
ConfRetries = "crawl.retries" ConfRetries = "crawl.retries"
ConfWorkers = "crawl.connections" ConfWorkers = "crawl.connections"
ConfUserAgent = "crawl.user-agent"
ConfDialTimeout = "crawl.dial_timeout"
ConfTimeout = "crawl.timeout" ConfTimeout = "crawl.timeout"
ConfJobBufferSize = "crawl.job_buffer"
ConfResume = "crawl.resume"
ConfCrawlStats = "output.crawl_stats" ConfCrawlStats = "output.crawl_stats"
ConfAllocStats = "output.resource_stats" ConfAllocStats = "output.resource_stats"
ConfVerbose = "output.verbose" ConfVerbose = "output.verbose"
ConfPrintHTTP = "output.http"
ConfLogFile = "output.log"
) )
func prepareConfig() { func prepareConfig() {
viper.SetDefault(ConfRetries, 5) pf := rootCmd.PersistentFlags()
viper.SetDefault(ConfWorkers, 2)
viper.SetDefault(ConfTasks, 3) pf.SortFlags = false
viper.SetDefault(ConfTimeout, 10 * time.Second) pf.StringVar(&configFile, "config", "", "Config file")
viper.SetDefault(ConfCrawlStats, 3 * time.Second) configFile = os.Getenv("OD_CONFIG")
viper.SetDefault(ConfAllocStats, 0)
viper.SetDefault(ConfVerbose, false) pf.String(ConfServerUrl, "http://od-db.the-eye.eu/api", "OD-DB server URL")
pf.String(ConfToken, "", "OD-DB access token (env OD_SERVER_TOKEN)")
pf.Duration(ConfServerTimeout, 60 * time.Second, "OD-DB request timeout")
pf.Duration(ConfRecheck, 1 * time.Second, "OD-DB: Poll interval for new jobs")
pf.Duration(ConfCooldown, 30 * time.Second, "OD-DB: Time to wait after a server-side error")
pf.String(ConfChunkSize, "1 MB", "OD-DB: Result upload chunk size")
pf.Uint(ConfUploadRetries, 10, "OD-DB: Max upload retries")
pf.Duration(ConfUploadRetryInterval, 30 * time.Second, "OD-DB: Time to wait between upload retries")
pf.Uint(ConfTasks, 100, "Crawler: Max concurrent tasks")
pf.Uint(ConfWorkers, 4, "Crawler: Connections per server")
pf.Uint(ConfRetries, 5, "Crawler: Request retries")
pf.Duration(ConfDialTimeout, 10 * time.Second, "Crawler: Handshake timeout")
pf.Duration(ConfTimeout, 30 * time.Second, "Crawler: Request timeout")
pf.String(ConfUserAgent, "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0", "Crawler: User-Agent")
pf.Uint(ConfJobBufferSize, 5000, "Crawler: Task queue cache size")
pf.Duration(ConfResume, 72 * time.Hour, "Crawler: Resume tasks not older than x")
pf.Duration(ConfCrawlStats, time.Second, "Log: Crawl stats interval")
pf.Duration(ConfAllocStats, 10 * time.Second, "Log: Resource stats interval")
pf.Bool(ConfVerbose, false, "Log: Print every listed dir")
pf.Bool(ConfPrintHTTP, false, "Log: Print HTTP client errors")
pf.String(ConfLogFile, "crawler.log", "Log file")
// Bind all flags to Viper
pf.VisitAll(func(flag *pflag.Flag) {
s := flag.Name
s = strings.TrimLeft(s, "-")
if err := viper.BindPFlag(s, flag); err != nil {
panic(err)
}
var envKey string
envKey = strings.Replace(s, ".", "_", -1)
envKey = strings.ToUpper(envKey)
envKey = "OD_" + envKey
if err := viper.BindEnv(s, envKey); err != nil {
panic(err)
}
})
} }
func readConfig() { func readConfig() {
viper.AddConfigPath(".") // If config.yml in working dir, use it
viper.SetConfigName("config") if configFile == "" {
err := viper.ReadInConfig() _, err := os.Stat("config.yml")
if err == nil {
configFile = "config.yml"
}
}
if configFile != "" {
confF, err := os.Open(configFile)
if err != nil { if err != nil {
fmt.Fprintln(os.Stderr, err) fmt.Fprintln(os.Stderr, err)
os.Exit(1) os.Exit(1)
} }
defer confF.Close()
viper.SetConfigType("yml")
err = viper.ReadConfig(confF)
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
}
if onlineMode {
config.ServerUrl = viper.GetString(ConfServerUrl) config.ServerUrl = viper.GetString(ConfServerUrl)
//if config.ServerUrl == "" { if config.ServerUrl == "" {
// configMissing(ConfServerUrl) configMissing(ConfServerUrl)
//} }
config.ServerUrl = strings.TrimRight(config.ServerUrl, "/")
config.Token = viper.GetString(ConfToken) config.Token = viper.GetString(ConfToken)
//if config.Token == "" { if config.Token == "" {
// configMissing(ConfToken) configMissing(ConfToken)
//} }
}
config.ServerTimeout = viper.GetDuration(ConfServerTimeout)
config.Recheck = viper.GetDuration(ConfRecheck)
config.ChunkSize = int64(viper.GetSizeInBytes(ConfChunkSize))
if config.ChunkSize < 100 {
configOOB(ConfChunkSize, config.ChunkSize)
}
config.Retries = viper.GetInt(ConfRetries) config.Retries = viper.GetInt(ConfRetries)
if config.Retries < 0 { if config.Retries < 0 {
@@ -76,16 +184,31 @@ func readConfig() {
configOOB(ConfTasks, int(config.Tasks)) configOOB(ConfTasks, int(config.Tasks))
} }
config.Timeout = viper.GetDuration(ConfTimeout) config.UserAgent = viper.GetString(ConfUserAgent)
config.CrawlStats = viper.GetDuration(ConfCrawlStats) setDialTimeout(viper.GetDuration(ConfDialTimeout))
config.AllocStats = viper.GetDuration(ConfAllocStats) setTimeout(viper.GetDuration(ConfTimeout))
config.JobBufferSize = viper.GetInt(ConfJobBufferSize)
config.Verbose = viper.GetBool(ConfVerbose) config.Verbose = viper.GetBool(ConfVerbose)
if config.Verbose { if config.Verbose {
logrus.SetLevel(logrus.DebugLevel) logrus.SetLevel(logrus.DebugLevel)
} }
if filePath := viper.GetString(ConfLogFile); filePath != "" {
f, err := os.OpenFile(filePath, os.O_CREATE | os.O_WRONLY | os.O_APPEND, 0644)
bufWriter := bufio.NewWriter(f)
if err != nil { panic(err) }
exitHooks.Add(func() {
bufWriter.Flush()
f.Close()
})
logrus.SetOutput(io.MultiWriter(os.Stdout, bufWriter))
}
config.PrintHTTP = viper.GetBool(ConfPrintHTTP)
} }
func configMissing(key string) { func configMissing(key string) {
@@ -93,7 +216,7 @@ func configMissing(key string) {
os.Exit(1) os.Exit(1)
} }
func configOOB(key string, v int) { func configOOB(key string, v interface{}) {
fmt.Fprintf(os.Stderr, "config: illegal value %d for %key!\n", v, key) fmt.Fprintf(os.Stderr, "config: illegal value %v for key %s!\n", v, key)
os.Exit(1) os.Exit(1)
} }

View File

@@ -1,28 +1,84 @@
# OD-Database server settings # OD-Database server settings
server: server:
# Connection URL # Connection URL
url: localhost:6969 url: http://od-db.mine.terorie.com/api
# Server auth token # Server auth token
token: token:
# Request timeout
timeout: 60s
# Recheck interval
# The crawler periodically asks the server
# for new jobs. Sets the minimum wait time
# between /task/get requests to the server.
recheck: 1s
# Time to wait after receiving an error
# from the server. Doesn't apply to uploads.
cooldown: 30s
# Upload chunk size
# If the value is too high, the upload fails.
upload_chunk: 1 MB
upload_retries: 10
upload_retry_interval: 30s
# Log output settings # Log output settings
output: output:
# Crawl statistics # Crawl statistics
crawl_stats: 1s crawl_stats: 1s
# CPU/RAM/Job queue stats # CPU/RAM/Job queue stats
resource_stats: 1s resource_stats: 10s
# More output? (Every listed dir) # More output? (Every listed dir)
verbose: false verbose: false
# Print HTTP errors (Super spammy)
http: false
# Log file
# If empty, no log file is created.
log: crawler.log
# Crawler settings # Crawler settings
crawl: crawl:
# Number of sites that can be # Number of sites that can be processed at once
# processed at once tasks: 100
tasks: 3
# Number of connections per site # Number of connections per site
connections: 2 # Please be careful with this setting!
# The crawler fires fast and more than
# ten connections can overwhelm a server.
connections: 4
# How often to retry getting data # How often to retry getting data
# from the site before giving up # from the site before giving up
retries: 5 retries: 5
# Time before discarding a failed connection attempt
dial_timeout: 10s
# Time before discarding a network request # Time before discarding a network request
timeout: 10s timeout: 30s
# Crawler User-Agent
# If empty, no User-Agent header is sent.
user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"
# Job buffer size (per task)
# Higher values cause less disk writes
# but require more memory.
#
# The job queue contains all URLs
# that should be crawled next.
# As it grows very large over time,
# it's kept mainly on disk.
# This sets how many jobs are kept
# in memory.
# A negative value will cause all jobs
# to be stored in memory. (Don't do this)
job_buffer: 5000

129
crawl.go
View File

@@ -2,43 +2,72 @@ package main
import ( import (
"bytes" "bytes"
"fmt" "crypto/tls"
"github.com/terorie/oddb-go/ds/redblackhash" "github.com/terorie/od-database-crawler/ds/redblackhash"
"github.com/terorie/oddb-go/fasturl" "github.com/terorie/od-database-crawler/fasturl"
"github.com/valyala/fasthttp" "github.com/valyala/fasthttp"
"golang.org/x/crypto/blake2b" "golang.org/x/crypto/blake2b"
"golang.org/x/net/html" "golang.org/x/net/html"
"net"
"path" "path"
"strconv" "strconv"
"strings" "strings"
"time" "time"
) )
var client fasthttp.Client var client = fasthttp.Client {
TLSConfig: &tls.Config{
InsecureSkipVerify: true,
},
}
func setDialTimeout(d time.Duration) {
client.Dial = func(addr string) (net.Conn, error) {
return fasthttp.DialTimeout(addr, d)
}
}
func setTimeout(d time.Duration) {
client.ReadTimeout = d
client.WriteTimeout = d / 2
}
func GetDir(j *Job, f *File) (links []fasturl.URL, err error) { func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
f.IsDir = true f.IsDir = true
f.Name = path.Base(j.Uri.Path) f.Name = path.Base(j.Uri.Path)
req := fasthttp.AcquireRequest() req := fasthttp.AcquireRequest()
if config.UserAgent != "" {
req.Header.SetUserAgent(config.UserAgent)
}
req.SetRequestURI(j.UriStr) req.SetRequestURI(j.UriStr)
res := fasthttp.AcquireResponse() res := fasthttp.AcquireResponse()
defer fasthttp.ReleaseResponse(res) defer fasthttp.ReleaseResponse(res)
err = client.DoTimeout(req, res, config.Timeout) err = client.Do(req, res)
fasthttp.ReleaseRequest(req) fasthttp.ReleaseRequest(req)
if err != nil { return } if err != nil {
return
}
err = checkStatusCode(res.StatusCode()) err = checkStatusCode(res.StatusCode())
if err != nil { return } if err != nil {
return
}
body := res.Body() body := res.Body()
return ParseDir(body, &j.Uri)
}
func ParseDir(body []byte, baseUri *fasturl.URL) (links []fasturl.URL, err error) {
doc := html.NewTokenizer(bytes.NewReader(body)) doc := html.NewTokenizer(bytes.NewReader(body))
var linkHref string var linkHref string
for { for {
err = nil
tokenType := doc.Next() tokenType := doc.Next()
if tokenType == html.ErrorToken { if tokenType == html.ErrorToken {
break break
@@ -69,34 +98,34 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
linkHref = "" linkHref = ""
if strings.LastIndexByte(href, '?') != -1 { if strings.LastIndexByte(href, '?') != -1 {
goto nextToken continue
} }
switch href { switch href {
case "", " ", ".", "..", "/": case "", " ", ".", "..", "/":
goto nextToken continue
} }
if strings.Contains(href, "../") { if strings.Contains(href, "../") {
goto nextToken continue
} }
var link fasturl.URL var link fasturl.URL
err = j.Uri.ParseRel(&link, href) err = baseUri.ParseRel(&link, href)
if err != nil { continue } if err != nil {
continue
}
if link.Scheme != j.Uri.Scheme || if link.Scheme != baseUri.Scheme ||
link.Host != j.Uri.Host || link.Host != baseUri.Host ||
link.Path == j.Uri.Path || link.Path == baseUri.Path ||
!strings.HasPrefix(link.Path, j.Uri.Path) { !strings.HasPrefix(link.Path, baseUri.Path) {
continue continue
} }
links = append(links, link) links = append(links, link)
} }
} }
nextToken:
} }
return return
@@ -106,23 +135,30 @@ func GetFile(u fasturl.URL, f *File) (err error) {
f.IsDir = false f.IsDir = false
u.Path = path.Clean(u.Path) u.Path = path.Clean(u.Path)
f.Name = path.Base(u.Path) f.Name = path.Base(u.Path)
f.Path = strings.Trim(u.Path, "/") f.Path = strings.Trim(path.Dir(u.Path), "/")
req := fasthttp.AcquireRequest() req := fasthttp.AcquireRequest()
req.Header.SetMethod("HEAD") req.Header.SetMethod("HEAD")
if config.UserAgent != "" {
req.Header.SetUserAgent(config.UserAgent)
}
req.SetRequestURI(u.String()) req.SetRequestURI(u.String())
res := fasthttp.AcquireResponse() res := fasthttp.AcquireResponse()
res.SkipBody = true res.SkipBody = true
defer fasthttp.ReleaseResponse(res) defer fasthttp.ReleaseResponse(res)
err = client.DoTimeout(req, res, config.Timeout) err = client.Do(req, res)
fasthttp.ReleaseRequest(req) fasthttp.ReleaseRequest(req)
if err != nil { return } if err != nil {
return
}
err = checkStatusCode(res.StatusCode()) err = checkStatusCode(res.StatusCode())
if err != nil { return } if err != nil {
return
}
f.applyContentLength(string(res.Header.Peek("content-length"))) f.applyContentLength(string(res.Header.Peek("content-length")))
f.applyLastModified(string(res.Header.Peek("last-modified"))) f.applyLastModified(string(res.Header.Peek("last-modified")))
@@ -143,38 +179,49 @@ func (f *File) HashDir(links []fasturl.URL) (o redblackhash.Key) {
} }
func (f *File) applyContentLength(v string) { func (f *File) applyContentLength(v string) {
if v == "" { return } if v == "" {
return
}
size, err := strconv.ParseInt(v, 10, 64) size, err := strconv.ParseInt(v, 10, 64)
if err != nil { return } if err != nil {
if size < 0 { return } return
}
if size < 0 {
return
}
f.Size = size f.Size = size
} }
// TODO Cleanup
func (f *File) applyLastModified(v string) { func (f *File) applyLastModified(v string) {
if v == "" { return } if v == "" {
return
}
var t time.Time
var err error var err error
f.MTime, err = time.Parse(time.RFC1123, v) t, err = time.Parse(time.RFC1123, v)
if err == nil { return } if err == nil {
f.MTime, err = time.Parse(time.RFC850, v) f.MTime = t.Unix()
if err == nil { return } return
}
t, err = time.Parse(time.RFC850, v)
if err == nil {
f.MTime = t.Unix()
return
}
// TODO Parse asctime // TODO Parse asctime
f.MTime, err = time.Parse("2006-01-02", v[:10]) t, err = time.Parse("2006-01-02", v[:10])
if err == nil { return } if err == nil {
f.MTime = t.Unix()
return
}
} }
func checkStatusCode(status int) error { func checkStatusCode(status int) error {
switch status { switch status {
case fasthttp.StatusOK: case fasthttp.StatusOK:
return nil return nil
case fasthttp.StatusTooManyRequests:
return ErrRateLimit
case fasthttp.StatusForbidden,
fasthttp.StatusUnauthorized:
return ErrForbidden
default: default:
return fmt.Errorf("got HTTP status %d", status) return &HttpError{status}
} }
} }

4766
crawl_apache2_test.go Normal file

File diff suppressed because it is too large Load Diff

117
crawl_nginx_test.go Normal file
View File

@@ -0,0 +1,117 @@
package main
import (
"github.com/terorie/od-database-crawler/fasturl"
"testing"
)
func TestParseDirNginx(t *testing.T) {
var u fasturl.URL
err := u.Parse("https://the-eye.eu/public/")
if err != nil {
t.Fatal("Failed to parse URL", err)
}
links, err := ParseDir([]byte(nginxListing), &u)
if err != nil {
t.Fatal("Failed to extract links", err)
}
if len(links) != len(nginxLinks) {
t.Fatalf("Expected %d links, got %d",
len(nginxLinks), len(links))
}
for i := 0; i < len(links); i++ {
gotLink := links[i].String()
expLink := nginxLinks[i]
if gotLink != expLink {
t.Errorf(`Expected "%s" got "%s"`,
expLink, gotLink)
}
}
}
var nginxLinks = []string {
"https://the-eye.eu/public/AppleArchive/",
"https://the-eye.eu/public/AudioBooks/",
"https://the-eye.eu/public/Books/",
"https://the-eye.eu/public/Comics/",
"https://the-eye.eu/public/Games/",
"https://the-eye.eu/public/Icons/",
"https://the-eye.eu/public/Images/",
"https://the-eye.eu/public/JFK_Files/",
"https://the-eye.eu/public/MSDN/",
"https://the-eye.eu/public/Music/",
"https://the-eye.eu/public/Operating%20Systems/",
"https://the-eye.eu/public/Posters/",
"https://the-eye.eu/public/Psychedelics/",
"https://the-eye.eu/public/Psychoactives/",
"https://the-eye.eu/public/Radio/",
"https://the-eye.eu/public/Random/",
"https://the-eye.eu/public/Site-Dumps/",
"https://the-eye.eu/public/Software/",
"https://the-eye.eu/public/Strategic%20Intelligence%20Network/",
"https://the-eye.eu/public/WorldTracker.org/",
"https://the-eye.eu/public/concen.org/",
"https://the-eye.eu/public/freenrg.info/",
"https://the-eye.eu/public/murdercube.com/",
"https://the-eye.eu/public/parazite/",
"https://the-eye.eu/public/ripreddit/",
"https://the-eye.eu/public/rom/",
"https://the-eye.eu/public/touhou/",
"https://the-eye.eu/public/vns/",
"https://the-eye.eu/public/xbins/",
"https://the-eye.eu/public/xbins.diodematrix/",
"https://the-eye.eu/public/Rclone_for_Scrubs.pdf",
"https://the-eye.eu/public/Wget_Linux_Guide.pdf",
"https://the-eye.eu/public/Wget_Windows_Guide.pdf",
"https://the-eye.eu/public/rclone_guide.pdf",
"https://the-eye.eu/public/wget-noobs-guide.pdf",
"https://the-eye.eu/public/xbox-scene_Aug2014.7z",
}
const nginxListing =
`<html>
<head><title>Index of /public/</title></head>
<body bgcolor="white">
<h1>Index of /public/</h1><hr><pre><a href="../">../</a>
<a href="AppleArchive/">AppleArchive/</a> 03-Nov-2017 18:13 -
<a href="AudioBooks/">AudioBooks/</a> 29-Sep-2018 19:47 -
<a href="Books/">Books/</a> 27-Nov-2018 17:50 -
<a href="Comics/">Comics/</a> 05-Nov-2018 21:37 -
<a href="Games/">Games/</a> 28-Nov-2018 11:54 -
<a href="Icons/">Icons/</a> 22-May-2018 07:47 -
<a href="Images/">Images/</a> 21-Jan-2018 03:21 -
<a href="JFK_Files/">JFK_Files/</a> 03-Nov-2017 17:03 -
<a href="MSDN/">MSDN/</a> 03-Nov-2017 15:48 -
<a href="Music/">Music/</a> 02-Mar-2018 15:47 -
<a href="Operating%20Systems/">Operating Systems/</a> 25-Apr-2018 07:18 -
<a href="Posters/">Posters/</a> 07-Jul-2018 01:12 -
<a href="Psychedelics/">Psychedelics/</a> 11-Apr-2018 05:45 -
<a href="Psychoactives/">Psychoactives/</a> 18-May-2018 02:58 -
<a href="Radio/">Radio/</a> 09-Jun-2018 15:49 -
<a href="Random/">Random/</a> 04-Dec-2018 12:33 -
<a href="Site-Dumps/">Site-Dumps/</a> 15-Dec-2018 11:04 -
<a href="Software/">Software/</a> 27-Nov-2017 00:22 -
<a href="Strategic%20Intelligence%20Network/">Strategic Intelligence Network/</a> 17-Nov-2017 16:35 -
<a href="WorldTracker.org/">WorldTracker.org/</a> 12-Apr-2018 04:16 -
<a href="concen.org/">concen.org/</a> 08-Oct-2018 14:08 -
<a href="freenrg.info/">freenrg.info/</a> 19-Dec-2017 10:59 -
<a href="murdercube.com/">murdercube.com/</a> 06-Dec-2017 10:45 -
<a href="parazite/">parazite/</a> 20-Nov-2017 21:25 -
<a href="ripreddit/">ripreddit/</a> 04-Aug-2018 14:30 -
<a href="rom/">rom/</a> 28-Nov-2018 14:15 -
<a href="touhou/">touhou/</a> 03-Nov-2017 11:07 -
<a href="vns/">vns/</a> 03-Nov-2017 11:36 -
<a href="xbins/">xbins/</a> 03-Nov-2017 17:23 -
<a href="xbins.diodematrix/">xbins.diodematrix/</a> 21-Sep-2018 22:33 -
<a href="Rclone_for_Scrubs.pdf">Rclone_for_Scrubs.pdf</a> 04-Sep-2018 13:31 315K
<a href="Wget_Linux_Guide.pdf">Wget_Linux_Guide.pdf</a> 21-Dec-2017 20:28 168K
<a href="Wget_Windows_Guide.pdf">Wget_Windows_Guide.pdf</a> 25-Nov-2017 17:59 867K
<a href="rclone_guide.pdf">rclone_guide.pdf</a> 03-Sep-2018 23:37 315K
<a href="wget-noobs-guide.pdf">wget-noobs-guide.pdf</a> 21-Dec-2017 20:29 168K
<a href="xbox-scene_Aug2014.7z">xbox-scene_Aug2014.7z</a> 26-Oct-2017 23:09 1G
</pre><hr></body>
</html>`

59
crawl_test.go Normal file
View File

@@ -0,0 +1,59 @@
package main
import (
"bytes"
"github.com/PuerkitoBio/goquery"
"github.com/terorie/od-database-crawler/fasturl"
"net/url"
"strings"
"testing"
)
func BenchmarkParseDir(b *testing.B) {
for n := 0; n < b.N; n++ {
var u fasturl.URL
err := u.Parse("http://archive.ubuntu.com/ubuntu/indices/")
if err != nil {
b.Fatal("Failed to parse URL", err)
}
_, err = ParseDir([]byte(apache2Listing), &u)
if err != nil {
b.Fatal("Failed to extract links", err)
}
}
}
func BenchmarkParseDirReference(b *testing.B) {
for n := 0; n < b.N; n++ {
u, err := url.Parse("http://archive.ubuntu.com/ubuntu/indices/")
if err != nil {
b.Fatal("Failed to parse URL", err)
}
_, err = referenceParseDir([]byte(apache2Listing), u)
if err != nil {
b.Fatal("Failed to extract links", err)
}
}
}
func referenceParseDir(body []byte, baseUri *url.URL) (links []*url.URL, err error) {
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
if err != nil { return nil, err }
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
href, _ := s.Attr("href")
sub, err := baseUri.Parse(href)
if err != nil { return } // continue
if !strings.HasPrefix(sub.String(), baseUri.String()) {
return // continue
}
links = append(links, sub)
})
return
}

View File

@@ -14,7 +14,12 @@
package redblackhash package redblackhash
import ( import (
"bytes"
"encoding/binary"
"encoding/hex"
"fmt" "fmt"
"io"
"sync"
) )
const ( const (
@@ -27,6 +32,7 @@ type Key [KeySize]byte
// Tree holds elements of the red-black tree // Tree holds elements of the red-black tree
type Tree struct { type Tree struct {
sync.Mutex
Root *Node Root *Node
size int size int
} }
@@ -40,43 +46,15 @@ type Node struct {
Parent *Node Parent *Node
} }
type nodeHeader struct {
Key *Key
Color color
}
var o = binary.BigEndian
func (k *Key) Compare(o *Key) int { func (k *Key) Compare(o *Key) int {
// TODO Assembly return bytes.Compare(k[:], o[:])
/*for i := 0; i < KeySize / 8; i++ {
a := uint64(k[i+0] ) |
uint64(k[i+1] >> 8) |
uint64(k[i+2] >> 16) |
uint64(k[i+3] >> 24) |
uint64(k[i+4] >> 32) |
uint64(k[i+5] >> 40) |
uint64(k[i+6] >> 48) |
uint64(k[i+7] >> 56)
b := uint64(o[i+0] ) |
uint64(o[i+1] >> 8) |
uint64(o[i+2] >> 16) |
uint64(o[i+3] >> 24) |
uint64(o[i+4] >> 32) |
uint64(o[i+5] >> 40) |
uint64(o[i+6] >> 48) |
uint64(o[i+7] >> 56)
switch {
case a < b:
return -1
case a > b:
return 1
}
}*/
for i := 0; i < KeySize; i++ {
switch {
case k[i] < o[i]:
return -1
case k[i] > o[i]:
return 1
}
}
return 0
} }
// Put inserts node into the tree. // Put inserts node into the tree.
@@ -265,7 +243,7 @@ func (tree *Tree) String() string {
} }
func (node *Node) String() string { func (node *Node) String() string {
return fmt.Sprintf("%v", node.Key) return hex.EncodeToString(node.Key[:16]) + "..."
} }
func output(node *Node, prefix string, isTail bool, str *string) { func output(node *Node, prefix string, isTail bool, str *string) {
@@ -513,6 +491,119 @@ func (tree *Tree) deleteCase6(node *Node) {
} }
} }
func (tree *Tree) Marshal(w io.Writer) (err error) {
tree.Lock()
defer tree.Unlock()
err = binary.Write(w, o, uint64(0x617979797979790A))
if err != nil { return err }
err = marshal(tree.Root, w)
if err != nil { return err }
err = binary.Write(w, o, uint64(0x6C6D616F6F6F6F0A))
if err != nil { return err }
return nil
}
func marshal(n *Node, w io.Writer) (err error) {
if n == nil {
err = binary.Write(w, o, uint64(0x796565656565740A))
return err
}
err = binary.Write(w, o, uint64(0xF09F85B1EFB88F0A))
if err != nil { return err }
_, err = w.Write(n.Key[:])
if err != nil { return err }
var colorI uint64
if n.color {
colorI = 0x7468652D6579657C
} else {
colorI = 0x6865782B7465727C
}
err = binary.Write(w, o, colorI)
if err != nil { return err }
err = marshal(n.Left, w)
if err != nil { return err }
err = marshal(n.Right, w)
if err != nil { return err }
return nil
}
func (tree *Tree) Unmarshal(r io.Reader) (err error) {
tree.Lock()
defer tree.Unlock()
var sof uint64
err = binary.Read(r, o, &sof)
if err != nil { return err }
if sof != 0x617979797979790A {
return fmt.Errorf("redblack: wrong format")
}
tree.Root, tree.size, err = unmarshal(r)
if err != nil { return err }
var eof uint64
err = binary.Read(r, o, &eof)
if err != nil { return err }
if eof != 0x6C6D616F6F6F6F0A {
return fmt.Errorf("redblack: end of file missing")
}
return nil
}
func unmarshal(r io.Reader) (n *Node, size int, err error) {
var head uint64
err = binary.Read(r, o, &head)
if err != nil { return nil, 0, err }
size = 1
switch head {
case 0x796565656565740A:
return nil, 0, nil
case 0xF09F85B1EFB88F0A:
n = new(Node)
_, err = io.ReadFull(r, n.Key[:])
if err != nil { return nil, 0, err }
var colorInt uint64
err = binary.Read(r, o, &colorInt)
if err != nil { return nil, 0, err }
switch colorInt {
case 0x7468652D6579657C:
n.color = true
case 0x6865782B7465727C:
n.color = false
default:
return nil, 0, fmt.Errorf("redblack: corrupt node color")
}
default:
return nil, 0, fmt.Errorf("redblack: corrupt node info")
}
var s2 int
n.Left, s2, err = unmarshal(r)
size += s2
if err != nil { return nil, 0, err }
n.Right, s2, err = unmarshal(r)
size += s2
if err != nil { return nil, 0, err }
return n, size, nil
}
func nodeColor(node *Node) color { func nodeColor(node *Node) color {
if node == nil { if node == nil {
return black return black

View File

@@ -0,0 +1,47 @@
package redblackhash
import (
"bytes"
"math/rand"
"testing"
)
func TestTree_Marshal(t *testing.T) {
var t1, t2 Tree
// Generate 1000 random values to insert
for i := 0; i < 1000; i++ {
var key Key
rand.Read(key[:])
t1.Put(&key)
}
// Marshal tree
var wr bytes.Buffer
err := t1.Marshal(&wr)
if err != nil {
t.Error(err)
t.FailNow()
}
buf := wr.Bytes()
rd := bytes.NewBuffer(buf)
// Unmarshal tree
err = t2.Unmarshal(rd)
if err != nil {
t.Error(err)
t.FailNow()
}
if !compare(t1.Root, t2.Root) {
t.Error("trees are not equal")
t.FailNow()
}
}
func compare(n1, n2 *Node) bool {
return n1.Key.Compare(&n2.Key) == 0 &&
(n1.Left == nil || compare(n1.Left, n2.Left)) &&
(n1.Right == nil || compare(n1.Right, n2.Right))
}

View File

@@ -1,8 +1,45 @@
package main package main
import "errors" import (
"errors"
"fmt"
"github.com/valyala/fasthttp"
"net"
)
var ErrRateLimit = errors.New("too many requests") var ErrRateLimit = errors.New("too many requests")
var ErrForbidden = errors.New("access denied")
var ErrKnown = errors.New("already crawled") var ErrKnown = errors.New("already crawled")
type HttpError struct {
code int
}
func (e HttpError) Error() string {
return fmt.Sprintf("http status %d", e.code)
}
func shouldRetry(err error) bool {
// HTTP errors
if httpErr, ok := err.(*HttpError); ok {
switch httpErr.code {
case fasthttp.StatusTooManyRequests:
return true
default:
// Don't retry HTTP error codes
return false
}
}
if dnsError, ok := err.(*net.DNSError); ok {
// Don't retry permanent DNS errors
return dnsError.IsTemporary
}
if netErr, ok := err.(*net.OpError); ok {
// Don't retry permanent network errors
return netErr.Temporary()
}
// Retry by default
return true
}

View File

@@ -33,6 +33,8 @@ var Schemes = [SchemeCount]string {
"https", "https",
} }
var ErrUnknownScheme = errors.New("unknown protocol scheme")
// Error reports an error and the operation and URL that caused it. // Error reports an error and the operation and URL that caused it.
type Error struct { type Error struct {
Op string Op string
@@ -353,7 +355,7 @@ func getscheme(rawurl string) (scheme Scheme, path string, err error) {
case "https": case "https":
scheme = SchemeHTTPS scheme = SchemeHTTPS
default: default:
return SchemeInvalid, "", errors.New("unknown protocol scheme") return SchemeInvalid, "", ErrUnknownScheme
} }
path = rawurl[i+1:] path = rawurl[i+1:]
@@ -811,3 +813,57 @@ func validUserinfo(s string) bool {
} }
return true return true
} }
func PathUnescape(s string) string {
newStr, err := pathUnescape(s)
if err != nil {
return s
} else {
return newStr
}
}
func pathUnescape(s string) (string, error) {
// Count %, check that they're well-formed.
n := 0
for i := 0; i < len(s); {
switch s[i] {
case '%':
n++
if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
s = s[i:]
if len(s) > 3 {
s = s[:3]
}
return "", EscapeError(s)
}
i += 3
default:
i++
}
}
if n == 0 {
return s, nil
}
t := make([]byte, len(s)-2*n)
j := 0
for i := 0; i < len(s); {
switch s[i] {
case '%':
t[j] = unhex(s[i+1])<<4 | unhex(s[i+2])
j++
i += 3
case '+':
t[j] = '+'
j++
i++
default:
t[j] = s[i]
j++
i++
}
}
return string(t), nil
}

14
go.mod Normal file
View File

@@ -0,0 +1,14 @@
module github.com/syndtr/od-database-crawler
require (
github.com/beeker1121/goque v2.0.1+incompatible
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db // indirect
github.com/sirupsen/logrus v1.3.0
github.com/spf13/cobra v0.0.3
github.com/spf13/viper v1.3.1
github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2 // indirect
github.com/terorie/od-database-crawler v1.1.1
github.com/valyala/fasthttp v1.1.0
golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613
golang.org/x/net v0.0.0-20180911220305-26e67e76b6c3
)

66
go.sum Normal file
View File

@@ -0,0 +1,66 @@
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
github.com/beeker1121/goque v2.0.1+incompatible h1:5nJHPMqQLxUvGFc8m/NW2QzxKyc0zICmqs/JUsmEjwE=
github.com/beeker1121/goque v2.0.1+incompatible/go.mod h1:L6dOWBhDOnxUVQsb0wkLve0VCnt2xJW/MI8pdRX4ANw=
github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk=
github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I=
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db h1:woRePGFeVFfLKN/pOkfl+p/TAqKOfFu+7KPlMVpok/w=
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
github.com/klauspost/compress v1.4.0 h1:8nsMz3tWa9SWWPL60G1V6CUsf4lLjWLTNEtibhe8gh8=
github.com/klauspost/compress v1.4.0/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
github.com/klauspost/cpuid v0.0.0-20180405133222-e7e905edc00e h1:+lIPJOWl+jSiJOc70QXJ07+2eg2Jy2EC7Mi11BWujeM=
github.com/klauspost/cpuid v0.0.0-20180405133222-e7e905edc00e/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/magiconair/properties v1.8.0 h1:LLgXmsheXeRoUOBOjtwPQCWIYqM/LU1ayDtDePerRcY=
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
github.com/mitchellh/mapstructure v1.1.2 h1:fmNYVwqnSfB9mZU6OS2O6GsXM+wcskZDuKQzvN1EDeE=
github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
github.com/pelletier/go-toml v1.2.0 h1:T5zMGML61Wp+FlcbWjRDT7yAxhJNAiPPLOFECq181zc=
github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/sirupsen/logrus v1.3.0 h1:hI/7Q+DtNZ2kINb6qt/lS+IyXnHQe9e90POfeewL/ME=
github.com/sirupsen/logrus v1.3.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
github.com/spf13/afero v1.1.2 h1:m8/z1t7/fwjysjQRYbP0RD+bUIF/8tJwPdEZsI83ACI=
github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ=
github.com/spf13/cast v1.3.0 h1:oget//CVOEoFewqQxwr0Ej5yjygnqGkvggSE/gB35Q8=
github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE=
github.com/spf13/cobra v0.0.3 h1:ZlrZ4XsMRm04Fr5pSFxBgfND2EBVa1nLpiy1stUsX/8=
github.com/spf13/cobra v0.0.3/go.mod h1:1l0Ry5zgKvJasoi3XT1TypsSe7PqH0Sj9dhYf7v3XqQ=
github.com/spf13/jwalterweatherman v1.0.0 h1:XHEdyB+EcvlqZamSM4ZOMGlc93t6AcsBEu9Gc1vn7yk=
github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo=
github.com/spf13/pflag v1.0.3 h1:zPAT6CGy6wXeQ7NtTnaTerfKOsV6V6F8agHXFiazDkg=
github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
github.com/spf13/viper v1.3.1 h1:5+8j8FTpnFV4nEImW/ofkzEt8VoOiLXxdYIDsB73T38=
github.com/spf13/viper v1.3.1/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2 h1:GnOzE5fEFN3b2zDhJJABEofdb51uMRNb8eqIVtdducs=
github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2/go.mod h1:Z4AUp2Km+PwemOoO/VB5AOx9XSsIItzFjoJlOSiYmn0=
github.com/terorie/od-database-crawler v1.1.1 h1:Ca+ZqbZX3rVWBR8SDRzvroyxjBtUs75MQXZ9YG0gqGo=
github.com/terorie/od-database-crawler v1.1.1/go.mod h1:vVJ7pLkudrlUNp9qu24JCzQ8N6mFsrOmX1tPXr155DQ=
github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/valyala/fasthttp v1.1.0 h1:3BohG7mqwj4lq7PTX//7gLbUlzNvZSPmuHFnloXT0lw=
github.com/valyala/fasthttp v1.1.0/go.mod h1:4vX61m6KN+xDduDNwXrhIAVZaZaZiQ1luJk8LWSxF3s=
github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio=
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613 h1:MQ/ZZiDsUapFFiMS+vzwXkCTeEKaum+Do5rINYJDmxc=
golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/net v0.0.0-20180911220305-26e67e76b6c3 h1:czFLhve3vsQetD6JOJ8NZZvGQIXlnN3/yXxbT6/awxI=
golang.org/x/net v0.0.0-20180911220305-26e67e76b6c3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a h1:1n5lsVfiQW3yfsRGu98756EH1YthsFqr/5mxHduZW2A=
golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=

15
help.go Normal file
View File

@@ -0,0 +1,15 @@
package main
const helpText =
`HTTP crawler for the OD-Database
DB >> https://od-db.the-eye.eu <<
Crawler >> https://github.com/terorie/od-database-crawler <<
Server >> https://github.com/simon987/od-database <<
Quick start:
- get config file (config.yml in working dir)
- get OD-DB server ("server.url": Database URL + /api)
- get access token ("server.token": e.g. c010b6dd-20...)
- ./od-database-crawler server
Questions? Discord @terorie#2664 / Telegram @terorie`

206
main.go
View File

@@ -2,55 +2,146 @@ package main
import ( import (
"context" "context"
"fmt"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/terorie/oddb-go/fasturl" "github.com/spf13/cobra"
"github.com/urfave/cli" "github.com/spf13/viper"
"log" "github.com/terorie/od-database-crawler/fasturl"
"net/http"
_ "net/http/pprof"
"os" "os"
"os/signal"
"strings" "strings"
"sync/atomic"
"time" "time"
) )
var app = cli.App { var configFile string
Name: "oddb-go",
Usage: "OD-Database Go crawler", var rootCmd = cobra.Command {
Version: "0.2", Use: "od-database-crawler",
BashComplete: cli.DefaultAppComplete, Version: "1.2.1",
Writer: os.Stdout, Short: "OD-Database Go crawler",
Compiled: buildDate, Long: helpText,
Commands: []cli.Command{ PersistentPreRunE: preRun,
{ PersistentPostRun: func(cmd *cobra.Command, args []string) {
Name: "crawl", exitHooks.Execute()
Usage: "Crawl a list of URLs",
ArgsUsage: "[site, site, ...]",
Action: cmdCrawler,
},
}, },
} }
var serverCmd = cobra.Command {
Use: "server",
Short: "Start crawl server",
Long: "Connect to the OD-Database and contribute to the database\n" +
"by crawling the web for open directories!",
Run: cmdBase,
}
var crawlCmd = cobra.Command {
Use: "crawl",
Short: "Crawl an URL",
Long: "Crawl the URL specified.\n" +
"Results will not be uploaded to the database,\n" +
"they're saved under crawled/0.json instead.\n" +
"Primarily used for testing and benchmarking.",
RunE: cmdCrawler,
Args: cobra.ExactArgs(1),
}
var exitHooks Hooks
func init() { func init() {
rootCmd.AddCommand(&crawlCmd)
rootCmd.AddCommand(&serverCmd)
prepareConfig() prepareConfig()
} }
func main() { func preRun(cmd *cobra.Command, args []string) error {
go func() { if err := os.MkdirAll("crawled", 0755);
log.Println(http.ListenAndServe("localhost:42069", nil)) err != nil { panic(err) }
}()
app.Run(os.Args) if err := os.MkdirAll("queue", 0755);
err != nil { panic(err) }
return nil
} }
func cmdCrawler(clic *cli.Context) error { func main() {
err := rootCmd.Execute()
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
}
func cmdBase(_ *cobra.Command, _ []string) {
onlineMode = true
readConfig() readConfig()
if clic.NArg() == 0 { appCtx, soft := context.WithCancel(context.Background())
cli.ShowCommandHelpAndExit(clic, "crawl", 1) forceCtx, hard := context.WithCancel(context.Background())
go hardShutdown(forceCtx)
go listenCtrlC(soft, hard)
inRemotes := make(chan *OD)
go LoadResumeTasks(inRemotes)
go Schedule(appCtx, inRemotes)
ticker := time.NewTicker(config.Recheck)
defer ticker.Stop()
for {
select {
case <-appCtx.Done():
goto shutdown
case <-ticker.C:
t, err := FetchTask()
if err != nil {
logrus.WithError(err).
Error("Failed to get new task")
if !sleep(viper.GetDuration(ConfCooldown), appCtx) {
goto shutdown
}
continue
}
if t == nil {
// No new task
if atomic.LoadInt32(&numActiveTasks) == 0 {
logrus.Info("Waiting …")
}
continue
} }
args := clic.Args() var baseUri fasturl.URL
remotes := make([]*OD, len(args)) err = baseUri.Parse(t.Url)
for i, arg := range args { if urlErr, ok := err.(*fasturl.Error); ok && urlErr.Err == fasturl.ErrUnknownScheme {
// Not an error
err = nil
// Give back task
//err2 := CancelTask(t.WebsiteId)
//if err2 != nil {
// logrus.Error(err2)
//}
continue
} else if err != nil {
logrus.WithError(err).
Error("Failed to get new task")
time.Sleep(viper.GetDuration(ConfCooldown))
continue
}
ScheduleTask(inRemotes, t, &baseUri)
}
}
shutdown:
globalWait.Wait()
}
func cmdCrawler(_ *cobra.Command, args []string) error {
onlineMode = false
readConfig()
arg := args[0]
// https://github.com/golang/go/issues/19779 // https://github.com/golang/go/issues/19779
if !strings.Contains(arg, "://") { if !strings.Contains(arg, "://") {
arg = "http://" + arg arg = "http://" + arg
@@ -61,34 +152,51 @@ func cmdCrawler(clic *cli.Context) error {
u.Path += "/" u.Path += "/"
} }
if err != nil { return err } if err != nil { return err }
remotes[i] = &OD {
Task: &Task{
WebsiteId: 0,
Url: u.String(),
},
BaseUri: u,
}
}
c := context.Background() // TODO Graceful shutdown
forceCtx := context.Background()
inRemotes := make(chan *OD) inRemotes := make(chan *OD)
go Schedule(c, inRemotes) go Schedule(forceCtx, inRemotes)
for _, remote := range remotes { ticker := time.NewTicker(3 * time.Second)
globalWait.Add(1) defer ticker.Stop()
inRemotes <- remote
task := Task {
WebsiteId: 0,
Url: u.String(),
} }
ScheduleTask(inRemotes, &task, &u)
// Wait for all jobs to finish // Wait for all jobs to finish
globalWait.Wait() globalWait.Wait()
logrus.Info("All dirs processed!")
return nil return nil
} }
var buildDate = time.Date( func listenCtrlC(soft, hard context.CancelFunc) {
2018, 11, 15, c := make(chan os.Signal)
23, 24, 0, 0, signal.Notify(c, os.Interrupt)
time.UTC)
<-c
logrus.Info(">>> Shutting down crawler... <<<")
soft()
<-c
logrus.Warning(">>> Force shutdown! <<<")
hard()
}
func hardShutdown(c context.Context) {
<-c.Done()
os.Exit(1)
}
func sleep(d time.Duration, c context.Context) bool {
select {
case <-time.After(d):
return true
case <-c.Done():
return false
}
}

View File

@@ -1,14 +1,27 @@
package main package main
import ( import (
"github.com/terorie/oddb-go/ds/redblackhash" "github.com/terorie/od-database-crawler/ds/redblackhash"
"github.com/terorie/oddb-go/fasturl" "github.com/terorie/od-database-crawler/fasturl"
"sync"
"time" "time"
) )
type Task struct {
WebsiteId uint64 `json:"website_id"`
Url string `json:"url"`
}
type TaskResult struct {
StatusCode string `json:"status_code"`
FileCount uint64 `json:"file_count"`
ErrorCount uint64 `json:"-"`
StartTime time.Time `json:"-"`
StartTimeUnix int64 `json:"start_time"`
EndTimeUnix int64 `json:"end_time"`
WebsiteId uint64 `json:"website_id"`
}
type Job struct { type Job struct {
OD *OD
Uri fasturl.URL Uri fasturl.URL
UriStr string UriStr string
Fails int Fails int
@@ -16,26 +29,32 @@ type Job struct {
} }
type OD struct { type OD struct {
Task *Task Task Task
Wait sync.WaitGroup Result TaskResult
InProgress int64
BaseUri fasturl.URL BaseUri fasturl.URL
WCtx WorkerContext WCtx WorkerContext
Scanned redblackhash.Tree Scanned redblackhash.Tree
}
lock sync.Mutex type PausedOD struct {
Task *Task
Result *TaskResult
BaseUri *fasturl.URL
InProgress int64
} }
type File struct { type File struct {
Name string `json:"name"` Name string `json:"name"`
Size int64 `json:"size"` Size int64 `json:"size"`
MTime time.Time `json:"mtime"` MTime int64 `json:"mtime"`
Path string `json:"path"` Path string `json:"path"`
IsDir bool `json:"-"` IsDir bool `json:"-"`
} }
func (o *OD) LoadOrStoreKey(k *redblackhash.Key) (exists bool) { func (o *OD) LoadOrStoreKey(k *redblackhash.Key) (exists bool) {
o.lock.Lock() o.Scanned.Lock()
defer o.lock.Unlock() defer o.Scanned.Unlock()
exists = o.Scanned.Get(k) exists = o.Scanned.Get(k)
if exists { return true } if exists { return true }
@@ -43,3 +62,8 @@ func (o *OD) LoadOrStoreKey(k *redblackhash.Key) (exists bool) {
o.Scanned.Put(k) o.Scanned.Put(k)
return false return false
} }
type errorString string
func (e errorString) Error() string {
return string(e)
}

129
queue.go Normal file
View File

@@ -0,0 +1,129 @@
package main
import (
"github.com/beeker1121/goque"
"os"
"sync"
"sync/atomic"
)
type BufferedQueue struct {
dataDir string
q *goque.Queue
buf []Job
m sync.Mutex
}
func OpenQueue(dataDir string) (bq *BufferedQueue, err error) {
bq = new(BufferedQueue)
if config.JobBufferSize < 0 {
return
}
bq.dataDir = dataDir
bq.q, err = goque.OpenQueue(dataDir)
if err != nil { return nil, err }
return
}
func (q *BufferedQueue) Enqueue(job *Job) error {
atomic.AddInt64(&totalQueued, 1)
if q.directEnqueue(job) {
return nil
}
var gob JobGob
gob.ToGob(job)
_, err := q.q.EnqueueObject(gob)
return err
}
func (q *BufferedQueue) Dequeue() (job Job, err error) {
if q.directDequeue(&job) {
atomic.AddInt64(&totalQueued, -1)
return job, nil
}
if config.JobBufferSize < 0 {
err = goque.ErrEmpty
return
}
var item *goque.Item
item, err = q.q.Dequeue()
if err != nil { return }
atomic.AddInt64(&totalQueued, -1)
var gob JobGob
err = item.ToObject(&gob)
if err != nil { return }
gob.FromGob(&job)
return
}
func (q *BufferedQueue) directEnqueue(job *Job) bool {
q.m.Lock()
defer q.m.Unlock()
bs := config.JobBufferSize
if len(q.buf) < bs || bs < 0 {
q.buf = append(q.buf, *job)
return true
} else {
return false
}
}
func (q *BufferedQueue) directDequeue(job *Job) bool {
q.m.Lock()
defer q.m.Unlock()
if len(q.buf) > 0 {
*job = q.buf[0]
q.buf = q.buf[1:]
return true
} else {
return false
}
}
// Always returns nil (But implements io.Closer)
func (q *BufferedQueue) Close() error {
if config.JobBufferSize < 0 {
return nil
}
// Close ignoring errors
q.q.Close()
// Delete files
if err := os.RemoveAll(q.dataDir);
err != nil { panic(err) }
return nil
}
type JobGob struct {
Uri string
Fails int
LastError string
}
func (g *JobGob) ToGob(j *Job) {
g.Uri = j.UriStr
g.Fails = j.Fails
if j.LastError != nil {
g.LastError = j.LastError.Error()
}
}
func (g *JobGob) FromGob(j *Job) {
if err := j.Uri.Parse(g.Uri);
err != nil { panic(err) }
j.UriStr = g.Uri
j.Fails = g.Fails
if g.LastError != "" {
j.LastError = errorString(g.LastError)
}
}

25
release.sh Executable file
View File

@@ -0,0 +1,25 @@
#!/usr/bin/env bash
appname="od-database-crawler"
tag=$1
[ -z "$tag" ] && echo "Usage: build <version>" && exit 1
name=${appname}-${tag}-windows.exe
GOOS="windows" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name
name=${appname}-${tag}-linux
GOOS="linux" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name
name=${appname}-${tag}-mac
GOOS="darwin" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name
name=${appname}-${tag}-freebsd
GOOS="freebsd" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name

270
resume.go Normal file
View File

@@ -0,0 +1,270 @@
package main
import (
"bytes"
"encoding/binary"
"encoding/gob"
"fmt"
"github.com/beeker1121/goque"
"github.com/sirupsen/logrus"
"github.com/spf13/viper"
"io"
"os"
"path/filepath"
"strconv"
"sync/atomic"
"time"
)
func init() {
gob.Register(&PausedOD{})
}
func LoadResumeTasks(inRemotes chan<- *OD) {
resumed, err := ResumeTasks()
if err != nil {
logrus.WithError(err).
Error("Failed to resume queued tasks. " +
"/queue is probably corrupt")
err = nil
}
for _, remote := range resumed {
inRemotes <- remote
}
}
func ResumeTasks() (tasks []*OD, err error) {
// Get files in /queue
var queueF *os.File
var entries []os.FileInfo
queueF, err = os.Open("queue")
if err != nil { return nil, err }
defer queueF.Close()
entries, err = queueF.Readdir(-1)
if err != nil { return nil, err }
resumeDur := viper.GetDuration(ConfResume)
for _, entry := range entries {
if !entry.IsDir() { continue }
// Check if name is a number
var id uint64
if id, err = strconv.ParseUint(entry.Name(), 10, 64); err != nil {
continue
}
// Too old to be resumed
timeDelta := time.Since(entry.ModTime())
if resumeDur >= 0 && timeDelta > resumeDur {
removeOldQueue(id)
continue
}
// Load queue
var od *OD
if od, err = resumeQueue(id); err != nil {
logrus.WithError(err).
WithField("id", id).
Warning("Failed to load paused task")
continue
} else if od == nil {
removeOldQueue(id)
continue
}
tasks = append(tasks, od)
}
return tasks, nil
}
func SaveTask(od *OD) (err error) {
dir := filepath.Join("queue",
strconv.FormatUint(od.Task.WebsiteId, 10))
fPath := filepath.Join(dir, "PAUSED")
err = os.Mkdir(dir, 0777)
if err != nil { return err }
// Open pause file
pausedF, err := os.OpenFile(fPath, os.O_CREATE | os.O_WRONLY | os.O_TRUNC, 0666)
if err != nil { return err }
defer pausedF.Close()
err = writePauseFile(od, pausedF)
if err != nil { return err }
return nil
}
func resumeQueue(id uint64) (od *OD, err error) {
logrus.WithField("id", id).
Info("Found unfinished")
fPath := filepath.Join("queue", strconv.FormatUint(id, 10))
// Try to find pause file
pausedF, err := os.Open(filepath.Join(fPath, "PAUSED"))
if os.IsNotExist(err) {
// No PAUSED file => not paused
// not paused => no error
return nil, nil
} else if err != nil {
return nil, err
}
defer pausedF.Close()
od = new(OD)
od.WCtx.OD = od
err = readPauseFile(od, pausedF)
if err != nil { return nil, err }
// Open queue
bq, err := OpenQueue(fPath)
if err != nil { return nil, err }
od.WCtx.Queue = bq
logrus.WithField("id", id).
Info("Resuming task")
return od, nil
}
func removeOldQueue(id uint64) {
if id == 0 {
// TODO Make custom crawl less of an ugly hack
return
}
logrus.WithField("id", id).
Warning("Deleting & returning old task")
name := strconv.FormatUint(id, 10)
fPath := filepath.Join("queue", name)
// Acquire old queue
q, err := goque.OpenQueue(fPath)
if err != nil {
// Queue lock exists, don't delete
logrus.WithField("err", err).
WithField("path", fPath).
Error("Failed to acquire old task")
return
}
// Delete old queue from disk
err = q.Drop()
if err != nil {
// Queue lock exists, don't delete
logrus.WithField("err", err).
WithField("path", fPath).
Error("Failed to delete old task")
return
}
// Delete old crawl result from disk
_ = os.Remove(filepath.Join("crawled", name + ".json"))
// Return task to server
if err := CancelTask(id); err != nil {
// Queue lock exists, don't delete
logrus.WithField("err", err).
WithField("id", id).
Warning("Failed to return unfinished task to server")
return
}
}
func writePauseFile(od *OD, w io.Writer) (err error) {
// Write pause file version
_, err = w.Write([]byte("ODPAUSE-"))
if err != nil { return err }
// Create save state
paused := PausedOD {
Task: &od.Task,
Result: &od.Result,
BaseUri: &od.BaseUri,
InProgress: atomic.LoadInt64(&od.InProgress),
}
// Prepare pause settings
var b bytes.Buffer
pauseEnc := gob.NewEncoder(&b)
err = pauseEnc.Encode(&paused)
if err != nil { return err }
// Write length of pause settings
err = binary.Write(w, binary.LittleEndian, uint64(b.Len()))
if err != nil { return err }
// Write pause settings
_, err = w.Write(b.Bytes())
if err != nil { return err }
// Write pause scan state
err = od.Scanned.Marshal(w)
if err != nil { return err }
// Save mark
_, err = w.Write([]byte("--------"))
if err != nil { return err }
return nil
}
func readPauseFile(od *OD, r io.Reader) (err error) {
// Make the paused struct point to OD fields
// So gob loads values into the OD struct
paused := PausedOD {
Task: &od.Task,
Result: &od.Result,
BaseUri: &od.BaseUri,
}
var version [8]byte
_, err = io.ReadFull(r, version[:])
if err != nil { return err }
if !bytes.Equal(version[:], []byte("ODPAUSE-")) {
return fmt.Errorf("unsupported pause file")
}
// Read pause settings len
var pauseSettingsLen uint64
err = binary.Read(r, binary.LittleEndian, &pauseSettingsLen)
// Read pause settings
pauseDec := gob.NewDecoder(io.LimitReader(r, int64(pauseSettingsLen)))
err = pauseDec.Decode(&paused)
if err != nil { return err }
atomic.StoreInt64(&od.InProgress, paused.InProgress)
err = readPauseStateTree(od, r)
if err != nil {
return fmt.Errorf("failed to read state tree: %s", err)
}
return nil
}
func readPauseStateTree(od *OD, r io.Reader) (err error) {
// Read pause scan state
err = od.Scanned.Unmarshal(r)
if err != nil { return err }
// Check mark
var mark [8]byte
_, err = io.ReadFull(r, mark[:])
if err != nil { return err }
if !bytes.Equal(mark[:], []byte("--------")) {
return fmt.Errorf("corrupt pause file")
}
return nil
}

48
resume_test.go Normal file
View File

@@ -0,0 +1,48 @@
package main
import (
"bytes"
"github.com/terorie/od-database-crawler/fasturl"
"testing"
"time"
)
func TestResumeTasks_Empty(t *testing.T) {
start := time.Now().Add(-1 * time.Minute)
od := OD {
Task: Task {
WebsiteId: 213,
Url: "https://the-eye.eu/public/",
},
Result: TaskResult {
StartTime: start,
StartTimeUnix: start.Unix(),
EndTimeUnix: time.Now().Unix(),
WebsiteId: 213,
},
InProgress: 0,
BaseUri: fasturl.URL {
Scheme: fasturl.SchemeHTTPS,
Host: "the-eye.eu",
Path: "/public/",
},
}
od.WCtx.OD = &od
var b bytes.Buffer
var err error
err = writePauseFile(&od, &b)
if err != nil {
t.Fatal(err)
}
buf := b.Bytes()
var od2 OD
b2 := bytes.NewBuffer(buf)
err = readPauseFile(&od2, b2)
if err != nil {
t.Fatal(err)
}
}

View File

@@ -5,34 +5,56 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/terorie/od-database-crawler/fasturl"
"os" "os"
"path" "path"
"sync"
"sync/atomic" "sync/atomic"
"time"
) )
var activeTasks int32 var activeTasksLock sync.Mutex
var totalBuffered int64 var activeTasks = make(map[uint64]bool)
var numActiveTasks int32
var totalQueued int64
func Schedule(c context.Context, remotes <-chan *OD) { func Schedule(c context.Context, remotes <-chan *OD) {
go Stats(c) go Stats(c)
for remote := range remotes { for remote := range remotes {
if !scheduleNewTask(c, remote) {
return
}
}
}
func scheduleNewTask(c context.Context, remote *OD) bool {
logrus.WithField("url", remote.BaseUri.String()). logrus.WithField("url", remote.BaseUri.String()).
Info("Starting crawler") Info("Starting crawler")
// Collect results // Collect results
results := make(chan File) results := make(chan File)
remote.WCtx.OD = remote
// Get queue path
queuePath := path.Join("queue", fmt.Sprintf("%d", remote.Task.WebsiteId))
// Delete existing queue
if err := os.RemoveAll(queuePath);
err != nil { panic(err) }
// Start new queue
var err error
remote.WCtx.Queue, err = OpenQueue(queuePath)
if err != nil { panic(err) }
// Spawn workers // Spawn workers
remote.WCtx.in, remote.WCtx.out = makeJobBuffer(c) remote.WCtx.SpawnWorkers(c, results, config.Workers)
for i := 0; i < config.Workers; i++ {
go remote.WCtx.Worker(results)
}
// Enqueue initial job // Enqueue initial job
atomic.AddInt32(&activeTasks, 1) atomic.AddInt32(&numActiveTasks, 1)
remote.WCtx.queueJob(Job{ remote.WCtx.queueJob(Job{
OD: remote,
Uri: remote.BaseUri, Uri: remote.BaseUri,
UriStr: remote.BaseUri.String(), UriStr: remote.BaseUri.String(),
Fails: 0, Fails: 0,
@@ -40,46 +62,199 @@ func Schedule(c context.Context, remotes <-chan *OD) {
// Upload result when ready // Upload result when ready
go remote.Watch(results) go remote.Watch(results)
// Sleep if max number of tasks are active
for atomic.LoadInt32(&numActiveTasks) > config.Tasks {
if !sleep(time.Second, c) {
break
} }
} }
func (r *OD) Watch(results chan File) { return true
go r.Task.Collect(results) }
func ScheduleTask(remotes chan<- *OD, t *Task, u *fasturl.URL) {
if !t.register() {
return
}
globalWait.Add(1)
now := time.Now()
od := &OD {
Task: *t,
BaseUri: *u,
Result: TaskResult {
WebsiteId: t.WebsiteId,
StartTime: now,
StartTimeUnix: now.Unix(),
},
}
remotes <- od
}
func (t *Task) register() bool {
activeTasksLock.Lock()
defer activeTasksLock.Unlock()
if _, known := activeTasks[t.WebsiteId]; known {
return false
} else {
activeTasks[t.WebsiteId] = true
return true
}
}
func (t *Task) unregister() {
activeTasksLock.Lock()
delete(activeTasks, t.WebsiteId)
activeTasksLock.Unlock()
}
func (o *OD) Watch(results chan File) {
// Mark job as completely done
defer globalWait.Done()
defer o.Task.unregister()
filePath := path.Join("crawled", fmt.Sprintf("%d.json", o.Task.WebsiteId))
// Open crawl results file
f, err := os.OpenFile(
filePath,
os.O_CREATE | os.O_RDWR | os.O_APPEND,
0644,
)
if err != nil {
logrus.WithError(err).
Error("Failed saving crawl results")
return
}
defer f.Close()
defer os.Remove(filePath)
// Listen for exit code of Collect()
collectErrC := make(chan error)
// Block until all results are written
// (closes results channel)
o.handleCollect(results, f, collectErrC)
// Exit code of Collect()
err = <-collectErrC
close(collectErrC)
if err != nil {
logrus.WithError(err).
Error("Failed saving crawl results")
return
}
// Upload results
err = PushResult(&o.Result, f)
if err != nil {
logrus.WithError(err).
Error("Failed uploading crawl results")
return
}
}
func (o *OD) handleCollect(results chan File, f *os.File, collectErrC chan error) {
// Begin collecting results
go o.Task.Collect(results, f, collectErrC)
defer close(results)
// Wait for all jobs on remote to finish // Wait for all jobs on remote to finish
r.Wait.Wait() for {
close(r.WCtx.in) // Natural finish
atomic.AddInt32(&activeTasks, -1) if atomic.LoadInt64(&o.InProgress) == 0 {
o.onTaskFinished()
logrus.WithField("url", r.BaseUri.String()). return
Info("Crawler finished") }
// Abort
globalWait.Done() if atomic.LoadInt32(&o.WCtx.aborted) != 0 {
// Wait for all workers to finish
close(results) o.WCtx.workers.Wait()
o.onTaskPaused()
return
} }
func (t *Task) Collect(results chan File) { time.Sleep(500 * time.Millisecond)
err := t.collect(results) }
}
func (o *OD) onTaskFinished() {
defer atomic.AddInt32(&numActiveTasks, -1)
// Close queue
if err := o.WCtx.Queue.Close(); err != nil {
panic(err)
}
// Log finish
logrus.WithFields(logrus.Fields{
"id": o.Task.WebsiteId,
"url": o.BaseUri.String(),
"duration": time.Since(o.Result.StartTime),
}).Info("Crawler finished")
// Set status code
now := time.Now()
o.Result.EndTimeUnix = now.Unix()
fileCount := atomic.LoadUint64(&o.Result.FileCount)
if fileCount == 0 {
errorCount := atomic.LoadUint64(&o.Result.ErrorCount)
if errorCount == 0 {
o.Result.StatusCode = "empty"
} else {
o.Result.StatusCode = "directory listing failed"
}
} else {
o.Result.StatusCode = "success"
}
}
func (o *OD) onTaskPaused() {
defer atomic.AddInt32(&numActiveTasks, -1)
// Close queue
if err := o.WCtx.Queue.Close(); err != nil {
panic(err)
}
// Set current end time
o.Result.EndTimeUnix = time.Now().Unix()
// Save task metadata
err := SaveTask(o)
if err != nil {
// Log finish
logrus.WithFields(logrus.Fields{
"err": err.Error(),
"id": o.Task.WebsiteId,
"url": o.BaseUri.String(),
}).Error("Failed to save crawler state")
return
}
// Log finish
logrus.WithFields(logrus.Fields{
"id": o.Task.WebsiteId,
"url": o.BaseUri.String(),
"duration": time.Since(o.Result.StartTime),
}).Info("Crawler paused")
}
func (t *Task) Collect(results chan File, f *os.File, errC chan<- error) {
err := t.collect(results, f)
if err != nil { if err != nil {
logrus.WithError(err). logrus.WithError(err).
Error("Failed saving crawl results") Error("Failed saving crawl results")
} }
errC <- err
} }
func (t *Task) collect(results chan File) error { func (t *Task) collect(results chan File, f *os.File) error {
err := os.MkdirAll("crawled", 0755)
if err != nil { return err }
f, err := os.OpenFile(
path.Join("crawled", fmt.Sprintf("%d.json", t.WebsiteId)),
os.O_CREATE | os.O_WRONLY | os.O_TRUNC,
0755,
)
if err != nil { return err }
defer f.Close()
for result := range results { for result := range results {
result.Path = fasturl.PathUnescape(result.Path)
result.Name = fasturl.PathUnescape(result.Name)
resJson, err := json.Marshal(result) resJson, err := json.Marshal(result)
if err != nil { panic(err) } if err != nil { panic(err) }
_, err = f.Write(resJson) _, err = f.Write(resJson)
@@ -90,51 +265,3 @@ func (t *Task) collect(results chan File) error {
return nil return nil
} }
func makeJobBuffer(c context.Context) (chan<- Job, <-chan Job) {
in := make(chan Job)
out := make(chan Job)
go bufferJobs(c, in, out)
return in, out
}
func bufferJobs(c context.Context, in chan Job, out chan Job) {
defer close(out)
var inQueue []Job
outCh := func() chan Job {
if len(inQueue) == 0 {
return nil
}
return out
}
for len(inQueue) > 0 || in != nil {
if len(inQueue) == 0 {
select {
case v, ok := <-in:
if !ok {
in = nil
} else {
atomic.AddInt64(&totalBuffered, 1)
inQueue = append(inQueue, v)
}
case <-c.Done():
return
}
} else {
select {
case v, ok := <-in:
if !ok {
in = nil
} else {
atomic.AddInt64(&totalBuffered, 1)
inQueue = append(inQueue, v)
}
case outCh() <- inQueue[0]:
atomic.AddInt64(&totalBuffered, -1)
inQueue = inQueue[1:]
case <-c.Done():
return
}
}
}
}

160
server.go
View File

@@ -5,39 +5,37 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/spf13/viper"
"io" "io"
"mime/multipart" "mime/multipart"
"net/http" "net/http"
"net/url" "net/url"
"os" "os"
"path/filepath"
"strconv" "strconv"
"strings" "time"
) )
const ( var serverClient = http.Client {
fileListChunkSize int64 = 5000000 // 5 mb Timeout: config.ServerTimeout,
) Transport: new(ServerTripper),
}
var serverClient = http.DefaultClient var serverUserAgent = "od-database-crawler/" + rootCmd.Version
func FetchTask() (t *Task, err error) { func FetchTask() (t *Task, err error) {
escToken, _ := json.Marshal(config.Token) res, err := serverClient.PostForm(
payload := `{"token":` + string(escToken) + `}`
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/get", config.ServerUrl + "/task/get",
strings.NewReader(payload)) url.Values{ "token": {config.Token} })
if err != nil { return }
res, err := serverClient.Do(req)
if err != nil { return } if err != nil { return }
defer res.Body.Close() defer res.Body.Close()
if res.StatusCode != 200 { switch res.StatusCode {
err = fmt.Errorf("http %s", res.Status) case 200:
return break
case 404, 500:
return nil, nil
default:
return nil, fmt.Errorf("http %s", res.Status)
} }
t = new(Task) t = new(Task)
@@ -47,21 +45,17 @@ func FetchTask() (t *Task, err error) {
return return
} }
func PushResult(result *TaskResult) (err error) { func PushResult(result *TaskResult, f *os.File) (err error) {
filePath := filepath.Join( if result.WebsiteId == 0 {
".", "crawled", // Not a real result, don't push
fmt.Sprintf("%d.json", result.WebsiteId)) return nil
}
defer os.Remove(filePath) // Rewind to the beginning of the file
_, err = f.Seek(0, 0)
f, err := os.Open(filePath) if err != nil {
if os.IsNotExist(err) {
err = fmt.Errorf("cannot upload result: %s does not exist", filePath)
return
} else if err != nil {
return return
} }
defer f.Close()
err = uploadChunks(result.WebsiteId, f) err = uploadChunks(result.WebsiteId, f)
if err != nil { if err != nil {
@@ -73,104 +67,109 @@ func PushResult(result *TaskResult) (err error) {
return return
} }
err = uploadResult(result) // Upload result ignoring errors
if err != nil { uploadResult(result)
logrus.Errorf("Failed to upload result: %s", err)
err2 := CancelTask(result.WebsiteId)
if err2 != nil {
logrus.Error(err2)
}
return
}
return return
} }
func uploadChunks(websiteId uint64, f *os.File) (err error) { func uploadChunks(websiteId uint64, f *os.File) error {
for iter := 1; iter > 0; iter++ { eof := false
for iter := 1; !eof; iter++ {
// TODO Stream with io.Pipe? // TODO Stream with io.Pipe?
var b bytes.Buffer var b bytes.Buffer
multi := multipart.NewWriter(&b) multi := multipart.NewWriter(&b)
// Set upload fields // Set upload fields
var err error
err = multi.WriteField("token", config.Token) err = multi.WriteField("token", config.Token)
if err != nil { return } if err != nil { return err }
err = multi.WriteField("website_id", fmt.Sprintf("%d", websiteId)) err = multi.WriteField("website_id", fmt.Sprintf("%d", websiteId))
if err != nil { return } if err != nil { return err }
// Copy chunk to file_list // Copy chunk to file_list
formFile, err := multi.CreateFormFile("file_list", "file_list") formFile, err := multi.CreateFormFile("file_list", "file_list")
_, err = io.CopyN(formFile, f, fileListChunkSize) var n int64
if err == io.EOF { n, err = io.CopyN(formFile, f, config.ChunkSize)
break if err != io.EOF && err != nil {
} else if err == io.ErrUnexpectedEOF { return err
}
if n == 0 {
// Don't upload, no content
return nil
} else if n < config.ChunkSize {
err = nil err = nil
// Break at end of iteration // Break at end of iteration
iter = -420 eof = true
}
multi.Close()
for retries := 0; retries < viper.GetInt(ConfUploadRetries); retries++ {
if retries > 0 {
// Error occurred, retry upload
time.Sleep(viper.GetDuration(ConfUploadRetryInterval))
} }
req, err := http.NewRequest( req, err := http.NewRequest(
http.MethodPost, http.MethodPost,
config.ServerUrl + "/task/upload", config.ServerUrl + "/task/upload",
&b) &b)
if err != nil { return err } req.Header.Set("content-type", multi.FormDataContentType())
if err != nil { continue }
res, err := serverClient.Do(req) res, err := serverClient.Do(req)
if err != nil { return err } if err != nil { continue }
res.Body.Close() res.Body.Close()
if res.StatusCode != http.StatusOK { if res.StatusCode != http.StatusOK {
return fmt.Errorf("failed to upload list part %d: %s", logrus.WithField("status", res.Status).
iter, res.Status) WithField("part", iter).
Errorf("Upload failed")
continue
} }
logrus.Infof("Uploading file list part %d: %s", // Upload successful
iter, res.Status) break
} }
return
logrus.WithField("id", websiteId).
WithField("part", iter).
Infof("Uploaded files chunk")
}
return nil
} }
func uploadResult(result *TaskResult) (err error) { func uploadResult(result *TaskResult) (err error) {
resultEnc, err := json.Marshal(result) resultEnc, err := json.Marshal(result)
if err != nil { panic(err) } if err != nil { panic(err) }
payload := url.Values { res, err := serverClient.PostForm(
config.ServerUrl + "/task/complete",
url.Values {
"token": {config.Token}, "token": {config.Token},
"result": {string(resultEnc)}, "result": {string(resultEnc)},
}.Encode() },
)
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/complete",
strings.NewReader(payload))
if err != nil { return }
res, err := serverClient.Do(req)
if err != nil { return } if err != nil { return }
res.Body.Close() res.Body.Close()
if res.StatusCode != http.StatusOK { if res.StatusCode != http.StatusOK {
return fmt.Errorf("failed to cancel task: %s", res.Status) return HttpError{res.StatusCode}
} }
return return
} }
func CancelTask(websiteId uint64) (err error) { func CancelTask(websiteId uint64) (err error) {
form := url.Values{ res, err := serverClient.PostForm(
config.ServerUrl + "/task/cancel",
url.Values{
"token": {config.Token}, "token": {config.Token},
"website_id": {strconv.FormatUint(websiteId, 10)}, "website_id": {strconv.FormatUint(websiteId, 10)},
} },
encForm := form.Encode() )
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/cancel",
strings.NewReader(encForm))
if err != nil { return }
res, err := serverClient.Do(req)
if err != nil { return } if err != nil { return }
res.Body.Close() res.Body.Close()
@@ -180,3 +179,10 @@ func CancelTask(websiteId uint64) (err error) {
return return
} }
type ServerTripper struct{}
func (t *ServerTripper) RoundTrip(req *http.Request) (res *http.Response, err error) {
req.Header.Set("User-Agent", serverUserAgent)
return http.DefaultTransport.RoundTrip(req)
}

View File

@@ -3,6 +3,7 @@ package main
import ( import (
"context" "context"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/spf13/viper"
"math" "math"
"runtime" "runtime"
"sync/atomic" "sync/atomic"
@@ -19,11 +20,14 @@ func Stats(c context.Context) {
var crawlTicker <-chan time.Time var crawlTicker <-chan time.Time
var allocTicker <-chan time.Time var allocTicker <-chan time.Time
if config.CrawlStats != 0 { crawlInterval := viper.GetDuration(ConfCrawlStats)
crawlTicker = time.NewTicker(config.CrawlStats).C allocInterval := viper.GetDuration(ConfAllocStats)
if crawlInterval != 0 {
crawlTicker = time.Tick(crawlInterval)
} }
if config.AllocStats != 0 { if allocInterval != 0 {
allocTicker = time.NewTicker(config.AllocStats).C allocTicker = time.Tick(allocInterval)
} }
for { for {
@@ -32,13 +36,17 @@ func Stats(c context.Context) {
startedNow := atomic.LoadUint64(&totalStarted) startedNow := atomic.LoadUint64(&totalStarted)
perSecond := float64(startedNow - startedLast) / perSecond := float64(startedNow - startedLast) /
config.CrawlStats.Seconds() crawlInterval.Seconds()
// Round to .5 // Round to .5
perSecond *= 2 perSecond *= 2
perSecond = math.Round(perSecond) perSecond = math.Round(perSecond)
perSecond /= 2 perSecond /= 2
if perSecond <= 0 {
continue
}
logrus.WithFields(logrus.Fields{ logrus.WithFields(logrus.Fields{
"per_second": perSecond, "per_second": perSecond,
"done": atomic.LoadUint64(&totalDone), "done": atomic.LoadUint64(&totalDone),
@@ -53,7 +61,7 @@ func Stats(c context.Context) {
runtime.ReadMemStats(&mem) runtime.ReadMemStats(&mem)
logrus.WithFields(logrus.Fields{ logrus.WithFields(logrus.Fields{
"queue_count": totalBuffered, "queue_count": atomic.LoadInt64(&totalQueued),
"heap": FormatByteCount(mem.Alloc), "heap": FormatByteCount(mem.Alloc),
"objects": mem.HeapObjects, "objects": mem.HeapObjects,
"num_gc": mem.NumGC, "num_gc": mem.NumGC,

View File

@@ -1,16 +0,0 @@
package main
import "time"
type Task struct {
WebsiteId int `json:"website_id"`
Url string `json:"url"`
}
type TaskResult struct {
StatusCode int `json:"status_code"`
FileCount uint64 `json:"file_count"`
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
WebsiteId uint64 `json:"website_id"`
}

22
util.go
View File

@@ -1,6 +1,9 @@
package main package main
import "fmt" import (
"fmt"
"sync"
)
// https://programming.guide/go/formatting-byte-size-to-human-readable-format.html // https://programming.guide/go/formatting-byte-size-to-human-readable-format.html
func FormatByteCount(b uint64) string { func FormatByteCount(b uint64) string {
@@ -16,3 +19,20 @@ func FormatByteCount(b uint64) string {
return fmt.Sprintf("%.1f %ciB", float64(b)/float64(div), "KMGTPE"[exp]) return fmt.Sprintf("%.1f %ciB", float64(b)/float64(div), "KMGTPE"[exp])
} }
} }
type Hooks struct {
m sync.Mutex
l []func()
}
func (h *Hooks) Add(hook func()) {
h.m.Lock()
h.l = append(h.l, hook)
h.m.Unlock()
}
func (h *Hooks) Execute() {
for _, hook := range h.l {
hook()
}
}

View File

@@ -1,6 +1,8 @@
package main package main
import ( import (
"context"
"github.com/beeker1121/goque"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"math" "math"
"sort" "sort"
@@ -13,24 +15,57 @@ import (
var globalWait sync.WaitGroup var globalWait sync.WaitGroup
type WorkerContext struct { type WorkerContext struct {
in chan<- Job OD *OD
out <-chan Job Queue *BufferedQueue
lastRateLimit time.Time lastRateLimit time.Time
numRateLimits int numRateLimits int
workers sync.WaitGroup
aborted int32
} }
func (w WorkerContext) Worker(results chan<- File) { func (w *WorkerContext) SpawnWorkers(c context.Context, results chan<- File, n int) {
for job := range w.out { w.workers.Add(n)
for i := 0; i < n; i++ {
go w.Worker(c, results)
}
}
func (w *WorkerContext) Worker(c context.Context, results chan<- File) {
defer w.workers.Done()
for {
select {
case <-c.Done():
// Not yet done
atomic.StoreInt32(&w.aborted, 1)
return
default:
}
job, err := w.Queue.Dequeue()
switch err {
case goque.ErrEmpty:
time.Sleep(500 * time.Millisecond)
continue
case goque.ErrDBClosed:
return
case nil:
w.step(results, job) w.step(results, job)
default:
panic(err)
}
} }
} }
func (w WorkerContext) step(results chan<- File, job Job) { func (w *WorkerContext) step(results chan<- File, job Job) {
defer w.finishJob(&job) defer w.finishJob()
var f File var f File
newJobs, err := DoJob(&job, &f) newJobs, err := w.DoJob(&job, &f)
atomic.AddUint64(&totalStarted, 1) atomic.AddUint64(&totalStarted, 1)
if err == ErrKnown { if err == ErrKnown {
return return
@@ -39,8 +74,11 @@ func (w WorkerContext) step(results chan<- File, job Job) {
if err != nil { if err != nil {
job.Fails++ job.Fails++
if err == ErrForbidden { if !shouldRetry(err) {
// Don't attempt crawling again atomic.AddUint64(&totalAborted, 1)
logrus.WithField("url", job.UriStr).
WithError(err).
Error("Giving up after failure")
return return
} }
@@ -69,15 +107,17 @@ func (w WorkerContext) step(results chan<- File, job Job) {
} }
} }
func DoJob(job *Job, f *File) (newJobs []Job, err error) { func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) {
if len(job.Uri.Path) == 0 { return } if len(job.Uri.Path) == 0 { return }
if job.Uri.Path[len(job.Uri.Path)-1] == '/' { if job.Uri.Path[len(job.Uri.Path)-1] == '/' {
// Load directory // Load directory
links, err := GetDir(job, f) links, err := GetDir(job, f)
if err != nil { if err != nil {
if !isErrSilent(err) {
logrus.WithError(err). logrus.WithError(err).
WithField("url", job.UriStr). WithField("url", job.UriStr).
Error("Failed getting dir") Error("Failed to crawl dir")
}
return nil, err return nil, err
} }
@@ -85,7 +125,7 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
hash := f.HashDir(links) hash := f.HashDir(links)
// Skip symlinked dirs // Skip symlinked dirs
if job.OD.LoadOrStoreKey(&hash) { if w.OD.LoadOrStoreKey(&hash) {
return nil, ErrKnown return nil, ErrKnown
} }
@@ -105,9 +145,7 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
} }
lastLink = uriStr lastLink = uriStr
job.OD.Wait.Add(1)
newJobs = append(newJobs, Job{ newJobs = append(newJobs, Job{
OD: job.OD,
Uri: link, Uri: link,
UriStr: uriStr, UriStr: uriStr,
Fails: 0, Fails: 0,
@@ -125,17 +163,20 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
// Load file // Load file
err := GetFile(job.Uri, f) err := GetFile(job.Uri, f)
if err != nil { if err != nil {
if !isErrSilent(err) {
logrus.WithError(err). logrus.WithError(err).
WithField("url", job.UriStr). WithField("url", job.UriStr).
Error("Failed getting file") Error("Failed to crawl file")
}
return nil, err return nil, err
} }
atomic.AddUint64(&w.OD.Result.FileCount, 1)
} }
return return
} }
func (w WorkerContext) queueJob(job Job) { func (w *WorkerContext) queueJob(job Job) {
job.OD.Wait.Add(1) atomic.AddInt64(&w.OD.InProgress, 1)
if w.numRateLimits > 0 { if w.numRateLimits > 0 {
if time.Since(w.lastRateLimit) > 5 * time.Second { if time.Since(w.lastRateLimit) > 5 * time.Second {
@@ -143,13 +184,23 @@ func (w WorkerContext) queueJob(job Job) {
} else { } else {
time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) * time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) *
100 * time.Millisecond) 100 * time.Millisecond)
w.in <- job
}
} else {
w.in <- job
} }
} }
func (w WorkerContext) finishJob(job *Job) { if err := w.Queue.Enqueue(&job); err != nil {
job.OD.Wait.Done() panic(err)
}
}
func (w *WorkerContext) finishJob() {
atomic.AddInt64(&w.OD.InProgress, -1)
}
func isErrSilent(err error) bool {
if !config.PrintHTTP {
if _, ok := err.(*HttpError); ok {
return true
}
}
return false
} }