Mirror of https://github.com/terorie/od-database-crawler.git (synced 2025-12-14 07:39:03 +00:00)

Compare commits: v1.1.0...task_track (46 commits)
| SHA1 |
|---|
| a962c60b82 |
| 24f0bd91f7 |
| 84c10e1981 |
| 860fa79327 |
| 76bc8293d6 |
| 3470be6086 |
| 60471a081e |
| 0b3f0d87fe |
| da9c75e392 |
| 8947e05d0c |
| 8c5f99d616 |
| 206ea0e91d |
| 8b9d8bfd17 |
| c9ff102d80 |
| 88856c1c19 |
| 9e9b606250 |
| 326e29e5e4 |
| c2acd5463f |
| e4d04e6a5f |
| 9f1402e841 |
| 7c8ab50ee4 |
| 281d2d17d6 |
| 45cbd4d535 |
| 771d49f2dd |
| dbd787aa81 |
| cea6c1658b |
| 885af5bb3b |
| b18b70f798 |
| 9d5f549774 |
| 5239af08f7 |
| 46c0e0bd32 |
| 0ca6deede8 |
| 120c026983 |
| 527e8895ec |
| 108fff0503 |
| e5746baa5b |
| 17ba5583c9 |
| 92a8c07f4a |
| 43f96c6988 |
| b244cdae80 |
| 4b8275c7bf |
| f90bf94a44 |
| e82768ff80 |
| b1bf59adef |
| a2df2972f4 |
| 3fc8837dd7 |
.gitignore (vendored): 3 changes

```diff
@@ -1,3 +1,6 @@
 /.idea/
 .DS_Store
 /od-database-crawler
+*.log
+/queue/
+/crawled/
```
.travis.yml (new file): 5 lines

```diff
@@ -0,0 +1,5 @@
+language: go
+
+go:
+- "1.11.x"
+- master
```
Dockerfile (new file): 15 lines

```diff
@@ -0,0 +1,15 @@
+FROM golang:alpine as builder
+ADD . /go/src/github.com/terorie/od-database-crawler
+RUN apk add git \
+    && go get -d -v github.com/terorie/od-database-crawler \
+    && CGO_ENABLED=0 go install -a \
+       -installsuffix cgo \
+       -ldflags="-s -w" \
+       github.com/terorie/od-database-crawler
+
+FROM scratch
+COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
+COPY --from=builder /go/bin/od-database-crawler /bin/
+WORKDIR /oddb
+VOLUME [ "/oddb" ]
+CMD ["/bin/od-database-crawler", "server"]
```
README.md: 47 changes

```diff
@@ -1,7 +1,54 @@
 # od-database Go crawler 🚀
+[](https://travis-ci.org/terorie/od-database-crawler)
 > by terorie 2018 :P
+
 * Crawler for [__OD-Database__](https://github.com/simon987/od-database)
 * Crawls HTTP open directories (standard Web Server Listings)
 * Gets name, path, size and modification time of all files
 * Lightweight and fast: __over 9000 requests per second__ on a standard laptop
+
+https://od-db.the-eye.eu/
+
+## Usage
+
+### Deploys
+
+1. With Config File (if `config.yml` found in working dir)
+   - Download [default config](https://github.com/terorie/od-database-crawler/blob/master/config.yml)
+   - Set `server.url` and `server.token`
+   - Start with `./od-database-crawler server --config <file>`
+
+2. With Flags or env
+   - Override config file if it exists
+   - `--help` for list of flags
+   - Every flag is available as an environment variable:
+     `--server.crawl_stats` ➡️ `OD_SERVER_CRAWL_STATS`
+   - Start with `./od-database-crawler server <flags>`
+
+3. With Docker
+```bash
+docker run \
+    -e OD_SERVER_URL=xxx \
+    -e OD_SERVER_TOKEN=xxx \
+    terorie/od-database-crawler
+```
+
+### Flag reference
+
+Here are the most important config flags. For more fine control, take a look at `/config.yml`.
+
+| Flag/Environment | Description | Example |
+| --- | --- | --- |
+| `server.url`<br />`OD_SERVER_URL` | OD-DB Server URL | `https://od-db.mine.the-eye.eu/api` |
+| `server.token`<br />`OD_SERVER_TOKEN` | OD-DB Server Access Token | _Ask Hexa **TM**_ |
+| `server.recheck`<br />`OD_SERVER_RECHECK` | Job Fetching Interval | `3s` |
+| `output.crawl_stats`<br />`OD_OUTPUT_CRAWL_STATS` | Crawl Stats Logging Interval (0 = disabled) | `500ms` |
+| `output.resource_stats`<br />`OD_OUTPUT_RESOURCE_STATS` | Resource Stats Logging Interval (0 = disabled) | `8s` |
+| `output.log`<br />`OD_OUTPUT_LOG` | Log File (none = disabled) | `crawler.log` |
+| `crawl.tasks`<br />`OD_CRAWL_TASKS` | Max number of sites to crawl concurrently | `500` |
+| `crawl.connections`<br />`OD_CRAWL_CONNECTIONS` | HTTP connections per site | `1` |
+| `crawl.retries`<br />`OD_CRAWL_RETRIES` | How often to retry after a temporary failure (e.g. `HTTP 429` or timeouts) | `5` |
+| `crawl.dial_timeout`<br />`OD_CRAWL_DIAL_TIMEOUT` | TCP Connect timeout | `5s` |
+| `crawl.timeout`<br />`OD_CRAWL_TIMEOUT` | HTTP request timeout | `20s` |
+| `crawl.user-agent`<br />`OD_CRAWL_USER_AGENT` | HTTP Crawler User-Agent | `googlebot/1.2.3` |
+| `crawl.job_buffer`<br />`OD_CRAWL_JOB_BUFFER` | Number of URLs to keep in memory/cache, per job. The rest is offloaded to disk. Decrease this value if the crawler uses too much RAM. (0 = Disable Cache, -1 = Only use Cache) | `5000` |
```
config.go: 156 changes

```diff
@@ -4,6 +4,7 @@ import (
 	"bufio"
 	"fmt"
 	"github.com/sirupsen/logrus"
+	"github.com/spf13/pflag"
 	"github.com/spf13/viper"
 	"io"
 	"os"
@@ -12,8 +13,11 @@ import (
 )
 
 var config struct {
-	ServerUrl string
-	Token     string
+	TrackerUrl     string
+	TrackerProject int
+	TrackerAlias   string
+	WsBucketScheme string
+	WsBucketHost   string
 	ServerTimeout time.Duration
 	Recheck       time.Duration
 	ChunkSize     int64
@@ -21,19 +25,26 @@ var config struct {
 	Workers       int
 	UserAgent     string
 	Tasks         int32
-	CrawlStats    time.Duration
-	AllocStats    time.Duration
 	Verbose       bool
 	PrintHTTP     bool
 	JobBufferSize int
 }
 
+var onlineMode bool
+
 const (
-	ConfServerUrl = "server.url"
-	ConfToken     = "server.token"
+	ConfTrackerUrl     = "server.url"
+	ConfTrackerProject = "server.project"
+	ConfTrackerAlias   = "server.alias"
+	ConfWsBucketScheme = "server.ws_bucket_scheme"
+	ConfWsBucketHost   = "server.ws_bucket_host"
 	ConfServerTimeout = "server.timeout"
 	ConfRecheck       = "server.recheck"
+	ConfCooldown      = "server.cooldown"
 	ConfChunkSize     = "server.upload_chunk"
+	ConfUploadRetries       = "server.upload_retries"
+	ConfUploadRetryInterval = "server.upload_retry_interval"
+
 	ConfTasks   = "crawl.tasks"
 	ConfRetries = "crawl.retries"
 	ConfWorkers = "crawl.connections"
@@ -41,6 +52,7 @@ const (
 	ConfDialTimeout   = "crawl.dial_timeout"
 	ConfTimeout       = "crawl.timeout"
 	ConfJobBufferSize = "crawl.job_buffer"
+
 	ConfCrawlStats = "output.crawl_stats"
 	ConfAllocStats = "output.resource_stats"
 	ConfVerbose    = "output.verbose"
@@ -49,41 +61,115 @@ const (
 )
 
 func prepareConfig() {
-	viper.SetDefault(ConfRetries, 5)
-	viper.SetDefault(ConfWorkers, 2)
-	viper.SetDefault(ConfTasks, 3)
-	viper.SetDefault(ConfUserAgent, "")
-	viper.SetDefault(ConfDialTimeout, 10 * time.Second)
-	viper.SetDefault(ConfTimeout, 30 * time.Second)
-	viper.SetDefault(ConfJobBufferSize, 5000)
-	viper.SetDefault(ConfCrawlStats, 3 * time.Second)
-	viper.SetDefault(ConfAllocStats, 0)
-	viper.SetDefault(ConfVerbose, false)
-	viper.SetDefault(ConfPrintHTTP, false)
-	viper.SetDefault(ConfLogFile, "")
-	viper.SetDefault(ConfRecheck, 3 * time.Second)
-	viper.SetDefault(ConfChunkSize, "1 MB")
+	pf := rootCmd.PersistentFlags()
+
+	pf.SortFlags = false
+	pf.StringVar(&configFile, "config", "", "Config file")
+	configFile = os.Getenv("OD_CONFIG")
+
+	pf.String(ConfTrackerUrl, "https://tt.the-eye.eu/api", "task_tracker api URL")
+	pf.String(ConfTrackerProject, "1", "task_tracker project id")
+	pf.String(ConfWsBucketScheme, "wss", "ws_bucket scheme")
+	pf.String(ConfWsBucketHost, "wsb.the-eye.eu", "ws_bucket host")
+	pf.String(ConfTrackerAlias, "changeme", "task_tracker worker alias")
+	pf.Duration(ConfServerTimeout, 60*time.Second, "OD-DB request timeout")
+	pf.Duration(ConfRecheck, 1*time.Second, "OD-DB: Poll interval for new jobs")
+	pf.Duration(ConfCooldown, 1*time.Minute, "OD-DB: Time to wait after a server-side error")
+	pf.String(ConfChunkSize, "1 MB", "OD-DB: Result upload chunk size")
+	pf.Uint(ConfUploadRetries, 10, "OD-DB: Max upload retries")
+	pf.Duration(ConfUploadRetryInterval, 30*time.Second, "OD-DB: Time to wait between upload retries")
+	pf.Uint(ConfTasks, 25, "Crawler: Max concurrent tasks")
+	pf.Uint(ConfWorkers, 1, "Crawler: Connections per server")
+	pf.Uint(ConfRetries, 5, "Crawler: Request retries")
+	pf.Duration(ConfDialTimeout, 10*time.Second, "Crawler: Handshake timeout")
+	pf.Duration(ConfTimeout, 30*time.Second, "Crawler: Request timeout")
+	pf.String(ConfUserAgent, "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0", "Crawler: User-Agent")
+	pf.Int(ConfJobBufferSize, -1, "Crawler: Task queue cache size")
+	pf.Duration(ConfCrawlStats, 500*time.Second, "Log: Crawl stats interval")
+	pf.Duration(ConfAllocStats, 500*time.Second, "Log: Resource stats interval")
+	pf.Bool(ConfVerbose, false, "Log: Print every listed dir")
+	pf.Bool(ConfPrintHTTP, false, "Log: Print HTTP client errors")
+	pf.String(ConfLogFile, "crawler.log", "Log file")
+
+	// Bind all flags to Viper
+	pf.VisitAll(func(flag *pflag.Flag) {
+		s := flag.Name
+		s = strings.TrimLeft(s, "-")
+
+		if err := viper.BindPFlag(s, flag); err != nil {
+			panic(err)
+		}
+		var envKey string
+		envKey = strings.Replace(s, ".", "_", -1)
+		envKey = strings.ToUpper(envKey)
+		envKey = "OD_" + envKey
+		if err := viper.BindEnv(s, envKey); err != nil {
+			panic(err)
+		}
+	})
 }
 
 func readConfig() {
-	viper.AddConfigPath(".")
-	viper.SetConfigName("config")
-	err := viper.ReadInConfig()
+	// If config.yml in working dir, use it
+	if configFile == "" {
+		_, err := os.Stat("config.yml")
+		if err == nil {
+			configFile = "config.yml"
+		}
+	}
+
+	if configFile != "" {
+		confF, err := os.Open(configFile)
 		if err != nil {
 			fmt.Fprintln(os.Stderr, err)
 			os.Exit(1)
 		}
+		defer confF.Close()
 
-	config.ServerUrl = viper.GetString(ConfServerUrl)
-	if config.ServerUrl == "" {
-		configMissing(ConfServerUrl)
+		viper.SetConfigType("yml")
+		err = viper.ReadConfig(confF)
+		if err != nil {
+			fmt.Fprintln(os.Stderr, err)
+			os.Exit(1)
+		}
 	}
-	config.ServerUrl = strings.TrimRight(config.ServerUrl, "/")
 
-	config.Token = viper.GetString(ConfToken)
-	if config.Token == "" {
-		configMissing(ConfToken)
+	if onlineMode {
+		config.TrackerUrl = viper.GetString(ConfTrackerUrl)
+		if config.TrackerUrl == "" {
+			configMissing(ConfTrackerUrl)
+		}
+		config.TrackerUrl = strings.TrimRight(config.TrackerUrl, "/")
 	}
+	config.TrackerProject = viper.GetInt(ConfTrackerProject)
+	config.TrackerAlias = viper.GetString(ConfTrackerAlias)
+	config.WsBucketHost = viper.GetString(ConfWsBucketHost)
+	config.WsBucketScheme = viper.GetString(ConfWsBucketScheme)
 
 	config.ServerTimeout = viper.GetDuration(ConfServerTimeout)
 
@@ -117,19 +203,17 @@ func readConfig() {
 
 	config.JobBufferSize = viper.GetInt(ConfJobBufferSize)
 
-	config.CrawlStats = viper.GetDuration(ConfCrawlStats)
-	config.AllocStats = viper.GetDuration(ConfAllocStats)
-
 	config.Verbose = viper.GetBool(ConfVerbose)
 	if config.Verbose {
 		logrus.SetLevel(logrus.DebugLevel)
 	}
 
 	if filePath := viper.GetString(ConfLogFile); filePath != "" {
-		f, err := os.OpenFile(filePath, os.O_CREATE | os.O_WRONLY | os.O_APPEND, 0644)
+		f, err := os.OpenFile(filePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
 		bufWriter := bufio.NewWriter(f)
-		if err != nil { panic(err) }
+		if err != nil {
+			panic(err)
+		}
 		exitHooks.Add(func() {
 			bufWriter.Flush()
 			f.Close()
```
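The rewritten `prepareConfig` is the source of the flag-to-environment mapping documented in the README: every pflag name is bound to Viper twice, once as a config key and once as an `OD_`-prefixed environment variable. A minimal, self-contained sketch of that binding, using a standalone `pflag.FlagSet` in place of the real `rootCmd` (an assumption for illustration only):

```go
package main

import (
	"fmt"
	"strings"

	"github.com/spf13/pflag"
	"github.com/spf13/viper"
)

func main() {
	pf := pflag.NewFlagSet("demo", pflag.ExitOnError)
	pf.Uint("crawl.tasks", 25, "Crawler: Max concurrent tasks")

	// Same transformation as prepareConfig: "crawl.tasks" -> "OD_CRAWL_TASKS".
	pf.VisitAll(func(flag *pflag.Flag) {
		s := strings.TrimLeft(flag.Name, "-")
		if err := viper.BindPFlag(s, flag); err != nil {
			panic(err)
		}
		envKey := "OD_" + strings.ToUpper(strings.Replace(s, ".", "_", -1))
		if err := viper.BindEnv(s, envKey); err != nil {
			panic(err)
		}
	})

	// With OD_CRAWL_TASKS=100 in the environment this prints 100,
	// otherwise the flag default 25.
	fmt.Println(viper.GetUint("crawl.tasks"))
}
```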
config.yml: 31 changes

```diff
@@ -1,10 +1,14 @@
 # OD-Database server settings
 server:
   # Connection URL
-  url: http://od-db.mine.terorie.com/api
+  url: https://tt.the-eye.eu/api
 
-  # Server auth token
-  token:
+  # OD-Database project id (for crawling)
+  project: 1
+  # Your worker alias
+  alias: changeme
+  # Websocket bucket host & scheme (ws/wss)
+  ws_bucket_host: https://wsb.the-eye.eu
+  ws_bucket_scheme: wss
 
   # Request timeout
   timeout: 60s
@@ -15,17 +19,20 @@ server:
   # between /task/get requests to the server.
   recheck: 1s
 
-  # Upload chunk size
-  # If the value is too high, the upload fails.
-  upload_chunk: 1 MB
+  # Time to wait after receiving an error
+  # from the server. Doesn't apply to uploads.
+  cooldown: 1s
+
+  upload_retries: 10
+  upload_retry_interval: 30s
 
 # Log output settings
 output:
   # Crawl statistics
-  crawl_stats: 1s
+  crawl_stats: 1m
 
   # CPU/RAM/Job queue stats
-  resource_stats: 10s
+  resource_stats: 1m
 
   # More output? (Every listed dir)
   verbose: false
@@ -40,13 +47,13 @@ output:
 # Crawler settings
 crawl:
   # Number of sites that can be processed at once
-  tasks: 100
+  tasks: 25
 
   # Number of connections per site
   # Please be careful with this setting!
   # The crawler fires fast and more than
   # ten connections can overwhelm a server.
-  connections: 4
+  connections: 1
 
   # How often to retry getting data
   # from the site before giving up
@@ -74,4 +81,4 @@ crawl:
   # in memory.
   # A negative value will cause all jobs
   # to be stored in memory. (Don't do this)
-  job_buffer: 5000
+  job_buffer: -1
```
crawl.go: 14 changes

```diff
@@ -58,6 +58,10 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
 	}
 
 	body := res.Body()
+	return ParseDir(body, &j.Uri)
+}
+
+func ParseDir(body []byte, baseUri *fasturl.URL) (links []fasturl.URL, err error) {
 	doc := html.NewTokenizer(bytes.NewReader(body))
 
 	var linkHref string
@@ -107,15 +111,15 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
 		}
 
 		var link fasturl.URL
-		err = j.Uri.ParseRel(&link, href)
+		err = baseUri.ParseRel(&link, href)
 		if err != nil {
 			continue
 		}
 
-		if link.Scheme != j.Uri.Scheme ||
-			link.Host != j.Uri.Host ||
-			link.Path == j.Uri.Path ||
-			!strings.HasPrefix(link.Path, j.Uri.Path) {
+		if link.Scheme != baseUri.Scheme ||
+			link.Host != baseUri.Host ||
+			link.Path == baseUri.Path ||
+			!strings.HasPrefix(link.Path, baseUri.Path) {
 			continue
 		}
 
```
crawl_apache2_test.go (new file): 4766 lines

File diff suppressed because it is too large.
crawl_nginx_test.go (new file): 117 lines

```diff
@@ -0,0 +1,117 @@
+package main
+
+import (
+	"github.com/terorie/od-database-crawler/fasturl"
+	"testing"
+)
+
+func TestParseDirNginx(t *testing.T) {
+	var u fasturl.URL
+	err := u.Parse("https://the-eye.eu/public/")
+	if err != nil {
+		t.Fatal("Failed to parse URL", err)
+	}
+
+	links, err := ParseDir([]byte(nginxListing), &u)
+	if err != nil {
+		t.Fatal("Failed to extract links", err)
+	}
+
+	if len(links) != len(nginxLinks) {
+		t.Fatalf("Expected %d links, got %d",
+			len(nginxLinks), len(links))
+	}
+
+	for i := 0; i < len(links); i++ {
+		gotLink := links[i].String()
+		expLink := nginxLinks[i]
+
+		if gotLink != expLink {
+			t.Errorf(`Expected "%s" got "%s"`,
+				expLink, gotLink)
+		}
+	}
+}
+
+var nginxLinks = []string {
+	"https://the-eye.eu/public/AppleArchive/",
+	"https://the-eye.eu/public/AudioBooks/",
+	"https://the-eye.eu/public/Books/",
+	"https://the-eye.eu/public/Comics/",
+	"https://the-eye.eu/public/Games/",
+	"https://the-eye.eu/public/Icons/",
+	"https://the-eye.eu/public/Images/",
+	"https://the-eye.eu/public/JFK_Files/",
+	"https://the-eye.eu/public/MSDN/",
+	"https://the-eye.eu/public/Music/",
+	"https://the-eye.eu/public/Operating%20Systems/",
+	"https://the-eye.eu/public/Posters/",
+	"https://the-eye.eu/public/Psychedelics/",
+	"https://the-eye.eu/public/Psychoactives/",
+	"https://the-eye.eu/public/Radio/",
+	"https://the-eye.eu/public/Random/",
+	"https://the-eye.eu/public/Site-Dumps/",
+	"https://the-eye.eu/public/Software/",
+	"https://the-eye.eu/public/Strategic%20Intelligence%20Network/",
+	"https://the-eye.eu/public/WorldTracker.org/",
+	"https://the-eye.eu/public/concen.org/",
+	"https://the-eye.eu/public/freenrg.info/",
+	"https://the-eye.eu/public/murdercube.com/",
+	"https://the-eye.eu/public/parazite/",
+	"https://the-eye.eu/public/ripreddit/",
+	"https://the-eye.eu/public/rom/",
+	"https://the-eye.eu/public/touhou/",
+	"https://the-eye.eu/public/vns/",
+	"https://the-eye.eu/public/xbins/",
+	"https://the-eye.eu/public/xbins.diodematrix/",
+	"https://the-eye.eu/public/Rclone_for_Scrubs.pdf",
+	"https://the-eye.eu/public/Wget_Linux_Guide.pdf",
+	"https://the-eye.eu/public/Wget_Windows_Guide.pdf",
+	"https://the-eye.eu/public/rclone_guide.pdf",
+	"https://the-eye.eu/public/wget-noobs-guide.pdf",
+	"https://the-eye.eu/public/xbox-scene_Aug2014.7z",
+}
+
+const nginxListing =
+`<html>
+<head><title>Index of /public/</title></head>
+<body bgcolor="white">
+<h1>Index of /public/</h1><hr><pre><a href="../">../</a>
+<a href="AppleArchive/">AppleArchive/</a> 03-Nov-2017 18:13 -
+<a href="AudioBooks/">AudioBooks/</a> 29-Sep-2018 19:47 -
+<a href="Books/">Books/</a> 27-Nov-2018 17:50 -
+<a href="Comics/">Comics/</a> 05-Nov-2018 21:37 -
+<a href="Games/">Games/</a> 28-Nov-2018 11:54 -
+<a href="Icons/">Icons/</a> 22-May-2018 07:47 -
+<a href="Images/">Images/</a> 21-Jan-2018 03:21 -
+<a href="JFK_Files/">JFK_Files/</a> 03-Nov-2017 17:03 -
+<a href="MSDN/">MSDN/</a> 03-Nov-2017 15:48 -
+<a href="Music/">Music/</a> 02-Mar-2018 15:47 -
+<a href="Operating%20Systems/">Operating Systems/</a> 25-Apr-2018 07:18 -
+<a href="Posters/">Posters/</a> 07-Jul-2018 01:12 -
+<a href="Psychedelics/">Psychedelics/</a> 11-Apr-2018 05:45 -
+<a href="Psychoactives/">Psychoactives/</a> 18-May-2018 02:58 -
+<a href="Radio/">Radio/</a> 09-Jun-2018 15:49 -
+<a href="Random/">Random/</a> 04-Dec-2018 12:33 -
+<a href="Site-Dumps/">Site-Dumps/</a> 15-Dec-2018 11:04 -
+<a href="Software/">Software/</a> 27-Nov-2017 00:22 -
+<a href="Strategic%20Intelligence%20Network/">Strategic Intelligence Network/</a> 17-Nov-2017 16:35 -
+<a href="WorldTracker.org/">WorldTracker.org/</a> 12-Apr-2018 04:16 -
+<a href="concen.org/">concen.org/</a> 08-Oct-2018 14:08 -
+<a href="freenrg.info/">freenrg.info/</a> 19-Dec-2017 10:59 -
+<a href="murdercube.com/">murdercube.com/</a> 06-Dec-2017 10:45 -
+<a href="parazite/">parazite/</a> 20-Nov-2017 21:25 -
+<a href="ripreddit/">ripreddit/</a> 04-Aug-2018 14:30 -
+<a href="rom/">rom/</a> 28-Nov-2018 14:15 -
+<a href="touhou/">touhou/</a> 03-Nov-2017 11:07 -
+<a href="vns/">vns/</a> 03-Nov-2017 11:36 -
+<a href="xbins/">xbins/</a> 03-Nov-2017 17:23 -
+<a href="xbins.diodematrix/">xbins.diodematrix/</a> 21-Sep-2018 22:33 -
+<a href="Rclone_for_Scrubs.pdf">Rclone_for_Scrubs.pdf</a> 04-Sep-2018 13:31 315K
+<a href="Wget_Linux_Guide.pdf">Wget_Linux_Guide.pdf</a> 21-Dec-2017 20:28 168K
+<a href="Wget_Windows_Guide.pdf">Wget_Windows_Guide.pdf</a> 25-Nov-2017 17:59 867K
+<a href="rclone_guide.pdf">rclone_guide.pdf</a> 03-Sep-2018 23:37 315K
+<a href="wget-noobs-guide.pdf">wget-noobs-guide.pdf</a> 21-Dec-2017 20:29 168K
+<a href="xbox-scene_Aug2014.7z">xbox-scene_Aug2014.7z</a> 26-Oct-2017 23:09 1G
+</pre><hr></body>
+</html>`
```
crawl_test.go (new file): 59 lines

```diff
@@ -0,0 +1,59 @@
+package main
+
+import (
+	"bytes"
+	"github.com/PuerkitoBio/goquery"
+	"github.com/terorie/od-database-crawler/fasturl"
+	"net/url"
+	"strings"
+	"testing"
+)
+
+func BenchmarkParseDir(b *testing.B) {
+	for n := 0; n < b.N; n++ {
+		var u fasturl.URL
+		err := u.Parse("http://archive.ubuntu.com/ubuntu/indices/")
+		if err != nil {
+			b.Fatal("Failed to parse URL", err)
+		}
+
+		_, err = ParseDir([]byte(apache2Listing), &u)
+		if err != nil {
+			b.Fatal("Failed to extract links", err)
+		}
+	}
+}
+
+func BenchmarkParseDirReference(b *testing.B) {
+	for n := 0; n < b.N; n++ {
+		u, err := url.Parse("http://archive.ubuntu.com/ubuntu/indices/")
+		if err != nil {
+			b.Fatal("Failed to parse URL", err)
+		}
+
+		_, err = referenceParseDir([]byte(apache2Listing), u)
+		if err != nil {
+			b.Fatal("Failed to extract links", err)
+		}
+	}
+}
+
+func referenceParseDir(body []byte, baseUri *url.URL) (links []*url.URL, err error) {
+	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
+	if err != nil { return nil, err }
+
+	doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
+		href, _ := s.Attr("href")
+
+		sub, err := baseUri.Parse(href)
+		if err != nil { return } // continue
+
+		if !strings.HasPrefix(sub.String(), baseUri.String()) {
+			return // continue
+		}
+
+		links = append(links, sub)
+	})
+
+	return
+}
```
errors.go: 28 changes

```diff
@@ -3,6 +3,8 @@ package main
 import (
 	"errors"
 	"fmt"
+	"github.com/valyala/fasthttp"
+	"net"
 )
 
 var ErrRateLimit = errors.New("too many requests")
@@ -15,3 +17,29 @@ type HttpError struct {
 func (e HttpError) Error() string {
 	return fmt.Sprintf("http status %d", e.code)
 }
+
+func shouldRetry(err error) bool {
+	// HTTP errors
+	if httpErr, ok := err.(*HttpError); ok {
+		switch httpErr.code {
+		case fasthttp.StatusTooManyRequests:
+			return true
+		default:
+			// Don't retry HTTP error codes
+			return false
+		}
+	}
+
+	if dnsError, ok := err.(*net.DNSError); ok {
+		// Don't retry permanent DNS errors
+		return dnsError.IsTemporary
+	}
+
+	if netErr, ok := err.(*net.OpError); ok {
+		// Don't retry permanent network errors
+		return netErr.Temporary()
+	}
+
+	// Retry by default
+	return true
+}
```
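The new `shouldRetry` centralizes the retry policy that `worker.go` previously inlined: HTTP 429 and transient network or DNS failures are retryable, any other HTTP status is not. A hedged sketch of how a caller in the same package might drive it against a retry budget; `doRequest` and `maxRetries` are hypothetical stand-ins, not names from the diff:

```go
// Sketch only: doRequest and maxRetries are illustrative, and
// shouldRetry is the function from errors.go above.
func fetchWithRetries(doRequest func() error, maxRetries int) error {
	var err error
	for fails := 0; fails <= maxRetries; fails++ {
		if err = doRequest(); err == nil {
			return nil
		}
		if !shouldRetry(err) {
			// Permanent failure (e.g. HTTP 404): give up immediately.
			return err
		}
		// Temporary failure (e.g. HTTP 429, timeout): try again.
	}
	return err // retry budget exhausted
}
```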
help.go (new file): 15 lines

```diff
@@ -0,0 +1,15 @@
+package main
+
+const helpText =
+`HTTP crawler for the OD-Database
+DB >> https://od-db.the-eye.eu <<
+Crawler >> https://github.com/terorie/od-database-crawler <<
+Server >> https://github.com/simon987/od-database <<
+
+Quick start:
+ - get config file (config.yml in working dir)
+ - get OD-DB server ("server.url": Database URL + /api)
+ - get access token ("server.token": e.g. c010b6dd-20...)
+ - ./od-database-crawler server
+
+Questions? Discord @terorie#2664 / Telegram @terorie`
```
jenkins/Jenkinsfile (vendored, new file): 47 lines

```diff
@@ -0,0 +1,47 @@
+def remote = [:]
+remote.name = 'remote'
+remote.host = env.DEPLOY_HOST
+remote.user = env.DEPLOY_USER
+remote.identityFile = '/var/lib/jenkins/.ssh/id_rsa'
+remote.knownHosts = '/var/lib/jenkins/.ssh/known_hosts'
+remote.allowAnyHosts = true
+remote.retryCount = 3
+remote.retryWaitSec = 3
+logLevel = 'FINER'
+
+pipeline {
+    agent none
+    environment {
+        GOOS='linux'
+        CGO_ENABLED='0'
+        HOME='.'
+    }
+    stages {
+        stage('Build') {
+            agent {
+                docker {
+                    image 'golang:latest'
+                }
+            }
+            steps {
+                sh 'mkdir -p /go/src/github.com/terorie/od-database-crawler'
+                sh 'cp -r *.go fasturl ds jenkins/build.sh "/go/src/github.com/terorie/od-database-crawler"'
+                sh 'cd /go/src/github.com/terorie/od-database-crawler && go get ./...'
+                sh './jenkins/build.sh'
+                stash includes: 'dist/', name: 'dist'
+            }
+        }
+        stage('Deploy') {
+            agent none
+            steps {
+                node('master') {
+                    unstash 'dist'
+                    sshCommand remote: remote, command: "ls od-database-crawler/"
+                    sshPut remote: remote, from: 'dist/', into: 'od-database-crawler'
+                }
+            }
+        }
+    }
+}
```
jenkins/build.sh (new executable file): 23 lines

```diff
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+appname="od-database-crawler"
+outdir="dist/"
+tag="${BUILD_ID}_$(date +%Y-%m-%d)"
+
+rm -rf "./${outdir}"
+mkdir build 2> /dev/null
+
+name=${outdir}${appname}-${tag}-linux
+GOOS="linux" GOARCH="amd64" go build -ldflags="-s -w" -o ${name}
+gzip -f ${name}
+echo ${name}
+
+name=${outdir}${appname}-${tag}-mac
+GOOS="darwin" GOARCH="amd64" go build -ldflags="-s -w" -o ${name}
+gzip -f ${name}
+echo ${name}
+
+name=${outdir}${appname}-${tag}-freebsd
+GOOS="freebsd" GOARCH="amd64" go build -ldflags="-s -w" -o ${name}
+gzip -f ${name}
+echo ${name}
```
main.go: 145 changes

```diff
@@ -2,11 +2,13 @@ package main
 
 import (
 	"context"
+	"fmt"
 	"github.com/sirupsen/logrus"
+	"github.com/spf13/cobra"
 	"github.com/spf13/viper"
 	"github.com/terorie/od-database-crawler/fasturl"
-	"github.com/urfave/cli"
 	"os"
+	"os/signal"
 	"strings"
 	"sync/atomic"
 	"time"
@@ -14,77 +16,89 @@ import (
 
 var configFile string
 
-var app = cli.App {
-	Name: "od-database-crawler",
-	Usage: "OD-Database Go crawler",
-	Version: "1.1.0",
-	BashComplete: cli.DefaultAppComplete,
-	Writer: os.Stdout,
-	Action: cmdBase,
-	Commands: []cli.Command {
-		{
-			Name: "crawl",
-			Usage: "Crawl a list of URLs",
-			ArgsUsage: "<site>",
-			Action: cmdCrawler,
-		},
-	},
-	Flags: []cli.Flag {
-		cli.StringFlag {
-			Name: "config",
-			EnvVar: "CONFIG",
-			Destination: &configFile,
-		},
-	},
-	Before: func(i *cli.Context) error {
-		if configFile != "" {
-			viper.SetConfigFile(configFile)
-		}
-		return nil
-	},
-	After: func(i *cli.Context) error {
+var rootCmd = cobra.Command {
+	Use: "od-database-crawler",
+	Version: "1.2.2",
+	Short: "OD-Database Go crawler",
+	Long: helpText,
+	PersistentPreRunE: preRun,
+	PersistentPostRun: func(cmd *cobra.Command, args []string) {
 		exitHooks.Execute()
-		return nil
 	},
 }
 
+var serverCmd = cobra.Command {
+	Use: "server",
+	Short: "Start crawl server",
+	Long: "Connect to the OD-Database and contribute to the database\n" +
+		"by crawling the web for open directories!",
+	Run: cmdBase,
+}
+
+var crawlCmd = cobra.Command {
+	Use: "crawl",
+	Short: "Crawl an URL",
+	Long: "Crawl the URL specified.\n" +
+		"Results will not be uploaded to the database,\n" +
+		"they're saved under crawled/0.json instead.\n" +
+		"Primarily used for testing and benchmarking.",
+	RunE: cmdCrawler,
+	Args: cobra.ExactArgs(1),
+}
+
 var exitHooks Hooks
 
 func init() {
+	rootCmd.AddCommand(&crawlCmd)
+	rootCmd.AddCommand(&serverCmd)
+
 	prepareConfig()
 }
 
-func main() {
+func preRun(cmd *cobra.Command, args []string) error {
 	if err := os.MkdirAll("crawled", 0755);
 		err != nil { panic(err) }
 
 	if err := os.MkdirAll("queue", 0755);
 		err != nil { panic(err) }
-
-	readConfig()
-	app.Run(os.Args)
+	return nil
 }
 
-func cmdBase(_ *cli.Context) error {
-	// TODO Graceful shutdown
-	appCtx := context.Background()
-	forceCtx := context.Background()
+func main() {
+	err := rootCmd.Execute()
+	if err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+}
+
+func cmdBase(_ *cobra.Command, _ []string) {
+	onlineMode = true
+	readConfig()
+
+	appCtx, soft := context.WithCancel(context.Background())
+	forceCtx, hard := context.WithCancel(context.Background())
+	go hardShutdown(forceCtx)
+	go listenCtrlC(soft, hard)
 
 	inRemotes := make(chan *OD)
-	go Schedule(forceCtx, inRemotes)
+	go Schedule(appCtx, inRemotes)
 
 	ticker := time.NewTicker(config.Recheck)
 	defer ticker.Stop()
 	for {
 		select {
 		case <-appCtx.Done():
-			return nil
+			goto shutdown
 		case <-ticker.C:
 			t, err := FetchTask()
 			if err != nil {
 				logrus.WithError(err).
 					Error("Failed to get new task")
-				time.Sleep(30 * time.Second)
+				if !sleep(viper.GetDuration(ConfCooldown), appCtx) {
+					goto shutdown
+				}
 				continue
 			}
 			if t == nil {
@@ -100,33 +114,27 @@ func cmdBase(_ *cli.Context) error {
 			if urlErr, ok := err.(*fasturl.Error); ok && urlErr.Err == fasturl.ErrUnknownScheme {
 				// Not an error
 				err = nil
-				// Give back task
-				//err2 := CancelTask(t.WebsiteId)
-				//if err2 != nil {
-				//	logrus.Error(err2)
-				//}
+				// TODO FTP crawler
+
 				continue
 			} else if err != nil {
 				logrus.WithError(err).
 					Error("Failed to get new task")
-				time.Sleep(30 * time.Second)
+				time.Sleep(viper.GetDuration(ConfCooldown))
 				continue
 			}
 			ScheduleTask(inRemotes, t, &baseUri)
 		}
 	}
 
-	return nil
+shutdown:
+	globalWait.Wait()
 }
 
-func cmdCrawler(clic *cli.Context) error {
-	if clic.NArg() != 1 {
-		cli.ShowCommandHelpAndExit(clic, "crawl", 1)
-	}
+func cmdCrawler(_ *cobra.Command, args []string) error {
+	onlineMode = false
+	readConfig()
 
-	arg := clic.Args()[0]
+	arg := args[0]
 	// https://github.com/golang/go/issues/19779
 	if !strings.Contains(arg, "://") {
 		arg = "http://" + arg
@@ -158,3 +166,30 @@ func cmdCrawler(clic *cli.Context) error {
 
 	return nil
 }
+
+func listenCtrlC(soft, hard context.CancelFunc) {
+	c := make(chan os.Signal)
+	signal.Notify(c, os.Interrupt)
+
+	<-c
+	logrus.Info(">>> Shutting down crawler... <<<")
+	soft()
+
+	<-c
+	logrus.Warning(">>> Force shutdown! <<<")
+	hard()
+}
+
+func hardShutdown(c context.Context) {
+	<-c.Done()
+	os.Exit(1)
+}
+
+func sleep(d time.Duration, c context.Context) bool {
+	select {
+	case <-time.After(d):
+		return true
+	case <-c.Done():
+		return false
+	}
+}
```
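The new `cmdBase` replaces the old `// TODO Graceful shutdown` with a two-phase pattern: the first Ctrl-C cancels `appCtx` so the scheduler drains, the second cancels `forceCtx`, which `hardShutdown` turns into an immediate exit. A distilled, runnable illustration of the same pattern; the worker loop is a stand-in, and the signal channel is buffered here as `signal.Notify` recommends:

```go
package main

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"time"
)

func main() {
	appCtx, soft := context.WithCancel(context.Background())
	forceCtx, hard := context.WithCancel(context.Background())

	// First Ctrl-C: stop taking work and drain. Second: force quit.
	go func() {
		c := make(chan os.Signal, 1)
		signal.Notify(c, os.Interrupt)
		<-c
		fmt.Println(">>> Shutting down... <<<")
		soft()
		<-c
		fmt.Println(">>> Force shutdown! <<<")
		hard()
	}()

	// Equivalent of hardShutdown: the force context kills the process.
	go func() {
		<-forceCtx.Done()
		os.Exit(1)
	}()

	// Stand-in for the scheduler loop.
	for {
		select {
		case <-appCtx.Done():
			fmt.Println("drained, exiting cleanly")
			return
		case <-time.After(500 * time.Millisecond):
			// pretend to crawl
		}
	}
}
```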
model.go: 17 changes

```diff
@@ -7,13 +7,23 @@ import (
 	"time"
 )
 
+type ResultCode int
+
+const (
+	TR_OK = ResultCode(iota)
+	TR_FAIL = 1
+	TR_SKIP = 2
+)
+
 type Task struct {
 	WebsiteId uint64 `json:"website_id"`
 	Url       string `json:"url"`
+	UploadToken string `json:"upload_token"`
+	TaskId      int64
 }
 
 type TaskResult struct {
-	StatusCode string     `json:"status_code"`
+	ResultCode ResultCode `json:"status_code"`
 	FileCount  uint64     `json:"file_count"`
 	ErrorCount uint64     `json:"-"`
 	StartTime  time.Time  `json:"-"`
@@ -51,13 +61,16 @@ func (o *OD) LoadOrStoreKey(k *redblackhash.Key) (exists bool) {
 	defer o.Scanned.Unlock()
 
 	exists = o.Scanned.Get(k)
-	if exists { return true }
+	if exists {
+		return true
+	}
+
 	o.Scanned.Put(k)
 	return false
 }
 
 type errorString string
 
 func (e errorString) Error() string {
 	return string(e)
 }
```
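Because `ResultCode` is an integer type, the `status_code` field that used to carry strings like `"success"` now serializes as a number. A small sketch of the resulting wire format, using a trimmed copy of `TaskResult` (an assumption for illustration; only the two fields shown matter here):

```go
package main

import (
	"encoding/json"
	"fmt"
)

type ResultCode int

const (
	TR_OK   = ResultCode(iota)
	TR_FAIL = 1
	TR_SKIP = 2
)

// Trimmed-down stand-in for TaskResult, to show the wire format only.
type taskResultDemo struct {
	ResultCode ResultCode `json:"status_code"`
	FileCount  uint64     `json:"file_count"`
}

func main() {
	out, _ := json.Marshal(taskResultDemo{ResultCode: TR_FAIL, FileCount: 0})
	fmt.Println(string(out)) // {"status_code":1,"file_count":0}
}
```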
scheduler.go: 41 changes

```diff
@@ -34,13 +34,16 @@ func Schedule(c context.Context, remotes <-chan *OD) {
 		queuePath := path.Join("queue", fmt.Sprintf("%d", remote.Task.WebsiteId))
 
 		// Delete existing queue
-		if err := os.RemoveAll(queuePath);
-			err != nil { panic(err) }
+		if err := os.RemoveAll(queuePath); err != nil {
+			panic(err)
+		}
 
 		// Start new queue
 		var err error
 		remote.WCtx.Queue, err = OpenQueue(queuePath)
-		if err != nil { panic(err) }
+		if err != nil {
+			panic(err)
+		}
 
 		// Spawn workers
 		for i := 0; i < config.Workers; i++ {
@@ -77,10 +80,10 @@ func ScheduleTask(remotes chan<- *OD, t *Task, u *fasturl.URL) {
 
 	globalWait.Add(1)
 	now := time.Now()
-	od := &OD {
+	od := &OD{
 		Task: *t,
 		BaseUri: *u,
-		Result: TaskResult {
+		Result: TaskResult{
 			WebsiteId: t.WebsiteId,
 			StartTime: now,
 			StartTimeUnix: now.Unix(),
@@ -117,7 +120,7 @@ func (o *OD) Watch(results chan File) {
 	// Open crawl results file
 	f, err := os.OpenFile(
 		filePath,
-		os.O_CREATE | os.O_RDWR | os.O_TRUNC,
+		os.O_CREATE|os.O_RDWR|os.O_TRUNC,
 		0644,
 	)
 	if err != nil {
@@ -145,7 +148,7 @@ func (o *OD) Watch(results chan File) {
 	}
 
 	// Upload results
-	err = PushResult(&o.Result, f)
+	err = PushResult(&o.Task, f)
 	if err != nil {
 		logrus.WithError(err).
 			Error("Failed uploading crawl results")
@@ -178,16 +181,10 @@ func (o *OD) handleCollect(results chan File, f *os.File, collectErrC chan error
 	// Set status code
 	now := time.Now()
 	o.Result.EndTimeUnix = now.Unix()
-	fileCount := atomic.LoadUint64(&o.Result.FileCount)
-	if fileCount == 0 {
-		errorCount := atomic.LoadUint64(&o.Result.ErrorCount)
-		if errorCount == 0 {
-			o.Result.StatusCode = "empty"
-		} else {
-			o.Result.StatusCode = "directory listing failed"
-		}
+	if atomic.LoadUint64(&o.Result.ErrorCount) != 0 {
+		o.Result.ResultCode = TR_FAIL
 	} else {
-		o.Result.StatusCode = "success"
+		o.Result.ResultCode = TR_OK
 	}
 }
@@ -205,11 +202,17 @@ func (t *Task) collect(results chan File, f *os.File) error {
 		result.Path = fasturl.PathUnescape(result.Path)
 		result.Name = fasturl.PathUnescape(result.Name)
 		resJson, err := json.Marshal(result)
-		if err != nil { panic(err) }
+		if err != nil {
+			panic(err)
+		}
 		_, err = f.Write(resJson)
-		if err != nil { return err }
+		if err != nil {
+			return err
+		}
 		_, err = f.Write([]byte{'\n'})
-		if err != nil { return err }
+		if err != nil {
+			return err
+		}
 	}
 
 	return nil
```
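`collect` writes results as JSON lines, one `File` object per line flushed straight to disk, so a partially crawled site never needs the whole result set in memory. A minimal sketch of that format with a cut-down stand-in for the `File` struct (the real one has more fields) and a hypothetical output path:

```go
package main

import (
	"encoding/json"
	"os"
)

// Cut-down stand-in for the crawler's File struct.
type fileResult struct {
	Name string `json:"name"`
	Size int64  `json:"size"`
}

func main() {
	// Hypothetical path; the crawler writes crawled/<website_id>.json.
	f, err := os.OpenFile("results.jsonl", os.O_CREATE|os.O_RDWR|os.O_TRUNC, 0644)
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// One JSON document per line, exactly like collect():
	// marshal, write, then append '\n'.
	for _, r := range []fileResult{{"a.txt", 12}, {"b.iso", 4096}} {
		line, err := json.Marshal(r)
		if err != nil {
			panic(err)
		}
		if _, err = f.Write(append(line, '\n')); err != nil {
			panic(err)
		}
	}
}
```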
server.go: 293 changes

```diff
@@ -2,47 +2,125 @@ package main
 
 import (
 	"bytes"
+	"encoding/base64"
 	"encoding/json"
 	"fmt"
+	"github.com/fasthttp/websocket"
+	"github.com/pkg/errors"
 	"github.com/sirupsen/logrus"
+	"golang.org/x/time/rate"
 	"io"
-	"mime/multipart"
+	"io/ioutil"
 	"net/http"
 	"net/url"
 	"os"
 	"strconv"
-	"time"
 )
 
-var serverClient = http.Client {
+var serverWorker *TrackerWorker
+
+var serverClient = http.Client{
 	Timeout: config.ServerTimeout,
+	Transport: new(ServerTripper),
+}
+
+var serverUserAgent = "od-database-crawler/" + rootCmd.Version
+
+func getOrCreateWorker() {
+	if _, err := os.Stat("worker.json"); os.IsNotExist(err) {
+		req := CreateTrackerWorkerRequest{
+			Alias: config.TrackerAlias,
+		}
+		body, _ := json.Marshal(&req)
+		buf := bytes.NewBuffer(body)
+		resp, _ := serverClient.Post(config.TrackerUrl+"/worker/create", "application/json", buf)
+
+		workerResponse := CreateTrackerWorkerResponse{}
+		respBody, _ := ioutil.ReadAll(resp.Body)
+		_ = json.Unmarshal(respBody, &workerResponse)
+
+		workerJsonData, _ := json.Marshal(&workerResponse.Content.Worker)
+		fp, _ := os.OpenFile("worker.json", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600)
+		_, _ = fp.Write(workerJsonData)
+
+		//Request ASSIGN permission
+		serverWorker = &workerResponse.Content.Worker
+		accessReq, _ := json.Marshal(WorkerAccessRequest{
+			Project: config.TrackerProject,
+			Assign: true,
+			Submit: false,
+		})
+		buf = bytes.NewBuffer(accessReq)
+		res, err := serverClient.Post(config.TrackerUrl+"/project/request_access", "application/json", buf)
+		if err != nil {
+			panic(err)
+		}
+		logrus.WithFields(logrus.Fields{
+			"response": res.StatusCode,
+		}).Info("Requested ASSIGN permission")
+	} else {
+		var worker TrackerWorker
+
+		fp, _ := os.OpenFile("worker.json", os.O_RDONLY, 0600)
+		workerJsonData, _ := ioutil.ReadAll(fp)
+		_ = json.Unmarshal(workerJsonData, &worker)
+
+		serverWorker = &worker
+	}
 }
 
 func FetchTask() (t *Task, err error) {
-	res, err := serverClient.PostForm(
-		config.ServerUrl + "/task/get",
-		url.Values{ "token": {config.Token} })
-	if err != nil { return }
+	if serverWorker == nil {
+		getOrCreateWorker()
+	}
+
+	res, err := serverClient.Get(config.TrackerUrl + "/task/get/" + strconv.Itoa(config.TrackerProject))
+	if err != nil {
+		return
+	}
 	defer res.Body.Close()
 
 	switch res.StatusCode {
 	case 200:
 		break
-	case 404, 500:
-		return nil, nil
 	default:
 		return nil, fmt.Errorf("http %s", res.Status)
 	}
 
-	t = new(Task)
-	err = json.NewDecoder(res.Body).Decode(t)
-	if err != nil { return }
+	jsonResponse := FetchTaskResponse{}
+	err = json.NewDecoder(res.Body).Decode(&jsonResponse)
+	if _, ok := err.(*json.SyntaxError); ok {
+		return nil, fmt.Errorf("/task/get returned invalid JSON")
+	} else if err != nil {
+		return
+	}
+
+	if !jsonResponse.Ok {
+		if jsonResponse.Message == "No task available" {
+			return nil, nil
+		}
+		return nil, errors.New(jsonResponse.Message)
+	}
+
+	task := Task{}
+	err = json.Unmarshal([]byte(jsonResponse.Content.Task.Recipe), &task)
+	if _, ok := err.(*json.SyntaxError); ok {
+		return nil, fmt.Errorf("/task/get returned invalid JSON")
+	} else if err != nil {
+		return
+	}
+
+	t = &task
+	t.TaskId = jsonResponse.Content.Task.Id
+
 	return
 }
 
-func PushResult(result *TaskResult, f *os.File) (err error) {
-	if result.WebsiteId == 0 {
+func PushResult(task *Task, f *os.File) (err error) {
+	if task.WebsiteId == 0 {
 		// Not a real result, don't push
 		return nil
 	}
@@ -53,10 +131,10 @@ func PushResult(result *TaskResult, f *os.File) (err error) {
 		return
 	}
 
-	err = uploadChunks(result.WebsiteId, f)
+	err = uploadWebsocket(f, task.UploadToken)
 	if err != nil {
 		logrus.Errorf("Failed to upload file list: %s", err)
-		err2 := CancelTask(result.WebsiteId)
+		err2 := releaseTask(task, TR_FAIL)
 		if err2 != nil {
 			logrus.Error(err2)
 		}
@@ -64,91 +142,62 @@ func PushResult(result *TaskResult, f *os.File) (err error) {
 	}
 
 	// Upload result ignoring errors
-	uploadResult(result)
+	_ = releaseTask(task, TR_OK)
 
 	return
 }
 
-func uploadChunks(websiteId uint64, f *os.File) error {
-	eof := false
-	for iter := 1; !eof; iter++ {
-		// TODO Stream with io.Pipe?
-		var b bytes.Buffer
-
-		multi := multipart.NewWriter(&b)
-
-		// Set upload fields
-		var err error
-		err = multi.WriteField("token", config.Token)
-		if err != nil { return err }
-		err = multi.WriteField("website_id", fmt.Sprintf("%d", websiteId))
-		if err != nil { return err }
-
-		// Copy chunk to file_list
-		formFile, err := multi.CreateFormFile("file_list", "file_list")
-		var n int64
-		n, err = io.CopyN(formFile, f, config.ChunkSize)
-		if err != io.EOF && err != nil {
-			return err
-		}
-		if n == 0 {
-			// Don't upload, no content
-			return nil
-		} else if n < config.ChunkSize {
-			err = nil
-			// Break at end of iteration
-			eof = true
-		}
-		multi.Close()
-
-		for retries := 0; retries < 10; retries++ {
-			if retries > 0 {
-				// Error occurred, retry upload
-				time.Sleep(5 * time.Second)
-			}
-
-			req, err := http.NewRequest(
-				http.MethodPost,
-				config.ServerUrl + "/task/upload",
-				&b)
-			req.Header.Set("content-type", multi.FormDataContentType())
-			if err != nil { continue }
-
-			res, err := serverClient.Do(req)
-			if err != nil { continue }
-			res.Body.Close()
-
-			if res.StatusCode != http.StatusOK {
-				logrus.WithField("status", res.Status).
-					WithField("part", iter).
-					Errorf("Upload failed")
-				continue
-			}
-
-			// Upload successful
-			break
-		}
-
-		logrus.WithField("id", websiteId).
-			WithField("part", iter).
-			Infof("Uploaded files chunk")
-	}
+func uploadWebsocket(f *os.File, token string) (err error) {
+	u := url.URL{Scheme: config.WsBucketScheme, Host: config.WsBucketHost, Path: "/upload"}
+
+	header := http.Header{}
+	header.Add("X-Upload-Token", token)
+	conn, _, err := websocket.DefaultDialer.Dial(u.String(), header)
+	if err != nil {
+		return
+	}
+	conn.EnableWriteCompression(true) //TODO: Is this necessary?
+
+	socketWriter, _ := conn.NextWriter(websocket.BinaryMessage)
+	_, _ = io.Copy(socketWriter, f)
+	err = socketWriter.Close()
+	if err != nil {
+		logrus.Error("FIXME: couldn't do file upload")
+		return
+	}
+	err = conn.Close()
+	if err != nil {
+		return
+	}
 	return nil
 }
 
-func uploadResult(result *TaskResult) (err error) {
-	resultEnc, err := json.Marshal(result)
-	if err != nil { panic(err) }
-
-	res, err := serverClient.PostForm(
-		config.ServerUrl + "/task/complete",
-		url.Values {
-			"token": {config.Token},
-			"result": {string(resultEnc)},
-		},
+func releaseTask(task *Task, taskResult ResultCode) (err error) {
+	req := releaseTaskRequest{
+		TaskId: task.TaskId,
+		ResultCode: taskResult,
+		// TODO Will implement verification in a later ODDB update
+		Verification: 0,
+	}
+	resultEnc, err := json.Marshal(&req)
+	if err != nil {
+		panic(err)
+	}
+	body := bytes.NewBuffer(resultEnc)
+
+	res, err := serverClient.Post(
+		config.TrackerUrl+"/task/release",
+		"application/json",
+		body,
 	)
-	if err != nil { return }
+	if err != nil {
+		return
+	}
 	res.Body.Close()
 
 	if res.StatusCode != http.StatusOK {
@@ -158,20 +207,66 @@ func uploadResult(result *TaskResult) (err error) {
 		return
 	}
 
-func CancelTask(websiteId uint64) (err error) {
-	res, err := serverClient.PostForm(
-		config.ServerUrl + "/task/cancel",
-		url.Values{
-			"token": {config.Token},
-			"website_id": {strconv.FormatUint(websiteId, 10)},
-		},
-	)
-	if err != nil { return }
-	res.Body.Close()
-
-	if res.StatusCode != http.StatusOK {
-		return fmt.Errorf("failed to cancel task: %s", res.Status)
-	}
-	return
-}
+type ServerTripper struct{}
+
+func (t *ServerTripper) RoundTrip(req *http.Request) (res *http.Response, err error) {
+	req.Header.Set("User-Agent", serverUserAgent)
+	//TODO: Use task_tracker/client ?
+	if serverWorker != nil {
+		req.Header.Add("X-Worker-Id", strconv.Itoa(serverWorker.Id))
+		req.Header.Add("X-Secret", base64.StdEncoding.EncodeToString(serverWorker.Secret))
+	}
+	return http.DefaultTransport.RoundTrip(req)
+}
+
+// https://github.com/simon987/task_tracker/blob/master/api/models.go
+
+type releaseTaskRequest struct {
+	TaskId       int64      `json:"task_id"`
+	ResultCode   ResultCode `json:"result"`
+	Verification int64      `json:"verification"`
+}
+
+type WorkerAccessRequest struct {
+	Assign  bool `json:"assign"`
+	Submit  bool `json:"submit"`
+	Project int  `json:"project"`
+}
+
+type FetchTaskResponse struct {
+	Ok      bool   `json:"ok"`
+	Message string `json:"message"`
+	Content struct {
+		Task struct {
+			Id       int64 `json:"id"`
+			Priority int64 `json:"priority"`
+			Project  struct {
+				Id         int64      `json:"id"`
+				Name       string     `json:"name"`
+				Version    string     `json:"version"`
+				AssignRate rate.Limit `json:"assign_rate"`
+				SubmitRate rate.Limit `json:"submit_rate"`
+			} `json:"project"`
+			Recipe string `json:"recipe"`
+		} `json:"task"`
+	} `json:"content"`
+}
+
+type TrackerWorker struct {
+	Alias  string `json:"alias"`
+	Id     int    `json:"id"`
+	Secret []byte `json:"secret"`
+}
+
+type CreateTrackerWorkerResponse struct {
+	Ok      bool   `json:"ok"`
+	Message string `json:"message"`
+	Content struct {
+		Worker TrackerWorker `json:"worker"`
+	} `json:"content"`
+}
+
+type CreateTrackerWorkerRequest struct {
+	Alias string `json:"alias"`
+}
```
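A subtlety in the new `FetchTask`: the task_tracker response is decoded twice, because the tracker's `recipe` field is itself a JSON document describing the crawl `Task`. A self-contained sketch of that double decode, with stand-in structs whose field sets are trimmed to what the example needs:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Trimmed stand-ins for FetchTaskResponse and Task.
type trackerResp struct {
	Ok      bool `json:"ok"`
	Content struct {
		Task struct {
			Id     int64  `json:"id"`
			Recipe string `json:"recipe"`
		} `json:"task"`
	} `json:"content"`
}

type task struct {
	WebsiteId   uint64 `json:"website_id"`
	Url         string `json:"url"`
	UploadToken string `json:"upload_token"`
}

func main() {
	raw := `{"ok":true,"content":{"task":{"id":7,
		"recipe":"{\"website_id\":42,\"url\":\"http://example.com/\",\"upload_token\":\"abc\"}"}}}`

	var resp trackerResp
	if err := json.Unmarshal([]byte(raw), &resp); err != nil {
		panic(err)
	}

	// Second decode: the recipe string carries the actual Task.
	var t task
	if err := json.Unmarshal([]byte(resp.Content.Task.Recipe), &t); err != nil {
		panic(err)
	}
	fmt.Printf("task %d -> crawl %s\n", resp.Content.Task.Id, t.Url)
}
```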
stats.go: 14 changes

```diff
@@ -3,6 +3,7 @@ package main
 import (
 	"context"
 	"github.com/sirupsen/logrus"
+	"github.com/spf13/viper"
 	"math"
 	"runtime"
 	"sync/atomic"
@@ -19,11 +20,14 @@ func Stats(c context.Context) {
 	var crawlTicker <-chan time.Time
 	var allocTicker <-chan time.Time
 
-	if config.CrawlStats != 0 {
-		crawlTicker = time.NewTicker(config.CrawlStats).C
+	crawlInterval := viper.GetDuration(ConfCrawlStats)
+	allocInterval := viper.GetDuration(ConfAllocStats)
+
+	if crawlInterval != 0 {
+		crawlTicker = time.Tick(crawlInterval)
 	}
-	if config.AllocStats != 0 {
-		allocTicker = time.NewTicker(config.AllocStats).C
+	if allocInterval != 0 {
+		allocTicker = time.Tick(allocInterval)
 	}
 
 	for {
@@ -32,7 +36,7 @@ func Stats(c context.Context) {
 			startedNow := atomic.LoadUint64(&totalStarted)
 
 			perSecond := float64(startedNow - startedLast) /
-				config.CrawlStats.Seconds()
+				crawlInterval.Seconds()
 
 			// Round to .5
 			perSecond *= 2
```
worker.go: 29 changes

```diff
@@ -3,7 +3,6 @@ package main
 import (
 	"github.com/beeker1121/goque"
 	"github.com/sirupsen/logrus"
-	"github.com/valyala/fasthttp"
 	"math"
 	"sort"
 	"strings"
@@ -42,7 +41,7 @@ func (w *WorkerContext) Worker(results chan<- File) {
 }
 
 func (w *WorkerContext) step(results chan<- File, job Job) {
-	defer w.finishJob(&job)
+	defer w.finishJob()
 
 	var f File
 
@@ -55,20 +54,18 @@ func (w *WorkerContext) step(results chan<- File, job Job) {
 	if err != nil {
 		job.Fails++
 
-		if httpErr, ok := err.(*HttpError); ok {
-			switch httpErr.code {
-			case fasthttp.StatusTooManyRequests:
-				err = ErrRateLimit
-			default:
-				// Don't retry HTTP error codes
-				return
-			}
-		}
+		if !shouldRetry(err) {
+			atomic.AddUint64(&totalAborted, 1)
+			//logrus.WithField("url", job.UriStr).
+			//	WithError(err).
+			//	Error("Giving up after failure")
+			return
+		}
 
 		if job.Fails > config.Retries {
 			atomic.AddUint64(&totalAborted, 1)
-			logrus.WithField("url", job.UriStr).
-				Errorf("Giving up after %d fails", job.Fails)
+			//logrus.WithField("url", job.UriStr).
+			//	Errorf("Giving up after %d fails", job.Fails)
 		} else {
 			atomic.AddUint64(&totalRetries, 1)
 			if err == ErrRateLimit {
@@ -91,7 +88,9 @@ func (w *WorkerContext) step(results chan<- File, job Job) {
 }
 
 func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) {
-	if len(job.Uri.Path) == 0 { return }
+	if len(job.Uri.Path) == 0 {
+		return
+	}
 	if job.Uri.Path[len(job.Uri.Path)-1] == '/' {
 		// Load directory
 		links, err := GetDir(job, f)
@@ -162,10 +161,10 @@ func (w *WorkerContext) queueJob(job Job) {
 	w.OD.Wait.Add(1)
 
 	if w.numRateLimits > 0 {
-		if time.Since(w.lastRateLimit) > 5 * time.Second {
+		if time.Since(w.lastRateLimit) > 5*time.Second {
 			w.numRateLimits = 0
 		} else {
-			time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) *
+			time.Sleep(time.Duration(math.Sqrt(float64(50*w.numRateLimits))) *
 				100 * time.Millisecond)
 		}
 	}
@@ -175,7 +174,7 @@ func (w *WorkerContext) queueJob(job Job) {
 	}
 }
 
-func (w *WorkerContext) finishJob(job *Job) {
+func (w *WorkerContext) finishJob() {
 	w.OD.Wait.Done()
 }
```
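The rate-limit backoff in `queueJob` sleeps for `sqrt(50*n)` ticks of 100 ms after the n-th recent HTTP 429, with the square root truncated to a whole tick count when converted to `time.Duration`. A tiny program that prints the actual delays:

```go
package main

import (
	"fmt"
	"math"
	"time"
)

func main() {
	// Reproduces the worker's backoff arithmetic: the float sqrt is
	// truncated to whole "ticks", each worth 100 ms.
	for n := 1; n <= 5; n++ {
		d := time.Duration(math.Sqrt(float64(50*n))) * 100 * time.Millisecond
		fmt.Printf("rate limit #%d -> sleep %v\n", n, d)
	}
	// Prints 700ms, 1s, 1.2s, 1.4s, 1.5s: growth flattens out quickly.
}
```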