Mirror of https://github.com/terorie/od-database-crawler.git
Synced 2025-04-20 19:06:46 +00:00

Comparing "master" and "v1.0.1"
No commits in common: "master" and "v1.0.1" have entirely different histories.
In the reconstructed diffs below, "-" lines belong to master and "+" lines to v1.0.1.
File: .gitignore (vendored, 3 changed lines)

@@ -1,6 +1,3 @@
 /.idea/
 .DS_Store
 /od-database-crawler
-*.log
-/queue/
-/crawled/
File: deleted (name not captured in this mirror; the contents are a Travis CI build config)

@@ -1,5 +0,0 @@
-language: go
-
-go:
-- "1.11.x"
-- master
File: Dockerfile (deleted, 15 lines)

@@ -1,15 +0,0 @@
-FROM golang:alpine as builder
-ADD . /go/src/github.com/terorie/od-database-crawler
-RUN apk add git \
- && go get -d -v github.com/terorie/od-database-crawler \
- && CGO_ENABLED=0 go install -a \
-    -installsuffix cgo \
-    -ldflags="-s -w" \
-    github.com/terorie/od-database-crawler
-
-FROM scratch
-COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
-COPY --from=builder /go/bin/od-database-crawler /bin/
-WORKDIR /oddb
-VOLUME [ "/oddb" ]
-CMD ["/bin/od-database-crawler", "server"]
File: README.md (56 changed lines)

@@ -1,57 +1,7 @@
-# OD-Database Crawler 🕷
-[](https://travis-ci.org/terorie/od-database-crawler)
-[](https://github.com/terorie/od-database-crawler)
-[](https://www.codefactor.io/repository/github/terorie/od-database-crawler/overview/master)
+# od-database Go crawler 🚀
+> by terorie 2018 :P
 
 * Crawler for [__OD-Database__](https://github.com/simon987/od-database)
-* In production at https://od-db.the-eye.eu/
-* Over 880 TB actively crawled
 * Crawls HTTP open directories (standard Web Server Listings)
 * Gets name, path, size and modification time of all files
-* Lightweight and fast
+* Lightweight and fast: __over 9000 requests per second__ on a standard laptop
-
-https://od-db.the-eye.eu/
-
-## Usage
-
-### Deploys
-
-1. With Config File (if `config.yml` found in working dir)
-   - Download [default config](https://github.com/terorie/od-database-crawler/blob/master/config.yml)
-   - Set `server.url` and `server.token`
-   - Start with `./od-database-crawler server --config <file>`
-
-2. With Flags or env
-   - Override config file if it exists
-   - `--help` for list of flags
-   - Every flag is available as an environment variable:
-     `--server.crawl_stats` ➡️ `OD_SERVER_CRAWL_STATS`
-   - Start with `./od-database-crawler server <flags>`
-
-3. With Docker
-   ```bash
-   docker run \
-       -e OD_SERVER_URL=xxx \
-       -e OD_SERVER_TOKEN=xxx \
-       terorie/od-database-crawler
-   ```
-
-### Flag reference
-
-Here are the most important config flags. For more fine control, take a look at `/config.yml`.
-
-| Flag/Environment | Description | Example |
-| --- | --- | --- |
-| `server.url`<br />`OD_SERVER_URL` | OD-DB Server URL | `https://od-db.mine.the-eye.eu/api` |
-| `server.token`<br />`OD_SERVER_TOKEN` | OD-DB Server Access Token | _Ask Hexa **TM**_ |
-| `server.recheck`<br />`OD_SERVER_RECHECK` | Job Fetching Interval | `3s` |
-| `output.crawl_stats`<br />`OD_OUTPUT_CRAWL_STATS` | Crawl Stats Logging Interval (0 = disabled) | `500ms` |
-| `output.resource_stats`<br />`OD_OUTPUT_RESORUCE_STATS` | Resource Stats Logging Interval (0 = disabled) | `8s` |
-| `output.log`<br />`OD_OUTPUT_LOG` | Log File (none = disabled) | `crawler.log` |
-| `crawl.tasks`<br />`OD_CRAWL_TASKS` | Max number of sites to crawl concurrently | `500` |
-| `crawl.connections`<br />`OD_CRAWL_CONNECTIONS` | HTTP connections per site | `1` |
-| `crawl.retries`<br />`OD_CRAWL_RETRIES` | How often to retry after a temporary failure (e.g. `HTTP 429` or timeouts) | `5` |
-| `crawl.dial_timeout`<br />`OD_CRAWL_DIAL_TIMEOUT` | TCP Connect timeout | `5s` |
-| `crawl.timeout`<br />`OD_CRAWL_TIMEOUT` | HTTP request timeout | `20s` |
-| `crawl.user-agent`<br />`OD_CRAWL_USER_AGENT` | HTTP Crawler User-Agent | `googlebot/1.2.3` |
-| `crawl.job_buffer`<br />`OD_CRAWL_JOB_BUFFER` | Number of URLs to keep in memory/cache, per job. The rest is offloaded to disk. Decrease this value if the crawler uses too much RAM. (0 = Disable Cache, -1 = Only use Cache) | `5000` |
File: config.go (118 changed lines)

@@ -4,7 +4,6 @@ import (
     "bufio"
     "fmt"
     "github.com/sirupsen/logrus"
-    "github.com/spf13/pflag"
     "github.com/spf13/viper"
     "io"
     "os"
@@ -21,32 +20,25 @@ var config struct {
     Retries int
     Workers int
     UserAgent string
+    Timeout time.Duration
     Tasks int32
+    CrawlStats time.Duration
+    AllocStats time.Duration
     Verbose bool
     PrintHTTP bool
-    JobBufferSize int
 }
 
-var onlineMode bool
-
 const (
     ConfServerUrl = "server.url"
     ConfToken = "server.token"
     ConfServerTimeout = "server.timeout"
     ConfRecheck = "server.recheck"
-    ConfCooldown = "server.cooldown"
     ConfChunkSize = "server.upload_chunk"
-    ConfUploadRetries = "server.upload_retries"
-    ConfUploadRetryInterval = "server.upload_retry_interval"
 
     ConfTasks = "crawl.tasks"
     ConfRetries = "crawl.retries"
     ConfWorkers = "crawl.connections"
     ConfUserAgent = "crawl.user-agent"
-    ConfDialTimeout = "crawl.dial_timeout"
     ConfTimeout = "crawl.timeout"
-    ConfJobBufferSize = "crawl.job_buffer"
 
     ConfCrawlStats = "output.crawl_stats"
     ConfAllocStats = "output.resource_stats"
     ConfVerbose = "output.verbose"
@@ -55,96 +47,29 @@ const (
 )
 
 func prepareConfig() {
-    pf := rootCmd.PersistentFlags()
-
-    pf.SortFlags = false
-    pf.StringVar(&configFile, "config", "", "Config file")
-    configFile = os.Getenv("OD_CONFIG")
-
-    pf.String(ConfServerUrl, "http://od-db.the-eye.eu/api", "OD-DB server URL")
-    pf.String(ConfToken, "", "OD-DB access token (env OD_SERVER_TOKEN)")
-    pf.Duration(ConfServerTimeout, 60 * time.Second, "OD-DB request timeout")
-    pf.Duration(ConfRecheck, 1 * time.Second, "OD-DB: Poll interval for new jobs")
-    pf.Duration(ConfCooldown, 30 * time.Second, "OD-DB: Time to wait after a server-side error")
-    pf.String(ConfChunkSize, "1 MB", "OD-DB: Result upload chunk size")
-    pf.Uint(ConfUploadRetries, 10, "OD-DB: Max upload retries")
-    pf.Duration(ConfUploadRetryInterval, 30 * time.Second, "OD-DB: Time to wait between upload retries")
-    pf.Uint(ConfTasks, 100, "Crawler: Max concurrent tasks")
-    pf.Uint(ConfWorkers, 4, "Crawler: Connections per server")
-    pf.Uint(ConfRetries, 5, "Crawler: Request retries")
-    pf.Duration(ConfDialTimeout, 10 * time.Second, "Crawler: Handshake timeout")
-    pf.Duration(ConfTimeout, 30 * time.Second, "Crawler: Request timeout")
-    pf.String(ConfUserAgent, "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0", "Crawler: User-Agent")
-    pf.Uint(ConfJobBufferSize, 5000, "Crawler: Task queue cache size")
-    pf.Duration(ConfCrawlStats, time.Second, "Log: Crawl stats interval")
-    pf.Duration(ConfAllocStats, 10 * time.Second, "Log: Resource stats interval")
-    pf.Bool(ConfVerbose, false, "Log: Print every listed dir")
-    pf.Bool(ConfPrintHTTP, false, "Log: Print HTTP client errors")
-    pf.String(ConfLogFile, "crawler.log", "Log file")
-
-    // Bind all flags to Viper
-    pf.VisitAll(func(flag *pflag.Flag) {
-        s := flag.Name
-        s = strings.TrimLeft(s, "-")
-
-        if err := viper.BindPFlag(s, flag); err != nil {
-            panic(err)
-        }
-        var envKey string
-        envKey = strings.Replace(s, ".", "_", -1)
-        envKey = strings.ToUpper(envKey)
-        envKey = "OD_" + envKey
-        if err := viper.BindEnv(s, envKey); err != nil {
-            panic(err)
-        }
-    })
+    viper.SetDefault(ConfRetries, 5)
+    viper.SetDefault(ConfWorkers, 2)
+    viper.SetDefault(ConfTasks, 3)
+    viper.SetDefault(ConfUserAgent, "")
+    viper.SetDefault(ConfTimeout, 10 * time.Second)
+    viper.SetDefault(ConfCrawlStats, 3 * time.Second)
+    viper.SetDefault(ConfAllocStats, 0)
+    viper.SetDefault(ConfVerbose, false)
+    viper.SetDefault(ConfPrintHTTP, false)
+    viper.SetDefault(ConfLogFile, "")
+    viper.SetDefault(ConfRecheck, 3 * time.Second)
+    viper.SetDefault(ConfChunkSize, "1 MB")
 }
 
 func readConfig() {
-    // If config.yml in working dir, use it
-    if configFile == "" {
-        _, err := os.Stat("config.yml")
-        if err == nil {
-            configFile = "config.yml"
-        }
-    }
-
-    if configFile != "" {
-        confF, err := os.Open(configFile)
+    viper.AddConfigPath(".")
+    viper.SetConfigName("config")
+    err := viper.ReadInConfig()
     if err != nil {
         fmt.Fprintln(os.Stderr, err)
         os.Exit(1)
     }
-        defer confF.Close()
-
-        viper.SetConfigType("yml")
-        err = viper.ReadConfig(confF)
-        if err != nil {
-            fmt.Fprintln(os.Stderr, err)
-            os.Exit(1)
-        }
-    }
-
-    if onlineMode {
     config.ServerUrl = viper.GetString(ConfServerUrl)
     if config.ServerUrl == "" {
         configMissing(ConfServerUrl)
@@ -155,7 +80,6 @@ func readConfig() {
     if config.Token == "" {
         configMissing(ConfToken)
     }
-    }
 
     config.ServerTimeout = viper.GetDuration(ConfServerTimeout)
 
@@ -183,11 +107,11 @@ func readConfig() {
 
     config.UserAgent = viper.GetString(ConfUserAgent)
 
-    setDialTimeout(viper.GetDuration(ConfDialTimeout))
+    config.Timeout = viper.GetDuration(ConfTimeout)
 
-    setTimeout(viper.GetDuration(ConfTimeout))
+    config.CrawlStats = viper.GetDuration(ConfCrawlStats)
 
-    config.JobBufferSize = viper.GetInt(ConfJobBufferSize)
+    config.AllocStats = viper.GetDuration(ConfAllocStats)
 
     config.Verbose = viper.GetBool(ConfVerbose)
     if config.Verbose {
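The removed master-side `VisitAll` loop above is what backs the flag/env pairs in the README: every registered flag is bound to Viper, and an `OD_`-prefixed environment key is derived from the flag name (dots to underscores, upper-cased), so `crawl.tasks` and `OD_CRAWL_TASKS` resolve to the same setting. A minimal, self-contained sketch of that binding idiom (a demo flag set, not the project's actual file):

```go
package main

import (
	"fmt"
	"strings"

	"github.com/spf13/pflag"
	"github.com/spf13/viper"
)

func main() {
	pf := pflag.NewFlagSet("demo", pflag.ExitOnError)
	pf.Uint("crawl.tasks", 100, "Max concurrent tasks")

	// Bind each flag to Viper and to an OD_-prefixed env variable,
	// mirroring the VisitAll loop in master's prepareConfig.
	pf.VisitAll(func(flag *pflag.Flag) {
		if err := viper.BindPFlag(flag.Name, flag); err != nil {
			panic(err)
		}
		envKey := "OD_" + strings.ToUpper(strings.Replace(flag.Name, ".", "_", -1))
		if err := viper.BindEnv(flag.Name, envKey); err != nil {
			panic(err)
		}
	})

	pf.Parse([]string{"--crawl.tasks=500"})
	fmt.Println(viper.GetInt("crawl.tasks")) // 500; setting OD_CRAWL_TASKS would also work
}
```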
File: config.yml (30 changed lines)

@@ -15,17 +15,10 @@ server:
   # between /task/get requests to the server.
   recheck: 1s
 
-  # Time to wait after receiving an error
-  # from the server. Doesn't apply to uploads.
-  cooldown: 30s
-
   # Upload chunk size
   # If the value is too high, the upload fails.
   upload_chunk: 1 MB
 
-  upload_retries: 10
-  upload_retry_interval: 30s
-
 # Log output settings
 output:
   # Crawl statistics
@@ -47,38 +40,21 @@ output:
 # Crawler settings
 crawl:
   # Number of sites that can be processed at once
-  tasks: 25
+  tasks: 100
 
   # Number of connections per site
   # Please be careful with this setting!
   # The crawler fires fast and more than
   # ten connections can overwhelm a server.
-  connections: 1
+  connections: 10
 
   # How often to retry getting data
   # from the site before giving up
   retries: 5
 
-  # Time before discarding a failed connection attempt
-  dial_timeout: 10s
-
   # Time before discarding a network request
-  timeout: 30s
+  timeout: 10s
 
   # Crawler User-Agent
   # If empty, no User-Agent header is sent.
   user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"
-
-  # Job buffer size (per task)
-  # Higher values cause less disk writes
-  # but require more memory.
-  #
-  # The job queue contains all URLs
-  # that should be crawled next.
-  # As it grows very large over time,
-  # it's kept mainly on disk.
-  # This sets how many jobs are kept
-  # in memory.
-  # A negative value will cause all jobs
-  # to be stored in memory. (Don't do this)
-  job_buffer: -1
File: crawl.go (40 changed lines)

@@ -8,7 +8,6 @@ import (
     "github.com/valyala/fasthttp"
     "golang.org/x/crypto/blake2b"
     "golang.org/x/net/html"
-    "net"
     "path"
     "strconv"
     "strings"
@@ -21,17 +20,6 @@ var client = fasthttp.Client {
     },
 }
 
-func setDialTimeout(d time.Duration) {
-    client.Dial = func(addr string) (net.Conn, error) {
-        return fasthttp.DialTimeout(addr, d)
-    }
-}
-
-func setTimeout(d time.Duration) {
-    client.ReadTimeout = d
-    client.WriteTimeout = d / 2
-}
-
 func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
     f.IsDir = true
     f.Name = path.Base(j.Uri.Path)
@@ -45,7 +33,7 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
     res := fasthttp.AcquireResponse()
     defer fasthttp.ReleaseResponse(res)
 
-    err = client.Do(req, res)
+    err = client.DoTimeout(req, res, config.Timeout)
     fasthttp.ReleaseRequest(req)
 
     if err != nil {
@@ -58,16 +46,10 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
     }
 
     body := res.Body()
-    return ParseDir(body, &j.Uri)
-}
-
-func ParseDir(body []byte, baseUri *fasturl.URL) (links []fasturl.URL, err error) {
     doc := html.NewTokenizer(bytes.NewReader(body))
 
     var linkHref string
     for {
-        err = nil
-
         tokenType := doc.Next()
         if tokenType == html.ErrorToken {
             break
@@ -98,34 +80,36 @@ func ParseDir(body []byte, baseUri *fasturl.URL) (links []fasturl.URL, err error)
             linkHref = ""
 
             if strings.LastIndexByte(href, '?') != -1 {
-                continue
+                goto nextToken
             }
 
             switch href {
             case "", " ", ".", "..", "/":
-                continue
+                goto nextToken
             }
 
             if strings.Contains(href, "../") {
-                continue
+                goto nextToken
             }
 
             var link fasturl.URL
-            err = baseUri.ParseRel(&link, href)
+            err = j.Uri.ParseRel(&link, href)
             if err != nil {
                 continue
             }
 
-            if link.Scheme != baseUri.Scheme ||
-                link.Host != baseUri.Host ||
-                link.Path == baseUri.Path ||
-                !strings.HasPrefix(link.Path, baseUri.Path) {
+            if link.Scheme != j.Uri.Scheme ||
+                link.Host != j.Uri.Host ||
+                link.Path == j.Uri.Path ||
+                !strings.HasPrefix(link.Path, j.Uri.Path) {
                 continue
             }
 
             links = append(links, link)
         }
 
+    nextToken:
     }
 
     return
@@ -148,7 +132,7 @@ func GetFile(u fasturl.URL, f *File) (err error) {
     res.SkipBody = true
     defer fasthttp.ReleaseResponse(res)
 
-    err = client.Do(req, res)
+    err = client.DoTimeout(req, res, config.Timeout)
    fasthttp.ReleaseRequest(req)
 
     if err != nil {
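One readability note on this hunk: v1.0.1 jumps to a `nextToken:` label placed just before the tokenizer loop's closing brace, while master uses plain `continue`. In Go, `continue` inside a `switch` already targets the enclosing `for` (only `break` is captured by the switch), and a `goto` to a label at the very end of the loop body has the same effect, so the two spellings are equivalent here; master's rewrite is just the simpler form. A tiny runnable sketch of the equivalence:

```go
package main

import "fmt"

func main() {
	// Both loops print only the even numbers.
	for i := 0; i < 6; i++ {
		switch {
		case i%2 == 1:
			continue // `continue` targets the for loop, not the switch
		}
		fmt.Println("continue form:", i)
	}

	for i := 0; i < 6; i++ {
		switch {
		case i%2 == 1:
			goto next // jump to the label at the end of the loop body
		}
		fmt.Println("goto form:", i)
	next:
	}
}
```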
|
File diff suppressed because it is too large
Load Diff
File: deleted (name not captured in this mirror; a Go test for the directory-listing parser, 117 lines)

@@ -1,117 +0,0 @@
-package main
-
-import (
-    "github.com/terorie/od-database-crawler/fasturl"
-    "testing"
-)
-
-func TestParseDirNginx(t *testing.T) {
-    var u fasturl.URL
-    err := u.Parse("https://the-eye.eu/public/")
-    if err != nil {
-        t.Fatal("Failed to parse URL", err)
-    }
-
-    links, err := ParseDir([]byte(nginxListing), &u)
-    if err != nil {
-        t.Fatal("Failed to extract links", err)
-    }
-
-    if len(links) != len(nginxLinks) {
-        t.Fatalf("Expected %d links, got %d",
-            len(nginxLinks), len(links))
-    }
-
-    for i := 0; i < len(links); i++ {
-        gotLink := links[i].String()
-        expLink := nginxLinks[i]
-
-        if gotLink != expLink {
-            t.Errorf(`Expected "%s" got "%s"`,
-                expLink, gotLink)
-        }
-    }
-}
-
-var nginxLinks = []string {
-    "https://the-eye.eu/public/AppleArchive/",
-    "https://the-eye.eu/public/AudioBooks/",
-    "https://the-eye.eu/public/Books/",
-    "https://the-eye.eu/public/Comics/",
-    "https://the-eye.eu/public/Games/",
-    "https://the-eye.eu/public/Icons/",
-    "https://the-eye.eu/public/Images/",
-    "https://the-eye.eu/public/JFK_Files/",
-    "https://the-eye.eu/public/MSDN/",
-    "https://the-eye.eu/public/Music/",
-    "https://the-eye.eu/public/Operating%20Systems/",
-    "https://the-eye.eu/public/Posters/",
-    "https://the-eye.eu/public/Psychedelics/",
-    "https://the-eye.eu/public/Psychoactives/",
-    "https://the-eye.eu/public/Radio/",
-    "https://the-eye.eu/public/Random/",
-    "https://the-eye.eu/public/Site-Dumps/",
-    "https://the-eye.eu/public/Software/",
-    "https://the-eye.eu/public/Strategic%20Intelligence%20Network/",
-    "https://the-eye.eu/public/WorldTracker.org/",
-    "https://the-eye.eu/public/concen.org/",
-    "https://the-eye.eu/public/freenrg.info/",
-    "https://the-eye.eu/public/murdercube.com/",
-    "https://the-eye.eu/public/parazite/",
-    "https://the-eye.eu/public/ripreddit/",
-    "https://the-eye.eu/public/rom/",
-    "https://the-eye.eu/public/touhou/",
-    "https://the-eye.eu/public/vns/",
-    "https://the-eye.eu/public/xbins/",
-    "https://the-eye.eu/public/xbins.diodematrix/",
-    "https://the-eye.eu/public/Rclone_for_Scrubs.pdf",
-    "https://the-eye.eu/public/Wget_Linux_Guide.pdf",
-    "https://the-eye.eu/public/Wget_Windows_Guide.pdf",
-    "https://the-eye.eu/public/rclone_guide.pdf",
-    "https://the-eye.eu/public/wget-noobs-guide.pdf",
-    "https://the-eye.eu/public/xbox-scene_Aug2014.7z",
-}
-
-const nginxListing =
-`<html>
-<head><title>Index of /public/</title></head>
-<body bgcolor="white">
-<h1>Index of /public/</h1><hr><pre><a href="../">../</a>
-<a href="AppleArchive/">AppleArchive/</a> 03-Nov-2017 18:13 -
-<a href="AudioBooks/">AudioBooks/</a> 29-Sep-2018 19:47 -
-<a href="Books/">Books/</a> 27-Nov-2018 17:50 -
-<a href="Comics/">Comics/</a> 05-Nov-2018 21:37 -
-<a href="Games/">Games/</a> 28-Nov-2018 11:54 -
-<a href="Icons/">Icons/</a> 22-May-2018 07:47 -
-<a href="Images/">Images/</a> 21-Jan-2018 03:21 -
-<a href="JFK_Files/">JFK_Files/</a> 03-Nov-2017 17:03 -
-<a href="MSDN/">MSDN/</a> 03-Nov-2017 15:48 -
-<a href="Music/">Music/</a> 02-Mar-2018 15:47 -
-<a href="Operating%20Systems/">Operating Systems/</a> 25-Apr-2018 07:18 -
-<a href="Posters/">Posters/</a> 07-Jul-2018 01:12 -
-<a href="Psychedelics/">Psychedelics/</a> 11-Apr-2018 05:45 -
-<a href="Psychoactives/">Psychoactives/</a> 18-May-2018 02:58 -
-<a href="Radio/">Radio/</a> 09-Jun-2018 15:49 -
-<a href="Random/">Random/</a> 04-Dec-2018 12:33 -
-<a href="Site-Dumps/">Site-Dumps/</a> 15-Dec-2018 11:04 -
-<a href="Software/">Software/</a> 27-Nov-2017 00:22 -
-<a href="Strategic%20Intelligence%20Network/">Strategic Intelligence Network/</a> 17-Nov-2017 16:35 -
-<a href="WorldTracker.org/">WorldTracker.org/</a> 12-Apr-2018 04:16 -
-<a href="concen.org/">concen.org/</a> 08-Oct-2018 14:08 -
-<a href="freenrg.info/">freenrg.info/</a> 19-Dec-2017 10:59 -
-<a href="murdercube.com/">murdercube.com/</a> 06-Dec-2017 10:45 -
-<a href="parazite/">parazite/</a> 20-Nov-2017 21:25 -
-<a href="ripreddit/">ripreddit/</a> 04-Aug-2018 14:30 -
-<a href="rom/">rom/</a> 28-Nov-2018 14:15 -
-<a href="touhou/">touhou/</a> 03-Nov-2017 11:07 -
-<a href="vns/">vns/</a> 03-Nov-2017 11:36 -
-<a href="xbins/">xbins/</a> 03-Nov-2017 17:23 -
-<a href="xbins.diodematrix/">xbins.diodematrix/</a> 21-Sep-2018 22:33 -
-<a href="Rclone_for_Scrubs.pdf">Rclone_for_Scrubs.pdf</a> 04-Sep-2018 13:31 315K
-<a href="Wget_Linux_Guide.pdf">Wget_Linux_Guide.pdf</a> 21-Dec-2017 20:28 168K
-<a href="Wget_Windows_Guide.pdf">Wget_Windows_Guide.pdf</a> 25-Nov-2017 17:59 867K
-<a href="rclone_guide.pdf">rclone_guide.pdf</a> 03-Sep-2018 23:37 315K
-<a href="wget-noobs-guide.pdf">wget-noobs-guide.pdf</a> 21-Dec-2017 20:29 168K
-<a href="xbox-scene_Aug2014.7z">xbox-scene_Aug2014.7z</a> 26-Oct-2017 23:09 1G
-</pre><hr></body>
-</html>`
File: deleted (name not captured in this mirror; Go benchmarks for the directory-listing parser, 59 lines)

@@ -1,59 +0,0 @@
-package main
-
-import (
-    "bytes"
-    "github.com/PuerkitoBio/goquery"
-    "github.com/terorie/od-database-crawler/fasturl"
-    "net/url"
-    "strings"
-    "testing"
-)
-
-func BenchmarkParseDir(b *testing.B) {
-    for n := 0; n < b.N; n++ {
-        var u fasturl.URL
-        err := u.Parse("http://archive.ubuntu.com/ubuntu/indices/")
-        if err != nil {
-            b.Fatal("Failed to parse URL", err)
-        }
-
-        _, err = ParseDir([]byte(apache2Listing), &u)
-        if err != nil {
-            b.Fatal("Failed to extract links", err)
-        }
-    }
-}
-
-func BenchmarkParseDirReference(b *testing.B) {
-    for n := 0; n < b.N; n++ {
-        u, err := url.Parse("http://archive.ubuntu.com/ubuntu/indices/")
-        if err != nil {
-            b.Fatal("Failed to parse URL", err)
-        }
-
-        _, err = referenceParseDir([]byte(apache2Listing), u)
-        if err != nil {
-            b.Fatal("Failed to extract links", err)
-        }
-    }
-}
-
-func referenceParseDir(body []byte, baseUri *url.URL) (links []*url.URL, err error) {
-    doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
-    if err != nil { return nil, err }
-
-    doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
-        href, _ := s.Attr("href")
-
-        sub, err := baseUri.Parse(href)
-        if err != nil { return } // continue
-
-        if !strings.HasPrefix(sub.String(), baseUri.String()) {
-            return // continue
-        }
-
-        links = append(links, sub)
-    })
-
-    return
-}
File: errors.go (28 changed lines)

@@ -3,8 +3,6 @@ package main
 import (
     "errors"
     "fmt"
-    "github.com/valyala/fasthttp"
-    "net"
 )
 
 var ErrRateLimit = errors.New("too many requests")
@@ -17,29 +15,3 @@ type HttpError struct {
 func (e HttpError) Error() string {
     return fmt.Sprintf("http status %d", e.code)
 }
-
-func shouldRetry(err error) bool {
-    // HTTP errors
-    if httpErr, ok := err.(*HttpError); ok {
-        switch httpErr.code {
-        case fasthttp.StatusTooManyRequests:
-            return true
-        default:
-            // Don't retry HTTP error codes
-            return false
-        }
-    }
-
-    if dnsError, ok := err.(*net.DNSError); ok {
-        // Don't retry permanent DNS errors
-        return dnsError.IsTemporary
-    }
-
-    if netErr, ok := err.(*net.OpError); ok {
-        // Don't retry permanent network errors
-        return netErr.Temporary()
-    }
-
-    // Retry by default
-    return true
-}
File: go.mod (deleted, 13 lines)

@@ -1,13 +0,0 @@
-module github.com/terorie/od-database-crawler
-
-require (
-    github.com/beeker1121/goque v2.0.1+incompatible
-    github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db // indirect
-    github.com/sirupsen/logrus v1.4.0
-    github.com/spf13/cobra v0.0.3
-    github.com/spf13/viper v1.3.2
-    github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2 // indirect
-    github.com/valyala/fasthttp v1.2.0
-    golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613
-    golang.org/x/net v0.0.0-20180911220305-26e67e76b6c3
-)
File: go.sum (deleted, 70 lines)

@@ -1,70 +0,0 @@
-github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
-github.com/beeker1121/goque v2.0.1+incompatible h1:5nJHPMqQLxUvGFc8m/NW2QzxKyc0zICmqs/JUsmEjwE=
-github.com/beeker1121/goque v2.0.1+incompatible/go.mod h1:L6dOWBhDOnxUVQsb0wkLve0VCnt2xJW/MI8pdRX4ANw=
-github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
-github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk=
-github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
-github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I=
-github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
-github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db h1:woRePGFeVFfLKN/pOkfl+p/TAqKOfFu+7KPlMVpok/w=
-github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
-github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
-github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
-github.com/klauspost/compress v1.4.0 h1:8nsMz3tWa9SWWPL60G1V6CUsf4lLjWLTNEtibhe8gh8=
-github.com/klauspost/compress v1.4.0/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
-github.com/klauspost/cpuid v0.0.0-20180405133222-e7e905edc00e h1:+lIPJOWl+jSiJOc70QXJ07+2eg2Jy2EC7Mi11BWujeM=
-github.com/klauspost/cpuid v0.0.0-20180405133222-e7e905edc00e/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
-github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
-github.com/magiconair/properties v1.8.0 h1:LLgXmsheXeRoUOBOjtwPQCWIYqM/LU1ayDtDePerRcY=
-github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
-github.com/mitchellh/mapstructure v1.1.2 h1:fmNYVwqnSfB9mZU6OS2O6GsXM+wcskZDuKQzvN1EDeE=
-github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
-github.com/pelletier/go-toml v1.2.0 h1:T5zMGML61Wp+FlcbWjRDT7yAxhJNAiPPLOFECq181zc=
-github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic=
-github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/sirupsen/logrus v1.3.0 h1:hI/7Q+DtNZ2kINb6qt/lS+IyXnHQe9e90POfeewL/ME=
-github.com/sirupsen/logrus v1.3.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
-github.com/sirupsen/logrus v1.4.0 h1:yKenngtzGh+cUSSh6GWbxW2abRqhYUSR/t/6+2QqNvE=
-github.com/sirupsen/logrus v1.4.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
-github.com/spf13/afero v1.1.2 h1:m8/z1t7/fwjysjQRYbP0RD+bUIF/8tJwPdEZsI83ACI=
-github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ=
-github.com/spf13/cast v1.3.0 h1:oget//CVOEoFewqQxwr0Ej5yjygnqGkvggSE/gB35Q8=
-github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE=
-github.com/spf13/cobra v0.0.3 h1:ZlrZ4XsMRm04Fr5pSFxBgfND2EBVa1nLpiy1stUsX/8=
-github.com/spf13/cobra v0.0.3/go.mod h1:1l0Ry5zgKvJasoi3XT1TypsSe7PqH0Sj9dhYf7v3XqQ=
-github.com/spf13/jwalterweatherman v1.0.0 h1:XHEdyB+EcvlqZamSM4ZOMGlc93t6AcsBEu9Gc1vn7yk=
-github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo=
-github.com/spf13/pflag v1.0.3 h1:zPAT6CGy6wXeQ7NtTnaTerfKOsV6V6F8agHXFiazDkg=
-github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
-github.com/spf13/viper v1.3.1 h1:5+8j8FTpnFV4nEImW/ofkzEt8VoOiLXxdYIDsB73T38=
-github.com/spf13/viper v1.3.1/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s=
-github.com/spf13/viper v1.3.2 h1:VUFqw5KcqRf7i70GOzW7N+Q7+gxVBkSSqiXB12+JQ4M=
-github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s=
-github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
-github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2 h1:GnOzE5fEFN3b2zDhJJABEofdb51uMRNb8eqIVtdducs=
-github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2/go.mod h1:Z4AUp2Km+PwemOoO/VB5AOx9XSsIItzFjoJlOSiYmn0=
-github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
-github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
-github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
-github.com/valyala/fasthttp v1.1.0 h1:3BohG7mqwj4lq7PTX//7gLbUlzNvZSPmuHFnloXT0lw=
-github.com/valyala/fasthttp v1.1.0/go.mod h1:4vX61m6KN+xDduDNwXrhIAVZaZaZiQ1luJk8LWSxF3s=
-github.com/valyala/fasthttp v1.2.0 h1:dzZJf2IuMiclVjdw0kkT+f9u4YdrapbNyGAN47E/qnk=
-github.com/valyala/fasthttp v1.2.0/go.mod h1:4vX61m6KN+xDduDNwXrhIAVZaZaZiQ1luJk8LWSxF3s=
-github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio=
-github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
-golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
-golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
-golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613 h1:MQ/ZZiDsUapFFiMS+vzwXkCTeEKaum+Do5rINYJDmxc=
-golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
-golang.org/x/net v0.0.0-20180911220305-26e67e76b6c3 h1:czFLhve3vsQetD6JOJ8NZZvGQIXlnN3/yXxbT6/awxI=
-golang.org/x/net v0.0.0-20180911220305-26e67e76b6c3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
-golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a h1:1n5lsVfiQW3yfsRGu98756EH1YthsFqr/5mxHduZW2A=
-golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
-golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
-gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
-gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
File: help.go (deleted, 15 lines)

@@ -1,15 +0,0 @@
-package main
-
-const helpText =
-`HTTP crawler for the OD-Database
-    DB      >> https://od-db.the-eye.eu <<
-    Crawler >> https://github.com/terorie/od-database-crawler <<
-    Server  >> https://github.com/simon987/od-database <<
-
-Quick start:
- - get config file (config.yml in working dir)
- - get OD-DB server ("server.url": Database URL + /api)
- - get access token ("server.token": e.g. c010b6dd-20...)
- - ./od-database-crawler server
-
-Questions? Discord @terorie#2664 / Telegram @terorie`
File: main.go (140 changed lines)

@@ -2,103 +2,72 @@ package main
 
 import (
     "context"
-    "fmt"
     "github.com/sirupsen/logrus"
-    "github.com/spf13/cobra"
-    "github.com/spf13/viper"
     "github.com/terorie/od-database-crawler/fasturl"
+    "github.com/urfave/cli"
     "os"
-    "os/signal"
     "strings"
     "sync/atomic"
     "time"
 )
 
-var configFile string
-
-var rootCmd = cobra.Command {
-    Use: "od-database-crawler",
-    Version: "1.2.2",
-    Short: "OD-Database Go crawler",
-    Long: helpText,
-    PersistentPreRunE: preRun,
-    PersistentPostRun: func(cmd *cobra.Command, args []string) {
-        exitHooks.Execute()
-    },
-}
-
-var serverCmd = cobra.Command {
-    Use: "server",
-    Short: "Start crawl server",
-    Long: "Connect to the OD-Database and contribute to the database\n" +
-        "by crawling the web for open directories!",
-    Run: cmdBase,
-}
-
-var crawlCmd = cobra.Command {
-    Use: "crawl",
-    Short: "Crawl an URL",
-    Long: "Crawl the URL specified.\n" +
-        "Results will not be uploaded to the database,\n" +
-        "they're saved under crawled/0.json instead.\n" +
-        "Primarily used for testing and benchmarking.",
-    RunE: cmdCrawler,
-    Args: cobra.ExactArgs(1),
-}
+var app = cli.App {
+    Name: "od-database-crawler",
+    Usage: "OD-Database Go crawler",
+    Version: "1.0.1",
+    BashComplete: cli.DefaultAppComplete,
+    Writer: os.Stdout,
+    Action: cmdBase,
+    Commands: []cli.Command{
+        {
+            Name: "crawl",
+            Usage: "Crawl a list of URLs",
+            ArgsUsage: "<site>",
+            Action: cmdCrawler,
+        },
+    },
+    After: func(i *cli.Context) error {
+        exitHooks.Execute()
+        return nil
+    },
+}
 
 var exitHooks Hooks
 
 func init() {
-    rootCmd.AddCommand(&crawlCmd)
-    rootCmd.AddCommand(&serverCmd)
-
     prepareConfig()
 }
 
-func preRun(cmd *cobra.Command, args []string) error {
-    if err := os.MkdirAll("crawled", 0755);
-        err != nil { panic(err) }
-
-    if err := os.MkdirAll("queue", 0755);
-        err != nil { panic(err) }
-
-    return nil
-}
-
 func main() {
-    err := rootCmd.Execute()
+    err := os.MkdirAll("crawled", 0755)
     if err != nil {
-        fmt.Fprintln(os.Stderr, err)
-        os.Exit(1)
+        panic(err)
     }
+    readConfig()
+    app.Run(os.Args)
 }
 
-func cmdBase(_ *cobra.Command, _ []string) {
-    onlineMode = true
-    readConfig()
-
-    appCtx, soft := context.WithCancel(context.Background())
-    forceCtx, hard := context.WithCancel(context.Background())
-    go hardShutdown(forceCtx)
-    go listenCtrlC(soft, hard)
+func cmdBase(_ *cli.Context) error {
+    // TODO Graceful shutdown
+    appCtx := context.Background()
+    forceCtx := context.Background()
 
     inRemotes := make(chan *OD)
-    go Schedule(appCtx, inRemotes)
+    go Schedule(forceCtx, inRemotes)
 
     ticker := time.NewTicker(config.Recheck)
     defer ticker.Stop()
     for {
         select {
         case <-appCtx.Done():
-            goto shutdown
+            return nil
         case <-ticker.C:
             t, err := FetchTask()
             if err != nil {
                 logrus.WithError(err).
                     Error("Failed to get new task")
-                if !sleep(viper.GetDuration(ConfCooldown), appCtx) {
-                    goto shutdown
-                }
+                time.Sleep(30 * time.Second)
                 continue
             }
             if t == nil {
@@ -114,27 +83,33 @@ func cmdBase(...)
             if urlErr, ok := err.(*fasturl.Error); ok && urlErr.Err == fasturl.ErrUnknownScheme {
                 // Not an error
                 err = nil
-                // TODO FTP crawler
+                // Give back task
+                //err2 := CancelTask(t.WebsiteId)
+                //if err2 != nil {
+                //    logrus.Error(err2)
+                //}
+
                 continue
             } else if err != nil {
                 logrus.WithError(err).
                     Error("Failed to get new task")
-                time.Sleep(viper.GetDuration(ConfCooldown))
+                time.Sleep(30 * time.Second)
                 continue
             }
             ScheduleTask(inRemotes, t, &baseUri)
         }
     }
 
-shutdown:
-    globalWait.Wait()
+    return nil
 }
 
-func cmdCrawler(_ *cobra.Command, args []string) error {
-    onlineMode = false
-    readConfig()
-
-    arg := args[0]
+func cmdCrawler(clic *cli.Context) error {
+    if clic.NArg() != 1 {
+        cli.ShowCommandHelpAndExit(clic, "crawl", 1)
+    }
+
+    arg := clic.Args()[0]
     // https://github.com/golang/go/issues/19779
     if !strings.Contains(arg, "://") {
         arg = "http://" + arg
@@ -166,30 +141,3 @@ func cmdCrawler(...)
 
     return nil
 }
-
-func listenCtrlC(soft, hard context.CancelFunc) {
-    c := make(chan os.Signal)
-    signal.Notify(c, os.Interrupt)
-
-    <-c
-    logrus.Info(">>> Shutting down crawler... <<<")
-    soft()
-
-    <-c
-    logrus.Warning(">>> Force shutdown! <<<")
-    hard()
-}
-
-func hardShutdown(c context.Context) {
-    <-c.Done()
-    os.Exit(1)
-}
-
-func sleep(d time.Duration, c context.Context) bool {
-    select {
-    case <-time.After(d):
-        return true
-    case <-c.Done():
-        return false
-    }
-}
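Master's shutdown handling (absent in v1.0.1, which only carries a `// TODO Graceful shutdown`) is the common two-stage SIGINT pattern: the first Ctrl-C cancels a "soft" context so loops can drain, and a second Ctrl-C cancels a "hard" context that forces `os.Exit`. A condensed standalone sketch of that pattern, assuming nothing beyond the standard library:

```go
package main

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"time"
)

func main() {
	soft, cancelSoft := context.WithCancel(context.Background())
	hard, cancelHard := context.WithCancel(context.Background())

	// Force-exit once the hard context is cancelled.
	go func() {
		<-hard.Done()
		os.Exit(1)
	}()

	// First SIGINT requests a graceful stop; a second one forces shutdown.
	go func() {
		c := make(chan os.Signal, 1) // buffered, so no signal is dropped
		signal.Notify(c, os.Interrupt)
		<-c
		fmt.Println("shutting down gracefully...")
		cancelSoft()
		<-c
		fmt.Println("force shutdown!")
		cancelHard()
	}()

	// The main loop exits cleanly when the soft context is cancelled.
	for {
		select {
		case <-soft.Done():
			fmt.Println("drained, exiting")
			return
		case <-time.After(time.Second):
			fmt.Println("working...")
		}
	}
}
```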
File: model.go (6 changed lines)

@@ -23,6 +23,7 @@ type TaskResult struct {
 }
 
 type Job struct {
+    OD *OD
     Uri fasturl.URL
     UriStr string
     Fails int
@@ -56,8 +57,3 @@ func (o *OD) LoadOrStoreKey(k *redblackhash.Key) (exists bool) {
     o.Scanned.Put(k)
     return false
 }
-
-type errorString string
-func (e errorString) Error() string {
-    return string(e)
-}
File: queue.go (deleted, 129 lines)

@@ -1,129 +0,0 @@
-package main
-
-import (
-    "github.com/beeker1121/goque"
-    "os"
-    "sync"
-    "sync/atomic"
-)
-
-type BufferedQueue struct {
-    dataDir string
-    q *goque.Queue
-    buf []Job
-    m sync.Mutex
-}
-
-func OpenQueue(dataDir string) (bq *BufferedQueue, err error) {
-    bq = new(BufferedQueue)
-    if config.JobBufferSize < 0 {
-        return
-    }
-    bq.dataDir = dataDir
-    bq.q, err = goque.OpenQueue(dataDir)
-    if err != nil { return nil, err }
-    return
-}
-
-func (q *BufferedQueue) Enqueue(job *Job) error {
-    atomic.AddInt64(&totalQueued, 1)
-    if q.directEnqueue(job) {
-        return nil
-    }
-
-    var gob JobGob
-    gob.ToGob(job)
-    _, err := q.q.EnqueueObject(gob)
-    return err
-}
-
-func (q *BufferedQueue) Dequeue() (job Job, err error) {
-    if q.directDequeue(&job) {
-        atomic.AddInt64(&totalQueued, -1)
-        return job, nil
-    }
-
-    if config.JobBufferSize < 0 {
-        err = goque.ErrEmpty
-        return
-    }
-
-    var item *goque.Item
-    item, err = q.q.Dequeue()
-    if err != nil { return }
-
-    atomic.AddInt64(&totalQueued, -1)
-
-    var gob JobGob
-    err = item.ToObject(&gob)
-    if err != nil { return }
-    gob.FromGob(&job)
-
-    return
-}
-
-func (q *BufferedQueue) directEnqueue(job *Job) bool {
-    q.m.Lock()
-    defer q.m.Unlock()
-
-    bs := config.JobBufferSize
-    if len(q.buf) < bs || bs < 0 {
-        q.buf = append(q.buf, *job)
-        return true
-    } else {
-        return false
-    }
-}
-
-func (q *BufferedQueue) directDequeue(job *Job) bool {
-    q.m.Lock()
-    defer q.m.Unlock()
-
-    if len(q.buf) > 0 {
-        *job = q.buf[0]
-        q.buf = q.buf[1:]
-        return true
-    } else {
-        return false
-    }
-}
-
-// Always returns nil (But implements io.Closer)
-func (q *BufferedQueue) Close() error {
-    if config.JobBufferSize < 0 {
-        return nil
-    }
-
-    // Close ignoring errors
-    q.q.Close()
-
-    // Delete files
-    if err := os.RemoveAll(q.dataDir);
-        err != nil { panic(err) }
-
-    return nil
-}
-
-type JobGob struct {
-    Uri string
-    Fails int
-    LastError string
-}
-
-func (g *JobGob) ToGob(j *Job) {
-    g.Uri = j.UriStr
-    g.Fails = j.Fails
-    if j.LastError != nil {
-        g.LastError = j.LastError.Error()
-    }
-}
-
-func (g *JobGob) FromGob(j *Job) {
-    if err := j.Uri.Parse(g.Uri);
-        err != nil { panic(err) }
-    j.UriStr = g.Uri
-    j.Fails = g.Fails
-    if g.LastError != "" {
-        j.LastError = errorString(g.LastError)
-    }
-}
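queue.go is master-only: it bounds memory by keeping at most `crawl.job_buffer` jobs in a RAM slice and spilling the rest to a goque (LevelDB-backed) FIFO on disk, with a negative buffer size disabling the disk queue entirely. The disk side is beeker1121/goque; a minimal self-contained sketch of the goque calls master relies on (the queue directory here is illustrative, and as in `JobGob` above, values are gob-encoded, so only exported fields survive):

```go
package main

import (
	"fmt"

	"github.com/beeker1121/goque"
)

type jobGob struct {
	Uri   string
	Fails int
}

func main() {
	q, err := goque.OpenQueue("/tmp/demo-queue") // illustrative path
	if err != nil {
		panic(err)
	}
	defer q.Close()

	// Persist a value; goque gob-encodes it into LevelDB.
	if _, err := q.EnqueueObject(jobGob{Uri: "http://example.com/", Fails: 0}); err != nil {
		panic(err)
	}

	item, err := q.Dequeue()
	if err == goque.ErrEmpty {
		return // queue drained
	} else if err != nil {
		panic(err)
	}

	var g jobGob
	if err := item.ToObject(&g); err != nil {
		panic(err)
	}
	fmt.Println(g.Uri)
}
```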
File: scheduler.go (72 changed lines)

@@ -16,7 +16,7 @@ import (
 var activeTasksLock sync.Mutex
 var activeTasks = make(map[uint64]bool)
 var numActiveTasks int32
-var totalQueued int64
+var totalBuffered int64
 
 func Schedule(c context.Context, remotes <-chan *OD) {
     go Stats(c)
@@ -28,21 +28,8 @@ func Schedule(c context.Context, remotes <-chan *OD) {
         // Collect results
         results := make(chan File)
 
-        remote.WCtx.OD = remote
-
-        // Get queue path
-        queuePath := path.Join("queue", fmt.Sprintf("%d", remote.Task.WebsiteId))
-
-        // Delete existing queue
-        if err := os.RemoveAll(queuePath);
-            err != nil { panic(err) }
-
-        // Start new queue
-        var err error
-        remote.WCtx.Queue, err = OpenQueue(queuePath)
-        if err != nil { panic(err) }
-
         // Spawn workers
+        remote.WCtx.in, remote.WCtx.out = makeJobBuffer(c)
         for i := 0; i < config.Workers; i++ {
             go remote.WCtx.Worker(results)
         }
@@ -50,6 +37,7 @@ func Schedule(c context.Context, remotes <-chan *OD) {
         // Enqueue initial job
         atomic.AddInt32(&numActiveTasks, 1)
         remote.WCtx.queueJob(Job{
+            OD: remote,
             Uri: remote.BaseUri,
             UriStr: remote.BaseUri.String(),
             Fails: 0,
@@ -160,11 +148,7 @@ func (o *OD) handleCollect(results chan File, f *os.File, collectErrC chan error
 
     // Wait for all jobs on remote to finish
     o.Wait.Wait()
-    // Close queue
-    if err := o.WCtx.Queue.Close(); err != nil {
-        panic(err)
-    }
+    close(o.WCtx.in)
     atomic.AddInt32(&numActiveTasks, -1)
 
     // Log finish
@@ -214,3 +198,51 @@ func (t *Task) collect(results chan File, f *os.File) error {
 
     return nil
 }
+
+func makeJobBuffer(c context.Context) (chan<- Job, <-chan Job) {
+    in := make(chan Job)
+    out := make(chan Job)
+    go bufferJobs(c, in, out)
+    return in, out
+}
+
+func bufferJobs(c context.Context, in chan Job, out chan Job) {
+    defer close(out)
+    var inQueue []Job
+    outCh := func() chan Job {
+        if len(inQueue) == 0 {
+            return nil
+        }
+        return out
+    }
+    for len(inQueue) > 0 || in != nil {
+        if len(inQueue) == 0 {
+            select {
+            case v, ok := <-in:
+                if !ok {
+                    in = nil
+                } else {
+                    atomic.AddInt64(&totalBuffered, 1)
+                    inQueue = append(inQueue, v)
+                }
+            case <-c.Done():
+                return
+            }
+        } else {
+            select {
+            case v, ok := <-in:
+                if !ok {
+                    in = nil
+                } else {
+                    atomic.AddInt64(&totalBuffered, 1)
+                    inQueue = append(inQueue, v)
+                }
+            case outCh() <- inQueue[0]:
+                atomic.AddInt64(&totalBuffered, -1)
+                inQueue = inQueue[1:]
+            case <-c.Done():
+                return
+            }
+        }
+    }
+}
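The `bufferJobs` goroutine added on the v1.0.1 side is the classic unbounded elastic buffer between two channels, and it leans on one Go subtlety: sending on or receiving from a nil channel blocks forever, so a nil channel in a `select` simply disables that case. `outCh()` returns nil while the slice is empty, switching the send case off; setting `in = nil` after the input closes switches the receive case off. A stripped-down runnable sketch of just that trick:

```go
package main

import "fmt"

// elastic pipes everything from in to out, buffering without bound in a slice.
func elastic(in <-chan int, out chan<- int) {
	defer close(out)
	var queue []int
	for len(queue) > 0 || in != nil {
		// A nil channel disables its select case: no send while the
		// buffer is empty, no receive after in has been closed.
		var sendCh chan<- int
		next := 0
		if len(queue) > 0 {
			sendCh = out
			next = queue[0]
		}
		select {
		case v, ok := <-in:
			if !ok {
				in = nil // input closed; keep draining the buffer
			} else {
				queue = append(queue, v)
			}
		case sendCh <- next:
			queue = queue[1:]
		}
	}
}

func main() {
	in := make(chan int)
	out := make(chan int)
	go elastic(in, out)
	go func() {
		for i := 0; i < 5; i++ {
			in <- i
		}
		close(in)
	}()
	for v := range out {
		fmt.Println(v)
	}
}
```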
File: server.go (38 changed lines)

@@ -5,23 +5,18 @@ import (
     "encoding/json"
     "fmt"
     "github.com/sirupsen/logrus"
-    "github.com/spf13/viper"
     "io"
     "mime/multipart"
     "net/http"
     "net/url"
     "os"
     "strconv"
-    "time"
 )
 
 var serverClient = http.Client {
     Timeout: config.ServerTimeout,
-    Transport: new(ServerTripper),
 }
 
-var serverUserAgent = "od-database-crawler/" + rootCmd.Version
-
 func FetchTask() (t *Task, err error) {
     res, err := serverClient.PostForm(
         config.ServerUrl + "/task/get",
@@ -40,9 +35,7 @@ func FetchTask() (t *Task, err error) {
 
     t = new(Task)
     err = json.NewDecoder(res.Body).Decode(t)
-    if _, ok := err.(*json.SyntaxError); ok {
-        return nil, fmt.Errorf("/task/get returned invalid JSON")
-    } else if err != nil { return }
+    if err != nil { return }
 
     return
 }
@@ -108,37 +101,25 @@ func uploadChunks(websiteId uint64, f *os.File) error {
 
         multi.Close()
 
-        for retries := 0; retries < viper.GetInt(ConfUploadRetries); retries++ {
-            if retries > 0 {
-                // Error occurred, retry upload
-                time.Sleep(viper.GetDuration(ConfUploadRetryInterval))
-            }
-
         req, err := http.NewRequest(
             http.MethodPost,
             config.ServerUrl + "/task/upload",
             &b)
         req.Header.Set("content-type", multi.FormDataContentType())
-        if err != nil { continue }
+        if err != nil { return err }
 
         res, err := serverClient.Do(req)
-        if err != nil { continue }
+        if err != nil { return err }
         res.Body.Close()
 
         if res.StatusCode != http.StatusOK {
-            logrus.WithField("status", res.Status).
-                WithField("part", iter).
-                Errorf("Upload failed")
-            continue
-        }
-
-            // Upload successful
-            break
-        }
+            return fmt.Errorf("failed to upload list part %d: %s",
+                iter, res.Status)
+        }
 
         logrus.WithField("id", websiteId).
             WithField("part", iter).
-            Infof("Uploaded files chunk")
+            Infof("Uploading files chunk")
     }
     return nil
 }
@@ -181,10 +162,3 @@ func CancelTask(websiteId uint64) (err error) {
 
     return
 }
-
-type ServerTripper struct{}
-
-func (t *ServerTripper) RoundTrip(req *http.Request) (res *http.Response, err error) {
-    req.Header.Set("User-Agent", serverUserAgent)
-    return http.DefaultTransport.RoundTrip(req)
-}
File: stats.go (16 changed lines)

@@ -3,7 +3,6 @@ package main
 import (
     "context"
     "github.com/sirupsen/logrus"
-    "github.com/spf13/viper"
     "math"
     "runtime"
     "sync/atomic"
@@ -20,14 +19,11 @@ func Stats(c context.Context) {
     var crawlTicker <-chan time.Time
     var allocTicker <-chan time.Time
 
-    crawlInterval := viper.GetDuration(ConfCrawlStats)
-    allocInterval := viper.GetDuration(ConfAllocStats)
-
-    if crawlInterval != 0 {
-        crawlTicker = time.Tick(crawlInterval)
+    if config.CrawlStats != 0 {
+        crawlTicker = time.NewTicker(config.CrawlStats).C
     }
-    if allocInterval != 0 {
-        allocTicker = time.Tick(allocInterval)
+    if config.AllocStats != 0 {
+        allocTicker = time.NewTicker(config.AllocStats).C
     }
 
     for {
@@ -36,7 +32,7 @@ func Stats(c context.Context) {
         startedNow := atomic.LoadUint64(&totalStarted)
 
         perSecond := float64(startedNow - startedLast) /
-            crawlInterval.Seconds()
+            config.CrawlStats.Seconds()
 
         // Round to .5
         perSecond *= 2
@@ -61,7 +57,7 @@ func Stats(c context.Context) {
         runtime.ReadMemStats(&mem)
 
         logrus.WithFields(logrus.Fields{
-            "queue_count": atomic.LoadInt64(&totalQueued),
+            "queue_count": atomic.LoadInt64(&totalBuffered),
             "heap": FormatByteCount(mem.Alloc),
             "objects": mem.HeapObjects,
             "num_gc": mem.NumGC,
File: worker.go (61 changed lines)

@@ -1,8 +1,8 @@
 package main
 
 import (
-    "github.com/beeker1121/goque"
     "github.com/sirupsen/logrus"
+    "github.com/valyala/fasthttp"
     "math"
     "sort"
     "strings"
@@ -14,38 +14,24 @@ import (
 var globalWait sync.WaitGroup
 
 type WorkerContext struct {
-    OD *OD
-    Queue *BufferedQueue
+    in chan<- Job
+    out <-chan Job
     lastRateLimit time.Time
     numRateLimits int
 }
 
-func (w *WorkerContext) Worker(results chan<- File) {
-    for {
-        job, err := w.Queue.Dequeue()
-        switch err {
-        case goque.ErrEmpty:
-            time.Sleep(500 * time.Millisecond)
-            continue
-
-        case goque.ErrDBClosed:
-            return
-
-        case nil:
-            w.step(results, job)
-
-        default:
-            panic(err)
-        }
+func (w WorkerContext) Worker(results chan<- File) {
+    for job := range w.out {
+        w.step(results, job)
     }
 }
 
-func (w *WorkerContext) step(results chan<- File, job Job) {
-    defer w.finishJob()
+func (w WorkerContext) step(results chan<- File, job Job) {
+    defer w.finishJob(&job)
 
     var f File
 
-    newJobs, err := w.DoJob(&job, &f)
+    newJobs, err := DoJob(&job, &f)
     atomic.AddUint64(&totalStarted, 1)
     if err == ErrKnown {
         return
@@ -54,13 +40,15 @@ func (w *WorkerContext) step(results chan<- File, job Job) {
     if err != nil {
         job.Fails++
 
-        if !shouldRetry(err) {
-            atomic.AddUint64(&totalAborted, 1)
-            logrus.WithField("url", job.UriStr).
-                WithError(err).
-                Error("Giving up after failure")
+        if httpErr, ok := err.(*HttpError); ok {
+            switch httpErr.code {
+            case fasthttp.StatusTooManyRequests:
+                err = ErrRateLimit
+            default:
+                // Don't retry HTTP error codes
                 return
             }
+        }
 
         if job.Fails > config.Retries {
             atomic.AddUint64(&totalAborted, 1)
@@ -87,7 +75,7 @@ func (w *WorkerContext) step(results chan<- File, job Job) {
     }
 }
 
-func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) {
+func DoJob(job *Job, f *File) (newJobs []Job, err error) {
     if len(job.Uri.Path) == 0 { return }
     if job.Uri.Path[len(job.Uri.Path)-1] == '/' {
         // Load directory
@@ -105,7 +93,7 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
     hash := f.HashDir(links)
 
     // Skip symlinked dirs
-    if w.OD.LoadOrStoreKey(&hash) {
+    if job.OD.LoadOrStoreKey(&hash) {
         return nil, ErrKnown
     }
 
@@ -126,6 +114,7 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
     lastLink = uriStr
 
     newJobs = append(newJobs, Job{
+        OD: job.OD,
         Uri: link,
         UriStr: uriStr,
        Fails: 0,
@@ -150,13 +139,13 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
         }
         return nil, err
     }
-    atomic.AddUint64(&w.OD.Result.FileCount, 1)
+    atomic.AddUint64(&job.OD.Result.FileCount, 1)
     }
     return
 }
 
-func (w *WorkerContext) queueJob(job Job) {
-    w.OD.Wait.Add(1)
+func (w WorkerContext) queueJob(job Job) {
+    job.OD.Wait.Add(1)
 
     if w.numRateLimits > 0 {
         if time.Since(w.lastRateLimit) > 5 * time.Second {
@@ -167,13 +156,11 @@ func queueJob(job Job) {
         }
     }
 
-    if err := w.Queue.Enqueue(&job); err != nil {
-        panic(err)
-    }
+    w.in <- job
 }
 
-func (w *WorkerContext) finishJob() {
-    w.OD.Wait.Done()
+func (w WorkerContext) finishJob(job *Job) {
+    job.OD.Wait.Done()
 }
 
 func isErrSilent(err error) bool {