mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-20 19:06:46 +00:00
Compare commits
27 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
88bf634cb6 | ||
|
796cf6ac23 | ||
|
defaf54e66 | ||
|
230824c58f | ||
|
d3c199b738 | ||
|
0b3f0d87fe | ||
|
da9c75e392 | ||
|
8947e05d0c | ||
|
8c5f99d616 | ||
|
206ea0e91d | ||
|
8b9d8bfd17 | ||
|
c9ff102d80 | ||
|
88856c1c19 | ||
|
9e9b606250 | ||
|
326e29e5e4 | ||
|
c2acd5463f | ||
|
e4d04e6a5f | ||
|
9f1402e841 | ||
|
7c8ab50ee4 | ||
|
281d2d17d6 | ||
|
45cbd4d535 | ||
|
771d49f2dd | ||
|
dbd787aa81 | ||
|
cea6c1658b | ||
|
885af5bb3b | ||
|
b18b70f798 | ||
|
9d5f549774 |
15
Dockerfile
Normal file
15
Dockerfile
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
# Build stage: fetch dependencies and compile the crawler with cgo disabled
# and debug symbols stripped (-s -w).
FROM golang:alpine AS builder

# COPY is sufficient for a plain local copy; ADD's archive/URL handling is not needed.
COPY . /go/src/github.com/terorie/od-database-crawler

# --no-cache keeps the apk package index out of the layer.
RUN apk add --no-cache git \
 && go get -d -v github.com/terorie/od-database-crawler \
 && CGO_ENABLED=0 go install -a \
    -installsuffix cgo \
    -ldflags="-s -w" \
    github.com/terorie/od-database-crawler

# Runtime stage: an empty image holding only the CA bundle and the binary.
FROM scratch
COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
COPY --from=builder /go/bin/od-database-crawler /bin/

# /oddb is both the working directory and a volume.
WORKDIR /oddb
VOLUME [ "/oddb" ]
CMD ["/bin/od-database-crawler", "server"]
|
41
README.md
41
README.md
@ -1,15 +1,20 @@
|
|||||||
# od-database Go crawler 🚀
|
# OD-Database Crawler 🕷
|
||||||
[Build Status](https://travis-ci.org/terorie/od-database-crawler)
|
[Build Status](https://travis-ci.org/terorie/od-database-crawler)
|
||||||
> by terorie 2018 :P
|
[](https://github.com/terorie/od-database-crawler)
|
||||||
|
[CodeFactor](https://www.codefactor.io/repository/github/terorie/od-database-crawler/overview/master)
|
||||||
|
|
||||||
* Crawler for [__OD-Database__](https://github.com/simon987/od-database)
|
* Crawler for [__OD-Database__](https://github.com/simon987/od-database)
|
||||||
|
* In production at https://od-db.the-eye.eu/
|
||||||
|
* Over 880 TB actively crawled
|
||||||
* Crawls HTTP open directories (standard Web Server Listings)
|
* Crawls HTTP open directories (standard Web Server Listings)
|
||||||
* Gets name, path, size and modification time of all files
|
* Gets name, path, size and modification time of all files
|
||||||
* Lightweight and fast: __over 9000 requests per second__ on a standard laptop
|
* Lightweight and fast
|
||||||
|
|
||||||
https://od-db.the-eye.eu/
|
https://od-db.the-eye.eu/
|
||||||
|
|
||||||
#### Usage
|
## Usage
|
||||||
|
|
||||||
|
### Deploys
|
||||||
|
|
||||||
1. With Config File (if `config.yml` found in working dir)
|
1. With Config File (if `config.yml` found in working dir)
|
||||||
- Download [default config](https://github.com/terorie/od-database-crawler/blob/master/config.yml)
|
- Download [default config](https://github.com/terorie/od-database-crawler/blob/master/config.yml)
|
||||||
@ -22,3 +27,31 @@ https://od-db.the-eye.eu/
|
|||||||
- Every flag is available as an environment variable:
|
- Every flag is available as an environment variable:
|
||||||
`--output.crawl_stats` ➡️ `OD_OUTPUT_CRAWL_STATS`
|
`--output.crawl_stats` ➡️ `OD_OUTPUT_CRAWL_STATS`
|
||||||
- Start with `./od-database-crawler server <flags>`
|
- Start with `./od-database-crawler server <flags>`
|
||||||
|
|
||||||
|
3. With Docker
|
||||||
|
```bash
|
||||||
|
docker run \
|
||||||
|
-e OD_SERVER_URL=xxx \
|
||||||
|
-e OD_SERVER_TOKEN=xxx \
|
||||||
|
terorie/od-database-crawler
|
||||||
|
```
|
||||||
|
|
||||||
|
### Flag reference
|
||||||
|
|
||||||
|
Here are the most important config flags. For finer control, take a look at `/config.yml`.
|
||||||
|
|
||||||
|
| Flag/Environment | Description | Example |
|
||||||
|
| ------------------------------------------------------- | ------------------------------------------------------------ | ----------------------------------- |
|
||||||
|
| `server.url`<br />`OD_SERVER_URL` | OD-DB Server URL | `https://od-db.mine.the-eye.eu/api` |
|
||||||
|
| `server.token`<br />`OD_SERVER_TOKEN` | OD-DB Server Access Token | _Ask Hexa **TM**_ |
|
||||||
|
| `server.recheck`<br />`OD_SERVER_RECHECK` | Job Fetching Interval | `3s` |
|
||||||
|
| `output.crawl_stats`<br />`OD_OUTPUT_CRAWL_STATS` | Crawl Stats Logging Interval (0 = disabled) | `500ms` |
|
||||||
|
| `output.resource_stats`<br />`OD_OUTPUT_RESOURCE_STATS` | Resource Stats Logging Interval (0 = disabled) | `8s` |
|
||||||
|
| `output.log`<br />`OD_OUTPUT_LOG` | Log File (none = disabled) | `crawler.log` |
|
||||||
|
| `crawl.tasks`<br />`OD_CRAWL_TASKS` | Max number of sites to crawl concurrently | `500` |
|
||||||
|
| `crawl.connections`<br />`OD_CRAWL_CONNECTIONS` | HTTP connections per site | `1` |
|
||||||
|
| `crawl.retries`<br />`OD_CRAWL_RETRIES` | How many times to retry after a temporary failure (e.g. `HTTP 429` or timeouts) | `5` |
|
||||||
|
| `crawl.dial_timeout`<br />`OD_CRAWL_DIAL_TIMEOUT` | TCP Connect timeout | `5s` |
|
||||||
|
| `crawl.timeout`<br />`OD_CRAWL_TIMEOUT` | HTTP request timeout | `20s` |
|
||||||
|
| `crawl.user-agent`<br />`OD_CRAWL_USER_AGENT` | HTTP Crawler User-Agent | `googlebot/1.2.3` |
|
||||||
|
| `crawl.job_buffer`<br />`OD_CRAWL_JOB_BUFFER` | Number of URLs to keep in memory/cache, per job. The rest is offloaded to disk. Decrease this value if the crawler uses too much RAM. (0 = Disable Cache, -1 = Only use Cache) | `5000` |
|
||||||
|
123
config.go
123
config.go
@ -4,6 +4,7 @@ import (
|
|||||||
"bufio"
|
"bufio"
|
||||||
"fmt"
|
"fmt"
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
|
"github.com/spf13/pflag"
|
||||||
"github.com/spf13/viper"
|
"github.com/spf13/viper"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
@ -26,6 +27,8 @@ var config struct {
|
|||||||
JobBufferSize int
|
JobBufferSize int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var onlineMode bool
|
||||||
|
|
||||||
const (
|
const (
|
||||||
ConfServerUrl = "server.url"
|
ConfServerUrl = "server.url"
|
||||||
ConfToken = "server.token"
|
ConfToken = "server.token"
|
||||||
@ -54,8 +57,56 @@ const (
|
|||||||
func prepareConfig() {
|
func prepareConfig() {
|
||||||
pf := rootCmd.PersistentFlags()
|
pf := rootCmd.PersistentFlags()
|
||||||
|
|
||||||
bind := func(s string) {
|
pf.SortFlags = false
|
||||||
if err := viper.BindPFlag(s, pf.Lookup(s)); err != nil {
|
pf.StringVar(&configFile, "config", "", "Config file")
|
||||||
|
configFile = os.Getenv("OD_CONFIG")
|
||||||
|
|
||||||
|
pf.String(ConfServerUrl, "http://od-db.the-eye.eu/api", "OD-DB server URL")
|
||||||
|
|
||||||
|
pf.String(ConfToken, "", "OD-DB access token (env OD_SERVER_TOKEN)")
|
||||||
|
|
||||||
|
pf.Duration(ConfServerTimeout, 60 * time.Second, "OD-DB request timeout")
|
||||||
|
|
||||||
|
pf.Duration(ConfRecheck, 1 * time.Second, "OD-DB: Poll interval for new jobs")
|
||||||
|
|
||||||
|
pf.Duration(ConfCooldown, 30 * time.Second, "OD-DB: Time to wait after a server-side error")
|
||||||
|
|
||||||
|
pf.String(ConfChunkSize, "1 MB", "OD-DB: Result upload chunk size")
|
||||||
|
|
||||||
|
pf.Uint(ConfUploadRetries, 10, "OD-DB: Max upload retries")
|
||||||
|
|
||||||
|
pf.Duration(ConfUploadRetryInterval, 30 * time.Second, "OD-DB: Time to wait between upload retries")
|
||||||
|
|
||||||
|
pf.Uint(ConfTasks, 100, "Crawler: Max concurrent tasks")
|
||||||
|
|
||||||
|
pf.Uint(ConfWorkers, 4, "Crawler: Connections per server")
|
||||||
|
|
||||||
|
pf.Uint(ConfRetries, 5, "Crawler: Request retries")
|
||||||
|
|
||||||
|
pf.Duration(ConfDialTimeout, 10 * time.Second, "Crawler: Handshake timeout")
|
||||||
|
|
||||||
|
pf.Duration(ConfTimeout, 30 * time.Second, "Crawler: Request timeout")
|
||||||
|
|
||||||
|
pf.String(ConfUserAgent, "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0", "Crawler: User-Agent")
|
||||||
|
|
||||||
|
pf.Uint(ConfJobBufferSize, 5000, "Crawler: Task queue cache size")
|
||||||
|
|
||||||
|
pf.Duration(ConfCrawlStats, time.Second, "Log: Crawl stats interval")
|
||||||
|
|
||||||
|
pf.Duration(ConfAllocStats, 10 * time.Second, "Log: Resource stats interval")
|
||||||
|
|
||||||
|
pf.Bool(ConfVerbose, false, "Log: Print every listed dir")
|
||||||
|
|
||||||
|
pf.Bool(ConfPrintHTTP, false, "Log: Print HTTP client errors")
|
||||||
|
|
||||||
|
pf.String(ConfLogFile, "crawler.log", "Log file")
|
||||||
|
|
||||||
|
// Bind all flags to Viper
|
||||||
|
pf.VisitAll(func(flag *pflag.Flag) {
|
||||||
|
s := flag.Name
|
||||||
|
s = strings.TrimLeft(s, "-")
|
||||||
|
|
||||||
|
if err := viper.BindPFlag(s, flag); err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
var envKey string
|
var envKey string
|
||||||
@ -65,71 +116,7 @@ func prepareConfig() {
|
|||||||
if err := viper.BindEnv(s, envKey); err != nil {
|
if err := viper.BindEnv(s, envKey); err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
}
|
})
|
||||||
|
|
||||||
pf.SortFlags = false
|
|
||||||
pf.StringVar(&configFile, "config", "", "Config file")
|
|
||||||
configFile = os.Getenv("OD_CONFIG")
|
|
||||||
|
|
||||||
pf.String(ConfServerUrl, "http://od-db.the-eye.eu/api", "OD-DB server URL")
|
|
||||||
bind(ConfServerUrl)
|
|
||||||
|
|
||||||
pf.String(ConfToken, "", "OD-DB access token (env OD_SERVER_TOKEN)")
|
|
||||||
bind(ConfToken)
|
|
||||||
|
|
||||||
pf.Duration(ConfServerTimeout, 60 * time.Second, "OD-DB request timeout")
|
|
||||||
bind(ConfServerTimeout)
|
|
||||||
|
|
||||||
pf.Duration(ConfRecheck, 1 * time.Second, "OD-DB: Poll interval for new jobs")
|
|
||||||
bind(ConfRecheck)
|
|
||||||
|
|
||||||
pf.Duration(ConfCooldown, 30 * time.Second, "OD-DB: Time to wait after a server-side error")
|
|
||||||
bind(ConfCooldown)
|
|
||||||
|
|
||||||
pf.String(ConfChunkSize, "1 MB", "OD-DB: Result upload chunk size")
|
|
||||||
bind(ConfChunkSize)
|
|
||||||
|
|
||||||
pf.Uint(ConfUploadRetries, 10, "OD-DB: Max upload retries")
|
|
||||||
bind(ConfUploadRetries)
|
|
||||||
|
|
||||||
pf.Duration(ConfUploadRetryInterval, 30 * time.Second, "OD-DB: Time to wait between upload retries")
|
|
||||||
bind(ConfUploadRetryInterval)
|
|
||||||
|
|
||||||
pf.Uint(ConfTasks, 100, "Crawler: Max concurrent tasks")
|
|
||||||
bind(ConfTasks)
|
|
||||||
|
|
||||||
pf.Uint(ConfWorkers, 4, "Crawler: Connections per server")
|
|
||||||
bind(ConfWorkers)
|
|
||||||
|
|
||||||
pf.Uint(ConfRetries, 5, "Crawler: Request retries")
|
|
||||||
bind(ConfRetries)
|
|
||||||
|
|
||||||
pf.Duration(ConfDialTimeout, 10 * time.Second, "Crawler: Handshake timeout")
|
|
||||||
bind(ConfDialTimeout)
|
|
||||||
|
|
||||||
pf.Duration(ConfTimeout, 30 * time.Second, "Crawler: Request timeout")
|
|
||||||
bind(ConfTimeout)
|
|
||||||
|
|
||||||
pf.String(ConfUserAgent, "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0", "Crawler: User-Agent")
|
|
||||||
bind(ConfUserAgent)
|
|
||||||
|
|
||||||
pf.Uint(ConfJobBufferSize, 5000, "Crawler: Task queue cache size")
|
|
||||||
bind(ConfJobBufferSize)
|
|
||||||
|
|
||||||
pf.Duration(ConfCrawlStats, time.Second, "Log: Crawl stats interval")
|
|
||||||
bind(ConfCrawlStats)
|
|
||||||
|
|
||||||
pf.Duration(ConfAllocStats, 10 * time.Second, "Log: Resource stats interval")
|
|
||||||
bind(ConfAllocStats)
|
|
||||||
|
|
||||||
pf.Bool(ConfVerbose, false, "Log: Print every listed dir")
|
|
||||||
bind(ConfVerbose)
|
|
||||||
|
|
||||||
pf.Bool(ConfPrintHTTP, false, "Log: Print HTTP client errors")
|
|
||||||
bind(ConfPrintHTTP)
|
|
||||||
|
|
||||||
pf.String(ConfLogFile, "crawler.log", "Log file")
|
|
||||||
bind(ConfLogFile)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func readConfig() {
|
func readConfig() {
|
||||||
@ -157,6 +144,7 @@ func readConfig() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if onlineMode {
|
||||||
config.ServerUrl = viper.GetString(ConfServerUrl)
|
config.ServerUrl = viper.GetString(ConfServerUrl)
|
||||||
if config.ServerUrl == "" {
|
if config.ServerUrl == "" {
|
||||||
configMissing(ConfServerUrl)
|
configMissing(ConfServerUrl)
|
||||||
@ -167,6 +155,7 @@ func readConfig() {
|
|||||||
if config.Token == "" {
|
if config.Token == "" {
|
||||||
configMissing(ConfToken)
|
configMissing(ConfToken)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
config.ServerTimeout = viper.GetDuration(ConfServerTimeout)
|
config.ServerTimeout = viper.GetDuration(ConfServerTimeout)
|
||||||
|
|
||||||
|
@ -47,13 +47,13 @@ output:
|
|||||||
# Crawler settings
|
# Crawler settings
|
||||||
crawl:
|
crawl:
|
||||||
# Number of sites that can be processed at once
|
# Number of sites that can be processed at once
|
||||||
tasks: 100
|
tasks: 25
|
||||||
|
|
||||||
# Number of connections per site
|
# Number of connections per site
|
||||||
# Please be careful with this setting!
|
# Please be careful with this setting!
|
||||||
# The crawler fires fast and more than
|
# The crawler fires fast and more than
|
||||||
# ten connections can overwhelm a server.
|
# ten connections can overwhelm a server.
|
||||||
connections: 4
|
connections: 1
|
||||||
|
|
||||||
# How often to retry getting data
|
# How often to retry getting data
|
||||||
# from the site before giving up
|
# from the site before giving up
|
||||||
@ -81,4 +81,4 @@ crawl:
|
|||||||
# in memory.
|
# in memory.
|
||||||
# A negative value will cause all jobs
|
# A negative value will cause all jobs
|
||||||
# to be stored in memory. (Don't do this)
|
# to be stored in memory. (Don't do this)
|
||||||
job_buffer: 5000
|
job_buffer: -1
|
||||||
|
9
go.mod
9
go.mod
@ -1,14 +1,13 @@
|
|||||||
module github.com/syndtr/od-database-crawler
|
module github.com/terorie/od-database-crawler
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/beeker1121/goque v2.0.1+incompatible
|
github.com/beeker1121/goque v2.0.1+incompatible
|
||||||
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db // indirect
|
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db // indirect
|
||||||
github.com/sirupsen/logrus v1.3.0
|
github.com/sirupsen/logrus v1.4.0
|
||||||
github.com/spf13/cobra v0.0.3
|
github.com/spf13/cobra v0.0.3
|
||||||
github.com/spf13/viper v1.3.1
|
github.com/spf13/viper v1.3.2
|
||||||
github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2 // indirect
|
github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2 // indirect
|
||||||
github.com/terorie/od-database-crawler v1.1.1
|
github.com/valyala/fasthttp v1.2.0
|
||||||
github.com/valyala/fasthttp v1.1.0
|
|
||||||
golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613
|
golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613
|
||||||
golang.org/x/net v0.0.0-20180911220305-26e67e76b6c3
|
golang.org/x/net v0.0.0-20180911220305-26e67e76b6c3
|
||||||
)
|
)
|
||||||
|
8
go.sum
8
go.sum
@ -25,6 +25,8 @@ github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/9
|
|||||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
github.com/sirupsen/logrus v1.3.0 h1:hI/7Q+DtNZ2kINb6qt/lS+IyXnHQe9e90POfeewL/ME=
|
github.com/sirupsen/logrus v1.3.0 h1:hI/7Q+DtNZ2kINb6qt/lS+IyXnHQe9e90POfeewL/ME=
|
||||||
github.com/sirupsen/logrus v1.3.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
|
github.com/sirupsen/logrus v1.3.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
|
||||||
|
github.com/sirupsen/logrus v1.4.0 h1:yKenngtzGh+cUSSh6GWbxW2abRqhYUSR/t/6+2QqNvE=
|
||||||
|
github.com/sirupsen/logrus v1.4.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
|
||||||
github.com/spf13/afero v1.1.2 h1:m8/z1t7/fwjysjQRYbP0RD+bUIF/8tJwPdEZsI83ACI=
|
github.com/spf13/afero v1.1.2 h1:m8/z1t7/fwjysjQRYbP0RD+bUIF/8tJwPdEZsI83ACI=
|
||||||
github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ=
|
github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ=
|
||||||
github.com/spf13/cast v1.3.0 h1:oget//CVOEoFewqQxwr0Ej5yjygnqGkvggSE/gB35Q8=
|
github.com/spf13/cast v1.3.0 h1:oget//CVOEoFewqQxwr0Ej5yjygnqGkvggSE/gB35Q8=
|
||||||
@ -37,17 +39,19 @@ github.com/spf13/pflag v1.0.3 h1:zPAT6CGy6wXeQ7NtTnaTerfKOsV6V6F8agHXFiazDkg=
|
|||||||
github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
|
github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
|
||||||
github.com/spf13/viper v1.3.1 h1:5+8j8FTpnFV4nEImW/ofkzEt8VoOiLXxdYIDsB73T38=
|
github.com/spf13/viper v1.3.1 h1:5+8j8FTpnFV4nEImW/ofkzEt8VoOiLXxdYIDsB73T38=
|
||||||
github.com/spf13/viper v1.3.1/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s=
|
github.com/spf13/viper v1.3.1/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s=
|
||||||
|
github.com/spf13/viper v1.3.2 h1:VUFqw5KcqRf7i70GOzW7N+Q7+gxVBkSSqiXB12+JQ4M=
|
||||||
|
github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s=
|
||||||
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||||
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
|
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
|
||||||
github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2 h1:GnOzE5fEFN3b2zDhJJABEofdb51uMRNb8eqIVtdducs=
|
github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2 h1:GnOzE5fEFN3b2zDhJJABEofdb51uMRNb8eqIVtdducs=
|
||||||
github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2/go.mod h1:Z4AUp2Km+PwemOoO/VB5AOx9XSsIItzFjoJlOSiYmn0=
|
github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2/go.mod h1:Z4AUp2Km+PwemOoO/VB5AOx9XSsIItzFjoJlOSiYmn0=
|
||||||
github.com/terorie/od-database-crawler v1.1.1 h1:Ca+ZqbZX3rVWBR8SDRzvroyxjBtUs75MQXZ9YG0gqGo=
|
|
||||||
github.com/terorie/od-database-crawler v1.1.1/go.mod h1:vVJ7pLkudrlUNp9qu24JCzQ8N6mFsrOmX1tPXr155DQ=
|
|
||||||
github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
|
github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
|
||||||
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
|
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
|
||||||
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
|
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
|
||||||
github.com/valyala/fasthttp v1.1.0 h1:3BohG7mqwj4lq7PTX//7gLbUlzNvZSPmuHFnloXT0lw=
|
github.com/valyala/fasthttp v1.1.0 h1:3BohG7mqwj4lq7PTX//7gLbUlzNvZSPmuHFnloXT0lw=
|
||||||
github.com/valyala/fasthttp v1.1.0/go.mod h1:4vX61m6KN+xDduDNwXrhIAVZaZaZiQ1luJk8LWSxF3s=
|
github.com/valyala/fasthttp v1.1.0/go.mod h1:4vX61m6KN+xDduDNwXrhIAVZaZaZiQ1luJk8LWSxF3s=
|
||||||
|
github.com/valyala/fasthttp v1.2.0 h1:dzZJf2IuMiclVjdw0kkT+f9u4YdrapbNyGAN47E/qnk=
|
||||||
|
github.com/valyala/fasthttp v1.2.0/go.mod h1:4vX61m6KN+xDduDNwXrhIAVZaZaZiQ1luJk8LWSxF3s=
|
||||||
github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio=
|
github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio=
|
||||||
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
|
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
|
||||||
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
|
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
|
||||||
|
64
main.go
64
main.go
@ -8,6 +8,7 @@ import (
|
|||||||
"github.com/spf13/viper"
|
"github.com/spf13/viper"
|
||||||
"github.com/terorie/od-database-crawler/fasturl"
|
"github.com/terorie/od-database-crawler/fasturl"
|
||||||
"os"
|
"os"
|
||||||
|
"os/signal"
|
||||||
"strings"
|
"strings"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
@ -17,7 +18,7 @@ var configFile string
|
|||||||
|
|
||||||
var rootCmd = cobra.Command {
|
var rootCmd = cobra.Command {
|
||||||
Use: "od-database-crawler",
|
Use: "od-database-crawler",
|
||||||
Version: "1.2.1",
|
Version: "1.2.2",
|
||||||
Short: "OD-Database Go crawler",
|
Short: "OD-Database Go crawler",
|
||||||
Long: helpText,
|
Long: helpText,
|
||||||
PersistentPreRunE: preRun,
|
PersistentPreRunE: preRun,
|
||||||
@ -61,8 +62,6 @@ func preRun(cmd *cobra.Command, args []string) error {
|
|||||||
if err := os.MkdirAll("queue", 0755);
|
if err := os.MkdirAll("queue", 0755);
|
||||||
err != nil { panic(err) }
|
err != nil { panic(err) }
|
||||||
|
|
||||||
readConfig()
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -75,25 +74,31 @@ func main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func cmdBase(_ *cobra.Command, _ []string) {
|
func cmdBase(_ *cobra.Command, _ []string) {
|
||||||
// TODO Graceful shutdown
|
onlineMode = true
|
||||||
appCtx := context.Background()
|
readConfig()
|
||||||
forceCtx := context.Background()
|
|
||||||
|
appCtx, soft := context.WithCancel(context.Background())
|
||||||
|
forceCtx, hard := context.WithCancel(context.Background())
|
||||||
|
go hardShutdown(forceCtx)
|
||||||
|
go listenCtrlC(soft, hard)
|
||||||
|
|
||||||
inRemotes := make(chan *OD)
|
inRemotes := make(chan *OD)
|
||||||
go Schedule(forceCtx, inRemotes)
|
go Schedule(appCtx, inRemotes)
|
||||||
|
|
||||||
ticker := time.NewTicker(config.Recheck)
|
ticker := time.NewTicker(config.Recheck)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-appCtx.Done():
|
case <-appCtx.Done():
|
||||||
return
|
goto shutdown
|
||||||
case <-ticker.C:
|
case <-ticker.C:
|
||||||
t, err := FetchTask()
|
t, err := FetchTask()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logrus.WithError(err).
|
logrus.WithError(err).
|
||||||
Error("Failed to get new task")
|
Error("Failed to get new task")
|
||||||
time.Sleep(viper.GetDuration(ConfCooldown))
|
if !sleep(viper.GetDuration(ConfCooldown), appCtx) {
|
||||||
|
goto shutdown
|
||||||
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if t == nil {
|
if t == nil {
|
||||||
@ -109,13 +114,7 @@ func cmdBase(_ *cobra.Command, _ []string) {
|
|||||||
if urlErr, ok := err.(*fasturl.Error); ok && urlErr.Err == fasturl.ErrUnknownScheme {
|
if urlErr, ok := err.(*fasturl.Error); ok && urlErr.Err == fasturl.ErrUnknownScheme {
|
||||||
// Not an error
|
// Not an error
|
||||||
err = nil
|
err = nil
|
||||||
|
// TODO FTP crawler
|
||||||
// Give back task
|
|
||||||
//err2 := CancelTask(t.WebsiteId)
|
|
||||||
//if err2 != nil {
|
|
||||||
// logrus.Error(err2)
|
|
||||||
//}
|
|
||||||
|
|
||||||
continue
|
continue
|
||||||
} else if err != nil {
|
} else if err != nil {
|
||||||
logrus.WithError(err).
|
logrus.WithError(err).
|
||||||
@ -126,9 +125,15 @@ func cmdBase(_ *cobra.Command, _ []string) {
|
|||||||
ScheduleTask(inRemotes, t, &baseUri)
|
ScheduleTask(inRemotes, t, &baseUri)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
shutdown:
|
||||||
|
globalWait.Wait()
|
||||||
}
|
}
|
||||||
|
|
||||||
func cmdCrawler(_ *cobra.Command, args []string) error {
|
func cmdCrawler(_ *cobra.Command, args []string) error {
|
||||||
|
onlineMode = false
|
||||||
|
readConfig()
|
||||||
|
|
||||||
arg := args[0]
|
arg := args[0]
|
||||||
// https://github.com/golang/go/issues/19779
|
// https://github.com/golang/go/issues/19779
|
||||||
if !strings.Contains(arg, "://") {
|
if !strings.Contains(arg, "://") {
|
||||||
@ -161,3 +166,30 @@ func cmdCrawler(_ *cobra.Command, args []string) error {
|
|||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func listenCtrlC(soft, hard context.CancelFunc) {
|
||||||
|
c := make(chan os.Signal)
|
||||||
|
signal.Notify(c, os.Interrupt)
|
||||||
|
|
||||||
|
<-c
|
||||||
|
logrus.Info(">>> Shutting down crawler... <<<")
|
||||||
|
soft()
|
||||||
|
|
||||||
|
<-c
|
||||||
|
logrus.Warning(">>> Force shutdown! <<<")
|
||||||
|
hard()
|
||||||
|
}
|
||||||
|
|
||||||
|
// hardShutdown blocks until the given context is done and then terminates
// the process with exit status 1.
func hardShutdown(c context.Context) {
	done := c.Done()
	<-done
	os.Exit(1)
}
|
||||||
|
|
||||||
|
// sleep pauses for d or until c is cancelled, whichever happens first.
// It returns true when the full duration elapsed and false when the
// context was done before that.
func sleep(d time.Duration, c context.Context) bool {
	// An explicit timer (rather than time.After) can be stopped as soon as
	// the context wins the race, instead of lingering for the rest of d.
	timer := time.NewTimer(d)
	defer timer.Stop()

	select {
	case <-timer.C:
		return true
	case <-c.Done():
		return false
	}
}
|
||||||
|
14
server.go
14
server.go
@ -17,8 +17,11 @@ import (
|
|||||||
|
|
||||||
var serverClient = http.Client {
|
var serverClient = http.Client {
|
||||||
Timeout: config.ServerTimeout,
|
Timeout: config.ServerTimeout,
|
||||||
|
Transport: new(ServerTripper),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var serverUserAgent = "od-database-crawler/" + rootCmd.Version
|
||||||
|
|
||||||
func FetchTask() (t *Task, err error) {
|
func FetchTask() (t *Task, err error) {
|
||||||
res, err := serverClient.PostForm(
|
res, err := serverClient.PostForm(
|
||||||
config.ServerUrl + "/task/get",
|
config.ServerUrl + "/task/get",
|
||||||
@ -37,7 +40,9 @@ func FetchTask() (t *Task, err error) {
|
|||||||
|
|
||||||
t = new(Task)
|
t = new(Task)
|
||||||
err = json.NewDecoder(res.Body).Decode(t)
|
err = json.NewDecoder(res.Body).Decode(t)
|
||||||
if err != nil { return }
|
if _, ok := err.(*json.SyntaxError); ok {
|
||||||
|
return nil, fmt.Errorf("/task/get returned invalid JSON")
|
||||||
|
} else if err != nil { return }
|
||||||
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@ -176,3 +181,10 @@ func CancelTask(websiteId uint64) (err error) {
|
|||||||
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type ServerTripper struct{}
|
||||||
|
|
||||||
|
func (t *ServerTripper) RoundTrip(req *http.Request) (res *http.Response, err error) {
|
||||||
|
req.Header.Set("User-Agent", serverUserAgent)
|
||||||
|
return http.DefaultTransport.RoundTrip(req)
|
||||||
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user