112 Commits

Author SHA1 Message Date
simon987
a962c60b82 Don't panic on file upload error 2019-04-06 14:56:22 -04:00
simon987
24f0bd91f7 Remove debug messages & don't use disk queue by default 2019-04-06 12:21:35 -04:00
simon987
84c10e1981 Change default config 2019-04-05 17:53:00 -04:00
simon987
860fa79327 Jenkins setup 2019-04-04 19:22:14 -04:00
simon987
76bc8293d6 minimum viable 2019-03-30 09:02:55 -04:00
simon987
3470be6086 More work on task_tracker integration 2019-03-09 16:38:04 -05:00
Richard Patel
60471a081e Switch to simon987/task_tracker 2019-02-28 23:51:26 +01:00
dependabot[bot]
0b3f0d87fe Upgrade fasthttp to 1.2.0
Bumps [github.com/valyala/fasthttp](https://github.com/valyala/fasthttp) from 1.1.0 to 1.2.0.
- [Release notes](https://github.com/valyala/fasthttp/releases)
- [Commits](https://github.com/valyala/fasthttp/compare/v1.1.0...v1.2.0)

Thanks bot

Signed-off-by: dependabot[bot] <support@dependabot.com>
2019-02-28 22:42:40 +01:00
terorie
da9c75e392 Reduce Docker image size 2019-02-22 21:37:04 +01:00
Pascal
8947e05d0c Fix Dockerfile
Fixes #22
Credit to @pascaldulieu
2019-02-22 21:11:55 +01:00
Richard Patel
8c5f99d616 More descriptive error if /task/get returns invalid JSON 2019-02-22 20:17:59 +01:00
Richard Patel
206ea0e91d Simplify config 2019-02-22 18:50:35 +01:00
Richard Patel
8b9d8bfd17 Fix README.md format 2019-02-22 06:04:10 +01:00
Richard Patel
c9ff102d80 Fix Dockerfile 2019-02-22 06:00:57 +01:00
Richard Patel
88856c1c19 Flag explanation in README.md 2019-02-22 05:59:59 +01:00
Richard Patel
9e9b606250 Merge branch 'stable' 2019-02-22 05:37:52 +01:00
Richard Patel
326e29e5e4 Reset to stable branch 2019-02-22 05:37:45 +01:00
Richard Patel
c2acd5463f Restore .travis.yml
Now handling auto-build over Docker Hub directly
2019-02-22 05:16:25 +01:00
Richard Patel
e4d04e6a5f go.mod: Fix package path
lol
2019-02-22 05:10:43 +01:00
terorie
9f1402e841 New Dockerfile and Travis Config (#23) 2019-02-22 05:07:27 +01:00
terorie
7c8ab50ee4 Merge stable into master 2019-02-13 15:32:40 +01:00
terorie
281d2d17d6 Update config.yml 2019-02-13 15:32:00 +01:00
Richard Patel
45cbd4d535 Disable resume feature 2019-02-05 15:44:59 +01:00
Richard Patel
771d49f2dd Fix WaitGroup deadlock 2019-02-03 17:14:20 +01:00
Richard Patel
dbd787aa81 Fix WaitGroup crash 2019-02-03 17:09:43 +01:00
Richard Patel
cea6c1658b Bugfix: Don't schedule new tasks during shutdown 2019-02-03 17:02:44 +01:00
terorie
885af5bb3b Beta task resuming 2019-02-03 16:50:08 +01:00
Richard Patel
b18b70f798 Fix segfault (thanks Pikami) 2019-02-03 14:00:17 +01:00
Richard Patel
9d5f549774 Better server User-Agent string 2019-02-03 12:23:21 +01:00
Richard Patel
5239af08f7 Bump version to v1.2.1 2019-02-03 03:36:39 +01:00
Richard Patel
46c0e0bd32 Smarter HTTP error handling 2019-02-03 03:35:09 +01:00
Richard Patel
0ca6deede8 Fix --config flag 2019-02-03 03:26:48 +01:00
Richard Patel
120c026983 Bump version to v1.2.0 2019-02-03 02:55:21 +01:00
Richard Patel
527e8895ec Support configuration without config file 2019-02-03 02:54:52 +01:00
Richard Patel
108fff0503 Add Travis CI badge 2019-02-03 02:09:06 +01:00
Richard Patel
e5746baa5b Switch to spf13/cobra
lul
2019-02-03 02:02:23 +01:00
Richard Patel
17ba5583c9 Add .travis.yml 2019-02-02 23:18:03 +01:00
Richard Patel
92a8c07f4a Add go.mod 2019-02-02 23:15:52 +01:00
Richard Patel
43f96c6988 Benchmark: Reference parser 2018-12-18 15:39:41 +01:00
Richard Patel
b244cdae80 Minor cleanup 2018-12-18 15:31:33 +01:00
Richard Patel
4b8275c7bf Add parser tests 2018-12-18 15:31:09 +01:00
Richard Patel
f90bf94a44 Bump version to v1.1.1 2018-11-27 22:11:57 +01:00
Richard Patel
e82768ff80 Wait time control in config 2018-11-27 22:11:57 +01:00
Richard Patel
b1bf59adef Add The Eye DB to README.md 2018-11-27 17:40:12 +01:00
Richard Patel
a2df2972f4 Bump the upload retry interval up to 30s 2018-11-20 04:13:20 +01:00
Richard Patel
3fc8837dd7 Add output files to .gitignore 2018-11-20 03:51:42 +01:00
Richard Patel
f9a0d6bffe Bump to v1.1.0 2018-11-20 03:46:36 +01:00
Richard Patel
4dbe2aef2b Add job buffer size parameter 2018-11-20 03:42:32 +01:00
Richard Patel
86ec78cae1 Add TCP timeout option 2018-11-20 03:29:10 +01:00
Richard Patel
b846498030 Delete URL queues after crawling 2018-11-20 03:05:43 +01:00
Richard Patel
4f3140a39f Fix queue_count in log 2018-11-20 02:49:03 +01:00
Richard Patel
85d2aac9d4 Performance patch 2018-11-20 02:33:50 +01:00
Richard Patel
b6c0a45900 Job queue disk offloading 2018-11-20 02:03:10 +01:00
Richard Patel
d332f06659 Limit retries to 10 2018-11-18 21:05:26 +01:00
Richard Patel
1625d6c888 Bump to v1.0.2 2018-11-18 18:53:57 +01:00
Richard Patel
03a487f393 Fix crawl loop 2018-11-18 18:45:06 +01:00
Richard Patel
ac8221b109 Retry /task/upload 2018-11-18 18:33:26 +01:00
Richard Patel
8ed2cf3b93 Bump to v1.0.1 2018-11-18 14:49:07 +01:00
Richard Patel
f3620262fc Add log file support 2018-11-18 14:46:52 +01:00
Richard Patel
dc4e4212a0 Add freebsd to release.sh 2018-11-18 14:38:18 +01:00
Richard Patel
6e6a4edd27 Ignore all HTTP errors 2018-11-18 14:25:06 +01:00
Richard Patel
a71157b4d8 Add User-Agent parameter 2018-11-18 14:24:04 +01:00
Richard Patel
6dbec8c789 Add release script 2018-11-18 02:36:22 +01:00
Richard Patel
605f6db5a5 Don't call /task/upload for websites with no results 2018-11-18 01:42:57 +01:00
Richard Patel
d593ba2d0b Bump to 1.0 2018-11-18 00:54:58 +01:00
Richard Patel
6793086c22 Ignore HTTPS errors 2018-11-18 00:37:30 +01:00
Richard Patel
4464f34779 Add recheck and timeout parameters 2018-11-18 00:29:29 +01:00
Richard Patel
339175220d Refactor uploading & chunk size parameter 2018-11-18 00:19:43 +01:00
Richard Patel
1e6687c519 Upload result ignoring errors 2018-11-17 15:04:20 +01:00
Richard Patel
8060556089 Fix: make crawled dir 2018-11-17 13:36:35 +01:00
Richard Patel
73ba848e17 Grammar 2018-11-17 13:35:29 +01:00
Richard Patel
115983f70e Silent HTTP errors 2018-11-17 13:22:46 +01:00
Richard Patel
9210996b4c Fix multiple part file upload 2018-11-17 12:52:24 +01:00
Richard Patel
7b29da9340 Fix file uploads 2018-11-17 12:47:16 +01:00
Richard Patel
24ee6fcba2 Quickfix: Revert FTP give back 2018-11-17 12:43:30 +01:00
Richard Patel
bfb18d62b2 mini fix 2018-11-17 05:27:09 +01:00
Richard Patel
f4054441ab Return FTP tasks 2018-11-17 05:07:52 +01:00
Richard Patel
f8d2bf386d Fix FTP error ignore 2018-11-17 04:57:19 +01:00
Richard Patel
f41198b00c Ignore FTP URLs 2018-11-17 04:50:59 +01:00
Richard Patel
7fdffff58f Update config.yml 2018-11-17 04:19:04 +01:00
Richard Patel
d596882b40 Fix ton of bugs 2018-11-17 04:18:22 +01:00
Richard Patel
0fe97a8058 Update README.md 2018-11-17 01:36:07 +01:00
Richard Patel
718f9d7fbc Rename project 2018-11-17 01:33:15 +01:00
Richard Patel
f1687679ab Unescape results & don't recrawl 404 2018-11-17 01:21:20 +01:00
Richard Patel
145d37f84a Fix wait, add back crawl command 2018-11-17 00:49:09 +01:00
Richard Patel
cc777bcaeb redblackhash: Use bytes.Compare 2018-11-16 21:17:39 +01:00
Simon
1e78cea7e7 Saved path should not contain file name 2018-11-16 13:58:12 -05:00
Richard Patel
3f85cf679b Getting tasks 2018-11-16 04:47:08 +01:00
Richard Patel
3c39f0d621 Random hacks 2018-11-16 03:22:51 +01:00
Richard Patel
50952791c5 Almost done 2018-11-16 03:12:26 +01:00
Richard Patel
30bf98ad34 Fix tests 2018-11-16 03:02:10 +01:00
Richard Patel
ccaf758e90 Remove URL.Opaque 2018-11-16 01:53:16 +01:00
Richard Patel
f668365edb Add tests 2018-11-16 01:51:34 +01:00
Richard Patel
1db8ff43bb Bump version 2018-11-16 00:25:11 +01:00
Richard Patel
82234f949e Less tokenizer allocations 2018-11-16 00:22:40 +01:00
Richard Patel
084b3a5903 Optimizing with hexa :P 2018-11-15 23:51:31 +01:00
Richard Patel
ac0b8d2d0b Blacklist all paths with a query parameter 2018-11-15 23:36:41 +01:00
Richard Patel
ffde1a9e5d Timeout and results saving 2018-11-15 20:14:31 +01:00
Richard Patel
a268c6dbcf Reduce WaitQueue usage 2018-11-12 00:38:22 +01:00
Richard Patel
4c071171eb Exclude dups in dir instead of keeping hashes of links 2018-11-11 23:11:30 +01:00
Richard Patel
9c8174dd8d Fix header parsing 2018-11-11 18:53:17 +01:00
Richard Patel
93272e1da1 Update README.md 2018-11-06 02:41:20 +01:00
Richard Patel
0344a120ff fasturl: Remove path escape 2018-11-06 02:15:09 +01:00
Richard Patel
6e6afd771e fasturl: Remove query 2018-11-06 02:11:22 +01:00
Richard Patel
a8c27b2d21 Hash links 2018-11-06 02:01:53 +01:00
Richard Patel
ed5e35f005 Performance improvements 2018-11-06 00:34:22 +01:00
Richard Patel
a12bca01c8 fasturl: Discard UserInfo 2018-11-06 00:33:57 +01:00
Richard Patel
ba9c818461 fasturl: Don't parse username and password 2018-11-06 00:28:42 +01:00
Richard Patel
9cf31b1d81 fasturl: Remove fragment 2018-11-06 00:17:10 +01:00
Richard Patel
ed0d9c681f fasturl: Replace scheme with enum 2018-11-06 00:15:12 +01:00
Richard Patel
b88d45fc21 fasturl: Remove allocs from Parse 2018-11-05 23:05:21 +01:00
Richard Patel
4989adff9f Add net/url package 2018-11-05 22:57:57 +01:00
27 changed files with 8680 additions and 558 deletions

5
.gitignore vendored

@@ -1,3 +1,6 @@
/.idea/
.DS_Store
/oddb-go
/od-database-crawler
*.log
/queue/
/crawled/

5
.travis.yml Normal file

@@ -0,0 +1,5 @@
language: go
go:
- "1.11.x"
- master

15
Dockerfile Normal file

@@ -0,0 +1,15 @@
FROM golang:alpine as builder
ADD . /go/src/github.com/terorie/od-database-crawler
RUN apk add git \
&& go get -d -v github.com/terorie/od-database-crawler \
&& CGO_ENABLED=0 go install -a \
-installsuffix cgo \
-ldflags="-s -w" \
github.com/terorie/od-database-crawler
FROM scratch
COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
COPY --from=builder /go/bin/od-database-crawler /bin/
WORKDIR /oddb
VOLUME [ "/oddb" ]
CMD ["/bin/od-database-crawler", "server"]

README.md

@@ -1,2 +1,54 @@
# oddb Go crawler
# od-database Go crawler 🚀
[![Build Status](https://travis-ci.org/terorie/od-database-crawler.svg?branch=master)](https://travis-ci.org/terorie/od-database-crawler)
> by terorie 2018 :P
* Crawler for [__OD-Database__](https://github.com/simon987/od-database)
* Crawls HTTP open directories (standard Web Server Listings)
* Gets name, path, size and modification time of all files
* Lightweight and fast: __over 9000 requests per second__ on a standard laptop
https://od-db.the-eye.eu/
## Usage
### Deploys
1. With Config File (if `config.yml` found in working dir)
- Download [default config](https://github.com/terorie/od-database-crawler/blob/master/config.yml)
- Set `server.url` and `server.token`
- Start with `./od-database-crawler server --config <file>`
2. With Flags or env
- Override config file if it exists
- `--help` for list of flags
- Every flag is available as an environment variable:
`--output.crawl_stats` ➡️ `OD_OUTPUT_CRAWL_STATS`
- Start with `./od-database-crawler server <flags>`
3. With Docker
```bash
docker run \
-e OD_SERVER_URL=xxx \
-e OD_SERVER_TOKEN=xxx \
terorie/od-database-crawler
```
### Flag reference
Here are the most important config flags. For finer control, take a look at `/config.yml`.
| Flag/Environment | Description | Example |
| ------------------------------------------------------- | ------------------------------------------------------------ | ----------------------------------- |
| `server.url`<br />`OD_SERVER_URL` | OD-DB Server URL | `https://od-db.mine.the-eye.eu/api` |
| `server.token`<br />`OD_SERVER_TOKEN` | OD-DB Server Access Token | _Ask Hexa **TM**_ |
| `server.recheck`<br />`OD_SERVER_RECHECK` | Job Fetching Interval | `3s` |
| `output.crawl_stats`<br />`OD_OUTPUT_CRAWL_STATS` | Crawl Stats Logging Interval (0 = disabled) | `500ms` |
| `output.resource_stats`<br />`OD_OUTPUT_RESOURCE_STATS` | Resource Stats Logging Interval (0 = disabled) | `8s` |
| `output.log`<br />`OD_OUTPUT_LOG` | Log File (none = disabled) | `crawler.log` |
| `crawl.tasks`<br />`OD_CRAWL_TASKS` | Max number of sites to crawl concurrently | `500` |
| `crawl.connections`<br />`OD_CRAWL_CONNECTIONS` | HTTP connections per site | `1` |
| `crawl.retries`<br />`OD_CRAWL_RETRIES` | How often to retry after a temporary failure (e.g. `HTTP 429` or timeouts) | `5` |
| `crawl.dial_timeout`<br />`OD_CRAWL_DIAL_TIMEOUT` | TCP Connect timeout | `5s` |
| `crawl.timeout`<br />`OD_CRAWL_TIMEOUT` | HTTP request timeout | `20s` |
| `crawl.user-agent`<br />`OD_CRAWL_USER_AGENT` | HTTP Crawler User-Agent | `googlebot/1.2.3` |
| `crawl.job_buffer`<br />`OD_CRAWL_JOB_BUFFER` | Number of URLs to keep in memory/cache, per job. The rest is offloaded to disk. Decrease this value if the crawler uses too much RAM. (0 = Disable Cache, -1 = Only use Cache) | `5000` |

215
config.go

@@ -1,62 +1,184 @@
package main
import (
"bufio"
"fmt"
"github.com/sirupsen/logrus"
"github.com/spf13/pflag"
"github.com/spf13/viper"
"io"
"os"
"strings"
"time"
)
var config struct {
ServerUrl string
Token string
Retries int
Workers int
Tasks int32
CrawlStats time.Duration
AllocStats time.Duration
Verbose bool
TrackerUrl string
TrackerProject int
TrackerAlias string
WsBucketScheme string
WsBucketHost string
ServerTimeout time.Duration
Recheck time.Duration
ChunkSize int64
Retries int
Workers int
UserAgent string
Tasks int32
Verbose bool
PrintHTTP bool
JobBufferSize int
}
var onlineMode bool
const (
ConfServerUrl = "server.url"
ConfToken = "server.token"
ConfTasks = "crawl.tasks"
ConfRetries = "crawl.retries"
ConfWorkers = "crawl.connections"
ConfTrackerUrl = "server.url"
ConfTrackerProject = "server.project"
ConfTrackerAlias = "server.alias"
ConfWsBucketScheme = "server.ws_bucket_scheme"
ConfWsBucketHost = "server.ws_bucket_host"
ConfServerTimeout = "server.timeout"
ConfRecheck = "server.recheck"
ConfCooldown = "server.cooldown"
ConfChunkSize = "server.upload_chunk"
ConfUploadRetries = "server.upload_retries"
ConfUploadRetryInterval = "server.upload_retry_interval"
ConfTasks = "crawl.tasks"
ConfRetries = "crawl.retries"
ConfWorkers = "crawl.connections"
ConfUserAgent = "crawl.user-agent"
ConfDialTimeout = "crawl.dial_timeout"
ConfTimeout = "crawl.timeout"
ConfJobBufferSize = "crawl.job_buffer"
ConfCrawlStats = "output.crawl_stats"
ConfAllocStats = "output.resource_stats"
ConfVerbose = "output.verbose"
ConfPrintHTTP = "output.http"
ConfLogFile = "output.log"
)
func prepareConfig() {
viper.SetDefault(ConfRetries, 5)
viper.SetDefault(ConfWorkers, 2)
viper.SetDefault(ConfTasks, 3)
viper.SetDefault(ConfCrawlStats, 3 * time.Second)
viper.SetDefault(ConfAllocStats, 0)
viper.SetDefault(ConfVerbose, false)
pf := rootCmd.PersistentFlags()
pf.SortFlags = false
pf.StringVar(&configFile, "config", "", "Config file")
configFile = os.Getenv("OD_CONFIG")
pf.String(ConfTrackerUrl, "https://tt.the-eye.eu/api", "task_tracker api URL")
pf.String(ConfTrackerProject, "1", "task_tracker project id")
pf.String(ConfWsBucketScheme, "wss", "ws_bucket scheme")
pf.String(ConfWsBucketHost, "wsb.the-eye.eu", "ws_bucket host")
pf.String(ConfTrackerAlias, "changeme", "task_tracker worker alias")
pf.Duration(ConfServerTimeout, 60*time.Second, "OD-DB request timeout")
pf.Duration(ConfRecheck, 1*time.Second, "OD-DB: Poll interval for new jobs")
pf.Duration(ConfCooldown, 1*time.Minute, "OD-DB: Time to wait after a server-side error")
pf.String(ConfChunkSize, "1 MB", "OD-DB: Result upload chunk size")
pf.Uint(ConfUploadRetries, 10, "OD-DB: Max upload retries")
pf.Duration(ConfUploadRetryInterval, 30*time.Second, "OD-DB: Time to wait between upload retries")
pf.Uint(ConfTasks, 25, "Crawler: Max concurrent tasks")
pf.Uint(ConfWorkers, 1, "Crawler: Connections per server")
pf.Uint(ConfRetries, 5, "Crawler: Request retries")
pf.Duration(ConfDialTimeout, 10*time.Second, "Crawler: Handshake timeout")
pf.Duration(ConfTimeout, 30*time.Second, "Crawler: Request timeout")
pf.String(ConfUserAgent, "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0", "Crawler: User-Agent")
pf.Int(ConfJobBufferSize, -1, "Crawler: Task queue cache size")
pf.Duration(ConfCrawlStats, 500*time.Second, "Log: Crawl stats interval")
pf.Duration(ConfAllocStats, 500*time.Second, "Log: Resource stats interval")
pf.Bool(ConfVerbose, false, "Log: Print every listed dir")
pf.Bool(ConfPrintHTTP, false, "Log: Print HTTP client errors")
pf.String(ConfLogFile, "crawler.log", "Log file")
// Bind all flags to Viper
pf.VisitAll(func(flag *pflag.Flag) {
s := flag.Name
s = strings.TrimLeft(s, "-")
if err := viper.BindPFlag(s, flag); err != nil {
panic(err)
}
var envKey string
envKey = strings.Replace(s, ".", "_", -1)
envKey = strings.ToUpper(envKey)
envKey = "OD_" + envKey
if err := viper.BindEnv(s, envKey); err != nil {
panic(err)
}
})
}
func readConfig() {
viper.AddConfigPath(".")
viper.SetConfigName("config")
err := viper.ReadInConfig()
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
// If config.yml in working dir, use it
if configFile == "" {
_, err := os.Stat("config.yml")
if err == nil {
configFile = "config.yml"
}
}
config.ServerUrl = viper.GetString(ConfServerUrl)
//if config.ServerUrl == "" {
// configMissing(ConfServerUrl)
//}
if configFile != "" {
confF, err := os.Open(configFile)
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
defer confF.Close()
config.Token = viper.GetString(ConfToken)
//if config.Token == "" {
// configMissing(ConfToken)
//}
viper.SetConfigType("yml")
err = viper.ReadConfig(confF)
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
}
if onlineMode {
config.TrackerUrl = viper.GetString(ConfTrackerUrl)
if config.TrackerUrl == "" {
configMissing(ConfTrackerUrl)
}
config.TrackerUrl = strings.TrimRight(config.TrackerUrl, "/")
}
config.TrackerProject = viper.GetInt(ConfTrackerProject)
config.TrackerAlias = viper.GetString(ConfTrackerAlias)
config.WsBucketHost = viper.GetString(ConfWsBucketHost)
config.WsBucketScheme = viper.GetString(ConfWsBucketScheme)
config.ServerTimeout = viper.GetDuration(ConfServerTimeout)
config.Recheck = viper.GetDuration(ConfRecheck)
config.ChunkSize = int64(viper.GetSizeInBytes(ConfChunkSize))
if config.ChunkSize < 100 {
configOOB(ConfChunkSize, config.ChunkSize)
}
config.Retries = viper.GetInt(ConfRetries)
if config.Retries < 0 {
@@ -73,14 +195,33 @@ func readConfig() {
configOOB(ConfTasks, int(config.Tasks))
}
config.CrawlStats = viper.GetDuration(ConfCrawlStats)
config.UserAgent = viper.GetString(ConfUserAgent)
config.AllocStats = viper.GetDuration(ConfAllocStats)
setDialTimeout(viper.GetDuration(ConfDialTimeout))
setTimeout(viper.GetDuration(ConfTimeout))
config.JobBufferSize = viper.GetInt(ConfJobBufferSize)
config.Verbose = viper.GetBool(ConfVerbose)
if config.Verbose {
logrus.SetLevel(logrus.DebugLevel)
}
if filePath := viper.GetString(ConfLogFile); filePath != "" {
f, err := os.OpenFile(filePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
if err != nil {
panic(err)
}
bufWriter := bufio.NewWriter(f)
exitHooks.Add(func() {
bufWriter.Flush()
f.Close()
})
logrus.SetOutput(io.MultiWriter(os.Stdout, bufWriter))
}
config.PrintHTTP = viper.GetBool(ConfPrintHTTP)
}
func configMissing(key string) {
@@ -88,7 +229,7 @@ func configMissing(key string) {
os.Exit(1)
}
func configOOB(key string, v int) {
fmt.Fprintf(os.Stderr, "config: illegal value %d for %key!\n", v, key)
func configOOB(key string, v interface{}) {
fmt.Fprintf(os.Stderr, "config: illegal value %v for key %s!\n", v, key)
os.Exit(1)
}
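
The flag-to-environment mapping bound in `prepareConfig` above is mechanical: dots become underscores, the key is upper-cased, and an `OD_` prefix is added. A minimal standalone sketch of that rewrite (`envKey` is a hypothetical helper; the string operations mirror the `VisitAll` callback):

```go
package main

import (
	"fmt"
	"strings"
)

// envKey mirrors the rewrite prepareConfig applies when binding each
// pflag to an environment variable: "." -> "_", upper-case, "OD_" prefix.
func envKey(flagName string) string {
	k := strings.Replace(flagName, ".", "_", -1)
	return "OD_" + strings.ToUpper(k)
}

func main() {
	fmt.Println(envKey("server.recheck"))     // OD_SERVER_RECHECK
	fmt.Println(envKey("crawl.dial_timeout")) // OD_CRAWL_DIAL_TIMEOUT
}
```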

config.yml

@@ -1,26 +1,84 @@
# OD-Database server settings
server:
# Connection URL
url: localhost:6969
# Server auth token
token:
url: https://tt.the-eye.eu/api
# OD-Database project id (for crawling)
project: 1
# Your worker alias
alias: changeme
# Websocket bucket host & scheme (ws/wss)
ws_bucket_host: https://wsb.the-eye.eu
ws_bucket_scheme: wss
# Request timeout
timeout: 60s
# Recheck interval
# The crawler periodically asks the server
# for new jobs. Sets the minimum wait time
# between /task/get requests to the server.
recheck: 1s
# Time to wait after receiving an error
# from the server. Doesn't apply to uploads.
cooldown: 1s
upload_retries: 10
upload_retry_interval: 30s
# Log output settings
output:
# Crawl statistics
crawl_stats: 1s
crawl_stats: 1m
# CPU/RAM/Job queue stats
resource_stats: 1s
resource_stats: 1m
# More output? (Every listed dir)
verbose: false
# Print HTTP errors (Super spammy)
http: false
# Log file
# If empty, no log file is created.
log: crawler.log
# Crawler settings
crawl:
# Number of sites that can be
# processed at once
tasks: 3
# Number of sites that can be processed at once
tasks: 25
# Number of connections per site
connections: 2
# Please be careful with this setting!
# The crawler fires fast and more than
# ten connections can overwhelm a server.
connections: 1
# How often to retry getting data
# from the site before giving up
retries: 5
# Time before discarding a failed connection attempt
dial_timeout: 10s
# Time before discarding a network request
timeout: 30s
# Crawler User-Agent
# If empty, no User-Agent header is sent.
user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"
# Job buffer size (per task)
# Higher values cause less disk writes
# but require more memory.
#
# The job queue contains all URLs
# that should be crawled next.
# As it grows very large over time,
# it's kept mainly on disk.
# This sets how many jobs are kept
# in memory.
# A negative value will cause all jobs
# to be stored in memory. (Don't do this)
job_buffer: -1

272
crawl.go

@@ -2,28 +2,45 @@ package main
import (
"bytes"
"encoding/base64"
"fmt"
"github.com/sirupsen/logrus"
"crypto/tls"
"github.com/terorie/od-database-crawler/ds/redblackhash"
"github.com/terorie/od-database-crawler/fasturl"
"github.com/valyala/fasthttp"
"golang.org/x/crypto/blake2b"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
"net/url"
"net"
"path"
"strconv"
"strings"
"time"
)
var client fasthttp.Client
var client = fasthttp.Client {
TLSConfig: &tls.Config{
InsecureSkipVerify: true,
},
}
func GetDir(j *Job, f *File) (links []url.URL, err error) {
func setDialTimeout(d time.Duration) {
client.Dial = func(addr string) (net.Conn, error) {
return fasthttp.DialTimeout(addr, d)
}
}
func setTimeout(d time.Duration) {
client.ReadTimeout = d
client.WriteTimeout = d / 2
}
func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
f.IsDir = true
f.Name = path.Base(j.Uri.Path)
req := fasthttp.AcquireRequest()
req.SetRequestURI(j.Uri.String())
if config.UserAgent != "" {
req.Header.SetUserAgent(config.UserAgent)
}
req.SetRequestURI(j.UriStr)
res := fasthttp.AcquireResponse()
defer fasthttp.ReleaseResponse(res)
@@ -32,98 +49,99 @@ func GetDir(j *Job, f *File) (links []url.URL, err error) {
fasthttp.ReleaseRequest(req)
if err != nil {
logrus.Error(err)
return
}
err = checkStatusCode(res.StatusCode())
if err != nil { return }
if err != nil {
return
}
body := res.Body()
return ParseDir(body, &j.Uri)
}
func ParseDir(body []byte, baseUri *fasturl.URL) (links []fasturl.URL, err error) {
doc := html.NewTokenizer(bytes.NewReader(body))
var linkHref string
var linkTexts []string
for {
err = nil
tokenType := doc.Next()
token := doc.Token()
if tokenType == html.ErrorToken {
break
}
switch tokenType {
case html.StartTagToken:
if token.DataAtom == atom.A {
for _, attr := range token.Attr {
if attr.Key == "href" {
linkHref = attr.Val
name, hasAttr := doc.TagName()
if len(name) == 1 && name[0] == 'a' {
for hasAttr {
var ks, vs []byte
ks, vs, hasAttr = doc.TagAttr()
if bytes.Equal(ks, []byte("href")) {
// TODO Check escape
linkHref = string(vs)
break
}
}
}
case html.TextToken:
if linkHref != "" {
linkTexts = append(linkTexts, token.Data)
}
case html.EndTagToken:
if linkHref != "" && token.DataAtom == atom.A {
name, _ := doc.TagName()
if len(name) == 1 && name[0] == 'a' {
// Copy params
href := linkHref
linkText := strings.Join(linkTexts, " ")
// Reset params
linkHref = ""
linkTexts = nil
// TODO Optimized decision tree
for _, entry := range urlBlackList {
if href == entry {
goto nextToken
}
}
for _, entry := range urlPartBlackList {
if strings.Contains(href, entry) {
goto nextToken
}
}
for _, entry := range fileNameBlackList {
if strings.Contains(linkText, entry) {
goto nextToken
}
if strings.LastIndexByte(href, '?') != -1 {
continue
}
subref, err := url.Parse(href)
if err != nil { continue }
switch href {
case "", " ", ".", "..", "/":
continue
}
link := *j.Uri.ResolveReference(subref)
if strings.Contains(href, "../") {
continue
}
if link.Scheme != j.Uri.Scheme ||
link.Host != j.Uri.Host ||
link.Path == j.Uri.Path ||
!strings.HasPrefix(link.Path, j.Uri.Path) {
var link fasturl.URL
err = baseUri.ParseRel(&link, href)
if err != nil {
continue
}
if link.Scheme != baseUri.Scheme ||
link.Host != baseUri.Host ||
link.Path == baseUri.Path ||
!strings.HasPrefix(link.Path, baseUri.Path) {
continue
}
links = append(links, link)
}
}
nextToken:
}
return
}
func GetFile(u url.URL, f *File) (err error) {
func GetFile(u fasturl.URL, f *File) (err error) {
f.IsDir = false
u.Path = path.Clean(u.Path)
f.Name = path.Base(u.Path)
f.Path = strings.Trim(u.Path, "/")
f.Path = strings.Trim(path.Dir(u.Path), "/")
req := fasthttp.AcquireRequest()
req.Header.SetMethod("HEAD")
if config.UserAgent != "" {
req.Header.SetUserAgent(config.UserAgent)
}
req.SetRequestURI(u.String())
res := fasthttp.AcquireResponse()
@@ -133,19 +151,22 @@ func GetFile(u url.URL, f *File) (err error) {
err = client.Do(req, res)
fasthttp.ReleaseRequest(req)
if err != nil { return }
if err != nil {
return
}
err = checkStatusCode(res.StatusCode())
if err != nil { return }
if err != nil {
return
}
// TODO Inefficient af
header := res.Header.Header()
f.ParseHeader(header)
f.applyContentLength(string(res.Header.Peek("content-length")))
f.applyLastModified(string(res.Header.Peek("last-modified")))
return nil
}
func (f *File) HashDir(links []url.URL) string {
func (f *File) HashDir(links []fasturl.URL) (o redblackhash.Key) {
h, _ := blake2b.New256(nil)
h.Write([]byte(f.Name))
for _, link := range links {
@@ -153,67 +174,46 @@ func (f *File) HashDir(links []url.URL) string {
h.Write([]byte(fileName))
}
sum := h.Sum(nil)
b64sum := base64.StdEncoding.EncodeToString(sum)
return b64sum
copy(o[:redblackhash.KeySize], sum)
return
}
func (f *File) ParseHeader(h []byte) {
var k1, k2 int
var v1, v2 int
// Simple finite state machine
state := 0
for i, b := range h {
switch state {
case 0:
if b == byte(':') {
state = 1
k2 = i
}
case 1:
state = 2
case 2:
state = 3
v1 = i
case 3:
if b == byte('\r') {
state = 4
}
case 4:
state = 0
v2 = i - 1
key := string(h[k1:k2])
val := string(h[v1:v2])
k1 = i
f.applyHeader(key, val)
}
func (f *File) applyContentLength(v string) {
if v == "" {
return
}
size, err := strconv.ParseInt(v, 10, 64)
if err != nil {
return
}
if size < 0 {
return
}
f.Size = size
}
func (f *File) applyHeader(k, v string) {
switch k {
case "content-length":
size, err := strconv.ParseInt(v, 10, 64)
if err != nil { break }
if size < 0 { break }
f.Size = size
case "last-modified":
var err error
f.MTime, err = time.Parse(time.RFC1123, v)
if err == nil { break }
f.MTime, err = time.Parse(time.RFC850, v)
if err == nil { break }
// TODO Parse asctime
f.MTime, err = time.Parse("2006-01-02", v[:10])
if err == nil { break }
// TODO Cleanup
func (f *File) applyLastModified(v string) {
if v == "" {
return
}
var t time.Time
var err error
t, err = time.Parse(time.RFC1123, v)
if err == nil {
f.MTime = t.Unix()
return
}
t, err = time.Parse(time.RFC850, v)
if err == nil {
f.MTime = t.Unix()
return
}
// TODO Parse asctime
t, err = time.Parse("2006-01-02", v[:10])
if err == nil {
f.MTime = t.Unix()
return
}
}
@@ -221,53 +221,7 @@ func checkStatusCode(status int) error {
switch status {
case fasthttp.StatusOK:
return nil
case fasthttp.StatusTooManyRequests:
return ErrRateLimit
case fasthttp.StatusForbidden,
fasthttp.StatusUnauthorized:
return ErrForbidden
default:
return fmt.Errorf("got HTTP status %d", status)
return &HttpError{status}
}
}
var urlBlackList = [...]string {
"",
" ",
".",
"..",
"/",
}
var urlPartBlackList = [...]string {
"?C=N&O=D",
"?C=M&O=A",
"?C=S&O=A",
"?C=D&O=A",
"?C=N;O=D",
"?C=M;O=A",
"?C=M&O=D",
"?C=S;O=A",
"?C=S&O=D",
"?C=D;O=A",
"?MA",
"?SA",
"?DA",
"?ND",
"?C=N&O=A",
"?C=N&O=A",
"?M=A",
"?N=D",
"?S=A",
"?D=A",
}
var fileNameBlackList = [...]string {
"Parent Directory",
" Parent Directory",
"../",
}
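
The `applyLastModified` cascade above tries RFC 1123 first, then RFC 850, then a bare `yyyy-mm-dd` date prefix. A self-contained sketch of the same fallback chain (`parseLastModified` is a hypothetical helper; it adds a length guard before slicing `v[:10]`):

```go
package main

import (
	"fmt"
	"time"
)

// parseLastModified mirrors the fallback chain in applyLastModified:
// RFC 1123 first, then RFC 850, then a bare yyyy-mm-dd date prefix.
func parseLastModified(v string) (int64, bool) {
	if t, err := time.Parse(time.RFC1123, v); err == nil {
		return t.Unix(), true
	}
	if t, err := time.Parse(time.RFC850, v); err == nil {
		return t.Unix(), true
	}
	if len(v) >= 10 {
		if t, err := time.Parse("2006-01-02", v[:10]); err == nil {
			return t.Unix(), true
		}
	}
	return 0, false
}

func main() {
	mtime, ok := parseLastModified("Mon, 02 Jan 2006 15:04:05 MST")
	fmt.Println(mtime, ok) // epoch seconds and true
}
```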

4766
crawl_apache2_test.go Normal file

File diff suppressed because it is too large

117
crawl_nginx_test.go Normal file

@@ -0,0 +1,117 @@
package main
import (
"github.com/terorie/od-database-crawler/fasturl"
"testing"
)
func TestParseDirNginx(t *testing.T) {
var u fasturl.URL
err := u.Parse("https://the-eye.eu/public/")
if err != nil {
t.Fatal("Failed to parse URL", err)
}
links, err := ParseDir([]byte(nginxListing), &u)
if err != nil {
t.Fatal("Failed to extract links", err)
}
if len(links) != len(nginxLinks) {
t.Fatalf("Expected %d links, got %d",
len(nginxLinks), len(links))
}
for i := 0; i < len(links); i++ {
gotLink := links[i].String()
expLink := nginxLinks[i]
if gotLink != expLink {
t.Errorf(`Expected "%s" got "%s"`,
expLink, gotLink)
}
}
}
var nginxLinks = []string {
"https://the-eye.eu/public/AppleArchive/",
"https://the-eye.eu/public/AudioBooks/",
"https://the-eye.eu/public/Books/",
"https://the-eye.eu/public/Comics/",
"https://the-eye.eu/public/Games/",
"https://the-eye.eu/public/Icons/",
"https://the-eye.eu/public/Images/",
"https://the-eye.eu/public/JFK_Files/",
"https://the-eye.eu/public/MSDN/",
"https://the-eye.eu/public/Music/",
"https://the-eye.eu/public/Operating%20Systems/",
"https://the-eye.eu/public/Posters/",
"https://the-eye.eu/public/Psychedelics/",
"https://the-eye.eu/public/Psychoactives/",
"https://the-eye.eu/public/Radio/",
"https://the-eye.eu/public/Random/",
"https://the-eye.eu/public/Site-Dumps/",
"https://the-eye.eu/public/Software/",
"https://the-eye.eu/public/Strategic%20Intelligence%20Network/",
"https://the-eye.eu/public/WorldTracker.org/",
"https://the-eye.eu/public/concen.org/",
"https://the-eye.eu/public/freenrg.info/",
"https://the-eye.eu/public/murdercube.com/",
"https://the-eye.eu/public/parazite/",
"https://the-eye.eu/public/ripreddit/",
"https://the-eye.eu/public/rom/",
"https://the-eye.eu/public/touhou/",
"https://the-eye.eu/public/vns/",
"https://the-eye.eu/public/xbins/",
"https://the-eye.eu/public/xbins.diodematrix/",
"https://the-eye.eu/public/Rclone_for_Scrubs.pdf",
"https://the-eye.eu/public/Wget_Linux_Guide.pdf",
"https://the-eye.eu/public/Wget_Windows_Guide.pdf",
"https://the-eye.eu/public/rclone_guide.pdf",
"https://the-eye.eu/public/wget-noobs-guide.pdf",
"https://the-eye.eu/public/xbox-scene_Aug2014.7z",
}
const nginxListing =
`<html>
<head><title>Index of /public/</title></head>
<body bgcolor="white">
<h1>Index of /public/</h1><hr><pre><a href="../">../</a>
<a href="AppleArchive/">AppleArchive/</a> 03-Nov-2017 18:13 -
<a href="AudioBooks/">AudioBooks/</a> 29-Sep-2018 19:47 -
<a href="Books/">Books/</a> 27-Nov-2018 17:50 -
<a href="Comics/">Comics/</a> 05-Nov-2018 21:37 -
<a href="Games/">Games/</a> 28-Nov-2018 11:54 -
<a href="Icons/">Icons/</a> 22-May-2018 07:47 -
<a href="Images/">Images/</a> 21-Jan-2018 03:21 -
<a href="JFK_Files/">JFK_Files/</a> 03-Nov-2017 17:03 -
<a href="MSDN/">MSDN/</a> 03-Nov-2017 15:48 -
<a href="Music/">Music/</a> 02-Mar-2018 15:47 -
<a href="Operating%20Systems/">Operating Systems/</a> 25-Apr-2018 07:18 -
<a href="Posters/">Posters/</a> 07-Jul-2018 01:12 -
<a href="Psychedelics/">Psychedelics/</a> 11-Apr-2018 05:45 -
<a href="Psychoactives/">Psychoactives/</a> 18-May-2018 02:58 -
<a href="Radio/">Radio/</a> 09-Jun-2018 15:49 -
<a href="Random/">Random/</a> 04-Dec-2018 12:33 -
<a href="Site-Dumps/">Site-Dumps/</a> 15-Dec-2018 11:04 -
<a href="Software/">Software/</a> 27-Nov-2017 00:22 -
<a href="Strategic%20Intelligence%20Network/">Strategic Intelligence Network/</a> 17-Nov-2017 16:35 -
<a href="WorldTracker.org/">WorldTracker.org/</a> 12-Apr-2018 04:16 -
<a href="concen.org/">concen.org/</a> 08-Oct-2018 14:08 -
<a href="freenrg.info/">freenrg.info/</a> 19-Dec-2017 10:59 -
<a href="murdercube.com/">murdercube.com/</a> 06-Dec-2017 10:45 -
<a href="parazite/">parazite/</a> 20-Nov-2017 21:25 -
<a href="ripreddit/">ripreddit/</a> 04-Aug-2018 14:30 -
<a href="rom/">rom/</a> 28-Nov-2018 14:15 -
<a href="touhou/">touhou/</a> 03-Nov-2017 11:07 -
<a href="vns/">vns/</a> 03-Nov-2017 11:36 -
<a href="xbins/">xbins/</a> 03-Nov-2017 17:23 -
<a href="xbins.diodematrix/">xbins.diodematrix/</a> 21-Sep-2018 22:33 -
<a href="Rclone_for_Scrubs.pdf">Rclone_for_Scrubs.pdf</a> 04-Sep-2018 13:31 315K
<a href="Wget_Linux_Guide.pdf">Wget_Linux_Guide.pdf</a> 21-Dec-2017 20:28 168K
<a href="Wget_Windows_Guide.pdf">Wget_Windows_Guide.pdf</a> 25-Nov-2017 17:59 867K
<a href="rclone_guide.pdf">rclone_guide.pdf</a> 03-Sep-2018 23:37 315K
<a href="wget-noobs-guide.pdf">wget-noobs-guide.pdf</a> 21-Dec-2017 20:29 168K
<a href="xbox-scene_Aug2014.7z">xbox-scene_Aug2014.7z</a> 26-Oct-2017 23:09 1G
</pre><hr></body>
</html>`

59
crawl_test.go Normal file

@@ -0,0 +1,59 @@
package main
import (
"bytes"
"github.com/PuerkitoBio/goquery"
"github.com/terorie/od-database-crawler/fasturl"
"net/url"
"strings"
"testing"
)
func BenchmarkParseDir(b *testing.B) {
for n := 0; n < b.N; n++ {
var u fasturl.URL
err := u.Parse("http://archive.ubuntu.com/ubuntu/indices/")
if err != nil {
b.Fatal("Failed to parse URL", err)
}
_, err = ParseDir([]byte(apache2Listing), &u)
if err != nil {
b.Fatal("Failed to extract links", err)
}
}
}
func BenchmarkParseDirReference(b *testing.B) {
for n := 0; n < b.N; n++ {
u, err := url.Parse("http://archive.ubuntu.com/ubuntu/indices/")
if err != nil {
b.Fatal("Failed to parse URL", err)
}
_, err = referenceParseDir([]byte(apache2Listing), u)
if err != nil {
b.Fatal("Failed to extract links", err)
}
}
}
func referenceParseDir(body []byte, baseUri *url.URL) (links []*url.URL, err error) {
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
if err != nil { return nil, err }
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
href, _ := s.Attr("href")
sub, err := baseUri.Parse(href)
if err != nil { return } // continue
if !strings.HasPrefix(sub.String(), baseUri.String()) {
return // continue
}
links = append(links, sub)
})
return
}

489
ds/redblackhash/redblack.go Normal file

@@ -0,0 +1,489 @@
// Copyright (c) 2015, Emir Pasic. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Modifications by terorie
// Package redblacktree implements a red-black tree.
//
// Used by TreeSet and TreeMap.
//
// Structure is not thread safe.
//
// References: http://en.wikipedia.org/wiki/Red%E2%80%93black_tree
package redblackhash
import (
"bytes"
"fmt"
"sync"
)
const (
black, red color = true, false
KeySize = 64
)
type color bool
type Key [KeySize]byte
// Tree holds elements of the red-black tree
type Tree struct {
sync.Mutex
Root *Node
size int
}
// Node is a single element within the tree
type Node struct {
Key Key
color color
Left *Node
Right *Node
Parent *Node
}
func (k *Key) Compare(o *Key) int {
return bytes.Compare(k[:], o[:])
}
// Put inserts node into the tree.
// Key should adhere to the comparator's type assertion, otherwise method panics.
func (tree *Tree) Put(key *Key) {
var insertedNode *Node
if tree.Root == nil {
// Assert key is of comparator's type for initial tree
tree.Root = &Node{Key: *key, color: red}
insertedNode = tree.Root
} else {
node := tree.Root
loop := true
for loop {
compare := key.Compare(&node.Key)
switch {
case compare == 0:
node.Key = *key
return
case compare < 0:
if node.Left == nil {
node.Left = &Node{Key: *key, color: red}
insertedNode = node.Left
loop = false
} else {
node = node.Left
}
case compare > 0:
if node.Right == nil {
node.Right = &Node{Key: *key, color: red}
insertedNode = node.Right
loop = false
} else {
node = node.Right
}
}
}
insertedNode.Parent = node
}
tree.insertCase1(insertedNode)
tree.size++
}
// Get searches the node in the tree by key and returns its value or nil if key is not found in tree.
// Second return parameter is true if key was found, otherwise false.
// Key should adhere to the comparator's type assertion, otherwise method panics.
func (tree *Tree) Get(key *Key) (found bool) {
node := tree.lookup(key)
return node != nil
}
// Remove removes the node from the tree by key.
// Key should adhere to the comparator's type assertion, otherwise method panics.
func (tree *Tree) Remove(key *Key) {
var child *Node
node := tree.lookup(key)
if node == nil {
return
}
if node.Left != nil && node.Right != nil {
pred := node.Left.maximumNode()
node.Key = pred.Key
node = pred
}
if node.Left == nil || node.Right == nil {
if node.Right == nil {
child = node.Left
} else {
child = node.Right
}
if node.color == black {
node.color = nodeColor(child)
tree.deleteCase1(node)
}
tree.replaceNode(node, child)
if node.Parent == nil && child != nil {
child.color = black
}
}
tree.size--
}
// Empty returns true if tree does not contain any nodes
func (tree *Tree) Empty() bool {
return tree.size == 0
}
// Size returns number of nodes in the tree.
func (tree *Tree) Size() int {
return tree.size
}
// Left returns the left-most (min) node or nil if tree is empty.
func (tree *Tree) Left() *Node {
var parent *Node
current := tree.Root
for current != nil {
parent = current
current = current.Left
}
return parent
}
// Right returns the right-most (max) node or nil if tree is empty.
func (tree *Tree) Right() *Node {
var parent *Node
current := tree.Root
for current != nil {
parent = current
current = current.Right
}
return parent
}
// Floor finds the floor node of the input key, returning the floor node or nil if no floor is found.
// Second return parameter is true if floor was found, otherwise false.
//
// Floor node is defined as the largest node that is smaller than or equal to the given node.
// A floor node may not be found, either because the tree is empty, or because
// all nodes in the tree are larger than the given node.
//
// Key should adhere to the comparator's type assertion, otherwise method panics.
func (tree *Tree) Floor(key *Key) (floor *Node, found bool) {
found = false
node := tree.Root
for node != nil {
compare := key.Compare(&node.Key)
switch {
case compare == 0:
return node, true
case compare < 0:
node = node.Left
case compare > 0:
floor, found = node, true
node = node.Right
}
}
if found {
return floor, true
}
return nil, false
}
// Ceiling finds the ceiling node of the input key, returning the ceiling node or nil if no ceiling is found.
// Second return parameter is true if ceiling was found, otherwise false.
//
// Ceiling node is defined as the smallest node that is larger than or equal to the given node.
// A ceiling node may not be found, either because the tree is empty, or because
// all nodes in the tree are smaller than the given node.
//
// Key should adhere to the comparator's type assertion, otherwise method panics.
func (tree *Tree) Ceiling(key *Key) (ceiling *Node, found bool) {
found = false
node := tree.Root
for node != nil {
compare := key.Compare(&node.Key)
switch {
case compare == 0:
return node, true
case compare < 0:
ceiling, found = node, true
node = node.Left
case compare > 0:
node = node.Right
}
}
if found {
return ceiling, true
}
return nil, false
}
// Clear removes all nodes from the tree.
func (tree *Tree) Clear() {
tree.Root = nil
tree.size = 0
}
// String returns a string representation of container
func (tree *Tree) String() string {
str := "RedBlackTree\n"
if !tree.Empty() {
output(tree.Root, "", true, &str)
}
return str
}
func (node *Node) String() string {
return fmt.Sprintf("%v", node.Key)
}
func output(node *Node, prefix string, isTail bool, str *string) {
if node.Right != nil {
newPrefix := prefix
if isTail {
newPrefix += "│ "
} else {
newPrefix += " "
}
output(node.Right, newPrefix, false, str)
}
*str += prefix
if isTail {
*str += "└── "
} else {
*str += "┌── "
}
*str += node.String() + "\n"
if node.Left != nil {
newPrefix := prefix
if isTail {
newPrefix += " "
} else {
newPrefix += "│ "
}
output(node.Left, newPrefix, true, str)
}
}
func (tree *Tree) lookup(key *Key) *Node {
node := tree.Root
for node != nil {
compare := key.Compare(&node.Key)
switch {
case compare == 0:
return node
case compare < 0:
node = node.Left
case compare > 0:
node = node.Right
}
}
return nil
}
func (node *Node) grandparent() *Node {
if node != nil && node.Parent != nil {
return node.Parent.Parent
}
return nil
}
func (node *Node) uncle() *Node {
if node == nil || node.Parent == nil || node.Parent.Parent == nil {
return nil
}
return node.Parent.sibling()
}
func (node *Node) sibling() *Node {
if node == nil || node.Parent == nil {
return nil
}
if node == node.Parent.Left {
return node.Parent.Right
}
return node.Parent.Left
}
func (tree *Tree) rotateLeft(node *Node) {
right := node.Right
tree.replaceNode(node, right)
node.Right = right.Left
if right.Left != nil {
right.Left.Parent = node
}
right.Left = node
node.Parent = right
}
func (tree *Tree) rotateRight(node *Node) {
left := node.Left
tree.replaceNode(node, left)
node.Left = left.Right
if left.Right != nil {
left.Right.Parent = node
}
left.Right = node
node.Parent = left
}
func (tree *Tree) replaceNode(old *Node, new *Node) {
if old.Parent == nil {
tree.Root = new
} else {
if old == old.Parent.Left {
old.Parent.Left = new
} else {
old.Parent.Right = new
}
}
if new != nil {
new.Parent = old.Parent
}
}
func (tree *Tree) insertCase1(node *Node) {
if node.Parent == nil {
node.color = black
} else {
tree.insertCase2(node)
}
}
func (tree *Tree) insertCase2(node *Node) {
if nodeColor(node.Parent) == black {
return
}
tree.insertCase3(node)
}
func (tree *Tree) insertCase3(node *Node) {
uncle := node.uncle()
if nodeColor(uncle) == red {
node.Parent.color = black
uncle.color = black
node.grandparent().color = red
tree.insertCase1(node.grandparent())
} else {
tree.insertCase4(node)
}
}
func (tree *Tree) insertCase4(node *Node) {
grandparent := node.grandparent()
if node == node.Parent.Right && node.Parent == grandparent.Left {
tree.rotateLeft(node.Parent)
node = node.Left
} else if node == node.Parent.Left && node.Parent == grandparent.Right {
tree.rotateRight(node.Parent)
node = node.Right
}
tree.insertCase5(node)
}
func (tree *Tree) insertCase5(node *Node) {
node.Parent.color = black
grandparent := node.grandparent()
grandparent.color = red
if node == node.Parent.Left && node.Parent == grandparent.Left {
tree.rotateRight(grandparent)
} else if node == node.Parent.Right && node.Parent == grandparent.Right {
tree.rotateLeft(grandparent)
}
}
func (node *Node) maximumNode() *Node {
if node == nil {
return nil
}
for node.Right != nil {
node = node.Right
}
return node
}
func (tree *Tree) deleteCase1(node *Node) {
if node.Parent == nil {
return
}
tree.deleteCase2(node)
}
func (tree *Tree) deleteCase2(node *Node) {
sibling := node.sibling()
if nodeColor(sibling) == red {
node.Parent.color = red
sibling.color = black
if node == node.Parent.Left {
tree.rotateLeft(node.Parent)
} else {
tree.rotateRight(node.Parent)
}
}
tree.deleteCase3(node)
}
func (tree *Tree) deleteCase3(node *Node) {
sibling := node.sibling()
if nodeColor(node.Parent) == black &&
nodeColor(sibling) == black &&
nodeColor(sibling.Left) == black &&
nodeColor(sibling.Right) == black {
sibling.color = red
tree.deleteCase1(node.Parent)
} else {
tree.deleteCase4(node)
}
}
func (tree *Tree) deleteCase4(node *Node) {
sibling := node.sibling()
if nodeColor(node.Parent) == red &&
nodeColor(sibling) == black &&
nodeColor(sibling.Left) == black &&
nodeColor(sibling.Right) == black {
sibling.color = red
node.Parent.color = black
} else {
tree.deleteCase5(node)
}
}
func (tree *Tree) deleteCase5(node *Node) {
sibling := node.sibling()
if node == node.Parent.Left &&
nodeColor(sibling) == black &&
nodeColor(sibling.Left) == red &&
nodeColor(sibling.Right) == black {
sibling.color = red
sibling.Left.color = black
tree.rotateRight(sibling)
} else if node == node.Parent.Right &&
nodeColor(sibling) == black &&
nodeColor(sibling.Right) == red &&
nodeColor(sibling.Left) == black {
sibling.color = red
sibling.Right.color = black
tree.rotateLeft(sibling)
}
tree.deleteCase6(node)
}
func (tree *Tree) deleteCase6(node *Node) {
sibling := node.sibling()
sibling.color = nodeColor(node.Parent)
node.Parent.color = black
if node == node.Parent.Left && nodeColor(sibling.Right) == red {
sibling.Right.color = black
tree.rotateLeft(node.Parent)
} else if nodeColor(sibling.Left) == red {
sibling.Left.color = black
tree.rotateRight(node.Parent)
}
}
func nodeColor(node *Node) color {
if node == nil {
return black
}
return node.color
}
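
A short usage sketch for the tree above (import paths assumed from the Dockerfile): the crawler stores fixed-size directory hashes, such as the ones `HashDir` produces, and looks them up to skip listings it has already seen. The URL string hashed here is purely illustrative:

```go
package main

import (
	"fmt"

	"github.com/terorie/od-database-crawler/ds/redblackhash"
	"golang.org/x/crypto/blake2b"
)

func main() {
	var tree redblackhash.Tree // zero value is an empty tree
	var key redblackhash.Key   // 64-byte fixed-size key

	// Copy a BLAKE2b sum into the key, as HashDir does;
	// the unused trailing bytes stay zero.
	sum := blake2b.Sum256([]byte("https://example.org/pub/"))
	copy(key[:], sum[:])

	tree.Put(&key)
	fmt.Println(tree.Get(&key)) // true: this listing was seen before
}
```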

errors.go

@@ -1,8 +1,45 @@
package main
import "errors"
import (
"errors"
"fmt"
"github.com/valyala/fasthttp"
"net"
)
var ErrRateLimit = errors.New("too many requests")
var ErrForbidden = errors.New("access denied")
var ErrKnown = errors.New("already crawled")
type HttpError struct {
code int
}
func (e HttpError) Error() string {
return fmt.Sprintf("http status %d", e.code)
}
func shouldRetry(err error) bool {
// HTTP errors
if httpErr, ok := err.(*HttpError); ok {
switch httpErr.code {
case fasthttp.StatusTooManyRequests:
return true
default:
// Don't retry HTTP error codes
return false
}
}
if dnsError, ok := err.(*net.DNSError); ok {
// Don't retry permanent DNS errors
return dnsError.IsTemporary
}
if netErr, ok := err.(*net.OpError); ok {
// Don't retry permanent network errors
return netErr.Temporary()
}
// Retry by default
return true
}
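
A small sketch of how `shouldRetry` classifies failures, assuming it compiles in the same package as `errors.go` so `HttpError` and `shouldRetry` are in scope:

```go
// classify.go (same package as errors.go; for illustration only)
package main

import (
	"fmt"
	"net"

	"github.com/valyala/fasthttp"
)

func printRetryDecisions() {
	fmt.Println(shouldRetry(&HttpError{fasthttp.StatusTooManyRequests})) // true: rate limits are temporary
	fmt.Println(shouldRetry(&HttpError{fasthttp.StatusNotFound}))        // false: other HTTP codes are permanent
	fmt.Println(shouldRetry(&net.DNSError{IsTemporary: true}))           // true: transient DNS failure
	fmt.Println(shouldRetry(fmt.Errorf("connection reset")))             // true: retry by default
}
```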

869
fasturl/url.go Normal file

@@ -0,0 +1,869 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package fasturl parses URLs and implements query escaping.
package fasturl
// Modifications by terorie
// See RFC 3986. This package generally follows RFC 3986, except where
// it deviates for compatibility reasons. When sending changes, first
// search old issues for history on decisions. Unit tests should also
// contain references to issue numbers with details.
import (
"errors"
"fmt"
"strconv"
"strings"
)
type Scheme int
const (
SchemeInvalid = iota
SchemeHTTP
SchemeHTTPS
SchemeCount
)
var Schemes = [SchemeCount]string {
"",
"http",
"https",
}
var ErrUnknownScheme = errors.New("unknown protocol scheme")
// Error reports an error and the operation and URL that caused it.
type Error struct {
Op string
URL string
Err error
}
func (e *Error) Error() string { return e.Op + " " + e.URL + ": " + e.Err.Error() }
type timeout interface {
Timeout() bool
}
func (e *Error) Timeout() bool {
t, ok := e.Err.(timeout)
return ok && t.Timeout()
}
type temporary interface {
Temporary() bool
}
func (e *Error) Temporary() bool {
t, ok := e.Err.(temporary)
return ok && t.Temporary()
}
func ishex(c byte) bool {
switch {
case '0' <= c && c <= '9':
return true
case 'a' <= c && c <= 'f':
return true
case 'A' <= c && c <= 'F':
return true
}
return false
}
func unhex(c byte) byte {
switch {
case '0' <= c && c <= '9':
return c - '0'
case 'a' <= c && c <= 'f':
return c - 'a' + 10
case 'A' <= c && c <= 'F':
return c - 'A' + 10
}
return 0
}
type encoding int
const (
encodePath encoding = 1 + iota
encodePathSegment
encodeHost
encodeZone
encodeUserPassword
encodeQueryComponent
encodeFragment
)
type EscapeError string
func (e EscapeError) Error() string {
return "invalid URL escape " + strconv.Quote(string(e))
}
type InvalidHostError string
func (e InvalidHostError) Error() string {
return "invalid character " + strconv.Quote(string(e)) + " in host name"
}
// Return true if the specified character should be escaped when
// appearing in a URL string, according to RFC 3986.
//
// Please be informed that for now shouldEscape does not check all
// reserved characters correctly. See golang.org/issue/5684.
func shouldEscape(c byte, mode encoding) bool {
// §2.3 Unreserved characters (alphanum)
if 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' {
return false
}
if mode == encodeHost || mode == encodeZone {
// §3.2.2 Host allows
// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
// as part of reg-name.
// We add : because we include :port as part of host.
// We add [ ] because we include [ipv6]:port as part of host.
// We add < > because they're the only characters left that
// we could possibly allow, and Parse will reject them if we
// escape them (because hosts can't use %-encoding for
// ASCII bytes).
switch c {
case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '[', ']', '<', '>', '"':
return false
}
}
switch c {
case '-', '_', '.', '~': // §2.3 Unreserved characters (mark)
return false
case '$', '&', '+', ',', '/', ':', ';', '=', '?', '@': // §2.2 Reserved characters (reserved)
// Different sections of the URL allow a few of
// the reserved characters to appear unescaped.
switch mode {
case encodePath: // §3.3
// The RFC allows : @ & = + $ but saves / ; , for assigning
// meaning to individual path segments. This package
// only manipulates the path as a whole, so we allow those
// last three as well. That leaves only ? to escape.
return c == '?'
case encodePathSegment: // §3.3
// The RFC allows : @ & = + $ but saves / ; , for assigning
// meaning to individual path segments.
return c == '/' || c == ';' || c == ',' || c == '?'
case encodeUserPassword: // §3.2.1
// The RFC allows ';', ':', '&', '=', '+', '$', and ',' in
// userinfo, so we must escape only '@', '/', and '?'.
// The parsing of userinfo treats ':' as special so we must escape
// that too.
return c == '@' || c == '/' || c == '?' || c == ':'
case encodeQueryComponent: // §3.4
// The RFC reserves (so we must escape) everything.
return true
case encodeFragment: // §4.1
// The RFC text is silent but the grammar allows
// everything, so escape nothing.
return false
}
}
if mode == encodeFragment {
// RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are
// included in reserved from RFC 2396 §2.2. The remaining sub-delims do not
// need to be escaped. To minimize potential breakage, we apply two restrictions:
// (1) we always escape sub-delims outside of the fragment, and (2) we always
// escape single quote to avoid breaking callers that had previously assumed that
// single quotes would be escaped. See issue #19917.
switch c {
case '!', '(', ')', '*':
return false
}
}
// Everything else must be escaped.
return true
}
// unescape unescapes a string; the mode specifies
// which section of the URL string is being unescaped.
func unescape(s string, mode encoding) (string, error) {
// Count %, check that they're well-formed.
n := 0
hasPlus := false
for i := 0; i < len(s); {
switch s[i] {
case '%':
n++
if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
s = s[i:]
if len(s) > 3 {
s = s[:3]
}
return "", EscapeError(s)
}
// Per https://tools.ietf.org/html/rfc3986#page-21
// in the host component %-encoding can only be used
// for non-ASCII bytes.
// But https://tools.ietf.org/html/rfc6874#section-2
// introduces %25 being allowed to escape a percent sign
// in IPv6 scoped-address literals. Yay.
if mode == encodeHost && unhex(s[i+1]) < 8 && s[i:i+3] != "%25" {
return "", EscapeError(s[i : i+3])
}
if mode == encodeZone {
// RFC 6874 says basically "anything goes" for zone identifiers
// and that even non-ASCII can be redundantly escaped,
// but it seems prudent to restrict %-escaped bytes here to those
// that are valid host name bytes in their unescaped form.
// That is, you can use escaping in the zone identifier but not
// to introduce bytes you couldn't just write directly.
// But Windows puts spaces here! Yay.
v := unhex(s[i+1])<<4 | unhex(s[i+2])
if s[i:i+3] != "%25" && v != ' ' && shouldEscape(v, encodeHost) {
return "", EscapeError(s[i : i+3])
}
}
i += 3
case '+':
hasPlus = mode == encodeQueryComponent
i++
default:
if (mode == encodeHost || mode == encodeZone) && s[i] < 0x80 && shouldEscape(s[i], mode) {
return "", InvalidHostError(s[i : i+1])
}
i++
}
}
if n == 0 && !hasPlus {
return s, nil
}
t := make([]byte, len(s)-2*n)
j := 0
for i := 0; i < len(s); {
switch s[i] {
case '%':
t[j] = unhex(s[i+1])<<4 | unhex(s[i+2])
j++
i += 3
case '+':
if mode == encodeQueryComponent {
t[j] = ' '
} else {
t[j] = '+'
}
j++
i++
default:
t[j] = s[i]
j++
i++
}
}
return string(t), nil
}
func escape(s string, mode encoding) string {
spaceCount, hexCount := 0, 0
for i := 0; i < len(s); i++ {
c := s[i]
if shouldEscape(c, mode) {
if c == ' ' && mode == encodeQueryComponent {
spaceCount++
} else {
hexCount++
}
}
}
if spaceCount == 0 && hexCount == 0 {
return s
}
t := make([]byte, len(s)+2*hexCount)
j := 0
for i := 0; i < len(s); i++ {
switch c := s[i]; {
case c == ' ' && mode == encodeQueryComponent:
t[j] = '+'
j++
case shouldEscape(c, mode):
t[j] = '%'
t[j+1] = "0123456789ABCDEF"[c>>4]
t[j+2] = "0123456789ABCDEF"[c&15]
j += 3
default:
t[j] = s[i]
j++
}
}
return string(t)
}
// A URL represents a parsed URL (technically, a URI reference).
//
// The general form represented is:
//
// [scheme:][//[userinfo@]host][/]path[?query][#fragment]
//
// URLs that do not start with a slash after the scheme are interpreted as:
//
// scheme:opaque[?query][#fragment]
//
// Note that the Path field is stored in decoded form: /%47%6f%2f becomes /Go/.
// A consequence is that it is impossible to tell which slashes in the Path were
// slashes in the raw URL and which were %2f. This distinction is rarely important,
// but when it is, code must not use Path directly.
// The Parse function sets both Path and RawPath in the URL it returns,
// and URL's String method uses RawPath if it is a valid encoding of Path,
// by calling the EscapedPath method.
type URL struct {
Scheme Scheme
Host string // host or host:port
Path string // path (relative paths may omit leading slash)
}
// Maybe rawurl is of the form scheme:path.
// (Scheme must be [a-zA-Z][a-zA-Z0-9+-.]*)
// If so, return scheme, path; else return "", rawurl.
func getscheme(rawurl string) (scheme Scheme, path string, err error) {
for i := 0; i < len(rawurl); i++ {
c := rawurl[i]
switch {
case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
// do nothing
case '0' <= c && c <= '9' || c == '+' || c == '-' || c == '.':
if i == 0 {
return SchemeInvalid, rawurl, nil
}
case c == ':':
if i == 0 {
return SchemeInvalid, "", errors.New("missing protocol scheme")
}
switch rawurl[:i] {
case "http":
scheme = SchemeHTTP
case "https":
scheme = SchemeHTTPS
default:
return SchemeInvalid, "", ErrUnknownScheme
}
path = rawurl[i+1:]
return
default:
// we have encountered an invalid character,
// so there is no valid scheme
return SchemeInvalid, rawurl, nil
}
}
return SchemeInvalid, rawurl, nil
}
// Maybe s is of the form t c u.
// If so, return t, c u (or t, u if cutc == true).
// If not, return s, "".
func split(s string, c string, cutc bool) (string, string) {
i := strings.Index(s, c)
if i < 0 {
return s, ""
}
if cutc {
return s[:i], s[i+len(c):]
}
return s[:i], s[i:]
}
// Parse parses rawurl into a URL structure.
//
// The rawurl may be relative (a path, without a host) or absolute
// (starting with a scheme). Trying to parse a hostname and path
// without a scheme is invalid but may not necessarily return an
// error, due to parsing ambiguities.
func (u *URL) Parse(rawurl string) error {
// Cut off #frag
s, frag := split(rawurl, "#", true)
err := u.parse(s, false)
if err != nil {
return &Error{"parse", s, err}
}
if frag == "" {
return nil
}
return nil
}
// ParseRequestURI parses rawurl into a URL structure. It assumes that
// rawurl was received in an HTTP request, so the rawurl is interpreted
// only as an absolute URI or an absolute path.
// The string rawurl is assumed not to have a #fragment suffix.
// (Web browsers strip #fragment before sending the URL to a web server.)
func (u *URL) ParseRequestURI(rawurl string) error {
err := u.parse(rawurl, true)
if err != nil {
return &Error{"parse", rawurl, err}
}
return nil
}
// parse parses a URL from a string in one of two contexts. If
// viaRequest is true, the URL is assumed to have arrived via an HTTP request,
// in which case only absolute URLs or path-absolute relative URLs are allowed.
// If viaRequest is false, all forms of relative URLs are allowed.
func (u *URL) parse(rawurl string, viaRequest bool) error {
var rest string
var err error
if rawurl == "" && viaRequest {
return errors.New("empty url")
}
if rawurl == "*" {
u.Path = "*"
return nil
}
// Split off possible leading "http:", "mailto:", etc.
// Cannot contain escaped characters.
if u.Scheme, rest, err = getscheme(rawurl); err != nil {
return err
}
if strings.HasSuffix(rest, "?") && strings.Count(rest, "?") == 1 {
rest = rest[:len(rest)-1]
} else {
rest, _ = split(rest, "?", true)
}
if !strings.HasPrefix(rest, "/") {
if u.Scheme != SchemeInvalid {
// We consider rootless paths per RFC 3986 as opaque.
return nil
}
if viaRequest {
return errors.New("invalid URI for request")
}
// Avoid confusion with malformed schemes, like cache_object:foo/bar.
// See golang.org/issue/16822.
//
// RFC 3986, §3.3:
// In addition, a URI reference (Section 4.1) may be a relative-path reference,
// in which case the first path segment cannot contain a colon (":") character.
colon := strings.Index(rest, ":")
slash := strings.Index(rest, "/")
if colon >= 0 && (slash < 0 || colon < slash) {
// First path segment has colon. Not allowed in relative URL.
return errors.New("first path segment in URL cannot contain colon")
}
}
if (u.Scheme != SchemeInvalid || !viaRequest && !strings.HasPrefix(rest, "///")) && strings.HasPrefix(rest, "//") {
var authority string
authority, rest = split(rest[2:], "/", false)
u.Host, err = parseAuthority(authority)
if err != nil {
return err
}
}
u.Path = rest
return nil
}
func parseAuthority(authority string) (host string, err error) {
i := strings.LastIndex(authority, "@")
if i < 0 {
host, err = parseHost(authority)
} else {
host, err = parseHost(authority[i+1:])
}
if err != nil {
return "", err
}
if i < 0 {
return host, nil
}
userinfo := authority[:i]
if !validUserinfo(userinfo) {
return "", errors.New("fasturl: invalid userinfo")
}
return host, nil
}
// parseHost parses host as an authority without user
// information. That is, as host[:port].
func parseHost(host string) (string, error) {
if strings.HasPrefix(host, "[") {
// Parse an IP-Literal in RFC 3986 and RFC 6874.
// E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80".
i := strings.LastIndex(host, "]")
if i < 0 {
return "", errors.New("missing ']' in host")
}
colonPort := host[i+1:]
if !validOptionalPort(colonPort) {
return "", fmt.Errorf("invalid port %q after host", colonPort)
}
// RFC 6874 defines that %25 (%-encoded percent) introduces
// the zone identifier, and the zone identifier can use basically
// any %-encoding it likes. That's different from the host, which
// can only %-encode non-ASCII bytes.
// We do impose some restrictions on the zone, to avoid stupidity
// like newlines.
zone := strings.Index(host[:i], "%25")
if zone >= 0 {
host1, err := unescape(host[:zone], encodeHost)
if err != nil {
return "", err
}
host2, err := unescape(host[zone:i], encodeZone)
if err != nil {
return "", err
}
host3, err := unescape(host[i:], encodeHost)
if err != nil {
return "", err
}
return host1 + host2 + host3, nil
}
}
var err error
if host, err = unescape(host, encodeHost); err != nil {
return "", err
}
return host, nil
}
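// Example (sketch, mirroring the RFC 6874 tests below):
// parseHost("[fe80::1%25en0]:8080") -> "[fe80::1%en0]:8080"
// (the %25 that introduces the zone identifier is decoded to %).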
// validOptionalPort reports whether port is either an empty string
// or matches /^:\d*$/
func validOptionalPort(port string) bool {
if port == "" {
return true
}
if port[0] != ':' {
return false
}
for _, b := range port[1:] {
if b < '0' || b > '9' {
return false
}
}
return true
}
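// Example (sketch): validOptionalPort("") and validOptionalPort(":80")
// are true, as is the bare colon ":"; "80" and ":8a" are false.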
// String reassembles the URL into a valid URL string.
// The general form of the result is:
//
// scheme://host/path
//
// For this trimmed-down URL type the following rules apply:
// - if u.Scheme is SchemeInvalid, the scheme: prefix is omitted.
// - the // separator is written when u.Host is non-empty, or when a
// scheme is present together with a non-empty path.
// - if u.Host is non-empty and u.Path does not begin with a /,
// String inserts the missing /.
// - a relative path whose first segment contains a colon is prefixed
// with "./" so it cannot be mistaken for a scheme (RFC 3986 §4.2).
func (u *URL) String() string {
var buf strings.Builder
if u.Scheme != SchemeInvalid {
buf.WriteString(Schemes[u.Scheme])
buf.WriteByte(':')
}
if u.Scheme != SchemeInvalid || u.Host != "" {
if u.Host != "" || u.Path != "" {
buf.WriteString("//")
}
if h := u.Host; h != "" {
buf.WriteString(escape(h, encodeHost))
}
}
path := u.Path
if path != "" && path[0] != '/' && u.Host != "" {
buf.WriteByte('/')
}
if buf.Len() == 0 {
// RFC 3986 §4.2
// A path segment that contains a colon character (e.g., "this:that")
// cannot be used as the first segment of a relative-path reference, as
// it would be mistaken for a scheme name. Such a segment must be
// preceded by a dot-segment (e.g., "./this:that") to make a relative-
// path reference.
if i := strings.IndexByte(path, ':'); i > -1 && strings.IndexByte(path[:i], '/') == -1 {
buf.WriteString("./")
}
}
buf.WriteString(path)
return buf.String()
}
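// Example (sketch, see stringURLTests below): a relative URL whose first
// segment contains a colon gets the "./" guard:
// (&URL{Path: "this:that"}).String() == "./this:that"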
func isRunesDot(r []rune) bool {
return len(r) == 1 && r[0] == '.'
}
func isRunesDoubleDot(r []rune) bool {
return len(r) == 2 && r[0] == '.' && r[1] == '.'
}
// resolvePath applies special path segments from refs and applies
// them to base, per RFC 3986.
func resolvePath(base, ref string) string {
var full string
if ref == "" {
full = base
} else if ref[0] != '/' {
i := strings.LastIndex(base, "/")
full = base[:i+1] + ref
} else {
full = ref
}
if full == "" {
return ""
} else if full == "/" {
return "/"
}
dst := make([]rune, 0, len(full))
start := 0
rs := []rune(full)
if len(rs) != 0 && rs[0] == '/' {
rs = rs[1:]
}
stack := []int{0}
for i, c := range rs {
if i == len(rs)-1 {
closingSlash := false
part := rs[start:]
if len(part) == 0 {
dst = append(dst, '/')
} else if part[len(part)-1] == '/' {
part = part[:len(part)-1]
closingSlash = true
}
switch {
case isRunesDot(part):
dst = append(dst, '/')
case isRunesDoubleDot(part):
// Cut to the last slash
start = i+1
dst = dst[:stack[len(stack)-1]]
if len(stack) != 1 {
stack = stack[:len(stack)-1]
}
dst = append(dst, '/')
default:
dst = append(dst, '/')
dst = append(dst, part...)
}
if closingSlash && len(dst) != 0 && dst[len(dst)-1] != '/' {
dst = append(dst, '/')
}
} else if c == '/' {
part := rs[start:i]
switch {
case isRunesDot(part):
start = i+1
case isRunesDoubleDot(part):
// Cut to the last slash
start = i+1
dst = dst[:stack[len(stack)-1]]
if len(stack) != 1 {
stack = stack[:len(stack)-1]
}
default:
start = i+1
stack = append(stack, len(dst))
dst = append(dst, '/')
dst = append(dst, part...)
}
}
}
return string(dst)
/*var dst []string
src := strings.Split(full, "/")
for _, elem := range src {
switch elem {
case ".":
// drop
case "..":
if len(dst) > 0 {
dst = dst[:len(dst)-1]
}
default:
dst = append(dst, elem)
}
}
if last := src[len(src)-1]; last == "." || last == ".." {
// Add final slash to the joined path.
dst = append(dst, "")
}
return "/" + strings.TrimPrefix(strings.Join(dst, "/"), "/")*/
}
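// Example (sketch, taken from resolvePathTests below):
// resolvePath("a/b/c", "../d") == "/a/d"
// resolvePath("a/b", ".")      == "/a/"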
// IsAbs reports whether the URL is absolute.
// Absolute means that it has a non-empty scheme.
func (u *URL) IsAbs() bool {
return u.Scheme != SchemeInvalid
}
// ParseRel parses a URL in the context of the receiver. The provided URL
// may be relative or absolute. ParseRel returns an error on parse
// failure; otherwise it stores in out the same result as ResolveReference.
func (u *URL) ParseRel(out *URL, ref string) error {
var refurl URL
err := refurl.Parse(ref)
if err != nil {
return err
}
u.ResolveReference(out, &refurl)
return nil
}
// ResolveReference resolves a URI reference to an absolute URI from
// an absolute base URI u, per RFC 3986 Section 5.2. The URI reference
// may be relative or absolute. ResolveReference always returns a new
// URL instance, even if the returned URL is identical to either the
// base or reference. If ref is an absolute URL, then ResolveReference
// ignores base and returns a copy of ref.
func (u *URL) ResolveReference(url *URL, ref *URL) {
*url = *ref
if ref.Scheme == SchemeInvalid {
url.Scheme = u.Scheme
}
if ref.Scheme != SchemeInvalid || ref.Host != "" {
// The "absoluteURI" or "net_path" cases.
// The reference already carries its own scheme/host; we just clean
// its path of dot-segments via resolvePath.
url.Path = resolvePath(ref.Path, "")
return
}
// The "abs_path" or "rel_path" cases.
url.Host = u.Host
url.Path = resolvePath(u.Path, ref.Path)
return
}
// Marshaling interface implementations.
// Would like to implement MarshalText/UnmarshalText but that will change the JSON representation of URLs.
func (u *URL) MarshalBinary() (text []byte, err error) {
return []byte(u.String()), nil
}
func (u *URL) UnmarshalBinary(text []byte) error {
var u1 URL
err := u1.Parse(string(text))
if err != nil {
return err
}
*u = u1
return nil
}
// validUserinfo reports whether s is a valid userinfo string per RFC 3986
// Section 3.2.1:
// userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
// unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
// sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
// / "*" / "+" / "," / ";" / "="
//
// It doesn't validate pct-encoded. The caller does that via func unescape.
func validUserinfo(s string) bool {
for _, r := range s {
if 'A' <= r && r <= 'Z' {
continue
}
if 'a' <= r && r <= 'z' {
continue
}
if '0' <= r && r <= '9' {
continue
}
switch r {
case '-', '.', '_', ':', '~', '!', '$', '&', '\'',
'(', ')', '*', '+', ',', ';', '=', '%', '@':
continue
default:
return false
}
}
return true
}
func PathUnescape(s string) string {
newStr, err := pathUnescape(s)
if err != nil {
return s
}
return newStr
}
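// Example (sketch): PathUnescape("a%20b") == "a b"; malformed input such
// as "a%2" is returned unchanged rather than producing an error.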
func pathUnescape(s string) (string, error) {
// Count %, check that they're well-formed.
n := 0
for i := 0; i < len(s); {
switch s[i] {
case '%':
n++
if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
s = s[i:]
if len(s) > 3 {
s = s[:3]
}
return "", EscapeError(s)
}
i += 3
default:
i++
}
}
if n == 0 {
return s, nil
}
t := make([]byte, len(s)-2*n)
j := 0
for i := 0; i < len(s); {
switch s[i] {
case '%':
t[j] = unhex(s[i+1])<<4 | unhex(s[i+2])
j++
i += 3
case '+':
t[j] = '+'
j++
i++
default:
t[j] = s[i]
j++
i++
}
}
return string(t), nil
}
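As a quick orientation before the tests, here is a minimal usage sketch of the API above. It assumes the import path used elsewhere in this commit (github.com/terorie/od-database-crawler/fasturl); the printed result follows from the resolvePath rules shown earlier.

    package main

    import (
        "fmt"

        "github.com/terorie/od-database-crawler/fasturl"
    )

    func main() {
        var base fasturl.URL
        if err := base.Parse("http://example.com/pub/"); err != nil {
            panic(err)
        }
        // Resolve a relative reference against the base.
        var ref fasturl.URL
        if err := base.ParseRel(&ref, "../files/a%20b.txt"); err != nil {
            panic(err)
        }
        fmt.Println(ref.String()) // http://example.com/files/a%20b.txt
    }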

fasturl/url_test.go Normal file

@@ -0,0 +1,897 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package fasturl
import (
"bytes"
encodingPkg "encoding"
"encoding/gob"
"encoding/json"
"fmt"
"io"
"net"
"reflect"
"testing"
)
type URLTest struct {
in string
out *URL // expected parse; RawPath="" means same as Path
roundtrip string // expected result of reserializing the URL; empty means same as "in".
}
var urltests = []URLTest{
// no path
{
"http://www.google.com",
&URL{
Scheme: SchemeHTTP,
Host: "www.google.com",
},
"",
},
// path
{
"http://www.google.com/",
&URL{
Scheme: SchemeHTTP,
Host: "www.google.com",
Path: "/",
},
"",
},
// %20 outside query
{
"http://www.google.com/a%20b",
&URL{
Scheme: SchemeHTTP,
Host: "www.google.com",
Path: "/a%20b",
},
"",
},
// leading // without scheme should create an authority
{
"//foo",
&URL{
Host: "foo",
},
"",
},
// Three leading slashes isn't an authority, but doesn't return an error.
// (We can't return an error, as this code is also used via
// ServeHTTP -> ReadRequest -> Parse, which is arguably a
// different URL parsing context, but currently shares the
// same codepath)
{
"///threeslashes",
&URL{
Path: "///threeslashes",
},
"",
},
// unescaped @ in username should not confuse host
{
"http://j@ne:password@google.com",
&URL{
Scheme: SchemeHTTP,
Host: "google.com",
},
"http://google.com",
},
// unescaped @ in password should not confuse host
{
"http://jane:p@ssword@google.com",
&URL{
Scheme: SchemeHTTP,
Host: "google.com",
},
"http://google.com",
},
// Relative path
{
"a/b/c",
&URL{
Path: "a/b/c",
},
"a/b/c",
},
// host subcomponent; IPv4 address in RFC 3986
{
"http://192.168.0.1/",
&URL{
Scheme: SchemeHTTP,
Host: "192.168.0.1",
Path: "/",
},
"",
},
// host and port subcomponents; IPv4 address in RFC 3986
{
"http://192.168.0.1:8080/",
&URL{
Scheme: SchemeHTTP,
Host: "192.168.0.1:8080",
Path: "/",
},
"",
},
// host subcomponent; IPv6 address in RFC 3986
{
"http://[fe80::1]/",
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1]",
Path: "/",
},
"",
},
// host and port subcomponents; IPv6 address in RFC 3986
{
"http://[fe80::1]:8080/",
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1]:8080",
Path: "/",
},
"",
},
// host subcomponent; IPv6 address with zone identifier in RFC 6874
{
"http://[fe80::1%25en0]/", // alphanum zone identifier
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1%en0]",
Path: "/",
},
"",
},
// host and port subcomponents; IPv6 address with zone identifier in RFC 6874
{
"http://[fe80::1%25en0]:8080/", // alphanum zone identifier
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1%en0]:8080",
Path: "/",
},
"",
},
// host subcomponent; IPv6 address with zone identifier in RFC 6874
{
"http://[fe80::1%25%65%6e%301-._~]/", // percent-encoded+unreserved zone identifier
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1%en01-._~]",
Path: "/",
},
"http://[fe80::1%25en01-._~]/",
},
// host and port subcomponents; IPv6 address with zone identifier in RFC 6874
{
"http://[fe80::1%25%65%6e%301-._~]:8080/", // percent-encoded+unreserved zone identifier
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1%en01-._~]:8080",
Path: "/",
},
"http://[fe80::1%25en01-._~]:8080/",
},
// golang.org/issue/12200 (colon with empty port)
{
"http://192.168.0.2:8080/foo",
&URL{
Scheme: SchemeHTTP,
Host: "192.168.0.2:8080",
Path: "/foo",
},
"",
},
{
"http://192.168.0.2:/foo",
&URL{
Scheme: SchemeHTTP,
Host: "192.168.0.2:",
Path: "/foo",
},
"",
},
{
// Malformed IPv6 but still accepted.
"http://2b01:e34:ef40:7730:8e70:5aff:fefe:edac:8080/foo",
&URL{
Scheme: SchemeHTTP,
Host: "2b01:e34:ef40:7730:8e70:5aff:fefe:edac:8080",
Path: "/foo",
},
"",
},
{
// Malformed IPv6 but still accepted.
"http://2b01:e34:ef40:7730:8e70:5aff:fefe:edac:/foo",
&URL{
Scheme: SchemeHTTP,
Host: "2b01:e34:ef40:7730:8e70:5aff:fefe:edac:",
Path: "/foo",
},
"",
},
{
"http://[2b01:e34:ef40:7730:8e70:5aff:fefe:edac]:8080/foo",
&URL{
Scheme: SchemeHTTP,
Host: "[2b01:e34:ef40:7730:8e70:5aff:fefe:edac]:8080",
Path: "/foo",
},
"",
},
{
"http://[2b01:e34:ef40:7730:8e70:5aff:fefe:edac]:/foo",
&URL{
Scheme: SchemeHTTP,
Host: "[2b01:e34:ef40:7730:8e70:5aff:fefe:edac]:",
Path: "/foo",
},
"",
},
// golang.org/issue/7991 and golang.org/issue/12719 (non-ascii %-encoded in host)
{
"http://hello.世界.com/foo",
&URL{
Scheme: SchemeHTTP,
Host: "hello.世界.com",
Path: "/foo",
},
"http://hello.%E4%B8%96%E7%95%8C.com/foo",
},
{
"http://hello.%e4%b8%96%e7%95%8c.com/foo",
&URL{
Scheme: SchemeHTTP,
Host: "hello.世界.com",
Path: "/foo",
},
"http://hello.%E4%B8%96%E7%95%8C.com/foo",
},
{
"http://hello.%E4%B8%96%E7%95%8C.com/foo",
&URL{
Scheme: SchemeHTTP,
Host: "hello.世界.com",
Path: "/foo",
},
"",
},
// golang.org/issue/10433 (path beginning with //)
{
"http://example.com//foo",
&URL{
Scheme: SchemeHTTP,
Host: "example.com",
Path: "//foo",
},
"",
},
// test that we can reparse the host names we accept.
{
"http://authority<\"hi\">/foo",
&URL{
Scheme: SchemeHTTP,
Host: "authority<\"hi\">",
Path: "/foo",
},
"",
},
}
// more useful string for debugging than fmt's struct printer
func ufmt(u *URL) string {
return fmt.Sprintf("scheme=%q, host=%q, path=%q",
Schemes[u.Scheme], u.Host, u.Path)
}
func BenchmarkString(b *testing.B) {
b.StopTimer()
b.ReportAllocs()
for _, tt := range urltests {
var u URL
err := u.Parse(tt.in)
if err != nil {
b.Errorf("Parse(%q) returned error %s", tt.in, err)
continue
}
if tt.roundtrip == "" {
continue
}
b.StartTimer()
var g string
for i := 0; i < b.N; i++ {
g = u.String()
}
b.StopTimer()
if w := tt.roundtrip; b.N > 0 && g != w {
b.Errorf("Parse(%q).String() == %q, want %q", tt.in, g, w)
}
}
}
func TestParse(t *testing.T) {
for _, tt := range urltests {
var u URL
err := u.Parse(tt.in)
if err != nil {
t.Errorf("Parse(%q) returned error %v", tt.in, err)
continue
}
if !reflect.DeepEqual(&u, tt.out) {
t.Errorf("Parse(%q):\n\tgot %v\n\twant %v\n", tt.in, ufmt(&u), ufmt(tt.out))
}
}
}
const pathThatLooksSchemeRelative = "//not.a.user@not.a.host/just/a/path"
var parseRequestURLTests = []struct {
url string
expectedValid bool
}{
{"http://foo.com", true},
{"http://foo.com/", true},
{"http://foo.com/path", true},
{"/", true},
{pathThatLooksSchemeRelative, true},
{"//not.a.user@%66%6f%6f.com/just/a/path/also", true},
{"*", true},
{"http://192.168.0.1/", true},
{"http://192.168.0.1:8080/", true},
{"http://[fe80::1]/", true},
{"http://[fe80::1]:8080/", true},
// Tests exercising RFC 6874 compliance:
{"http://[fe80::1%25en0]/", true}, // with alphanum zone identifier
{"http://[fe80::1%25en0]:8080/", true}, // with alphanum zone identifier
{"http://[fe80::1%25%65%6e%301-._~]/", true}, // with percent-encoded+unreserved zone identifier
{"http://[fe80::1%25%65%6e%301-._~]:8080/", true}, // with percent-encoded+unreserved zone identifier
{"foo.html", false},
{"../dir/", false},
{"http://192.168.0.%31/", false},
{"http://192.168.0.%31:8080/", false},
{"http://[fe80::%31]/", false},
{"http://[fe80::%31]:8080/", false},
{"http://[fe80::%31%25en0]/", false},
{"http://[fe80::%31%25en0]:8080/", false},
// These two cases are valid as textual representations as
// described in RFC 4007, but are not valid as address
// literals with IPv6 zone identifiers in URIs as described in
// RFC 6874.
{"http://[fe80::1%en0]/", false},
{"http://[fe80::1%en0]:8080/", false},
}
func TestParseRequestURI(t *testing.T) {
for _, test := range parseRequestURLTests {
var u URL
err := u.ParseRequestURI(test.url)
if test.expectedValid && err != nil {
t.Errorf("ParseRequestURI(%q) gave err %v; want no error", test.url, err)
} else if !test.expectedValid && err == nil {
t.Errorf("ParseRequestURI(%q) gave nil error; want some error", test.url)
}
}
var url URL
err := url.ParseRequestURI(pathThatLooksSchemeRelative)
if err != nil {
t.Fatalf("Unexpected error %v", err)
}
if url.Path != pathThatLooksSchemeRelative {
t.Errorf("ParseRequestURI path:\ngot %q\nwant %q", url.Path, pathThatLooksSchemeRelative)
}
}
var stringURLTests = []struct {
url URL
want string
}{
// String() should prepend a slash when the path has no leading slash
{
url: URL{
Scheme: SchemeHTTP,
Host: "www.google.com",
Path: "search",
},
want: "http://www.google.com/search",
},
// Relative path with first element containing ":" should be prepended with "./", golang.org/issue/17184
{
url: URL{
Path: "this:that",
},
want: "./this:that",
},
// Relative path with second element containing ":" should not be prepended with "./"
{
url: URL{
Path: "here/this:that",
},
want: "here/this:that",
},
// Non-relative path with first element containing ":" should not be prepended with "./"
{
url: URL{
Scheme: SchemeHTTP,
Host: "www.google.com",
Path: "this:that",
},
want: "http://www.google.com/this:that",
},
}
func TestURLString(t *testing.T) {
for _, tt := range urltests {
var u URL
err := u.Parse(tt.in)
if err != nil {
t.Errorf("Parse(%q) returned error %s", tt.in, err)
continue
}
expected := tt.in
if tt.roundtrip != "" {
expected = tt.roundtrip
}
s := u.String()
if s != expected {
t.Errorf("Parse(%q).String() == %q (expected %q)", tt.in, s, expected)
}
}
for _, tt := range stringURLTests {
if got := tt.url.String(); got != tt.want {
t.Errorf("%+v.String() = %q; want %q", tt.url, got, tt.want)
}
}
}
var resolvePathTests = []struct {
base, ref, expected string
}{
{"a/b", ".", "/a/"},
{"a/b", "c", "/a/c"},
{"a/b", "..", "/"},
{"a/", "..", "/"},
{"a/", "../..", "/"},
{"a/b/c", "..", "/a/"},
{"a/b/c", "../d", "/a/d"},
{"a/b/c", ".././d", "/a/d"},
{"a/b", "./..", "/"},
{"a/./b", ".", "/a/"},
{"a/../", ".", "/"},
{"a/.././b", "c", "/c"},
}
func TestResolvePath(t *testing.T) {
for _, test := range resolvePathTests {
got := resolvePath(test.base, test.ref)
if got != test.expected {
t.Errorf("For %q + %q got %q; expected %q", test.base, test.ref, got, test.expected)
}
}
}
var resolveReferenceTests = []struct {
base, rel, expected string
}{
// Absolute URL references
{"http://foo.com?a=b", "https://bar.com/", "https://bar.com/"},
{"http://foo.com/", "https://bar.com/?a=b", "https://bar.com/"},
{"http://foo.com/", "https://bar.com/?", "https://bar.com/"},
// Path-absolute references
{"http://foo.com/bar", "/baz", "http://foo.com/baz"},
{"http://foo.com/bar?a=b#f", "/baz", "http://foo.com/baz"},
{"http://foo.com/bar?a=b", "/baz?", "http://foo.com/baz"},
{"http://foo.com/bar?a=b", "/baz?c=d", "http://foo.com/baz"},
// Multiple slashes
{"http://foo.com/bar", "http://foo.com//baz", "http://foo.com//baz"},
{"http://foo.com/bar", "http://foo.com///baz/quux", "http://foo.com///baz/quux"},
// Scheme-relative
{"https://foo.com/bar?a=b", "//bar.com/quux", "https://bar.com/quux"},
// Path-relative references:
// ... current directory
{"http://foo.com", ".", "http://foo.com/"},
{"http://foo.com/bar", ".", "http://foo.com/"},
{"http://foo.com/bar/", ".", "http://foo.com/bar/"},
// ... going down
{"http://foo.com", "bar", "http://foo.com/bar"},
{"http://foo.com/", "bar", "http://foo.com/bar"},
{"http://foo.com/bar/baz", "quux", "http://foo.com/bar/quux"},
// ... going up
{"http://foo.com/bar/baz", "../quux", "http://foo.com/quux"},
{"http://foo.com/bar/baz", "../../../../../quux", "http://foo.com/quux"},
{"http://foo.com/bar", "..", "http://foo.com/"},
{"http://foo.com/bar/baz", "./..", "http://foo.com/"},
// ".." in the middle (issue 3560)
{"http://foo.com/bar/baz", "quux/dotdot/../tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/../tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/.././tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/./../tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/dotdot/././../../tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/dotdot/./.././../tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/dotdot/dotdot/./../../.././././tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/../dotdot/../dot/./tail/..", "http://foo.com/bar/quux/dot/"},
// Remove any dot-segments prior to forming the target URI.
// http://tools.ietf.org/html/rfc3986#section-5.2.4
{"http://foo.com/dot/./dotdot/../foo/bar", "../baz", "http://foo.com/dot/baz"},
// Triple dot isn't special
{"http://foo.com/bar", "...", "http://foo.com/..."},
// Fragment
{"http://foo.com/bar", ".#frag", "http://foo.com/"},
{"http://example.org/", "#!$&%27()*+,;=", "http://example.org/"},
// Paths with escaping (issue 16947).
{"http://foo.com/foo%2fbar/", "../baz", "http://foo.com/baz"},
{"http://foo.com/1/2%2f/3%2f4/5", "../../a/b/c", "http://foo.com/1/a/b/c"},
{"http://foo.com/1/2/3", "./a%2f../../b/..%2fc", "http://foo.com/1/2/b/..%2fc"},
{"http://foo.com/1/2%2f/3%2f4/5", "./a%2f../b/../c", "http://foo.com/1/2%2f/3%2f4/a%2f../c"},
{"http://foo.com/foo%20bar/", "../baz", "http://foo.com/baz"},
{"http://foo.com/foo", "../bar%2fbaz", "http://foo.com/bar%2fbaz"},
{"http://foo.com/foo%2dbar/", "./baz-quux", "http://foo.com/foo%2dbar/baz-quux"},
// RFC 3986: Normal Examples
// http://tools.ietf.org/html/rfc3986#section-5.4.1
{"http://a/b/c/d;p?q", "g", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "./g", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "g/", "http://a/b/c/g/"},
{"http://a/b/c/d;p?q", "/g", "http://a/g"},
{"http://a/b/c/d;p?q", "//g", "http://g"},
{"http://a/b/c/d;p?q", "?y", "http://a/b/c/d;p"},
{"http://a/b/c/d;p?q", "g?y", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "#s", "http://a/b/c/d;p"},
{"http://a/b/c/d;p?q", "g#s", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "g?y#s", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", ";x", "http://a/b/c/;x"},
{"http://a/b/c/d;p?q", "g;x", "http://a/b/c/g;x"},
{"http://a/b/c/d;p?q", "g;x?y#s", "http://a/b/c/g;x"},
{"http://a/b/c/d;p?q", "", "http://a/b/c/d;p"},
{"http://a/b/c/d;p?q", ".", "http://a/b/c/"},
{"http://a/b/c/d;p?q", "./", "http://a/b/c/"},
{"http://a/b/c/d;p?q", "..", "http://a/b/"},
{"http://a/b/c/d;p?q", "../", "http://a/b/"},
{"http://a/b/c/d;p?q", "../g", "http://a/b/g"},
{"http://a/b/c/d;p?q", "../..", "http://a/"},
{"http://a/b/c/d;p?q", "../../", "http://a/"},
{"http://a/b/c/d;p?q", "../../g", "http://a/g"},
// RFC 3986: Abnormal Examples
// http://tools.ietf.org/html/rfc3986#section-5.4.2
{"http://a/b/c/d;p?q", "../../../g", "http://a/g"},
{"http://a/b/c/d;p?q", "../../../../g", "http://a/g"},
{"http://a/b/c/d;p?q", "/./g", "http://a/g"},
{"http://a/b/c/d;p?q", "/../g", "http://a/g"},
{"http://a/b/c/d;p?q", "g.", "http://a/b/c/g."},
{"http://a/b/c/d;p?q", ".g", "http://a/b/c/.g"},
{"http://a/b/c/d;p?q", "g..", "http://a/b/c/g.."},
{"http://a/b/c/d;p?q", "..g", "http://a/b/c/..g"},
{"http://a/b/c/d;p?q", "./../g", "http://a/b/g"},
{"http://a/b/c/d;p?q", "./g/.", "http://a/b/c/g/"},
{"http://a/b/c/d;p?q", "g/./h", "http://a/b/c/g/h"},
{"http://a/b/c/d;p?q", "g/../h", "http://a/b/c/h"},
{"http://a/b/c/d;p?q", "g;x=1/./y", "http://a/b/c/g;x=1/y"},
{"http://a/b/c/d;p?q", "g;x=1/../y", "http://a/b/c/y"},
{"http://a/b/c/d;p?q", "g?y/./x", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "g?y/../x", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "g#s/./x", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "g#s/../x", "http://a/b/c/g"},
// Extras.
{"https://a/b/c/d;p?q", "//g?q", "https://g"},
{"https://a/b/c/d;p?q", "//g#s", "https://g"},
{"https://a/b/c/d;p?q", "//g/d/e/f?y#s", "https://g/d/e/f"},
{"https://a/b/c/d;p#s", "?y", "https://a/b/c/d;p"},
{"https://a/b/c/d;p?q#s", "?y", "https://a/b/c/d;p"},
}
func TestResolveReference(t *testing.T) {
mustParse := func(url string) *URL {
u := new(URL)
err := u.Parse(url)
if err != nil {
t.Fatalf("Parse(%q) got err %v", url, err)
}
return u
}
for _, test := range resolveReferenceTests {
base := mustParse(test.base)
rel := mustParse(test.rel)
var url URL
base.ResolveReference(&url, rel)
if got := url.String(); got != test.expected {
t.Errorf("URL(%q).ResolveReference(%q)\ngot %q\nwant %q", test.base, test.rel, got, test.expected)
}
}
}
type RequestURITest struct {
url *URL
out string
}
var requritests = []RequestURITest{
{
&URL{
Scheme: SchemeHTTP,
Host: "example.com",
Path: "",
},
"/",
},
{
&URL{
Scheme: SchemeHTTP,
Host: "example.com",
Path: "/a b",
},
"/a%20b",
},
{
&URL{
Scheme: SchemeHTTP,
Host: "example.com",
Path: "//foo",
},
"//foo",
},
}
func TestParseErrors(t *testing.T) {
tests := []struct {
in string
wantErr bool
}{
{"http://[::1]", false},
{"http://[::1]:80", false},
{"http://[::1]:namedport", true}, // rfc3986 3.2.3
{"http://[::1]/", false},
{"http://[::1]a", true},
{"http://[::1]%23", true},
{"http://[::1%25en0]", false}, // valid zone id
{"http://[::1]:", false}, // colon, but no port OK
{"http://[::1]:%38%30", true}, // not allowed: % encoding only for non-ASCII
{"http://[::1%25%41]", false}, // RFC 6874 allows over-escaping in zone
{"http://[%10::1]", true}, // no %xx escapes in IP address
{"http://[::1]/%48", false}, // %xx in path is fine
{"http://%41:8080/", true}, // not allowed: % encoding only for non-ASCII
{"http://[]%20%48%54%54%50%2f%31%2e%31%0a%4d%79%48%65%61%64%65%72%3a%20%31%32%33%0a%0a/", true}, // golang.org/issue/11208
{"http://a b.com/", true}, // no space in host name please
}
for _, tt := range tests {
var u URL
err := u.Parse(tt.in)
if tt.wantErr {
if err == nil {
t.Errorf("Parse(%q) = %#v; want an error", tt.in, u)
}
continue
}
if err != nil {
t.Logf("Parse(%q) = %v; want no error", tt.in, err)
}
}
}
type shouldEscapeTest struct {
in byte
mode encoding
escape bool
}
var shouldEscapeTests = []shouldEscapeTest{
// Unreserved characters (§2.3)
{'a', encodePath, false},
{'a', encodeUserPassword, false},
{'a', encodeQueryComponent, false},
{'a', encodeFragment, false},
{'a', encodeHost, false},
{'z', encodePath, false},
{'A', encodePath, false},
{'Z', encodePath, false},
{'0', encodePath, false},
{'9', encodePath, false},
{'-', encodePath, false},
{'-', encodeUserPassword, false},
{'-', encodeQueryComponent, false},
{'-', encodeFragment, false},
{'.', encodePath, false},
{'_', encodePath, false},
{'~', encodePath, false},
// User information (§3.2.1)
{':', encodeUserPassword, true},
{'/', encodeUserPassword, true},
{'?', encodeUserPassword, true},
{'@', encodeUserPassword, true},
{'$', encodeUserPassword, false},
{'&', encodeUserPassword, false},
{'+', encodeUserPassword, false},
{',', encodeUserPassword, false},
{';', encodeUserPassword, false},
{'=', encodeUserPassword, false},
// Host (IP address, IPv6 address, registered name, port suffix; §3.2.2)
{'!', encodeHost, false},
{'$', encodeHost, false},
{'&', encodeHost, false},
{'\'', encodeHost, false},
{'(', encodeHost, false},
{')', encodeHost, false},
{'*', encodeHost, false},
{'+', encodeHost, false},
{',', encodeHost, false},
{';', encodeHost, false},
{'=', encodeHost, false},
{':', encodeHost, false},
{'[', encodeHost, false},
{']', encodeHost, false},
{'0', encodeHost, false},
{'9', encodeHost, false},
{'A', encodeHost, false},
{'z', encodeHost, false},
{'_', encodeHost, false},
{'-', encodeHost, false},
{'.', encodeHost, false},
}
func TestShouldEscape(t *testing.T) {
for _, tt := range shouldEscapeTests {
if shouldEscape(tt.in, tt.mode) != tt.escape {
t.Errorf("shouldEscape(%q, %v) returned %v; expected %v", tt.in, tt.mode, !tt.escape, tt.escape)
}
}
}
type timeoutError struct {
timeout bool
}
func (e *timeoutError) Error() string { return "timeout error" }
func (e *timeoutError) Timeout() bool { return e.timeout }
type temporaryError struct {
temporary bool
}
func (e *temporaryError) Error() string { return "temporary error" }
func (e *temporaryError) Temporary() bool { return e.temporary }
type timeoutTemporaryError struct {
timeoutError
temporaryError
}
func (e *timeoutTemporaryError) Error() string { return "timeout/temporary error" }
var netErrorTests = []struct {
err error
timeout bool
temporary bool
}{{
err: &Error{"Get", "http://google.com/", &timeoutError{timeout: true}},
timeout: true,
temporary: false,
}, {
err: &Error{"Get", "http://google.com/", &timeoutError{timeout: false}},
timeout: false,
temporary: false,
}, {
err: &Error{"Get", "http://google.com/", &temporaryError{temporary: true}},
timeout: false,
temporary: true,
}, {
err: &Error{"Get", "http://google.com/", &temporaryError{temporary: false}},
timeout: false,
temporary: false,
}, {
err: &Error{"Get", "http://google.com/", &timeoutTemporaryError{timeoutError{timeout: true}, temporaryError{temporary: true}}},
timeout: true,
temporary: true,
}, {
err: &Error{"Get", "http://google.com/", &timeoutTemporaryError{timeoutError{timeout: false}, temporaryError{temporary: true}}},
timeout: false,
temporary: true,
}, {
err: &Error{"Get", "http://google.com/", &timeoutTemporaryError{timeoutError{timeout: true}, temporaryError{temporary: false}}},
timeout: true,
temporary: false,
}, {
err: &Error{"Get", "http://google.com/", &timeoutTemporaryError{timeoutError{timeout: false}, temporaryError{temporary: false}}},
timeout: false,
temporary: false,
}, {
err: &Error{"Get", "http://google.com/", io.EOF},
timeout: false,
temporary: false,
}}
// Test that url.Error implements net.Error and forwards Timeout/Temporary.
func TestURLErrorImplementsNetError(t *testing.T) {
for i, tt := range netErrorTests {
err, ok := tt.err.(net.Error)
if !ok {
t.Errorf("%d: %T does not implement net.Error", i+1, tt.err)
continue
}
if err.Timeout() != tt.timeout {
t.Errorf("%d: err.Timeout(): got %v, want %v", i+1, err.Timeout(), tt.timeout)
continue
}
if err.Temporary() != tt.temporary {
t.Errorf("%d: err.Temporary(): got %v, want %v", i+1, err.Temporary(), tt.temporary)
}
}
}
var _ encodingPkg.BinaryMarshaler = (*URL)(nil)
var _ encodingPkg.BinaryUnmarshaler = (*URL)(nil)
func TestJSON(t *testing.T) {
var u URL
err := u.Parse("https://www.google.com/x?y=z")
if err != nil {
t.Fatal(err)
}
js, err := json.Marshal(&u)
if err != nil {
t.Fatal(err)
}
// If only we could implement TextMarshaler/TextUnmarshaler,
// this would work:
//
// if string(js) != strconv.Quote(u.String()) {
// t.Errorf("json encoding: %s\nwant: %s\n", js, strconv.Quote(u.String()))
// }
u1 := new(URL)
err = json.Unmarshal(js, u1)
if err != nil {
t.Fatal(err)
}
if u1.String() != u.String() {
t.Errorf("json decoded to: %s\nwant: %s\n", u1, &u)
}
}
func TestGob(t *testing.T) {
var u URL
err := u.Parse("https://www.google.com/x?y=z")
if err != nil {
t.Fatal(err)
}
var w bytes.Buffer
err = gob.NewEncoder(&w).Encode(&u)
if err != nil {
t.Fatal(err)
}
u1 := new(URL)
err = gob.NewDecoder(&w).Decode(u1)
if err != nil {
t.Fatal(err)
}
if u1.String() != u.String() {
t.Errorf("json decoded to: %s\nwant: %s\n", u1, &u)
}
}

help.go Normal file

@@ -0,0 +1,15 @@
package main
const helpText =
`HTTP crawler for the OD-Database
DB >> https://od-db.the-eye.eu <<
Crawler >> https://github.com/terorie/od-database-crawler <<
Server >> https://github.com/simon987/od-database <<
Quick start:
- get config file (config.yml in working dir)
- get OD-DB server ("server.url": Database URL + /api)
- get access token ("server.token": e.g. c010b6dd-20...)
- ./od-database-crawler server
Questions? Discord @terorie#2664 / Telegram @terorie`

jenkins/Jenkinsfile vendored Normal file

@@ -0,0 +1,47 @@
def remote = [:]
remote.name = 'remote'
remote.host = env.DEPLOY_HOST
remote.user = env.DEPLOY_USER
remote.identityFile = '/var/lib/jenkins/.ssh/id_rsa'
remote.knownHosts = '/var/lib/jenkins/.ssh/known_hosts'
remote.allowAnyHosts = true
remote.retryCount = 3
remote.retryWaitSec = 3
logLevel = 'FINER'
pipeline {
agent none
environment {
GOOS='linux'
CGO_ENABLED='0'
HOME='.'
}
stages {
stage('Build') {
agent {
docker {
image 'golang:latest'
}
}
steps {
sh 'mkdir -p /go/src/github.com/terorie/od-database-crawler'
sh 'cp -r *.go fasturl ds jenkins/build.sh "/go/src/github.com/terorie/od-database-crawler"'
sh 'cd /go/src/github.com/terorie/od-database-crawler && go get ./...'
sh './jenkins/build.sh'
stash includes: 'dist/', name: 'dist'
}
}
stage('Deploy') {
agent none
steps {
node('master') {
unstash 'dist'
sshCommand remote: remote, command: "ls od-database-crawler/"
sshPut remote: remote, from: 'dist/', into: 'od-database-crawler'
}
}
}
}
}

jenkins/build.sh Executable file

@@ -0,0 +1,23 @@
#!/usr/bin/env bash
appname="od-database-crawler"
outdir="dist/"
tag="${BUILD_ID}_$(date +%Y-%m-%d)"
rm -rf "./${outdir}"
mkdir build 2> /dev/null
name=${outdir}${appname}-${tag}-linux
GOOS="linux" GOARCH="amd64" go build -ldflags="-s -w" -o ${name}
gzip -f ${name}
echo ${name}
name=${outdir}${appname}-${tag}-mac
GOOS="darwin" GOARCH="amd64" go build -ldflags="-s -w" -o ${name}
gzip -f ${name}
echo ${name}
name=${outdir}${appname}-${tag}-freebsd
GOOS="freebsd" GOARCH="amd64" go build -ldflags="-s -w" -o ${name}
gzip -f ${name}
echo ${name}

main.go

@@ -2,86 +2,194 @@ package main
import (
"context"
"fmt"
"github.com/sirupsen/logrus"
"github.com/urfave/cli"
"log"
"net/http"
_ "net/http/pprof"
"net/url"
"github.com/spf13/cobra"
"github.com/spf13/viper"
"github.com/terorie/od-database-crawler/fasturl"
"os"
"os/signal"
"strings"
"sync/atomic"
"time"
)
var app = cli.App {
Name: "oddb-go",
Usage: "OD-Database Go crawler",
Version: "0.1",
BashComplete: cli.DefaultAppComplete,
Writer: os.Stdout,
Compiled: buildDate,
Commands: []cli.Command{
{
Name: "crawl",
Usage: "Crawl a list of URLs",
ArgsUsage: "[site, site, ...]",
Action: cmdCrawler,
},
var configFile string
var rootCmd = cobra.Command {
Use: "od-database-crawler",
Version: "1.2.2",
Short: "OD-Database Go crawler",
Long: helpText,
PersistentPreRunE: preRun,
PersistentPostRun: func(cmd *cobra.Command, args []string) {
exitHooks.Execute()
},
}
var serverCmd = cobra.Command {
Use: "server",
Short: "Start crawl server",
Long: "Connect to the OD-Database and contribute to the database\n" +
"by crawling the web for open directories!",
Run: cmdBase,
}
var crawlCmd = cobra.Command {
Use: "crawl",
Short: "Crawl an URL",
Long: "Crawl the URL specified.\n" +
"Results will not be uploaded to the database,\n" +
"they're saved under crawled/0.json instead.\n" +
"Primarily used for testing and benchmarking.",
RunE: cmdCrawler,
Args: cobra.ExactArgs(1),
}
var exitHooks Hooks
func init() {
rootCmd.AddCommand(&crawlCmd)
rootCmd.AddCommand(&serverCmd)
prepareConfig()
}
func main() {
go func() {
log.Println(http.ListenAndServe("localhost:42069", nil))
}()
app.Run(os.Args)
}
func preRun(cmd *cobra.Command, args []string) error {
if err := os.MkdirAll("crawled", 0755);
err != nil { panic(err) }
func cmdCrawler(clic *cli.Context) error {
readConfig()
if clic.NArg() == 0 {
cli.ShowCommandHelpAndExit(clic, "crawl", 1)
}
args := clic.Args()
remotes := make([]*OD, len(args))
for i, arg := range args {
// https://github.com/golang/go/issues/19779
if !strings.Contains(arg, "://") {
arg = "http://" + arg
}
u, err := url.Parse(arg)
if !strings.HasSuffix(u.Path, "/") {
u.Path += "/"
}
if err != nil { return err }
remotes[i] = &OD{ BaseUri: *u }
}
c := context.Background()
inRemotes := make(chan *OD)
go Schedule(c, inRemotes)
for _, remote := range remotes {
globalWait.Add(1)
inRemotes <- remote
}
// Wait for all jobs to finish
globalWait.Wait()
logrus.Info("All dirs processed!")
if err := os.MkdirAll("queue", 0755);
err != nil { panic(err) }
return nil
}
var buildDate = time.Date(
2018, 10, 28,
17, 10, 0, 0,
time.UTC)
func main() {
err := rootCmd.Execute()
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
}
func cmdBase(_ *cobra.Command, _ []string) {
onlineMode = true
readConfig()
appCtx, soft := context.WithCancel(context.Background())
forceCtx, hard := context.WithCancel(context.Background())
go hardShutdown(forceCtx)
go listenCtrlC(soft, hard)
inRemotes := make(chan *OD)
go Schedule(appCtx, inRemotes)
ticker := time.NewTicker(config.Recheck)
defer ticker.Stop()
for {
select {
case <-appCtx.Done():
goto shutdown
case <-ticker.C:
t, err := FetchTask()
if err != nil {
logrus.WithError(err).
Error("Failed to get new task")
if !sleep(viper.GetDuration(ConfCooldown), appCtx) {
goto shutdown
}
continue
}
if t == nil {
// No new task
if atomic.LoadInt32(&numActiveTasks) == 0 {
logrus.Info("Waiting …")
}
continue
}
var baseUri fasturl.URL
err = baseUri.Parse(t.Url)
if urlErr, ok := err.(*fasturl.Error); ok && urlErr.Err == fasturl.ErrUnknownScheme {
// Not an error
err = nil
// TODO FTP crawler
continue
} else if err != nil {
logrus.WithError(err).
Error("Failed to get new task")
time.Sleep(viper.GetDuration(ConfCooldown))
continue
}
ScheduleTask(inRemotes, t, &baseUri)
}
}
shutdown:
globalWait.Wait()
}
func cmdCrawler(_ *cobra.Command, args []string) error {
onlineMode = false
readConfig()
arg := args[0]
// https://github.com/golang/go/issues/19779
if !strings.Contains(arg, "://") {
arg = "http://" + arg
}
var u fasturl.URL
err := u.Parse(arg)
if !strings.HasSuffix(u.Path, "/") {
u.Path += "/"
}
if err != nil { return err }
// TODO Graceful shutdown
forceCtx := context.Background()
inRemotes := make(chan *OD)
go Schedule(forceCtx, inRemotes)
ticker := time.NewTicker(3 * time.Second)
defer ticker.Stop()
task := Task {
WebsiteId: 0,
Url: u.String(),
}
ScheduleTask(inRemotes, &task, &u)
// Wait for all jobs to finish
globalWait.Wait()
return nil
}
func listenCtrlC(soft, hard context.CancelFunc) {
c := make(chan os.Signal)
signal.Notify(c, os.Interrupt)
<-c
logrus.Info(">>> Shutting down crawler... <<<")
soft()
<-c
logrus.Warning(">>> Force shutdown! <<<")
hard()
}
func hardShutdown(c context.Context) {
<-c.Done()
os.Exit(1)
}
func sleep(d time.Duration, c context.Context) bool {
select {
case <-time.After(d):
return true
case <-c.Done():
return false
}
}
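The interruptible sleep helper above is what lets the soft-shutdown context cut a cooldown short. A self-contained sketch of the same pattern, with a goroutine standing in for the Ctrl-C handler:

    package main

    import (
        "context"
        "fmt"
        "time"
    )

    // sleep waits for d, but returns early (false) if c is cancelled.
    func sleep(d time.Duration, c context.Context) bool {
        select {
        case <-time.After(d):
            return true
        case <-c.Done():
            return false
        }
    }

    func main() {
        ctx, cancel := context.WithCancel(context.Background())
        go func() {
            time.Sleep(100 * time.Millisecond)
            cancel() // stands in for the Ctrl-C soft shutdown
        }()
        if !sleep(5*time.Second, ctx) {
            fmt.Println("cooldown interrupted, shutting down")
        }
    }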


@@ -1,32 +1,76 @@
package main
import (
"net/url"
"github.com/terorie/od-database-crawler/ds/redblackhash"
"github.com/terorie/od-database-crawler/fasturl"
"sync"
"time"
)
type ResultCode int
const (
TR_OK = ResultCode(iota)
TR_FAIL = 1
TR_SKIP = 2
)
type Task struct {
WebsiteId uint64 `json:"website_id"`
Url string `json:"url"`
UploadToken string `json:"upload_token"`
TaskId int64
}
type TaskResult struct {
ResultCode ResultCode `json:"status_code"`
FileCount uint64 `json:"file_count"`
ErrorCount uint64 `json:"-"`
StartTime time.Time `json:"-"`
StartTimeUnix int64 `json:"start_time"`
EndTimeUnix int64 `json:"end_time"`
WebsiteId uint64 `json:"website_id"`
}
type Job struct {
OD *OD
Uri url.URL
Uri fasturl.URL
UriStr string
Fails int
LastError error
}
type OD struct {
Task Task
Result TaskResult
Wait sync.WaitGroup
BaseUri url.URL
lock sync.Mutex
Files []File
BaseUri fasturl.URL
WCtx WorkerContext
Scanned sync.Map
Scanned redblackhash.Tree
}
type File struct {
Name string `json:"name"`
Size int64 `json:"size"`
MTime time.Time `json:"mtime"`
Path string `json:"path"`
IsDir bool `json:"-"`
Name string `json:"name"`
Size int64 `json:"size"`
MTime int64 `json:"mtime"`
Path string `json:"path"`
IsDir bool `json:"-"`
}
func (o *OD) LoadOrStoreKey(k *redblackhash.Key) (exists bool) {
o.Scanned.Lock()
defer o.Scanned.Unlock()
exists = o.Scanned.Get(k)
if exists {
return true
}
o.Scanned.Put(k)
return false
}
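// Example (sketch): dedup guard before queueing a URL, assuming key is a
// redblackhash.Key derived from the URL elsewhere in the crawler:
// if od.LoadOrStoreKey(&key) { /* already scanned, skip */ }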
type errorString string
func (e errorString) Error() string {
return string(e)
}

129
queue.go Normal file
View File

@@ -0,0 +1,129 @@
package main
import (
"github.com/beeker1121/goque"
"os"
"sync"
"sync/atomic"
)
type BufferedQueue struct {
dataDir string
q *goque.Queue
buf []Job
m sync.Mutex
}
func OpenQueue(dataDir string) (bq *BufferedQueue, err error) {
bq = new(BufferedQueue)
if config.JobBufferSize < 0 {
return
}
bq.dataDir = dataDir
bq.q, err = goque.OpenQueue(dataDir)
if err != nil { return nil, err }
return
}
func (q *BufferedQueue) Enqueue(job *Job) error {
atomic.AddInt64(&totalQueued, 1)
if q.directEnqueue(job) {
return nil
}
var gob JobGob
gob.ToGob(job)
_, err := q.q.EnqueueObject(gob)
return err
}
func (q *BufferedQueue) Dequeue() (job Job, err error) {
if q.directDequeue(&job) {
atomic.AddInt64(&totalQueued, -1)
return job, nil
}
if config.JobBufferSize < 0 {
err = goque.ErrEmpty
return
}
var item *goque.Item
item, err = q.q.Dequeue()
if err != nil { return }
atomic.AddInt64(&totalQueued, -1)
var gob JobGob
err = item.ToObject(&gob)
if err != nil { return }
gob.FromGob(&job)
return
}
func (q *BufferedQueue) directEnqueue(job *Job) bool {
q.m.Lock()
defer q.m.Unlock()
bs := config.JobBufferSize
if len(q.buf) < bs || bs < 0 {
q.buf = append(q.buf, *job)
return true
}
return false
}
func (q *BufferedQueue) directDequeue(job *Job) bool {
q.m.Lock()
defer q.m.Unlock()
if len(q.buf) > 0 {
*job = q.buf[0]
q.buf = q.buf[1:]
return true
}
return false
}
// Close implements io.Closer and always returns nil (a failure to
// delete the on-disk queue panics instead).
func (q *BufferedQueue) Close() error {
if config.JobBufferSize < 0 {
return nil
}
// Close ignoring errors
q.q.Close()
// Delete files
if err := os.RemoveAll(q.dataDir);
err != nil { panic(err) }
return nil
}
type JobGob struct {
Uri string
Fails int
LastError string
}
func (g *JobGob) ToGob(j *Job) {
g.Uri = j.UriStr
g.Fails = j.Fails
if j.LastError != nil {
g.LastError = j.LastError.Error()
}
}
func (g *JobGob) FromGob(j *Job) {
if err := j.Uri.Parse(g.Uri);
err != nil { panic(err) }
j.UriStr = g.Uri
j.Fails = g.Fails
if g.LastError != "" {
j.LastError = errorString(g.LastError)
}
}
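A minimal sketch of how the hybrid queue above is meant to be used, within this package (error handling shortened; assumes config.JobBufferSize has been loaded as in the scheduler below):

    func exampleQueue() {
        bq, err := OpenQueue("queue/0")
        if err != nil {
            panic(err)
        }
        defer bq.Close() // also deletes the on-disk queue directory

        if err := bq.Enqueue(&Job{UriStr: "http://example.com/"}); err != nil {
            panic(err)
        }
        job, err := bq.Dequeue()
        if err == nil {
            _ = job // hand the job to a worker
        }
    }

Jobs spill to the goque LevelDB queue on disk only once the in-memory buffer of JobBufferSize entries is full (a negative JobBufferSize keeps everything in memory), so the common case never touches the disk.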

25
release.sh Executable file
View File

@@ -0,0 +1,25 @@
#!/usr/bin/env bash
appname="od-database-crawler"
tag=$1
[ -z "$tag" ] && echo "Usage: build <version>" && exit 1
name=${appname}-${tag}-windows.exe
GOOS="windows" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name
name=${appname}-${tag}-linux
GOOS="linux" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name
name=${appname}-${tag}-mac
GOOS="darwin" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name
name=${appname}-${tag}-freebsd
GOOS="freebsd" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name


@@ -2,101 +2,218 @@ package main
import (
"context"
"encoding/json"
"fmt"
"github.com/sirupsen/logrus"
"github.com/terorie/od-database-crawler/fasturl"
"os"
"path"
"sync"
"sync/atomic"
"time"
)
var activeTasks int32
var totalBuffered int64
var activeTasksLock sync.Mutex
var activeTasks = make(map[uint64]bool)
var numActiveTasks int32
var totalQueued int64
func Schedule(c context.Context, remotes <-chan *OD) {
go Stats(c)
for {
select {
case <-c.Done():
return
for remote := range remotes {
logrus.WithField("url", remote.BaseUri.String()).
Info("Starting crawler")
case remote := <-remotes:
logrus.WithField("url", remote.BaseUri.String()).
Info("Starting crawler")
// Collect results
results := make(chan File)
// Spawn workers
remote.WCtx.in, remote.WCtx.out = makeJobBuffer(c)
for i := 0; i < config.Workers; i++ {
go remote.WCtx.Worker()
remote.WCtx.OD = remote
// Get queue path
queuePath := path.Join("queue", fmt.Sprintf("%d", remote.Task.WebsiteId))
// Delete existing queue
if err := os.RemoveAll(queuePath); err != nil {
panic(err)
}
// Start new queue
var err error
remote.WCtx.Queue, err = OpenQueue(queuePath)
if err != nil {
panic(err)
}
// Spawn workers
for i := 0; i < config.Workers; i++ {
go remote.WCtx.Worker(results)
}
// Enqueue initial job
atomic.AddInt32(&numActiveTasks, 1)
remote.WCtx.queueJob(Job{
Uri: remote.BaseUri,
UriStr: remote.BaseUri.String(),
Fails: 0,
})
// Upload result when ready
go remote.Watch(results)
// Sleep if max number of tasks are active
for atomic.LoadInt32(&numActiveTasks) > config.Tasks {
select {
case <-c.Done():
return
case <-time.After(time.Second):
continue
}
// Enqueue initial job
atomic.AddInt32(&activeTasks, 1)
remote.WCtx.queueJob(Job{
OD: remote,
Uri: remote.BaseUri,
UriStr: remote.BaseUri.String(),
Fails: 0,
})
globalWait.Done()
// Upload result when ready
go remote.Watch()
}
}
}
func (r *OD) Watch() {
func ScheduleTask(remotes chan<- *OD, t *Task, u *fasturl.URL) {
if !t.register() {
return
}
globalWait.Add(1)
now := time.Now()
od := &OD{
Task: *t,
BaseUri: *u,
Result: TaskResult{
WebsiteId: t.WebsiteId,
StartTime: now,
StartTimeUnix: now.Unix(),
},
}
remotes <- od
}
func (t *Task) register() bool {
activeTasksLock.Lock()
defer activeTasksLock.Unlock()
if _, known := activeTasks[t.WebsiteId]; known {
return false
}
activeTasks[t.WebsiteId] = true
return true
}
func (t *Task) unregister() {
activeTasksLock.Lock()
delete(activeTasks, t.WebsiteId)
activeTasksLock.Unlock()
}
func (o *OD) Watch(results chan File) {
// Mark job as completely done
defer globalWait.Done()
defer o.Task.unregister()
filePath := path.Join("crawled", fmt.Sprintf("%d.json", o.Task.WebsiteId))
// Open crawl results file
f, err := os.OpenFile(
filePath,
os.O_CREATE|os.O_RDWR|os.O_TRUNC,
0644,
)
if err != nil {
logrus.WithError(err).
Error("Failed saving crawl results")
return
}
defer f.Close()
defer os.Remove(filePath)
// Listen for exit code of Collect()
collectErrC := make(chan error)
// Block until all results are written
// (closes results channel)
o.handleCollect(results, f, collectErrC)
// Exit code of Collect()
err = <-collectErrC
close(collectErrC)
if err != nil {
logrus.WithError(err).
Error("Failed saving crawl results")
return
}
// Upload results
err = PushResult(&o.Task, f)
if err != nil {
logrus.WithError(err).
Error("Failed uploading crawl results")
return
}
}
func (o *OD) handleCollect(results chan File, f *os.File, collectErrC chan error) {
// Begin collecting results
go o.Task.Collect(results, f, collectErrC)
defer close(results)
// Wait for all jobs on remote to finish
r.Wait.Wait()
close(r.WCtx.in)
atomic.AddInt32(&activeTasks, -1)
o.Wait.Wait()
logrus.WithField("url", r.BaseUri.String()).
Info("Crawler finished")
}
func makeJobBuffer(c context.Context) (chan<- Job, <-chan Job) {
in := make(chan Job)
out := make(chan Job)
go bufferJobs(c, in, out)
return in, out
}
func bufferJobs(c context.Context, in chan Job, out chan Job) {
defer close(out)
var inQueue []Job
outCh := func() chan Job {
if len(inQueue) == 0 {
return nil
}
return out
// Close queue
if err := o.WCtx.Queue.Close(); err != nil {
panic(err)
}
for len(inQueue) > 0 || in != nil {
if len(inQueue) == 0 {
select {
case v, ok := <-in:
if !ok {
in = nil
} else {
atomic.AddInt64(&totalBuffered, 1)
inQueue = append(inQueue, v)
}
case <-c.Done():
return
}
} else {
select {
case v, ok := <-in:
if !ok {
in = nil
} else {
atomic.AddInt64(&totalBuffered, 1)
inQueue = append(inQueue, v)
}
case outCh() <- inQueue[0]:
atomic.AddInt64(&totalBuffered, -1)
inQueue = inQueue[1:]
case <-c.Done():
return
}
}
atomic.AddInt32(&numActiveTasks, -1)
// Log finish
logrus.WithFields(logrus.Fields{
"id": o.Task.WebsiteId,
"url": o.BaseUri.String(),
"duration": time.Since(o.Result.StartTime),
}).Info("Crawler finished")
// Set status code
now := time.Now()
o.Result.EndTimeUnix = now.Unix()
if atomic.LoadUint64(&o.Result.ErrorCount) != 0 {
o.Result.ResultCode = TR_FAIL
} else {
o.Result.ResultCode = TR_OK
}
}
func (t *Task) Collect(results chan File, f *os.File, errC chan<- error) {
err := t.collect(results, f)
if err != nil {
logrus.WithError(err).
Error("Failed saving crawl results")
}
errC <- err
}
func (t *Task) collect(results chan File, f *os.File) error {
for result := range results {
result.Path = fasturl.PathUnescape(result.Path)
result.Name = fasturl.PathUnescape(result.Name)
resJson, err := json.Marshal(result)
if err != nil {
panic(err)
}
_, err = f.Write(resJson)
if err != nil {
return err
}
_, err = f.Write([]byte{'\n'})
if err != nil {
return err
}
}
return nil
}

server.go

@@ -2,181 +2,271 @@ package main
import (
"bytes"
"encoding/base64"
"encoding/json"
"fmt"
"github.com/fasthttp/websocket"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/time/rate"
"io"
"mime/multipart"
"io/ioutil"
"net/http"
"net/url"
"os"
"path/filepath"
"strconv"
"strings"
)
const (
fileListChunkSize int64 = 5000000 // 5 mb
)
var serverWorker *TrackerWorker
var serverClient = http.DefaultClient
func FetchTask() (t *Task, err error) {
escToken, _ := json.Marshal(config.Token)
payload := `{"token":` + string(escToken) + `}`
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/get",
strings.NewReader(payload))
if err != nil { return }
res, err := serverClient.Do(req)
if err != nil { return }
defer res.Body.Close()
if res.StatusCode != 200 {
err = fmt.Errorf("http %s", res.Status)
return
}
t = new(Task)
err = json.NewDecoder(res.Body).Decode(t)
if err != nil { return }
return
var serverClient = http.Client{
Timeout: config.ServerTimeout,
Transport: new(ServerTripper),
}
func PushResult(result *TaskResult) (err error) {
filePath := filepath.Join(
".", "crawled",
fmt.Sprintf("%d.json", result.WebsiteId))
var serverUserAgent = "od-database-crawler/" + rootCmd.Version
defer os.Remove(filePath)
func getOrCreateWorker() {
f, err := os.Open(filePath)
if os.IsNotExist(err) {
err = fmt.Errorf("cannot upload result: %s does not exist", filePath)
if _, err := os.Stat("worker.json"); os.IsNotExist(err) {
req := CreateTrackerWorkerRequest{
Alias: config.TrackerAlias,
}
body, _ := json.Marshal(&req)
buf := bytes.NewBuffer(body)
resp, _ := serverClient.Post(config.TrackerUrl+"/worker/create", "application/json", buf)
workerResponse := CreateTrackerWorkerResponse{}
respBody, _ := ioutil.ReadAll(resp.Body)
_ = json.Unmarshal(respBody, &workerResponse)
workerJsonData, _ := json.Marshal(&workerResponse.Content.Worker)
fp, _ := os.OpenFile("worker.json", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600)
_, _ = fp.Write(workerJsonData)
// Request ASSIGN permission
serverWorker = &workerResponse.Content.Worker
accessReq, _ := json.Marshal(WorkerAccessRequest{
Project: config.TrackerProject,
Assign: true,
Submit: false,
})
buf = bytes.NewBuffer(accessReq)
res, err := serverClient.Post(config.TrackerUrl+"/project/request_access", "application/json", buf)
if err != nil {
panic(err)
}
logrus.WithFields(logrus.Fields{
"response": res.StatusCode,
}).Info("Requested ASSIGN permission")
} else {
var worker TrackerWorker
fp, _ := os.OpenFile("worker.json", os.O_RDONLY, 0600)
workerJsonData, _ := ioutil.ReadAll(fp)
_ = json.Unmarshal(workerJsonData, &worker)
serverWorker = &worker
}
}
func FetchTask() (t *Task, err error) {
if serverWorker == nil {
getOrCreateWorker()
}
res, err := serverClient.Get(config.TrackerUrl + "/task/get/" + strconv.Itoa(config.TrackerProject))
if err != nil {
return
}
defer res.Body.Close()
switch res.StatusCode {
case 200:
break
default:
return nil, fmt.Errorf("http %s", res.Status)
}
jsonResponse := FetchTaskResponse{}
err = json.NewDecoder(res.Body).Decode(&jsonResponse)
if _, ok := err.(*json.SyntaxError); ok {
return nil, fmt.Errorf("/task/get returned invalid JSON")
} else if err != nil {
return
}
defer f.Close()
err = uploadChunks(result.WebsiteId, f)
if !jsonResponse.Ok {
if jsonResponse.Message == "No task available" {
return nil, nil
}
return nil, errors.New(jsonResponse.Message)
}
task := Task{}
err = json.Unmarshal([]byte(jsonResponse.Content.Task.Recipe), &task)
if _, ok := err.(*json.SyntaxError); ok {
return nil, fmt.Errorf("/task/get returned invalid JSON")
} else if err != nil {
return
}
t = &task
t.TaskId = jsonResponse.Content.Task.Id
return
}
func PushResult(task *Task, f *os.File) (err error) {
if task.WebsiteId == 0 {
// Not a real result, don't push
return nil
}
// Rewind to the beginning of the file
_, err = f.Seek(0, 0)
if err != nil {
return
}
err = uploadWebsocket(f, task.UploadToken)
if err != nil {
logrus.Errorf("Failed to upload file list: %s", err)
err2 := CancelTask(result.WebsiteId)
err2 := releaseTask(task, TR_FAIL)
if err2 != nil {
logrus.Error(err2)
}
return
}
err = uploadResult(result)
// Upload result ignoring errors
_ = releaseTask(task, TR_OK)
return
}
func uploadWebsocket(f *os.File, token string) (err error) {
u := url.URL{Scheme: config.WsBucketScheme, Host: config.WsBucketHost, Path: "/upload"}
header := http.Header{}
header.Add("X-Upload-Token", token)
conn, _, err := websocket.DefaultDialer.Dial(u.String(), header)
if err != nil {
logrus.Errorf("Failed to upload result: %s", err)
err2 := CancelTask(result.WebsiteId)
if err2 != nil {
logrus.Error(err2)
}
return
}
return
}
conn.EnableWriteCompression(true) //TODO: Is this necessary?
func uploadChunks(websiteId uint64, f *os.File) (err error) {
for iter := 1; iter > 0; iter++ {
// TODO Stream with io.Pipe?
var b bytes.Buffer
multi := multipart.NewWriter(&b)
// Set upload fields
err = multi.WriteField("token", config.Token)
if err != nil { return }
err = multi.WriteField("website_id", fmt.Sprintf("%d", websiteId))
if err != nil { return }
// Copy chunk to file_list
formFile, err := multi.CreateFormFile("file_list", "file_list")
_, err = io.CopyN(formFile, f, fileListChunkSize)
if err == io.EOF {
break
} else if err == io.ErrUnexpectedEOF {
err = nil
// Break at end of iteration
iter = -420
}
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/upload",
&b)
if err != nil { return err }
res, err := serverClient.Do(req)
if err != nil { return err }
res.Body.Close()
if res.StatusCode != http.StatusOK {
return fmt.Errorf("failed to upload list part %d: %s",
iter, res.Status)
}
logrus.Infof("Uploading file list part %d: %s",
iter, res.Status)
socketWriter, _ := conn.NextWriter(websocket.BinaryMessage)
_, _ = io.Copy(socketWriter, f)
err = socketWriter.Close()
if err != nil {
logrus.Error("FIXME: couldn't do file upload")
return
}
return
err = conn.Close()
if err != nil {
return
}
return nil
}
func uploadResult(result *TaskResult) (err error) {
resultEnc, err := json.Marshal(result)
if err != nil { panic(err) }
func releaseTask(task *Task, taskResult ResultCode) (err error) {
payload := url.Values {
"token": {config.Token},
"result": {string(resultEnc)},
}.Encode()
req := releaseTaskRequest{
TaskId: task.TaskId,
ResultCode: taskResult,
// TODO Will implement verification in a later ODDB update
Verification: 0,
}
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/complete",
strings.NewReader(payload))
if err != nil { return }
resultEnc, err := json.Marshal(&req)
if err != nil {
panic(err)
}
body := bytes.NewBuffer(resultEnc)
res, err := serverClient.Do(req)
if err != nil { return }
res, err := serverClient.Post(
config.TrackerUrl+"/task/release",
"application/json",
body,
)
if err != nil {
return
}
res.Body.Close()
if res.StatusCode != http.StatusOK {
return fmt.Errorf("failed to cancel task: %s", res.Status)
return HttpError{res.StatusCode}
}
return
}
func CancelTask(websiteId uint64) (err error) {
form := url.Values{
"token": {config.Token},
"website_id": {strconv.FormatUint(websiteId, 10)},
type ServerTripper struct{}
func (t *ServerTripper) RoundTrip(req *http.Request) (res *http.Response, err error) {
req.Header.Set("User-Agent", serverUserAgent)
//TODO: Use task_tracker/client ?
if serverWorker != nil {
req.Header.Add("X-Worker-Id", strconv.Itoa(serverWorker.Id))
req.Header.Add("X-Secret", base64.StdEncoding.EncodeToString(serverWorker.Secret))
}
encForm := form.Encode()
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/cancel",
strings.NewReader(encForm))
if err != nil { return }
res, err := serverClient.Do(req)
if err != nil { return }
res.Body.Close()
if res.StatusCode != http.StatusOK {
return fmt.Errorf("failed to cancel task: %s", res.Status)
}
return
return http.DefaultTransport.RoundTrip(req)
}
+// https://github.com/simon987/task_tracker/blob/master/api/models.go
+type releaseTaskRequest struct {
+	TaskId       int64      `json:"task_id"`
+	ResultCode   ResultCode `json:"result"`
+	Verification int64      `json:"verification"`
+}
+
+type WorkerAccessRequest struct {
+	Assign  bool `json:"assign"`
+	Submit  bool `json:"submit"`
+	Project int  `json:"project"`
+}
+
+type FetchTaskResponse struct {
+	Ok      bool   `json:"ok"`
+	Message string `json:"message"`
+	Content struct {
+		Task struct {
+			Id       int64 `json:"id"`
+			Priority int64 `json:"priority"`
+			Project  struct {
+				Id         int64      `json:"id"`
+				Name       string     `json:"name"`
+				Version    string     `json:"version"`
+				AssignRate rate.Limit `json:"assign_rate"`
+				SubmitRate rate.Limit `json:"submit_rate"`
+			} `json:"project"`
+			Recipe string `json:"recipe"`
+		} `json:"task"`
+	} `json:"content"`
+}
+
+type TrackerWorker struct {
+	Alias  string `json:"alias"`
+	Id     int    `json:"id"`
+	Secret []byte `json:"secret"`
+}
+
+type CreateTrackerWorkerResponse struct {
+	Ok      bool   `json:"ok"`
+	Message string `json:"message"`
+	Content struct {
+		Worker TrackerWorker `json:"worker"`
+	} `json:"content"`
+}
+
+type CreateTrackerWorkerRequest struct {
+	Alias string `json:"alias"`
+}
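These mirror task_tracker's models. As a sketch of how FetchTaskResponse might be consumed against the tracker's /task/get route (the project parameter and error wording here are assumptions):

	func fetchTask(project int) (*FetchTaskResponse, error) {
		res, err := serverClient.Get(fmt.Sprintf("%s/task/get/%d",
			config.TrackerUrl, project))
		if err != nil {
			return nil, err
		}
		defer res.Body.Close()

		var ftr FetchTaskResponse
		if err := json.NewDecoder(res.Body).Decode(&ftr); err != nil {
			return nil, err
		}
		if !ftr.Ok {
			return nil, fmt.Errorf("tracker error: %s", ftr.Message)
		}
		return &ftr, nil
	}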

stats.go

@@ -3,6 +3,7 @@ package main
 import (
 	"context"
 	"github.com/sirupsen/logrus"
+	"github.com/spf13/viper"
 	"math"
 	"runtime"
 	"sync/atomic"
@@ -19,11 +20,14 @@ func Stats(c context.Context) {
 	var crawlTicker <-chan time.Time
 	var allocTicker <-chan time.Time
 
-	if config.CrawlStats != 0 {
-		crawlTicker = time.NewTicker(config.CrawlStats).C
+	crawlInterval := viper.GetDuration(ConfCrawlStats)
+	allocInterval := viper.GetDuration(ConfAllocStats)
+
+	if crawlInterval != 0 {
+		crawlTicker = time.Tick(crawlInterval)
 	}
-	if config.AllocStats != 0 {
-		allocTicker = time.NewTicker(config.AllocStats).C
+	if allocInterval != 0 {
+		allocTicker = time.Tick(allocInterval)
 	}
 
 	for {
@@ -32,13 +36,17 @@ func Stats(c context.Context) {
 			startedNow := atomic.LoadUint64(&totalStarted)
 			perSecond := float64(startedNow - startedLast) /
-				config.CrawlStats.Seconds()
+				crawlInterval.Seconds()
 
 			// Round to .5
 			perSecond *= 2
 			perSecond = math.Round(perSecond)
 			perSecond /= 2
 
+			if perSecond <= 0 {
+				continue
+			}
+
 			logrus.WithFields(logrus.Fields{
 				"per_second": perSecond,
 				"done":       atomic.LoadUint64(&totalDone),
@@ -53,7 +61,7 @@ func Stats(c context.Context) {
 			runtime.ReadMemStats(&mem)
 			logrus.WithFields(logrus.Fields{
-				"queue_count": totalBuffered,
+				"queue_count": atomic.LoadInt64(&totalQueued),
 				"heap":        FormatByteCount(mem.Alloc),
 				"objects":     mem.HeapObjects,
 				"num_gc":      mem.NumGC,


@@ -1,16 +0,0 @@
-package main
-
-import "time"
-
-type Task struct {
-	WebsiteId int    `json:"website_id"`
-	Url       string `json:"url"`
-}
-
-type TaskResult struct {
-	StatusCode int       `json:"status_code"`
-	FileCount  uint64    `json:"file_count"`
-	StartTime  time.Time `json:"start_time"`
-	EndTime    time.Time `json:"end_time"`
-	WebsiteId  uint64    `json:"website_id"`
-}

util.go

@@ -1,6 +1,9 @@
 package main
 
-import "fmt"
+import (
+	"fmt"
+	"sync"
+)
 
 // https://programming.guide/go/formatting-byte-size-to-human-readable-format.html
 func FormatByteCount(b uint64) string {
@@ -16,3 +19,20 @@ func FormatByteCount(b uint64) string {
 	return fmt.Sprintf("%.1f %ciB", float64(b)/float64(div), "KMGTPE"[exp])
 }
 
+type Hooks struct {
+	m sync.Mutex
+	l []func()
+}
+
+func (h *Hooks) Add(hook func()) {
+	h.m.Lock()
+	h.l = append(h.l, hook)
+	h.m.Unlock()
+}
+
+func (h *Hooks) Execute() {
+	for _, hook := range h.l {
+		hook()
+	}
+}
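Hooks is a small mutex-guarded callback registry; a usage sketch with illustrative names:

	var shutdownHooks Hooks

	func setup() {
		shutdownHooks.Add(func() { logrus.Info("closing queue") })
		shutdownHooks.Add(func() { logrus.Info("uploading results") })
	}

	// later, after all workers have stopped:
	// shutdownHooks.Execute()

Execute reads the list without taking the mutex, so it is evidently meant to run once, after all Add calls have finished.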

worker.go

@@ -1,8 +1,10 @@
 package main
 
 import (
+	"github.com/beeker1121/goque"
 	"github.com/sirupsen/logrus"
 	"math"
+	"sort"
 	"strings"
 	"sync"
 	"sync/atomic"
@@ -12,24 +14,38 @@ import (
 var globalWait sync.WaitGroup
 
 type WorkerContext struct {
-	in  chan<- Job
-	out <-chan Job
+	OD    *OD
+	Queue *BufferedQueue
 	lastRateLimit time.Time
 	numRateLimits int
 }
 
-func (w WorkerContext) Worker() {
-	for job := range w.out {
-		w.step(job)
+func (w *WorkerContext) Worker(results chan<- File) {
+	for {
+		job, err := w.Queue.Dequeue()
+		switch err {
+		case goque.ErrEmpty:
+			time.Sleep(500 * time.Millisecond)
+			continue
+		case goque.ErrDBClosed:
+			return
+		case nil:
+			w.step(results, job)
+		default:
+			panic(err)
+		}
 	}
 }
 
-func (w WorkerContext) step(job Job) {
-	defer w.finishJob(&job)
+func (w *WorkerContext) step(results chan<- File, job Job) {
+	defer w.finishJob()
 
 	var f File
 
-	newJobs, err := DoJob(&job, &f)
+	newJobs, err := w.DoJob(&job, &f)
 	atomic.AddUint64(&totalStarted, 1)
 	if err == ErrKnown {
 		return
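BufferedQueue itself is not part of this diff, but the calls above pin down its surface; an inferred sketch (the real type wraps a goque disk-backed queue, and this interface name is illustrative):

	// Inferred from usage in this file, not the repo's actual declaration.
	type jobQueue interface {
		Enqueue(job *Job) error // used by queueJob below
		Dequeue() (Job, error)  // returns goque.ErrEmpty / goque.ErrDBClosed
	}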
@@ -38,15 +54,18 @@ func (w WorkerContext) step(job Job) {
 	if err != nil {
 		job.Fails++
 
-		if err == ErrForbidden {
-			// Don't attempt crawling again
+		if !shouldRetry(err) {
 			atomic.AddUint64(&totalAborted, 1)
+			//logrus.WithField("url", job.UriStr).
+			//	WithError(err).
+			//	Error("Giving up after failure")
 			return
 		}
 
 		if job.Fails > config.Retries {
 			atomic.AddUint64(&totalAborted, 1)
-			logrus.WithField("url", job.UriStr).
-				Errorf("Giving up after %d fails", job.Fails)
+			//logrus.WithField("url", job.UriStr).
+			//	Errorf("Giving up after %d fails", job.Fails)
 		} else {
 			atomic.AddUint64(&totalRetries, 1)
 			if err == ErrRateLimit {
@@ -63,17 +82,24 @@ func (w WorkerContext) step(job Job) {
 		w.queueJob(job)
 	}
 
-	job.OD.Files = append(job.OD.Files, f)
+	if !f.IsDir {
+		results <- f
+	}
 }
 
-func DoJob(job *Job, f *File) (newJobs []Job, err error) {
-	if strings.HasSuffix(job.Uri.Path, "/") {
+func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) {
+	if len(job.Uri.Path) == 0 {
+		return
+	}
+	if job.Uri.Path[len(job.Uri.Path)-1] == '/' {
 		// Load directory
 		links, err := GetDir(job, f)
 		if err != nil {
-			logrus.WithError(err).
-				WithField("url", job.Uri.String()).
-				Error("Failed getting dir")
+			if !isErrSilent(err) {
+				logrus.WithError(err).
+					WithField("url", job.UriStr).
+					Error("Failed to crawl dir")
+			}
 			return nil, err
 		}
@@ -81,58 +107,82 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
 		hash := f.HashDir(links)
 
 		// Skip symlinked dirs
-		if _, old := job.OD.Scanned.LoadOrStore(hash, true); old {
+		if w.OD.LoadOrStoreKey(&hash) {
 			return nil, ErrKnown
 		}
 
+		// Sort by path
+		sort.Slice(links, func(i, j int) bool {
+			return strings.Compare(links[i].Path, links[j].Path) < 0
+		})
+
+		var newJobCount int
+		var lastLink string
 		for _, link := range links {
-			// Skip already queued links
-			if _, old := job.OD.Scanned.LoadOrStore(link, true); old {
+			uriStr := link.String()
+			// Ignore dupes
+			if uriStr == lastLink {
 				continue
 			}
-			job.OD.Wait.Add(1)
+			lastLink = uriStr
+
 			newJobs = append(newJobs, Job{
 				OD:     job.OD,
 				Uri:    link,
-				UriStr: link.String(),
+				UriStr: uriStr,
 				Fails:  0,
 			})
+
+			newJobCount++
 		}
-		logrus.WithFields(logrus.Fields{
-			"url":   job.UriStr,
-			"files": len(links),
-		}).Debug("Listed")
+		if config.Verbose {
+			logrus.WithFields(logrus.Fields{
+				"url":   job.UriStr,
+				"files": newJobCount,
+			}).Debug("Listed")
+		}
 	} else {
 		// Load file
 		err := GetFile(job.Uri, f)
 		if err != nil {
-			logrus.WithError(err).
-				WithField("url", job.Uri.String()).
-				Error("Failed getting file")
+			if !isErrSilent(err) {
+				logrus.WithError(err).
+					WithField("url", job.UriStr).
+					Error("Failed to crawl file")
+			}
 			return nil, err
 		}
+		atomic.AddUint64(&w.OD.Result.FileCount, 1)
 	}
 	return
 }
-func (w WorkerContext) queueJob(job Job) {
-	job.OD.Wait.Add(1)
-	globalWait.Add(1)
+func (w *WorkerContext) queueJob(job Job) {
+	w.OD.Wait.Add(1)
 
 	if w.numRateLimits > 0 {
-		if time.Since(w.lastRateLimit) > 5 * time.Second {
+		if time.Since(w.lastRateLimit) > 5*time.Second {
 			w.numRateLimits = 0
 		} else {
-			time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) *
+			time.Sleep(time.Duration(math.Sqrt(float64(50*w.numRateLimits))) *
 				100 * time.Millisecond)
-			w.in <- job
 		}
-	} else {
-		w.in <- job
 	}
+
+	if err := w.Queue.Enqueue(&job); err != nil {
+		panic(err)
+	}
 }
-func (w WorkerContext) finishJob(job *Job) {
-	job.OD.Wait.Done()
-	globalWait.Done()
+func (w *WorkerContext) finishJob() {
+	w.OD.Wait.Done()
 }
+
+func isErrSilent(err error) bool {
+	if !config.PrintHTTP {
+		if _, ok := err.(*HttpError); ok {
+			return true
+		}
+	}
+	return false
+}
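For reference, the sleep in queueJob grows with the square root of the consecutive rate-limit count, and the float is truncated before the multiply; a quick standalone evaluation of time.Duration(math.Sqrt(float64(50*n))) * 100 * time.Millisecond:

	for _, n := range []int{1, 2, 4, 8} {
		d := time.Duration(math.Sqrt(float64(50*n))) * 100 * time.Millisecond
		fmt.Println(n, d) // 1 700ms, 2 1s, 4 1.4s, 8 2s
	}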