Commits (1)

Commit: 8cfada7904
Author: Richard Patel
Message: kill perf
Date: 2018-11-06 01:44:09 +01:00

29 changed files with 1027 additions and 7545 deletions

.gitignore

@@ -1,6 +1,3 @@
/.idea/
.DS_Store
/od-database-crawler
*.log
/queue/
/crawled/
/oddb-go

.travis.yml

@@ -1,5 +0,0 @@
language: go
go:
- "1.11.x"
- master

Dockerfile

@@ -1,15 +0,0 @@
FROM golang:alpine as builder
ADD . /go/src/github.com/terorie/od-database-crawler
RUN apk add git \
&& go get -d -v github.com/terorie/od-database-crawler \
&& CGO_ENABLED=0 go install -a \
-installsuffix cgo \
-ldflags="-s -w" \
github.com/terorie/od-database-crawler
FROM scratch
COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
COPY --from=builder /go/bin/od-database-crawler /bin/
WORKDIR /oddb
VOLUME [ "/oddb" ]
CMD ["/bin/od-database-crawler", "server"]

README.md

@@ -1,54 +1,2 @@
# od-database Go crawler 🚀
[![Build Status](https://travis-ci.org/terorie/od-database-crawler.svg?branch=master)](https://travis-ci.org/terorie/od-database-crawler)
# oddb Go crawler
> by terorie 2018 :P
* Crawler for [__OD-Database__](https://github.com/simon987/od-database)
* Crawls HTTP open directories (standard Web Server Listings)
* Gets name, path, size and modification time of all files
* Lightweight and fast: __over 9000 requests per second__ on a standard laptop
https://od-db.the-eye.eu/
## Usage
### Deploys
1. With Config File (if `config.yml` found in working dir)
- Download [default config](https://github.com/terorie/od-database-crawler/blob/master/config.yml)
- Set `server.url` and `server.token`
- Start with `./od-database-crawler server --config <file>`
2. With Flags or env
- Override config file if it exists
- `--help` for list of flags
- Every flag is available as an environment variable (see the sketch after this list):
`--server.crawl_stats` ➡️ `OD_SERVER_CRAWL_STATS`
- Start with `./od-database-crawler server <flags>`
3. With Docker
```bash
docker run \
-e OD_SERVER_URL=xxx \
-e OD_SERVER_TOKEN=xxx \
terorie/od-database-crawler
```
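
A rough Go sketch of that flag-to-environment rule, mirroring the binding loop in `config.go` (the helper name is illustrative):

```go
package main

import (
	"fmt"
	"strings"
)

// envKey derives the environment variable for a flag the same way the
// config binding does: strip leading dashes, turn dots into underscores,
// upper-case the result, and prefix it with "OD_".
func envKey(flag string) string {
	s := strings.TrimLeft(flag, "-")
	s = strings.Replace(s, ".", "_", -1)
	return "OD_" + strings.ToUpper(s)
}

func main() {
	fmt.Println(envKey("--server.crawl_stats")) // OD_SERVER_CRAWL_STATS
}
```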
### Flag reference
Here are the most important config flags. For finer control, take a look at `/config.yml`.
| Flag/Environment | Description | Example |
| ------------------------------------------------------- | ------------------------------------------------------------ | ----------------------------------- |
| `server.url`<br />`OD_SERVER_URL` | OD-DB Server URL | `https://od-db.mine.the-eye.eu/api` |
| `server.token`<br />`OD_SERVER_TOKEN` | OD-DB Server Access Token | _Ask Hexa **TM**_ |
| `server.recheck`<br />`OD_SERVER_RECHECK` | Job Fetching Interval | `3s` |
| `output.crawl_stats`<br />`OD_OUTPUT_CRAWL_STATS` | Crawl Stats Logging Interval (0 = disabled) | `500ms` |
| `output.resource_stats`<br />`OD_OUTPUT_RESOURCE_STATS` | Resource Stats Logging Interval (0 = disabled) | `8s` |
| `output.log`<br />`OD_OUTPUT_LOG` | Log File (none = disabled) | `crawler.log` |
| `crawl.tasks`<br />`OD_CRAWL_TASKS` | Max number of sites to crawl concurrently | `500` |
| `crawl.connections`<br />`OD_CRAWL_CONNECTIONS` | HTTP connections per site | `1` |
| `crawl.retries`<br />`OD_CRAWL_RETRIES` | How often to retry after a temporary failure (e.g. `HTTP 429` or timeouts) | `5` |
| `crawl.dial_timeout`<br />`OD_CRAWL_DIAL_TIMEOUT` | TCP Connect timeout | `5s` |
| `crawl.timeout`<br />`OD_CRAWL_TIMEOUT` | HTTP request timeout | `20s` |
| `crawl.user-agent`<br />`OD_CRAWL_USER_AGENT` | HTTP Crawler User-Agent | `googlebot/1.2.3` |
| `crawl.job_buffer`<br />`OD_CRAWL_JOB_BUFFER` | Number of URLs to keep in memory/cache, per job. The rest is offloaded to disk. Decrease this value if the crawler uses too much RAM. (0 = Disable Cache, -1 = Only use Cache) | `5000` |

config.go

@@ -1,184 +1,62 @@
package main
import (
"bufio"
"fmt"
"github.com/sirupsen/logrus"
"github.com/spf13/pflag"
"github.com/spf13/viper"
"io"
"os"
"strings"
"time"
)
var config struct {
TrackerUrl string
TrackerProject int
TrackerAlias string
WsBucketScheme string
WsBucketHost string
ServerTimeout time.Duration
Recheck time.Duration
ChunkSize int64
ServerUrl string
Token string
Retries int
Workers int
UserAgent string
Tasks int32
CrawlStats time.Duration
AllocStats time.Duration
Verbose bool
PrintHTTP bool
JobBufferSize int
}
var onlineMode bool
const (
ConfTrackerUrl = "server.url"
ConfTrackerProject = "server.project"
ConfTrackerAlias = "server.alias"
ConfWsBucketScheme = "server.ws_bucket_scheme"
ConfWsBucketHost = "server.ws_bucket_host"
ConfServerTimeout = "server.timeout"
ConfRecheck = "server.recheck"
ConfCooldown = "server.cooldown"
ConfChunkSize = "server.upload_chunk"
ConfUploadRetries = "server.upload_retries"
ConfUploadRetryInterval = "server.upload_retry_interval"
ConfServerUrl = "server.url"
ConfToken = "server.token"
ConfTasks = "crawl.tasks"
ConfRetries = "crawl.retries"
ConfWorkers = "crawl.connections"
ConfUserAgent = "crawl.user-agent"
ConfDialTimeout = "crawl.dial_timeout"
ConfTimeout = "crawl.timeout"
ConfJobBufferSize = "crawl.job_buffer"
ConfCrawlStats = "output.crawl_stats"
ConfAllocStats = "output.resource_stats"
ConfVerbose = "output.verbose"
ConfPrintHTTP = "output.http"
ConfLogFile = "output.log"
)
func prepareConfig() {
pf := rootCmd.PersistentFlags()
pf.SortFlags = false
pf.StringVar(&configFile, "config", "", "Config file")
configFile = os.Getenv("OD_CONFIG")
pf.String(ConfTrackerUrl, "https://tt.the-eye.eu/api", "task_tracker api URL")
pf.String(ConfTrackerProject, "1", "task_tracker project id")
pf.String(ConfWsBucketScheme, "wss", "ws_bucket scheme")
pf.String(ConfWsBucketHost, "wsb.the-eye.eu", "ws_bucket host")
pf.String(ConfTrackerAlias, "changeme", "task_tracker worker alias")
pf.Duration(ConfServerTimeout, 60*time.Second, "OD-DB request timeout")
pf.Duration(ConfRecheck, 1*time.Second, "OD-DB: Poll interval for new jobs")
pf.Duration(ConfCooldown, 1*time.Minute, "OD-DB: Time to wait after a server-side error")
pf.String(ConfChunkSize, "1 MB", "OD-DB: Result upload chunk size")
pf.Uint(ConfUploadRetries, 10, "OD-DB: Max upload retries")
pf.Duration(ConfUploadRetryInterval, 30*time.Second, "OD-DB: Time to wait between upload retries")
pf.Uint(ConfTasks, 25, "Crawler: Max concurrent tasks")
pf.Uint(ConfWorkers, 1, "Crawler: Connections per server")
pf.Uint(ConfRetries, 5, "Crawler: Request retries")
pf.Duration(ConfDialTimeout, 10*time.Second, "Crawler: Handshake timeout")
pf.Duration(ConfTimeout, 30*time.Second, "Crawler: Request timeout")
pf.String(ConfUserAgent, "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0", "Crawler: User-Agent")
pf.Int(ConfJobBufferSize, -1, "Crawler: Task queue cache size")
pf.Duration(ConfCrawlStats, 500*time.Second, "Log: Crawl stats interval")
pf.Duration(ConfAllocStats, 500*time.Second, "Log: Resource stats interval")
pf.Bool(ConfVerbose, false, "Log: Print every listed dir")
pf.Bool(ConfPrintHTTP, false, "Log: Print HTTP client errors")
pf.String(ConfLogFile, "crawler.log", "Log file")
// Bind all flags to Viper
pf.VisitAll(func(flag *pflag.Flag) {
s := flag.Name
s = strings.TrimLeft(s, "-")
if err := viper.BindPFlag(s, flag); err != nil {
panic(err)
}
var envKey string
envKey = strings.Replace(s, ".", "_", -1)
envKey = strings.ToUpper(envKey)
envKey = "OD_" + envKey
if err := viper.BindEnv(s, envKey); err != nil {
panic(err)
}
})
viper.SetDefault(ConfRetries, 5)
viper.SetDefault(ConfWorkers, 2)
viper.SetDefault(ConfTasks, 3)
viper.SetDefault(ConfCrawlStats, 3 * time.Second)
viper.SetDefault(ConfAllocStats, 0)
viper.SetDefault(ConfVerbose, false)
}
func readConfig() {
// If config.yml in working dir, use it
if configFile == "" {
_, err := os.Stat("config.yml")
if err == nil {
configFile = "config.yml"
}
}
if configFile != "" {
confF, err := os.Open(configFile)
viper.AddConfigPath(".")
viper.SetConfigName("config")
err := viper.ReadInConfig()
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
defer confF.Close()
viper.SetConfigType("yml")
err = viper.ReadConfig(confF)
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
}
config.ServerUrl = viper.GetString(ConfServerUrl)
//if config.ServerUrl == "" {
// configMissing(ConfServerUrl)
//}
if onlineMode {
config.TrackerUrl = viper.GetString(ConfTrackerUrl)
if config.TrackerUrl == "" {
configMissing(ConfTrackerUrl)
}
config.TrackerUrl = strings.TrimRight(config.TrackerUrl, "/")
}
config.TrackerProject = viper.GetInt(ConfTrackerProject)
config.TrackerAlias = viper.GetString(ConfTrackerAlias)
config.WsBucketHost = viper.GetString(ConfWsBucketHost)
config.WsBucketScheme = viper.GetString(ConfWsBucketScheme)
config.ServerTimeout = viper.GetDuration(ConfServerTimeout)
config.Recheck = viper.GetDuration(ConfRecheck)
config.ChunkSize = int64(viper.GetSizeInBytes(ConfChunkSize))
if config.ChunkSize < 100 {
configOOB(ConfChunkSize, config.ChunkSize)
}
config.Token = viper.GetString(ConfToken)
//if config.Token == "" {
// configMissing(ConfToken)
//}
config.Retries = viper.GetInt(ConfRetries)
if config.Retries < 0 {
@@ -195,33 +73,14 @@ func readConfig() {
configOOB(ConfTasks, int(config.Tasks))
}
config.UserAgent = viper.GetString(ConfUserAgent)
config.CrawlStats = viper.GetDuration(ConfCrawlStats)
setDialTimeout(viper.GetDuration(ConfDialTimeout))
setTimeout(viper.GetDuration(ConfTimeout))
config.JobBufferSize = viper.GetInt(ConfJobBufferSize)
config.AllocStats = viper.GetDuration(ConfAllocStats)
config.Verbose = viper.GetBool(ConfVerbose)
if config.Verbose {
logrus.SetLevel(logrus.DebugLevel)
}
if filePath := viper.GetString(ConfLogFile); filePath != "" {
f, err := os.OpenFile(filePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
bufWriter := bufio.NewWriter(f)
if err != nil {
panic(err)
}
exitHooks.Add(func() {
bufWriter.Flush()
f.Close()
})
logrus.SetOutput(io.MultiWriter(os.Stdout, bufWriter))
}
config.PrintHTTP = viper.GetBool(ConfPrintHTTP)
}
func configMissing(key string) {
@@ -229,7 +88,7 @@ func configMissing(key string) {
os.Exit(1)
}
func configOOB(key string, v interface{}) {
fmt.Fprintf(os.Stderr, "config: illegal value %v for key %s!\n", v, key)
func configOOB(key string, v int) {
fmt.Fprintf(os.Stderr, "config: illegal value %d for %key!\n", v, key)
os.Exit(1)
}

config.yml

@@ -1,84 +1,26 @@
# OD-Database server settings
server:
# Connection URL
url: https://tt.the-eye.eu/api
# OD-Database project id (for crawling)
project: 1
# Your worker alias
alias: changeme
# Websocket bucket host & scheme (ws/wss)
ws_bucket_host: https://wsb.the-eye.eu
ws_bucket_scheme: wss
# Request timeout
timeout: 60s
# Recheck interval
# The crawler periodically asks the server
# for new jobs. Sets the minimum wait time
# between /task/get requests to the server.
recheck: 1s
# Time to wait after receiving an error
# from the server. Doesn't apply to uploads.
cooldown: 1s
upload_retries: 10
upload_retry_interval: 30s
url: localhost:6969
# Server auth token
token:
# Log output settings
output:
# Crawl statistics
crawl_stats: 1m
crawl_stats: 1s
# CPU/RAM/Job queue stats
resource_stats: 1m
resource_stats: 1s
# More output? (Every listed dir)
verbose: false
# Print HTTP errors (Super spammy)
http: false
# Log file
# If empty, no log file is created.
log: crawler.log
# Crawler settings
crawl:
# Number of sites that can be processed at once
tasks: 25
# Number of sites that can be
# processed at once
tasks: 3
# Number of connections per site
# Please be careful with this setting!
# The crawler fires requests fast; more than
# ten connections can overwhelm a server.
connections: 1
connections: 2
# How often to retry getting data
# from the site before giving up
retries: 5
# Time before discarding a failed connection attempt
dial_timeout: 10s
# Time before discarding a network request
timeout: 30s
# Crawler User-Agent
# If empty, no User-Agent header is sent.
user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"
# Job buffer size (per task)
# Higher values cause fewer disk writes
# but require more memory.
#
# The job queue contains all URLs
# that should be crawled next.
# As it grows very large over time,
# it's kept mainly on disk.
# This sets how many jobs are kept
# in memory.
# A negative value will cause all jobs
# to be stored in memory. (Don't do this)
job_buffer: -1

crawl.go

@@ -2,44 +2,29 @@ package main
import (
"bytes"
"crypto/tls"
"github.com/terorie/od-database-crawler/ds/redblackhash"
"github.com/terorie/od-database-crawler/fasturl"
"fmt"
"github.com/sirupsen/logrus"
"github.com/terorie/oddb-go/ds/redblackhash"
"github.com/terorie/oddb-go/fasturl"
"github.com/terorie/oddb-go/runes"
"github.com/terorie/oddb-go/runespath"
"github.com/valyala/fasthttp"
"golang.org/x/crypto/blake2b"
"golang.org/x/net/html"
"net"
"golang.org/x/net/html/atom"
"path"
"strconv"
"strings"
"time"
)
var client = fasthttp.Client {
TLSConfig: &tls.Config{
InsecureSkipVerify: true,
},
}
func setDialTimeout(d time.Duration) {
client.Dial = func(addr string) (net.Conn, error) {
return fasthttp.DialTimeout(addr, d)
}
}
func setTimeout(d time.Duration) {
client.ReadTimeout = d
client.WriteTimeout = d / 2
}
var client fasthttp.Client
func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
f.IsDir = true
f.Name = path.Base(j.Uri.Path)
f.Name = runespath.Base(j.Uri.Path)
req := fasthttp.AcquireRequest()
if config.UserAgent != "" {
req.Header.SetUserAgent(config.UserAgent)
}
req.SetRequestURI(j.UriStr)
res := fasthttp.AcquireResponse()
@@ -49,83 +34,84 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
fasthttp.ReleaseRequest(req)
if err != nil {
logrus.Error(err)
return
}
err = checkStatusCode(res.StatusCode())
if err != nil {
return
}
if err != nil { return }
body := res.Body()
return ParseDir(body, &j.Uri)
}
func ParseDir(body []byte, baseUri *fasturl.URL) (links []fasturl.URL, err error) {
doc := html.NewTokenizer(bytes.NewReader(body))
var linkHref string
var linkTexts []string
for {
err = nil
tokenType := doc.Next()
token := doc.Token()
if tokenType == html.ErrorToken {
break
}
switch tokenType {
case html.StartTagToken:
name, hasAttr := doc.TagName()
if len(name) == 1 && name[0] == 'a' {
for hasAttr {
var ks, vs []byte
ks, vs, hasAttr = doc.TagAttr()
if bytes.Equal(ks, []byte("href")) {
// TODO Check escape
linkHref = string(vs)
if token.DataAtom == atom.A {
for _, attr := range token.Attr {
if attr.Key == "href" {
linkHref = attr.Val
break
}
}
}
case html.TextToken:
if linkHref != "" {
linkTexts = append(linkTexts, token.Data)
}
case html.EndTagToken:
name, _ := doc.TagName()
if len(name) == 1 && name[0] == 'a' {
if linkHref != "" && token.DataAtom == atom.A {
// Copy params
href := linkHref
linkText := strings.Join(linkTexts, " ")
// Reset params
linkHref = ""
linkTexts = nil
if strings.LastIndexByte(href, '?') != -1 {
continue
// TODO Optimized decision tree
for _, entry := range urlBlackList {
if href == entry {
goto nextToken
}
switch href {
case "", " ", ".", "..", "/":
continue
}
if strings.Contains(href, "../") {
continue
for _, entry := range urlPartBlackList {
if strings.Contains(href, entry) {
goto nextToken
}
}
for _, entry := range fileNameBlackList {
if strings.Contains(linkText, entry) {
goto nextToken
}
}
var link fasturl.URL
err = baseUri.ParseRel(&link, href)
if err != nil {
continue
}
err = j.Uri.ParseRel(&link, []rune(href))
if err != nil { continue }
if link.Scheme != baseUri.Scheme ||
link.Host != baseUri.Host ||
link.Path == baseUri.Path ||
!strings.HasPrefix(link.Path, baseUri.Path) {
if !runes.Equals(link.Scheme, j.Uri.Scheme) ||
!runes.Equals(link.Host, j.Uri.Host) ||
runes.Equals(link.Path, j.Uri.Path) ||
!runes.HasPrefix(link.Path, j.Uri.Path) {
continue
}
links = append(links, link)
}
}
nextToken:
}
return
@@ -133,15 +119,12 @@ func ParseDir(body []byte, baseUri *fasturl.URL) (links []fasturl.URL, err error
func GetFile(u fasturl.URL, f *File) (err error) {
f.IsDir = false
u.Path = path.Clean(u.Path)
f.Name = path.Base(u.Path)
f.Path = strings.Trim(path.Dir(u.Path), "/")
u.Path = []rune(path.Clean(string(u.Path)))
f.Name = runespath.Base(u.Path)
f.Path = runes.TrimRune(u.Path, '/')
req := fasthttp.AcquireRequest()
req.Header.SetMethod("HEAD")
if config.UserAgent != "" {
req.Header.SetUserAgent(config.UserAgent)
}
req.SetRequestURI(u.String())
res := fasthttp.AcquireResponse()
@@ -151,69 +134,87 @@ func GetFile(u fasturl.URL, f *File) (err error) {
err = client.Do(req, res)
fasthttp.ReleaseRequest(req)
if err != nil {
return
}
if err != nil { return }
err = checkStatusCode(res.StatusCode())
if err != nil {
return
}
if err != nil { return }
f.applyContentLength(string(res.Header.Peek("content-length")))
f.applyLastModified(string(res.Header.Peek("last-modified")))
// TODO Inefficient af
header := res.Header.Header()
f.ParseHeader(header)
return nil
}
func (f *File) HashDir(links []fasturl.URL) (o redblackhash.Key) {
h, _ := blake2b.New256(nil)
h.Write([]byte(f.Name))
h.Write([]byte(string(f.Name)))
for _, link := range links {
fileName := path.Base(link.Path)
h.Write([]byte(fileName))
fileName := runespath.Base(link.Path)
h.Write([]byte(string(fileName)))
}
sum := h.Sum(nil)
copy(o[:redblackhash.KeySize], sum)
return
}
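
HashDir fingerprints a listing: the directory name and every child name are fed through BLAKE2b-256, and the fixed-size sum becomes the dedup key stored in the red-black tree. A standalone sketch of the same idea (names are illustrative, not the crawler's API):

```go
package main

import (
	"fmt"

	"golang.org/x/crypto/blake2b"
)

// listingKey hashes a directory name plus its child names into a fixed
// 32-byte key, so a re-visited listing hashes to the same value.
func listingKey(dir string, children []string) [32]byte {
	h, _ := blake2b.New256(nil) // error only for oversized keys; nil key is safe
	h.Write([]byte(dir))
	for _, c := range children {
		h.Write([]byte(c))
	}
	var key [32]byte
	copy(key[:], h.Sum(nil))
	return key
}

func main() {
	k := listingKey("public", []string{"Books/", "Games/", "rclone_guide.pdf"})
	fmt.Printf("%x\n", k)
}
```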
func (f *File) applyContentLength(v string) {
if v == "" {
return
func (f *File) ParseHeader(h []byte) {
var k1, k2 int
var v1, v2 int
// Simple finite state machine
state := 0
for i, b := range h {
switch state {
case 0:
if b == byte(':') {
state = 1
k2 = i
}
size, err := strconv.ParseInt(v, 10, 64)
if err != nil {
return
case 1:
state = 2
case 2:
state = 3
v1 = i
case 3:
if b == byte('\r') {
state = 4
}
if size < 0 {
return
case 4:
state = 0
v2 = i - 1
key := string(h[k1:k2])
val := string(h[v1:v2])
k1 = i
f.applyHeader(key, val)
}
f.Size = size
}
}
// TODO Cleanup
func (f *File) applyLastModified(v string) {
if v == "" {
return
}
var t time.Time
func (f *File) applyHeader(k, v string) {
switch k {
case "content-length":
size, err := strconv.ParseInt(v, 10, 64)
if err != nil { break }
if size < 0 { break }
f.Size = size
case "last-modified":
var err error
t, err = time.Parse(time.RFC1123, v)
if err == nil {
f.MTime = t.Unix()
return
}
t, err = time.Parse(time.RFC850, v)
if err == nil {
f.MTime = t.Unix()
return
}
f.MTime, err = time.Parse(time.RFC1123, v)
if err == nil { break }
f.MTime, err = time.Parse(time.RFC850, v)
if err == nil { break }
// TODO Parse asctime
t, err = time.Parse("2006-01-02", v[:10])
if err == nil {
f.MTime = t.Unix()
return
f.MTime, err = time.Parse("2006-01-02", v[:10])
if err == nil { break }
}
}
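
The state machine above scans raw `Name: Value\r\n` header bytes in a single pass. Note that in case 4, k1 is set to the index of the trailing '\n' itself, so every header name after the first carries a stray leading newline, and the lowercase comparisons in applyHeader assume lowercase names on the wire. A standalone sketch of the same single-pass idea with those details adjusted (names are illustrative, not the crawler's API):

```go
package main

import (
	"fmt"
	"strings"
)

// parseRawHeader walks raw "Name: Value\r\n" bytes once and emits each
// key/value pair, lower-casing names so lookups are case-insensitive.
func parseRawHeader(h []byte, emit func(key, val string)) {
	var k1, k2, v1 int
	state := 0
	for i, b := range h {
		switch state {
		case 0: // in header name, until ':'
			if b == ':' {
				k2 = i
				state = 1
			}
		case 1: // skip the single space after ':'
			v1 = i + 1
			state = 2
		case 2: // in value, until '\r'
			if b == '\r' {
				emit(strings.ToLower(string(h[k1:k2])), string(h[v1:i]))
				state = 3
			}
		case 3: // consume '\n'; the next byte starts a new name
			k1 = i + 1
			state = 0
		}
	}
}

func main() {
	raw := []byte("Content-Length: 1024\r\nLast-Modified: Tue, 06 Nov 2018 01:44:09 GMT\r\n")
	parseRawHeader(raw, func(k, v string) { fmt.Printf("%s = %q\n", k, v) })
}
```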
@@ -221,7 +222,53 @@ func checkStatusCode(status int) error {
switch status {
case fasthttp.StatusOK:
return nil
case fasthttp.StatusTooManyRequests:
return ErrRateLimit
case fasthttp.StatusForbidden,
fasthttp.StatusUnauthorized:
return ErrForbidden
default:
return &HttpError{status}
return fmt.Errorf("got HTTP status %d", status)
}
}
var urlBlackList = [...]string {
"",
" ",
".",
"..",
"/",
}
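// Apache-style autoindex sort links ("?C=N&O=D" sorts by name,
// descending); following them would only re-crawl the same listing.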
var urlPartBlackList = [...]string {
"?C=N&O=D",
"?C=M&O=A",
"?C=S&O=A",
"?C=D&O=A",
"?C=N;O=D",
"?C=M;O=A",
"?C=M&O=D",
"?C=S;O=A",
"?C=S&O=D",
"?C=D;O=A",
"?MA",
"?SA",
"?DA",
"?ND",
"?C=N&O=A",
"?C=N&O=A",
"?M=A",
"?N=D",
"?S=A",
"?D=A",
}
var fileNameBlackList = [...]string {
"Parent Directory",
" Parent Directory",
"../",
}

File diff suppressed because it is too large.


@@ -1,117 +0,0 @@
package main
import (
"github.com/terorie/od-database-crawler/fasturl"
"testing"
)
func TestParseDirNginx(t *testing.T) {
var u fasturl.URL
err := u.Parse("https://the-eye.eu/public/")
if err != nil {
t.Fatal("Failed to parse URL", err)
}
links, err := ParseDir([]byte(nginxListing), &u)
if err != nil {
t.Fatal("Failed to extract links", err)
}
if len(links) != len(nginxLinks) {
t.Fatalf("Expected %d links, got %d",
len(nginxLinks), len(links))
}
for i := 0; i < len(links); i++ {
gotLink := links[i].String()
expLink := nginxLinks[i]
if gotLink != expLink {
t.Errorf(`Expected "%s" got "%s"`,
expLink, gotLink)
}
}
}
var nginxLinks = []string {
"https://the-eye.eu/public/AppleArchive/",
"https://the-eye.eu/public/AudioBooks/",
"https://the-eye.eu/public/Books/",
"https://the-eye.eu/public/Comics/",
"https://the-eye.eu/public/Games/",
"https://the-eye.eu/public/Icons/",
"https://the-eye.eu/public/Images/",
"https://the-eye.eu/public/JFK_Files/",
"https://the-eye.eu/public/MSDN/",
"https://the-eye.eu/public/Music/",
"https://the-eye.eu/public/Operating%20Systems/",
"https://the-eye.eu/public/Posters/",
"https://the-eye.eu/public/Psychedelics/",
"https://the-eye.eu/public/Psychoactives/",
"https://the-eye.eu/public/Radio/",
"https://the-eye.eu/public/Random/",
"https://the-eye.eu/public/Site-Dumps/",
"https://the-eye.eu/public/Software/",
"https://the-eye.eu/public/Strategic%20Intelligence%20Network/",
"https://the-eye.eu/public/WorldTracker.org/",
"https://the-eye.eu/public/concen.org/",
"https://the-eye.eu/public/freenrg.info/",
"https://the-eye.eu/public/murdercube.com/",
"https://the-eye.eu/public/parazite/",
"https://the-eye.eu/public/ripreddit/",
"https://the-eye.eu/public/rom/",
"https://the-eye.eu/public/touhou/",
"https://the-eye.eu/public/vns/",
"https://the-eye.eu/public/xbins/",
"https://the-eye.eu/public/xbins.diodematrix/",
"https://the-eye.eu/public/Rclone_for_Scrubs.pdf",
"https://the-eye.eu/public/Wget_Linux_Guide.pdf",
"https://the-eye.eu/public/Wget_Windows_Guide.pdf",
"https://the-eye.eu/public/rclone_guide.pdf",
"https://the-eye.eu/public/wget-noobs-guide.pdf",
"https://the-eye.eu/public/xbox-scene_Aug2014.7z",
}
const nginxListing =
`<html>
<head><title>Index of /public/</title></head>
<body bgcolor="white">
<h1>Index of /public/</h1><hr><pre><a href="../">../</a>
<a href="AppleArchive/">AppleArchive/</a> 03-Nov-2017 18:13 -
<a href="AudioBooks/">AudioBooks/</a> 29-Sep-2018 19:47 -
<a href="Books/">Books/</a> 27-Nov-2018 17:50 -
<a href="Comics/">Comics/</a> 05-Nov-2018 21:37 -
<a href="Games/">Games/</a> 28-Nov-2018 11:54 -
<a href="Icons/">Icons/</a> 22-May-2018 07:47 -
<a href="Images/">Images/</a> 21-Jan-2018 03:21 -
<a href="JFK_Files/">JFK_Files/</a> 03-Nov-2017 17:03 -
<a href="MSDN/">MSDN/</a> 03-Nov-2017 15:48 -
<a href="Music/">Music/</a> 02-Mar-2018 15:47 -
<a href="Operating%20Systems/">Operating Systems/</a> 25-Apr-2018 07:18 -
<a href="Posters/">Posters/</a> 07-Jul-2018 01:12 -
<a href="Psychedelics/">Psychedelics/</a> 11-Apr-2018 05:45 -
<a href="Psychoactives/">Psychoactives/</a> 18-May-2018 02:58 -
<a href="Radio/">Radio/</a> 09-Jun-2018 15:49 -
<a href="Random/">Random/</a> 04-Dec-2018 12:33 -
<a href="Site-Dumps/">Site-Dumps/</a> 15-Dec-2018 11:04 -
<a href="Software/">Software/</a> 27-Nov-2017 00:22 -
<a href="Strategic%20Intelligence%20Network/">Strategic Intelligence Network/</a> 17-Nov-2017 16:35 -
<a href="WorldTracker.org/">WorldTracker.org/</a> 12-Apr-2018 04:16 -
<a href="concen.org/">concen.org/</a> 08-Oct-2018 14:08 -
<a href="freenrg.info/">freenrg.info/</a> 19-Dec-2017 10:59 -
<a href="murdercube.com/">murdercube.com/</a> 06-Dec-2017 10:45 -
<a href="parazite/">parazite/</a> 20-Nov-2017 21:25 -
<a href="ripreddit/">ripreddit/</a> 04-Aug-2018 14:30 -
<a href="rom/">rom/</a> 28-Nov-2018 14:15 -
<a href="touhou/">touhou/</a> 03-Nov-2017 11:07 -
<a href="vns/">vns/</a> 03-Nov-2017 11:36 -
<a href="xbins/">xbins/</a> 03-Nov-2017 17:23 -
<a href="xbins.diodematrix/">xbins.diodematrix/</a> 21-Sep-2018 22:33 -
<a href="Rclone_for_Scrubs.pdf">Rclone_for_Scrubs.pdf</a> 04-Sep-2018 13:31 315K
<a href="Wget_Linux_Guide.pdf">Wget_Linux_Guide.pdf</a> 21-Dec-2017 20:28 168K
<a href="Wget_Windows_Guide.pdf">Wget_Windows_Guide.pdf</a> 25-Nov-2017 17:59 867K
<a href="rclone_guide.pdf">rclone_guide.pdf</a> 03-Sep-2018 23:37 315K
<a href="wget-noobs-guide.pdf">wget-noobs-guide.pdf</a> 21-Dec-2017 20:29 168K
<a href="xbox-scene_Aug2014.7z">xbox-scene_Aug2014.7z</a> 26-Oct-2017 23:09 1G
</pre><hr></body>
</html>`


@@ -1,59 +0,0 @@
package main
import (
"bytes"
"github.com/PuerkitoBio/goquery"
"github.com/terorie/od-database-crawler/fasturl"
"net/url"
"strings"
"testing"
)
func BenchmarkParseDir(b *testing.B) {
for n := 0; n < b.N; n++ {
var u fasturl.URL
err := u.Parse("http://archive.ubuntu.com/ubuntu/indices/")
if err != nil {
b.Fatal("Failed to parse URL", err)
}
_, err = ParseDir([]byte(apache2Listing), &u)
if err != nil {
b.Fatal("Failed to extract links", err)
}
}
}
func BenchmarkParseDirReference(b *testing.B) {
for n := 0; n < b.N; n++ {
u, err := url.Parse("http://archive.ubuntu.com/ubuntu/indices/")
if err != nil {
b.Fatal("Failed to parse URL", err)
}
_, err = referenceParseDir([]byte(apache2Listing), u)
if err != nil {
b.Fatal("Failed to extract links", err)
}
}
}
func referenceParseDir(body []byte, baseUri *url.URL) (links []*url.URL, err error) {
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
if err != nil { return nil, err }
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
href, _ := s.Attr("href")
sub, err := baseUri.Parse(href)
if err != nil { return } // continue
if !strings.HasPrefix(sub.String(), baseUri.String()) {
return // continue
}
links = append(links, sub)
})
return
}


@@ -14,9 +14,7 @@
package redblackhash
import (
"bytes"
"fmt"
"sync"
)
const (
@@ -29,7 +27,6 @@ type Key [KeySize]byte
// Tree holds elements of the red-black tree
type Tree struct {
sync.Mutex
Root *Node
size int
}
@@ -44,7 +41,42 @@ type Node struct {
}
func (k *Key) Compare(o *Key) int {
return bytes.Compare(k[:], o[:])
// TODO Assembly
/*for i := 0; i < KeySize / 8; i++ {
a := uint64(k[i+0] ) |
uint64(k[i+1] >> 8) |
uint64(k[i+2] >> 16) |
uint64(k[i+3] >> 24) |
uint64(k[i+4] >> 32) |
uint64(k[i+5] >> 40) |
uint64(k[i+6] >> 48) |
uint64(k[i+7] >> 56)
b := uint64(o[i+0] ) |
uint64(o[i+1] >> 8) |
uint64(o[i+2] >> 16) |
uint64(o[i+3] >> 24) |
uint64(o[i+4] >> 32) |
uint64(o[i+5] >> 40) |
uint64(o[i+6] >> 48) |
uint64(o[i+7] >> 56)
switch {
case a < b:
return -1
case a > b:
return 1
}
}*/
for i := 0; i < KeySize; i++ {
switch {
case k[i] < o[i]:
return -1
case k[i] > o[i]:
return 1
}
}
return 0
}
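
The commented-out word-wise variant above shifts each byte right, where building a big-endian word needs left shifts after widening, so all but the first byte of each word collapse to zero (and it indexes overlapping windows rather than stepping by 8). A working sketch of the intended optimization using the standard library, assuming KeySize stays a multiple of 8:

```go
import "encoding/binary"

// compareWords compares 8 bytes per step instead of 1. Reading each
// chunk big-endian keeps the numeric order identical to bytes.Compare.
func compareWords(k, o *Key) int {
	for i := 0; i < KeySize; i += 8 {
		a := binary.BigEndian.Uint64(k[i : i+8])
		b := binary.BigEndian.Uint64(o[i : i+8])
		switch {
		case a < b:
			return -1
		case a > b:
			return 1
		}
	}
	return 0
}
```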
// Put inserts node into the tree.


@@ -1,45 +1,8 @@
package main
import (
"errors"
"fmt"
"github.com/valyala/fasthttp"
"net"
)
import "errors"
var ErrRateLimit = errors.New("too many requests")
var ErrForbidden = errors.New("access denied")
var ErrKnown = errors.New("already crawled")
type HttpError struct {
code int
}
func (e HttpError) Error() string {
return fmt.Sprintf("http status %d", e.code)
}
func shouldRetry(err error) bool {
// HTTP errors
if httpErr, ok := err.(*HttpError); ok {
switch httpErr.code {
case fasthttp.StatusTooManyRequests:
return true
default:
// Don't retry HTTP error codes
return false
}
}
if dnsError, ok := err.(*net.DNSError); ok {
// Don't retry permanent DNS errors
return dnsError.IsTemporary
}
if netErr, ok := err.(*net.OpError); ok {
// Don't retry permanent network errors
return netErr.Temporary()
}
// Retry by default
return true
}


@@ -15,34 +15,19 @@ package fasturl
import (
"errors"
"fmt"
"github.com/terorie/oddb-go/runes"
"strconv"
"strings"
)
type Scheme int
const (
SchemeInvalid = iota
SchemeHTTP
SchemeHTTPS
SchemeCount
)
var Schemes = [SchemeCount]string {
"",
"http",
"https",
}
var ErrUnknownScheme = errors.New("unknown protocol scheme")
// Error reports an error and the operation and URL that caused it.
type Error struct {
Op string
URL string
URL []rune
Err error
}
func (e *Error) Error() string { return e.Op + " " + e.URL + ": " + e.Err.Error() }
func (e *Error) Error() string { return e.Op + " " + string(e.URL) + ": " + e.Err.Error() }
type timeout interface {
Timeout() bool
@@ -115,7 +100,7 @@ func (e InvalidHostError) Error() string {
//
// Please be informed that for now shouldEscape does not check all
// reserved characters correctly. See golang.org/issue/5684.
func shouldEscape(c byte, mode encoding) bool {
func shouldEscape(c rune, mode encoding) bool {
// §2.3 Unreserved characters (alphanum)
if 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' {
return false
@@ -192,9 +177,29 @@ func shouldEscape(c byte, mode encoding) bool {
return true
}
// QueryUnescape does the inverse transformation of QueryEscape,
// converting each 3-byte encoded substring of the form "%AB" into the
// hex-decoded byte 0xAB.
// It returns an error if any % is not followed by two hexadecimal
// digits.
func QueryUnescape(s []rune) ([]rune, error) {
return unescape(s, encodeQueryComponent)
}
// PathUnescape does the inverse transformation of PathEscape,
// converting each 3-byte encoded substring of the form "%AB" into the
// hex-decoded byte 0xAB. It returns an error if any % is not followed
// by two hexadecimal digits.
//
// PathUnescape is identical to QueryUnescape except that it does not
// unescape '+' to ' ' (space).
func PathUnescape(s []rune) ([]rune, error) {
return unescape(s, encodePathSegment)
}
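
A quick illustration of that contract, using the rune-based API from this commit (a sketch; results shown as comments):

```go
s, err := QueryUnescape([]rune("a%20b%2Fc"))
// s == []rune("a b/c"), err == nil
_, err = QueryUnescape([]rune("100%zz"))
// err != nil: '%' not followed by two hexadecimal digits
```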
// unescape unescapes a string; the mode specifies
// which section of the URL string is being unescaped.
func unescape(s string, mode encoding) (string, error) {
func unescape(s []rune, mode encoding) ([]rune, error) {
// Count %, check that they're well-formed.
n := 0
hasPlus := false
@@ -202,12 +207,12 @@ func unescape(s string, mode encoding) (string, error) {
switch s[i] {
case '%':
n++
if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
if i+2 >= len(s) || !ishex(byte(s[i+1])) || !ishex(byte(s[i+2])) {
s = s[i:]
if len(s) > 3 {
s = s[:3]
}
return "", EscapeError(s)
return nil, EscapeError(s)
}
// Per https://tools.ietf.org/html/rfc3986#page-21
// in the host component %-encoding can only be used
@@ -215,8 +220,8 @@ func unescape(s string, mode encoding) (string, error) {
// But https://tools.ietf.org/html/rfc6874#section-2
// introduces %25 being allowed to escape a percent sign
// in IPv6 scoped-address literals. Yay.
if mode == encodeHost && unhex(s[i+1]) < 8 && s[i:i+3] != "%25" {
return "", EscapeError(s[i : i+3])
if mode == encodeHost && unhex(byte(s[i+1])) < 8 && !runes.Equals(s[i:i+3], []rune("%25")) {
return nil, EscapeError(s[i : i+3])
}
if mode == encodeZone {
// RFC 6874 says basically "anything goes" for zone identifiers
@@ -226,9 +231,9 @@ func unescape(s string, mode encoding) (string, error) {
// That is, you can use escaping in the zone identifier but not
// to introduce bytes you couldn't just write directly.
// But Windows puts spaces here! Yay.
v := unhex(s[i+1])<<4 | unhex(s[i+2])
if s[i:i+3] != "%25" && v != ' ' && shouldEscape(v, encodeHost) {
return "", EscapeError(s[i : i+3])
v := unhex(byte(s[i+1]))<<4 | unhex(byte(s[i+2]))
if !runes.Equals(s[i:i+3], []rune("%25")) && v != ' ' && shouldEscape(rune(v), encodeHost) {
return nil, EscapeError(s[i : i+3])
}
}
i += 3
@@ -237,7 +242,7 @@ func unescape(s string, mode encoding) (string, error) {
i++
default:
if (mode == encodeHost || mode == encodeZone) && s[i] < 0x80 && shouldEscape(s[i], mode) {
return "", InvalidHostError(s[i : i+1])
return nil, InvalidHostError(s[i : i+1])
}
i++
}
@@ -252,7 +257,7 @@ func unescape(s string, mode encoding) (string, error) {
for i := 0; i < len(s); {
switch s[i] {
case '%':
t[j] = unhex(s[i+1])<<4 | unhex(s[i+2])
t[j] = unhex(byte(s[i+1]))<<4 | unhex(byte(s[i+2]))
j++
i += 3
case '+':
@@ -264,15 +269,27 @@ func unescape(s string, mode encoding) (string, error) {
j++
i++
default:
t[j] = s[i]
t[j] = byte(s[i])
j++
i++
}
}
return string(t), nil
return []rune(string(t)), nil
}
func escape(s string, mode encoding) string {
// QueryEscape escapes the string so it can be safely placed
// inside a URL query.
func QueryEscape(s []rune) []rune {
return escape(s, encodeQueryComponent)
}
// PathEscape escapes the string so it can be safely placed
// inside a URL path segment.
func PathEscape(s []rune) []rune {
return escape(s, encodePathSegment)
}
func escape(s []rune, mode encoding) []rune {
spaceCount, hexCount := 0, 0
for i := 0; i < len(s); i++ {
c := s[i]
@@ -302,11 +319,11 @@ func escape(s string, mode encoding) string {
t[j+2] = "0123456789ABCDEF"[c&15]
j += 3
default:
t[j] = s[i]
t[j] = byte(s[i])
j++
}
}
return string(t)
return []rune(string(t))
}
// A URL represents a parsed URL (technically, a URI reference).
@@ -327,15 +344,19 @@ func escape(s string, mode encoding) string {
// and URL's String method uses RawPath if it is a valid encoding of Path,
// by calling the EscapedPath method.
type URL struct {
Scheme Scheme
Host string // host or host:port
Path string // path (relative paths may omit leading slash)
Scheme []rune
Opaque []rune // encoded opaque data
Host []rune // host or host:port
Path []rune // path (relative paths may omit leading slash)
RawPath []rune // encoded path hint (see EscapedPath method)
ForceQuery bool // append a query ('?') even if RawQuery is empty
RawQuery []rune // encoded query values, without '?'
}
// Maybe rawurl is of the form scheme:path.
// (Scheme must be [a-zA-Z][a-zA-Z0-9+-.]*)
// If so, return scheme, path; else return "", rawurl.
func getscheme(rawurl string) (scheme Scheme, path string, err error) {
func getscheme(rawurl []rune) (scheme []rune, path []rune, err error) {
for i := 0; i < len(rawurl); i++ {
c := rawurl[i]
switch {
@@ -343,42 +364,34 @@ func getscheme(rawurl string) (scheme Scheme, path string, err error) {
// do nothing
case '0' <= c && c <= '9' || c == '+' || c == '-' || c == '.':
if i == 0 {
return SchemeInvalid, rawurl, nil
return nil, rawurl, nil
}
case c == ':':
if i == 0 {
return SchemeInvalid, "", errors.New("missing protocol scheme")
return nil, nil, errors.New("missing protocol scheme")
}
switch rawurl[:i] {
case "http":
scheme = SchemeHTTP
case "https":
scheme = SchemeHTTPS
default:
return SchemeInvalid, "", ErrUnknownScheme
}
scheme = rawurl[:i]
path = rawurl[i+1:]
return
default:
// we have encountered an invalid character,
// so there is no valid scheme
return SchemeInvalid, rawurl, nil
return nil, rawurl, nil
}
}
return SchemeInvalid, rawurl, nil
return nil, rawurl, nil
}
// Maybe s is of the form t c u.
// If so, return t, c u (or t, u if cutc == true).
// If not, return s, "".
func split(s string, c string, cutc bool) (string, string) {
i := strings.Index(s, c)
func split(s []rune, c rune, cutc bool) ([]rune, []rune) {
i := strings.Index(string(s), string(c)) // TODO Optimize
if i < 0 {
return s, ""
return s, nil
}
if cutc {
return s[:i], s[i+len(c):]
return s[:i], s[i+1:]
}
return s[:i], s[i:]
}
@@ -389,14 +402,14 @@ func split(s string, c string, cutc bool) (string, string) {
// (starting with a scheme). Trying to parse a hostname and path
// without a scheme is invalid but may not necessarily return an
// error, due to parsing ambiguities.
func (u *URL) Parse(rawurl string) error {
func (u *URL) Parse(rawurl []rune) error {
// Cut off #frag
s, frag := split(rawurl, "#", true)
s, frag := split(rawurl, '#', true)
err := u.parse(s, false)
if err != nil {
return &Error{"parse", s, err}
}
if frag == "" {
if len(frag) == 0 {
return nil
}
return nil
@@ -407,7 +420,7 @@ func (u *URL) Parse(rawurl string) error {
// only as an absolute URI or an absolute path.
// The string rawurl is assumed not to have a #fragment suffix.
// (Web browsers strip #fragment before sending the URL to a web server.)
func (u *URL) ParseRequestURI(rawurl string) error {
func (u *URL) ParseRequestURI(rawurl []rune) error {
err := u.parse(rawurl, true)
if err != nil {
return &Error{"parse", rawurl, err}
@@ -419,16 +432,16 @@ func (u *URL) ParseRequestURI(rawurl string) error {
// viaRequest is true, the URL is assumed to have arrived via an HTTP request,
// in which case only absolute URLs or path-absolute relative URLs are allowed.
// If viaRequest is false, all forms of relative URLs are allowed.
func (u *URL) parse(rawurl string, viaRequest bool) error {
var rest string
func (u *URL) parse(rawurl []rune, viaRequest bool) error {
var rest []rune
var err error
if rawurl == "" && viaRequest {
if len(rawurl) == 0 && viaRequest {
return errors.New("empty url")
}
if rawurl == "*" {
u.Path = "*"
if runes.Equals(rawurl, []rune("*")) {
u.Path = []rune("*")
return nil
}
@@ -438,15 +451,17 @@ func (u *URL) parse(rawurl string, viaRequest bool) error {
return err
}
if strings.HasSuffix(rest, "?") && strings.Count(rest, "?") == 1 {
if runes.HasSuffix(rest, []rune("?")) && runes.Count(rest, '?') == 1 {
u.ForceQuery = true
rest = rest[:len(rest)-1]
} else {
rest, _ = split(rest, "?", true)
rest, u.RawQuery = split(rest, '?', true)
}
if !strings.HasPrefix(rest, "/") {
if u.Scheme != SchemeInvalid {
if !runes.HasPrefix(rest, []rune("/")) {
if len(u.Scheme) != 0 {
// We consider rootless paths per RFC 3986 as opaque.
u.Opaque = rest
return nil
}
if viaRequest {
@@ -459,59 +474,65 @@ func (u *URL) parse(rawurl string, viaRequest bool) error {
// RFC 3986, §3.3:
// In addition, a URI reference (Section 4.1) may be a relative-path reference,
// in which case the first path segment cannot contain a colon (":") character.
colon := strings.Index(rest, ":")
slash := strings.Index(rest, "/")
colon := runes.IndexRune(rest, ':')
slash := runes.IndexRune(rest, '/')
if colon >= 0 && (slash < 0 || colon < slash) {
// First path segment has colon. Not allowed in relative URL.
return errors.New("first path segment in URL cannot contain colon")
}
}
if (u.Scheme != SchemeInvalid || !viaRequest && !strings.HasPrefix(rest, "///")) && strings.HasPrefix(rest, "//") {
var authority string
authority, rest = split(rest[2:], "/", false)
if (len(u.Scheme) != 0 || !viaRequest && !runes.HasPrefix(rest, []rune("///"))) && runes.HasPrefix(rest, []rune("//")) {
var authority []rune
authority, rest = split(rest[2:], '/', false)
u.Host, err = parseAuthority(authority)
if err != nil {
return err
}
}
u.Path = rest
// Set Path and, optionally, RawPath.
// RawPath is a hint of the encoding of Path. We don't want to set it if
// the default escaping of Path is equivalent, to help make sure that people
// don't rely on it in general.
if err := u.setPath(rest); err != nil {
return err
}
return nil
}
func parseAuthority(authority string) (host string, err error) {
i := strings.LastIndex(authority, "@")
func parseAuthority(authority []rune) (host []rune, err error) {
i := runes.LastIndexRune(authority, '@')
if i < 0 {
host, err = parseHost(authority)
} else {
host, err = parseHost(authority[i+1:])
}
if err != nil {
return "", err
return nil, err
}
if i < 0 {
return host, nil
}
userinfo := authority[:i]
if !validUserinfo(userinfo) {
return "", errors.New("fasturl: invalid userinfo")
return nil, errors.New("fasturl: invalid userinfo")
}
return host, nil
}
// parseHost parses host as an authority without user
// information. That is, as host[:port].
func parseHost(host string) (string, error) {
if strings.HasPrefix(host, "[") {
func parseHost(host []rune) ([]rune, error) {
if runes.HasPrefix(host, []rune("[")) {
// Parse an IP-Literal in RFC 3986 and RFC 6874.
// E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80".
i := strings.LastIndex(host, "]")
i := runes.LastIndexRune(host, ']')
if i < 0 {
return "", errors.New("missing ']' in host")
return nil, errors.New("missing ']' in host")
}
colonPort := host[i+1:]
if !validOptionalPort(colonPort) {
return "", fmt.Errorf("invalid port %q after host", colonPort)
return nil, fmt.Errorf("invalid port %q after host", colonPort)
}
// RFC 6874 defines that %25 (%-encoded percent) introduces
@@ -520,35 +541,106 @@ func parseHost(host string) (string, error) {
// can only %-encode non-ASCII bytes.
// We do impose some restrictions on the zone, to avoid stupidity
// like newlines.
zone := strings.Index(host[:i], "%25")
zone := strings.Index(string(host[:i]), "%25")
if zone >= 0 {
host1, err := unescape(host[:zone], encodeHost)
if err != nil {
return "", err
return nil, err
}
host2, err := unescape(host[zone:i], encodeZone)
if err != nil {
return "", err
return nil, err
}
host3, err := unescape(host[i:], encodeHost)
if err != nil {
return "", err
return nil, err
}
return host1 + host2 + host3, nil
// TODO Optimize
return runes.Create(host1, host2, host3), nil
}
}
var err error
if host, err = unescape(host, encodeHost); err != nil {
return "", err
return nil, err
}
return host, nil
}
// setPath sets the Path and RawPath fields of the URL based on the provided
// escaped path p. It maintains the invariant that RawPath is only specified
// when it differs from the default encoding of the path.
// For example:
// - setPath("/foo/bar") will set Path="/foo/bar" and RawPath=""
// - setPath("/foo%2fbar") will set Path="/foo/bar" and RawPath="/foo%2fbar"
// setPath will return an error only if the provided path contains an invalid
// escaping.
func (u *URL) setPath(p []rune) error {
path, err := unescape(p, encodePath)
if err != nil {
return err
}
u.Path = path
if escp := escape(path, encodePath); runes.Equals(p, escp) {
// Default encoding is fine.
u.RawPath = nil
} else {
u.RawPath = p
}
return nil
}
// EscapedPath returns the escaped form of u.Path.
// In general there are multiple possible escaped forms of any path.
// EscapedPath returns u.RawPath when it is a valid escaping of u.Path.
// Otherwise EscapedPath ignores u.RawPath and computes an escaped
// form on its own.
// The String and RequestURI methods use EscapedPath to construct
// their results.
// In general, code should call EscapedPath instead of
// reading u.RawPath directly.
func (u *URL) EscapedPath() []rune {
if len(u.RawPath) != 0 && validEncodedPath(u.RawPath) {
p, err := unescape(u.RawPath, encodePath)
if err == nil && runes.Equals(p, u.Path) {
return u.RawPath
}
}
if runes.Equals(u.Path, []rune("*")) {
return []rune("*") // don't escape (Issue 11202)
}
return escape(u.Path, encodePath)
}
// validEncodedPath reports whether s is a valid encoded path.
// It must not contain any bytes that require escaping during path encoding.
func validEncodedPath(s []rune) bool {
for i := 0; i < len(s); i++ {
// RFC 3986, Appendix A.
// pchar = unreserved / pct-encoded / sub-delims / ":" / "@".
// shouldEscape is not quite compliant with the RFC,
// so we check the sub-delims ourselves and let
// shouldEscape handle the others.
switch s[i] {
case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '@':
// ok
case '[', ']':
// ok - not specified in RFC 3986 but left alone by modern browsers
case '%':
// ok - percent encoded, will decode
default:
if shouldEscape(s[i], encodePath) {
return false
}
}
}
return true
}
// validOptionalPort reports whether port is either an empty string
// or matches /^:\d*$/
func validOptionalPort(port string) bool {
if port == "" {
func validOptionalPort(port []rune) bool {
if len(port) == 0 {
return true
}
if port[0] != ':' {
@@ -562,6 +654,46 @@ func validOptionalPort(port string) bool {
return true
}
func (u *URL) Runes() (buf []rune) {
if len(u.Scheme) != 0 {
buf = append(buf, u.Scheme...)
buf = append(buf, ':')
}
if len(u.Opaque) != 0 {
buf = append(buf, u.Opaque...)
} else {
if len(u.Scheme) != 0 || len(u.Host) != 0 {
if len(u.Host) != 0 || len(u.Path) != 0 {
buf = append(buf, '/', '/')
}
if h := u.Host; len(h) != 0 {
buf = append(buf, escape(h, encodeHost)...)
}
}
path := u.EscapedPath()
if len(path) != 0 && path[0] != '/' && len(u.Host) != 0 {
buf = append(buf, '/')
}
if len(buf) == 0 {
// RFC 3986 §4.2
// A path segment that contains a colon character (e.g., "this:that")
// cannot be used as the first segment of a relative-path reference, as
// it would be mistaken for a scheme name. Such a segment must be
// preceded by a dot-segment (e.g., "./this:that") to make a relative-
// path reference.
if i := runes.IndexRune(path, ':'); i > -1 && runes.IndexRune(path[:i], '/') == -1 {
buf = append(buf, '.', '/')
}
}
buf = append(buf, path...)
}
if u.ForceQuery || len(u.RawQuery) != 0 {
buf = append(buf, '?')
buf = append(buf, u.RawQuery...)
}
return
}
// String reassembles the URL into a valid URL string.
// The general form of the result is one of:
//
@@ -583,126 +715,28 @@ func validOptionalPort(port string) bool {
// - if u.RawQuery is empty, ?query is omitted.
// - if u.Fragment is empty, #fragment is omitted.
func (u *URL) String() string {
var buf strings.Builder
if u.Scheme != SchemeInvalid {
buf.WriteString(Schemes[u.Scheme])
buf.WriteByte(':')
}
if u.Scheme != SchemeInvalid || u.Host != "" {
if u.Host != "" || u.Path != "" {
buf.WriteString("//")
}
if h := u.Host; h != "" {
buf.WriteString(escape(h, encodeHost))
}
}
path := u.Path
if path != "" && path[0] != '/' && u.Host != "" {
buf.WriteByte('/')
}
if buf.Len() == 0 {
// RFC 3986 §4.2
// A path segment that contains a colon character (e.g., "this:that")
// cannot be used as the first segment of a relative-path reference, as
// it would be mistaken for a scheme name. Such a segment must be
// preceded by a dot-segment (e.g., "./this:that") to make a relative-
// path reference.
if i := strings.IndexByte(path, ':'); i > -1 && strings.IndexByte(path[:i], '/') == -1 {
buf.WriteString("./")
}
}
buf.WriteString(path)
return buf.String()
}
func isRunesDot(r []rune) bool {
return len(r) == 1 && r[0] == '.'
}
func isRunesDoubleDot(r []rune) bool {
return len(r) == 2 && r[0] == '.' && r[1] == '.'
return string(u.Runes())
}
// resolvePath applies special path segments from refs and applies
// them to base, per RFC 3986.
func resolvePath(base, ref string) string {
var full string
if ref == "" {
func resolvePath(base, ref []rune) []rune {
var full []rune
if len(ref) == 0 {
full = base
} else if ref[0] != '/' {
i := strings.LastIndex(base, "/")
full = base[:i+1] + ref
// TODO Optimize
i := strings.LastIndex(string(base), "/")
full = runes.Create(base[:i+1], ref)
} else {
full = ref
}
if full == "" {
return ""
} else if full == "/" {
return "/"
if len(full) == 0 {
return nil
}
dst := make([]rune, len(full))
dst = dst[0:0]
start := 0
rs := []rune(full)
if len(rs) != 0 && rs[0] == '/' {
rs = rs[1:]
}
var stack []int
stack = append(stack, 0)
for i, c := range rs {
if i == len(rs) - 1 {
closingSlash := false
part := rs[start:]
if len(part) == 0 {
dst = append(dst, '/')
} else if part[len(part)-1] == '/' {
part = part[:len(part)-1]
closingSlash = true
}
switch {
case isRunesDot(part):
dst = append(dst, '/')
case isRunesDoubleDot(part):
// Cut to the last slash
start = i+1
dst = dst[:stack[len(stack)-1]]
if len(stack) != 1 {
stack = stack[:len(stack)-1]
}
dst = append(dst, '/')
default:
dst = append(dst, '/')
dst = append(dst, part...)
}
if closingSlash && len(dst) != 0 && dst[len(dst)-1] != '/' {
dst = append(dst, '/')
}
} else if c == '/' {
part := rs[start:i]
switch {
case isRunesDot(part):
start = i+1
case isRunesDoubleDot(part):
// Cut to the last slash
start = i+1
dst = dst[:stack[len(stack)-1]]
if len(stack) != 1 {
stack = stack[:len(stack)-1]
}
default:
start = i+1
stack = append(stack, len(dst))
dst = append(dst, '/')
dst = append(dst, part...)
}
}
}
return string(dst)
/*var dst []string
src := strings.Split(full, "/")
var dst []string
// TODO Optimize
src := strings.Split(string(full), "/")
for _, elem := range src {
switch elem {
case ".":
@@ -717,21 +751,22 @@ func resolvePath(base, ref string) string {
}
if last := src[len(src)-1]; last == "." || last == ".." {
// Add final slash to the joined path.
dst = append(dst, "")
dst = append(dst, "") // TODO Wtf?
}
return "/" + strings.TrimPrefix(strings.Join(dst, "/"), "/")*/
// TODO Optimize
return []rune("/" + strings.TrimPrefix(strings.Join(dst, "/"), "/"))
}
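
resolvePath implements the dot-segment removal of RFC 3986 §5.2.4, so its output should agree with the standard library's resolver. A quick cross-check sketch against net/url (URLs are illustrative):

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	base, _ := url.Parse("http://example.com/pub/sub/index.html")
	for _, ref := range []string{"file.bin", "../other/", "/abs/"} {
		rel, _ := url.Parse(ref)
		// Same dot-segment rules resolvePath applies to base and ref:
		fmt.Println(base.ResolveReference(rel))
	}
	// Output:
	// http://example.com/pub/sub/file.bin
	// http://example.com/pub/other/
	// http://example.com/abs/
}
```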
// IsAbs reports whether the URL is absolute.
// Absolute means that it has a non-empty scheme.
func (u *URL) IsAbs() bool {
return u.Scheme != SchemeInvalid
return len(u.Scheme) != 0
}
// ParseRel parses a URL in the context of the receiver. The provided URL
// may be relative or absolute. Parse returns nil, err on parse
// failure, otherwise its return value is the same as ResolveReference.
func (u *URL) ParseRel(out *URL, ref string) error {
func (u *URL) ParseRel(out *URL, ref []rune) error {
var refurl URL
err := refurl.Parse(ref)
@@ -751,22 +786,92 @@ func (u *URL) ParseRel(out *URL, ref string) error {
// ignores base and returns a copy of ref.
func (u *URL) ResolveReference(url *URL, ref *URL) {
*url = *ref
if ref.Scheme == SchemeInvalid {
if len(ref.Scheme) == 0 {
url.Scheme = u.Scheme
}
if ref.Scheme != SchemeInvalid || ref.Host != "" {
if len(ref.Scheme) != 0 || len(ref.Host) != 0 {
// The "absoluteURI" or "net_path" cases.
// We can ignore the error from setPath since we know we provided a
// validly-escaped path.
url.Path = resolvePath(ref.Path, "")
url.setPath(resolvePath(ref.EscapedPath(), nil))
return
}
if len(ref.Opaque) != 0 {
url.Host = nil
url.Path = nil
return
}
if len(ref.Path) == 0 && len(ref.RawQuery) == 0 {
url.RawQuery = u.RawQuery
}
// The "abs_path" or "rel_path" cases.
url.Host = u.Host
url.Path = resolvePath(u.Path, ref.Path)
url.setPath(resolvePath(u.EscapedPath(), ref.EscapedPath()))
return
}
// RequestURI returns the encoded path?query or opaque?query
// string that would be used in an HTTP request for u.
func (u *URL) RequestURI() []rune {
result := u.Opaque
if len(result) == 0 {
result = u.EscapedPath()
if len(result) == 0 {
result = []rune("/")
}
} else {
if runes.HasPrefix(result, []rune("//")) {
result = runes.Create(u.Scheme, []rune(":"), result)
}
}
if u.ForceQuery || len(u.RawQuery) != 0 {
result = append(result, '?')
result = append(result, u.RawQuery...)
}
return result
}
// Hostname returns u.Host, without any port number.
//
// If Host is an IPv6 literal with a port number, Hostname returns the
// IPv6 literal without the square brackets. IPv6 literals may include
// a zone identifier.
func (u *URL) Hostname() []rune {
return stripPort(u.Host)
}
// Port returns the port part of u.Host, without the leading colon.
// If u.Host doesn't contain a port, Port returns an empty string.
func (u *URL) Port() []rune {
return portOnly(u.Host)
}
func stripPort(hostport []rune) []rune {
colon := runes.IndexRune(hostport, ':')
if colon == -1 {
return hostport
}
if i := runes.IndexRune(hostport, ']'); i != -1 {
return runes.TrimPrefix(hostport[:i], []rune("["))
}
return hostport[:colon]
}
func portOnly(hostport []rune) []rune {
colon := runes.IndexRune(hostport, ':')
if colon == -1 {
return nil
}
// TODO Optimize
if i := strings.Index(string(hostport), "]:"); i != -1 {
return hostport[i+len("]:"):]
}
if strings.Contains(string(hostport), "]") {
return nil
}
return hostport[colon+len(":"):]
}
// Marshaling interface implementations.
// Would like to implement MarshalText/UnmarshalText but that will change the JSON representation of URLs.
@@ -776,7 +881,7 @@ func (u *URL) MarshalBinary() (text []byte, err error) {
func (u *URL) UnmarshalBinary(text []byte) error {
var u1 URL
err := u1.Parse(string(text))
err := u1.Parse([]rune(string(text)))
if err != nil {
return err
}
@@ -792,7 +897,7 @@ func (u *URL) UnmarshalBinary(text []byte) error {
// / "*" / "+" / "," / ";" / "="
//
// It doesn't validate pct-encoded. The caller does that via func unescape.
func validUserinfo(s string) bool {
func validUserinfo(s []rune) bool {
for _, r := range s {
if 'A' <= r && r <= 'Z' {
continue
@@ -813,57 +918,3 @@ func validUserinfo(s string) bool {
}
return true
}
func PathUnescape(s string) string {
newStr, err := pathUnescape(s)
if err != nil {
return s
} else {
return newStr
}
}
func pathUnescape(s string) (string, error) {
// Count %, check that they're well-formed.
n := 0
for i := 0; i < len(s); {
switch s[i] {
case '%':
n++
if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
s = s[i:]
if len(s) > 3 {
s = s[:3]
}
return "", EscapeError(s)
}
i += 3
default:
i++
}
}
if n == 0 {
return s, nil
}
t := make([]byte, len(s)-2*n)
j := 0
for i := 0; i < len(s); {
switch s[i] {
case '%':
t[j] = unhex(s[i+1])<<4 | unhex(s[i+2])
j++
i += 3
case '+':
t[j] = '+'
j++
i++
default:
t[j] = s[i]
j++
i++
}
}
return string(t), nil
}


@@ -1,897 +0,0 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package fasturl
import (
"bytes"
encodingPkg "encoding"
"encoding/gob"
"encoding/json"
"fmt"
"io"
"net"
"reflect"
"testing"
)
type URLTest struct {
in string
out *URL // expected parse; RawPath="" means same as Path
roundtrip string // expected result of reserializing the URL; empty means same as "in".
}
var urltests = []URLTest{
// no path
{
"http://www.google.com",
&URL{
Scheme: SchemeHTTP,
Host: "www.google.com",
},
"",
},
// path
{
"http://www.google.com/",
&URL{
Scheme: SchemeHTTP,
Host: "www.google.com",
Path: "/",
},
"",
},
// %20 outside query
{
"http://www.google.com/a%20b",
&URL{
Scheme: SchemeHTTP,
Host: "www.google.com",
Path: "/a%20b",
},
"",
},
// leading // without scheme should create an authority
{
"//foo",
&URL{
Host: "foo",
},
"",
},
// Three leading slashes isn't an authority, but doesn't return an error.
// (We can't return an error, as this code is also used via
// ServeHTTP -> ReadRequest -> Parse, which is arguably a
// different URL parsing context, but currently shares the
// same codepath)
{
"///threeslashes",
&URL{
Path: "///threeslashes",
},
"",
},
// unescaped @ in username should not confuse host
{
"http://j@ne:password@google.com",
&URL{
Scheme: SchemeHTTP,
Host: "google.com",
},
"http://google.com",
},
// unescaped @ in password should not confuse host
{
"http://jane:p@ssword@google.com",
&URL{
Scheme: SchemeHTTP,
Host: "google.com",
},
"http://google.com",
},
// Relative path
{
"a/b/c",
&URL{
Path: "a/b/c",
},
"a/b/c",
},
// host subcomponent; IPv4 address in RFC 3986
{
"http://192.168.0.1/",
&URL{
Scheme: SchemeHTTP,
Host: "192.168.0.1",
Path: "/",
},
"",
},
// host and port subcomponents; IPv4 address in RFC 3986
{
"http://192.168.0.1:8080/",
&URL{
Scheme: SchemeHTTP,
Host: "192.168.0.1:8080",
Path: "/",
},
"",
},
// host subcomponent; IPv6 address in RFC 3986
{
"http://[fe80::1]/",
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1]",
Path: "/",
},
"",
},
// host and port subcomponents; IPv6 address in RFC 3986
{
"http://[fe80::1]:8080/",
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1]:8080",
Path: "/",
},
"",
},
// host subcomponent; IPv6 address with zone identifier in RFC 6874
{
"http://[fe80::1%25en0]/", // alphanum zone identifier
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1%en0]",
Path: "/",
},
"",
},
// host and port subcomponents; IPv6 address with zone identifier in RFC 6874
{
"http://[fe80::1%25en0]:8080/", // alphanum zone identifier
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1%en0]:8080",
Path: "/",
},
"",
},
// host subcomponent; IPv6 address with zone identifier in RFC 6874
{
"http://[fe80::1%25%65%6e%301-._~]/", // percent-encoded+unreserved zone identifier
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1%en01-._~]",
Path: "/",
},
"http://[fe80::1%25en01-._~]/",
},
// host and port subcomponents; IPv6 address with zone identifier in RFC 6874
{
"http://[fe80::1%25%65%6e%301-._~]:8080/", // percent-encoded+unreserved zone identifier
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1%en01-._~]:8080",
Path: "/",
},
"http://[fe80::1%25en01-._~]:8080/",
},
// golang.org/issue/12200 (colon with empty port)
{
"http://192.168.0.2:8080/foo",
&URL{
Scheme: SchemeHTTP,
Host: "192.168.0.2:8080",
Path: "/foo",
},
"",
},
{
"http://192.168.0.2:/foo",
&URL{
Scheme: SchemeHTTP,
Host: "192.168.0.2:",
Path: "/foo",
},
"",
},
{
// Malformed IPv6 but still accepted.
"http://2b01:e34:ef40:7730:8e70:5aff:fefe:edac:8080/foo",
&URL{
Scheme: SchemeHTTP,
Host: "2b01:e34:ef40:7730:8e70:5aff:fefe:edac:8080",
Path: "/foo",
},
"",
},
{
// Malformed IPv6 but still accepted.
"http://2b01:e34:ef40:7730:8e70:5aff:fefe:edac:/foo",
&URL{
Scheme: SchemeHTTP,
Host: "2b01:e34:ef40:7730:8e70:5aff:fefe:edac:",
Path: "/foo",
},
"",
},
{
"http://[2b01:e34:ef40:7730:8e70:5aff:fefe:edac]:8080/foo",
&URL{
Scheme: SchemeHTTP,
Host: "[2b01:e34:ef40:7730:8e70:5aff:fefe:edac]:8080",
Path: "/foo",
},
"",
},
{
"http://[2b01:e34:ef40:7730:8e70:5aff:fefe:edac]:/foo",
&URL{
Scheme: SchemeHTTP,
Host: "[2b01:e34:ef40:7730:8e70:5aff:fefe:edac]:",
Path: "/foo",
},
"",
},
// golang.org/issue/7991 and golang.org/issue/12719 (non-ascii %-encoded in host)
{
"http://hello.世界.com/foo",
&URL{
Scheme: SchemeHTTP,
Host: "hello.世界.com",
Path: "/foo",
},
"http://hello.%E4%B8%96%E7%95%8C.com/foo",
},
{
"http://hello.%e4%b8%96%e7%95%8c.com/foo",
&URL{
Scheme: SchemeHTTP,
Host: "hello.世界.com",
Path: "/foo",
},
"http://hello.%E4%B8%96%E7%95%8C.com/foo",
},
{
"http://hello.%E4%B8%96%E7%95%8C.com/foo",
&URL{
Scheme: SchemeHTTP,
Host: "hello.世界.com",
Path: "/foo",
},
"",
},
// golang.org/issue/10433 (path beginning with //)
{
"http://example.com//foo",
&URL{
Scheme: SchemeHTTP,
Host: "example.com",
Path: "//foo",
},
"",
},
// test that we can reparse the host names we accept.
{
"http://authority<\"hi\">/foo",
&URL{
Scheme: SchemeHTTP,
Host: "authority<\"hi\">",
Path: "/foo",
},
"",
},
}
// more useful string for debugging than fmt's struct printer
func ufmt(u *URL) string {
return fmt.Sprintf("scheme=%q, host=%q, path=%q",
Schemes[u.Scheme], u.Host, u.Path)
}
func BenchmarkString(b *testing.B) {
b.StopTimer()
b.ReportAllocs()
for _, tt := range urltests {
var u URL
err := u.Parse(tt.in)
if err != nil {
b.Errorf("Parse(%q) returned error %s", tt.in, err)
continue
}
if tt.roundtrip == "" {
continue
}
b.StartTimer()
var g string
for i := 0; i < b.N; i++ {
g = u.String()
}
b.StopTimer()
if w := tt.roundtrip; b.N > 0 && g != w {
b.Errorf("Parse(%q).String() == %q, want %q", tt.in, g, w)
}
}
}
func TestParse(t *testing.T) {
for _, tt := range urltests {
var u URL
err := u.Parse(tt.in)
if err != nil {
t.Errorf("Parse(%q) returned error %v", tt.in, err)
continue
}
if !reflect.DeepEqual(&u, tt.out) {
t.Errorf("Parse(%q):\n\tgot %v\n\twant %v\n", tt.in, ufmt(&u), ufmt(tt.out))
}
}
}
const pathThatLooksSchemeRelative = "//not.a.user@not.a.host/just/a/path"
var parseRequestURLTests = []struct {
url string
expectedValid bool
}{
{"http://foo.com", true},
{"http://foo.com/", true},
{"http://foo.com/path", true},
{"/", true},
{pathThatLooksSchemeRelative, true},
{"//not.a.user@%66%6f%6f.com/just/a/path/also", true},
{"*", true},
{"http://192.168.0.1/", true},
{"http://192.168.0.1:8080/", true},
{"http://[fe80::1]/", true},
{"http://[fe80::1]:8080/", true},
// Tests exercising RFC 6874 compliance:
{"http://[fe80::1%25en0]/", true}, // with alphanum zone identifier
{"http://[fe80::1%25en0]:8080/", true}, // with alphanum zone identifier
{"http://[fe80::1%25%65%6e%301-._~]/", true}, // with percent-encoded+unreserved zone identifier
{"http://[fe80::1%25%65%6e%301-._~]:8080/", true}, // with percent-encoded+unreserved zone identifier
{"foo.html", false},
{"../dir/", false},
{"http://192.168.0.%31/", false},
{"http://192.168.0.%31:8080/", false},
{"http://[fe80::%31]/", false},
{"http://[fe80::%31]:8080/", false},
{"http://[fe80::%31%25en0]/", false},
{"http://[fe80::%31%25en0]:8080/", false},
// These two cases are valid as textual representations as
// described in RFC 4007, but are not valid as address
// literals with IPv6 zone identifiers in URIs as described in
// RFC 6874.
{"http://[fe80::1%en0]/", false},
{"http://[fe80::1%en0]:8080/", false},
}
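// Hedged example (not in the original file): RFC 6874 requires the zone delimiter "%" to
// be percent-encoded as "%25" inside a URI, which is why the "%25en0" inputs above parse
// while the raw "%en0" forms are rejected. Assumes only the Parse/Host API used here.
func ExampleURL_Parse_zoneIdentifier() {
var u URL
if err := u.Parse("http://[fe80::1%25en0]/"); err != nil {
panic(err)
}
fmt.Println(u.Host)
// Output: [fe80::1%en0]
}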
func TestParseRequestURI(t *testing.T) {
for _, test := range parseRequestURLTests {
var u URL
err := u.ParseRequestURI(test.url)
if test.expectedValid && err != nil {
t.Errorf("ParseRequestURI(%q) gave err %v; want no error", test.url, err)
} else if !test.expectedValid && err == nil {
t.Errorf("ParseRequestURI(%q) gave nil error; want some error", test.url)
}
}
var url URL
err := url.ParseRequestURI(pathThatLooksSchemeRelative)
if err != nil {
t.Fatalf("Unexpected error %v", err)
}
if url.Path != pathThatLooksSchemeRelative {
t.Errorf("ParseRequestURI path:\ngot %q\nwant %q", url.Path, pathThatLooksSchemeRelative)
}
}
var stringURLTests = []struct {
url URL
want string
}{
// No leading slash on path should prepend slash on String() call
{
url: URL{
Scheme: SchemeHTTP,
Host: "www.google.com",
Path: "search",
},
want: "http://www.google.com/search",
},
// Relative path with first element containing ":" should be prepended with "./", golang.org/issue/17184
{
url: URL{
Path: "this:that",
},
want: "./this:that",
},
// Relative path with second element containing ":" should not be prepended with "./"
{
url: URL{
Path: "here/this:that",
},
want: "here/this:that",
},
// Non-relative path with first element containing ":" should not be prepended with "./"
{
url: URL{
Scheme: SchemeHTTP,
Host: "www.google.com",
Path: "this:that",
},
want: "http://www.google.com/this:that",
},
}
func TestURLString(t *testing.T) {
for _, tt := range urltests {
var u URL
err := u.Parse(tt.in)
if err != nil {
t.Errorf("Parse(%q) returned error %s", tt.in, err)
continue
}
expected := tt.in
if tt.roundtrip != "" {
expected = tt.roundtrip
}
s := u.String()
if s != expected {
t.Errorf("Parse(%q).String() == %q (expected %q)", tt.in, s, expected)
}
}
for _, tt := range stringURLTests {
if got := tt.url.String(); got != tt.want {
t.Errorf("%+v.String() = %q; want %q", tt.url, got, tt.want)
}
}
}
var resolvePathTests = []struct {
base, ref, expected string
}{
{"a/b", ".", "/a/"},
{"a/b", "c", "/a/c"},
{"a/b", "..", "/"},
{"a/", "..", "/"},
{"a/", "../..", "/"},
{"a/b/c", "..", "/a/"},
{"a/b/c", "../d", "/a/d"},
{"a/b/c", ".././d", "/a/d"},
{"a/b", "./..", "/"},
{"a/./b", ".", "/a/"},
{"a/../", ".", "/"},
{"a/.././b", "c", "/c"},
}
func TestResolvePath(t *testing.T) {
for _, test := range resolvePathTests {
got := resolvePath(test.base, test.ref)
if got != test.expected {
t.Errorf("For %q + %q got %q; expected %q", test.base, test.ref, got, test.expected)
}
}
}
var resolveReferenceTests = []struct {
base, rel, expected string
}{
// Absolute URL references
{"http://foo.com?a=b", "https://bar.com/", "https://bar.com/"},
{"http://foo.com/", "https://bar.com/?a=b", "https://bar.com/"},
{"http://foo.com/", "https://bar.com/?", "https://bar.com/"},
// Path-absolute references
{"http://foo.com/bar", "/baz", "http://foo.com/baz"},
{"http://foo.com/bar?a=b#f", "/baz", "http://foo.com/baz"},
{"http://foo.com/bar?a=b", "/baz?", "http://foo.com/baz"},
{"http://foo.com/bar?a=b", "/baz?c=d", "http://foo.com/baz"},
// Multiple slashes
{"http://foo.com/bar", "http://foo.com//baz", "http://foo.com//baz"},
{"http://foo.com/bar", "http://foo.com///baz/quux", "http://foo.com///baz/quux"},
// Scheme-relative
{"https://foo.com/bar?a=b", "//bar.com/quux", "https://bar.com/quux"},
// Path-relative references:
// ... current directory
{"http://foo.com", ".", "http://foo.com/"},
{"http://foo.com/bar", ".", "http://foo.com/"},
{"http://foo.com/bar/", ".", "http://foo.com/bar/"},
// ... going down
{"http://foo.com", "bar", "http://foo.com/bar"},
{"http://foo.com/", "bar", "http://foo.com/bar"},
{"http://foo.com/bar/baz", "quux", "http://foo.com/bar/quux"},
// ... going up
{"http://foo.com/bar/baz", "../quux", "http://foo.com/quux"},
{"http://foo.com/bar/baz", "../../../../../quux", "http://foo.com/quux"},
{"http://foo.com/bar", "..", "http://foo.com/"},
{"http://foo.com/bar/baz", "./..", "http://foo.com/"},
// ".." in the middle (issue 3560)
{"http://foo.com/bar/baz", "quux/dotdot/../tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/../tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/.././tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/./../tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/dotdot/././../../tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/dotdot/./.././../tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/dotdot/dotdot/./../../.././././tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/../dotdot/../dot/./tail/..", "http://foo.com/bar/quux/dot/"},
// Remove any dot-segments prior to forming the target URI.
// http://tools.ietf.org/html/rfc3986#section-5.2.4
{"http://foo.com/dot/./dotdot/../foo/bar", "../baz", "http://foo.com/dot/baz"},
// Triple dot isn't special
{"http://foo.com/bar", "...", "http://foo.com/..."},
// Fragment
{"http://foo.com/bar", ".#frag", "http://foo.com/"},
{"http://example.org/", "#!$&%27()*+,;=", "http://example.org/"},
// Paths with escaping (issue 16947).
{"http://foo.com/foo%2fbar/", "../baz", "http://foo.com/baz"},
{"http://foo.com/1/2%2f/3%2f4/5", "../../a/b/c", "http://foo.com/1/a/b/c"},
{"http://foo.com/1/2/3", "./a%2f../../b/..%2fc", "http://foo.com/1/2/b/..%2fc"},
{"http://foo.com/1/2%2f/3%2f4/5", "./a%2f../b/../c", "http://foo.com/1/2%2f/3%2f4/a%2f../c"},
{"http://foo.com/foo%20bar/", "../baz", "http://foo.com/baz"},
{"http://foo.com/foo", "../bar%2fbaz", "http://foo.com/bar%2fbaz"},
{"http://foo.com/foo%2dbar/", "./baz-quux", "http://foo.com/foo%2dbar/baz-quux"},
// RFC 3986: Normal Examples
// http://tools.ietf.org/html/rfc3986#section-5.4.1
{"http://a/b/c/d;p?q", "g", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "./g", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "g/", "http://a/b/c/g/"},
{"http://a/b/c/d;p?q", "/g", "http://a/g"},
{"http://a/b/c/d;p?q", "//g", "http://g"},
{"http://a/b/c/d;p?q", "?y", "http://a/b/c/d;p"},
{"http://a/b/c/d;p?q", "g?y", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "#s", "http://a/b/c/d;p"},
{"http://a/b/c/d;p?q", "g#s", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "g?y#s", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", ";x", "http://a/b/c/;x"},
{"http://a/b/c/d;p?q", "g;x", "http://a/b/c/g;x"},
{"http://a/b/c/d;p?q", "g;x?y#s", "http://a/b/c/g;x"},
{"http://a/b/c/d;p?q", "", "http://a/b/c/d;p"},
{"http://a/b/c/d;p?q", ".", "http://a/b/c/"},
{"http://a/b/c/d;p?q", "./", "http://a/b/c/"},
{"http://a/b/c/d;p?q", "..", "http://a/b/"},
{"http://a/b/c/d;p?q", "../", "http://a/b/"},
{"http://a/b/c/d;p?q", "../g", "http://a/b/g"},
{"http://a/b/c/d;p?q", "../..", "http://a/"},
{"http://a/b/c/d;p?q", "../../", "http://a/"},
{"http://a/b/c/d;p?q", "../../g", "http://a/g"},
// RFC 3986: Abnormal Examples
// http://tools.ietf.org/html/rfc3986#section-5.4.2
{"http://a/b/c/d;p?q", "../../../g", "http://a/g"},
{"http://a/b/c/d;p?q", "../../../../g", "http://a/g"},
{"http://a/b/c/d;p?q", "/./g", "http://a/g"},
{"http://a/b/c/d;p?q", "/../g", "http://a/g"},
{"http://a/b/c/d;p?q", "g.", "http://a/b/c/g."},
{"http://a/b/c/d;p?q", ".g", "http://a/b/c/.g"},
{"http://a/b/c/d;p?q", "g..", "http://a/b/c/g.."},
{"http://a/b/c/d;p?q", "..g", "http://a/b/c/..g"},
{"http://a/b/c/d;p?q", "./../g", "http://a/b/g"},
{"http://a/b/c/d;p?q", "./g/.", "http://a/b/c/g/"},
{"http://a/b/c/d;p?q", "g/./h", "http://a/b/c/g/h"},
{"http://a/b/c/d;p?q", "g/../h", "http://a/b/c/h"},
{"http://a/b/c/d;p?q", "g;x=1/./y", "http://a/b/c/g;x=1/y"},
{"http://a/b/c/d;p?q", "g;x=1/../y", "http://a/b/c/y"},
{"http://a/b/c/d;p?q", "g?y/./x", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "g?y/../x", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "g#s/./x", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "g#s/../x", "http://a/b/c/g"},
// Extras.
{"https://a/b/c/d;p?q", "//g?q", "https://g"},
{"https://a/b/c/d;p?q", "//g#s", "https://g"},
{"https://a/b/c/d;p?q", "//g/d/e/f?y#s", "https://g/d/e/f"},
{"https://a/b/c/d;p#s", "?y", "https://a/b/c/d;p"},
{"https://a/b/c/d;p?q#s", "?y", "https://a/b/c/d;p"},
}
func TestResolveReference(t *testing.T) {
mustParse := func(url string) *URL {
u := new(URL)
err := u.Parse(url)
if err != nil {
t.Fatalf("Parse(%q) got err %v", url, err)
}
return u
}
for _, test := range resolveReferenceTests {
base := mustParse(test.base)
rel := mustParse(test.rel)
var url URL
base.ResolveReference(&url, rel)
if got := url.String(); got != test.expected {
t.Errorf("URL(%q).ResolveReference(%q)\ngot %q\nwant %q", test.base, test.rel, got, test.expected)
}
}
}
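// Hedged example (not in the original file): resolving a relative link against a base URL,
// the operation a directory crawler performs for every href it finds. Assumes only the
// ResolveReference(out, ref *URL) signature used by TestResolveReference above.
func ExampleURL_ResolveReference() {
var base, rel, abs URL
if err := base.Parse("http://foo.com/bar/"); err != nil {
panic(err)
}
if err := rel.Parse("../baz"); err != nil {
panic(err)
}
base.ResolveReference(&abs, &rel)
fmt.Println(abs.String())
// Output: http://foo.com/baz
}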
type RequestURITest struct {
url *URL
out string
}
var requritests = []RequestURITest{
{
&URL{
Scheme: SchemeHTTP,
Host: "example.com",
Path: "",
},
"/",
},
{
&URL{
Scheme: SchemeHTTP,
Host: "example.com",
Path: "/a b",
},
"/a%20b",
},
{
&URL{
Scheme: SchemeHTTP,
Host: "example.com",
Path: "//foo",
},
"//foo",
},
}
func TestParseErrors(t *testing.T) {
tests := []struct {
in string
wantErr bool
}{
{"http://[::1]", false},
{"http://[::1]:80", false},
{"http://[::1]:namedport", true}, // rfc3986 3.2.3
{"http://[::1]/", false},
{"http://[::1]a", true},
{"http://[::1]%23", true},
{"http://[::1%25en0]", false}, // valid zone id
{"http://[::1]:", false}, // colon, but no port OK
{"http://[::1]:%38%30", true}, // not allowed: % encoding only for non-ASCII
{"http://[::1%25%41]", false}, // RFC 6874 allows over-escaping in zone
{"http://[%10::1]", true}, // no %xx escapes in IP address
{"http://[::1]/%48", false}, // %xx in path is fine
{"http://%41:8080/", true}, // not allowed: % encoding only for non-ASCII
{"http://[]%20%48%54%54%50%2f%31%2e%31%0a%4d%79%48%65%61%64%65%72%3a%20%31%32%33%0a%0a/", true}, // golang.org/issue/11208
{"http://a b.com/", true}, // no space in host name please
}
for _, tt := range tests {
var u URL
err := u.Parse(tt.in)
if tt.wantErr {
if err == nil {
t.Errorf("Parse(%q) = %#v; want an error", tt.in, u)
}
continue
}
if err != nil {
t.Errorf("Parse(%q) = %v; want no error", tt.in, err)
}
}
}
type shouldEscapeTest struct {
in byte
mode encoding
escape bool
}
var shouldEscapeTests = []shouldEscapeTest{
// Unreserved characters (§2.3)
{'a', encodePath, false},
{'a', encodeUserPassword, false},
{'a', encodeQueryComponent, false},
{'a', encodeFragment, false},
{'a', encodeHost, false},
{'z', encodePath, false},
{'A', encodePath, false},
{'Z', encodePath, false},
{'0', encodePath, false},
{'9', encodePath, false},
{'-', encodePath, false},
{'-', encodeUserPassword, false},
{'-', encodeQueryComponent, false},
{'-', encodeFragment, false},
{'.', encodePath, false},
{'_', encodePath, false},
{'~', encodePath, false},
// User information (§3.2.1)
{':', encodeUserPassword, true},
{'/', encodeUserPassword, true},
{'?', encodeUserPassword, true},
{'@', encodeUserPassword, true},
{'$', encodeUserPassword, false},
{'&', encodeUserPassword, false},
{'+', encodeUserPassword, false},
{',', encodeUserPassword, false},
{';', encodeUserPassword, false},
{'=', encodeUserPassword, false},
// Host (IP address, IPv6 address, registered name, port suffix; §3.2.2)
{'!', encodeHost, false},
{'$', encodeHost, false},
{'&', encodeHost, false},
{'\'', encodeHost, false},
{'(', encodeHost, false},
{')', encodeHost, false},
{'*', encodeHost, false},
{'+', encodeHost, false},
{',', encodeHost, false},
{';', encodeHost, false},
{'=', encodeHost, false},
{':', encodeHost, false},
{'[', encodeHost, false},
{']', encodeHost, false},
{'0', encodeHost, false},
{'9', encodeHost, false},
{'A', encodeHost, false},
{'z', encodeHost, false},
{'_', encodeHost, false},
{'-', encodeHost, false},
{'.', encodeHost, false},
}
func TestShouldEscape(t *testing.T) {
for _, tt := range shouldEscapeTests {
if shouldEscape(tt.in, tt.mode) != tt.escape {
t.Errorf("shouldEscape(%q, %v) returned %v; expected %v", tt.in, tt.mode, !tt.escape, tt.escape)
}
}
}
type timeoutError struct {
timeout bool
}
func (e *timeoutError) Error() string { return "timeout error" }
func (e *timeoutError) Timeout() bool { return e.timeout }
type temporaryError struct {
temporary bool
}
func (e *temporaryError) Error() string { return "temporary error" }
func (e *temporaryError) Temporary() bool { return e.temporary }
type timeoutTemporaryError struct {
timeoutError
temporaryError
}
func (e *timeoutTemporaryError) Error() string { return "timeout/temporary error" }
var netErrorTests = []struct {
err error
timeout bool
temporary bool
}{{
err: &Error{"Get", "http://google.com/", &timeoutError{timeout: true}},
timeout: true,
temporary: false,
}, {
err: &Error{"Get", "http://google.com/", &timeoutError{timeout: false}},
timeout: false,
temporary: false,
}, {
err: &Error{"Get", "http://google.com/", &temporaryError{temporary: true}},
timeout: false,
temporary: true,
}, {
err: &Error{"Get", "http://google.com/", &temporaryError{temporary: false}},
timeout: false,
temporary: false,
}, {
err: &Error{"Get", "http://google.com/", &timeoutTemporaryError{timeoutError{timeout: true}, temporaryError{temporary: true}}},
timeout: true,
temporary: true,
}, {
err: &Error{"Get", "http://google.com/", &timeoutTemporaryError{timeoutError{timeout: false}, temporaryError{temporary: true}}},
timeout: false,
temporary: true,
}, {
err: &Error{"Get", "http://google.com/", &timeoutTemporaryError{timeoutError{timeout: true}, temporaryError{temporary: false}}},
timeout: true,
temporary: false,
}, {
err: &Error{"Get", "http://google.com/", &timeoutTemporaryError{timeoutError{timeout: false}, temporaryError{temporary: false}}},
timeout: false,
temporary: false,
}, {
err: &Error{"Get", "http://google.com/", io.EOF},
timeout: false,
temporary: false,
}}
// Test that url.Error implements net.Error and that it forwards Timeout and Temporary calls to the wrapped error.
func TestURLErrorImplementsNetError(t *testing.T) {
for i, tt := range netErrorTests {
err, ok := tt.err.(net.Error)
if !ok {
t.Errorf("%d: %T does not implement net.Error", i+1, tt.err)
continue
}
if err.Timeout() != tt.timeout {
t.Errorf("%d: err.Timeout(): got %v, want %v", i+1, err.Timeout(), tt.timeout)
continue
}
if err.Temporary() != tt.temporary {
t.Errorf("%d: err.Temporary(): got %v, want %v", i+1, err.Temporary(), tt.temporary)
}
}
}
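// Hedged sketch (not in the original file): because *Error forwards Timeout and Temporary,
// callers such as a crawler's retry policy can branch on net.Error without unwrapping the
// underlying error. shouldRetryNet is a hypothetical helper, shown for illustration only.
func shouldRetryNet(err error) bool {
if nerr, ok := err.(net.Error); ok {
return nerr.Timeout() || nerr.Temporary()
}
return false
}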
var _ encodingPkg.BinaryMarshaler = (*URL)(nil)
var _ encodingPkg.BinaryUnmarshaler = (*URL)(nil)
func TestJSON(t *testing.T) {
var u URL
err := u.Parse("https://www.google.com/x?y=z")
if err != nil {
t.Fatal(err)
}
js, err := json.Marshal(&u)
if err != nil {
t.Fatal(err)
}
// If only we could implement TextMarshaler/TextUnmarshaler,
// this would work:
//
// if string(js) != strconv.Quote(u.String()) {
// t.Errorf("json encoding: %s\nwant: %s\n", js, strconv.Quote(u.String()))
// }
u1 := new(URL)
err = json.Unmarshal(js, u1)
if err != nil {
t.Fatal(err)
}
if u1.String() != u.String() {
t.Errorf("json decoded to: %s\nwant: %s\n", u1, &u)
}
}
func TestGob(t *testing.T) {
var u URL
err := u.Parse("https://www.google.com/x?y=z")
if err != nil {
t.Fatal(err)
}
var w bytes.Buffer
err = gob.NewEncoder(&w).Encode(&u)
if err != nil {
t.Fatal(err)
}
u1 := new(URL)
err = gob.NewDecoder(&w).Decode(u1)
if err != nil {
t.Fatal(err)
}
if u1.String() != u.String() {
t.Errorf("json decoded to: %s\nwant: %s\n", u1, &u)
}
}

15
help.go

@@ -1,15 +0,0 @@
package main
const helpText =
`HTTP crawler for the OD-Database
DB >> https://od-db.the-eye.eu <<
Crawler >> https://github.com/terorie/od-database-crawler <<
Server >> https://github.com/simon987/od-database <<
Quick start:
- get config file (config.yml in working dir)
- get OD-DB server ("server.url": Database URL + /api)
- get access token ("server.token": e.g. c010b6dd-20...)
- ./od-database-crawler server
Questions? Discord @terorie#2664 / Telegram @terorie`

47
jenkins/Jenkinsfile vendored

@@ -1,47 +0,0 @@
def remote = [:]
remote.name = 'remote'
remote.host = env.DEPLOY_HOST
remote.user = env.DEPLOY_USER
remote.identityFile = '/var/lib/jenkins/.ssh/id_rsa'
remote.knownHosts = '/var/lib/jenkins/.ssh/known_hosts'
remote.allowAnyHosts = true
remote.retryCount = 3
remote.retryWaitSec = 3
logLevel = 'FINER'
pipeline {
agent none
environment {
GOOS='linux'
CGO_ENABLED='0'
HOME='.'
}
stages {
stage('Build') {
agent {
docker {
image 'golang:latest'
}
}
steps {
sh 'mkdir -p /go/src/github.com/terorie/od-database-crawler'
sh 'cp -r *.go fasturl ds jenkins/build.sh "/go/src/github.com/terorie/od-database-crawler"'
sh 'cd /go/src/github.com/terorie/od-database-crawler && go get ./...'
sh './jenkins/build.sh'
stash includes: 'dist/', name: 'dist'
}
}
stage('Deploy') {
agent none
steps {
node('master') {
unstash 'dist'
sshCommand remote: remote, command: "ls od-database-crawler/"
sshPut remote: remote, from: 'dist/', into: 'od-database-crawler'
}
}
}
}
}


@@ -1,23 +0,0 @@
#!/usr/bin/env bash
appname="od-database-crawler"
outdir="dist/"
tag="${BUILD_ID}_$(date +%Y-%m-%d)"
rm -rf "./${outdir}"
mkdir -p "${outdir}"
name=${outdir}${appname}-${tag}-linux
GOOS="linux" GOARCH="amd64" go build -ldflags="-s -w" -o ${name}
gzip -f ${name}
echo ${name}
name=${outdir}${appname}-${tag}-mac
GOOS="darwin" GOARCH="amd64" go build -ldflags="-s -w" -o ${name}
gzip -f ${name}
echo ${name}
name=${outdir}${appname}-${tag}-freebsd
GOOS="freebsd" GOARCH="amd64" go build -ldflags="-s -w" -o ${name}
gzip -f ${name}
echo ${name}

203
main.go

@@ -2,194 +2,91 @@ package main
import (
"context"
"fmt"
"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
"github.com/spf13/viper"
"github.com/terorie/od-database-crawler/fasturl"
"github.com/terorie/oddb-go/fasturl"
"github.com/terorie/oddb-go/runes"
"github.com/urfave/cli"
"log"
"net/http"
_ "net/http/pprof"
"os"
"os/signal"
"strings"
"sync/atomic"
"time"
)
var configFile string
var rootCmd = cobra.Command {
Use: "od-database-crawler",
Version: "1.2.2",
Short: "OD-Database Go crawler",
Long: helpText,
PersistentPreRunE: preRun,
PersistentPostRun: func(cmd *cobra.Command, args []string) {
exitHooks.Execute()
var app = cli.App {
Name: "oddb-go",
Usage: "OD-Database Go crawler",
Version: "0.1",
BashComplete: cli.DefaultAppComplete,
Writer: os.Stdout,
Compiled: buildDate,
Commands: []cli.Command{
{
Name: "crawl",
Usage: "Crawl a list of URLs",
ArgsUsage: "[site, site, ...]",
Action: cmdCrawler,
},
},
}
var serverCmd = cobra.Command {
Use: "server",
Short: "Start crawl server",
Long: "Connect to the OD-Database and contribute to the database\n" +
"by crawling the web for open directories!",
Run: cmdBase,
}
var crawlCmd = cobra.Command {
Use: "crawl",
Short: "Crawl an URL",
Long: "Crawl the URL specified.\n" +
"Results will not be uploaded to the database,\n" +
"they're saved under crawled/0.json instead.\n" +
"Primarily used for testing and benchmarking.",
RunE: cmdCrawler,
Args: cobra.ExactArgs(1),
}
var exitHooks Hooks
func init() {
rootCmd.AddCommand(&crawlCmd)
rootCmd.AddCommand(&serverCmd)
prepareConfig()
}
func preRun(cmd *cobra.Command, args []string) error {
if err := os.MkdirAll("crawled", 0755);
err != nil { panic(err) }
if err := os.MkdirAll("queue", 0755);
err != nil { panic(err) }
return nil
}
func main() {
err := rootCmd.Execute()
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
go func() {
log.Println(http.ListenAndServe("localhost:42069", nil))
}()
app.Run(os.Args)
}
func cmdBase(_ *cobra.Command, _ []string) {
onlineMode = true
func cmdCrawler(clic *cli.Context) error {
readConfig()
appCtx, soft := context.WithCancel(context.Background())
forceCtx, hard := context.WithCancel(context.Background())
go hardShutdown(forceCtx)
go listenCtrlC(soft, hard)
inRemotes := make(chan *OD)
go Schedule(appCtx, inRemotes)
ticker := time.NewTicker(config.Recheck)
defer ticker.Stop()
for {
select {
case <-appCtx.Done():
goto shutdown
case <-ticker.C:
t, err := FetchTask()
if err != nil {
logrus.WithError(err).
Error("Failed to get new task")
if !sleep(viper.GetDuration(ConfCooldown), appCtx) {
goto shutdown
}
continue
}
if t == nil {
// No new task
if atomic.LoadInt32(&numActiveTasks) == 0 {
logrus.Info("Waiting …")
}
continue
if clic.NArg() == 0 {
cli.ShowCommandHelpAndExit(clic, "crawl", 1)
}
var baseUri fasturl.URL
err = baseUri.Parse(t.Url)
if urlErr, ok := err.(*fasturl.Error); ok && urlErr.Err == fasturl.ErrUnknownScheme {
// Not an error
err = nil
// TODO FTP crawler
continue
} else if err != nil {
logrus.WithError(err).
Error("Failed to get new task")
time.Sleep(viper.GetDuration(ConfCooldown))
continue
}
ScheduleTask(inRemotes, t, &baseUri)
}
}
shutdown:
globalWait.Wait()
}
func cmdCrawler(_ *cobra.Command, args []string) error {
onlineMode = false
readConfig()
arg := args[0]
args := clic.Args()
remotes := make([]*OD, len(args))
for i, argStr := range args {
// https://github.com/golang/go/issues/19779
if !strings.Contains(arg, "://") {
arg = "http://" + arg
if !strings.Contains(argStr, "://") {
argStr = "http://" + argStr
}
arg := []rune(argStr)
var u fasturl.URL
err := u.Parse(arg)
if !strings.HasSuffix(u.Path, "/") {
u.Path += "/"
if !runes.HasSuffix(u.Path, []rune("/")) {
u.Path = append(u.Path, '/')
}
if err != nil { return err }
remotes[i] = &OD{ BaseUri: u }
}
// TODO Graceful shutdown
forceCtx := context.Background()
c := context.Background()
inRemotes := make(chan *OD)
go Schedule(forceCtx, inRemotes)
go Schedule(c, inRemotes)
ticker := time.NewTicker(3 * time.Second)
defer ticker.Stop()
task := Task {
WebsiteId: 0,
Url: u.String(),
for _, remote := range remotes {
globalWait.Add(1)
inRemotes <- remote
}
ScheduleTask(inRemotes, &task, &u)
// Wait for all jobs to finish
globalWait.Wait()
logrus.Info("All dirs processed!")
return nil
}
func listenCtrlC(soft, hard context.CancelFunc) {
c := make(chan os.Signal)
signal.Notify(c, os.Interrupt)
<-c
logrus.Info(">>> Shutting down crawler... <<<")
soft()
<-c
logrus.Warning(">>> Force shutdown! <<<")
hard()
}
func hardShutdown(c context.Context) {
<-c.Done()
os.Exit(1)
}
func sleep(d time.Duration, c context.Context) bool {
select {
case <-time.After(d):
return true
case <-c.Done():
return false
}
}
var buildDate = time.Date(
2018, 10, 28,
17, 10, 0, 0,
time.UTC)


@@ -1,38 +1,14 @@
package main
import (
"github.com/terorie/od-database-crawler/ds/redblackhash"
"github.com/terorie/od-database-crawler/fasturl"
"github.com/terorie/oddb-go/ds/redblackhash"
"github.com/terorie/oddb-go/fasturl"
"sync"
"time"
)
type ResultCode int
const (
TR_OK = ResultCode(iota)
TR_FAIL = 1
TR_SKIP = 2
)
type Task struct {
WebsiteId uint64 `json:"website_id"`
Url string `json:"url"`
UploadToken string `json:"upload_token"`
TaskId int64
}
type TaskResult struct {
ResultCode ResultCode `json:"status_code"`
FileCount uint64 `json:"file_count"`
ErrorCount uint64 `json:"-"`
StartTime time.Time `json:"-"`
StartTimeUnix int64 `json:"start_time"`
EndTimeUnix int64 `json:"end_time"`
WebsiteId uint64 `json:"website_id"`
}
type Job struct {
OD *OD
Uri fasturl.URL
UriStr string
Fails int
@@ -40,37 +16,30 @@ type Job struct {
}
type OD struct {
Task Task
Result TaskResult
Wait sync.WaitGroup
BaseUri fasturl.URL
Files []File
WCtx WorkerContext
Scanned redblackhash.Tree
lock sync.Mutex
}
type File struct {
Name string `json:"name"`
Name []rune `json:"name"`
Size int64 `json:"size"`
MTime int64 `json:"mtime"`
Path string `json:"path"`
MTime time.Time `json:"mtime"`
Path []rune `json:"path"`
IsDir bool `json:"-"`
}
func (o *OD) LoadOrStoreKey(k *redblackhash.Key) (exists bool) {
o.Scanned.Lock()
defer o.Scanned.Unlock()
o.lock.Lock()
defer o.lock.Unlock()
exists = o.Scanned.Get(k)
if exists {
return true
}
if exists { return true }
o.Scanned.Put(k)
return false
}
type errorString string
func (e errorString) Error() string {
return string(e)
}

129
queue.go

@@ -1,129 +0,0 @@
package main
import (
"github.com/beeker1121/goque"
"os"
"sync"
"sync/atomic"
)
type BufferedQueue struct {
dataDir string
q *goque.Queue
buf []Job
m sync.Mutex
}
func OpenQueue(dataDir string) (bq *BufferedQueue, err error) {
bq = new(BufferedQueue)
if config.JobBufferSize < 0 {
return
}
bq.dataDir = dataDir
bq.q, err = goque.OpenQueue(dataDir)
if err != nil { return nil, err }
return
}
func (q *BufferedQueue) Enqueue(job *Job) error {
atomic.AddInt64(&totalQueued, 1)
if q.directEnqueue(job) {
return nil
}
var gob JobGob
gob.ToGob(job)
_, err := q.q.EnqueueObject(gob)
return err
}
func (q *BufferedQueue) Dequeue() (job Job, err error) {
if q.directDequeue(&job) {
atomic.AddInt64(&totalQueued, -1)
return job, nil
}
if config.JobBufferSize < 0 {
err = goque.ErrEmpty
return
}
var item *goque.Item
item, err = q.q.Dequeue()
if err != nil { return }
atomic.AddInt64(&totalQueued, -1)
var gob JobGob
err = item.ToObject(&gob)
if err != nil { return }
gob.FromGob(&job)
return
}
func (q *BufferedQueue) directEnqueue(job *Job) bool {
q.m.Lock()
defer q.m.Unlock()
bs := config.JobBufferSize
if len(q.buf) < bs || bs < 0 {
q.buf = append(q.buf, *job)
return true
} else {
return false
}
}
func (q *BufferedQueue) directDequeue(job *Job) bool {
q.m.Lock()
defer q.m.Unlock()
if len(q.buf) > 0 {
*job = q.buf[0]
q.buf = q.buf[1:]
return true
} else {
return false
}
}
// Close always returns nil (but implements io.Closer)
func (q *BufferedQueue) Close() error {
if config.JobBufferSize < 0 {
return nil
}
// Close ignoring errors
q.q.Close()
// Delete files
if err := os.RemoveAll(q.dataDir);
err != nil { panic(err) }
return nil
}
type JobGob struct {
Uri string
Fails int
LastError string
}
func (g *JobGob) ToGob(j *Job) {
g.Uri = j.UriStr
g.Fails = j.Fails
if j.LastError != nil {
g.LastError = j.LastError.Error()
}
}
func (g *JobGob) FromGob(j *Job) {
if err := j.Uri.Parse(g.Uri);
err != nil { panic(err) }
j.UriStr = g.Uri
j.Fails = g.Fails
if g.LastError != "" {
j.LastError = errorString(g.LastError)
}
}


@@ -1,25 +0,0 @@
#!/usr/bin/env bash
appname="od-database-crawler"
tag=$1
[ -z "$tag" ] && echo "Usage: build <version>" && exit 1
name=${appname}-${tag}-windows.exe
GOOS="windows" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name
name=${appname}-${tag}-linux
GOOS="linux" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name
name=${appname}-${tag}-mac
GOOS="darwin" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name
name=${appname}-${tag}-freebsd
GOOS="freebsd" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name

98
runes/runes.go Normal file

@@ -0,0 +1,98 @@
package runes
func Create(rs ...[]rune) (x []rune) {
for _, r := range rs {
x = append(x, r...)
}
return x
}
func IndexRune(s []rune, r rune) int {
for i, sr := range s {
if r == sr {
return i
}
}
return -1
}
func LastIndexRune(s []rune, r rune) int {
for i := len(s)-1; i >= 0; i-- {
sr := s[i]
if r == sr {
return i
}
}
return -1
}
func Equals(a, b []rune) bool {
if len(a) != len(b) {
return false
}
for i := 0; i < len(a); i++ {
if a[i] != b[i] {
return false
}
}
return true
}
func Count(s []rune, r rune) (n int) {
for _, sr := range s {
if sr == r {
n++
}
}
return
}
func HasPrefix(s, prefix []rune) bool {
return len(s) >= len(prefix) && Equals(s[0:len(prefix)], prefix)
}
func HasSuffix(s, suffix []rune) bool {
return len(s) >= len(suffix) && Equals(s[len(s)-len(suffix):], suffix)
}
// TrimPrefix returns s without the provided leading prefix string.
// If s doesn't start with prefix, s is returned unchanged.
func TrimPrefix(s, prefix []rune) []rune {
if HasPrefix(s, prefix) {
return s[len(prefix):]
}
return s
}
func TrimRune(s []rune, r rune) []rune {
// Trim prefix
start := 0
for start < len(s) && s[start] == r {
start++
}
// Trim suffix
end := len(s)
for end > start && s[end-1] == r {
end--
}
if start == end {
return nil
}
return s[start:end]
}
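// Hedged usage note (not part of the committed file): main.go uses these helpers to
// normalize []rune paths, for example to guarantee a trailing slash:
//
//	if !runes.HasSuffix(u.Path, []rune("/")) {
//		u.Path = append(u.Path, '/')
//	}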

44
runespath/path.go Normal file

@@ -0,0 +1,44 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package path implements utility routines for manipulating slash-separated
// paths.
//
// The path package should only be used for paths separated by forward
// slashes, such as the paths in URLs. This package does not deal with
// Windows paths with drive letters or backslashes; to manipulate
// operating system paths, use the path/filepath package.
package runespath
import (
"github.com/terorie/oddb-go/runes"
)
// Base returns the last element of path.
// Trailing slashes are removed before extracting the last element.
// If the path is empty, Base returns ".".
// If the path consists entirely of slashes, Base returns "/".
func Base(path []rune) []rune {
if len(path) == 0 {
return []rune(".")
}
// Strip trailing slashes.
for len(path) > 0 && path[len(path)-1] == '/' {
path = path[0 : len(path)-1]
}
// Find the last element
if i := runes.LastIndexRune(path, '/'); i >= 0 {
path = path[i+1:]
}
// If empty now, it had only slashes.
if len(path) == 0 {
return []rune("/")
}
return path
}
// IsAbs reports whether the path is absolute.
func IsAbs(path string) bool {
return len(path) > 0 && path[0] == '/'
}
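// Hedged examples (not part of the committed file) of the Base contract documented above:
// Base([]rune("/a/b/")) is []rune("b"), Base([]rune("///")) is []rune("/"), and Base(nil)
// is []rune(".").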


@@ -2,218 +2,101 @@ package main
import (
"context"
"encoding/json"
"fmt"
"github.com/sirupsen/logrus"
"github.com/terorie/od-database-crawler/fasturl"
"os"
"path"
"sync"
"sync/atomic"
"time"
)
var activeTasksLock sync.Mutex
var activeTasks = make(map[uint64]bool)
var numActiveTasks int32
var totalQueued int64
var activeTasks int32
var totalBuffered int64
func Schedule(c context.Context, remotes <-chan *OD) {
go Stats(c)
for remote := range remotes {
for {
select {
case <-c.Done():
return
case remote := <-remotes:
logrus.WithField("url", remote.BaseUri.String()).
Info("Starting crawler")
// Collect results
results := make(chan File)
remote.WCtx.OD = remote
// Get queue path
queuePath := path.Join("queue", fmt.Sprintf("%d", remote.Task.WebsiteId))
// Delete existing queue
if err := os.RemoveAll(queuePath); err != nil {
panic(err)
}
// Start new queue
var err error
remote.WCtx.Queue, err = OpenQueue(queuePath)
if err != nil {
panic(err)
}
// Spawn workers
remote.WCtx.in, remote.WCtx.out = makeJobBuffer(c)
for i := 0; i < config.Workers; i++ {
go remote.WCtx.Worker(results)
go remote.WCtx.Worker()
}
// Enqueue initial job
atomic.AddInt32(&numActiveTasks, 1)
atomic.AddInt32(&activeTasks, 1)
remote.WCtx.queueJob(Job{
OD: remote,
Uri: remote.BaseUri,
UriStr: remote.BaseUri.String(),
Fails: 0,
})
globalWait.Done()
// Upload result when ready
go remote.Watch(results)
go remote.Watch()
}
}
}
// Sleep if max number of tasks are active
for atomic.LoadInt32(&numActiveTasks) > config.Tasks {
func (r *OD) Watch() {
// Wait for all jobs on remote to finish
r.Wait.Wait()
close(r.WCtx.in)
atomic.AddInt32(&activeTasks, -1)
logrus.WithField("url", r.BaseUri.String()).
Info("Crawler finished")
}
func makeJobBuffer(c context.Context) (chan<- Job, <-chan Job) {
in := make(chan Job)
out := make(chan Job)
go bufferJobs(c, in, out)
return in, out
}
func bufferJobs(c context.Context, in chan Job, out chan Job) {
defer close(out)
var inQueue []Job
outCh := func() chan Job {
if len(inQueue) == 0 {
return nil
}
return out
}
for len(inQueue) > 0 || in != nil {
if len(inQueue) == 0 {
select {
case v, ok := <-in:
if !ok {
in = nil
} else {
atomic.AddInt64(&totalBuffered, 1)
inQueue = append(inQueue, v)
}
case <-c.Done():
return
case <-time.After(time.Second):
continue
}
}
}
}
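// Hedged sketch (not in the diff): bufferJobs relies on the nil-channel select idiom.
// outCh() returns nil while inQueue is empty, which disables the send case, so the select
// blocks on input or cancellation only. A minimal self-contained form of the pattern:
//
//	for len(q) > 0 || in != nil {
//		var send chan Job
//		var next Job
//		if len(q) > 0 {
//			send, next = out, q[0]
//		}
//		select {
//		case v, ok := <-in:
//			if !ok {
//				in = nil
//			} else {
//				q = append(q, v)
//			}
//		case send <- next:
//			q = q[1:]
//		}
//	}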
func ScheduleTask(remotes chan<- *OD, t *Task, u *fasturl.URL) {
if !t.register() {
return
}
globalWait.Add(1)
now := time.Now()
od := &OD{
Task: *t,
BaseUri: *u,
Result: TaskResult{
WebsiteId: t.WebsiteId,
StartTime: now,
StartTimeUnix: now.Unix(),
},
}
remotes <- od
}
func (t *Task) register() bool {
activeTasksLock.Lock()
defer activeTasksLock.Unlock()
if _, known := activeTasks[t.WebsiteId]; known {
return false
} else {
activeTasks[t.WebsiteId] = true
return true
}
}
func (t *Task) unregister() {
activeTasksLock.Lock()
delete(activeTasks, t.WebsiteId)
activeTasksLock.Unlock()
}
func (o *OD) Watch(results chan File) {
// Mark job as completely done
defer globalWait.Done()
defer o.Task.unregister()
filePath := path.Join("crawled", fmt.Sprintf("%d.json", o.Task.WebsiteId))
// Open crawl results file
f, err := os.OpenFile(
filePath,
os.O_CREATE|os.O_RDWR|os.O_TRUNC,
0644,
)
if err != nil {
logrus.WithError(err).
Error("Failed saving crawl results")
return
}
defer f.Close()
defer os.Remove(filePath)
// Listen for exit code of Collect()
collectErrC := make(chan error)
// Block until all results are written
// (closes results channel)
o.handleCollect(results, f, collectErrC)
// Exit code of Collect()
err = <-collectErrC
close(collectErrC)
if err != nil {
logrus.WithError(err).
Error("Failed saving crawl results")
return
}
// Upload results
err = PushResult(&o.Task, f)
if err != nil {
logrus.WithError(err).
Error("Failed uploading crawl results")
return
}
}
func (o *OD) handleCollect(results chan File, f *os.File, collectErrC chan error) {
// Begin collecting results
go o.Task.Collect(results, f, collectErrC)
defer close(results)
// Wait for all jobs on remote to finish
o.Wait.Wait()
// Close queue
if err := o.WCtx.Queue.Close(); err != nil {
panic(err)
}
atomic.AddInt32(&numActiveTasks, -1)
// Log finish
logrus.WithFields(logrus.Fields{
"id": o.Task.WebsiteId,
"url": o.BaseUri.String(),
"duration": time.Since(o.Result.StartTime),
}).Info("Crawler finished")
// Set status code
now := time.Now()
o.Result.EndTimeUnix = now.Unix()
if atomic.LoadUint64(&o.Result.ErrorCount) != 0 {
o.Result.ResultCode = TR_FAIL
select {
case v, ok := <-in:
if !ok {
in = nil
} else {
o.Result.ResultCode = TR_OK
atomic.AddInt64(&totalBuffered, 1)
inQueue = append(inQueue, v)
}
case outCh() <- inQueue[0]:
atomic.AddInt64(&totalBuffered, -1)
inQueue = inQueue[1:]
case <-c.Done():
return
}
}
}
}
func (t *Task) Collect(results chan File, f *os.File, errC chan<- error) {
err := t.collect(results, f)
if err != nil {
logrus.WithError(err).
Error("Failed saving crawl results")
}
errC <- err
}
func (t *Task) collect(results chan File, f *os.File) error {
for result := range results {
result.Path = fasturl.PathUnescape(result.Path)
result.Name = fasturl.PathUnescape(result.Name)
resJson, err := json.Marshal(result)
if err != nil {
panic(err)
}
_, err = f.Write(resJson)
if err != nil {
return err
}
_, err = f.Write([]byte{'\n'})
if err != nil {
return err
}
}
return nil
}

328
server.go

@@ -2,271 +2,181 @@ package main
import (
"bytes"
"encoding/base64"
"encoding/json"
"fmt"
"github.com/fasthttp/websocket"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/time/rate"
"io"
"io/ioutil"
"mime/multipart"
"net/http"
"net/url"
"os"
"path/filepath"
"strconv"
"strings"
)
var serverWorker *TrackerWorker
const (
fileListChunkSize int64 = 5000000 // 5 mb
)
var serverClient = http.Client{
Timeout: config.ServerTimeout,
Transport: new(ServerTripper),
}
var serverUserAgent = "od-database-crawler/" + rootCmd.Version
func getOrCreateWorker() {
if _, err := os.Stat("worker.json"); os.IsNotExist(err) {
req := CreateTrackerWorkerRequest{
Alias: config.TrackerAlias,
}
body, _ := json.Marshal(&req)
buf := bytes.NewBuffer(body)
resp, _ := serverClient.Post(config.TrackerUrl+"/worker/create", "application/json", buf)
workerResponse := CreateTrackerWorkerResponse{}
respBody, _ := ioutil.ReadAll(resp.Body)
_ = json.Unmarshal(respBody, &workerResponse)
workerJsonData, _ := json.Marshal(&workerResponse.Content.Worker)
fp, _ := os.OpenFile("worker.json", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600)
_, _ = fp.Write(workerJsonData)
//Request ASSIGN permission
serverWorker = &workerResponse.Content.Worker
accessReq, _ := json.Marshal(WorkerAccessRequest{
Project: config.TrackerProject,
Assign: true,
Submit: false,
})
buf = bytes.NewBuffer(accessReq)
res, err := serverClient.Post(config.TrackerUrl+"/project/request_access", "application/json", buf)
if err != nil {
panic(err)
}
logrus.WithFields(logrus.Fields{
"response": res.StatusCode,
}).Info("Requested ASSIGN permission")
} else {
var worker TrackerWorker
fp, _ := os.OpenFile("worker.json", os.O_RDONLY, 0600)
workerJsonData, _ := ioutil.ReadAll(fp)
_ = json.Unmarshal(workerJsonData, &worker)
serverWorker = &worker
}
}
var serverClient = http.DefaultClient
func FetchTask() (t *Task, err error) {
escToken, _ := json.Marshal(config.Token)
payload := `{"token":` + string(escToken) + `}`
if serverWorker == nil {
getOrCreateWorker()
}
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/get",
strings.NewReader(payload))
if err != nil { return }
res, err := serverClient.Get(config.TrackerUrl + "/task/get/" + strconv.Itoa(config.TrackerProject))
if err != nil {
return
}
res, err := serverClient.Do(req)
if err != nil { return }
defer res.Body.Close()
switch res.StatusCode {
case 200:
break
default:
return nil, fmt.Errorf("http %s", res.Status)
}
jsonResponse := FetchTaskResponse{}
err = json.NewDecoder(res.Body).Decode(&jsonResponse)
if _, ok := err.(*json.SyntaxError); ok {
return nil, fmt.Errorf("/task/get returned invalid JSON")
} else if err != nil {
if res.StatusCode != 200 {
err = fmt.Errorf("http %s", res.Status)
return
}
if !jsonResponse.Ok {
if jsonResponse.Message == "No task available" {
return nil, nil
}
return nil, errors.New(jsonResponse.Message)
}
task := Task{}
err = json.Unmarshal([]byte(jsonResponse.Content.Task.Recipe), &task)
if _, ok := err.(*json.SyntaxError); ok {
return nil, fmt.Errorf("/task/get returned invalid JSON")
} else if err != nil {
return
}
t = &task
t.TaskId = jsonResponse.Content.Task.Id
t = new(Task)
err = json.NewDecoder(res.Body).Decode(t)
if err != nil { return }
return
}
func PushResult(task *Task, f *os.File) (err error) {
if task.WebsiteId == 0 {
// Not a real result, don't push
return nil
}
func PushResult(result *TaskResult) (err error) {
filePath := filepath.Join(
".", "crawled",
fmt.Sprintf("%d.json", result.WebsiteId))
// Rewind to the beginning of the file
_, err = f.Seek(0, 0)
if err != nil {
defer os.Remove(filePath)
f, err := os.Open(filePath)
if os.IsNotExist(err) {
err = fmt.Errorf("cannot upload result: %s does not exist", filePath)
return
} else if err != nil {
return
}
defer f.Close()
err = uploadWebsocket(f, task.UploadToken)
err = uploadChunks(result.WebsiteId, f)
if err != nil {
logrus.Errorf("Failed to upload file list: %s", err)
err2 := releaseTask(task, TR_FAIL)
err2 := CancelTask(result.WebsiteId)
if err2 != nil {
logrus.Error(err2)
}
return
}
// Upload result ignoring errors
_ = releaseTask(task, TR_OK)
err = uploadResult(result)
if err != nil {
logrus.Errorf("Failed to upload result: %s", err)
err2 := CancelTask(result.WebsiteId)
if err2 != nil {
logrus.Error(err2)
}
return
}
return
}
func uploadWebsocket(f *os.File, token string) (err error) {
func uploadChunks(websiteId uint64, f *os.File) (err error) {
for iter := 1; iter > 0; iter++ {
// TODO Stream with io.Pipe?
var b bytes.Buffer
u := url.URL{Scheme: config.WsBucketScheme, Host: config.WsBucketHost, Path: "/upload"}
multi := multipart.NewWriter(&b)
header := http.Header{}
header.Add("X-Upload-Token", token)
conn, _, err := websocket.DefaultDialer.Dial(u.String(), header)
if err != nil {
return
// Set upload fields
err = multi.WriteField("token", config.Token)
if err != nil { return }
err = multi.WriteField("website_id", fmt.Sprintf("%d", websiteId))
if err != nil { return }
// Copy chunk to file_list
formFile, err := multi.CreateFormFile("file_list", "file_list")
_, err = io.CopyN(formFile, f, fileListChunkSize)
if err == io.EOF {
break
} else if err == io.ErrUnexpectedEOF {
err = nil
// Break at end of iteration
iter = -420
}
conn.EnableWriteCompression(true) //TODO: Is this necessary?
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/upload",
&b)
if err != nil { return err }
socketWriter, _ := conn.NextWriter(websocket.BinaryMessage)
_, _ = io.Copy(socketWriter, f)
err = socketWriter.Close()
if err != nil {
logrus.Error("FIXME: couldn't do file upload")
return
}
err = conn.Close()
if err != nil {
return
}
return nil
}
func releaseTask(task *Task, taskResult ResultCode) (err error) {
req := releaseTaskRequest{
TaskId: task.TaskId,
ResultCode: taskResult,
// TODO Will implement verification in a later ODDB update
Verification: 0,
}
resultEnc, err := json.Marshal(&req)
if err != nil {
panic(err)
}
body := bytes.NewBuffer(resultEnc)
res, err := serverClient.Post(
config.TrackerUrl+"/task/release",
"application/json",
body,
)
if err != nil {
return
}
res, err := serverClient.Do(req)
if err != nil { return err }
res.Body.Close()
if res.StatusCode != http.StatusOK {
return HttpError{res.StatusCode}
return fmt.Errorf("failed to upload list part %d: %s",
iter, res.Status)
}
logrus.Infof("Uploading file list part %d: %s",
iter, res.Status)
}
return
}
func uploadResult(result *TaskResult) (err error) {
resultEnc, err := json.Marshal(result)
if err != nil { panic(err) }
payload := url.Values {
"token": {config.Token},
"result": {string(resultEnc)},
}.Encode()
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/complete",
strings.NewReader(payload))
if err != nil { return }
res, err := serverClient.Do(req)
if err != nil { return }
res.Body.Close()
if res.StatusCode != http.StatusOK {
return fmt.Errorf("failed to cancel task: %s", res.Status)
}
return
}
type ServerTripper struct{}
func (t *ServerTripper) RoundTrip(req *http.Request) (res *http.Response, err error) {
req.Header.Set("User-Agent", serverUserAgent)
//TODO: Use task_tracker/client ?
if serverWorker != nil {
req.Header.Add("X-Worker-Id", strconv.Itoa(serverWorker.Id))
req.Header.Add("X-Secret", base64.StdEncoding.EncodeToString(serverWorker.Secret))
func CancelTask(websiteId uint64) (err error) {
form := url.Values{
"token": {config.Token},
"website_id": {strconv.FormatUint(websiteId, 10)},
}
return http.DefaultTransport.RoundTrip(req)
}
encForm := form.Encode()
// https://github.com/simon987/task_tracker/blob/master/api/models.go
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/cancel",
strings.NewReader(encForm))
if err != nil { return }
type releaseTaskRequest struct {
TaskId int64 `json:"task_id"`
ResultCode ResultCode `json:"result"`
Verification int64 `json:"verification"`
}
res, err := serverClient.Do(req)
if err != nil { return }
res.Body.Close()
type WorkerAccessRequest struct {
Assign bool `json:"assign"`
Submit bool `json:"submit"`
Project int `json:"project"`
}
if res.StatusCode != http.StatusOK {
return fmt.Errorf("failed to cancel task: %s", res.Status)
}
type FetchTaskResponse struct {
Ok bool `json:"ok"`
Message string `json:"message"`
Content struct {
Task struct {
Id int64 `json:"id"`
Priority int64 `json:"priority"`
Project struct {
Id int64 `json:"id"`
Name string `json:"name"`
Version string `json:"version"`
AssignRate rate.Limit `json:"assign_rate"`
SubmitRate rate.Limit `json:"submit_rate"`
} `json:"project"`
Recipe string `json:"recipe"`
} `json:"task"`
} `json:"content"`
}
type TrackerWorker struct {
Alias string `json:"alias"`
Id int `json:"id"`
Secret []byte `json:"secret"`
}
type CreateTrackerWorkerResponse struct {
Ok bool `json:"ok"`
Message string `json:"message"`
Content struct {
Worker TrackerWorker `json:"worker"`
} `json:"content"`
}
type CreateTrackerWorkerRequest struct {
Alias string `json:"alias"`
return
}


@@ -3,7 +3,6 @@ package main
import (
"context"
"github.com/sirupsen/logrus"
"github.com/spf13/viper"
"math"
"runtime"
"sync/atomic"
@@ -20,14 +19,11 @@ func Stats(c context.Context) {
var crawlTicker <-chan time.Time
var allocTicker <-chan time.Time
crawlInterval := viper.GetDuration(ConfCrawlStats)
allocInterval := viper.GetDuration(ConfAllocStats)
if crawlInterval != 0 {
crawlTicker = time.Tick(crawlInterval)
if config.CrawlStats != 0 {
crawlTicker = time.NewTicker(config.CrawlStats).C
}
if allocInterval != 0 {
allocTicker = time.Tick(allocInterval)
if config.AllocStats != 0 {
allocTicker = time.NewTicker(config.AllocStats).C
}
for {
@@ -36,17 +32,13 @@ func Stats(c context.Context) {
startedNow := atomic.LoadUint64(&totalStarted)
perSecond := float64(startedNow - startedLast) /
crawlInterval.Seconds()
config.CrawlStats.Seconds()
// Round to .5
perSecond *= 2
perSecond = math.Round(perSecond)
perSecond /= 2
if perSecond <= 0 {
continue
}
logrus.WithFields(logrus.Fields{
"per_second": perSecond,
"done": atomic.LoadUint64(&totalDone),
@@ -61,7 +53,7 @@ func Stats(c context.Context) {
runtime.ReadMemStats(&mem)
logrus.WithFields(logrus.Fields{
"queue_count": atomic.LoadInt64(&totalQueued),
"queue_count": totalBuffered,
"heap": FormatByteCount(mem.Alloc),
"objects": mem.HeapObjects,
"num_gc": mem.NumGC,

16
tasks.go Normal file

@@ -0,0 +1,16 @@
package main
import "time"
type Task struct {
WebsiteId int `json:"website_id"`
Url string `json:"url"`
}
type TaskResult struct {
StatusCode int `json:"status_code"`
FileCount uint64 `json:"file_count"`
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
WebsiteId uint64 `json:"website_id"`
}

22
util.go

@@ -1,9 +1,6 @@
package main
import (
"fmt"
"sync"
)
import "fmt"
// https://programming.guide/go/formatting-byte-size-to-human-readable-format.html
func FormatByteCount(b uint64) string {
@@ -19,20 +16,3 @@ func FormatByteCount(b uint64) string {
return fmt.Sprintf("%.1f %ciB", float64(b)/float64(div), "KMGTPE"[exp])
}
}
type Hooks struct {
m sync.Mutex
l []func()
}
func (h *Hooks) Add(hook func()) {
h.m.Lock()
h.l = append(h.l, hook)
h.m.Unlock()
}
func (h *Hooks) Execute() {
for _, hook := range h.l {
hook()
}
}

124
worker.go

@@ -1,11 +1,8 @@
package main
import (
"github.com/beeker1121/goque"
"github.com/sirupsen/logrus"
"math"
"sort"
"strings"
"sync"
"sync/atomic"
"time"
@@ -14,38 +11,24 @@ import (
var globalWait sync.WaitGroup
type WorkerContext struct {
OD *OD
Queue *BufferedQueue
in chan<- Job
out <-chan Job
lastRateLimit time.Time
numRateLimits int
}
func (w *WorkerContext) Worker(results chan<- File) {
for {
job, err := w.Queue.Dequeue()
switch err {
case goque.ErrEmpty:
time.Sleep(500 * time.Millisecond)
continue
case goque.ErrDBClosed:
return
case nil:
w.step(results, job)
default:
panic(err)
}
func (w WorkerContext) Worker() {
for job := range w.out {
w.step(job)
}
}
func (w *WorkerContext) step(results chan<- File, job Job) {
defer w.finishJob()
func (w WorkerContext) step(job Job) {
defer w.finishJob(&job)
var f File
newJobs, err := w.DoJob(&job, &f)
newJobs, err := DoJob(&job, &f)
atomic.AddUint64(&totalStarted, 1)
if err == ErrKnown {
return
@@ -54,18 +37,15 @@ func (w *WorkerContext) step(results chan<- File, job Job) {
if err != nil {
job.Fails++
if !shouldRetry(err) {
atomic.AddUint64(&totalAborted, 1)
//logrus.WithField("url", job.UriStr).
// WithError(err).
// Error("Giving up after failure")
if err == ErrForbidden {
// Don't attempt crawling again
return
}
if job.Fails > config.Retries {
atomic.AddUint64(&totalAborted, 1)
//logrus.WithField("url", job.UriStr).
// Errorf("Giving up after %d fails", job.Fails)
logrus.WithField("url", job.UriStr).
Errorf("Giving up after %d fails", job.Fails)
} else {
atomic.AddUint64(&totalRetries, 1)
if err == ErrRateLimit {
@@ -82,24 +62,18 @@ func (w *WorkerContext) step(results chan<- File, job Job) {
w.queueJob(job)
}
if !f.IsDir {
results <- f
}
job.OD.Files = append(job.OD.Files, f)
}
func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) {
if len(job.Uri.Path) == 0 {
return
}
func DoJob(job *Job, f *File) (newJobs []Job, err error) {
if len(job.Uri.Path) == 0 { return }
if job.Uri.Path[len(job.Uri.Path)-1] == '/' {
// Load directory
links, err := GetDir(job, f)
if err != nil {
if !isErrSilent(err) {
logrus.WithError(err).
WithField("url", job.UriStr).
Error("Failed to crawl dir")
}
Error("Failed getting dir")
return nil, err
}
@@ -107,82 +81,60 @@ func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) {
hash := f.HashDir(links)
// Skip symlinked dirs
if w.OD.LoadOrStoreKey(&hash) {
if job.OD.LoadOrStoreKey(&hash) {
return nil, ErrKnown
}
// Sort by path
sort.Slice(links, func(i, j int) bool {
return strings.Compare(links[i].Path, links[j].Path) < 0
})
var newJobCount int
var lastLink string
for _, link := range links {
uriStr := link.String()
// Ignore dupes
if uriStr == lastLink {
continue
}
lastLink = uriStr
// Skip already queued links
//if _, old := job.OD.Scanned.LoadOrStore(link, true); old {
// continue
//}
job.OD.Wait.Add(1)
newJobs = append(newJobs, Job{
OD: job.OD,
Uri: link,
UriStr: uriStr,
UriStr: link.String(),
Fails: 0,
})
newJobCount++
}
if config.Verbose {
logrus.WithFields(logrus.Fields{
"url": job.UriStr,
"files": newJobCount,
"files": len(links),
}).Debug("Listed")
}
} else {
// Load file
err := GetFile(job.Uri, f)
if err != nil {
if !isErrSilent(err) {
logrus.WithError(err).
WithField("url", job.UriStr).
Error("Failed to crawl file")
}
WithField("url", job.Uri.String()).
Error("Failed getting file")
return nil, err
}
atomic.AddUint64(&w.OD.Result.FileCount, 1)
}
return
}
func (w *WorkerContext) queueJob(job Job) {
w.OD.Wait.Add(1)
func (w WorkerContext) queueJob(job Job) {
job.OD.Wait.Add(1)
globalWait.Add(1)
if w.numRateLimits > 0 {
if time.Since(w.lastRateLimit) > 5*time.Second {
if time.Since(w.lastRateLimit) > 5 * time.Second {
w.numRateLimits = 0
} else {
time.Sleep(time.Duration(math.Sqrt(float64(50*w.numRateLimits))) *
time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) *
100 * time.Millisecond)
w.in <- job
}
}
if err := w.Queue.Enqueue(&job); err != nil {
panic(err)
} else {
w.in <- job
}
}
func (w *WorkerContext) finishJob() {
w.OD.Wait.Done()
}
func isErrSilent(err error) bool {
if !config.PrintHTTP {
if _, ok := err.(*HttpError); ok {
return true
}
}
return false
func (w WorkerContext) finishJob(job *Job) {
job.OD.Wait.Done()
globalWait.Done()
}