64 Commits
rip ... v1.1.1

Author SHA1 Message Date
Richard Patel
f90bf94a44 Bump version to v1.1.1 2018-11-27 22:11:57 +01:00
Richard Patel
e82768ff80 Wait time control in config 2018-11-27 22:11:57 +01:00
Richard Patel
b1bf59adef Add The Eye DB to README.md 2018-11-27 17:40:12 +01:00
Richard Patel
a2df2972f4 Bump the upload retry interval up to 30s 2018-11-20 04:13:20 +01:00
Richard Patel
3fc8837dd7 Add output files to .gitignore 2018-11-20 03:51:42 +01:00
Richard Patel
f9a0d6bffe Bump to v1.1.0 2018-11-20 03:46:36 +01:00
Richard Patel
4dbe2aef2b Add job buffer size parameter 2018-11-20 03:42:32 +01:00
Richard Patel
86ec78cae1 Add TCP timeout option 2018-11-20 03:29:10 +01:00
Richard Patel
b846498030 Delete URL queues after crawling 2018-11-20 03:05:43 +01:00
Richard Patel
4f3140a39f Fix queue_count in log 2018-11-20 02:49:03 +01:00
Richard Patel
85d2aac9d4 Performance patch 2018-11-20 02:33:50 +01:00
Richard Patel
b6c0a45900 Job queue disk offloading 2018-11-20 02:03:10 +01:00
Richard Patel
d332f06659 Limit retries to 10 2018-11-18 21:05:26 +01:00
Richard Patel
1625d6c888 Bump to v1.0.2 2018-11-18 18:53:57 +01:00
Richard Patel
03a487f393 Fix crawl loop 2018-11-18 18:45:06 +01:00
Richard Patel
ac8221b109 Retry /task/upload 2018-11-18 18:33:26 +01:00
Richard Patel
8ed2cf3b93 Bump to v1.0.1 2018-11-18 14:49:07 +01:00
Richard Patel
f3620262fc Add log file support 2018-11-18 14:46:52 +01:00
Richard Patel
dc4e4212a0 Add freebsd to release.sh 2018-11-18 14:38:18 +01:00
Richard Patel
6e6a4edd27 Ignore all HTTP errors 2018-11-18 14:25:06 +01:00
Richard Patel
a71157b4d8 Add User-Agent parameter 2018-11-18 14:24:04 +01:00
Richard Patel
6dbec8c789 Add release script 2018-11-18 02:36:22 +01:00
Richard Patel
605f6db5a5 Don't call /task/upload for websites with no results 2018-11-18 01:42:57 +01:00
Richard Patel
d593ba2d0b Bump to 1.0 2018-11-18 00:54:58 +01:00
Richard Patel
6793086c22 Ignore HTTPS errors 2018-11-18 00:37:30 +01:00
Richard Patel
4464f34779 Add recheck and timeout parameters 2018-11-18 00:29:29 +01:00
Richard Patel
339175220d Refactor uploading & chunk size parameter 2018-11-18 00:19:43 +01:00
Richard Patel
1e6687c519 Upload result ignoring errors 2018-11-17 15:04:20 +01:00
Richard Patel
8060556089 Fix: make crawled dir 2018-11-17 13:36:35 +01:00
Richard Patel
73ba848e17 Grammar 2018-11-17 13:35:29 +01:00
Richard Patel
115983f70e Silent HTTP errors 2018-11-17 13:22:46 +01:00
Richard Patel
9210996b4c Fix multiple part file upload 2018-11-17 12:52:24 +01:00
Richard Patel
7b29da9340 Fix file uploads 2018-11-17 12:47:16 +01:00
Richard Patel
24ee6fcba2 Quickfix: Revert FTP give back 2018-11-17 12:43:30 +01:00
Richard Patel
bfb18d62b2 mini fix 2018-11-17 05:27:09 +01:00
Richard Patel
f4054441ab Return FTP tasks 2018-11-17 05:07:52 +01:00
Richard Patel
f8d2bf386d Fix FTP error ignore 2018-11-17 04:57:19 +01:00
Richard Patel
f41198b00c Ignore FTP URLs 2018-11-17 04:50:59 +01:00
Richard Patel
7fdffff58f Update config.yml 2018-11-17 04:19:04 +01:00
Richard Patel
d596882b40 Fix ton of bugs 2018-11-17 04:18:22 +01:00
Richard Patel
0fe97a8058 Update README.md 2018-11-17 01:36:07 +01:00
Richard Patel
718f9d7fbc Rename project 2018-11-17 01:33:15 +01:00
Richard Patel
f1687679ab Unescape results & don't recrawl 404 2018-11-17 01:21:20 +01:00
Richard Patel
145d37f84a Fix wait, add back crawl command 2018-11-17 00:49:09 +01:00
Richard Patel
cc777bcaeb redblackhash: Use bytes.Compare 2018-11-16 21:17:39 +01:00
Simon
1e78cea7e7 Saved path should not contain file name 2018-11-16 13:58:12 -05:00
Richard Patel
3f85cf679b Getting tasks 2018-11-16 04:47:08 +01:00
Richard Patel
3c39f0d621 Random hacks 2018-11-16 03:22:51 +01:00
Richard Patel
50952791c5 Almost done 2018-11-16 03:12:26 +01:00
Richard Patel
30bf98ad34 Fix tests 2018-11-16 03:02:10 +01:00
Richard Patel
ccaf758e90 Remove URL.Opaque 2018-11-16 01:53:16 +01:00
Richard Patel
f668365edb Add tests 2018-11-16 01:51:34 +01:00
Richard Patel
1db8ff43bb Bump version 2018-11-16 00:25:11 +01:00
Richard Patel
82234f949e Less tokenizer allocations 2018-11-16 00:22:40 +01:00
Richard Patel
084b3a5903 Optimizing with hexa :P 2018-11-15 23:51:31 +01:00
Richard Patel
ac0b8d2d0b Blacklist all paths with a query parameter 2018-11-15 23:36:41 +01:00
Richard Patel
ffde1a9e5d Timeout and results saving 2018-11-15 20:14:31 +01:00
Richard Patel
a268c6dbcf Reduce WaitQueue usage 2018-11-12 00:38:22 +01:00
Richard Patel
4c071171eb Exclude dups in dir instead of keeping hashes of links 2018-11-11 23:11:30 +01:00
Richard Patel
9c8174dd8d Fix header parsing 2018-11-11 18:53:17 +01:00
Richard Patel
93272e1da1 Update README.md 2018-11-06 02:41:20 +01:00
Richard Patel
0344a120ff fasturl: Remove path escape 2018-11-06 02:15:09 +01:00
Richard Patel
6e6afd771e fasturl: Remove query 2018-11-06 02:11:22 +01:00
Richard Patel
a8c27b2d21 Hash links 2018-11-06 02:01:53 +01:00
21 changed files with 2123 additions and 953 deletions

5
.gitignore vendored
View File

@@ -1,3 +1,6 @@
/.idea/ /.idea/
.DS_Store .DS_Store
/oddb-go /od-database-crawler
*.log
/queue/
/crawled/

View File

@@ -1,2 +1,9 @@
# oddb Go crawler # od-database Go crawler 🚀
> by terorie 2018 :P > by terorie 2018 :P
* Crawler for [__OD-Database__](https://github.com/simon987/od-database)
* Crawls HTTP open directories (standard Web Server Listings)
* Gets name, path, size and modification time of all files
* Lightweight and fast: __over 9000 requests per second__ on a standard laptop
https://od-db.the-eye.eu/

View File

@@ -1,42 +1,76 @@
package main package main
import ( import (
"bufio"
"fmt" "fmt"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/spf13/viper" "github.com/spf13/viper"
"io"
"os" "os"
"strings"
"time" "time"
) )
var config struct { var config struct {
ServerUrl string ServerUrl string
Token string Token string
ServerTimeout time.Duration
Recheck time.Duration
ChunkSize int64
Retries int Retries int
Workers int Workers int
UserAgent string
Tasks int32 Tasks int32
CrawlStats time.Duration CrawlStats time.Duration
AllocStats time.Duration AllocStats time.Duration
Verbose bool Verbose bool
PrintHTTP bool
JobBufferSize int
} }
const ( const (
ConfServerUrl = "server.url" ConfServerUrl = "server.url"
ConfToken = "server.token" ConfToken = "server.token"
ConfServerTimeout = "server.timeout"
ConfRecheck = "server.recheck"
ConfCooldown = "server.cooldown"
ConfChunkSize = "server.upload_chunk"
ConfUploadRetries = "server.upload_retries"
ConfUploadRetryInterval = "server.upload_retry_interval"
ConfTasks = "crawl.tasks" ConfTasks = "crawl.tasks"
ConfRetries = "crawl.retries" ConfRetries = "crawl.retries"
ConfWorkers = "crawl.connections" ConfWorkers = "crawl.connections"
ConfUserAgent = "crawl.user-agent"
ConfDialTimeout = "crawl.dial_timeout"
ConfTimeout = "crawl.timeout"
ConfJobBufferSize = "crawl.job_buffer"
ConfCrawlStats = "output.crawl_stats" ConfCrawlStats = "output.crawl_stats"
ConfAllocStats = "output.resource_stats" ConfAllocStats = "output.resource_stats"
ConfVerbose = "output.verbose" ConfVerbose = "output.verbose"
ConfPrintHTTP = "output.http"
ConfLogFile = "output.log"
) )
func prepareConfig() { func prepareConfig() {
viper.SetDefault(ConfRetries, 5) viper.SetDefault(ConfRetries, 5)
viper.SetDefault(ConfWorkers, 2) viper.SetDefault(ConfWorkers, 2)
viper.SetDefault(ConfTasks, 3) viper.SetDefault(ConfTasks, 3)
viper.SetDefault(ConfUserAgent, "")
viper.SetDefault(ConfDialTimeout, 10 * time.Second)
viper.SetDefault(ConfTimeout, 60 * time.Second)
viper.SetDefault(ConfJobBufferSize, 5000)
viper.SetDefault(ConfCrawlStats, 3 * time.Second) viper.SetDefault(ConfCrawlStats, 3 * time.Second)
viper.SetDefault(ConfAllocStats, 0) viper.SetDefault(ConfAllocStats, 0)
viper.SetDefault(ConfVerbose, false) viper.SetDefault(ConfVerbose, false)
viper.SetDefault(ConfPrintHTTP, false)
viper.SetDefault(ConfLogFile, "")
viper.SetDefault(ConfRecheck, 3 * time.Second)
viper.SetDefault(ConfCooldown, 30 * time.Second)
viper.SetDefault(ConfChunkSize, "1 MB")
viper.SetDefault(ConfUploadRetries, 10)
viper.SetDefault(ConfUploadRetryInterval, 30 * time.Second)
} }
func readConfig() { func readConfig() {
@@ -49,14 +83,24 @@ func readConfig() {
} }
config.ServerUrl = viper.GetString(ConfServerUrl) config.ServerUrl = viper.GetString(ConfServerUrl)
//if config.ServerUrl == "" { if config.ServerUrl == "" {
// configMissing(ConfServerUrl) configMissing(ConfServerUrl)
//} }
config.ServerUrl = strings.TrimRight(config.ServerUrl, "/")
config.Token = viper.GetString(ConfToken) config.Token = viper.GetString(ConfToken)
//if config.Token == "" { if config.Token == "" {
// configMissing(ConfToken) configMissing(ConfToken)
//} }
config.ServerTimeout = viper.GetDuration(ConfServerTimeout)
config.Recheck = viper.GetDuration(ConfRecheck)
config.ChunkSize = int64(viper.GetSizeInBytes(ConfChunkSize))
if config.ChunkSize < 100 {
configOOB(ConfChunkSize, config.ChunkSize)
}
config.Retries = viper.GetInt(ConfRetries) config.Retries = viper.GetInt(ConfRetries)
if config.Retries < 0 { if config.Retries < 0 {
@@ -73,6 +117,14 @@ func readConfig() {
configOOB(ConfTasks, int(config.Tasks)) configOOB(ConfTasks, int(config.Tasks))
} }
config.UserAgent = viper.GetString(ConfUserAgent)
setDialTimeout(viper.GetDuration(ConfDialTimeout))
setTimeout(viper.GetDuration(ConfTimeout))
config.JobBufferSize = viper.GetInt(ConfJobBufferSize)
config.CrawlStats = viper.GetDuration(ConfCrawlStats) config.CrawlStats = viper.GetDuration(ConfCrawlStats)
config.AllocStats = viper.GetDuration(ConfAllocStats) config.AllocStats = viper.GetDuration(ConfAllocStats)
@@ -81,6 +133,19 @@ func readConfig() {
if config.Verbose { if config.Verbose {
logrus.SetLevel(logrus.DebugLevel) logrus.SetLevel(logrus.DebugLevel)
} }
if filePath := viper.GetString(ConfLogFile); filePath != "" {
f, err := os.OpenFile(filePath, os.O_CREATE | os.O_WRONLY | os.O_APPEND, 0644)
bufWriter := bufio.NewWriter(f)
if err != nil { panic(err) }
exitHooks.Add(func() {
bufWriter.Flush()
f.Close()
})
logrus.SetOutput(io.MultiWriter(os.Stdout, bufWriter))
}
config.PrintHTTP = viper.GetBool(ConfPrintHTTP)
} }
func configMissing(key string) { func configMissing(key string) {
@@ -88,7 +153,7 @@ func configMissing(key string) {
os.Exit(1) os.Exit(1)
} }
func configOOB(key string, v int) { func configOOB(key string, v interface{}) {
fmt.Fprintf(os.Stderr, "config: illegal value %d for %key!\n", v, key) fmt.Fprintf(os.Stderr, "config: illegal value %v for key %s!\n", v, key)
os.Exit(1) os.Exit(1)
} }

View File

@@ -1,26 +1,84 @@
# OD-Database server settings # OD-Database server settings
server: server:
# Connection URL # Connection URL
url: localhost:6969 url: http://od-db.mine.terorie.com/api
# Server auth token # Server auth token
token: token:
# Request timeout
timeout: 60s
# Recheck interval
# The crawler periodically asks the server
# for new jobs. Sets the minimum wait time
# between /task/get requests to the server.
recheck: 1s
# Time to wait after receiving an error
# from the server. Doesn't apply to uploads.
cooldown: 30s
# Upload chunk size
# If the value is too high, the upload fails.
upload_chunk: 1 MB
upload_retries: 10
upload_retry_interval: 30s
# Log output settings # Log output settings
output: output:
# Crawl statistics # Crawl statistics
crawl_stats: 1s crawl_stats: 1s
# CPU/RAM/Job queue stats # CPU/RAM/Job queue stats
resource_stats: 1s resource_stats: 10s
# More output? (Every listed dir) # More output? (Every listed dir)
verbose: false verbose: false
# Print HTTP errors (Super spammy)
http: false
# Log file
# If empty, no log file is created.
log: crawler.log
# Crawler settings # Crawler settings
crawl: crawl:
# Number of sites that can be # Number of sites that can be processed at once
# processed at once tasks: 100
tasks: 3
# Number of connections per site # Number of connections per site
connections: 2 # Please be careful with this setting!
# The crawler fires fast and more than
# ten connections can overwhelm a server.
connections: 4
# How often to retry getting data # How often to retry getting data
# from the site before giving up # from the site before giving up
retries: 5 retries: 5
# Time before discarding a failed connection attempt
dial_timeout: 10s
# Time before discarding a network request
timeout: 30s
# Crawler User-Agent
# If empty, no User-Agent header is sent.
user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"
# Job buffer size (per task)
# Higher values cause fewer disk writes
# but require more memory.
#
# The job queue contains all URLs
# that should be crawled next.
# As it grows very large over time,
# it's kept mainly on disk.
# This sets how many jobs are kept
# in memory.
# A negative value will cause all jobs
# to be stored in memory. (Don't do this)
job_buffer: 5000

265
crawl.go
View File

@@ -2,29 +2,44 @@ package main
import ( import (
"bytes" "bytes"
"fmt" "crypto/tls"
"github.com/sirupsen/logrus" "github.com/terorie/od-database-crawler/ds/redblackhash"
"github.com/terorie/oddb-go/ds/redblackhash" "github.com/terorie/od-database-crawler/fasturl"
"github.com/terorie/oddb-go/fasturl"
"github.com/terorie/oddb-go/runes"
"github.com/terorie/oddb-go/runespath"
"github.com/valyala/fasthttp" "github.com/valyala/fasthttp"
"golang.org/x/crypto/blake2b" "golang.org/x/crypto/blake2b"
"golang.org/x/net/html" "golang.org/x/net/html"
"golang.org/x/net/html/atom" "net"
"path" "path"
"strconv" "strconv"
"strings" "strings"
"time" "time"
) )
var client fasthttp.Client var client = fasthttp.Client {
TLSConfig: &tls.Config{
InsecureSkipVerify: true,
},
}
func setDialTimeout(d time.Duration) {
client.Dial = func(addr string) (net.Conn, error) {
return fasthttp.DialTimeout(addr, d)
}
}
func setTimeout(d time.Duration) {
client.ReadTimeout = d
client.WriteTimeout = d / 2
}
func GetDir(j *Job, f *File) (links []fasturl.URL, err error) { func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
f.IsDir = true f.IsDir = true
f.Name = runespath.Base(j.Uri.Path) f.Name = path.Base(j.Uri.Path)
req := fasthttp.AcquireRequest() req := fasthttp.AcquireRequest()
if config.UserAgent != "" {
req.Header.SetUserAgent(config.UserAgent)
}
req.SetRequestURI(j.UriStr) req.SetRequestURI(j.UriStr)
res := fasthttp.AcquireResponse() res := fasthttp.AcquireResponse()
@@ -34,84 +49,79 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
fasthttp.ReleaseRequest(req) fasthttp.ReleaseRequest(req)
if err != nil { if err != nil {
logrus.Error(err)
return return
} }
err = checkStatusCode(res.StatusCode()) err = checkStatusCode(res.StatusCode())
if err != nil { return } if err != nil {
return
}
body := res.Body() body := res.Body()
doc := html.NewTokenizer(bytes.NewReader(body)) doc := html.NewTokenizer(bytes.NewReader(body))
var linkHref string var linkHref string
var linkTexts []string
for { for {
err = nil
tokenType := doc.Next() tokenType := doc.Next()
token := doc.Token()
if tokenType == html.ErrorToken { if tokenType == html.ErrorToken {
break break
} }
switch tokenType { switch tokenType {
case html.StartTagToken: case html.StartTagToken:
if token.DataAtom == atom.A { name, hasAttr := doc.TagName()
for _, attr := range token.Attr { if len(name) == 1 && name[0] == 'a' {
if attr.Key == "href" { for hasAttr {
linkHref = attr.Val var ks, vs []byte
ks, vs, hasAttr = doc.TagAttr()
if bytes.Equal(ks, []byte("href")) {
// TODO Check escape
linkHref = string(vs)
break break
} }
} }
} }
case html.TextToken:
if linkHref != "" {
linkTexts = append(linkTexts, token.Data)
}
case html.EndTagToken: case html.EndTagToken:
if linkHref != "" && token.DataAtom == atom.A { name, _ := doc.TagName()
if len(name) == 1 && name[0] == 'a' {
// Copy params // Copy params
href := linkHref href := linkHref
linkText := strings.Join(linkTexts, " ")
// Reset params // Reset params
linkHref = "" linkHref = ""
linkTexts = nil
// TODO Optimized decision tree if strings.LastIndexByte(href, '?') != -1 {
for _, entry := range urlBlackList { continue
if href == entry {
goto nextToken
}
} }
for _, entry := range urlPartBlackList {
if strings.Contains(href, entry) { switch href {
goto nextToken case "", " ", ".", "..", "/":
} continue
} }
for _, entry := range fileNameBlackList {
if strings.Contains(linkText, entry) { if strings.Contains(href, "../") {
goto nextToken continue
}
} }
var link fasturl.URL var link fasturl.URL
err = j.Uri.ParseRel(&link, []rune(href)) err = j.Uri.ParseRel(&link, href)
if err != nil { continue } if err != nil {
continue
}
if !runes.Equals(link.Scheme, j.Uri.Scheme) || if link.Scheme != j.Uri.Scheme ||
!runes.Equals(link.Host, j.Uri.Host) || link.Host != j.Uri.Host ||
runes.Equals(link.Path, j.Uri.Path) || link.Path == j.Uri.Path ||
!runes.HasPrefix(link.Path, j.Uri.Path) { !strings.HasPrefix(link.Path, j.Uri.Path) {
continue continue
} }
links = append(links, link) links = append(links, link)
} }
} }
nextToken:
} }
return return
@@ -119,12 +129,15 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
func GetFile(u fasturl.URL, f *File) (err error) { func GetFile(u fasturl.URL, f *File) (err error) {
f.IsDir = false f.IsDir = false
u.Path = []rune(path.Clean(string(u.Path))) u.Path = path.Clean(u.Path)
f.Name = runespath.Base(u.Path) f.Name = path.Base(u.Path)
f.Path = runes.TrimRune(u.Path, '/') f.Path = strings.Trim(path.Dir(u.Path), "/")
req := fasthttp.AcquireRequest() req := fasthttp.AcquireRequest()
req.Header.SetMethod("HEAD") req.Header.SetMethod("HEAD")
if config.UserAgent != "" {
req.Header.SetUserAgent(config.UserAgent)
}
req.SetRequestURI(u.String()) req.SetRequestURI(u.String())
res := fasthttp.AcquireResponse() res := fasthttp.AcquireResponse()
@@ -134,87 +147,69 @@ func GetFile(u fasturl.URL, f *File) (err error) {
err = client.Do(req, res) err = client.Do(req, res)
fasthttp.ReleaseRequest(req) fasthttp.ReleaseRequest(req)
if err != nil { return } if err != nil {
return
}
err = checkStatusCode(res.StatusCode()) err = checkStatusCode(res.StatusCode())
if err != nil { return } if err != nil {
return
}
// TODO Inefficient af f.applyContentLength(string(res.Header.Peek("content-length")))
header := res.Header.Header() f.applyLastModified(string(res.Header.Peek("last-modified")))
f.ParseHeader(header)
return nil return nil
} }
func (f *File) HashDir(links []fasturl.URL) (o redblackhash.Key) { func (f *File) HashDir(links []fasturl.URL) (o redblackhash.Key) {
h, _ := blake2b.New256(nil) h, _ := blake2b.New256(nil)
h.Write([]byte(string(f.Name))) h.Write([]byte(f.Name))
for _, link := range links { for _, link := range links {
fileName := runespath.Base(link.Path) fileName := path.Base(link.Path)
h.Write([]byte(string(fileName))) h.Write([]byte(fileName))
} }
sum := h.Sum(nil) sum := h.Sum(nil)
copy(o[:redblackhash.KeySize], sum) copy(o[:redblackhash.KeySize], sum)
return return
} }
func (f *File) ParseHeader(h []byte) { func (f *File) applyContentLength(v string) {
var k1, k2 int if v == "" {
var v1, v2 int return
// Simple finite state machine
state := 0
for i, b := range h {
switch state {
case 0:
if b == byte(':') {
state = 1
k2 = i
}
case 1:
state = 2
case 2:
state = 3
v1 = i
case 3:
if b == byte('\r') {
state = 4
}
case 4:
state = 0
v2 = i - 1
key := string(h[k1:k2])
val := string(h[v1:v2])
k1 = i
f.applyHeader(key, val)
}
} }
size, err := strconv.ParseInt(v, 10, 64)
if err != nil {
return
}
if size < 0 {
return
}
f.Size = size
} }
func (f *File) applyHeader(k, v string) { // TODO Cleanup
switch k { func (f *File) applyLastModified(v string) {
case "content-length": if v == "" {
size, err := strconv.ParseInt(v, 10, 64) return
if err != nil { break } }
if size < 0 { break } var t time.Time
f.Size = size var err error
t, err = time.Parse(time.RFC1123, v)
case "last-modified": if err == nil {
var err error f.MTime = t.Unix()
f.MTime, err = time.Parse(time.RFC1123, v) return
if err == nil { break } }
f.MTime, err = time.Parse(time.RFC850, v) t, err = time.Parse(time.RFC850, v)
if err == nil { break } if err == nil {
// TODO Parse asctime f.MTime = t.Unix()
f.MTime, err = time.Parse("2006-01-02", v[:10]) return
if err == nil { break } }
// TODO Parse asctime
t, err = time.Parse("2006-01-02", v[:10])
if err == nil {
f.MTime = t.Unix()
return
} }
} }
@@ -222,53 +217,7 @@ func checkStatusCode(status int) error {
switch status { switch status {
case fasthttp.StatusOK: case fasthttp.StatusOK:
return nil return nil
case fasthttp.StatusTooManyRequests:
return ErrRateLimit
case fasthttp.StatusForbidden,
fasthttp.StatusUnauthorized:
return ErrForbidden
default: default:
return fmt.Errorf("got HTTP status %d", status) return &HttpError{status}
} }
} }
var urlBlackList = [...]string {
"",
" ",
".",
"..",
"/",
}
var urlPartBlackList = [...]string {
"?C=N&O=D",
"?C=M&O=A",
"?C=S&O=A",
"?C=D&O=A",
"?C=N;O=D",
"?C=M;O=A",
"?C=M&O=D",
"?C=S;O=A",
"?C=S&O=D",
"?C=D;O=A",
"?MA",
"?SA",
"?DA",
"?ND",
"?C=N&O=A",
"?C=N&O=A",
"?M=A",
"?N=D",
"?S=A",
"?D=A",
}
var fileNameBlackList = [...]string {
"Parent Directory",
" Parent Directory",
"../",
}

View File

@@ -14,7 +14,9 @@
package redblackhash package redblackhash
import ( import (
"bytes"
"fmt" "fmt"
"sync"
) )
const ( const (
@@ -27,6 +29,7 @@ type Key [KeySize]byte
// Tree holds elements of the red-black tree // Tree holds elements of the red-black tree
type Tree struct { type Tree struct {
sync.Mutex
Root *Node Root *Node
size int size int
} }
@@ -41,42 +44,7 @@ type Node struct {
} }
func (k *Key) Compare(o *Key) int { func (k *Key) Compare(o *Key) int {
// TODO Assembly return bytes.Compare(k[:], o[:])
/*for i := 0; i < KeySize / 8; i++ {
a := uint64(k[i+0] ) |
uint64(k[i+1] >> 8) |
uint64(k[i+2] >> 16) |
uint64(k[i+3] >> 24) |
uint64(k[i+4] >> 32) |
uint64(k[i+5] >> 40) |
uint64(k[i+6] >> 48) |
uint64(k[i+7] >> 56)
b := uint64(o[i+0] ) |
uint64(o[i+1] >> 8) |
uint64(o[i+2] >> 16) |
uint64(o[i+3] >> 24) |
uint64(o[i+4] >> 32) |
uint64(o[i+5] >> 40) |
uint64(o[i+6] >> 48) |
uint64(o[i+7] >> 56)
switch {
case a < b:
return -1
case a > b:
return 1
}
}*/
for i := 0; i < KeySize; i++ {
switch {
case k[i] < o[i]:
return -1
case k[i] > o[i]:
return 1
}
}
return 0
} }
// Put inserts node into the tree. // Put inserts node into the tree.

View File

@@ -1,8 +1,17 @@
package main package main
import "errors" import (
"errors"
"fmt"
)
var ErrRateLimit = errors.New("too many requests") var ErrRateLimit = errors.New("too many requests")
var ErrForbidden = errors.New("access denied")
var ErrKnown = errors.New("already crawled") var ErrKnown = errors.New("already crawled")
type HttpError struct {
code int
}
func (e HttpError) Error() string {
return fmt.Sprintf("http status %d", e.code)
}

View File

@@ -15,19 +15,34 @@ package fasturl
import ( import (
"errors" "errors"
"fmt" "fmt"
"github.com/terorie/oddb-go/runes"
"strconv" "strconv"
"strings" "strings"
) )
type Scheme int
const (
SchemeInvalid = iota
SchemeHTTP
SchemeHTTPS
SchemeCount
)
var Schemes = [SchemeCount]string {
"",
"http",
"https",
}
var ErrUnknownScheme = errors.New("unknown protocol scheme")
// Error reports an error and the operation and URL that caused it. // Error reports an error and the operation and URL that caused it.
type Error struct { type Error struct {
Op string Op string
URL []rune URL string
Err error Err error
} }
func (e *Error) Error() string { return e.Op + " " + string(e.URL) + ": " + e.Err.Error() } func (e *Error) Error() string { return e.Op + " " + e.URL + ": " + e.Err.Error() }
type timeout interface { type timeout interface {
Timeout() bool Timeout() bool
@@ -100,7 +115,7 @@ func (e InvalidHostError) Error() string {
// //
// Please be informed that for now shouldEscape does not check all // Please be informed that for now shouldEscape does not check all
// reserved characters correctly. See golang.org/issue/5684. // reserved characters correctly. See golang.org/issue/5684.
func shouldEscape(c rune, mode encoding) bool { func shouldEscape(c byte, mode encoding) bool {
// §2.3 Unreserved characters (alphanum) // §2.3 Unreserved characters (alphanum)
if 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' { if 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' {
return false return false
@@ -177,29 +192,9 @@ func shouldEscape(c rune, mode encoding) bool {
return true return true
} }
// QueryUnescape does the inverse transformation of QueryEscape,
// converting each 3-byte encoded substring of the form "%AB" into the
// hex-decoded byte 0xAB.
// It returns an error if any % is not followed by two hexadecimal
// digits.
func QueryUnescape(s []rune) ([]rune, error) {
return unescape(s, encodeQueryComponent)
}
// PathUnescape does the inverse transformation of PathEscape,
// converting each 3-byte encoded substring of the form "%AB" into the
// hex-decoded byte 0xAB. It returns an error if any % is not followed
// by two hexadecimal digits.
//
// PathUnescape is identical to QueryUnescape except that it does not
// unescape '+' to ' ' (space).
func PathUnescape(s []rune) ([]rune, error) {
return unescape(s, encodePathSegment)
}
// unescape unescapes a string; the mode specifies // unescape unescapes a string; the mode specifies
// which section of the URL string is being unescaped. // which section of the URL string is being unescaped.
func unescape(s []rune, mode encoding) ([]rune, error) { func unescape(s string, mode encoding) (string, error) {
// Count %, check that they're well-formed. // Count %, check that they're well-formed.
n := 0 n := 0
hasPlus := false hasPlus := false
@@ -207,12 +202,12 @@ func unescape(s []rune, mode encoding) ([]rune, error) {
switch s[i] { switch s[i] {
case '%': case '%':
n++ n++
if i+2 >= len(s) || !ishex(byte(s[i+1])) || !ishex(byte(s[i+2])) { if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
s = s[i:] s = s[i:]
if len(s) > 3 { if len(s) > 3 {
s = s[:3] s = s[:3]
} }
return nil, EscapeError(s) return "", EscapeError(s)
} }
// Per https://tools.ietf.org/html/rfc3986#page-21 // Per https://tools.ietf.org/html/rfc3986#page-21
// in the host component %-encoding can only be used // in the host component %-encoding can only be used
@@ -220,8 +215,8 @@ func unescape(s []rune, mode encoding) ([]rune, error) {
// But https://tools.ietf.org/html/rfc6874#section-2 // But https://tools.ietf.org/html/rfc6874#section-2
// introduces %25 being allowed to escape a percent sign // introduces %25 being allowed to escape a percent sign
// in IPv6 scoped-address literals. Yay. // in IPv6 scoped-address literals. Yay.
if mode == encodeHost && unhex(byte(s[i+1])) < 8 && !runes.Equals(s[i:i+3], []rune("%25")) { if mode == encodeHost && unhex(s[i+1]) < 8 && s[i:i+3] != "%25" {
return nil, EscapeError(s[i : i+3]) return "", EscapeError(s[i : i+3])
} }
if mode == encodeZone { if mode == encodeZone {
// RFC 6874 says basically "anything goes" for zone identifiers // RFC 6874 says basically "anything goes" for zone identifiers
@@ -231,9 +226,9 @@ func unescape(s []rune, mode encoding) ([]rune, error) {
// That is, you can use escaping in the zone identifier but not // That is, you can use escaping in the zone identifier but not
// to introduce bytes you couldn't just write directly. // to introduce bytes you couldn't just write directly.
// But Windows puts spaces here! Yay. // But Windows puts spaces here! Yay.
v := unhex(byte(s[i+1]))<<4 | unhex(byte(s[i+2])) v := unhex(s[i+1])<<4 | unhex(s[i+2])
if !runes.Equals(s[i:i+3], []rune("%25")) && v != ' ' && shouldEscape(rune(v), encodeHost) { if s[i:i+3] != "%25" && v != ' ' && shouldEscape(v, encodeHost) {
return nil, EscapeError(s[i : i+3]) return "", EscapeError(s[i : i+3])
} }
} }
i += 3 i += 3
@@ -242,7 +237,7 @@ func unescape(s []rune, mode encoding) ([]rune, error) {
i++ i++
default: default:
if (mode == encodeHost || mode == encodeZone) && s[i] < 0x80 && shouldEscape(s[i], mode) { if (mode == encodeHost || mode == encodeZone) && s[i] < 0x80 && shouldEscape(s[i], mode) {
return nil, InvalidHostError(s[i : i+1]) return "", InvalidHostError(s[i : i+1])
} }
i++ i++
} }
@@ -257,7 +252,7 @@ func unescape(s []rune, mode encoding) ([]rune, error) {
for i := 0; i < len(s); { for i := 0; i < len(s); {
switch s[i] { switch s[i] {
case '%': case '%':
t[j] = unhex(byte(s[i+1]))<<4 | unhex(byte(s[i+2])) t[j] = unhex(s[i+1])<<4 | unhex(s[i+2])
j++ j++
i += 3 i += 3
case '+': case '+':
@@ -269,27 +264,15 @@ func unescape(s []rune, mode encoding) ([]rune, error) {
j++ j++
i++ i++
default: default:
t[j] = byte(s[i]) t[j] = s[i]
j++ j++
i++ i++
} }
} }
return []rune(string(t)), nil return string(t), nil
} }
// QueryEscape escapes the string so it can be safely placed func escape(s string, mode encoding) string {
// inside a URL query.
func QueryEscape(s []rune) []rune {
return escape(s, encodeQueryComponent)
}
// PathEscape escapes the string so it can be safely placed
// inside a URL path segment.
func PathEscape(s []rune) []rune {
return escape(s, encodePathSegment)
}
func escape(s []rune, mode encoding) []rune {
spaceCount, hexCount := 0, 0 spaceCount, hexCount := 0, 0
for i := 0; i < len(s); i++ { for i := 0; i < len(s); i++ {
c := s[i] c := s[i]
@@ -319,11 +302,11 @@ func escape(s []rune, mode encoding) []rune {
t[j+2] = "0123456789ABCDEF"[c&15] t[j+2] = "0123456789ABCDEF"[c&15]
j += 3 j += 3
default: default:
t[j] = byte(s[i]) t[j] = s[i]
j++ j++
} }
} }
return []rune(string(t)) return string(t)
} }
// A URL represents a parsed URL (technically, a URI reference). // A URL represents a parsed URL (technically, a URI reference).
@@ -344,19 +327,15 @@ func escape(s []rune, mode encoding) []rune {
// and URL's String method uses RawPath if it is a valid encoding of Path, // and URL's String method uses RawPath if it is a valid encoding of Path,
// by calling the EscapedPath method. // by calling the EscapedPath method.
type URL struct { type URL struct {
Scheme []rune Scheme Scheme
Opaque []rune // encoded opaque data Host string // host or host:port
Host []rune // host or host:port Path string // path (relative paths may omit leading slash)
Path []rune // path (relative paths may omit leading slash)
RawPath []rune // encoded path hint (see EscapedPath method)
ForceQuery bool // append a query ('?') even if RawQuery is empty
RawQuery []rune // encoded query values, without '?'
} }
// Maybe rawurl is of the form scheme:path. // Maybe rawurl is of the form scheme:path.
// (Scheme must be [a-zA-Z][a-zA-Z0-9+-.]*) // (Scheme must be [a-zA-Z][a-zA-Z0-9+-.]*)
// If so, return scheme, path; else return "", rawurl. // If so, return scheme, path; else return "", rawurl.
func getscheme(rawurl []rune) (scheme []rune, path []rune, err error) { func getscheme(rawurl string) (scheme Scheme, path string, err error) {
for i := 0; i < len(rawurl); i++ { for i := 0; i < len(rawurl); i++ {
c := rawurl[i] c := rawurl[i]
switch { switch {
@@ -364,34 +343,42 @@ func getscheme(rawurl []rune) (scheme []rune, path []rune, err error) {
// do nothing // do nothing
case '0' <= c && c <= '9' || c == '+' || c == '-' || c == '.': case '0' <= c && c <= '9' || c == '+' || c == '-' || c == '.':
if i == 0 { if i == 0 {
return nil, rawurl, nil return SchemeInvalid, rawurl, nil
} }
case c == ':': case c == ':':
if i == 0 { if i == 0 {
return nil, nil, errors.New("missing protocol scheme") return SchemeInvalid, "", errors.New("missing protocol scheme")
} }
scheme = rawurl[:i] switch rawurl[:i] {
case "http":
scheme = SchemeHTTP
case "https":
scheme = SchemeHTTPS
default:
return SchemeInvalid, "", ErrUnknownScheme
}
path = rawurl[i+1:] path = rawurl[i+1:]
return return
default: default:
// we have encountered an invalid character, // we have encountered an invalid character,
// so there is no valid scheme // so there is no valid scheme
return nil, rawurl, nil return SchemeInvalid, rawurl, nil
} }
} }
return nil, rawurl, nil return SchemeInvalid, rawurl, nil
} }
// Maybe s is of the form t c u. // Maybe s is of the form t c u.
// If so, return t, c u (or t, u if cutc == true). // If so, return t, c u (or t, u if cutc == true).
// If not, return s, "". // If not, return s, "".
func split(s []rune, c rune, cutc bool) ([]rune, []rune) { func split(s string, c string, cutc bool) (string, string) {
i := strings.Index(string(s), string(c)) // TODO Optimize i := strings.Index(s, c)
if i < 0 { if i < 0 {
return s, nil return s, ""
} }
if cutc { if cutc {
return s[:i], s[i+1:] return s[:i], s[i+len(c):]
} }
return s[:i], s[i:] return s[:i], s[i:]
} }
@@ -402,14 +389,14 @@ func split(s []rune, c rune, cutc bool) ([]rune, []rune) {
// (starting with a scheme). Trying to parse a hostname and path // (starting with a scheme). Trying to parse a hostname and path
// without a scheme is invalid but may not necessarily return an // without a scheme is invalid but may not necessarily return an
// error, due to parsing ambiguities. // error, due to parsing ambiguities.
func (u *URL) Parse(rawurl []rune) error { func (u *URL) Parse(rawurl string) error {
// Cut off #frag // Cut off #frag
s, frag := split(rawurl, '#', true) s, frag := split(rawurl, "#", true)
err := u.parse(s, false) err := u.parse(s, false)
if err != nil { if err != nil {
return &Error{"parse", s, err} return &Error{"parse", s, err}
} }
if len(frag) == 0 { if frag == "" {
return nil return nil
} }
return nil return nil
@@ -420,7 +407,7 @@ func (u *URL) Parse(rawurl []rune) error {
// only as an absolute URI or an absolute path. // only as an absolute URI or an absolute path.
// The string rawurl is assumed not to have a #fragment suffix. // The string rawurl is assumed not to have a #fragment suffix.
// (Web browsers strip #fragment before sending the URL to a web server.) // (Web browsers strip #fragment before sending the URL to a web server.)
func (u *URL) ParseRequestURI(rawurl []rune) error { func (u *URL) ParseRequestURI(rawurl string) error {
err := u.parse(rawurl, true) err := u.parse(rawurl, true)
if err != nil { if err != nil {
return &Error{"parse", rawurl, err} return &Error{"parse", rawurl, err}
@@ -432,16 +419,16 @@ func (u *URL) ParseRequestURI(rawurl []rune) error {
// viaRequest is true, the URL is assumed to have arrived via an HTTP request, // viaRequest is true, the URL is assumed to have arrived via an HTTP request,
// in which case only absolute URLs or path-absolute relative URLs are allowed. // in which case only absolute URLs or path-absolute relative URLs are allowed.
// If viaRequest is false, all forms of relative URLs are allowed. // If viaRequest is false, all forms of relative URLs are allowed.
func (u *URL) parse(rawurl []rune, viaRequest bool) error { func (u *URL) parse(rawurl string, viaRequest bool) error {
var rest []rune var rest string
var err error var err error
if len(rawurl) == 0 && viaRequest { if rawurl == "" && viaRequest {
return errors.New("empty url") return errors.New("empty url")
} }
if runes.Equals(rawurl, []rune("*")) { if rawurl == "*" {
u.Path = []rune("*") u.Path = "*"
return nil return nil
} }
@@ -451,17 +438,15 @@ func (u *URL) parse(rawurl []rune, viaRequest bool) error {
return err return err
} }
if runes.HasSuffix(rest, []rune("?")) && runes.Count(rest, '?') == 1 { if strings.HasSuffix(rest, "?") && strings.Count(rest, "?") == 1 {
u.ForceQuery = true
rest = rest[:len(rest)-1] rest = rest[:len(rest)-1]
} else { } else {
rest, u.RawQuery = split(rest, '?', true) rest, _ = split(rest, "?", true)
} }
if !runes.HasPrefix(rest, []rune("/")) { if !strings.HasPrefix(rest, "/") {
if len(u.Scheme) != 0 { if u.Scheme != SchemeInvalid {
// We consider rootless paths per RFC 3986 as opaque. // We consider rootless paths per RFC 3986 as opaque.
u.Opaque = rest
return nil return nil
} }
if viaRequest { if viaRequest {
@@ -474,65 +459,59 @@ func (u *URL) parse(rawurl []rune, viaRequest bool) error {
// RFC 3986, §3.3: // RFC 3986, §3.3:
// In addition, a URI reference (Section 4.1) may be a relative-path reference, // In addition, a URI reference (Section 4.1) may be a relative-path reference,
// in which case the first path segment cannot contain a colon (":") character. // in which case the first path segment cannot contain a colon (":") character.
colon := runes.IndexRune(rest, ':') colon := strings.Index(rest, ":")
slash := runes.IndexRune(rest, '/') slash := strings.Index(rest, "/")
if colon >= 0 && (slash < 0 || colon < slash) { if colon >= 0 && (slash < 0 || colon < slash) {
// First path segment has colon. Not allowed in relative URL. // First path segment has colon. Not allowed in relative URL.
return errors.New("first path segment in URL cannot contain colon") return errors.New("first path segment in URL cannot contain colon")
} }
} }
if (len(u.Scheme) != 0 || !viaRequest && !runes.HasPrefix(rest, []rune("///"))) && runes.HasPrefix(rest, []rune("//")) { if (u.Scheme != SchemeInvalid || !viaRequest && !strings.HasPrefix(rest, "///")) && strings.HasPrefix(rest, "//") {
var authority []rune var authority string
authority, rest = split(rest[2:], '/', false) authority, rest = split(rest[2:], "/", false)
u.Host, err = parseAuthority(authority) u.Host, err = parseAuthority(authority)
if err != nil { if err != nil {
return err return err
} }
} }
// Set Path and, optionally, RawPath. u.Path = rest
// RawPath is a hint of the encoding of Path. We don't want to set it if
// the default escaping of Path is equivalent, to help make sure that people
// don't rely on it in general.
if err := u.setPath(rest); err != nil {
return err
}
return nil return nil
} }
func parseAuthority(authority []rune) (host []rune, err error) { func parseAuthority(authority string) (host string, err error) {
i := runes.LastIndexRune(authority, '@') i := strings.LastIndex(authority, "@")
if i < 0 { if i < 0 {
host, err = parseHost(authority) host, err = parseHost(authority)
} else { } else {
host, err = parseHost(authority[i+1:]) host, err = parseHost(authority[i+1:])
} }
if err != nil { if err != nil {
return nil, err return "", err
} }
if i < 0 { if i < 0 {
return host, nil return host, nil
} }
userinfo := authority[:i] userinfo := authority[:i]
if !validUserinfo(userinfo) { if !validUserinfo(userinfo) {
return nil, errors.New("fasturl: invalid userinfo") return "", errors.New("fasturl: invalid userinfo")
} }
return host, nil return host, nil
} }
// parseHost parses host as an authority without user // parseHost parses host as an authority without user
// information. That is, as host[:port]. // information. That is, as host[:port].
func parseHost(host []rune) ([]rune, error) { func parseHost(host string) (string, error) {
if runes.HasPrefix(host, []rune("[")) { if strings.HasPrefix(host, "[") {
// Parse an IP-Literal in RFC 3986 and RFC 6874. // Parse an IP-Literal in RFC 3986 and RFC 6874.
// E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80". // E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80".
i := runes.LastIndexRune(host, ']') i := strings.LastIndex(host, "]")
if i < 0 { if i < 0 {
return nil, errors.New("missing ']' in host") return "", errors.New("missing ']' in host")
} }
colonPort := host[i+1:] colonPort := host[i+1:]
if !validOptionalPort(colonPort) { if !validOptionalPort(colonPort) {
return nil, fmt.Errorf("invalid port %q after host", colonPort) return "", fmt.Errorf("invalid port %q after host", colonPort)
} }
// RFC 6874 defines that %25 (%-encoded percent) introduces // RFC 6874 defines that %25 (%-encoded percent) introduces
@@ -541,106 +520,35 @@ func parseHost(host []rune) ([]rune, error) {
// can only %-encode non-ASCII bytes. // can only %-encode non-ASCII bytes.
// We do impose some restrictions on the zone, to avoid stupidity // We do impose some restrictions on the zone, to avoid stupidity
// like newlines. // like newlines.
zone := strings.Index(string(host[:i]), "%25") zone := strings.Index(host[:i], "%25")
if zone >= 0 { if zone >= 0 {
host1, err := unescape(host[:zone], encodeHost) host1, err := unescape(host[:zone], encodeHost)
if err != nil { if err != nil {
return nil, err return "", err
} }
host2, err := unescape(host[zone:i], encodeZone) host2, err := unescape(host[zone:i], encodeZone)
if err != nil { if err != nil {
return nil, err return "", err
} }
host3, err := unescape(host[i:], encodeHost) host3, err := unescape(host[i:], encodeHost)
if err != nil { if err != nil {
return nil, err return "", err
} }
// TODO Optimize return host1 + host2 + host3, nil
return runes.Create(host1, host2, host3), nil
} }
} }
var err error var err error
if host, err = unescape(host, encodeHost); err != nil { if host, err = unescape(host, encodeHost); err != nil {
return nil, err return "", err
} }
return host, nil return host, nil
} }
// setPath stores the escaped path p on the URL.
// Path receives the unescaped form of p. RawPath is kept only when p is not
// what escape would produce for that unescaped path, so that RawPath is set
// exclusively for non-default encodings:
//   - setPath("/foo/bar") will set Path="/foo/bar" and RawPath=""
//   - setPath("/foo%2fbar") will set Path="/foo/bar" and RawPath="/foo%2fbar"
//
// The only error returned is the one reported by unescape for p.
func (u *URL) setPath(p []rune) error {
	decoded, err := unescape(p, encodePath)
	if err != nil {
		return err
	}
	u.Path = decoded

	// Keep the raw form only if re-escaping does not reproduce it.
	canonical := escape(decoded, encodePath)
	if runes.Equals(p, canonical) {
		u.RawPath = nil
		return nil
	}
	u.RawPath = p
	return nil
}
// EscapedPath returns the escaped representation of u.Path.
// A path can usually be escaped in more than one way. When u.RawPath is set,
// passes validEncodedPath and unescapes back to u.Path, it is returned as is;
// otherwise the escaped form is computed from u.Path.
// The path "*" is returned without escaping.
// Runes and RequestURI build their results from EscapedPath; callers should
// generally prefer it over reading u.RawPath directly.
func (u *URL) EscapedPath() []rune {
	raw := u.RawPath
	if len(raw) != 0 && validEncodedPath(raw) {
		decoded, err := unescape(raw, encodePath)
		if err == nil && runes.Equals(decoded, u.Path) {
			return raw
		}
	}
	if !runes.Equals(u.Path, []rune("*")) {
		return escape(u.Path, encodePath)
	}
	return []rune("*") // don't escape (Issue 11202)
}
// validEncodedPath reports whether s is a valid encoded path, i.e. whether
// none of its runes would have to be escaped during path encoding.
func validEncodedPath(s []rune) bool {
	for _, r := range s {
		// RFC 3986, Appendix A.
		// pchar = unreserved / pct-encoded / sub-delims / ":" / "@".
		// shouldEscape is not quite compliant with the RFC, so the
		// sub-delims are accepted here and everything else is left to
		// shouldEscape.
		switch r {
		case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '@':
			continue // sub-delims, ':' and '@' are fine
		case '[', ']':
			continue // not specified in RFC 3986 but left alone by modern browsers
		case '%':
			continue // percent encoded, will decode
		}
		if shouldEscape(r, encodePath) {
			return false
		}
	}
	return true
}
// validOptionalPort reports whether port is either an empty string // validOptionalPort reports whether port is either an empty string
// or matches /^:\d*$/ // or matches /^:\d*$/
func validOptionalPort(port []rune) bool { func validOptionalPort(port string) bool {
if len(port) == 0 { if port == "" {
return true return true
} }
if port[0] != ':' { if port[0] != ':' {
@@ -654,46 +562,6 @@ func validOptionalPort(port []rune) bool {
return true return true
} }
// Runes reassembles the URL into its textual form as a rune slice.
// The scheme (followed by ':') comes first. An opaque URL then appends
// u.Opaque; otherwise "//", the escaped host and the escaped path are
// appended as applicable. A '?' plus u.RawQuery is added when ForceQuery is
// set or RawQuery is non-empty.
func (u *URL) Runes() (buf []rune) {
	hasScheme := len(u.Scheme) != 0
	hasHost := len(u.Host) != 0

	if hasScheme {
		buf = append(append(buf, u.Scheme...), ':')
	}

	switch {
	case len(u.Opaque) != 0:
		buf = append(buf, u.Opaque...)
	default:
		if hasScheme || hasHost {
			if hasHost || len(u.Path) != 0 {
				buf = append(buf, '/', '/')
			}
			if hasHost {
				buf = append(buf, escape(u.Host, encodeHost)...)
			}
		}
		escaped := u.EscapedPath()
		// A rootless path following a host needs a separating slash.
		if hasHost && len(escaped) != 0 && escaped[0] != '/' {
			buf = append(buf, '/')
		}
		if len(buf) == 0 {
			// RFC 3986 §4.2
			// A path segment that contains a colon character (e.g., "this:that")
			// cannot be used as the first segment of a relative-path reference, as
			// it would be mistaken for a scheme name. Such a segment must be
			// preceded by a dot-segment (e.g., "./this:that") to make a relative-
			// path reference.
			colon := runes.IndexRune(escaped, ':')
			if colon > -1 && runes.IndexRune(escaped[:colon], '/') == -1 {
				buf = append(buf, '.', '/')
			}
		}
		buf = append(buf, escaped...)
	}

	if u.ForceQuery || len(u.RawQuery) != 0 {
		buf = append(append(buf, '?'), u.RawQuery...)
	}
	return
}
// String reassembles the URL into a valid URL string. // String reassembles the URL into a valid URL string.
// The general form of the result is one of: // The general form of the result is one of:
// //
@@ -715,28 +583,126 @@ func (u *URL) Runes() (buf []rune) {
// - if u.RawQuery is empty, ?query is omitted. // - if u.RawQuery is empty, ?query is omitted.
// - if u.Fragment is empty, #fragment is omitted. // - if u.Fragment is empty, #fragment is omitted.
func (u *URL) String() string { func (u *URL) String() string {
return string(u.Runes()) var buf strings.Builder
if u.Scheme != SchemeInvalid {
buf.WriteString(Schemes[u.Scheme])
buf.WriteByte(':')
}
if u.Scheme != SchemeInvalid || u.Host != "" {
if u.Host != "" || u.Path != "" {
buf.WriteString("//")
}
if h := u.Host; h != "" {
buf.WriteString(escape(h, encodeHost))
}
}
path := u.Path
if path != "" && path[0] != '/' && u.Host != "" {
buf.WriteByte('/')
}
if buf.Len() == 0 {
// RFC 3986 §4.2
// A path segment that contains a colon character (e.g., "this:that")
// cannot be used as the first segment of a relative-path reference, as
// it would be mistaken for a scheme name. Such a segment must be
// preceded by a dot-segment (e.g., "./this:that") to make a relative-
// path reference.
if i := strings.IndexByte(path, ':'); i > -1 && strings.IndexByte(path[:i], '/') == -1 {
buf.WriteString("./")
}
}
buf.WriteString(path)
return buf.String()
}
// isRunesDot reports whether r consists of exactly one rune, '.'.
func isRunesDot(r []rune) bool {
	if len(r) != 1 {
		return false
	}
	return r[0] == '.'
}
func isRunesDoubleDot(r []rune) bool {
return len(r) == 2 && r[0] == '.' && r[1] == '.'
} }
// resolvePath applies special path segments from refs and applies // resolvePath applies special path segments from refs and applies
// them to base, per RFC 3986. // them to base, per RFC 3986.
func resolvePath(base, ref []rune) []rune { func resolvePath(base, ref string) string {
var full []rune var full string
if len(ref) == 0 { if ref == "" {
full = base full = base
} else if ref[0] != '/' { } else if ref[0] != '/' {
// TODO Optimize i := strings.LastIndex(base, "/")
i := strings.LastIndex(string(base), "/") full = base[:i+1] + ref
full = runes.Create(base[:i+1], ref)
} else { } else {
full = ref full = ref
} }
if len(full) == 0 { if full == "" {
return nil return ""
} else if full == "/" {
return "/"
} }
var dst []string
// TODO Optimize dst := make([]rune, len(full))
src := strings.Split(string(full), "/") dst = dst[0:0]
start := 0
rs := []rune(full)
if len(rs) != 0 && rs[0] == '/' {
rs = rs[1:]
}
var stack []int
stack = append(stack, 0)
for i, c := range rs {
if i == len(rs) - 1 {
closingSlash := false
part := rs[start:]
if len(part) == 0 {
dst = append(dst, '/')
} else if part[len(part)-1] == '/' {
part = part[:len(part)-1]
closingSlash = true
}
switch {
case isRunesDot(part):
dst = append(dst, '/')
case isRunesDoubleDot(part):
// Cut to the last slash
start = i+1
dst = dst[:stack[len(stack)-1]]
if len(stack) != 1 {
stack = stack[:len(stack)-1]
}
dst = append(dst, '/')
default:
dst = append(dst, '/')
dst = append(dst, part...)
}
if closingSlash && len(dst) != 0 && dst[len(dst)-1] != '/' {
dst = append(dst, '/')
}
} else if c == '/' {
part := rs[start:i]
switch {
case isRunesDot(part):
start = i+1
case isRunesDoubleDot(part):
// Cut to the last slash
start = i+1
dst = dst[:stack[len(stack)-1]]
if len(stack) != 1 {
stack = stack[:len(stack)-1]
}
default:
start = i+1
stack = append(stack, len(dst))
dst = append(dst, '/')
dst = append(dst, part...)
}
}
}
return string(dst)
/*var dst []string
src := strings.Split(full, "/")
for _, elem := range src { for _, elem := range src {
switch elem { switch elem {
case ".": case ".":
@@ -751,22 +717,21 @@ func resolvePath(base, ref []rune) []rune {
} }
if last := src[len(src)-1]; last == "." || last == ".." { if last := src[len(src)-1]; last == "." || last == ".." {
// Add final slash to the joined path. // Add final slash to the joined path.
dst = append(dst, "") // TODO Wtf? dst = append(dst, "")
} }
// TODO Optimize return "/" + strings.TrimPrefix(strings.Join(dst, "/"), "/")*/
return []rune("/" + strings.TrimPrefix(strings.Join(dst, "/"), "/"))
} }
// IsAbs reports whether the URL is absolute. // IsAbs reports whether the URL is absolute.
// Absolute means that it has a non-empty scheme. // Absolute means that it has a non-empty scheme.
func (u *URL) IsAbs() bool { func (u *URL) IsAbs() bool {
return len(u.Scheme) != 0 return u.Scheme != SchemeInvalid
} }
// ParseRel parses a URL in the context of the receiver. The provided URL // ParseRel parses a URL in the context of the receiver. The provided URL
// may be relative or absolute. Parse returns nil, err on parse // may be relative or absolute. Parse returns nil, err on parse
// failure, otherwise its return value is the same as ResolveReference. // failure, otherwise its return value is the same as ResolveReference.
func (u *URL) ParseRel(out *URL, ref []rune) error { func (u *URL) ParseRel(out *URL, ref string) error {
var refurl URL var refurl URL
err := refurl.Parse(ref) err := refurl.Parse(ref)
@@ -786,92 +751,22 @@ func (u *URL) ParseRel(out *URL, ref []rune) error {
// ignores base and returns a copy of ref. // ignores base and returns a copy of ref.
func (u *URL) ResolveReference(url *URL, ref *URL) { func (u *URL) ResolveReference(url *URL, ref *URL) {
*url = *ref *url = *ref
if len(ref.Scheme) == 0 { if ref.Scheme == SchemeInvalid {
url.Scheme = u.Scheme url.Scheme = u.Scheme
} }
if len(ref.Scheme) != 0 || len(ref.Host) != 0 { if ref.Scheme != SchemeInvalid || ref.Host != "" {
// The "absoluteURI" or "net_path" cases. // The "absoluteURI" or "net_path" cases.
// We can ignore the error from setPath since we know we provided a // We can ignore the error from setPath since we know we provided a
// validly-escaped path. // validly-escaped path.
url.setPath(resolvePath(ref.EscapedPath(), nil)) url.Path = resolvePath(ref.Path, "")
return return
} }
if len(ref.Opaque) != 0 {
url.Host = nil
url.Path = nil
return
}
if len(ref.Path) == 0 && len(ref.RawQuery) == 0 {
url.RawQuery = u.RawQuery
}
// The "abs_path" or "rel_path" cases. // The "abs_path" or "rel_path" cases.
url.Host = u.Host url.Host = u.Host
url.setPath(resolvePath(u.EscapedPath(), ref.EscapedPath())) url.Path = resolvePath(u.Path, ref.Path)
return return
} }
// RequestURI returns the encoded path?query or opaque?query
// string that would be used in an HTTP request for u.
//
// An empty path is reported as "/". An opaque value starting with "//" is
// prefixed with the scheme and ':'. The query is appended when ForceQuery is
// set or RawQuery is non-empty.
func (u *URL) RequestURI() []rune {
	result := u.Opaque
	if len(result) == 0 {
		result = u.EscapedPath()
		if len(result) == 0 {
			result = []rune("/")
		}
	} else if runes.HasPrefix(result, []rune("//")) {
		result = runes.Create(u.Scheme, []rune(":"), result)
	}

	if !u.ForceQuery && len(u.RawQuery) == 0 {
		return result
	}

	// result may still alias u.Opaque or u.RawPath (EscapedPath can return
	// RawPath itself). Appending to it directly could write into the
	// receiver's backing array, so the query is attached to a fresh slice.
	out := make([]rune, 0, len(result)+1+len(u.RawQuery))
	out = append(out, result...)
	out = append(out, '?')
	out = append(out, u.RawQuery...)
	return out
}
// Hostname returns u.Host, without any port number.
//
// The work is delegated to stripPort: a host without ':' is returned
// unchanged; a host containing ']' (an IPv6 literal, possibly with a zone
// identifier) is cut at the first ']' and has a leading '[' trimmed, so the
// square brackets are not part of the result; any other host is cut at its
// first ':'.
func (u *URL) Hostname() []rune {
return stripPort(u.Host)
}
// Port returns the port part of u.Host, without the leading colon.
// If u.Host doesn't contain a port, Port returns an empty (nil) slice.
// The extraction itself is done by portOnly.
func (u *URL) Port() []rune {
return portOnly(u.Host)
}
// stripPort returns hostport without its port part.
// Input without ':' is returned unchanged. If a ']' is present, the result is
// everything before the first ']' with a leading "[" trimmed; otherwise the
// result is everything before the first ':'.
func stripPort(hostport []rune) []rune {
	firstColon := runes.IndexRune(hostport, ':')
	if firstColon == -1 {
		return hostport
	}
	closing := runes.IndexRune(hostport, ']')
	if closing == -1 {
		return hostport[:firstColon]
	}
	return runes.TrimPrefix(hostport[:closing], []rune("["))
}
// portOnly returns the port part of hostport without the leading colon.
//
// It returns nil when hostport has no ':' at all, or when it contains a ']'
// that is not directly followed by ':' (a bracketed literal without port).
// When "]:" is present, the port is whatever follows its first occurrence;
// otherwise it is whatever follows the first ':'.
//
// The scan works on rune indices throughout. Searching a string conversion
// of hostport would yield byte offsets, which do not line up with positions
// in the rune slice as soon as a non-ASCII rune precedes the port.
func portOnly(hostport []rune) []rune {
	firstColon := -1
	sawBracket := false
	for i, r := range hostport {
		switch r {
		case ':':
			if i > 0 && hostport[i-1] == ']' {
				// First "]:" wins, regardless of earlier colons.
				return hostport[i+1:]
			}
			if firstColon == -1 {
				firstColon = i
			}
		case ']':
			sawBracket = true
		}
	}
	if firstColon == -1 || sawBracket {
		return nil
	}
	return hostport[firstColon+1:]
}
// Marshaling interface implementations. // Marshaling interface implementations.
// Would like to implement MarshalText/UnmarshalText but that will change the JSON representation of URLs. // Would like to implement MarshalText/UnmarshalText but that will change the JSON representation of URLs.
@@ -881,7 +776,7 @@ func (u *URL) MarshalBinary() (text []byte, err error) {
func (u *URL) UnmarshalBinary(text []byte) error { func (u *URL) UnmarshalBinary(text []byte) error {
var u1 URL var u1 URL
err := u1.Parse([]rune(string(text))) err := u1.Parse(string(text))
if err != nil { if err != nil {
return err return err
} }
@@ -897,7 +792,7 @@ func (u *URL) UnmarshalBinary(text []byte) error {
// / "*" / "+" / "," / ";" / "=" // / "*" / "+" / "," / ";" / "="
// //
// It doesn't validate pct-encoded. The caller does that via func unescape. // It doesn't validate pct-encoded. The caller does that via func unescape.
func validUserinfo(s []rune) bool { func validUserinfo(s string) bool {
for _, r := range s { for _, r := range s {
if 'A' <= r && r <= 'Z' { if 'A' <= r && r <= 'Z' {
continue continue
@@ -918,3 +813,57 @@ func validUserinfo(s []rune) bool {
} }
return true return true
} }
// PathUnescape decodes the %XX escape sequences in s via pathUnescape.
// It never reports an error: when s cannot be decoded, s itself is returned
// unchanged.
func PathUnescape(s string) string {
	unescaped, err := pathUnescape(s)
	if err != nil {
		// Best effort by design: fall back to the raw input.
		return s
	}
	return unescaped
}
// pathUnescape decodes every %XX escape sequence in s and copies all other
// bytes, including '+', through unchanged. If a '%' is not followed by two
// hex digits it returns an EscapeError holding the offending sequence,
// truncated to at most three bytes. A string without any '%' is returned
// as is.
func pathUnescape(s string) (string, error) {
	// First pass: validate the escapes and count them.
	escapes := 0
	for pos := 0; pos < len(s); {
		if s[pos] != '%' {
			pos++
			continue
		}
		escapes++
		if pos+2 >= len(s) || !ishex(s[pos+1]) || !ishex(s[pos+2]) {
			bad := s[pos:]
			if len(bad) > 3 {
				bad = bad[:3]
			}
			return "", EscapeError(bad)
		}
		pos += 3
	}
	if escapes == 0 {
		return s, nil
	}

	// Second pass: every escape collapses three input bytes into one.
	out := make([]byte, 0, len(s)-2*escapes)
	for pos := 0; pos < len(s); {
		if s[pos] == '%' {
			out = append(out, unhex(s[pos+1])<<4|unhex(s[pos+2]))
			pos += 3
			continue
		}
		out = append(out, s[pos])
		pos++
	}
	return string(out), nil
}

897
fasturl/url_test.go Normal file
View File

@@ -0,0 +1,897 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package fasturl
import (
"bytes"
encodingPkg "encoding"
"encoding/gob"
"encoding/json"
"fmt"
"io"
"net"
"reflect"
"testing"
)
// URLTest describes one parse/reserialize case: TestParse compares the
// parsed input against out, TestURLString compares String() against
// roundtrip (or against in when roundtrip is empty).
type URLTest struct {
in string // raw URL text handed to Parse
out *URL // expected parse result
roundtrip string // expected result of reserializing the URL; empty means same as "in".
}
// urltests is the shared table of inputs that must parse without error.
// It is used by TestParse, TestURLString and BenchmarkString.
var urltests = []URLTest{
// no path
{
"http://www.google.com",
&URL{
Scheme: SchemeHTTP,
Host: "www.google.com",
},
"",
},
// path
{
"http://www.google.com/",
&URL{
Scheme: SchemeHTTP,
Host: "www.google.com",
Path: "/",
},
"",
},
// %20 outside query
{
"http://www.google.com/a%20b",
&URL{
Scheme: SchemeHTTP,
Host: "www.google.com",
Path: "/a%20b",
},
"",
},
// leading // without scheme should create an authority
{
"//foo",
&URL{
Host: "foo",
},
"",
},
// Three leading slashes isn't an authority, but doesn't return an error.
// (We can't return an error, as this code is also used via
// ServeHTTP -> ReadRequest -> Parse, which is arguably a
// different URL parsing context, but currently shares the
// same codepath)
{
"///threeslashes",
&URL{
Path: "///threeslashes",
},
"",
},
// unescaped @ in username should not confuse host
{
"http://j@ne:password@google.com",
&URL{
Scheme: SchemeHTTP,
Host: "google.com",
},
"http://google.com",
},
// unescaped @ in password should not confuse host
{
"http://jane:p@ssword@google.com",
&URL{
Scheme: SchemeHTTP,
Host: "google.com",
},
"http://google.com",
},
// Relative path
{
"a/b/c",
&URL{
Path: "a/b/c",
},
"a/b/c",
},
// host subcomponent; IPv4 address in RFC 3986
{
"http://192.168.0.1/",
&URL{
Scheme: SchemeHTTP,
Host: "192.168.0.1",
Path: "/",
},
"",
},
// host and port subcomponents; IPv4 address in RFC 3986
{
"http://192.168.0.1:8080/",
&URL{
Scheme: SchemeHTTP,
Host: "192.168.0.1:8080",
Path: "/",
},
"",
},
// host subcomponent; IPv6 address in RFC 3986
{
"http://[fe80::1]/",
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1]",
Path: "/",
},
"",
},
// host and port subcomponents; IPv6 address in RFC 3986
{
"http://[fe80::1]:8080/",
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1]:8080",
Path: "/",
},
"",
},
// host subcomponent; IPv6 address with zone identifier in RFC 6874
{
"http://[fe80::1%25en0]/", // alphanum zone identifier
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1%en0]",
Path: "/",
},
"",
},
// host and port subcomponents; IPv6 address with zone identifier in RFC 6874
{
"http://[fe80::1%25en0]:8080/", // alphanum zone identifier
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1%en0]:8080",
Path: "/",
},
"",
},
// host subcomponent; IPv6 address with zone identifier in RFC 6874
{
"http://[fe80::1%25%65%6e%301-._~]/", // percent-encoded+unreserved zone identifier
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1%en01-._~]",
Path: "/",
},
"http://[fe80::1%25en01-._~]/",
},
// host and port subcomponents; IPv6 address with zone identifier in RFC 6874
{
"http://[fe80::1%25%65%6e%301-._~]:8080/", // percent-encoded+unreserved zone identifier
&URL{
Scheme: SchemeHTTP,
Host: "[fe80::1%en01-._~]:8080",
Path: "/",
},
"http://[fe80::1%25en01-._~]:8080/",
},
// golang.org/issue/12200 (colon with empty port)
{
"http://192.168.0.2:8080/foo",
&URL{
Scheme: SchemeHTTP,
Host: "192.168.0.2:8080",
Path: "/foo",
},
"",
},
{
"http://192.168.0.2:/foo",
&URL{
Scheme: SchemeHTTP,
Host: "192.168.0.2:",
Path: "/foo",
},
"",
},
{
// Malformed IPv6 but still accepted.
"http://2b01:e34:ef40:7730:8e70:5aff:fefe:edac:8080/foo",
&URL{
Scheme: SchemeHTTP,
Host: "2b01:e34:ef40:7730:8e70:5aff:fefe:edac:8080",
Path: "/foo",
},
"",
},
{
// Malformed IPv6 but still accepted.
"http://2b01:e34:ef40:7730:8e70:5aff:fefe:edac:/foo",
&URL{
Scheme: SchemeHTTP,
Host: "2b01:e34:ef40:7730:8e70:5aff:fefe:edac:",
Path: "/foo",
},
"",
},
{
"http://[2b01:e34:ef40:7730:8e70:5aff:fefe:edac]:8080/foo",
&URL{
Scheme: SchemeHTTP,
Host: "[2b01:e34:ef40:7730:8e70:5aff:fefe:edac]:8080",
Path: "/foo",
},
"",
},
{
"http://[2b01:e34:ef40:7730:8e70:5aff:fefe:edac]:/foo",
&URL{
Scheme: SchemeHTTP,
Host: "[2b01:e34:ef40:7730:8e70:5aff:fefe:edac]:",
Path: "/foo",
},
"",
},
// golang.org/issue/7991 and golang.org/issue/12719 (non-ascii %-encoded in host)
{
"http://hello.世界.com/foo",
&URL{
Scheme: SchemeHTTP,
Host: "hello.世界.com",
Path: "/foo",
},
"http://hello.%E4%B8%96%E7%95%8C.com/foo",
},
{
"http://hello.%e4%b8%96%e7%95%8c.com/foo",
&URL{
Scheme: SchemeHTTP,
Host: "hello.世界.com",
Path: "/foo",
},
"http://hello.%E4%B8%96%E7%95%8C.com/foo",
},
{
"http://hello.%E4%B8%96%E7%95%8C.com/foo",
&URL{
Scheme: SchemeHTTP,
Host: "hello.世界.com",
Path: "/foo",
},
"",
},
// golang.org/issue/10433 (path beginning with //)
{
"http://example.com//foo",
&URL{
Scheme: SchemeHTTP,
Host: "example.com",
Path: "//foo",
},
"",
},
// test that we can reparse the host names we accept.
{
"http://authority<\"hi\">/foo",
&URL{
Scheme: SchemeHTTP,
Host: "authority<\"hi\">",
Path: "/foo",
},
"",
},
}
// ufmt renders the scheme name, host and path of u on one line, which is
// easier to read in test failures than fmt's default struct output.
func ufmt(u *URL) string {
	scheme := Schemes[u.Scheme]
	return fmt.Sprintf("scheme=%q, host=%q, path=%q", scheme, u.Host, u.Path)
}
// BenchmarkString times URL.String for every urltests entry that declares a
// roundtrip string. Parsing happens with the timer stopped, and the last
// String result is checked against the expected roundtrip value.
func BenchmarkString(b *testing.B) {
	b.StopTimer()
	b.ReportAllocs()
	for _, tc := range urltests {
		var parsed URL
		if err := parsed.Parse(tc.in); err != nil {
			b.Errorf("Parse(%q) returned error %s", tc.in, err)
			continue
		}
		want := tc.roundtrip
		if want == "" {
			continue
		}

		// Only the String calls are measured.
		b.StartTimer()
		var got string
		for n := 0; n < b.N; n++ {
			got = parsed.String()
		}
		b.StopTimer()

		if b.N > 0 && got != want {
			b.Errorf("Parse(%q).String() == %q, want %q", tc.in, got, want)
		}
	}
}
// TestParse checks that every input in urltests parses without error and
// yields exactly the expected URL value.
func TestParse(t *testing.T) {
	for _, tc := range urltests {
		var got URL
		if err := got.Parse(tc.in); err != nil {
			t.Errorf("Parse(%q) returned error %v", tc.in, err)
			continue
		}
		if reflect.DeepEqual(&got, tc.out) {
			continue
		}
		t.Errorf("Parse(%q):\n\tgot %v\n\twant %v\n", tc.in, ufmt(&got), ufmt(tc.out))
	}
}
// pathThatLooksSchemeRelative starts with "//" like a scheme-relative URL.
// TestParseRequestURI expects ParseRequestURI to store it verbatim in Path.
const pathThatLooksSchemeRelative = "//not.a.user@not.a.host/just/a/path"
// parseRequestURLTests lists inputs for ParseRequestURI together with
// whether parsing is expected to succeed.
var parseRequestURLTests = []struct {
url string
expectedValid bool
}{
// Inputs expected to be accepted.
{"http://foo.com", true},
{"http://foo.com/", true},
{"http://foo.com/path", true},
{"/", true},
{pathThatLooksSchemeRelative, true},
{"//not.a.user@%66%6f%6f.com/just/a/path/also", true},
{"*", true},
{"http://192.168.0.1/", true},
{"http://192.168.0.1:8080/", true},
{"http://[fe80::1]/", true},
{"http://[fe80::1]:8080/", true},
// Tests exercising RFC 6874 compliance:
{"http://[fe80::1%25en0]/", true}, // with alphanum zone identifier
{"http://[fe80::1%25en0]:8080/", true}, // with alphanum zone identifier
{"http://[fe80::1%25%65%6e%301-._~]/", true}, // with percent-encoded+unreserved zone identifier
{"http://[fe80::1%25%65%6e%301-._~]:8080/", true}, // with percent-encoded+unreserved zone identifier
// Inputs expected to be rejected.
{"foo.html", false},
{"../dir/", false},
{"http://192.168.0.%31/", false},
{"http://192.168.0.%31:8080/", false},
{"http://[fe80::%31]/", false},
{"http://[fe80::%31]:8080/", false},
{"http://[fe80::%31%25en0]/", false},
{"http://[fe80::%31%25en0]:8080/", false},
// These two cases are valid as textual representations as
// described in RFC 4007, but are not valid as address
// literals with IPv6 zone identifiers in URIs as described in
// RFC 6874.
{"http://[fe80::1%en0]/", false},
{"http://[fe80::1%en0]:8080/", false},
}
// TestParseRequestURI runs ParseRequestURI over parseRequestURLTests and
// checks that success or failure matches expectedValid. It then verifies that
// a path starting with "//" is kept verbatim in Path.
func TestParseRequestURI(t *testing.T) {
	for _, tc := range parseRequestURLTests {
		var parsed URL
		err := parsed.ParseRequestURI(tc.url)
		switch {
		case tc.expectedValid && err != nil:
			t.Errorf("ParseRequestURI(%q) gave err %v; want no error", tc.url, err)
		case !tc.expectedValid && err == nil:
			t.Errorf("ParseRequestURI(%q) gave nil error; want some error", tc.url)
		}
	}

	var schemeRel URL
	if err := schemeRel.ParseRequestURI(pathThatLooksSchemeRelative); err != nil {
		t.Fatalf("Unexpected error %v", err)
	}
	if got := schemeRel.Path; got != pathThatLooksSchemeRelative {
		t.Errorf("ParseRequestURI path:\ngot %q\nwant %q", got, pathThatLooksSchemeRelative)
	}
}
// stringURLTests lists hand-built URL values together with the text that
// String is expected to produce for them (checked by TestURLString).
var stringURLTests = []struct {
url URL
want string
}{
// No leading slash on path should prepend slash on String() call
{
url: URL{
Scheme: SchemeHTTP,
Host: "www.google.com",
Path: "search",
},
want: "http://www.google.com/search",
},
// Relative path with first element containing ":" should be prepended with "./", golang.org/issue/17184
{
url: URL{
Path: "this:that",
},
want: "./this:that",
},
// Relative path with second element containing ":" should not be prepended with "./"
{
url: URL{
Path: "here/this:that",
},
want: "here/this:that",
},
// Non-relative path with first element containing ":" should not be prepended with "./"
{
url: URL{
Scheme: SchemeHTTP,
Host: "www.google.com",
Path: "this:that",
},
want: "http://www.google.com/this:that",
},
}
// TestURLString checks String in two ways: every urltests input must
// reserialize to its roundtrip string (or to the input itself when roundtrip
// is empty), and every stringURLTests value must render as its want string.
func TestURLString(t *testing.T) {
	for _, tc := range urltests {
		var parsed URL
		if err := parsed.Parse(tc.in); err != nil {
			t.Errorf("Parse(%q) returned error %s", tc.in, err)
			continue
		}
		want := tc.roundtrip
		if want == "" {
			want = tc.in
		}
		if got := parsed.String(); got != want {
			t.Errorf("Parse(%q).String() == %q (expected %q)", tc.in, got, want)
		}
	}

	for _, tc := range stringURLTests {
		got := tc.url.String()
		if got == tc.want {
			continue
		}
		t.Errorf("%+v.String() = %q; want %q", tc.url, got, tc.want)
	}
}
// resolvePathTests lists base/ref pairs for resolvePath together with the
// expected resolved path (checked by TestResolvePath).
var resolvePathTests = []struct {
base, ref, expected string
}{
{"a/b", ".", "/a/"},
{"a/b", "c", "/a/c"},
{"a/b", "..", "/"},
{"a/", "..", "/"},
{"a/", "../..", "/"},
{"a/b/c", "..", "/a/"},
{"a/b/c", "../d", "/a/d"},
{"a/b/c", ".././d", "/a/d"},
{"a/b", "./..", "/"},
{"a/./b", ".", "/a/"},
{"a/../", ".", "/"},
{"a/.././b", "c", "/c"},
}
// TestResolvePath checks resolvePath against every entry of
// resolvePathTests.
func TestResolvePath(t *testing.T) {
	for _, tc := range resolvePathTests {
		if got := resolvePath(tc.base, tc.ref); got != tc.expected {
			t.Errorf("For %q + %q got %q; expected %q", tc.base, tc.ref, got, tc.expected)
		}
	}
}
// resolveReferenceTests pairs a base URL and a reference with the string
// TestResolveReference expects after resolving the reference against the
// base. None of the expected strings carries a query or fragment, even where
// the base or the reference does.
var resolveReferenceTests = []struct {
base, rel, expected string
}{
// Absolute URL references
{"http://foo.com?a=b", "https://bar.com/", "https://bar.com/"},
{"http://foo.com/", "https://bar.com/?a=b", "https://bar.com/"},
{"http://foo.com/", "https://bar.com/?", "https://bar.com/"},
// Path-absolute references
{"http://foo.com/bar", "/baz", "http://foo.com/baz"},
{"http://foo.com/bar?a=b#f", "/baz", "http://foo.com/baz"},
{"http://foo.com/bar?a=b", "/baz?", "http://foo.com/baz"},
{"http://foo.com/bar?a=b", "/baz?c=d", "http://foo.com/baz"},
// Multiple slashes
{"http://foo.com/bar", "http://foo.com//baz", "http://foo.com//baz"},
{"http://foo.com/bar", "http://foo.com///baz/quux", "http://foo.com///baz/quux"},
// Scheme-relative
{"https://foo.com/bar?a=b", "//bar.com/quux", "https://bar.com/quux"},
// Path-relative references:
// ... current directory
{"http://foo.com", ".", "http://foo.com/"},
{"http://foo.com/bar", ".", "http://foo.com/"},
{"http://foo.com/bar/", ".", "http://foo.com/bar/"},
// ... going down
{"http://foo.com", "bar", "http://foo.com/bar"},
{"http://foo.com/", "bar", "http://foo.com/bar"},
{"http://foo.com/bar/baz", "quux", "http://foo.com/bar/quux"},
// ... going up
{"http://foo.com/bar/baz", "../quux", "http://foo.com/quux"},
{"http://foo.com/bar/baz", "../../../../../quux", "http://foo.com/quux"},
{"http://foo.com/bar", "..", "http://foo.com/"},
{"http://foo.com/bar/baz", "./..", "http://foo.com/"},
// ".." in the middle (issue 3560)
{"http://foo.com/bar/baz", "quux/dotdot/../tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/../tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/.././tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/./../tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/dotdot/././../../tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/dotdot/./.././../tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/dotdot/dotdot/./../../.././././tail", "http://foo.com/bar/quux/tail"},
{"http://foo.com/bar/baz", "quux/./dotdot/../dotdot/../dot/./tail/..", "http://foo.com/bar/quux/dot/"},
// Remove any dot-segments prior to forming the target URI.
// http://tools.ietf.org/html/rfc3986#section-5.2.4
{"http://foo.com/dot/./dotdot/../foo/bar", "../baz", "http://foo.com/dot/baz"},
// Triple dot isn't special
{"http://foo.com/bar", "...", "http://foo.com/..."},
// Fragment
{"http://foo.com/bar", ".#frag", "http://foo.com/"},
{"http://example.org/", "#!$&%27()*+,;=", "http://example.org/"},
// Paths with escaping (issue 16947).
{"http://foo.com/foo%2fbar/", "../baz", "http://foo.com/baz"},
{"http://foo.com/1/2%2f/3%2f4/5", "../../a/b/c", "http://foo.com/1/a/b/c"},
{"http://foo.com/1/2/3", "./a%2f../../b/..%2fc", "http://foo.com/1/2/b/..%2fc"},
{"http://foo.com/1/2%2f/3%2f4/5", "./a%2f../b/../c", "http://foo.com/1/2%2f/3%2f4/a%2f../c"},
{"http://foo.com/foo%20bar/", "../baz", "http://foo.com/baz"},
{"http://foo.com/foo", "../bar%2fbaz", "http://foo.com/bar%2fbaz"},
{"http://foo.com/foo%2dbar/", "./baz-quux", "http://foo.com/foo%2dbar/baz-quux"},
// RFC 3986: Normal Examples
// http://tools.ietf.org/html/rfc3986#section-5.4.1
{"http://a/b/c/d;p?q", "g", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "./g", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "g/", "http://a/b/c/g/"},
{"http://a/b/c/d;p?q", "/g", "http://a/g"},
{"http://a/b/c/d;p?q", "//g", "http://g"},
{"http://a/b/c/d;p?q", "?y", "http://a/b/c/d;p"},
{"http://a/b/c/d;p?q", "g?y", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "#s", "http://a/b/c/d;p"},
{"http://a/b/c/d;p?q", "g#s", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "g?y#s", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", ";x", "http://a/b/c/;x"},
{"http://a/b/c/d;p?q", "g;x", "http://a/b/c/g;x"},
{"http://a/b/c/d;p?q", "g;x?y#s", "http://a/b/c/g;x"},
{"http://a/b/c/d;p?q", "", "http://a/b/c/d;p"},
{"http://a/b/c/d;p?q", ".", "http://a/b/c/"},
{"http://a/b/c/d;p?q", "./", "http://a/b/c/"},
{"http://a/b/c/d;p?q", "..", "http://a/b/"},
{"http://a/b/c/d;p?q", "../", "http://a/b/"},
{"http://a/b/c/d;p?q", "../g", "http://a/b/g"},
{"http://a/b/c/d;p?q", "../..", "http://a/"},
{"http://a/b/c/d;p?q", "../../", "http://a/"},
{"http://a/b/c/d;p?q", "../../g", "http://a/g"},
// RFC 3986: Abnormal Examples
// http://tools.ietf.org/html/rfc3986#section-5.4.2
{"http://a/b/c/d;p?q", "../../../g", "http://a/g"},
{"http://a/b/c/d;p?q", "../../../../g", "http://a/g"},
{"http://a/b/c/d;p?q", "/./g", "http://a/g"},
{"http://a/b/c/d;p?q", "/../g", "http://a/g"},
{"http://a/b/c/d;p?q", "g.", "http://a/b/c/g."},
{"http://a/b/c/d;p?q", ".g", "http://a/b/c/.g"},
{"http://a/b/c/d;p?q", "g..", "http://a/b/c/g.."},
{"http://a/b/c/d;p?q", "..g", "http://a/b/c/..g"},
{"http://a/b/c/d;p?q", "./../g", "http://a/b/g"},
{"http://a/b/c/d;p?q", "./g/.", "http://a/b/c/g/"},
{"http://a/b/c/d;p?q", "g/./h", "http://a/b/c/g/h"},
{"http://a/b/c/d;p?q", "g/../h", "http://a/b/c/h"},
{"http://a/b/c/d;p?q", "g;x=1/./y", "http://a/b/c/g;x=1/y"},
{"http://a/b/c/d;p?q", "g;x=1/../y", "http://a/b/c/y"},
{"http://a/b/c/d;p?q", "g?y/./x", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "g?y/../x", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "g#s/./x", "http://a/b/c/g"},
{"http://a/b/c/d;p?q", "g#s/../x", "http://a/b/c/g"},
// Extras.
{"https://a/b/c/d;p?q", "//g?q", "https://g"},
{"https://a/b/c/d;p?q", "//g#s", "https://g"},
{"https://a/b/c/d;p?q", "//g/d/e/f?y#s", "https://g/d/e/f"},
{"https://a/b/c/d;p#s", "?y", "https://a/b/c/d;p"},
{"https://a/b/c/d;p?q#s", "?y", "https://a/b/c/d;p"},
}
// TestResolveReference parses the base and reference of every
// resolveReferenceTests entry, resolves the reference against the base and
// compares the formatted result with the expected string.
func TestResolveReference(t *testing.T) {
	// parse turns raw into a *URL and aborts the whole test if Parse fails.
	parse := func(raw string) *URL {
		parsed := new(URL)
		if err := parsed.Parse(raw); err != nil {
			t.Fatalf("Parse(%q) got err %v", raw, err)
		}
		return parsed
	}

	for _, tc := range resolveReferenceTests {
		base, rel := parse(tc.base), parse(tc.rel)

		var resolved URL
		base.ResolveReference(&resolved, rel)

		got := resolved.String()
		if got == tc.expected {
			continue
		}
		t.Errorf("URL(%q).ResolveReference(%q)\ngot %q\nwant %q", tc.base, tc.rel, got, tc.expected)
	}
}
// RequestURITest pairs a URL with the string expected for it (out).
// NOTE(review): presumably out is the expected request-URI form of url;
// the consuming test is not visible here, so confirm.
type RequestURITest struct {
url *URL
out string
}
// requritests holds the RequestURITest cases: an empty path, a path
// containing a space, and a path starting with a double slash.
// NOTE(review): no test reading this table is visible in this part of the
// file; confirm it is still used.
var requritests = []RequestURITest{
{
&URL{
Scheme: SchemeHTTP,
Host: "example.com",
Path: "",
},
"/",
},
{
&URL{
Scheme: SchemeHTTP,
Host: "example.com",
Path: "/a b",
},
"/a%20b",
},
{
&URL{
Scheme: SchemeHTTP,
Host: "example.com",
Path: "//foo",
},
"//foo",
},
}
// TestParseErrors feeds Parse a list of host-related edge cases (IPv6
// literals, ports, percent-escapes, spaces) and checks that each input is
// accepted or rejected as its wantErr flag demands.
func TestParseErrors(t *testing.T) {
	tests := []struct {
		in      string
		wantErr bool
	}{
		{"http://[::1]", false},
		{"http://[::1]:80", false},
		{"http://[::1]:namedport", true}, // rfc3986 3.2.3
		{"http://[::1]/", false},
		{"http://[::1]a", true},
		{"http://[::1]%23", true},
		{"http://[::1%25en0]", false},     // valid zone id
		{"http://[::1]:", false},          // colon, but no port OK
		{"http://[::1]:%38%30", true},     // not allowed: % encoding only for non-ASCII
		{"http://[::1%25%41]", false},     // RFC 6874 allows over-escaping in zone
		{"http://[%10::1]", true},         // no %xx escapes in IP address
		{"http://[::1]/%48", false},       // %xx in path is fine
		{"http://%41:8080/", true},        // not allowed: % encoding only for non-ASCII
		{"http://[]%20%48%54%54%50%2f%31%2e%31%0a%4d%79%48%65%61%64%65%72%3a%20%31%32%33%0a%0a/", true}, // golang.org/issue/11208
		{"http://a b.com/", true},         // no space in host name please
	}
	for _, tt := range tests {
		var u URL
		err := u.Parse(tt.in)
		if tt.wantErr {
			if err == nil {
				t.Errorf("Parse(%q) = %#v; want an error", tt.in, u)
			}
			continue
		}
		if err != nil {
			// An unexpected error must fail the test. Reporting it with
			// t.Logf let the "want no error" cases pass no matter what
			// Parse returned.
			t.Errorf("Parse(%q) = %v; want no error", tt.in, err)
		}
	}
}
// shouldEscapeTest is one shouldEscape case: the input byte, the encoding
// mode it is checked under, and whether escaping is expected.
type shouldEscapeTest struct {
in byte
mode encoding
escape bool
}
// shouldEscapeTests is the table driven through TestShouldEscape. It is
// grouped by the kind of character or URL component under test.
var shouldEscapeTests = []shouldEscapeTest{
// Unreserved characters (§2.3)
{'a', encodePath, false},
{'a', encodeUserPassword, false},
{'a', encodeQueryComponent, false},
{'a', encodeFragment, false},
{'a', encodeHost, false},
{'z', encodePath, false},
{'A', encodePath, false},
{'Z', encodePath, false},
{'0', encodePath, false},
{'9', encodePath, false},
{'-', encodePath, false},
{'-', encodeUserPassword, false},
{'-', encodeQueryComponent, false},
{'-', encodeFragment, false},
{'.', encodePath, false},
{'_', encodePath, false},
{'~', encodePath, false},
// User information (§3.2.1)
{':', encodeUserPassword, true},
{'/', encodeUserPassword, true},
{'?', encodeUserPassword, true},
{'@', encodeUserPassword, true},
{'$', encodeUserPassword, false},
{'&', encodeUserPassword, false},
{'+', encodeUserPassword, false},
{',', encodeUserPassword, false},
{';', encodeUserPassword, false},
{'=', encodeUserPassword, false},
// Host (IP address, IPv6 address, registered name, port suffix; §3.2.2)
{'!', encodeHost, false},
{'$', encodeHost, false},
{'&', encodeHost, false},
{'\'', encodeHost, false},
{'(', encodeHost, false},
{')', encodeHost, false},
{'*', encodeHost, false},
{'+', encodeHost, false},
{',', encodeHost, false},
{';', encodeHost, false},
{'=', encodeHost, false},
{':', encodeHost, false},
{'[', encodeHost, false},
{']', encodeHost, false},
{'0', encodeHost, false},
{'9', encodeHost, false},
{'A', encodeHost, false},
{'z', encodeHost, false},
{'_', encodeHost, false},
{'-', encodeHost, false},
{'.', encodeHost, false},
}
// TestShouldEscape calls shouldEscape for every shouldEscapeTests entry and
// reports each answer that disagrees with the entry's escape flag.
func TestShouldEscape(t *testing.T) {
	for i := range shouldEscapeTests {
		tc := shouldEscapeTests[i]
		got := shouldEscape(tc.in, tc.mode)
		if got == tc.escape {
			continue
		}
		// got differs from the expected bool here, so it equals !tc.escape.
		t.Errorf("shouldEscape(%q, %v) returned %v; expected %v", tc.in, tc.mode, !tc.escape, tc.escape)
	}
}
// Stub error types whose Timeout/Temporary answers are fixed by their fields.
type (
	// timeoutError reports its timeout field from Timeout.
	timeoutError struct {
		timeout bool
	}

	// temporaryError reports its temporary field from Temporary.
	temporaryError struct {
		temporary bool
	}

	// timeoutTemporaryError embeds both stubs, so it offers Timeout and
	// Temporary, and overrides Error with its own message.
	timeoutTemporaryError struct {
		timeoutError
		temporaryError
	}
)

func (te *timeoutError) Error() string { return "timeout error" }
func (te *timeoutError) Timeout() bool { return te.timeout }

func (te *temporaryError) Error() string   { return "temporary error" }
func (te *temporaryError) Temporary() bool { return te.temporary }

func (tte *timeoutTemporaryError) Error() string { return "timeout/temporary error" }
// netErrorTests lists *Error values wrapping various inner errors together
// with the Timeout and Temporary answers TestURLErrorImplementsNetError
// expects from the wrapper. The inner errors cover timeout-only,
// temporary-only, combined, and plain io.EOF cases.
var netErrorTests = []struct {
err error
timeout bool
temporary bool
}{{
err: &Error{"Get", "http://google.com/", &timeoutError{timeout: true}},
timeout: true,
temporary: false,
}, {
err: &Error{"Get", "http://google.com/", &timeoutError{timeout: false}},
timeout: false,
temporary: false,
}, {
err: &Error{"Get", "http://google.com/", &temporaryError{temporary: true}},
timeout: false,
temporary: true,
}, {
err: &Error{"Get", "http://google.com/", &temporaryError{temporary: false}},
timeout: false,
temporary: false,
}, {
err: &Error{"Get", "http://google.com/", &timeoutTemporaryError{timeoutError{timeout: true}, temporaryError{temporary: true}}},
timeout: true,
temporary: true,
}, {
err: &Error{"Get", "http://google.com/", &timeoutTemporaryError{timeoutError{timeout: false}, temporaryError{temporary: true}}},
timeout: false,
temporary: true,
}, {
err: &Error{"Get", "http://google.com/", &timeoutTemporaryError{timeoutError{timeout: true}, temporaryError{temporary: false}}},
timeout: true,
temporary: false,
}, {
err: &Error{"Get", "http://google.com/", &timeoutTemporaryError{timeoutError{timeout: false}, temporaryError{temporary: false}}},
timeout: false,
temporary: false,
}, {
err: &Error{"Get", "http://google.com/", io.EOF},
timeout: false,
temporary: false,
}}
// TestURLErrorImplementsNetError verifies that every error in netErrorTests
// satisfies net.Error and that its Timeout and Temporary results match the
// values recorded for that case. Only the first mismatch per case is
// reported.
func TestURLErrorImplementsNetError(t *testing.T) {
	for idx, tc := range netErrorTests {
		caseNo := idx + 1 // cases are numbered from 1 in the messages
		netErr, isNetErr := tc.err.(net.Error)
		switch {
		case !isNetErr:
			t.Errorf("%d: %T does not implement net.Error", caseNo, tc.err)
		case netErr.Timeout() != tc.timeout:
			t.Errorf("%d: err.Timeout(): got %v, want %v", caseNo, netErr.Timeout(), tc.timeout)
		case netErr.Temporary() != tc.temporary:
			t.Errorf("%d: err.Temporary(): got %v, want %v", caseNo, netErr.Temporary(), tc.temporary)
		}
	}
}
// Compile-time assertions that *URL implements both binary marshaling
// interfaces.
var (
	_ encodingPkg.BinaryMarshaler   = (*URL)(nil)
	_ encodingPkg.BinaryUnmarshaler = (*URL)(nil)
)
// TestJSON marshals a parsed URL to JSON, unmarshals it into a fresh URL
// and checks that both format to the same string.
func TestJSON(t *testing.T) {
	var original URL
	if err := original.Parse("https://www.google.com/x?y=z"); err != nil {
		t.Fatal(err)
	}

	encoded, err := json.Marshal(&original)
	if err != nil {
		t.Fatal(err)
	}

	// If only we could implement TextMarshaler/TextUnmarshaler,
	// this would work:
	//
	// if string(encoded) != strconv.Quote(original.String()) {
	// 	t.Errorf("json encoding: %s\nwant: %s\n", encoded, strconv.Quote(original.String()))
	// }

	decoded := new(URL)
	if err := json.Unmarshal(encoded, decoded); err != nil {
		t.Fatal(err)
	}

	if decoded.String() != original.String() {
		t.Errorf("json decoded to: %s\nwant: %s\n", decoded, &original)
	}
}
// TestGob encodes a parsed URL with encoding/gob, decodes it into a fresh
// URL and checks that both format to the same string.
func TestGob(t *testing.T) {
	var u URL
	err := u.Parse("https://www.google.com/x?y=z")
	if err != nil {
		t.Fatal(err)
	}
	var w bytes.Buffer
	err = gob.NewEncoder(&w).Encode(&u)
	if err != nil {
		t.Fatal(err)
	}
	u1 := new(URL)
	err = gob.NewDecoder(&w).Decode(u1)
	if err != nil {
		t.Fatal(err)
	}
	if u1.String() != u.String() {
		// The message used to say "json decoded to", copied from TestJSON,
		// which pointed at the wrong codec when this test failed.
		t.Errorf("gob decoded to: %s\nwant: %s\n", u1, &u)
	}
}

174
main.go
View File

@@ -3,90 +3,158 @@ package main
import ( import (
"context" "context"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/terorie/oddb-go/fasturl" "github.com/spf13/viper"
"github.com/terorie/oddb-go/runes" "github.com/terorie/od-database-crawler/fasturl"
"github.com/urfave/cli" "github.com/urfave/cli"
"log"
"net/http"
_ "net/http/pprof"
"os" "os"
"strings" "strings"
"sync/atomic"
"time" "time"
) )
var configFile string
var app = cli.App { var app = cli.App {
Name: "oddb-go", Name: "od-database-crawler",
Usage: "OD-Database Go crawler", Usage: "OD-Database Go crawler",
Version: "0.1", Version: "1.1.1",
BashComplete: cli.DefaultAppComplete, BashComplete: cli.DefaultAppComplete,
Writer: os.Stdout, Writer: os.Stdout,
Compiled: buildDate, Action: cmdBase,
Commands: []cli.Command{ Commands: []cli.Command {
{ {
Name: "crawl", Name: "crawl",
Usage: "Crawl a list of URLs", Usage: "Crawl a list of URLs",
ArgsUsage: "[site, site, ...]", ArgsUsage: "<site>",
Action: cmdCrawler, Action: cmdCrawler,
}, },
}, },
Flags: []cli.Flag {
cli.StringFlag {
Name: "config",
EnvVar: "CONFIG",
Destination: &configFile,
},
},
Before: func(i *cli.Context) error {
if configFile != "" {
viper.SetConfigFile(configFile)
}
return nil
},
After: func(i *cli.Context) error {
exitHooks.Execute()
return nil
},
} }
var exitHooks Hooks
func init() { func init() {
prepareConfig() prepareConfig()
} }
func main() { func main() {
go func() { if err := os.MkdirAll("crawled", 0755);
log.Println(http.ListenAndServe("localhost:42069", nil)) err != nil { panic(err) }
}()
if err := os.MkdirAll("queue", 0755);
err != nil { panic(err) }
readConfig()
app.Run(os.Args) app.Run(os.Args)
} }
func cmdCrawler(clic *cli.Context) error { func cmdBase(_ *cli.Context) error {
readConfig() // TODO Graceful shutdown
appCtx := context.Background()
if clic.NArg() == 0 { forceCtx := context.Background()
cli.ShowCommandHelpAndExit(clic, "crawl", 1)
}
args := clic.Args()
remotes := make([]*OD, len(args))
for i, argStr := range args {
// https://github.com/golang/go/issues/19779
if !strings.Contains(argStr, "://") {
argStr = "http://" + argStr
}
arg := []rune(argStr)
var u fasturl.URL
err := u.Parse(arg)
if !runes.HasSuffix(u.Path, []rune("/")) {
u.Path = append(u.Path, '/')
}
if err != nil { return err }
remotes[i] = &OD{ BaseUri: u }
}
c := context.Background()
inRemotes := make(chan *OD) inRemotes := make(chan *OD)
go Schedule(c, inRemotes) go Schedule(forceCtx, inRemotes)
for _, remote := range remotes { ticker := time.NewTicker(config.Recheck)
globalWait.Add(1) defer ticker.Stop()
inRemotes <- remote for {
select {
case <-appCtx.Done():
return nil
case <-ticker.C:
t, err := FetchTask()
if err != nil {
logrus.WithError(err).
Error("Failed to get new task")
time.Sleep(viper.GetDuration(ConfCooldown))
continue
}
if t == nil {
// No new task
if atomic.LoadInt32(&numActiveTasks) == 0 {
logrus.Info("Waiting …")
}
continue
}
var baseUri fasturl.URL
err = baseUri.Parse(t.Url)
if urlErr, ok := err.(*fasturl.Error); ok && urlErr.Err == fasturl.ErrUnknownScheme {
// Not an error
err = nil
// Give back task
//err2 := CancelTask(t.WebsiteId)
//if err2 != nil {
// logrus.Error(err2)
//}
continue
} else if err != nil {
logrus.WithError(err).
Error("Failed to get new task")
time.Sleep(viper.GetDuration(ConfCooldown))
continue
}
ScheduleTask(inRemotes, t, &baseUri)
}
} }
// Wait for all jobs to finish
globalWait.Wait()
logrus.Info("All dirs processed!")
return nil return nil
} }
var buildDate = time.Date( func cmdCrawler(clic *cli.Context) error {
2018, 10, 28, if clic.NArg() != 1 {
17, 10, 0, 0, cli.ShowCommandHelpAndExit(clic, "crawl", 1)
time.UTC) }
arg := clic.Args()[0]
// https://github.com/golang/go/issues/19779
if !strings.Contains(arg, "://") {
arg = "http://" + arg
}
var u fasturl.URL
err := u.Parse(arg)
if !strings.HasSuffix(u.Path, "/") {
u.Path += "/"
}
if err != nil { return err }
// TODO Graceful shutdown
forceCtx := context.Background()
inRemotes := make(chan *OD)
go Schedule(forceCtx, inRemotes)
ticker := time.NewTicker(3 * time.Second)
defer ticker.Stop()
task := Task {
WebsiteId: 0,
Url: u.String(),
}
ScheduleTask(inRemotes, &task, &u)
// Wait for all jobs to finish
globalWait.Wait()
return nil
}

View File

@@ -1,14 +1,28 @@
package main package main
import ( import (
"github.com/terorie/oddb-go/ds/redblackhash" "github.com/terorie/od-database-crawler/ds/redblackhash"
"github.com/terorie/oddb-go/fasturl" "github.com/terorie/od-database-crawler/fasturl"
"sync" "sync"
"time" "time"
) )
// Task identifies one website to crawl: its numeric ID and its base URL.
// The JSON tags give the field names used when a Task is (de)serialized.
type Task struct {
WebsiteId uint64 `json:"website_id"`
Url string `json:"url"`
}
// TaskResult summarizes the outcome of crawling one website.
// ErrorCount and StartTime carry the tag json:"-" and are therefore left out
// of the JSON encoding; the start and end times are serialized through the
// *Unix fields instead.
type TaskResult struct {
StatusCode string `json:"status_code"`
FileCount uint64 `json:"file_count"`
ErrorCount uint64 `json:"-"`
StartTime time.Time `json:"-"`
StartTimeUnix int64 `json:"start_time"`
EndTimeUnix int64 `json:"end_time"`
WebsiteId uint64 `json:"website_id"`
}
type Job struct { type Job struct {
OD *OD
Uri fasturl.URL Uri fasturl.URL
UriStr string UriStr string
Fails int Fails int
@@ -16,26 +30,25 @@ type Job struct {
} }
type OD struct { type OD struct {
Task Task
Result TaskResult
Wait sync.WaitGroup Wait sync.WaitGroup
BaseUri fasturl.URL BaseUri fasturl.URL
Files []File
WCtx WorkerContext WCtx WorkerContext
Scanned redblackhash.Tree Scanned redblackhash.Tree
lock sync.Mutex
} }
type File struct { type File struct {
Name []rune `json:"name"` Name string `json:"name"`
Size int64 `json:"size"` Size int64 `json:"size"`
MTime time.Time `json:"mtime"` MTime int64 `json:"mtime"`
Path []rune `json:"path"` Path string `json:"path"`
IsDir bool `json:"-"` IsDir bool `json:"-"`
} }
func (o *OD) LoadOrStoreKey(k *redblackhash.Key) (exists bool) { func (o *OD) LoadOrStoreKey(k *redblackhash.Key) (exists bool) {
o.lock.Lock() o.Scanned.Lock()
defer o.lock.Unlock() defer o.Scanned.Unlock()
exists = o.Scanned.Get(k) exists = o.Scanned.Get(k)
if exists { return true } if exists { return true }
@@ -43,3 +56,8 @@ func (o *OD) LoadOrStoreKey(k *redblackhash.Key) (exists bool) {
o.Scanned.Put(k) o.Scanned.Put(k)
return false return false
} }
// errorString is a plain string that satisfies the error interface.
type errorString string

// Error returns the string itself as the error message.
func (s errorString) Error() string {
	msg := string(s)
	return msg
}

129
queue.go Normal file
View File

@@ -0,0 +1,129 @@
package main
import (
"github.com/beeker1121/goque"
"os"
"sync"
"sync/atomic"
)
// BufferedQueue is a job queue with an in-memory buffer in front of an
// on-disk goque queue. Jobs go to the buffer while it has room (see
// directEnqueue) and to disk otherwise.
type BufferedQueue struct {
// dataDir is the directory handed to goque.OpenQueue; Close removes it.
dataDir string
// q is the on-disk queue. OpenQueue leaves it nil when
// config.JobBufferSize is negative.
q *goque.Queue
// buf holds the jobs kept in memory, oldest first.
buf []Job
// m guards buf.
m sync.Mutex
}
// OpenQueue creates a BufferedQueue. With a negative config.JobBufferSize
// the queue is returned without any on-disk backing and dataDir is ignored;
// otherwise a goque queue is opened in dataDir and a failure to open it is
// returned with a nil queue.
func OpenQueue(dataDir string) (bq *BufferedQueue, err error) {
	if config.JobBufferSize < 0 {
		// Buffer-only mode: no disk queue is opened.
		return new(BufferedQueue), nil
	}

	disk, err := goque.OpenQueue(dataDir)
	if err != nil {
		return nil, err
	}
	return &BufferedQueue{dataDir: dataDir, q: disk}, nil
}
// Enqueue adds job to the queue: into the in-memory buffer when
// directEnqueue accepts it, otherwise onto the disk queue in JobGob form.
// It returns the error of the disk enqueue, if any.
func (q *BufferedQueue) Enqueue(job *Job) error {
	// Count the job before it becomes visible to Dequeue.
	atomic.AddInt64(&totalQueued, 1)
	if q.directEnqueue(job) {
		return nil
	}

	var gob JobGob
	gob.ToGob(job)
	if _, err := q.q.EnqueueObject(gob); err != nil {
		// The job never reached the queue, so take it back out of the
		// counter; otherwise totalQueued drifts upwards on every failure.
		atomic.AddInt64(&totalQueued, -1)
		return err
	}
	return nil
}
// Dequeue removes and returns the next job. The in-memory buffer is
// consulted first. When it is empty and config.JobBufferSize is negative,
// goque.ErrEmpty is returned; otherwise the next item is taken from the disk
// queue and decoded. totalQueued is decremented whenever an item was
// actually removed.
func (q *BufferedQueue) Dequeue() (job Job, err error) {
	if q.directDequeue(&job) {
		atomic.AddInt64(&totalQueued, -1)
		return job, nil
	}

	// Buffer-only mode: there is no disk queue to fall back to.
	if config.JobBufferSize < 0 {
		return job, goque.ErrEmpty
	}

	item, err := q.q.Dequeue()
	if err != nil {
		return job, err
	}
	atomic.AddInt64(&totalQueued, -1)

	var stored JobGob
	if err = item.ToObject(&stored); err != nil {
		return job, err
	}
	stored.FromGob(&job)
	return job, nil
}
// directEnqueue appends a copy of job to the in-memory buffer and reports
// true, unless config.JobBufferSize is non-negative and the buffer already
// holds that many jobs, in which case nothing is stored and false is
// returned.
func (q *BufferedQueue) directEnqueue(job *Job) bool {
	q.m.Lock()
	defer q.m.Unlock()

	limit := config.JobBufferSize
	if limit >= 0 && len(q.buf) >= limit {
		return false
	}
	q.buf = append(q.buf, *job)
	return true
}
// directDequeue pops the oldest job from the in-memory buffer into *job and
// reports true. It reports false, leaving *job untouched, when the buffer is
// empty.
func (q *BufferedQueue) directDequeue(job *Job) bool {
	q.m.Lock()
	defer q.m.Unlock()

	if len(q.buf) == 0 {
		return false
	}
	*job, q.buf = q.buf[0], q.buf[1:]
	return true
}
// Close shuts the disk queue down and deletes its data directory. It does
// nothing when config.JobBufferSize is negative. The returned error is
// always nil (the method exists to satisfy io.Closer); an error from closing
// the disk queue is ignored, while a failure to delete the directory panics.
func (q *BufferedQueue) Close() error {
	if config.JobBufferSize >= 0 {
		// Close ignoring errors
		q.q.Close()

		// Delete files
		err := os.RemoveAll(q.dataDir)
		if err != nil {
			panic(err)
		}
	}
	return nil
}
// JobGob is the flattened form of a Job that is stored in the disk queue:
// the URL as a string, the failure count, and the last error reduced to its
// message (empty when there was none).
type JobGob struct {
Uri string
Fails int
LastError string
}
// ToGob fills g from j: the URL string, the failure count and, if j has a
// last error, that error's message. g.LastError is left as it was when j has
// no last error.
func (g *JobGob) ToGob(j *Job) {
	g.Uri, g.Fails = j.UriStr, j.Fails
	if lastErr := j.LastError; lastErr != nil {
		g.LastError = lastErr.Error()
	}
}
// FromGob restores j from g: it parses g.Uri into j.Uri (panicking if that
// fails), copies the URL string and failure count, and rebuilds the last
// error as an errorString when g.LastError is not empty.
func (g *JobGob) FromGob(j *Job) {
	err := j.Uri.Parse(g.Uri)
	if err != nil {
		panic(err)
	}

	j.UriStr, j.Fails = g.Uri, g.Fails

	if g.LastError == "" {
		return
	}
	j.LastError = errorString(g.LastError)
}

25
release.sh Executable file
View File

@@ -0,0 +1,25 @@
#!/usr/bin/env bash
# Cross-compiles the crawler for every release target (amd64 only),
# gzips each binary and prints the name of the binary that was built.
#
# Usage: release.sh <version>

# Stop at the first failing command instead of gzipping a binary that was
# never built and carrying on with the remaining targets.
set -euo pipefail

appname="od-database-crawler"

tag="${1:-}"
if [ -z "$tag" ]; then
    echo "Usage: build <version>"
    exit 1
fi

# build GOOS SUFFIX
# Builds ${appname}-${tag}-SUFFIX for GOOS/amd64, compresses it with gzip
# (replacing any previous archive) and echoes the binary name.
build() {
    local goos="$1"
    local name="${appname}-${tag}-$2"

    GOOS="$goos" GOARCH="amd64" go build -ldflags="-s -w" -o "$name"
    gzip -f "$name"
    echo "$name"
}

build windows windows.exe
build linux   linux
build darwin  mac
build freebsd freebsd

View File

@@ -1,98 +0,0 @@
package runes
// Create concatenates the given rune slices, in order, into one slice.
// The result is nil when there is nothing to append.
func Create(rs ...[]rune) (x []rune) {
	for i := 0; i < len(rs); i++ {
		x = append(x, rs[i]...)
	}
	return
}
// IndexRune returns the index of the first occurrence of r in s,
// or -1 if r does not occur.
func IndexRune(s []rune, r rune) int {
	for i := 0; i < len(s); i++ {
		if s[i] == r {
			return i
		}
	}
	return -1
}
// LastIndexRune returns the index of the last occurrence of r in s,
// or -1 if r does not occur.
func LastIndexRune(s []rune, r rune) int {
	i := len(s) - 1
	// Walk backwards; i ends at -1 when no element matches.
	for i >= 0 && s[i] != r {
		i--
	}
	return i
}
// Equals reports whether a and b have the same length and the same runes
// in the same order.
func Equals(a, b []rune) bool {
	if len(a) != len(b) {
		return false
	}
	for i, ar := range a {
		if ar != b[i] {
			return false
		}
	}
	return true
}
// Count returns how many elements of s are equal to r.
func Count(s []rune, r rune) (n int) {
	for i := range s {
		if s[i] == r {
			n++
		}
	}
	return n
}
// HasPrefix reports whether s begins with prefix.
func HasPrefix(s, prefix []rune) bool {
	if len(prefix) > len(s) {
		return false
	}
	return Equals(s[:len(prefix)], prefix)
}
// HasSuffix reports whether s ends with suffix.
func HasSuffix(s, suffix []rune) bool {
	if len(suffix) > len(s) {
		return false
	}
	tail := s[len(s)-len(suffix):]
	return Equals(tail, suffix)
}
// TrimPrefix returns s with the leading prefix removed.
// When s does not start with prefix, s is returned as is.
func TrimPrefix(s, prefix []rune) []rune {
	if !HasPrefix(s, prefix) {
		return s
	}
	return s[len(prefix):]
}
// TrimRune returns the part of s that remains after removing every leading
// and trailing occurrence of r. The result is a subslice of s (no copy is
// made) and is nil when s is empty or consists only of r.
//
// The previous implementation was broken in three ways: the prefix loop
// kept the matched rune instead of skipping it, it returned nil whenever the
// first rune did not match, and the suffix loop counted upwards (i++) so it
// indexed past the end of the slice as soon as the last rune matched.
func TrimRune(s []rune, r rune) (out []rune) {
	start, end := 0, len(s)

	// Skip leading occurrences of r.
	for start < end && s[start] == r {
		start++
	}
	// Skip trailing occurrences of r, never crossing start.
	for end > start && s[end-1] == r {
		end--
	}

	if start == end {
		return nil
	}
	return s[start:end]
}

View File

@@ -1,44 +0,0 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package path implements utility routines for manipulating slash-separated
// paths.
//
// The path package should only be used for paths separated by forward
// slashes, such as the paths in URLs. This package does not deal with
// Windows paths with drive letters or backslashes; to manipulate
// operating system paths, use the path/filepath package.
package runespath
import (
"github.com/terorie/oddb-go/runes"
)
// Base returns the last element of path, ignoring any trailing slashes.
// An empty path yields ".", and a path made up only of slashes yields "/".
func Base(path []rune) []rune {
	if len(path) == 0 {
		return []rune(".")
	}

	// Drop trailing slashes.
	end := len(path)
	for end > 0 && path[end-1] == '/' {
		end--
	}
	path = path[:end]

	// Keep only what follows the last remaining slash.
	if slash := runes.LastIndexRune(path, '/'); slash >= 0 {
		path = path[slash+1:]
	}

	// Nothing left means the input was all slashes.
	if len(path) == 0 {
		return []rune("/")
	}
	return path
}
// IsAbs reports whether path is absolute, i.e. starts with a slash.
func IsAbs(path string) bool {
	if path == "" {
		return false
	}
	return path[0] == '/'
}

View File

@@ -2,101 +2,215 @@ package main
import ( import (
"context" "context"
"encoding/json"
"fmt"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/terorie/od-database-crawler/fasturl"
"os"
"path"
"sync"
"sync/atomic" "sync/atomic"
"time"
) )
var activeTasks int32 var activeTasksLock sync.Mutex
var totalBuffered int64 var activeTasks = make(map[uint64]bool)
var numActiveTasks int32
var totalQueued int64
func Schedule(c context.Context, remotes <-chan *OD) { func Schedule(c context.Context, remotes <-chan *OD) {
go Stats(c) go Stats(c)
for { for remote := range remotes {
select { logrus.WithField("url", remote.BaseUri.String()).
case <-c.Done(): Info("Starting crawler")
return
case remote := <-remotes: // Collect results
logrus.WithField("url", remote.BaseUri.String()). results := make(chan File)
Info("Starting crawler")
// Spawn workers remote.WCtx.OD = remote
remote.WCtx.in, remote.WCtx.out = makeJobBuffer(c)
for i := 0; i < config.Workers; i++ { // Get queue path
go remote.WCtx.Worker() queuePath := path.Join("queue", fmt.Sprintf("%d", remote.Task.WebsiteId))
// Delete existing queue
if err := os.RemoveAll(queuePath);
err != nil { panic(err) }
// Start new queue
var err error
remote.WCtx.Queue, err = OpenQueue(queuePath)
if err != nil { panic(err) }
// Spawn workers
for i := 0; i < config.Workers; i++ {
go remote.WCtx.Worker(results)
}
// Enqueue initial job
atomic.AddInt32(&numActiveTasks, 1)
remote.WCtx.queueJob(Job{
Uri: remote.BaseUri,
UriStr: remote.BaseUri.String(),
Fails: 0,
})
// Upload result when ready
go remote.Watch(results)
// Sleep if max number of tasks are active
for atomic.LoadInt32(&numActiveTasks) > config.Tasks {
select {
case <-c.Done():
return
case <-time.After(time.Second):
continue
} }
// Enqueue initial job
atomic.AddInt32(&activeTasks, 1)
remote.WCtx.queueJob(Job{
OD: remote,
Uri: remote.BaseUri,
UriStr: remote.BaseUri.String(),
Fails: 0,
})
globalWait.Done()
// Upload result when ready
go remote.Watch()
} }
} }
} }
func (r *OD) Watch() { func ScheduleTask(remotes chan<- *OD, t *Task, u *fasturl.URL) {
if !t.register() {
return
}
globalWait.Add(1)
now := time.Now()
od := &OD {
Task: *t,
BaseUri: *u,
Result: TaskResult {
WebsiteId: t.WebsiteId,
StartTime: now,
StartTimeUnix: now.Unix(),
},
}
remotes <- od
}
// register records t's website ID in activeTasks under activeTasksLock.
// It reports false, changing nothing, when that ID is already present, and
// true after adding it.
func (t *Task) register() bool {
	activeTasksLock.Lock()
	defer activeTasksLock.Unlock()

	_, alreadyActive := activeTasks[t.WebsiteId]
	if alreadyActive {
		return false
	}
	activeTasks[t.WebsiteId] = true
	return true
}
// unregister removes t's website ID from activeTasks under activeTasksLock.
func (t *Task) unregister() {
	activeTasksLock.Lock()
	defer activeTasksLock.Unlock()

	delete(activeTasks, t.WebsiteId)
}
// Watch sees one crawl through to the end: it opens the results file
// crawled/<WebsiteId>.json, lets handleCollect stream the results channel
// into it, and then uploads the file with PushResult. The results file is
// closed and deleted before Watch returns, whatever the outcome, and the
// task is always unregistered and marked done on globalWait.
func (o *OD) Watch(results chan File) {
	// Mark job as completely done
	defer globalWait.Done()
	defer o.Task.unregister()

	filePath := path.Join("crawled", fmt.Sprintf("%d.json", o.Task.WebsiteId))

	// Open crawl results file
	f, err := os.OpenFile(
		filePath,
		os.O_CREATE|os.O_RDWR|os.O_TRUNC,
		0644,
	)
	if err != nil {
		logrus.WithError(err).
			Error("Failed saving crawl results")
		return
	}
	// Deferred calls run last-in-first-out, so the removal is deferred
	// first to make it run AFTER the close. Removing a file that is still
	// open fails on Windows and would leave the results file behind.
	defer os.Remove(filePath)
	defer f.Close()

	// Listen for exit code of Collect()
	collectErrC := make(chan error)

	// Block until all results are written
	// (closes results channel)
	o.handleCollect(results, f, collectErrC)

	// Exit code of Collect()
	err = <-collectErrC
	close(collectErrC)
	if err != nil {
		logrus.WithError(err).
			Error("Failed saving crawl results")
		return
	}

	// Upload results
	err = PushResult(&o.Result, f)
	if err != nil {
		logrus.WithError(err).
			Error("Failed uploading crawl results")
		return
	}
}
func (o *OD) handleCollect(results chan File, f *os.File, collectErrC chan error) {
// Begin collecting results
go o.Task.Collect(results, f, collectErrC)
defer close(results)
// Wait for all jobs on remote to finish // Wait for all jobs on remote to finish
r.Wait.Wait() o.Wait.Wait()
close(r.WCtx.in)
atomic.AddInt32(&activeTasks, -1)
logrus.WithField("url", r.BaseUri.String()). // Close queue
Info("Crawler finished") if err := o.WCtx.Queue.Close(); err != nil {
} panic(err)
func makeJobBuffer(c context.Context) (chan<- Job, <-chan Job) {
in := make(chan Job)
out := make(chan Job)
go bufferJobs(c, in, out)
return in, out
}
func bufferJobs(c context.Context, in chan Job, out chan Job) {
defer close(out)
var inQueue []Job
outCh := func() chan Job {
if len(inQueue) == 0 {
return nil
}
return out
} }
for len(inQueue) > 0 || in != nil { atomic.AddInt32(&numActiveTasks, -1)
if len(inQueue) == 0 {
select { // Log finish
case v, ok := <-in:
if !ok { logrus.WithFields(logrus.Fields{
in = nil "id": o.Task.WebsiteId,
} else { "url": o.BaseUri.String(),
atomic.AddInt64(&totalBuffered, 1) "duration": time.Since(o.Result.StartTime),
inQueue = append(inQueue, v) }).Info("Crawler finished")
}
case <-c.Done(): // Set status code
return now := time.Now()
} o.Result.EndTimeUnix = now.Unix()
fileCount := atomic.LoadUint64(&o.Result.FileCount)
if fileCount == 0 {
errorCount := atomic.LoadUint64(&o.Result.ErrorCount)
if errorCount == 0 {
o.Result.StatusCode = "empty"
} else { } else {
select { o.Result.StatusCode = "directory listing failed"
case v, ok := <-in:
if !ok {
in = nil
} else {
atomic.AddInt64(&totalBuffered, 1)
inQueue = append(inQueue, v)
}
case outCh() <- inQueue[0]:
atomic.AddInt64(&totalBuffered, -1)
inQueue = inQueue[1:]
case <-c.Done():
return
}
} }
} else {
o.Result.StatusCode = "success"
} }
} }
// Collect runs collect over results and f, logs a failure, and then sends
// collect's error (nil on success) on errC. The send blocks until the
// receiver takes the value.
func (t *Task) Collect(results chan File, f *os.File, errC chan<- error) {
	saveErr := t.collect(results, f)
	if saveErr != nil {
		logrus.WithError(saveErr).Error("Failed saving crawl results")
	}
	errC <- saveErr
}
// collect reads File entries from results until the channel is closed and
// writes each one to f as a single line of JSON, after path-unescaping its
// Path and Name. It returns the first write error, or nil.
//
// After a write error the remaining results are still received (and
// discarded) rather than returning at once: abandoning the channel would
// leave every sender blocked on it, so the crawl could never finish.
func (t *Task) collect(results chan File, f *os.File) error {
	var firstErr error
	for result := range results {
		if firstErr != nil {
			// Already failed: keep draining, write nothing more.
			continue
		}

		result.Path = fasturl.PathUnescape(result.Path)
		result.Name = fasturl.PathUnescape(result.Name)

		resJson, err := json.Marshal(result)
		if err != nil {
			panic(err)
		}

		// Write the record and its newline in one call.
		resJson = append(resJson, '\n')
		if _, err = f.Write(resJson); err != nil {
			firstErr = err
		}
	}
	return firstErr
}

170
server.go
View File

@@ -5,39 +5,34 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/spf13/viper"
"io" "io"
"mime/multipart" "mime/multipart"
"net/http" "net/http"
"net/url" "net/url"
"os" "os"
"path/filepath"
"strconv" "strconv"
"strings" "time"
) )
const ( var serverClient = http.Client {
fileListChunkSize int64 = 5000000 // 5 mb Timeout: config.ServerTimeout,
) }
var serverClient = http.DefaultClient
func FetchTask() (t *Task, err error) { func FetchTask() (t *Task, err error) {
escToken, _ := json.Marshal(config.Token) res, err := serverClient.PostForm(
payload := `{"token":` + string(escToken) + `}`
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/get", config.ServerUrl + "/task/get",
strings.NewReader(payload)) url.Values{ "token": {config.Token} })
if err != nil { return }
res, err := serverClient.Do(req)
if err != nil { return } if err != nil { return }
defer res.Body.Close() defer res.Body.Close()
if res.StatusCode != 200 { switch res.StatusCode {
err = fmt.Errorf("http %s", res.Status) case 200:
return break
case 404, 500:
return nil, nil
default:
return nil, fmt.Errorf("http %s", res.Status)
} }
t = new(Task) t = new(Task)
@@ -47,21 +42,17 @@ func FetchTask() (t *Task, err error) {
return return
} }
func PushResult(result *TaskResult) (err error) { func PushResult(result *TaskResult, f *os.File) (err error) {
filePath := filepath.Join( if result.WebsiteId == 0 {
".", "crawled", // Not a real result, don't push
fmt.Sprintf("%d.json", result.WebsiteId)) return nil
}
defer os.Remove(filePath) // Rewind to the beginning of the file
_, err = f.Seek(0, 0)
f, err := os.Open(filePath) if err != nil {
if os.IsNotExist(err) {
err = fmt.Errorf("cannot upload result: %s does not exist", filePath)
return
} else if err != nil {
return return
} }
defer f.Close()
err = uploadChunks(result.WebsiteId, f) err = uploadChunks(result.WebsiteId, f)
if err != nil { if err != nil {
@@ -73,104 +64,109 @@ func PushResult(result *TaskResult) (err error) {
return return
} }
err = uploadResult(result) // Upload result ignoring errors
if err != nil { uploadResult(result)
logrus.Errorf("Failed to upload result: %s", err)
err2 := CancelTask(result.WebsiteId)
if err2 != nil {
logrus.Error(err2)
}
return
}
return return
} }
func uploadChunks(websiteId uint64, f *os.File) (err error) { func uploadChunks(websiteId uint64, f *os.File) error {
for iter := 1; iter > 0; iter++ { eof := false
for iter := 1; !eof; iter++ {
// TODO Stream with io.Pipe? // TODO Stream with io.Pipe?
var b bytes.Buffer var b bytes.Buffer
multi := multipart.NewWriter(&b) multi := multipart.NewWriter(&b)
// Set upload fields // Set upload fields
var err error
err = multi.WriteField("token", config.Token) err = multi.WriteField("token", config.Token)
if err != nil { return } if err != nil { return err }
err = multi.WriteField("website_id", fmt.Sprintf("%d", websiteId)) err = multi.WriteField("website_id", fmt.Sprintf("%d", websiteId))
if err != nil { return } if err != nil { return err }
// Copy chunk to file_list // Copy chunk to file_list
formFile, err := multi.CreateFormFile("file_list", "file_list") formFile, err := multi.CreateFormFile("file_list", "file_list")
_, err = io.CopyN(formFile, f, fileListChunkSize) var n int64
if err == io.EOF { n, err = io.CopyN(formFile, f, config.ChunkSize)
break if err != io.EOF && err != nil {
} else if err == io.ErrUnexpectedEOF { return err
}
if n == 0 {
// Don't upload, no content
return nil
} else if n < config.ChunkSize {
err = nil err = nil
// Break at end of iteration // Break at end of iteration
iter = -420 eof = true
} }
req, err := http.NewRequest( multi.Close()
http.MethodPost,
config.ServerUrl + "/task/upload",
&b)
if err != nil { return err }
res, err := serverClient.Do(req) for retries := 0; retries < viper.GetInt(ConfUploadRetries); retries++ {
if err != nil { return err } if retries > 0 {
res.Body.Close() // Error occurred, retry upload
time.Sleep(viper.GetDuration(ConfUploadRetryInterval))
}
if res.StatusCode != http.StatusOK { req, err := http.NewRequest(
return fmt.Errorf("failed to upload list part %d: %s", http.MethodPost,
iter, res.Status) config.ServerUrl + "/task/upload",
&b)
req.Header.Set("content-type", multi.FormDataContentType())
if err != nil { continue }
res, err := serverClient.Do(req)
if err != nil { continue }
res.Body.Close()
if res.StatusCode != http.StatusOK {
logrus.WithField("status", res.Status).
WithField("part", iter).
Errorf("Upload failed")
continue
}
// Upload successful
break
} }
logrus.Infof("Uploading file list part %d: %s", logrus.WithField("id", websiteId).
iter, res.Status) WithField("part", iter).
Infof("Uploaded files chunk")
} }
return return nil
} }
func uploadResult(result *TaskResult) (err error) { func uploadResult(result *TaskResult) (err error) {
resultEnc, err := json.Marshal(result) resultEnc, err := json.Marshal(result)
if err != nil { panic(err) } if err != nil { panic(err) }
payload := url.Values { res, err := serverClient.PostForm(
"token": {config.Token},
"result": {string(resultEnc)},
}.Encode()
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/complete", config.ServerUrl + "/task/complete",
strings.NewReader(payload)) url.Values {
if err != nil { return } "token": {config.Token},
"result": {string(resultEnc)},
res, err := serverClient.Do(req) },
)
if err != nil { return } if err != nil { return }
res.Body.Close() res.Body.Close()
if res.StatusCode != http.StatusOK { if res.StatusCode != http.StatusOK {
return fmt.Errorf("failed to cancel task: %s", res.Status) return HttpError{res.StatusCode}
} }
return return
} }
func CancelTask(websiteId uint64) (err error) { func CancelTask(websiteId uint64) (err error) {
form := url.Values{ res, err := serverClient.PostForm(
"token": {config.Token},
"website_id": {strconv.FormatUint(websiteId, 10)},
}
encForm := form.Encode()
req, err := http.NewRequest(
http.MethodPost,
config.ServerUrl + "/task/cancel", config.ServerUrl + "/task/cancel",
strings.NewReader(encForm)) url.Values{
if err != nil { return } "token": {config.Token},
"website_id": {strconv.FormatUint(websiteId, 10)},
res, err := serverClient.Do(req) },
)
if err != nil { return } if err != nil { return }
res.Body.Close() res.Body.Close()

View File

@@ -39,6 +39,10 @@ func Stats(c context.Context) {
perSecond = math.Round(perSecond) perSecond = math.Round(perSecond)
perSecond /= 2 perSecond /= 2
if perSecond <= 0 {
continue
}
logrus.WithFields(logrus.Fields{ logrus.WithFields(logrus.Fields{
"per_second": perSecond, "per_second": perSecond,
"done": atomic.LoadUint64(&totalDone), "done": atomic.LoadUint64(&totalDone),
@@ -53,7 +57,7 @@ func Stats(c context.Context) {
runtime.ReadMemStats(&mem) runtime.ReadMemStats(&mem)
logrus.WithFields(logrus.Fields{ logrus.WithFields(logrus.Fields{
"queue_count": totalBuffered, "queue_count": atomic.LoadInt64(&totalQueued),
"heap": FormatByteCount(mem.Alloc), "heap": FormatByteCount(mem.Alloc),
"objects": mem.HeapObjects, "objects": mem.HeapObjects,
"num_gc": mem.NumGC, "num_gc": mem.NumGC,

View File

@@ -1,16 +0,0 @@
package main
import "time"
// Task describes one website to process: its numeric ID and its URL.
// The struct tags map the fields to the JSON keys "website_id" and "url".
// NOTE(review): WebsiteId is an int here but a uint64 in TaskResult — confirm
// whether the two are meant to share a type.
type Task struct {
WebsiteId int `json:"website_id"`
Url string `json:"url"`
}
// TaskResult is the summary record for one website; the struct tags give the
// JSON key used for each field.
type TaskResult struct {
// presumably an HTTP-style status for the whole crawl — TODO confirm against the producer
StatusCode int `json:"status_code"`
// presumably the number of files found — TODO confirm
FileCount uint64 `json:"file_count"`
// StartTime and EndTime presumably bracket the crawl — TODO confirm
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
// NOTE(review): uint64 here, int in Task.WebsiteId — confirm this is intended
WebsiteId uint64 `json:"website_id"`
}

22
util.go
View File

@@ -1,6 +1,9 @@
package main package main
import "fmt" import (
"fmt"
"sync"
)
// https://programming.guide/go/formatting-byte-size-to-human-readable-format.html // https://programming.guide/go/formatting-byte-size-to-human-readable-format.html
func FormatByteCount(b uint64) string { func FormatByteCount(b uint64) string {
@@ -16,3 +19,20 @@ func FormatByteCount(b uint64) string {
return fmt.Sprintf("%.1f %ciB", float64(b)/float64(div), "KMGTPE"[exp]) return fmt.Sprintf("%.1f %ciB", float64(b)/float64(div), "KMGTPE"[exp])
} }
} }
// Hooks is an ordered list of callbacks that can be registered from
// multiple goroutines. The zero value is an empty list ready for use.
type Hooks struct {
	m sync.Mutex // guards l
	l []func()   // callbacks in registration order
}

// Add appends hook to the list. It is safe to call concurrently with
// other Add calls and with Execute.
func (h *Hooks) Add(hook func()) {
	h.m.Lock()
	defer h.m.Unlock()
	h.l = append(h.l, hook)
}

// Execute runs every hook registered so far, in registration order.
//
// The list is snapshotted under the lock and the lock is released before
// any hook runs, so reading the list does not race with a concurrent Add
// and a hook may itself call Add without deadlocking. Hooks added while
// Execute is running are not part of the current pass.
func (h *Hooks) Execute() {
	h.m.Lock()
	pending := make([]func(), len(h.l))
	copy(pending, h.l)
	h.m.Unlock()

	for _, hook := range pending {
		hook()
	}
}

121
worker.go
View File

@@ -1,8 +1,12 @@
package main package main
import ( import (
"github.com/beeker1121/goque"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/valyala/fasthttp"
"math" "math"
"sort"
"strings"
"sync" "sync"
"sync/atomic" "sync/atomic"
"time" "time"
@@ -11,24 +15,38 @@ import (
var globalWait sync.WaitGroup var globalWait sync.WaitGroup
type WorkerContext struct { type WorkerContext struct {
in chan<- Job OD *OD
out <-chan Job Queue *BufferedQueue
lastRateLimit time.Time lastRateLimit time.Time
numRateLimits int numRateLimits int
} }
func (w WorkerContext) Worker() { func (w *WorkerContext) Worker(results chan<- File) {
for job := range w.out { for {
w.step(job) job, err := w.Queue.Dequeue()
switch err {
case goque.ErrEmpty:
time.Sleep(500 * time.Millisecond)
continue
case goque.ErrDBClosed:
return
case nil:
w.step(results, job)
default:
panic(err)
}
} }
} }
func (w WorkerContext) step(job Job) { func (w *WorkerContext) step(results chan<- File, job Job) {
defer w.finishJob(&job) defer w.finishJob(&job)
var f File var f File
newJobs, err := DoJob(&job, &f) newJobs, err := w.DoJob(&job, &f)
atomic.AddUint64(&totalStarted, 1) atomic.AddUint64(&totalStarted, 1)
if err == ErrKnown { if err == ErrKnown {
return return
@@ -37,9 +55,14 @@ func (w WorkerContext) step(job Job) {
if err != nil { if err != nil {
job.Fails++ job.Fails++
if err == ErrForbidden { if httpErr, ok := err.(*HttpError); ok {
// Don't attempt crawling again switch httpErr.code {
return case fasthttp.StatusTooManyRequests:
err = ErrRateLimit
default:
// Don't retry HTTP error codes
return
}
} }
if job.Fails > config.Retries { if job.Fails > config.Retries {
@@ -62,18 +85,22 @@ func (w WorkerContext) step(job Job) {
w.queueJob(job) w.queueJob(job)
} }
job.OD.Files = append(job.OD.Files, f) if !f.IsDir {
results <- f
}
} }
func DoJob(job *Job, f *File) (newJobs []Job, err error) { func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) {
if len(job.Uri.Path) == 0 { return } if len(job.Uri.Path) == 0 { return }
if job.Uri.Path[len(job.Uri.Path)-1] == '/' { if job.Uri.Path[len(job.Uri.Path)-1] == '/' {
// Load directory // Load directory
links, err := GetDir(job, f) links, err := GetDir(job, f)
if err != nil { if err != nil {
logrus.WithError(err). if !isErrSilent(err) {
WithField("url", job.UriStr). logrus.WithError(err).
Error("Failed getting dir") WithField("url", job.UriStr).
Error("Failed to crawl dir")
}
return nil, err return nil, err
} }
@@ -81,45 +108,58 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
hash := f.HashDir(links) hash := f.HashDir(links)
// Skip symlinked dirs // Skip symlinked dirs
if job.OD.LoadOrStoreKey(&hash) { if w.OD.LoadOrStoreKey(&hash) {
return nil, ErrKnown return nil, ErrKnown
} }
// Sort by path
sort.Slice(links, func(i, j int) bool {
return strings.Compare(links[i].Path, links[j].Path) < 0
})
var newJobCount int
var lastLink string
for _, link := range links { for _, link := range links {
// Skip already queued links uriStr := link.String()
//if _, old := job.OD.Scanned.LoadOrStore(link, true); old {
// continue // Ignore dupes
//} if uriStr == lastLink {
job.OD.Wait.Add(1) continue
}
lastLink = uriStr
newJobs = append(newJobs, Job{ newJobs = append(newJobs, Job{
OD: job.OD,
Uri: link, Uri: link,
UriStr: link.String(), UriStr: uriStr,
Fails: 0, Fails: 0,
}) })
newJobCount++
} }
if config.Verbose { if config.Verbose {
logrus.WithFields(logrus.Fields{ logrus.WithFields(logrus.Fields{
"url": job.UriStr, "url": job.UriStr,
"files": len(links), "files": newJobCount,
}).Debug("Listed") }).Debug("Listed")
} }
} else { } else {
// Load file // Load file
err := GetFile(job.Uri, f) err := GetFile(job.Uri, f)
if err != nil { if err != nil {
logrus.WithError(err). if !isErrSilent(err) {
WithField("url", job.Uri.String()). logrus.WithError(err).
Error("Failed getting file") WithField("url", job.UriStr).
Error("Failed to crawl file")
}
return nil, err return nil, err
} }
atomic.AddUint64(&w.OD.Result.FileCount, 1)
} }
return return
} }
func (w WorkerContext) queueJob(job Job) { func (w *WorkerContext) queueJob(job Job) {
job.OD.Wait.Add(1) w.OD.Wait.Add(1)
globalWait.Add(1)
if w.numRateLimits > 0 { if w.numRateLimits > 0 {
if time.Since(w.lastRateLimit) > 5 * time.Second { if time.Since(w.lastRateLimit) > 5 * time.Second {
@@ -127,14 +167,23 @@ func (w WorkerContext) queueJob(job Job) {
} else { } else {
time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) * time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) *
100 * time.Millisecond) 100 * time.Millisecond)
w.in <- job
} }
} else { }
w.in <- job
if err := w.Queue.Enqueue(&job); err != nil {
panic(err)
} }
} }
func (w WorkerContext) finishJob(job *Job) { func (w *WorkerContext) finishJob(job *Job) {
job.OD.Wait.Done() w.OD.Wait.Done()
globalWait.Done() }
func isErrSilent(err error) bool {
if !config.PrintHTTP {
if _, ok := err.(*HttpError); ok {
return true
}
}
return false
} }