Mirror of https://github.com/terorie/od-database-crawler.git (synced 2025-12-13 15:19:03 +00:00)
Compare commits
27 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 24d9d1fd42 | |
| | f3be76e001 | |
| | 4ef4ab13a8 | |
| | 25d0b0042c | |
| | ef7d17cad4 | |
| | e919323169 | |
| | a3aebe4ef2 | |
| | acbfd78a5d | |
| | fe1e7bf261 | |
| | c6d7fad8e8 | |
| | 0b20823ae1 | |
| | 8d68bf1bbc | |
| | a83eb0cfd7 | |
| | b18b70f798 | |
| | 9d5f549774 | |
| | 5239af08f7 | |
| | 46c0e0bd32 | |
| | 0ca6deede8 | |
| | 120c026983 | |
| | 527e8895ec | |
| | 108fff0503 | |
| | e5746baa5b | |
| | 17ba5583c9 | |
| | 92a8c07f4a | |
| | 43f96c6988 | |
| | b244cdae80 | |
| | 4b8275c7bf | |

(Author and date columns were not captured by the mirror.)
.travis.yml | 5 (new file)

@@ -0,0 +1,5 @@
language: go

go:
- "1.11.x"
- master
README.md | 15

@@ -1,4 +1,5 @@
 # od-database Go crawler 🚀
+[![Build Status](https://travis-ci.org/terorie/od-database-crawler.svg?branch=master)](https://travis-ci.org/terorie/od-database-crawler)
 > by terorie 2018 :P
 
 * Crawler for [__OD-Database__](https://github.com/simon987/od-database)
@@ -7,3 +8,17 @@
 * Lightweight and fast: __over 9000 requests per second__ on a standard laptop
 
 https://od-db.the-eye.eu/
+
+#### Usage
+
+1. With Config File (if `config.yml` found in working dir)
+   - Download [default config](https://github.com/terorie/od-database-crawler/blob/master/config.yml)
+   - Set `server.url` and `server.token`
+   - Start with `./od-database-crawler server --config <file>`
+
+2. With Flags or env
+   - Override config file if it exists
+   - `--help` for list of flags
+   - Every flag is available as an environment variable:
+     `--server.crawl_stats` ➡️ `OD_SERVER_CRAWL_STATS`
+   - Start with `./od-database-crawler server <flags>`
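Editor's note: combining the defaults added below in config.go, an environment-only launch could look like `OD_SERVER_URL=http://od-db.the-eye.eu/api OD_SERVER_TOKEN=<token> ./od-database-crawler server`, where the token value is a placeholder.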
config.go | 137

@@ -4,6 +4,7 @@ import (
 	"bufio"
 	"fmt"
 	"github.com/sirupsen/logrus"
+	"github.com/spf13/pflag"
 	"github.com/spf13/viper"
 	"io"
 	"os"
@@ -21,13 +22,13 @@ var config struct {
 	Workers int
 	UserAgent string
 	Tasks int32
-	CrawlStats time.Duration
-	AllocStats time.Duration
 	Verbose bool
 	PrintHTTP bool
 	JobBufferSize int
 }
 
+var onlineMode bool
+
 const (
 	ConfServerUrl = "server.url"
 	ConfToken = "server.token"
@@ -45,6 +46,7 @@ const (
 	ConfDialTimeout = "crawl.dial_timeout"
 	ConfTimeout = "crawl.timeout"
 	ConfJobBufferSize = "crawl.job_buffer"
+	ConfResume = "crawl.resume"
 
 	ConfCrawlStats = "output.crawl_stats"
 	ConfAllocStats = "output.resource_stats"
@@ -54,43 +56,108 @@ const (
 )
 
 func prepareConfig() {
-	viper.SetDefault(ConfRetries, 5)
-	viper.SetDefault(ConfWorkers, 2)
-	viper.SetDefault(ConfTasks, 3)
-	viper.SetDefault(ConfUserAgent, "")
-	viper.SetDefault(ConfDialTimeout, 10 * time.Second)
-	viper.SetDefault(ConfTimeout, 60 * time.Second)
-	viper.SetDefault(ConfJobBufferSize, 5000)
-	viper.SetDefault(ConfCrawlStats, 3 * time.Second)
-	viper.SetDefault(ConfAllocStats, 0)
-	viper.SetDefault(ConfVerbose, false)
-	viper.SetDefault(ConfPrintHTTP, false)
-	viper.SetDefault(ConfLogFile, "")
-	viper.SetDefault(ConfRecheck, 3 * time.Second)
-	viper.SetDefault(ConfCooldown, 30 * time.Second)
-	viper.SetDefault(ConfChunkSize, "1 MB")
-	viper.SetDefault(ConfUploadRetries, 10)
-	viper.SetDefault(ConfUploadRetryInterval, 30 * time.Second)
+	pf := rootCmd.PersistentFlags()
+
+	pf.SortFlags = false
+	pf.StringVar(&configFile, "config", "", "Config file")
+	configFile = os.Getenv("OD_CONFIG")
+
+	pf.String(ConfServerUrl, "http://od-db.the-eye.eu/api", "OD-DB server URL")
+	pf.String(ConfToken, "", "OD-DB access token (env OD_SERVER_TOKEN)")
+	pf.Duration(ConfServerTimeout, 60 * time.Second, "OD-DB request timeout")
+	pf.Duration(ConfRecheck, 1 * time.Second, "OD-DB: Poll interval for new jobs")
+	pf.Duration(ConfCooldown, 30 * time.Second, "OD-DB: Time to wait after a server-side error")
+	pf.String(ConfChunkSize, "1 MB", "OD-DB: Result upload chunk size")
+	pf.Uint(ConfUploadRetries, 10, "OD-DB: Max upload retries")
+	pf.Duration(ConfUploadRetryInterval, 30 * time.Second, "OD-DB: Time to wait between upload retries")
+	pf.Uint(ConfTasks, 100, "Crawler: Max concurrent tasks")
+	pf.Uint(ConfWorkers, 4, "Crawler: Connections per server")
+	pf.Uint(ConfRetries, 5, "Crawler: Request retries")
+	pf.Duration(ConfDialTimeout, 10 * time.Second, "Crawler: Handshake timeout")
+	pf.Duration(ConfTimeout, 30 * time.Second, "Crawler: Request timeout")
+	pf.String(ConfUserAgent, "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0", "Crawler: User-Agent")
+	pf.Uint(ConfJobBufferSize, 5000, "Crawler: Task queue cache size")
+	pf.Duration(ConfResume, 72 * time.Hour, "Crawler: Resume tasks not older than x")
+	pf.Duration(ConfCrawlStats, time.Second, "Log: Crawl stats interval")
+	pf.Duration(ConfAllocStats, 10 * time.Second, "Log: Resource stats interval")
+	pf.Bool(ConfVerbose, false, "Log: Print every listed dir")
+	pf.Bool(ConfPrintHTTP, false, "Log: Print HTTP client errors")
+	pf.String(ConfLogFile, "crawler.log", "Log file")
+
+	// Bind all flags to Viper
+	pf.VisitAll(func(flag *pflag.Flag) {
+		s := flag.Name
+		s = strings.TrimLeft(s, "-")
+
+		if err := viper.BindPFlag(s, flag); err != nil {
+			panic(err)
+		}
+
+		var envKey string
+		envKey = strings.Replace(s, ".", "_", -1)
+		envKey = strings.ToUpper(envKey)
+		envKey = "OD_" + envKey
+		if err := viper.BindEnv(s, envKey); err != nil {
+			panic(err)
+		}
+	})
 }
 
 func readConfig() {
-	viper.AddConfigPath(".")
-	viper.SetConfigName("config")
-	err := viper.ReadInConfig()
-	if err != nil {
-		fmt.Fprintln(os.Stderr, err)
-		os.Exit(1)
-	}
+	// If config.yml in working dir, use it
+	if configFile == "" {
+		_, err := os.Stat("config.yml")
+		if err == nil {
+			configFile = "config.yml"
+		}
+	}
 
-	config.ServerUrl = viper.GetString(ConfServerUrl)
-	if config.ServerUrl == "" {
-		configMissing(ConfServerUrl)
-	}
-	config.ServerUrl = strings.TrimRight(config.ServerUrl, "/")
+	if configFile != "" {
+		confF, err := os.Open(configFile)
+		if err != nil {
+			fmt.Fprintln(os.Stderr, err)
+			os.Exit(1)
+		}
+		defer confF.Close()
 
-	config.Token = viper.GetString(ConfToken)
-	if config.Token == "" {
-		configMissing(ConfToken)
-	}
+		viper.SetConfigType("yml")
+		err = viper.ReadConfig(confF)
+		if err != nil {
+			fmt.Fprintln(os.Stderr, err)
+			os.Exit(1)
+		}
+	}
+
+	if onlineMode {
+		config.ServerUrl = viper.GetString(ConfServerUrl)
+		if config.ServerUrl == "" {
+			configMissing(ConfServerUrl)
+		}
+		config.ServerUrl = strings.TrimRight(config.ServerUrl, "/")
+
+		config.Token = viper.GetString(ConfToken)
+		if config.Token == "" {
+			configMissing(ConfToken)
+		}
+	}
 
 	config.ServerTimeout = viper.GetDuration(ConfServerTimeout)
@@ -125,10 +192,6 @@ func readConfig() {
 
 	config.JobBufferSize = viper.GetInt(ConfJobBufferSize)
 
-	config.CrawlStats = viper.GetDuration(ConfCrawlStats)
-
-	config.AllocStats = viper.GetDuration(ConfAllocStats)
-
 	config.Verbose = viper.GetBool(ConfVerbose)
 	if config.Verbose {
 		logrus.SetLevel(logrus.DebugLevel)
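The VisitAll hook above makes the flag-to-environment mapping purely mechanical. A minimal standalone sketch of the same derivation (envKeyFor is an illustrative helper, not part of the crawler):

package main

import (
	"fmt"
	"strings"
)

// envKeyFor mirrors the binding loop in prepareConfig: strip leading
// dashes, replace dots with underscores, upper-case, prefix "OD_".
func envKeyFor(flagName string) string {
	s := strings.TrimLeft(flagName, "-")
	s = strings.Replace(s, ".", "_", -1)
	return "OD_" + strings.ToUpper(s)
}

func main() {
	fmt.Println(envKeyFor("server.crawl_stats")) // OD_SERVER_CRAWL_STATS
}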
crawl.go | 14

@@ -58,6 +58,10 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
 	}
 
 	body := res.Body()
+	return ParseDir(body, &j.Uri)
+}
+
+func ParseDir(body []byte, baseUri *fasturl.URL) (links []fasturl.URL, err error) {
 	doc := html.NewTokenizer(bytes.NewReader(body))
 
 	var linkHref string
@@ -107,15 +111,15 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
 		}
 
 		var link fasturl.URL
-		err = j.Uri.ParseRel(&link, href)
+		err = baseUri.ParseRel(&link, href)
 		if err != nil {
 			continue
 		}
 
-		if link.Scheme != j.Uri.Scheme ||
-			link.Host != j.Uri.Host ||
-			link.Path == j.Uri.Path ||
-			!strings.HasPrefix(link.Path, j.Uri.Path) {
+		if link.Scheme != baseUri.Scheme ||
+			link.Host != baseUri.Host ||
+			link.Path == baseUri.Path ||
+			!strings.HasPrefix(link.Path, baseUri.Path) {
 			continue
 		}
 
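Splitting ParseDir out of GetDir also pins down the filter semantics with a concrete case: from base `https://the-eye.eu/public/`, a relative `Books/` resolves to `https://the-eye.eu/public/Books/` and is kept, while `../` resolves to `https://the-eye.eu/` and is dropped because its path no longer has the base path as a prefix; the same check rejects cross-host and cross-scheme links.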
crawl_apache2_test.go | 4766 (new file)

File diff suppressed because it is too large.
crawl_nginx_test.go | 117 (new file)

@@ -0,0 +1,117 @@
package main

import (
	"github.com/terorie/od-database-crawler/fasturl"
	"testing"
)

func TestParseDirNginx(t *testing.T) {
	var u fasturl.URL
	err := u.Parse("https://the-eye.eu/public/")
	if err != nil {
		t.Fatal("Failed to parse URL", err)
	}

	links, err := ParseDir([]byte(nginxListing), &u)
	if err != nil {
		t.Fatal("Failed to extract links", err)
	}

	if len(links) != len(nginxLinks) {
		t.Fatalf("Expected %d links, got %d",
			len(nginxLinks), len(links))
	}

	for i := 0; i < len(links); i++ {
		gotLink := links[i].String()
		expLink := nginxLinks[i]

		if gotLink != expLink {
			t.Errorf(`Expected "%s" got "%s"`,
				expLink, gotLink)
		}
	}
}

var nginxLinks = []string {
	"https://the-eye.eu/public/AppleArchive/",
	"https://the-eye.eu/public/AudioBooks/",
	"https://the-eye.eu/public/Books/",
	"https://the-eye.eu/public/Comics/",
	"https://the-eye.eu/public/Games/",
	"https://the-eye.eu/public/Icons/",
	"https://the-eye.eu/public/Images/",
	"https://the-eye.eu/public/JFK_Files/",
	"https://the-eye.eu/public/MSDN/",
	"https://the-eye.eu/public/Music/",
	"https://the-eye.eu/public/Operating%20Systems/",
	"https://the-eye.eu/public/Posters/",
	"https://the-eye.eu/public/Psychedelics/",
	"https://the-eye.eu/public/Psychoactives/",
	"https://the-eye.eu/public/Radio/",
	"https://the-eye.eu/public/Random/",
	"https://the-eye.eu/public/Site-Dumps/",
	"https://the-eye.eu/public/Software/",
	"https://the-eye.eu/public/Strategic%20Intelligence%20Network/",
	"https://the-eye.eu/public/WorldTracker.org/",
	"https://the-eye.eu/public/concen.org/",
	"https://the-eye.eu/public/freenrg.info/",
	"https://the-eye.eu/public/murdercube.com/",
	"https://the-eye.eu/public/parazite/",
	"https://the-eye.eu/public/ripreddit/",
	"https://the-eye.eu/public/rom/",
	"https://the-eye.eu/public/touhou/",
	"https://the-eye.eu/public/vns/",
	"https://the-eye.eu/public/xbins/",
	"https://the-eye.eu/public/xbins.diodematrix/",
	"https://the-eye.eu/public/Rclone_for_Scrubs.pdf",
	"https://the-eye.eu/public/Wget_Linux_Guide.pdf",
	"https://the-eye.eu/public/Wget_Windows_Guide.pdf",
	"https://the-eye.eu/public/rclone_guide.pdf",
	"https://the-eye.eu/public/wget-noobs-guide.pdf",
	"https://the-eye.eu/public/xbox-scene_Aug2014.7z",
}

const nginxListing =
`<html>
<head><title>Index of /public/</title></head>
<body bgcolor="white">
<h1>Index of /public/</h1><hr><pre><a href="../">../</a>
<a href="AppleArchive/">AppleArchive/</a> 03-Nov-2017 18:13 -
<a href="AudioBooks/">AudioBooks/</a> 29-Sep-2018 19:47 -
<a href="Books/">Books/</a> 27-Nov-2018 17:50 -
<a href="Comics/">Comics/</a> 05-Nov-2018 21:37 -
<a href="Games/">Games/</a> 28-Nov-2018 11:54 -
<a href="Icons/">Icons/</a> 22-May-2018 07:47 -
<a href="Images/">Images/</a> 21-Jan-2018 03:21 -
<a href="JFK_Files/">JFK_Files/</a> 03-Nov-2017 17:03 -
<a href="MSDN/">MSDN/</a> 03-Nov-2017 15:48 -
<a href="Music/">Music/</a> 02-Mar-2018 15:47 -
<a href="Operating%20Systems/">Operating Systems/</a> 25-Apr-2018 07:18 -
<a href="Posters/">Posters/</a> 07-Jul-2018 01:12 -
<a href="Psychedelics/">Psychedelics/</a> 11-Apr-2018 05:45 -
<a href="Psychoactives/">Psychoactives/</a> 18-May-2018 02:58 -
<a href="Radio/">Radio/</a> 09-Jun-2018 15:49 -
<a href="Random/">Random/</a> 04-Dec-2018 12:33 -
<a href="Site-Dumps/">Site-Dumps/</a> 15-Dec-2018 11:04 -
<a href="Software/">Software/</a> 27-Nov-2017 00:22 -
<a href="Strategic%20Intelligence%20Network/">Strategic Intelligence Network/</a> 17-Nov-2017 16:35 -
<a href="WorldTracker.org/">WorldTracker.org/</a> 12-Apr-2018 04:16 -
<a href="concen.org/">concen.org/</a> 08-Oct-2018 14:08 -
<a href="freenrg.info/">freenrg.info/</a> 19-Dec-2017 10:59 -
<a href="murdercube.com/">murdercube.com/</a> 06-Dec-2017 10:45 -
<a href="parazite/">parazite/</a> 20-Nov-2017 21:25 -
<a href="ripreddit/">ripreddit/</a> 04-Aug-2018 14:30 -
<a href="rom/">rom/</a> 28-Nov-2018 14:15 -
<a href="touhou/">touhou/</a> 03-Nov-2017 11:07 -
<a href="vns/">vns/</a> 03-Nov-2017 11:36 -
<a href="xbins/">xbins/</a> 03-Nov-2017 17:23 -
<a href="xbins.diodematrix/">xbins.diodematrix/</a> 21-Sep-2018 22:33 -
<a href="Rclone_for_Scrubs.pdf">Rclone_for_Scrubs.pdf</a> 04-Sep-2018 13:31 315K
<a href="Wget_Linux_Guide.pdf">Wget_Linux_Guide.pdf</a> 21-Dec-2017 20:28 168K
<a href="Wget_Windows_Guide.pdf">Wget_Windows_Guide.pdf</a> 25-Nov-2017 17:59 867K
<a href="rclone_guide.pdf">rclone_guide.pdf</a> 03-Sep-2018 23:37 315K
<a href="wget-noobs-guide.pdf">wget-noobs-guide.pdf</a> 21-Dec-2017 20:29 168K
<a href="xbox-scene_Aug2014.7z">xbox-scene_Aug2014.7z</a> 26-Oct-2017 23:09 1G
</pre><hr></body>
</html>`
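Note that the expected links keep both the listing's document order and its percent-encoding: the anchor text `Operating Systems/` is matched as `Operating%20Systems/` in the resolved URL, so the test also pins the crawler's escaping behavior.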
crawl_test.go | 59 (new file)

@@ -0,0 +1,59 @@
package main

import (
	"bytes"
	"github.com/PuerkitoBio/goquery"
	"github.com/terorie/od-database-crawler/fasturl"
	"net/url"
	"strings"
	"testing"
)

func BenchmarkParseDir(b *testing.B) {
	for n := 0; n < b.N; n++ {
		var u fasturl.URL
		err := u.Parse("http://archive.ubuntu.com/ubuntu/indices/")
		if err != nil {
			b.Fatal("Failed to parse URL", err)
		}

		_, err = ParseDir([]byte(apache2Listing), &u)
		if err != nil {
			b.Fatal("Failed to extract links", err)
		}
	}
}

func BenchmarkParseDirReference(b *testing.B) {
	for n := 0; n < b.N; n++ {
		u, err := url.Parse("http://archive.ubuntu.com/ubuntu/indices/")
		if err != nil {
			b.Fatal("Failed to parse URL", err)
		}

		_, err = referenceParseDir([]byte(apache2Listing), u)
		if err != nil {
			b.Fatal("Failed to extract links", err)
		}
	}
}

func referenceParseDir(body []byte, baseUri *url.URL) (links []*url.URL, err error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil { return nil, err }

	doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")

		sub, err := baseUri.Parse(href)
		if err != nil { return } // continue

		if !strings.HasPrefix(sub.String(), baseUri.String()) {
			return // continue
		}

		links = append(links, sub)
	})

	return
}
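BenchmarkParseDirReference serves as the baseline here: the same Apache listing is parsed with goquery plus net/url instead of the hand-rolled tokenizer and fasturl, so running the two benchmarks side by side quantifies the difference the custom parser makes.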
ds/redblackhash/redblack.go

@@ -15,7 +15,10 @@ package redblackhash
 
 import (
 	"bytes"
+	"encoding/binary"
+	"encoding/hex"
 	"fmt"
+	"io"
 	"sync"
 )
 
@@ -43,6 +46,13 @@ type Node struct {
 	Parent *Node
 }
 
+type nodeHeader struct {
+	Key *Key
+	Color color
+}
+
+var o = binary.BigEndian
+
 func (k *Key) Compare(o *Key) int {
 	return bytes.Compare(k[:], o[:])
 }
@@ -233,7 +243,7 @@ func (tree *Tree) String() string {
 }
 
 func (node *Node) String() string {
-	return fmt.Sprintf("%v", node.Key)
+	return hex.EncodeToString(node.Key[:16]) + "..."
 }
 
 func output(node *Node, prefix string, isTail bool, str *string) {
@@ -481,6 +491,119 @@ func (tree *Tree) deleteCase6(node *Node) {
 	}
 }
 
+func (tree *Tree) Marshal(w io.Writer) (err error) {
+	tree.Lock()
+	defer tree.Unlock()
+
+	err = binary.Write(w, o, uint64(0x617979797979790A))
+	if err != nil { return err }
+
+	err = marshal(tree.Root, w)
+	if err != nil { return err }
+
+	err = binary.Write(w, o, uint64(0x6C6D616F6F6F6F0A))
+	if err != nil { return err }
+
+	return nil
+}
+
+func marshal(n *Node, w io.Writer) (err error) {
+	if n == nil {
+		err = binary.Write(w, o, uint64(0x796565656565740A))
+		return err
+	}
+
+	err = binary.Write(w, o, uint64(0xF09F85B1EFB88F0A))
+	if err != nil { return err }
+
+	_, err = w.Write(n.Key[:])
+	if err != nil { return err }
+
+	var colorI uint64
+	if n.color {
+		colorI = 0x7468652D6579657C
+	} else {
+		colorI = 0x6865782B7465727C
+	}
+
+	err = binary.Write(w, o, colorI)
+	if err != nil { return err }
+
+	err = marshal(n.Left, w)
+	if err != nil { return err }
+
+	err = marshal(n.Right, w)
+	if err != nil { return err }
+
+	return nil
+}
+
+func (tree *Tree) Unmarshal(r io.Reader) (err error) {
+	tree.Lock()
+	defer tree.Unlock()
+
+	var sof uint64
+	err = binary.Read(r, o, &sof)
+	if err != nil { return err }
+	if sof != 0x617979797979790A {
+		return fmt.Errorf("redblack: wrong format")
+	}
+
+	tree.Root, tree.size, err = unmarshal(r)
+	if err != nil { return err }
+
+	var eof uint64
+	err = binary.Read(r, o, &eof)
+	if err != nil { return err }
+	if eof != 0x6C6D616F6F6F6F0A {
+		return fmt.Errorf("redblack: end of file missing")
+	}
+
+	return nil
+}
+
+func unmarshal(r io.Reader) (n *Node, size int, err error) {
+	var head uint64
+	err = binary.Read(r, o, &head)
+	if err != nil { return nil, 0, err }
+
+	size = 1
+
+	switch head {
+	case 0x796565656565740A:
+		return nil, 0, nil
+	case 0xF09F85B1EFB88F0A:
+		n = new(Node)
+
+		_, err = io.ReadFull(r, n.Key[:])
+		if err != nil { return nil, 0, err }
+
+		var colorInt uint64
+		err = binary.Read(r, o, &colorInt)
+		if err != nil { return nil, 0, err }
+		switch colorInt {
+		case 0x7468652D6579657C:
+			n.color = true
+		case 0x6865782B7465727C:
+			n.color = false
+		default:
+			return nil, 0, fmt.Errorf("redblack: corrupt node color")
+		}
+	default:
+		return nil, 0, fmt.Errorf("redblack: corrupt node info")
+	}
+
+	var s2 int
+	n.Left, s2, err = unmarshal(r)
+	size += s2
+	if err != nil { return nil, 0, err }
+	n.Right, s2, err = unmarshal(r)
+	size += s2
+	if err != nil { return nil, 0, err }
+
+	return n, size, nil
+}
+
 func nodeColor(node *Node) color {
 	if node == nil {
 		return black
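Editor's note: the eight-byte magic values above are ASCII/UTF-8 tags written big-endian, so they appear literally in the marshaled file. Decoded: 0x617979797979790A is "ayyyyyy\n" (file header), 0x6C6D616F6F6F6F0A is "lmaoooo\n" (trailer), 0x796565656565740A is "yeeeeet\n" (nil node), 0xF09F85B1EFB88F0A is "🅱️\n" (node header), and 0x7468652D6579657C / 0x6865782B7465727C spell "the-eye|" and "hex+ter|" for the two color values.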
ds/redblackhash/redblack_test.go | 47 (new file)

@@ -0,0 +1,47 @@
package redblackhash

import (
	"bytes"
	"math/rand"
	"testing"
)

func TestTree_Marshal(t *testing.T) {
	var t1, t2 Tree

	// Generate 1000 random values to insert
	for i := 0; i < 1000; i++ {
		var key Key
		rand.Read(key[:])
		t1.Put(&key)
	}

	// Marshal tree
	var wr bytes.Buffer
	err := t1.Marshal(&wr)
	if err != nil {
		t.Error(err)
		t.FailNow()
	}

	buf := wr.Bytes()
	rd := bytes.NewBuffer(buf)

	// Unmarshal tree
	err = t2.Unmarshal(rd)
	if err != nil {
		t.Error(err)
		t.FailNow()
	}

	if !compare(t1.Root, t2.Root) {
		t.Error("trees are not equal")
		t.FailNow()
	}
}

func compare(n1, n2 *Node) bool {
	return n1.Key.Compare(&n2.Key) == 0 &&
		(n1.Left == nil || compare(n1.Left, n2.Left)) &&
		(n1.Right == nil || compare(n1.Right, n2.Right))
}
errors.go | 28

@@ -3,6 +3,8 @@ package main
 import (
 	"errors"
 	"fmt"
+	"github.com/valyala/fasthttp"
+	"net"
 )
 
 var ErrRateLimit = errors.New("too many requests")
@@ -15,3 +17,29 @@ type HttpError struct {
 func (e HttpError) Error() string {
 	return fmt.Sprintf("http status %d", e.code)
 }
+
+func shouldRetry(err error) bool {
+	// HTTP errors
+	if httpErr, ok := err.(*HttpError); ok {
+		switch httpErr.code {
+		case fasthttp.StatusTooManyRequests:
+			return true
+		default:
+			// Don't retry HTTP error codes
+			return false
+		}
+	}
+
+	if dnsError, ok := err.(*net.DNSError); ok {
+		// Don't retry permanent DNS errors
+		return dnsError.IsTemporary
+	}
+
+	if netErr, ok := err.(*net.OpError); ok {
+		// Don't retry permanent network errors
+		return netErr.Temporary()
+	}
+
+	// Retry by default
+	return true
+}
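shouldRetry lets call sites separate transient from permanent failures without inspecting error types themselves. A minimal sketch of a caller-side retry loop (doRequest is a hypothetical stand-in for the real fasthttp call in worker.go):

// fetchWithRetries retries transient failures and bails out early on
// permanent ones, as classified by shouldRetry above.
func fetchWithRetries(url string, maxRetries int, doRequest func(string) error) (err error) {
	for i := 0; i <= maxRetries; i++ {
		if err = doRequest(url); err == nil {
			return nil
		}
		if !shouldRetry(err) {
			return err // permanent error: give up immediately
		}
	}
	return err
}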
go.mod | 14 (new file)

@@ -0,0 +1,14 @@
module github.com/syndtr/od-database-crawler

require (
	github.com/beeker1121/goque v2.0.1+incompatible
	github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db // indirect
	github.com/sirupsen/logrus v1.3.0
	github.com/spf13/cobra v0.0.3
	github.com/spf13/viper v1.3.1
	github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2 // indirect
	github.com/terorie/od-database-crawler v1.1.1
	github.com/valyala/fasthttp v1.1.0
	golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613
	golang.org/x/net v0.0.0-20180911220305-26e67e76b6c3
)
go.sum | 66 (new file)

@@ -0,0 +1,66 @@
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
github.com/beeker1121/goque v2.0.1+incompatible h1:5nJHPMqQLxUvGFc8m/NW2QzxKyc0zICmqs/JUsmEjwE=
github.com/beeker1121/goque v2.0.1+incompatible/go.mod h1:L6dOWBhDOnxUVQsb0wkLve0VCnt2xJW/MI8pdRX4ANw=
github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk=
github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I=
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db h1:woRePGFeVFfLKN/pOkfl+p/TAqKOfFu+7KPlMVpok/w=
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
github.com/klauspost/compress v1.4.0 h1:8nsMz3tWa9SWWPL60G1V6CUsf4lLjWLTNEtibhe8gh8=
github.com/klauspost/compress v1.4.0/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
github.com/klauspost/cpuid v0.0.0-20180405133222-e7e905edc00e h1:+lIPJOWl+jSiJOc70QXJ07+2eg2Jy2EC7Mi11BWujeM=
github.com/klauspost/cpuid v0.0.0-20180405133222-e7e905edc00e/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/magiconair/properties v1.8.0 h1:LLgXmsheXeRoUOBOjtwPQCWIYqM/LU1ayDtDePerRcY=
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
github.com/mitchellh/mapstructure v1.1.2 h1:fmNYVwqnSfB9mZU6OS2O6GsXM+wcskZDuKQzvN1EDeE=
github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
github.com/pelletier/go-toml v1.2.0 h1:T5zMGML61Wp+FlcbWjRDT7yAxhJNAiPPLOFECq181zc=
github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/sirupsen/logrus v1.3.0 h1:hI/7Q+DtNZ2kINb6qt/lS+IyXnHQe9e90POfeewL/ME=
github.com/sirupsen/logrus v1.3.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
github.com/spf13/afero v1.1.2 h1:m8/z1t7/fwjysjQRYbP0RD+bUIF/8tJwPdEZsI83ACI=
github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ=
github.com/spf13/cast v1.3.0 h1:oget//CVOEoFewqQxwr0Ej5yjygnqGkvggSE/gB35Q8=
github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE=
github.com/spf13/cobra v0.0.3 h1:ZlrZ4XsMRm04Fr5pSFxBgfND2EBVa1nLpiy1stUsX/8=
github.com/spf13/cobra v0.0.3/go.mod h1:1l0Ry5zgKvJasoi3XT1TypsSe7PqH0Sj9dhYf7v3XqQ=
github.com/spf13/jwalterweatherman v1.0.0 h1:XHEdyB+EcvlqZamSM4ZOMGlc93t6AcsBEu9Gc1vn7yk=
github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo=
github.com/spf13/pflag v1.0.3 h1:zPAT6CGy6wXeQ7NtTnaTerfKOsV6V6F8agHXFiazDkg=
github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
github.com/spf13/viper v1.3.1 h1:5+8j8FTpnFV4nEImW/ofkzEt8VoOiLXxdYIDsB73T38=
github.com/spf13/viper v1.3.1/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2 h1:GnOzE5fEFN3b2zDhJJABEofdb51uMRNb8eqIVtdducs=
github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2/go.mod h1:Z4AUp2Km+PwemOoO/VB5AOx9XSsIItzFjoJlOSiYmn0=
github.com/terorie/od-database-crawler v1.1.1 h1:Ca+ZqbZX3rVWBR8SDRzvroyxjBtUs75MQXZ9YG0gqGo=
github.com/terorie/od-database-crawler v1.1.1/go.mod h1:vVJ7pLkudrlUNp9qu24JCzQ8N6mFsrOmX1tPXr155DQ=
github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/valyala/fasthttp v1.1.0 h1:3BohG7mqwj4lq7PTX//7gLbUlzNvZSPmuHFnloXT0lw=
github.com/valyala/fasthttp v1.1.0/go.mod h1:4vX61m6KN+xDduDNwXrhIAVZaZaZiQ1luJk8LWSxF3s=
github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio=
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613 h1:MQ/ZZiDsUapFFiMS+vzwXkCTeEKaum+Do5rINYJDmxc=
golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/net v0.0.0-20180911220305-26e67e76b6c3 h1:czFLhve3vsQetD6JOJ8NZZvGQIXlnN3/yXxbT6/awxI=
golang.org/x/net v0.0.0-20180911220305-26e67e76b6c3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a h1:1n5lsVfiQW3yfsRGu98756EH1YthsFqr/5mxHduZW2A=
golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
help.go | 15 (new file)

@@ -0,0 +1,15 @@
package main

const helpText =
`HTTP crawler for the OD-Database
 DB >> https://od-db.the-eye.eu <<
 Crawler >> https://github.com/terorie/od-database-crawler <<
 Server >> https://github.com/simon987/od-database <<

Quick start:
 - get config file (config.yml in working dir)
 - get OD-DB server ("server.url": Database URL + /api)
 - get access token ("server.token": e.g. c010b6dd-20...)
 - ./od-database-crawler server

Questions? Discord @terorie#2664 / Telegram @terorie`
main.go | 136

@@ -2,11 +2,13 @@ package main
 
 import (
 	"context"
+	"fmt"
 	"github.com/sirupsen/logrus"
+	"github.com/spf13/cobra"
 	"github.com/spf13/viper"
 	"github.com/terorie/od-database-crawler/fasturl"
-	"github.com/urfave/cli"
 	"os"
+	"os/signal"
 	"strings"
 	"sync/atomic"
 	"time"
@@ -14,77 +16,90 @@ import (
 
 var configFile string
 
-var app = cli.App {
-	Name: "od-database-crawler",
-	Usage: "OD-Database Go crawler",
-	Version: "1.1.1",
-	BashComplete: cli.DefaultAppComplete,
-	Writer: os.Stdout,
-	Action: cmdBase,
-	Commands: []cli.Command {
-		{
-			Name: "crawl",
-			Usage: "Crawl a list of URLs",
-			ArgsUsage: "<site>",
-			Action: cmdCrawler,
-		},
-	},
-	Flags: []cli.Flag {
-		cli.StringFlag {
-			Name: "config",
-			EnvVar: "CONFIG",
-			Destination: &configFile,
-		},
-	},
-	Before: func(i *cli.Context) error {
-		if configFile != "" {
-			viper.SetConfigFile(configFile)
-		}
-		return nil
-	},
-	After: func(i *cli.Context) error {
+var rootCmd = cobra.Command {
+	Use: "od-database-crawler",
+	Version: "1.2.1",
+	Short: "OD-Database Go crawler",
+	Long: helpText,
+	PersistentPreRunE: preRun,
+	PersistentPostRun: func(cmd *cobra.Command, args []string) {
 		exitHooks.Execute()
-		return nil
 	},
 }
 
+var serverCmd = cobra.Command {
+	Use: "server",
+	Short: "Start crawl server",
+	Long: "Connect to the OD-Database and contribute to the database\n" +
+		"by crawling the web for open directories!",
+	Run: cmdBase,
+}
+
+var crawlCmd = cobra.Command {
+	Use: "crawl",
+	Short: "Crawl an URL",
+	Long: "Crawl the URL specified.\n" +
+		"Results will not be uploaded to the database,\n" +
+		"they're saved under crawled/0.json instead.\n" +
+		"Primarily used for testing and benchmarking.",
+	RunE: cmdCrawler,
+	Args: cobra.ExactArgs(1),
+}
+
 var exitHooks Hooks
 
-func main() {
+func init() {
+	rootCmd.AddCommand(&crawlCmd)
+	rootCmd.AddCommand(&serverCmd)
+
+	prepareConfig()
+}
+
+func preRun(cmd *cobra.Command, args []string) error {
 	if err := os.MkdirAll("crawled", 0755);
 		err != nil { panic(err) }
 
 	if err := os.MkdirAll("queue", 0755);
 		err != nil { panic(err) }
 
-	readConfig()
-	app.Run(os.Args)
+	return nil
 }
 
-func cmdBase(_ *cli.Context) error {
-	// TODO Graceful shutdown
-	appCtx := context.Background()
-	forceCtx := context.Background()
+func main() {
+	err := rootCmd.Execute()
+	if err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+}
+
+func cmdBase(_ *cobra.Command, _ []string) {
+	onlineMode = true
+	readConfig()
+
+	appCtx, soft := context.WithCancel(context.Background())
+	forceCtx, hard := context.WithCancel(context.Background())
+	go hardShutdown(forceCtx)
+	go listenCtrlC(soft, hard)
 
 	inRemotes := make(chan *OD)
-	go Schedule(forceCtx, inRemotes)
+	go LoadResumeTasks(inRemotes)
+	go Schedule(appCtx, inRemotes)
 
 	ticker := time.NewTicker(config.Recheck)
 	defer ticker.Stop()
 	for {
 		select {
 		case <-appCtx.Done():
-			return nil
+			goto shutdown
 		case <-ticker.C:
 			t, err := FetchTask()
 			if err != nil {
 				logrus.WithError(err).
 					Error("Failed to get new task")
-				time.Sleep(viper.GetDuration(ConfCooldown))
+				if !sleep(viper.GetDuration(ConfCooldown), appCtx) {
+					goto shutdown
+				}
 				continue
 			}
 			if t == nil {
@@ -118,15 +133,15 @@ func cmdBase(_ *cli.Context) error {
 		}
 	}
 
-	return nil
+shutdown:
+	globalWait.Wait()
 }
 
-func cmdCrawler(clic *cli.Context) error {
-	if clic.NArg() != 1 {
-		cli.ShowCommandHelpAndExit(clic, "crawl", 1)
-	}
+func cmdCrawler(_ *cobra.Command, args []string) error {
+	onlineMode = false
+	readConfig()
 
-	arg := clic.Args()[0]
+	arg := args[0]
 	// https://github.com/golang/go/issues/19779
 	if !strings.Contains(arg, "://") {
 		arg = "http://" + arg
@@ -158,3 +173,30 @@ func cmdCrawler(clic *cli.Context) error {
 
 	return nil
 }
+
+func listenCtrlC(soft, hard context.CancelFunc) {
+	c := make(chan os.Signal)
+	signal.Notify(c, os.Interrupt)
+
+	<-c
+	logrus.Info(">>> Shutting down crawler... <<<")
+	soft()
+
+	<-c
+	logrus.Warning(">>> Force shutdown! <<<")
+	hard()
+}
+
+func hardShutdown(c context.Context) {
+	<-c.Done()
+	os.Exit(1)
+}
+
+func sleep(d time.Duration, c context.Context) bool {
+	select {
+	case <-time.After(d):
+		return true
+	case <-c.Done():
+		return false
+	}
+}
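The shutdown path is two-staged: the first SIGINT cancels appCtx, so the scheduler loop jumps to the shutdown label, globalWait drains, and running tasks get a chance to persist their pause state; a second SIGINT cancels forceCtx, which hardShutdown turns into an immediate os.Exit(1).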
model.go | 20

@@ -3,7 +3,6 @@ package main
 import (
 	"github.com/terorie/od-database-crawler/ds/redblackhash"
 	"github.com/terorie/od-database-crawler/fasturl"
-	"sync"
 	"time"
 )
 
@@ -30,12 +29,19 @@ type Job struct {
 }
 
 type OD struct {
-	Task Task
-	Result TaskResult
-	Wait sync.WaitGroup
-	BaseUri fasturl.URL
-	WCtx WorkerContext
-	Scanned redblackhash.Tree
+	Task Task
+	Result TaskResult
+	InProgress int64
+	BaseUri fasturl.URL
+	WCtx WorkerContext
+	Scanned redblackhash.Tree
 }
 
+type PausedOD struct {
+	Task *Task
+	Result *TaskResult
+	BaseUri *fasturl.URL
+	InProgress int64
+}
+
 type File struct {
resume.go | 270 (new file)

@@ -0,0 +1,270 @@
package main

import (
	"bytes"
	"encoding/binary"
	"encoding/gob"
	"fmt"
	"github.com/beeker1121/goque"
	"github.com/sirupsen/logrus"
	"github.com/spf13/viper"
	"io"
	"os"
	"path/filepath"
	"strconv"
	"sync/atomic"
	"time"
)

func init() {
	gob.Register(&PausedOD{})
}

func LoadResumeTasks(inRemotes chan<- *OD) {
	resumed, err := ResumeTasks()
	if err != nil {
		logrus.WithError(err).
			Error("Failed to resume queued tasks. " +
				"/queue is probably corrupt")
		err = nil
	}

	for _, remote := range resumed {
		inRemotes <- remote
	}
}

func ResumeTasks() (tasks []*OD, err error) {
	// Get files in /queue
	var queueF *os.File
	var entries []os.FileInfo
	queueF, err = os.Open("queue")
	if err != nil { return nil, err }
	defer queueF.Close()
	entries, err = queueF.Readdir(-1)
	if err != nil { return nil, err }

	resumeDur := viper.GetDuration(ConfResume)

	for _, entry := range entries {
		if !entry.IsDir() { continue }

		// Check if name is a number
		var id uint64
		if id, err = strconv.ParseUint(entry.Name(), 10, 64); err != nil {
			continue
		}

		// Too old to be resumed
		timeDelta := time.Since(entry.ModTime())
		if resumeDur >= 0 && timeDelta > resumeDur {
			removeOldQueue(id)
			continue
		}

		// Load queue
		var od *OD
		if od, err = resumeQueue(id); err != nil {
			logrus.WithError(err).
				WithField("id", id).
				Warning("Failed to load paused task")
			continue
		} else if od == nil {
			removeOldQueue(id)
			continue
		}

		tasks = append(tasks, od)
	}

	return tasks, nil
}

func SaveTask(od *OD) (err error) {
	dir := filepath.Join("queue",
		strconv.FormatUint(od.Task.WebsiteId, 10))

	fPath := filepath.Join(dir, "PAUSED")

	err = os.Mkdir(dir, 0777)
	if err != nil { return err }

	// Open pause file
	pausedF, err := os.OpenFile(fPath, os.O_CREATE | os.O_WRONLY | os.O_TRUNC, 0666)
	if err != nil { return err }
	defer pausedF.Close()

	err = writePauseFile(od, pausedF)
	if err != nil { return err }

	return nil
}

func resumeQueue(id uint64) (od *OD, err error) {
	logrus.WithField("id", id).
		Info("Found unfinished")

	fPath := filepath.Join("queue", strconv.FormatUint(id, 10))

	// Try to find pause file
	pausedF, err := os.Open(filepath.Join(fPath, "PAUSED"))
	if os.IsNotExist(err) {
		// No PAUSED file => not paused
		// not paused => no error
		return nil, nil
	} else if err != nil {
		return nil, err
	}
	defer pausedF.Close()

	od = new(OD)
	od.WCtx.OD = od

	err = readPauseFile(od, pausedF)
	if err != nil { return nil, err }

	// Open queue
	bq, err := OpenQueue(fPath)
	if err != nil { return nil, err }

	od.WCtx.Queue = bq

	logrus.WithField("id", id).
		Info("Resuming task")

	return od, nil
}

func removeOldQueue(id uint64) {
	if id == 0 {
		// TODO Make custom crawl less of an ugly hack
		return
	}

	logrus.WithField("id", id).
		Warning("Deleting & returning old task")

	name := strconv.FormatUint(id, 10)

	fPath := filepath.Join("queue", name)

	// Acquire old queue
	q, err := goque.OpenQueue(fPath)
	if err != nil {
		// Queue lock exists, don't delete
		logrus.WithField("err", err).
			WithField("path", fPath).
			Error("Failed to acquire old task")
		return
	}

	// Delete old queue from disk
	err = q.Drop()
	if err != nil {
		// Queue lock exists, don't delete
		logrus.WithField("err", err).
			WithField("path", fPath).
			Error("Failed to delete old task")
		return
	}

	// Delete old crawl result from disk
	_ = os.Remove(filepath.Join("crawled", name + ".json"))

	// Return task to server
	if err := CancelTask(id); err != nil {
		logrus.WithField("err", err).
			WithField("id", id).
			Warning("Failed to return unfinished task to server")
		return
	}
}

func writePauseFile(od *OD, w io.Writer) (err error) {
	// Write pause file version
	_, err = w.Write([]byte("ODPAUSE-"))
	if err != nil { return err }

	// Create save state
	paused := PausedOD {
		Task: &od.Task,
		Result: &od.Result,
		BaseUri: &od.BaseUri,
		InProgress: atomic.LoadInt64(&od.InProgress),
	}

	// Prepare pause settings
	var b bytes.Buffer
	pauseEnc := gob.NewEncoder(&b)
	err = pauseEnc.Encode(&paused)
	if err != nil { return err }

	// Write length of pause settings
	err = binary.Write(w, binary.LittleEndian, uint64(b.Len()))
	if err != nil { return err }

	// Write pause settings
	_, err = w.Write(b.Bytes())
	if err != nil { return err }

	// Write pause scan state
	err = od.Scanned.Marshal(w)
	if err != nil { return err }

	// Save mark
	_, err = w.Write([]byte("--------"))
	if err != nil { return err }

	return nil
}

func readPauseFile(od *OD, r io.Reader) (err error) {
	// Make the paused struct point to OD fields
	// So gob loads values into the OD struct
	paused := PausedOD {
		Task: &od.Task,
		Result: &od.Result,
		BaseUri: &od.BaseUri,
	}

	var version [8]byte
	_, err = io.ReadFull(r, version[:])
	if err != nil { return err }
	if !bytes.Equal(version[:], []byte("ODPAUSE-")) {
		return fmt.Errorf("unsupported pause file")
	}

	// Read pause settings len
	var pauseSettingsLen uint64
	err = binary.Read(r, binary.LittleEndian, &pauseSettingsLen)
	if err != nil { return err }

	// Read pause settings
	pauseDec := gob.NewDecoder(io.LimitReader(r, int64(pauseSettingsLen)))
	err = pauseDec.Decode(&paused)
	if err != nil { return err }
	atomic.StoreInt64(&od.InProgress, paused.InProgress)

	err = readPauseStateTree(od, r)
	if err != nil {
		return fmt.Errorf("failed to read state tree: %s", err)
	}

	return nil
}

func readPauseStateTree(od *OD, r io.Reader) (err error) {
	// Read pause scan state
	err = od.Scanned.Unmarshal(r)
	if err != nil { return err }

	// Check mark
	var mark [8]byte
	_, err = io.ReadFull(r, mark[:])
	if err != nil { return err }
	if !bytes.Equal(mark[:], []byte("--------")) {
		return fmt.Errorf("corrupt pause file")
	}

	return nil
}
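On disk a PAUSED file is therefore: the 8-byte magic `ODPAUSE-`, a little-endian uint64 giving the gob payload length, the gob-encoded PausedOD of that length, the marshaled Scanned tree, and the 8-byte end mark `--------`.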
resume_test.go | 48 (new file)

@@ -0,0 +1,48 @@
package main

import (
	"bytes"
	"github.com/terorie/od-database-crawler/fasturl"
	"testing"
	"time"
)

func TestResumeTasks_Empty(t *testing.T) {
	start := time.Now().Add(-1 * time.Minute)
	od := OD {
		Task: Task {
			WebsiteId: 213,
			Url: "https://the-eye.eu/public/",
		},
		Result: TaskResult {
			StartTime: start,
			StartTimeUnix: start.Unix(),
			EndTimeUnix: time.Now().Unix(),
			WebsiteId: 213,
		},
		InProgress: 0,
		BaseUri: fasturl.URL {
			Scheme: fasturl.SchemeHTTPS,
			Host: "the-eye.eu",
			Path: "/public/",
		},
	}
	od.WCtx.OD = &od

	var b bytes.Buffer
	var err error
	err = writePauseFile(&od, &b)
	if err != nil {
		t.Fatal(err)
	}

	buf := b.Bytes()

	var od2 OD

	b2 := bytes.NewBuffer(buf)
	err = readPauseFile(&od2, b2)
	if err != nil {
		t.Fatal(err)
	}
}
scheduler.go | 147

@@ -22,54 +22,57 @@ func Schedule(c context.Context, remotes <-chan *OD) {
 	go Stats(c)
 
 	for remote := range remotes {
-		logrus.WithField("url", remote.BaseUri.String()).
-			Info("Starting crawler")
-
-		// Collect results
-		results := make(chan File)
-
-		remote.WCtx.OD = remote
-
-		// Get queue path
-		queuePath := path.Join("queue", fmt.Sprintf("%d", remote.Task.WebsiteId))
-
-		// Delete existing queue
-		if err := os.RemoveAll(queuePath);
-			err != nil { panic(err) }
-
-		// Start new queue
-		var err error
-		remote.WCtx.Queue, err = OpenQueue(queuePath)
-		if err != nil { panic(err) }
-
-		// Spawn workers
-		for i := 0; i < config.Workers; i++ {
-			go remote.WCtx.Worker(results)
-		}
-
-		// Enqueue initial job
-		atomic.AddInt32(&numActiveTasks, 1)
-		remote.WCtx.queueJob(Job{
-			Uri: remote.BaseUri,
-			UriStr: remote.BaseUri.String(),
-			Fails: 0,
-		})
-
-		// Upload result when ready
-		go remote.Watch(results)
-
-		// Sleep if max number of tasks are active
-		for atomic.LoadInt32(&numActiveTasks) > config.Tasks {
-			select {
-			case <-c.Done():
-				return
-			case <-time.After(time.Second):
-				continue
-			}
+		if !scheduleNewTask(c, remote) {
+			return
 		}
 	}
 }
 
+func scheduleNewTask(c context.Context, remote *OD) bool {
+	logrus.WithField("url", remote.BaseUri.String()).
+		Info("Starting crawler")
+
+	// Collect results
+	results := make(chan File)
+
+	remote.WCtx.OD = remote
+
+	// Get queue path
+	queuePath := path.Join("queue", fmt.Sprintf("%d", remote.Task.WebsiteId))
+
+	// Delete existing queue
+	if err := os.RemoveAll(queuePath);
+		err != nil { panic(err) }
+
+	// Start new queue
+	var err error
+	remote.WCtx.Queue, err = OpenQueue(queuePath)
+	if err != nil { panic(err) }
+
+	// Spawn workers
+	remote.WCtx.SpawnWorkers(c, results, config.Workers)
+
+	// Enqueue initial job
+	atomic.AddInt32(&numActiveTasks, 1)
+	remote.WCtx.queueJob(Job{
+		Uri: remote.BaseUri,
+		UriStr: remote.BaseUri.String(),
+		Fails: 0,
+	})
+
+	// Upload result when ready
+	go remote.Watch(results)
+
+	// Sleep if max number of tasks are active
+	for atomic.LoadInt32(&numActiveTasks) > config.Tasks {
+		if !sleep(time.Second, c) {
+			break
+		}
+	}
+
+	return true
+}
@@ -117,7 +120,7 @@ func (o *OD) Watch(results chan File) {
 	// Open crawl results file
 	f, err := os.OpenFile(
 		filePath,
-		os.O_CREATE | os.O_RDWR | os.O_TRUNC,
+		os.O_CREATE | os.O_RDWR | os.O_APPEND,
 		0644,
 	)
 	if err != nil {
@@ -159,16 +162,33 @@ func (o *OD) handleCollect(results chan File, f *os.File, collectErrC chan error
 	defer close(results)
 
 	// Wait for all jobs on remote to finish
-	o.Wait.Wait()
+	for {
+		// Natural finish
+		if atomic.LoadInt64(&o.InProgress) == 0 {
+			o.onTaskFinished()
+			return
+		}
+		// Abort
+		if atomic.LoadInt32(&o.WCtx.aborted) != 0 {
+			// Wait for all workers to finish
+			o.WCtx.workers.Wait()
+			o.onTaskPaused()
+			return
+		}
+
+		time.Sleep(500 * time.Millisecond)
+	}
+}
+
+func (o *OD) onTaskFinished() {
+	defer atomic.AddInt32(&numActiveTasks, -1)
 
 	// Close queue
 	if err := o.WCtx.Queue.Close(); err != nil {
 		panic(err)
 	}
-	atomic.AddInt32(&numActiveTasks, -1)
 
 	// Log finish
-
 	logrus.WithFields(logrus.Fields{
 		"id": o.Task.WebsiteId,
 		"url": o.BaseUri.String(),
@@ -191,6 +211,37 @@ func (o *OD) handleCollect(results chan File, f *os.File, collectErrC chan error
 	}
 }
 
+func (o *OD) onTaskPaused() {
+	defer atomic.AddInt32(&numActiveTasks, -1)
+
+	// Close queue
+	if err := o.WCtx.Queue.Close(); err != nil {
+		panic(err)
+	}
+
+	// Set current end time
+	o.Result.EndTimeUnix = time.Now().Unix()
+
+	// Save task metadata
+	err := SaveTask(o)
+	if err != nil {
+		// Log finish
+		logrus.WithFields(logrus.Fields{
+			"err": err.Error(),
+			"id": o.Task.WebsiteId,
+			"url": o.BaseUri.String(),
+		}).Error("Failed to save crawler state")
+		return
+	}
+
+	// Log finish
+	logrus.WithFields(logrus.Fields{
+		"id": o.Task.WebsiteId,
+		"url": o.BaseUri.String(),
+		"duration": time.Since(o.Result.StartTime),
+	}).Info("Crawler paused")
+}
+
 func (t *Task) Collect(results chan File, f *os.File, errC chan<- error) {
 	err := t.collect(results, f)
 	if err != nil {
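Replacing the old per-task sync.WaitGroup with the atomic InProgress counter is what makes pausing possible: a WaitGroup can only be blocked on, while the counter can be polled in the same loop as the aborted flag and serialized into the pause file for an exact resume. Switching the results file from O_TRUNC to O_APPEND is part of the same change, so a resumed task extends its earlier output instead of wiping it.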
server.go | 10

@@ -17,8 +17,11 @@ import (
 
 var serverClient = http.Client {
 	Timeout: config.ServerTimeout,
+	Transport: new(ServerTripper),
 }
 
+var serverUserAgent = "od-database-crawler/" + rootCmd.Version
+
 func FetchTask() (t *Task, err error) {
 	res, err := serverClient.PostForm(
 		config.ServerUrl + "/task/get",
@@ -176,3 +179,10 @@ func CancelTask(websiteId uint64) (err error) {
 
 	return
 }
+
+type ServerTripper struct{}
+
+func (t *ServerTripper) RoundTrip(req *http.Request) (res *http.Response, err error) {
+	req.Header.Set("User-Agent", serverUserAgent)
+	return http.DefaultTransport.RoundTrip(req)
+}
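ServerTripper implements http.RoundTripper, so every request issued through serverClient now carries the crawler's User-Agent in one place instead of each call site setting the header itself.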
stats.go | 14

@@ -3,6 +3,7 @@ package main
 import (
 	"context"
 	"github.com/sirupsen/logrus"
+	"github.com/spf13/viper"
 	"math"
 	"runtime"
 	"sync/atomic"
@@ -19,11 +20,14 @@ func Stats(c context.Context) {
 	var crawlTicker <-chan time.Time
 	var allocTicker <-chan time.Time
 
-	if config.CrawlStats != 0 {
-		crawlTicker = time.NewTicker(config.CrawlStats).C
+	crawlInterval := viper.GetDuration(ConfCrawlStats)
+	allocInterval := viper.GetDuration(ConfAllocStats)
+
+	if crawlInterval != 0 {
+		crawlTicker = time.Tick(crawlInterval)
 	}
-	if config.AllocStats != 0 {
-		allocTicker = time.NewTicker(config.AllocStats).C
+	if allocInterval != 0 {
+		allocTicker = time.Tick(allocInterval)
 	}
 
 	for {
@@ -32,7 +36,7 @@ func Stats(c context.Context) {
 			startedNow := atomic.LoadUint64(&totalStarted)
 
 			perSecond := float64(startedNow - startedLast) /
-				config.CrawlStats.Seconds()
+				crawlInterval.Seconds()
 
 			// Round to .5
 			perSecond *= 2
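One standard-library caveat worth noting: unlike time.NewTicker, time.Tick gives no way to stop the underlying ticker, which is acceptable here only because Stats runs for the whole process lifetime.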
worker.go | 45

@@ -1,9 +1,9 @@
 package main
 
 import (
+	"context"
 	"github.com/beeker1121/goque"
 	"github.com/sirupsen/logrus"
-	"github.com/valyala/fasthttp"
 	"math"
 	"sort"
 	"strings"
@@ -19,10 +19,29 @@ type WorkerContext struct {
 	Queue *BufferedQueue
 	lastRateLimit time.Time
 	numRateLimits int
+	workers sync.WaitGroup
+	aborted int32
 }
 
-func (w *WorkerContext) Worker(results chan<- File) {
+func (w *WorkerContext) SpawnWorkers(c context.Context, results chan<- File, n int) {
+	w.workers.Add(n)
+	for i := 0; i < n; i++ {
+		go w.Worker(c, results)
+	}
+}
+
+func (w *WorkerContext) Worker(c context.Context, results chan<- File) {
+	defer w.workers.Done()
+
 	for {
+		select {
+		case <-c.Done():
+			// Not yet done
+			atomic.StoreInt32(&w.aborted, 1)
+			return
+		default:
+		}
+
 		job, err := w.Queue.Dequeue()
 		switch err {
 		case goque.ErrEmpty:
@@ -42,7 +61,7 @@ func (w *WorkerContext) Worker(results chan<- File) {
 }
 
 func (w *WorkerContext) step(results chan<- File, job Job) {
-	defer w.finishJob(&job)
+	defer w.finishJob()
 
 	var f File
 
@@ -55,14 +74,12 @@ func (w *WorkerContext) step(results chan<- File, job Job) {
 	if err != nil {
 		job.Fails++
 
-		if httpErr, ok := err.(*HttpError); ok {
-			switch httpErr.code {
-			case fasthttp.StatusTooManyRequests:
-				err = ErrRateLimit
-			default:
-				// Don't retry HTTP error codes
-				return
-			}
+		if !shouldRetry(err) {
+			atomic.AddUint64(&totalAborted, 1)
+			logrus.WithField("url", job.UriStr).
+				WithError(err).
+				Error("Giving up after failure")
+			return
 		}
 
 		if job.Fails > config.Retries {
@@ -159,7 +176,7 @@ func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) {
 }
 
 func (w *WorkerContext) queueJob(job Job) {
-	w.OD.Wait.Add(1)
+	atomic.AddInt64(&w.OD.InProgress, 1)
 
 	if w.numRateLimits > 0 {
 		if time.Since(w.lastRateLimit) > 5 * time.Second {
@@ -175,8 +192,8 @@ func (w *WorkerContext) queueJob(job Job) {
 	}
 }
 
-func (w *WorkerContext) finishJob(job *Job) {
-	w.OD.Wait.Done()
+func (w *WorkerContext) finishJob() {
+	atomic.AddInt64(&w.OD.InProgress, -1)
 }
 
 func isErrSilent(err error) bool {
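The select with an empty default at the top of Worker is a non-blocking cancellation probe: each iteration first peeks at c.Done(), records the abort in the aborted flag for handleCollect to observe, and otherwise falls straight through to Dequeue.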