From d69cd4400e166e519d38c1c5be690638605b8931 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Sat, 9 Feb 2019 16:46:36 +0100 Subject: [PATCH] Use fasthttp.PipelineClient --- config.go | 4 ---- crawl.go | 50 ++++++++++++++++++++++++++++++++++---------------- main.go | 2 +- scheduler.go | 3 +++ worker.go | 10 ++++++++-- 5 files changed, 46 insertions(+), 23 deletions(-) diff --git a/config.go b/config.go index b8c15ef..1ef5e6c 100644 --- a/config.go +++ b/config.go @@ -194,10 +194,6 @@ func readConfig() { config.UserAgent = viper.GetString(ConfUserAgent) - setDialTimeout(viper.GetDuration(ConfDialTimeout)) - - setTimeout(viper.GetDuration(ConfTimeout)) - config.JobBufferSize = viper.GetInt(ConfJobBufferSize) config.Verbose = viper.GetBool(ConfVerbose) diff --git a/crawl.go b/crawl.go index 3aef115..5ee17f0 100644 --- a/crawl.go +++ b/crawl.go @@ -3,6 +3,7 @@ package main import ( "bytes" "crypto/tls" + "github.com/spf13/viper" "github.com/terorie/od-database-crawler/ds/redblackhash" "github.com/terorie/od-database-crawler/fasturl" "github.com/valyala/fasthttp" @@ -15,24 +16,33 @@ import ( "time" ) -var client = fasthttp.Client { - TLSConfig: &tls.Config{ - InsecureSkipVerify: true, - }, +var tlsConfig = tls.Config { + InsecureSkipVerify: true, } -func setDialTimeout(d time.Duration) { - client.Dial = func(addr string) (net.Conn, error) { - return fasthttp.DialTimeout(addr, d) +func newHTTPClient(url *fasturl.URL) *fasthttp.PipelineClient { + var isTLS bool + switch url.Scheme { + case fasturl.SchemeHTTP: + isTLS = false + case fasturl.SchemeHTTPS: + isTLS = true + } + + return &fasthttp.PipelineClient { + MaxConns: viper.GetInt(ConfWorkers), + Addr: url.Host, + IsTLS: isTLS, + TLSConfig: &tlsConfig, + ReadTimeout: viper.GetDuration(ConfTimeout), + WriteTimeout: viper.GetDuration(ConfTimeout) / 2, + Dial: func(addr string) (conn net.Conn, e error) { + return fasthttp.DialTimeout(addr, viper.GetDuration(ConfDialTimeout)) + }, } } -func setTimeout(d time.Duration) { - client.ReadTimeout = d - client.WriteTimeout = d / 2 -} - -func GetDir(j *Job, f *File) (links []fasturl.URL, err error) { +func (w *WorkerContext) GetDir(j *Job, f *File) (links []fasturl.URL, err error) { f.IsDir = true f.Name = path.Base(j.Uri.Path) @@ -45,7 +55,7 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) { res := fasthttp.AcquireResponse() defer fasthttp.ReleaseResponse(res) - err = client.Do(req, res) + err = w.client.Do(req, res) fasthttp.ReleaseRequest(req) if err != nil { @@ -123,6 +133,14 @@ func ParseDir(body []byte, baseUri *fasturl.URL) (links []fasturl.URL, err error continue } + if strings.HasSuffix(link.Path, ".php") { + continue + } + + if strings.Contains(link.Path, "/cgi-bin/") { + continue + } + links = append(links, link) } } @@ -131,7 +149,7 @@ func ParseDir(body []byte, baseUri *fasturl.URL) (links []fasturl.URL, err error return } -func GetFile(u fasturl.URL, f *File) (err error) { +func (w *WorkerContext) GetFile(u fasturl.URL, f *File) (err error) { f.IsDir = false u.Path = path.Clean(u.Path) f.Name = path.Base(u.Path) @@ -148,7 +166,7 @@ func GetFile(u fasturl.URL, f *File) (err error) { res.SkipBody = true defer fasthttp.ReleaseResponse(res) - err = client.Do(req, res) + err = w.client.Do(req, res) fasthttp.ReleaseRequest(req) if err != nil { diff --git a/main.go b/main.go index e2d67ac..11c06b9 100644 --- a/main.go +++ b/main.go @@ -35,7 +35,7 @@ var serverCmd = cobra.Command { } var crawlCmd = cobra.Command { - Use: "crawl", + Use: "crawl ", Short: "Crawl an URL", Long: "Crawl the URL specified.\n" + "Results will not be uploaded to the database,\n" + diff --git a/scheduler.go b/scheduler.go index 9abe491..a87bb27 100644 --- a/scheduler.go +++ b/scheduler.go @@ -25,6 +25,9 @@ func Schedule(c context.Context, remotes <-chan *OD) { logrus.WithField("url", remote.BaseUri.String()). Info("Starting crawler") + // Create HTTP client + remote.WCtx.Prepare() + // Collect results results := make(chan File) diff --git a/worker.go b/worker.go index 118e28f..2bedf21 100644 --- a/worker.go +++ b/worker.go @@ -3,6 +3,7 @@ package main import ( "github.com/beeker1121/goque" "github.com/sirupsen/logrus" + "github.com/valyala/fasthttp" "math" "sort" "strings" @@ -18,6 +19,11 @@ type WorkerContext struct { Queue *BufferedQueue lastRateLimit time.Time numRateLimits int + client *fasthttp.PipelineClient +} + +func (w *WorkerContext) Prepare() { + w.client = newHTTPClient(&w.OD.BaseUri) } func (w *WorkerContext) Worker(results chan<- File) { @@ -91,7 +97,7 @@ func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) { if len(job.Uri.Path) == 0 { return } if job.Uri.Path[len(job.Uri.Path)-1] == '/' { // Load directory - links, err := GetDir(job, f) + links, err := w.GetDir(job, f) if err != nil { if !isErrSilent(err) { logrus.WithError(err). @@ -141,7 +147,7 @@ func (w *WorkerContext) DoJob(job *Job, f *File) (newJobs []Job, err error) { } } else { // Load file - err := GetFile(job.Uri, f) + err := w.GetFile(job.Uri, f) if err != nil { if !isErrSilent(err) { logrus.WithError(err).