From 77cb45dbecb94e30d12e12263723951e082285d4 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Sun, 28 Oct 2018 18:37:18 +0100 Subject: [PATCH] Detect directory symlinks --- crawl.go | 17 ++++++++++++++--- errors.go | 8 ++++++++ main.go | 3 +++ worker.go | 17 +++++++++++++++-- 4 files changed, 40 insertions(+), 5 deletions(-) create mode 100644 errors.go diff --git a/crawl.go b/crawl.go index 7532fef..0e999e1 100644 --- a/crawl.go +++ b/crawl.go @@ -2,10 +2,11 @@ package main import ( "bytes" - "errors" + "encoding/base64" "fmt" "github.com/sirupsen/logrus" "github.com/valyala/fasthttp" + "golang.org/x/crypto/blake2b" "golang.org/x/net/html" "golang.org/x/net/html/atom" "net/url" @@ -16,8 +17,6 @@ import ( ) var client fasthttp.Client -var ErrRateLimit = errors.New("too many requests") -var ErrForbidden = errors.New("access denied") func GetDir(j *Job, f *File) (links []url.URL, err error) { f.IsDir = true @@ -146,6 +145,18 @@ func GetFile(u url.URL, f *File) (err error) { return nil } +func (f *File) HashDir(links []url.URL) string { + h, _ := blake2b.New256(nil) + h.Write([]byte(f.Name)) + for _, link := range links { + fileName := path.Base(link.Path) + h.Write([]byte(fileName)) + } + sum := h.Sum(nil) + b64sum := base64.StdEncoding.EncodeToString(sum) + return b64sum +} + func (f *File) ParseHeader(h []byte) { var k1, k2 int var v1, v2 int diff --git a/errors.go b/errors.go new file mode 100644 index 0000000..316a108 --- /dev/null +++ b/errors.go @@ -0,0 +1,8 @@ +package main + +import "errors" + +var ErrRateLimit = errors.New("too many requests") +var ErrForbidden = errors.New("access denied") +var ErrKnown = errors.New("already crawled") + diff --git a/main.go b/main.go index 86ca871..2371d82 100644 --- a/main.go +++ b/main.go @@ -2,6 +2,7 @@ package main import ( "context" + "github.com/sirupsen/logrus" "github.com/urfave/cli" "net/url" "os" @@ -69,6 +70,8 @@ func cmdCrawler(clic *cli.Context) error { // Wait for all jobs to finish globalWait.Wait() + logrus.Info("All dirs processed!") + return nil } diff --git a/worker.go b/worker.go index 49793b3..d9c52b6 100644 --- a/worker.go +++ b/worker.go @@ -31,6 +31,9 @@ func (w WorkerContext) step(job Job) { newJobs, err := DoJob(&job, &f) atomic.AddUint64(&totalStarted, 1) + if err == ErrKnown { + return + } if err != nil { job.Fails++ @@ -64,9 +67,8 @@ func (w WorkerContext) step(job Job) { } func DoJob(job *Job, f *File) (newJobs []Job, err error) { - // File if strings.HasSuffix(job.Uri.Path, "/") { - // Dir + // Load directory links, err := GetDir(job, f) if err != nil { logrus.WithError(err). @@ -74,7 +76,17 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) { Error("Failed getting dir") return nil, err } + + // Hash directory + hash := f.HashDir(links) + + // Skip symlinked dirs + if _, old := job.OD.Scanned.LoadOrStore(hash, true); old { + return nil, ErrKnown + } + for _, link := range links { + // Skip already queued links if _, old := job.OD.Scanned.LoadOrStore(link, true); old { continue } @@ -91,6 +103,7 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) { "files": len(links), }).Debug("Listed") } else { + // Load file err := GetFile(job.Uri, f) if err != nil { logrus.WithError(err).