From 4c071171ebe59bcfcd962a091a206043f5899d19 Mon Sep 17 00:00:00 2001
From: Richard Patel
Date: Sun, 11 Nov 2018 23:11:30 +0100
Subject: [PATCH] Exclude dups in dir instead of keeping hashes of links

---
 crawl.go  |  8 --------
 worker.go | 21 +++++++++++++++++----
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/crawl.go b/crawl.go
index f439455..50d71df 100644
--- a/crawl.go
+++ b/crawl.go
@@ -155,14 +155,6 @@ func (f *File) HashDir(links []fasturl.URL) (o redblackhash.Key) {
 	return
 }
 
-func HashString(s string) (o redblackhash.Key) {
-	h, _ := blake2b.New256(nil)
-	h.Write([]byte(s))
-	sum := h.Sum(nil)
-	copy(o[:redblackhash.KeySize], sum)
-	return
-}
-
 func (f *File) applyContentLength(v string) {
 	if v == "" { return }
 	size, err := strconv.ParseInt(v, 10, 64)
diff --git a/worker.go b/worker.go
index 55ca425..687f5f8 100644
--- a/worker.go
+++ b/worker.go
@@ -3,6 +3,8 @@ package main
 import (
 	"github.com/sirupsen/logrus"
 	"math"
+	"sort"
+	"strings"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -85,13 +87,22 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
 		return nil, ErrKnown
 	}
 
+	// Sort by path
+	sort.Slice(links, func(i, j int) bool {
+		return strings.Compare(links[i].Path, links[j].Path) < 0
+	})
+
+	var newJobCount int
+	var lastLink string
 	for _, link := range links {
 		uriStr := link.String()
-		// Skip already queued links
-		linkHash := HashString(uriStr)
-		if job.OD.LoadOrStoreKey(&linkHash) {
+
+		// Ignore dupes
+		if uriStr == lastLink {
 			continue
 		}
+		lastLink = uriStr
+
 		job.OD.Wait.Add(1)
 		newJobs = append(newJobs, Job{
 			OD:     job.OD,
@@ -99,11 +110,13 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
 			UriStr: uriStr,
 			Fails:  0,
 		})
+
+		newJobCount++
 	}
 	if config.Verbose {
 		logrus.WithFields(logrus.Fields{
 			"url":   job.UriStr,
-			"files": len(links),
+			"files": newJobCount,
 		}).Debug("Listed")
 	}
 } else {
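
A note on the technique, outside the commit itself: the removed code hashed every queued URL with BLAKE2b and stored the key in a shared red-black tree, so each link cost a hash plus a synchronized tree operation. The patch instead sorts a directory's links by path, which makes duplicates adjacent, and skips any link equal to the previous one. A minimal standalone sketch of that sort-then-compare dedup, with plain strings standing in for the crawler's fasturl.URL values (dedupSorted and the sample paths are illustrative names, not part of the repo):

package main

import (
	"fmt"
	"sort"
)

// dedupSorted sorts the listing, then keeps only entries that
// differ from their predecessor. Sorting makes duplicates
// adjacent, so one string comparison per element replaces a
// per-link hash and tree insert.
func dedupSorted(paths []string) []string {
	sort.Strings(paths)
	var out []string
	var last string
	for i, p := range paths {
		if i > 0 && p == last {
			continue // ignore dupes, as in the patched worker loop
		}
		last = p
		out = append(out, p)
	}
	return out
}

func main() {
	links := []string{"/b/", "/a/", "/b/", "/c", "/a/"}
	fmt.Println(dedupSorted(links)) // [/a/ /b/ /c]
}

The tradeoff visible in the diff: deduplication now happens only within a single directory listing, so a link that recurs across different listings is queued again, whereas the removed HashString/LoadOrStoreKey path deduplicated globally across the crawl. HashDir in crawl.go is untouched, so the per-directory hash check that returns ErrKnown still appears to cover already-seen directories.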