From a8c27b2d2120c8ca298868c91b99586528c4eeee Mon Sep 17 00:00:00 2001
From: Richard Patel
Date: Tue, 6 Nov 2018 02:01:53 +0100
Subject: [PATCH] Hash links

---
 crawl.go  |  8 ++++++++
 worker.go | 10 ++++++----
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/crawl.go b/crawl.go
index 805449a..476cc62 100644
--- a/crawl.go
+++ b/crawl.go
@@ -156,6 +156,14 @@ func (f *File) HashDir(links []fasturl.URL) (o redblackhash.Key) {
 	return
 }
 
+func HashString(s string) (o redblackhash.Key) {
+	h, _ := blake2b.New256(nil)
+	h.Write([]byte(s))
+	sum := h.Sum(nil)
+	copy(o[:redblackhash.KeySize], sum)
+	return
+}
+
 func (f *File) ParseHeader(h []byte) {
 	var k1, k2 int
 	var v1, v2 int
diff --git a/worker.go b/worker.go
index bbc1b9e..55ca425 100644
--- a/worker.go
+++ b/worker.go
@@ -86,15 +86,17 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
 	}
 
 	for _, link := range links {
+		uriStr := link.String()
 		// Skip already queued links
-		//if _, old := job.OD.Scanned.LoadOrStore(link, true); old {
-		//	continue
-		//}
+		linkHash := HashString(uriStr)
+		if job.OD.LoadOrStoreKey(&linkHash) {
+			continue
+		}
 		job.OD.Wait.Add(1)
 		newJobs = append(newJobs, Job{
 			OD:     job.OD,
 			Uri:    link,
-			UriStr: link.String(),
+			UriStr: uriStr,
 			Fails:  0,
 		})
 	}
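
Below is a minimal sketch of the deduplication technique this patch introduces: each link URL is hashed with BLAKE2b-256 into a fixed-size key, and a link is only queued if its key has not been stored before. The sketch assumes golang.org/x/crypto/blake2b for hashing and substitutes a plain map for the project's redblackhash tree and job.OD.LoadOrStoreKey, so it illustrates the idea rather than the crawler's actual storage.

package main

import (
	"fmt"

	"golang.org/x/crypto/blake2b"
)

// seen holds the BLAKE2b-256 digest of every URL queued so far.
// The patch keeps such keys in a red-black tree (redblackhash) behind
// job.OD.LoadOrStoreKey; a map stands in for that here.
var seen = make(map[[blake2b.Size256]byte]struct{})

// loadOrStore reports whether uriStr was already queued and, if not,
// records its hash so later duplicates are skipped.
func loadOrStore(uriStr string) (old bool) {
	key := blake2b.Sum256([]byte(uriStr))
	if _, ok := seen[key]; ok {
		return true
	}
	seen[key] = struct{}{}
	return false
}

func main() {
	links := []string{
		"http://example.com/files/a/",
		"http://example.com/files/b/",
		"http://example.com/files/a/", // duplicate: skipped
	}
	for _, link := range links {
		if loadOrStore(link) {
			continue // already queued, mirrors the patch's continue
		}
		fmt.Println("queueing", link)
	}
}

Storing a fixed 32-byte digest per link rather than the full URL string keeps the dedup set's memory usage bounded per entry, which is presumably why the patch hashes uriStr before handing it to LoadOrStoreKey.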