mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-16 08:56:44 +00:00
Exclude dups in dir instead of keeping hashes of links
This commit is contained in:
parent
9c8174dd8d
commit
4c071171eb
8
crawl.go
8
crawl.go
@ -155,14 +155,6 @@ func (f *File) HashDir(links []fasturl.URL) (o redblackhash.Key) {
|
||||
return
|
||||
}
|
||||
|
||||
func HashString(s string) (o redblackhash.Key) {
|
||||
h, _ := blake2b.New256(nil)
|
||||
h.Write([]byte(s))
|
||||
sum := h.Sum(nil)
|
||||
copy(o[:redblackhash.KeySize], sum)
|
||||
return
|
||||
}
|
||||
|
||||
func (f *File) applyContentLength(v string) {
|
||||
if v == "" { return }
|
||||
size, err := strconv.ParseInt(v, 10, 64)
|
||||
|
21
worker.go
21
worker.go
@ -3,6 +3,8 @@ package main
|
||||
import (
|
||||
"github.com/sirupsen/logrus"
|
||||
"math"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
@ -85,13 +87,22 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
|
||||
return nil, ErrKnown
|
||||
}
|
||||
|
||||
// Sort by path
|
||||
sort.Slice(links, func(i, j int) bool {
|
||||
return strings.Compare(links[i].Path, links[j].Path) < 0
|
||||
})
|
||||
|
||||
var newJobCount int
|
||||
var lastLink string
|
||||
for _, link := range links {
|
||||
uriStr := link.String()
|
||||
// Skip already queued links
|
||||
linkHash := HashString(uriStr)
|
||||
if job.OD.LoadOrStoreKey(&linkHash) {
|
||||
|
||||
// Ignore dupes
|
||||
if uriStr == lastLink {
|
||||
continue
|
||||
}
|
||||
lastLink = uriStr
|
||||
|
||||
job.OD.Wait.Add(1)
|
||||
newJobs = append(newJobs, Job{
|
||||
OD: job.OD,
|
||||
@ -99,11 +110,13 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
|
||||
UriStr: uriStr,
|
||||
Fails: 0,
|
||||
})
|
||||
|
||||
newJobCount++
|
||||
}
|
||||
if config.Verbose {
|
||||
logrus.WithFields(logrus.Fields{
|
||||
"url": job.UriStr,
|
||||
"files": len(links),
|
||||
"files": newJobCount,
|
||||
}).Debug("Listed")
|
||||
}
|
||||
} else {
|
||||
|
Loading…
x
Reference in New Issue
Block a user