mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-19 18:36:43 +00:00
Exclude dups in dir instead of keeping hashes of links
This commit is contained in:
parent
9c8174dd8d
commit
4c071171eb
8
crawl.go
8
crawl.go
@ -155,14 +155,6 @@ func (f *File) HashDir(links []fasturl.URL) (o redblackhash.Key) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
func HashString(s string) (o redblackhash.Key) {
|
|
||||||
h, _ := blake2b.New256(nil)
|
|
||||||
h.Write([]byte(s))
|
|
||||||
sum := h.Sum(nil)
|
|
||||||
copy(o[:redblackhash.KeySize], sum)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
func (f *File) applyContentLength(v string) {
|
func (f *File) applyContentLength(v string) {
|
||||||
if v == "" { return }
|
if v == "" { return }
|
||||||
size, err := strconv.ParseInt(v, 10, 64)
|
size, err := strconv.ParseInt(v, 10, 64)
|
||||||
|
21
worker.go
21
worker.go
@ -3,6 +3,8 @@ package main
|
|||||||
import (
|
import (
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
"math"
|
"math"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
@ -85,13 +87,22 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
|
|||||||
return nil, ErrKnown
|
return nil, ErrKnown
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sort by path
|
||||||
|
sort.Slice(links, func(i, j int) bool {
|
||||||
|
return strings.Compare(links[i].Path, links[j].Path) < 0
|
||||||
|
})
|
||||||
|
|
||||||
|
var newJobCount int
|
||||||
|
var lastLink string
|
||||||
for _, link := range links {
|
for _, link := range links {
|
||||||
uriStr := link.String()
|
uriStr := link.String()
|
||||||
// Skip already queued links
|
|
||||||
linkHash := HashString(uriStr)
|
// Ignore dupes
|
||||||
if job.OD.LoadOrStoreKey(&linkHash) {
|
if uriStr == lastLink {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
lastLink = uriStr
|
||||||
|
|
||||||
job.OD.Wait.Add(1)
|
job.OD.Wait.Add(1)
|
||||||
newJobs = append(newJobs, Job{
|
newJobs = append(newJobs, Job{
|
||||||
OD: job.OD,
|
OD: job.OD,
|
||||||
@ -99,11 +110,13 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
|
|||||||
UriStr: uriStr,
|
UriStr: uriStr,
|
||||||
Fails: 0,
|
Fails: 0,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
newJobCount++
|
||||||
}
|
}
|
||||||
if config.Verbose {
|
if config.Verbose {
|
||||||
logrus.WithFields(logrus.Fields{
|
logrus.WithFields(logrus.Fields{
|
||||||
"url": job.UriStr,
|
"url": job.UriStr,
|
||||||
"files": len(links),
|
"files": newJobCount,
|
||||||
}).Debug("Listed")
|
}).Debug("Listed")
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user