mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-18 18:06:45 +00:00
Hash links
This commit is contained in:
parent
ed5e35f005
commit
a8c27b2d21
8
crawl.go
8
crawl.go
@ -156,6 +156,14 @@ func (f *File) HashDir(links []fasturl.URL) (o redblackhash.Key) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func HashString(s string) (o redblackhash.Key) {
|
||||||
|
h, _ := blake2b.New256(nil)
|
||||||
|
h.Write([]byte(s))
|
||||||
|
sum := h.Sum(nil)
|
||||||
|
copy(o[:redblackhash.KeySize], sum)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
func (f *File) ParseHeader(h []byte) {
|
func (f *File) ParseHeader(h []byte) {
|
||||||
var k1, k2 int
|
var k1, k2 int
|
||||||
var v1, v2 int
|
var v1, v2 int
|
||||||
|
10
worker.go
10
worker.go
@ -86,15 +86,17 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for _, link := range links {
|
for _, link := range links {
|
||||||
|
uriStr := link.String()
|
||||||
// Skip already queued links
|
// Skip already queued links
|
||||||
//if _, old := job.OD.Scanned.LoadOrStore(link, true); old {
|
linkHash := HashString(uriStr)
|
||||||
// continue
|
if job.OD.LoadOrStoreKey(&linkHash) {
|
||||||
//}
|
continue
|
||||||
|
}
|
||||||
job.OD.Wait.Add(1)
|
job.OD.Wait.Add(1)
|
||||||
newJobs = append(newJobs, Job{
|
newJobs = append(newJobs, Job{
|
||||||
OD: job.OD,
|
OD: job.OD,
|
||||||
Uri: link,
|
Uri: link,
|
||||||
UriStr: link.String(),
|
UriStr: uriStr,
|
||||||
Fails: 0,
|
Fails: 0,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user