Exclude dups in dir instead of keeping hashes of links

Richard Patel 2018-11-11 23:11:30 +01:00
parent 9c8174dd8d
commit 4c071171eb
No known key found for this signature in database
GPG Key ID: C268B2BBDA2ABECB
2 changed files with 17 additions and 12 deletions

@@ -155,14 +155,6 @@ func (f *File) HashDir(links []fasturl.URL) (o redblackhash.Key) {
 	return
 }
 
-func HashString(s string) (o redblackhash.Key) {
-	h, _ := blake2b.New256(nil)
-	h.Write([]byte(s))
-	sum := h.Sum(nil)
-	copy(o[:redblackhash.KeySize], sum)
-	return
-}
-
 func (f *File) applyContentLength(v string) {
 	if v == "" { return }
 	size, err := strconv.ParseInt(v, 10, 64)
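
For context, the deleted HashString helper hashed every discovered URL with BLAKE2b-256, and the resulting fixed-size key was kept in a shared set (job.OD.LoadOrStoreKey in the second file), so deduplication cost one key per link for the lifetime of the crawl. A minimal sketch of that old pattern, with a plain map standing in for the project's redblackhash tree and the 32-byte key size assumed from blake2b.Size256:

package main

import (
	"fmt"

	"golang.org/x/crypto/blake2b"
)

// Key mirrors redblackhash.Key: a fixed-size BLAKE2b-256 digest (32 bytes assumed).
type Key [32]byte

func hashString(s string) (o Key) {
	h, _ := blake2b.New256(nil) // error is only non-nil for a bad key length
	h.Write([]byte(s))
	copy(o[:], h.Sum(nil))
	return
}

func main() {
	seen := make(map[Key]struct{}) // stand-in for the red-black tree set
	for _, link := range []string{"/a", "/b", "/a"} {
		k := hashString(link)
		if _, dup := seen[k]; dup {
			continue // already queued earlier in the crawl
		}
		seen[k] = struct{}{}
		fmt.Println("queue", link)
	}
}

The upside of the hash set is that it catches duplicates across directories; the downside, and the motivation for this commit, is that its memory grows with every link the crawler has ever seen.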

@@ -3,6 +3,8 @@ package main
 import (
 	"github.com/sirupsen/logrus"
 	"math"
+	"sort"
+	"strings"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -85,13 +87,22 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
 			return nil, ErrKnown
 		}
 
+		// Sort by path
+		sort.Slice(links, func(i, j int) bool {
+			return strings.Compare(links[i].Path, links[j].Path) < 0
+		})
+
+		var newJobCount int
+		var lastLink string
+
 		for _, link := range links {
 			uriStr := link.String()
-			// Skip already queued links
-			linkHash := HashString(uriStr)
-			if job.OD.LoadOrStoreKey(&linkHash) {
+			// Ignore dupes
+			if uriStr == lastLink {
 				continue
 			}
+			lastLink = uriStr
+
 			job.OD.Wait.Add(1)
 			newJobs = append(newJobs, Job{
 				OD:     job.OD,
@@ -99,11 +110,13 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
 				UriStr: uriStr,
 				Fails:  0,
 			})
+			newJobCount++
 		}
+
 		if config.Verbose {
 			logrus.WithFields(logrus.Fields{
 				"url":   job.UriStr,
-				"files": len(links),
+				"files": newJobCount,
 			}).Debug("Listed")
 		}
 	} else {
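
The replacement relies on the listing being sorted: once sort.Slice orders the links by path, duplicates sit next to each other, so comparing each URL against the previous one filters them with a single string of state instead of a growing hash set. A self-contained sketch of that sort-then-compare pattern (hypothetical data, plain strings in place of fasturl.URL):

package main

import (
	"fmt"
	"sort"
	"strings"
)

func main() {
	links := []string{"/b", "/a", "/c", "/a", "/b"}

	// Sort so that equal paths become neighbours.
	sort.Slice(links, func(i, j int) bool {
		return strings.Compare(links[i], links[j]) < 0
	})

	var last string
	for _, l := range links {
		if l == last {
			continue // duplicate of the previous entry
		}
		last = l
		fmt.Println("queue", l)
	}
}

The trade-off named in the commit title: this only excludes duplicates within one directory listing, whereas the dropped hash set deduplicated globally, but it frees the crawler from holding a key for every link it has ever visited. The new newJobCount counter also makes the "files" log field report jobs actually queued rather than the raw listing size.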