Detect directory symlinks

This commit is contained in:
Richard Patel 2018-10-28 18:37:18 +01:00
parent fa37d45378
commit 77cb45dbec
No known key found for this signature in database
GPG Key ID: C268B2BBDA2ABECB
4 changed files with 40 additions and 5 deletions

View File

@ -2,10 +2,11 @@ package main
import ( import (
"bytes" "bytes"
"errors" "encoding/base64"
"fmt" "fmt"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/valyala/fasthttp" "github.com/valyala/fasthttp"
"golang.org/x/crypto/blake2b"
"golang.org/x/net/html" "golang.org/x/net/html"
"golang.org/x/net/html/atom" "golang.org/x/net/html/atom"
"net/url" "net/url"
@ -16,8 +17,6 @@ import (
) )
var client fasthttp.Client var client fasthttp.Client
var ErrRateLimit = errors.New("too many requests")
var ErrForbidden = errors.New("access denied")
func GetDir(j *Job, f *File) (links []url.URL, err error) { func GetDir(j *Job, f *File) (links []url.URL, err error) {
f.IsDir = true f.IsDir = true
@ -146,6 +145,18 @@ func GetFile(u url.URL, f *File) (err error) {
return nil return nil
} }
// HashDir returns a base64-encoded (standard alphabet) BLAKE2b-256
// fingerprint of a directory listing. The digest covers f.Name followed by
// the base name of every entry in links, in the order given.
//
// Every name is written to the hash with a 4-byte big-endian length prefix.
// Without the prefix the names would simply be concatenated, so different
// listings could feed identical bytes to the hash (name "a" + entry "bc"
// versus name "ab" + entry "c") and be mistaken for the same directory.
func (f *File) HashDir(links []url.URL) string {
	// New256 can only fail when the key is too long; a nil key is always
	// accepted, so the error is deliberately ignored.
	h, _ := blake2b.New256(nil)

	// writeName hashes one length-prefixed name. hash.Hash.Write never
	// returns an error, so its result is not checked.
	writeName := func(name []byte) {
		n := len(name)
		h.Write([]byte{byte(n >> 24), byte(n >> 16), byte(n >> 8), byte(n)})
		h.Write(name)
	}

	writeName([]byte(f.Name))
	for _, link := range links {
		writeName([]byte(path.Base(link.Path)))
	}

	return base64.StdEncoding.EncodeToString(h.Sum(nil))
}
func (f *File) ParseHeader(h []byte) { func (f *File) ParseHeader(h []byte) {
var k1, k2 int var k1, k2 int
var v1, v2 int var v1, v2 int

8
errors.go Normal file
View File

@ -0,0 +1,8 @@
package main
import "errors"
// Sentinel errors shared across the crawler. They are compared by identity.
var (
	// ErrRateLimit reports "too many requests".
	// NOTE(review): presumably raised on an HTTP 429 response; confirm at the call site.
	ErrRateLimit = errors.New("too many requests")

	// ErrForbidden reports "access denied".
	// NOTE(review): presumably raised on an HTTP 401/403 response; confirm at the call site.
	ErrForbidden = errors.New("access denied")

	// ErrKnown reports that the target has already been crawled.
	ErrKnown = errors.New("already crawled")
)

View File

@ -2,6 +2,7 @@ package main
import ( import (
"context" "context"
"github.com/sirupsen/logrus"
"github.com/urfave/cli" "github.com/urfave/cli"
"net/url" "net/url"
"os" "os"
@ -69,6 +70,8 @@ func cmdCrawler(clic *cli.Context) error {
// Wait for all jobs to finish // Wait for all jobs to finish
globalWait.Wait() globalWait.Wait()
logrus.Info("All dirs processed!")
return nil return nil
} }

View File

@ -31,6 +31,9 @@ func (w WorkerContext) step(job Job) {
newJobs, err := DoJob(&job, &f) newJobs, err := DoJob(&job, &f)
atomic.AddUint64(&totalStarted, 1) atomic.AddUint64(&totalStarted, 1)
if err == ErrKnown {
return
}
if err != nil { if err != nil {
job.Fails++ job.Fails++
@ -64,9 +67,8 @@ func (w WorkerContext) step(job Job) {
} }
func DoJob(job *Job, f *File) (newJobs []Job, err error) { func DoJob(job *Job, f *File) (newJobs []Job, err error) {
// File
if strings.HasSuffix(job.Uri.Path, "/") { if strings.HasSuffix(job.Uri.Path, "/") {
// Dir // Load directory
links, err := GetDir(job, f) links, err := GetDir(job, f)
if err != nil { if err != nil {
logrus.WithError(err). logrus.WithError(err).
@ -74,7 +76,17 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
Error("Failed getting dir") Error("Failed getting dir")
return nil, err return nil, err
} }
// Hash directory
hash := f.HashDir(links)
// Skip symlinked dirs
if _, old := job.OD.Scanned.LoadOrStore(hash, true); old {
return nil, ErrKnown
}
for _, link := range links { for _, link := range links {
// Skip already queued links
if _, old := job.OD.Scanned.LoadOrStore(link, true); old { if _, old := job.OD.Scanned.LoadOrStore(link, true); old {
continue continue
} }
@ -91,6 +103,7 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
"files": len(links), "files": len(links),
}).Debug("Listed") }).Debug("Listed")
} else { } else {
// Load file
err := GetFile(job.Uri, f) err := GetFile(job.Uri, f)
if err != nil { if err != nil {
logrus.WithError(err). logrus.WithError(err).