From 1c33346f45841e60051a556f6239ecb3c1eb7ea9 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Sun, 28 Oct 2018 03:06:18 +0100 Subject: [PATCH] Fix crawl descent --- config.go | 8 ++++++++ config.yml | 4 +++- crawl_http.go | 24 ++++++++++++++---------- main.go | 2 +- worker.go | 6 +++++- 5 files changed, 31 insertions(+), 13 deletions(-) diff --git a/config.go b/config.go index 64a7856..4205c7e 100644 --- a/config.go +++ b/config.go @@ -12,6 +12,7 @@ var config struct { Retries int Workers int StatsInterval time.Duration + Verbose bool } const ( @@ -20,12 +21,14 @@ const ( ConfRetries = "retries" ConfWorkers = "workers" ConfStatsInterval = "stats_interval" + ConfVerbose = "verbose" ) func prepareConfig() { viper.SetDefault(ConfRetries, 3) viper.SetDefault(ConfWorkers, 50) viper.SetDefault(ConfStatsInterval, 3 * time.Second) + viper.SetDefault(ConfVerbose, false) } func readConfig() { @@ -57,4 +60,9 @@ func readConfig() { } config.StatsInterval = viper.GetDuration(ConfStatsInterval) + + config.Verbose = viper.GetBool(ConfVerbose) + if config.Verbose { + logrus.SetLevel(logrus.DebugLevel) + } } diff --git a/config.yml b/config.yml index d31471c..7a4ac78 100644 --- a/config.yml +++ b/config.yml @@ -1,3 +1,5 @@ server_url: localhost:6969 token: abc -stats: 5s \ No newline at end of file +stats_interval: 1s +verbose: true +retries: 0 \ No newline at end of file diff --git a/crawl_http.go b/crawl_http.go index 28f2a1f..5e159fa 100644 --- a/crawl_http.go +++ b/crawl_http.go @@ -27,15 +27,12 @@ func NewRemoteDir(u url.URL) *RemoteDir { return &RemoteDir{ BaseUri: u } } -func GetDir(u url.URL, f *File) (links []url.URL, err error) { +func GetDir(j *Job, f *File) (links []url.URL, err error) { f.IsDir = true - u.Path = path.Clean(u.Path) - // TODO Handle external links - f.Name = path.Base(u.Path) - f.Path = strings.TrimLeft(u.Path, "/") + f.Name = path.Base(j.Uri.Path) req := fasthttp.AcquireRequest() - req.SetRequestURI(u.String()) + req.SetRequestURI(j.Uri.String()) res := fasthttp.AcquireResponse() defer fasthttp.ReleaseResponse(res) @@ -48,7 +45,8 @@ func GetDir(u url.URL, f *File) (links []url.URL, err error) { return } - doc := html.NewTokenizer(bytes.NewReader(res.Body())) + body := res.Body() + doc := html.NewTokenizer(bytes.NewReader(body)) var linkHref string var linkTexts []string @@ -105,10 +103,12 @@ func GetDir(u url.URL, f *File) (links []url.URL, err error) { subref, err := url.Parse(href) if err != nil { continue } - link := *u.ResolveReference(subref) + link := *j.Uri.ResolveReference(subref) - if link.Scheme != u.Scheme || - link.Host != u.Host { + if link.Scheme != j.Uri.Scheme || + link.Host != j.Uri.Host || + link.Path == j.Uri.Path || + !strings.HasPrefix(link.Path, j.Uri.Path) { continue } @@ -119,6 +119,10 @@ func GetDir(u url.URL, f *File) (links []url.URL, err error) { nextToken: } + if len(links) == 0 { + println(string(body)) + } + return } diff --git a/main.go b/main.go index ba8a427..6bb46c2 100644 --- a/main.go +++ b/main.go @@ -14,7 +14,7 @@ func main() { remotes := make(chan *RemoteDir) go Schedule(c, remotes) - u, _ := url.Parse("http://mine.terorie.com:420/") + u, _ := url.Parse("https://the-eye.eu/public/rom/") remote := NewRemoteDir(*u) globalWait.Add(1) diff --git a/worker.go b/worker.go index 76d27ed..380ca98 100644 --- a/worker.go +++ b/worker.go @@ -59,7 +59,7 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) { // File if strings.HasSuffix(job.Uri.Path, "/") { // Dir - links, err := GetDir(job.Uri, f) + links, err := GetDir(job, f) if err != nil { logrus.WithError(err). WithField("url", job.Uri.String()). @@ -75,6 +75,10 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) { Fails: 0, }) } + logrus.WithFields(logrus.Fields{ + "url": job.UriStr, + "files": len(links), + }).Debug("Listed") } else { err := GetFile(job.Uri, f) if err != nil {