From f2d2b620fa453865962b5debd91535cd7718297c Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Sat, 27 Oct 2018 04:08:32 +0200 Subject: [PATCH] Simple queue crawler --- crawl.go | 185 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 crawl.go diff --git a/crawl.go b/crawl.go new file mode 100644 index 0000000..9e030e3 --- /dev/null +++ b/crawl.go @@ -0,0 +1,185 @@ +package main + +import ( + "github.com/PuerkitoBio/goquery" + "github.com/sirupsen/logrus" + "net/http" + "net/url" + "os" + "strings" + "sync" + "sync/atomic" + "time" +) + +const ( + nConns = 100 +) + +var client = http.DefaultClient +var wait sync.WaitGroup + +var visited int64 + +var in chan<- url.URL +var out <-chan url.URL + +func main() { + if len(os.Args) != 2 { + println("Usage: ./crawl ") + os.Exit(1) + } + + in, out = makeInfinite() + + go func() { + var visitedLast int64 = 0 + for range time.NewTicker(time.Second).C { + visitedNow := atomic.LoadInt64(&visited) + logrus. + WithField("per_second", visitedNow - visitedLast). + WithField("total", visitedNow). + Info("Tick") + visitedLast = visitedNow + } + }() + + base, _ := url.Parse(os.Args[1]) + in <- *base + wait.Add(nConns) + for i := 0; i < nConns; i++ { + go worker() + } + wait.Wait() +} + +func worker() { + for u := range out { + if strings.HasSuffix(u.Path, "/") { + // Dir + links := listDir(u) + for _, sub := range links { + subrefi, err := url.Parse(sub) + subref := *subrefi + if err != nil { continue } + + in <- *u.ResolveReference(&subref) + } + } else { + // File + // TODO check file + } + } + wait.Done() +} + +func listDir(u url.URL) (links []string) { + //logrus.Infof("Visiting %s", u) + atomic.AddInt64(&visited, 1) + + res, err := client.Get(u.String()) + if err != nil { + logrus.Error(err) + return + } + defer res.Body.Close() + + doc, err := goquery.NewDocumentFromReader(res.Body) + if err != nil { + logrus.Error(err) + return + } + + doc.Find("a[href]").Each(func(_ int, s *goquery.Selection) { + href, _ := s.Attr("href") + text := s.Text() + + if href == "." { + return + } + + for _, entry := range blackList { + if strings.Contains(href, entry) { + return + } + } + + for _, entry := range fileNameBlackList { + if strings.Contains(text, entry) { + return + } + } + + links = append(links, href) + }) + + return +} + +func makeInfinite() (chan<- url.URL, <-chan url.URL) { + in := make(chan url.URL) + out := make(chan url.URL) + // Set up in and out queues + go func() { + var inQueue []url.URL + outCh := func() chan url.URL { + if len(inQueue) == 0 { + return nil + } + return out + } + for len(inQueue) > 0 || in != nil { + if len(inQueue) == 0 { + v, ok := <-in + if !ok { + in = nil + } else { + inQueue = append(inQueue, v) + } + } else { + select { + case v, ok := <-in: + if !ok { + in = nil + } else { + inQueue = append(inQueue, v) + } + case outCh() <- inQueue[0]: + inQueue = inQueue[1:] + } + } + } + close(out) + }() + return in, out +} + +var blackList = [...]string { + "?C=N&O=D", + "?C=M&O=A", + "?C=S&O=A", + "?C=D&O=A", + "?C=N;O=D", + "?C=M;O=A", + "?C=M&O=D", + "?C=S;O=A", + "?C=S&O=D", + "?C=D;O=A", + "?MA", + "?SA", + "?DA", + "?ND", + "?C=N&O=A", + "?C=N&O=A", + "?M=A", + "?N=D", + "?S=A", + "?D=A", +} + +var fileNameBlackList = [...]string { + "Parent Directory", + " Parent Directory", + "../", +} +