From f2d2b620fa453865962b5debd91535cd7718297c Mon Sep 17 00:00:00 2001
From: Richard Patel
Date: Sat, 27 Oct 2018 04:08:32 +0200
Subject: [PATCH 1/7] Simple queue crawler

---
 crawl.go | 185 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 185 insertions(+)
 create mode 100644 crawl.go

diff --git a/crawl.go b/crawl.go
new file mode 100644
index 0000000..9e030e3
--- /dev/null
+++ b/crawl.go
@@ -0,0 +1,185 @@
+package main
+
+import (
+	"github.com/PuerkitoBio/goquery"
+	"github.com/sirupsen/logrus"
+	"net/http"
+	"net/url"
+	"os"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+)
+
+const (
+	nConns = 100
+)
+
+var client = http.DefaultClient
+var wait sync.WaitGroup
+
+var visited int64
+
+var in chan<- url.URL
+var out <-chan url.URL
+
+func main() {
+	if len(os.Args) != 2 {
+		println("Usage: ./crawl <url>")
+		os.Exit(1)
+	}
+
+	in, out = makeInfinite()
+
+	go func() {
+		var visitedLast int64 = 0
+		for range time.NewTicker(time.Second).C {
+			visitedNow := atomic.LoadInt64(&visited)
+			logrus.
+				WithField("per_second", visitedNow - visitedLast).
+				WithField("total", visitedNow).
+				Info("Tick")
+			visitedLast = visitedNow
+		}
+	}()
+
+	base, _ := url.Parse(os.Args[1])
+	in <- *base
+	wait.Add(nConns)
+	for i := 0; i < nConns; i++ {
+		go worker()
+	}
+	wait.Wait()
+}
+
+func worker() {
+	for u := range out {
+		if strings.HasSuffix(u.Path, "/") {
+			// Dir
+			links := listDir(u)
+			for _, sub := range links {
+				subrefi, err := url.Parse(sub)
+				subref := *subrefi
+				if err != nil { continue }
+
+				in <- *u.ResolveReference(&subref)
+			}
+		} else {
+			// File
+			// TODO check file
+		}
+	}
+	wait.Done()
+}
+
+func listDir(u url.URL) (links []string) {
+	//logrus.Infof("Visiting %s", u)
+	atomic.AddInt64(&visited, 1)
+
+	res, err := client.Get(u.String())
+	if err != nil {
+		logrus.Error(err)
+		return
+	}
+	defer res.Body.Close()
+
+	doc, err := goquery.NewDocumentFromReader(res.Body)
+	if err != nil {
+		logrus.Error(err)
+		return
+	}
+
+	doc.Find("a[href]").Each(func(_ int, s *goquery.Selection) {
+		href, _ := s.Attr("href")
+		text := s.Text()
+
+		if href == "." 
{ + return + } + + for _, entry := range blackList { + if strings.Contains(href, entry) { + return + } + } + + for _, entry := range fileNameBlackList { + if strings.Contains(text, entry) { + return + } + } + + links = append(links, href) + }) + + return +} + +func makeInfinite() (chan<- url.URL, <-chan url.URL) { + in := make(chan url.URL) + out := make(chan url.URL) + // Set up in and out queues + go func() { + var inQueue []url.URL + outCh := func() chan url.URL { + if len(inQueue) == 0 { + return nil + } + return out + } + for len(inQueue) > 0 || in != nil { + if len(inQueue) == 0 { + v, ok := <-in + if !ok { + in = nil + } else { + inQueue = append(inQueue, v) + } + } else { + select { + case v, ok := <-in: + if !ok { + in = nil + } else { + inQueue = append(inQueue, v) + } + case outCh() <- inQueue[0]: + inQueue = inQueue[1:] + } + } + } + close(out) + }() + return in, out +} + +var blackList = [...]string { + "?C=N&O=D", + "?C=M&O=A", + "?C=S&O=A", + "?C=D&O=A", + "?C=N;O=D", + "?C=M;O=A", + "?C=M&O=D", + "?C=S;O=A", + "?C=S&O=D", + "?C=D;O=A", + "?MA", + "?SA", + "?DA", + "?ND", + "?C=N&O=A", + "?C=N&O=A", + "?M=A", + "?N=D", + "?S=A", + "?D=A", +} + +var fileNameBlackList = [...]string { + "Parent Directory", + " Parent Directory", + "../", +} + From 2844d344ec8d251bfd85b01f0e105a27858a6ff2 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Sat, 27 Oct 2018 15:00:20 +0200 Subject: [PATCH 2/7] Working listing --- crawl.go | 106 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 73 insertions(+), 33 deletions(-) diff --git a/crawl.go b/crawl.go index 9e030e3..a0ca8ae 100644 --- a/crawl.go +++ b/crawl.go @@ -1,8 +1,9 @@ package main import ( - "github.com/PuerkitoBio/goquery" "github.com/sirupsen/logrus" + "golang.org/x/net/html" + "golang.org/x/net/html/atom" "net/http" "net/url" "os" @@ -62,8 +63,8 @@ func worker() { subrefi, err := url.Parse(sub) subref := *subrefi if err != nil { continue } - - in <- *u.ResolveReference(&subref) + abs := *u.ResolveReference(&subref) + in <- abs } } else { // File @@ -74,9 +75,6 @@ func worker() { } func listDir(u url.URL) (links []string) { - //logrus.Infof("Visiting %s", u) - atomic.AddInt64(&visited, 1) - res, err := client.Get(u.String()) if err != nil { logrus.Error(err) @@ -84,34 +82,68 @@ func listDir(u url.URL) (links []string) { } defer res.Body.Close() - doc, err := goquery.NewDocumentFromReader(res.Body) - if err != nil { - logrus.Error(err) - return + doc := html.NewTokenizer(res.Body) + + var linkHref string + var linkTexts []string + for { + tokenType := doc.Next() + token := doc.Token() + if tokenType == html.ErrorToken { + break + } + + switch tokenType { + case html.StartTagToken: + if token.DataAtom == atom.A { + for _, attr := range token.Attr { + if attr.Key == "href" { + linkHref = attr.Val + break + } + } + } + + case html.TextToken: + if linkHref != "" { + linkTexts = append(linkTexts, token.Data) + } + + case html.EndTagToken: + if linkHref != "" && token.DataAtom == atom.A { + // Copy params + href := linkHref + linkText := strings.Join(linkTexts, " ") + + // Reset params + linkHref = "" + linkTexts = nil + + // TODO Optimized decision tree + for _, entry := range urlBlackList { + if href == entry { + goto nextToken + } + } + for _, entry := range urlPartBlackList { + if strings.Contains(href, entry) { + goto nextToken + } + } + for _, entry := range fileNameBlackList { + if strings.Contains(linkText, entry) { + goto nextToken + } + } + + links = append(links, href) + } + } + + nextToken: } - 
doc.Find("a[href]").Each(func(_ int, s *goquery.Selection) { - href, _ := s.Attr("href") - text := s.Text() - - if href == "." { - return - } - - for _, entry := range blackList { - if strings.Contains(href, entry) { - return - } - } - - for _, entry := range fileNameBlackList { - if strings.Contains(text, entry) { - return - } - } - - links = append(links, href) - }) + atomic.AddInt64(&visited, 1) return } @@ -154,7 +186,15 @@ func makeInfinite() (chan<- url.URL, <-chan url.URL) { return in, out } -var blackList = [...]string { +var urlBlackList = [...]string { + "", + " ", + ".", + "..", + "/", +} + +var urlPartBlackList = [...]string { "?C=N&O=D", "?C=M&O=A", "?C=S&O=A", From d748be72cd57bf37dc4e80e8d3fe6ccd951b78fd Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Sat, 27 Oct 2018 16:22:01 +0200 Subject: [PATCH 3/7] File HEAD requests --- crawl.go | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 76 insertions(+), 6 deletions(-) diff --git a/crawl.go b/crawl.go index a0ca8ae..ac165e4 100644 --- a/crawl.go +++ b/crawl.go @@ -1,12 +1,14 @@ package main import ( + "bytes" "github.com/sirupsen/logrus" + "github.com/valyala/fasthttp" "golang.org/x/net/html" "golang.org/x/net/html/atom" - "net/http" "net/url" "os" + "strconv" "strings" "sync" "sync/atomic" @@ -17,7 +19,7 @@ const ( nConns = 100 ) -var client = http.DefaultClient +var client = fasthttp.Client{} var wait sync.WaitGroup var visited int64 @@ -25,6 +27,14 @@ var visited int64 var in chan<- url.URL var out <-chan url.URL +type File struct { + Name string `json:"name"` + Size int64 `json:"size"` + MTime time.Time `json:"mtime"` + Path string `json:"path"` + IsDir bool `json:"-"` +} + func main() { if len(os.Args) != 2 { println("Usage: ./crawl ") @@ -62,27 +72,38 @@ func worker() { for _, sub := range links { subrefi, err := url.Parse(sub) subref := *subrefi + // TODO Print errors if err != nil { continue } abs := *u.ResolveReference(&subref) in <- abs } } else { // File - // TODO check file + var fil File + err := fileInfo(u, &fil) + // TODO Print errors + if err != nil { continue } } } wait.Done() } func listDir(u url.URL) (links []string) { - res, err := client.Get(u.String()) + req := fasthttp.AcquireRequest() + req.SetRequestURI(u.String()) + + res := fasthttp.AcquireResponse() + defer fasthttp.ReleaseResponse(res) + + err := client.Do(req, res) + fasthttp.ReleaseRequest(req) + if err != nil { logrus.Error(err) return } - defer res.Body.Close() - doc := html.NewTokenizer(res.Body) + doc := html.NewTokenizer(bytes.NewReader(res.Body())) var linkHref string var linkTexts []string @@ -148,6 +169,55 @@ func listDir(u url.URL) (links []string) { return } +func fileInfo(u url.URL, f *File) (err error) { + req := fasthttp.AcquireRequest() + req.Header.SetMethod("HEAD") + req.SetRequestURI(u.String()) + + res := fasthttp.AcquireResponse() + res.SkipBody = true + defer fasthttp.ReleaseResponse(res) + + err = client.Do(req, res) + fasthttp.ReleaseRequest(req) + + if err != nil { return } + + // TODO Inefficient af + header := res.Header.String() + + for _, line := range strings.Split(header, "\r\n") { + if line == "" { continue } + if strings.HasPrefix(line, "HTTP/1") { continue } + + parts := strings.SplitN(line, ": ", 2) + if len(parts) != 2 { continue } + + k, v := parts[0], parts[1] + k = strings.ToLower(k) + + switch k { + case "content-length": + size, err := strconv.ParseInt(v, 10, 64) + if err != nil { break } + if size < 0 { break } + f.Size = size + + case "last-modified": + var err error + 
f.MTime, err = time.Parse(time.RFC1123, v) + if err == nil { break } + f.MTime, err = time.Parse(time.RFC850, v) + if err == nil { break } + // TODO Parse asctime + f.MTime, err = time.Parse("2006-01-02", v[:10]) + if err == nil { break } + } + } + + return nil +} + func makeInfinite() (chan<- url.URL, <-chan url.URL) { in := make(chan url.URL) out := make(chan url.URL) From 9e090d109da4962d58c4df5dbdb4464afa3a4175 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Sat, 27 Oct 2018 16:29:10 +0200 Subject: [PATCH 4/7] Header state machine --- crawl.go | 78 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 23 deletions(-) diff --git a/crawl.go b/crawl.go index ac165e4..849835f 100644 --- a/crawl.go +++ b/crawl.go @@ -2,6 +2,7 @@ package main import ( "bytes" + "errors" "github.com/sirupsen/logrus" "github.com/valyala/fasthttp" "golang.org/x/net/html" @@ -184,34 +185,65 @@ func fileInfo(u url.URL, f *File) (err error) { if err != nil { return } // TODO Inefficient af - header := res.Header.String() + header := res.Header.Header() - for _, line := range strings.Split(header, "\r\n") { - if line == "" { continue } - if strings.HasPrefix(line, "HTTP/1") { continue } + var k []byte + var v []byte - parts := strings.SplitN(line, ": ", 2) - if len(parts) != 2 { continue } + // Simple finite state machine + state := 0 + for _, b := range header { + switch state { + case 0: + if b == byte(':') { + state = 1 + } else { + k = append(k, b) + } - k, v := parts[0], parts[1] - k = strings.ToLower(k) + case 1: + if b == byte(' ') { + state = 2 + } else { + return errors.New("bad request") + } - switch k { - case "content-length": - size, err := strconv.ParseInt(v, 10, 64) - if err != nil { break } - if size < 0 { break } - f.Size = size + case 2: + if b == byte('\r') { + state = 3 + } else { + v = append(v, b) + } - case "last-modified": - var err error - f.MTime, err = time.Parse(time.RFC1123, v) - if err == nil { break } - f.MTime, err = time.Parse(time.RFC850, v) - if err == nil { break } - // TODO Parse asctime - f.MTime, err = time.Parse("2006-01-02", v[:10]) - if err == nil { break } + case 3: + if b == byte('\n') { + state = 0 + key := strings.ToLower(string(k)) + val := string(v) + + switch key { + case "content-length": + size, err := strconv.ParseInt(val, 10, 64) + if err != nil { break } + if size < 0 { break } + f.Size = size + + case "last-modified": + var err error + f.MTime, err = time.Parse(time.RFC1123, val) + if err == nil { break } + f.MTime, err = time.Parse(time.RFC850, val) + if err == nil { break } + // TODO Parse asctime + f.MTime, err = time.Parse("2006-01-02", val[:10]) + if err == nil { break } + } + + k = k[:0] + v = v[:0] + } else { + return errors.New("bad request") + } } } From 442a2cf8a7eb7ed633764d1697e0d73dc207826b Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Sat, 27 Oct 2018 16:53:45 +0200 Subject: [PATCH 5/7] Compare finite state machine and Regex --- crawl.go | 103 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 59 insertions(+), 44 deletions(-) diff --git a/crawl.go b/crawl.go index 849835f..6f5c2ab 100644 --- a/crawl.go +++ b/crawl.go @@ -2,13 +2,13 @@ package main import ( "bytes" - "errors" "github.com/sirupsen/logrus" "github.com/valyala/fasthttp" "golang.org/x/net/html" "golang.org/x/net/html/atom" "net/url" "os" + "regexp" "strconv" "strings" "sync" @@ -28,6 +28,8 @@ var visited int64 var in chan<- url.URL var out <-chan url.URL +var matchHeader = regexp.MustCompile("([\\w-]+): (.*)") + type 
File struct { Name string `json:"name"` Size int64 `json:"size"` @@ -186,68 +188,81 @@ func fileInfo(u url.URL, f *File) (err error) { // TODO Inefficient af header := res.Header.Header() + s := time.Now() + for i := 0; i < 10000; i++ { + f.ParseHeaderRegex(header) + } + println(time.Since(s).String()) - var k []byte - var v []byte + return nil +} + +func (f *File) ParseHeaderRegex(h []byte) { + for _, parts := range matchHeader.FindAllSubmatch(h, -1) { + k := string(parts[1]) + v := string(parts[2]) + f.applyHeader(k, v) + } +} + +func (f *File) ParseHeaderMachine(h []byte) { + var k1, k2 int + var v1, v2 int // Simple finite state machine state := 0 - for _, b := range header { + for i, b := range h { switch state { case 0: if b == byte(':') { state = 1 - } else { - k = append(k, b) + k2 = i } case 1: - if b == byte(' ') { - state = 2 - } else { - return errors.New("bad request") - } + state = 2 case 2: - if b == byte('\r') { - state = 3 - } else { - v = append(v, b) - } + state = 3 + v1 = i case 3: - if b == byte('\n') { - state = 0 - key := strings.ToLower(string(k)) - val := string(v) - - switch key { - case "content-length": - size, err := strconv.ParseInt(val, 10, 64) - if err != nil { break } - if size < 0 { break } - f.Size = size - - case "last-modified": - var err error - f.MTime, err = time.Parse(time.RFC1123, val) - if err == nil { break } - f.MTime, err = time.Parse(time.RFC850, val) - if err == nil { break } - // TODO Parse asctime - f.MTime, err = time.Parse("2006-01-02", val[:10]) - if err == nil { break } - } - - k = k[:0] - v = v[:0] - } else { - return errors.New("bad request") + if b == byte('\r') { + state = 4 } + + case 4: + state = 0 + v2 = i - 1 + + key := string(h[k1:k2]) + val := string(h[v1:v2]) + k1 = i + + f.applyHeader(key, val) } } - return nil +} + +func (f *File) applyHeader(k, v string) { + switch k { + case "content-length": + size, err := strconv.ParseInt(v, 10, 64) + if err != nil { break } + if size < 0 { break } + f.Size = size + + case "last-modified": + var err error + f.MTime, err = time.Parse(time.RFC1123, v) + if err == nil { break } + f.MTime, err = time.Parse(time.RFC850, v) + if err == nil { break } + // TODO Parse asctime + f.MTime, err = time.Parse("2006-01-02", v[:10]) + if err == nil { break } + } } func makeInfinite() (chan<- url.URL, <-chan url.URL) { From 76c8c13d499d395a73351f3e2679564ac082dfc1 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Sat, 27 Oct 2018 16:55:00 +0200 Subject: [PATCH 6/7] Use finite state machine --- crawl.go | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/crawl.go b/crawl.go index 6f5c2ab..9f1b0ae 100644 --- a/crawl.go +++ b/crawl.go @@ -8,7 +8,6 @@ import ( "golang.org/x/net/html/atom" "net/url" "os" - "regexp" "strconv" "strings" "sync" @@ -28,8 +27,6 @@ var visited int64 var in chan<- url.URL var out <-chan url.URL -var matchHeader = regexp.MustCompile("([\\w-]+): (.*)") - type File struct { Name string `json:"name"` Size int64 `json:"size"` @@ -188,24 +185,12 @@ func fileInfo(u url.URL, f *File) (err error) { // TODO Inefficient af header := res.Header.Header() - s := time.Now() - for i := 0; i < 10000; i++ { - f.ParseHeaderRegex(header) - } - println(time.Since(s).String()) + f.ParseHeader(header) return nil } -func (f *File) ParseHeaderRegex(h []byte) { - for _, parts := range matchHeader.FindAllSubmatch(h, -1) { - k := string(parts[1]) - v := string(parts[2]) - f.applyHeader(k, v) - } -} - -func (f *File) ParseHeaderMachine(h []byte) { +func (f *File) 
ParseHeader(h []byte) { var k1, k2 int var v1, v2 int From 3fb4d4bde9bbdd55a08da6da7aafb18b9186844a Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Sat, 27 Oct 2018 17:25:32 +0200 Subject: [PATCH 7/7] More logs --- crawl.go | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/crawl.go b/crawl.go index 9f1b0ae..769dc4b 100644 --- a/crawl.go +++ b/crawl.go @@ -8,6 +8,7 @@ import ( "golang.org/x/net/html/atom" "net/url" "os" + "path" "strconv" "strings" "sync" @@ -66,36 +67,54 @@ func main() { func worker() { for u := range out { + // File + var fil File if strings.HasSuffix(u.Path, "/") { // Dir - links := listDir(u) + links, err := listDir(u, &fil) + if err != nil { + logrus.WithError(err). + WithField("url", u.String()). + Error("Failed getting dir") + continue + } for _, sub := range links { subrefi, err := url.Parse(sub) subref := *subrefi // TODO Print errors if err != nil { continue } abs := *u.ResolveReference(&subref) + // TODO Check if host changed in <- abs } + //logrus.Infof("LISTED %s", u.Path) } else { - // File - var fil File err := fileInfo(u, &fil) - // TODO Print errors - if err != nil { continue } + if err != nil { + logrus.WithError(err). + WithField("url", u.String()). + Error("Failed getting file") + continue + } } } wait.Done() } -func listDir(u url.URL) (links []string) { +func listDir(u url.URL, f *File) (links []string, err error) { + f.IsDir = true + u.Path = path.Clean(u.Path) + // TODO Handle external links + f.Name = path.Base(u.Path) + f.Path = strings.TrimLeft(u.Path, "/") + req := fasthttp.AcquireRequest() req.SetRequestURI(u.String()) res := fasthttp.AcquireResponse() defer fasthttp.ReleaseResponse(res) - err := client.Do(req, res) + err = client.Do(req, res) fasthttp.ReleaseRequest(req) if err != nil { @@ -170,6 +189,11 @@ func listDir(u url.URL) (links []string) { } func fileInfo(u url.URL, f *File) (err error) { + f.IsDir = false + u.Path = path.Clean(u.Path) + f.Name = path.Base(u.Path) + f.Path = strings.Trim(u.Path, "/") + req := fasthttp.AcquireRequest() req.Header.SetMethod("HEAD") req.SetRequestURI(u.String()) @@ -187,6 +211,8 @@ func fileInfo(u url.URL, f *File) (err error) { header := res.Header.Header() f.ParseHeader(header) + atomic.AddInt64(&visited, 1) + return nil }
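
Series notes (illustrative sketches, not part of the patches above):

The makeInfinite helper introduced in PATCH 1 is what keeps the crawler from
deadlocking: the 100 workers both consume URLs from `out` and push newly
discovered ones into `in`, so with a plain bounded channel they would
eventually all block on send with nobody left to receive. The goroutine
decouples the two sides with an unbounded in-memory FIFO. Below is a minimal,
self-contained sketch of the same select/nil-channel pattern, using string
payloads instead of url.URL for brevity; the identifier names here are
illustrative only.

package main

import "fmt"

func makeInfinite() (chan<- string, <-chan string) {
	in := make(chan string)
	out := make(chan string)
	go func() {
		var queue []string
		for len(queue) > 0 || in != nil {
			// A nil channel blocks forever inside a select, so leaving
			// outCh nil disables the send case while the queue is empty.
			var outCh chan string
			var next string
			if len(queue) > 0 {
				outCh = out
				next = queue[0]
			}
			select {
			case v, ok := <-in:
				if !ok {
					in = nil // producer closed: drain the queue, then exit
				} else {
					queue = append(queue, v)
				}
			case outCh <- next:
				queue = queue[1:]
			}
		}
		close(out)
	}()
	return in, out
}

func main() {
	in, out := makeInfinite()
	go func() {
		for i := 0; i < 5; i++ {
			in <- fmt.Sprintf("job-%d", i)
		}
		close(in)
	}()
	for v := range out {
		fmt.Println(v) // job-0 through job-4, in FIFO order
	}
}

The Last-Modified handling added in PATCH 3 relies on the same fallback idea
its TODO hints at: try one time layout after another and keep the first that
parses. Here is a standalone sketch of that layout chain; parseMTime and the
layout list are assumptions mirroring the RFC1123/RFC850 sequence in the
patch, with time.ANSIC added for the asctime case the TODO mentions.

package main

import (
	"fmt"
	"time"
)

// Layout order is an assumption based on the chain in PATCH 3.
var layouts = []string{time.RFC1123, time.RFC850, time.ANSIC}

func parseMTime(v string) (time.Time, error) {
	for _, layout := range layouts {
		if t, err := time.Parse(layout, v); err == nil {
			return t, nil
		}
	}
	return time.Time{}, fmt.Errorf("unrecognized date: %q", v)
}

func main() {
	t, err := parseMTime("Sat, 27 Oct 2018 16:22:01 GMT")
	fmt.Println(t, err)
}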