From 43f96c6988c115779bec480325005a12a39118da Mon Sep 17 00:00:00 2001
From: Richard Patel
Date: Tue, 18 Dec 2018 15:39:41 +0100
Subject: [PATCH] Benchmark: Reference parser

---
 crawl_test.go | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/crawl_test.go b/crawl_test.go
index 2ca3f84..406447f 100644
--- a/crawl_test.go
+++ b/crawl_test.go
@@ -1,7 +1,11 @@
 package main
 
 import (
+	"bytes"
+	"github.com/PuerkitoBio/goquery"
 	"github.com/terorie/od-database-crawler/fasturl"
+	"net/url"
+	"strings"
 	"testing"
 )
 
@@ -19,3 +23,53 @@ func BenchmarkParseDir(b *testing.B) {
 		}
 	}
 }
+
+// BenchmarkParseDirReference measures referenceParseDir, the goquery-based
+// reference parser, on the apache2Listing fixture declared elsewhere.
+func BenchmarkParseDirReference(b *testing.B) {
+	for n := 0; n < b.N; n++ {
+		// The base URL is parsed inside the loop, so its cost is part of
+		// every measured iteration.
+		u, err := url.Parse("http://archive.ubuntu.com/ubuntu/indices/")
+		if err != nil {
+			b.Fatal("Failed to parse URL", err)
+		}
+
+		_, err = referenceParseDir([]byte(apache2Listing), u)
+		if err != nil {
+			b.Fatal("Failed to extract links", err)
+		}
+	}
+}
+
+// referenceParseDir parses body as HTML with goquery and resolves the href
+// of every <a> element against baseUri. It returns the resolved URLs whose
+// string form starts with the string form of baseUri. Hrefs that fail to
+// resolve are skipped. An error is returned only when goquery cannot build
+// the document from body.
+func referenceParseDir(body []byte, baseUri *url.URL) (links []*url.URL, err error) {
+	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
+	if err != nil {
+		return nil, err
+	}
+
+	// The base string is the same for every anchor; serialize it once.
+	base := baseUri.String()
+
+	doc.Find("a[href]").Each(func(_ int, s *goquery.Selection) {
+		href, _ := s.Attr("href") // the selector only matches anchors with href
+
+		sub, parseErr := baseUri.Parse(href)
+		if parseErr != nil {
+			return // skip this anchor
+		}
+
+		if !strings.HasPrefix(sub.String(), base) {
+			return // resolved link is not under the base URL
+		}
+
+		links = append(links, sub)
+	})
+
+	return links, nil
+}