mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-04 06:52:59 +00:00
Benchmark: Reference parser
This commit is contained in:
parent
b244cdae80
commit
43f96c6988
@ -1,7 +1,11 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/terorie/od-database-crawler/fasturl"
|
||||
"net/url"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@ -19,3 +23,37 @@ func BenchmarkParseDir(b *testing.B) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkParseDirReference(b *testing.B) {
|
||||
for n := 0; n < b.N; n++ {
|
||||
u, err := url.Parse("http://archive.ubuntu.com/ubuntu/indices/")
|
||||
if err != nil {
|
||||
b.Fatal("Failed to parse URL", err)
|
||||
}
|
||||
|
||||
_, err = referenceParseDir([]byte(apache2Listing), u)
|
||||
if err != nil {
|
||||
b.Fatal("Failed to extract links", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func referenceParseDir(body []byte, baseUri *url.URL) (links []*url.URL, err error) {
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||||
if err != nil { return nil, err }
|
||||
|
||||
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
|
||||
href, _ := s.Attr("href")
|
||||
|
||||
sub, err := baseUri.Parse(href)
|
||||
if err != nil { return } // continue
|
||||
|
||||
if !strings.HasPrefix(sub.String(), baseUri.String()) {
|
||||
return // continue
|
||||
}
|
||||
|
||||
links = append(links, sub)
|
||||
})
|
||||
|
||||
return
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user