Fewer tokenizer allocations

This commit is contained in:
Richard Patel 2018-11-16 00:22:40 +01:00
parent 084b3a5903
commit 82234f949e
No known key found for this signature in database
GPG Key ID: C268B2BBDA2ABECB

View File

@@ -8,7 +8,6 @@ import (
"github.com/valyala/fasthttp" "github.com/valyala/fasthttp"
"golang.org/x/crypto/blake2b" "golang.org/x/crypto/blake2b"
"golang.org/x/net/html" "golang.org/x/net/html"
"golang.org/x/net/html/atom"
"path" "path"
"strconv" "strconv"
"strings" "strings"
@@ -41,40 +40,43 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
var linkHref string var linkHref string
for { for {
tokenType := doc.Next() tokenType := doc.Next()
token := doc.Token()
if tokenType == html.ErrorToken { if tokenType == html.ErrorToken {
break break
} }
switch tokenType { switch tokenType {
case html.StartTagToken: case html.StartTagToken:
if token.DataAtom == atom.A { name, hasAttr := doc.TagName()
for _, attr := range token.Attr { if len(name) == 1 && name[0] == 'a' {
if attr.Key == "href" { for hasAttr {
linkHref = attr.Val var ks, vs []byte
ks, vs, hasAttr = doc.TagAttr()
if bytes.Equal(ks, []byte("href")) {
// TODO Check escape
linkHref = string(vs)
break break
} }
} }
} }
case html.EndTagToken: case html.EndTagToken:
if linkHref != "" && token.DataAtom == atom.A { name, _ := doc.TagName()
if len(name) == 1 && name[0] == 'a' {
// Copy params // Copy params
href := linkHref href := linkHref
// Reset params // Reset params
linkHref = "" linkHref = ""
// TODO Optimized decision tree
if strings.LastIndexByte(href, '?') != -1 { if strings.LastIndexByte(href, '?') != -1 {
goto nextToken goto nextToken
} }
for _, entry := range urlBlackList { switch href {
if href == entry { case "", " ", ".", "..", "/":
goto nextToken goto nextToken
}
} }
if strings.Contains(href, "../") { if strings.Contains(href, "../") {
goto nextToken goto nextToken
} }
@@ -176,11 +178,3 @@ func checkStatusCode(status int) error {
return fmt.Errorf("got HTTP status %d", status) return fmt.Errorf("got HTTP status %d", status)
} }
} }
// urlBlackList enumerates href values that are never followed as links:
// the empty string, a single space, ".", ".." and "/".
// GetDir compares each extracted href against these entries with an exact
// string match and skips the link on a hit.
var urlBlackList = [...]string {
"",
" ",
".",
"..",
"/",
}