mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-18 18:06:45 +00:00
Optimizing with hexa :P
This commit is contained in:
parent
ac0b8d2d0b
commit
084b3a5903
19
crawl.go
19
crawl.go
@@ -39,7 +39,6 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
|
||||
doc := html.NewTokenizer(bytes.NewReader(body))
|
||||
|
||||
var linkHref string
|
||||
var linkTexts []string
|
||||
for {
|
||||
tokenType := doc.Next()
|
||||
token := doc.Token()
|
||||
@@ -58,20 +57,13 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
|
||||
}
|
||||
}
|
||||
|
||||
case html.TextToken:
|
||||
if linkHref != "" {
|
||||
linkTexts = append(linkTexts, token.Data)
|
||||
}
|
||||
|
||||
case html.EndTagToken:
|
||||
if linkHref != "" && token.DataAtom == atom.A {
|
||||
// Copy params
|
||||
href := linkHref
|
||||
linkText := strings.Join(linkTexts, " ")
|
||||
|
||||
// Reset params
|
||||
linkHref = ""
|
||||
linkTexts = nil
|
||||
|
||||
// TODO Optimized decision tree
|
||||
if strings.LastIndexByte(href, '?') != -1 {
|
||||
@@ -83,11 +75,9 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
|
||||
goto nextToken
|
||||
}
|
||||
}
|
||||
for _, entry := range fileNameBlackList {
|
||||
if strings.Contains(linkText, entry) {
|
||||
if strings.Contains(href, "../") {
|
||||
goto nextToken
|
||||
}
|
||||
}
|
||||
|
||||
var link fasturl.URL
|
||||
err = j.Uri.ParseRel(&link, href)
|
||||
@@ -194,10 +184,3 @@ var urlBlackList = [...]string {
|
||||
"..",
|
||||
"/",
|
||||
}
|
||||
|
||||
var fileNameBlackList = [...]string {
|
||||
"Parent Directory",
|
||||
" Parent Directory",
|
||||
"../",
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user