Working listing

This commit is contained in:
Richard Patel 2018-10-27 15:00:20 +02:00
parent f2d2b620fa
commit 2844d344ec
No known key found for this signature in database
GPG Key ID: C268B2BBDA2ABECB

View File

@ -1,8 +1,9 @@
package main package main
import ( import (
"github.com/PuerkitoBio/goquery"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
"net/http" "net/http"
"net/url" "net/url"
"os" "os"
@ -62,8 +63,8 @@ func worker() {
subrefi, err := url.Parse(sub) subrefi, err := url.Parse(sub)
subref := *subrefi subref := *subrefi
if err != nil { continue } if err != nil { continue }
abs := *u.ResolveReference(&subref)
in <- *u.ResolveReference(&subref) in <- abs
} }
} else { } else {
// File // File
@ -74,9 +75,6 @@ func worker() {
} }
func listDir(u url.URL) (links []string) { func listDir(u url.URL) (links []string) {
//logrus.Infof("Visiting %s", u)
atomic.AddInt64(&visited, 1)
res, err := client.Get(u.String()) res, err := client.Get(u.String())
if err != nil { if err != nil {
logrus.Error(err) logrus.Error(err)
@ -84,34 +82,68 @@ func listDir(u url.URL) (links []string) {
} }
defer res.Body.Close() defer res.Body.Close()
doc, err := goquery.NewDocumentFromReader(res.Body) doc := html.NewTokenizer(res.Body)
if err != nil {
logrus.Error(err) var linkHref string
return var linkTexts []string
for {
tokenType := doc.Next()
token := doc.Token()
if tokenType == html.ErrorToken {
break
} }
doc.Find("a[href]").Each(func(_ int, s *goquery.Selection) { switch tokenType {
href, _ := s.Attr("href") case html.StartTagToken:
text := s.Text() if token.DataAtom == atom.A {
for _, attr := range token.Attr {
if href == "." { if attr.Key == "href" {
return linkHref = attr.Val
break
}
}
} }
for _, entry := range blackList { case html.TextToken:
if linkHref != "" {
linkTexts = append(linkTexts, token.Data)
}
case html.EndTagToken:
if linkHref != "" && token.DataAtom == atom.A {
// Copy params
href := linkHref
linkText := strings.Join(linkTexts, " ")
// Reset params
linkHref = ""
linkTexts = nil
// TODO Optimized decision tree
for _, entry := range urlBlackList {
if href == entry {
goto nextToken
}
}
for _, entry := range urlPartBlackList {
if strings.Contains(href, entry) { if strings.Contains(href, entry) {
return goto nextToken
} }
} }
for _, entry := range fileNameBlackList { for _, entry := range fileNameBlackList {
if strings.Contains(text, entry) { if strings.Contains(linkText, entry) {
return goto nextToken
} }
} }
links = append(links, href) links = append(links, href)
}) }
}
nextToken:
}
atomic.AddInt64(&visited, 1)
return return
} }
@ -154,7 +186,15 @@ func makeInfinite() (chan<- url.URL, <-chan url.URL) {
return in, out return in, out
} }
var blackList = [...]string { var urlBlackList = [...]string {
"",
" ",
".",
"..",
"/",
}
var urlPartBlackList = [...]string {
"?C=N&O=D", "?C=N&O=D",
"?C=M&O=A", "?C=M&O=A",
"?C=S&O=A", "?C=S&O=A",