package main

import (
	"bytes"
	"net/url"
	"os"
	"path"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/sirupsen/logrus"
	"github.com/valyala/fasthttp"
	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

// nConns is the number of concurrent crawler workers.
const nConns = 100

var client = fasthttp.Client{}
var wait sync.WaitGroup
var visited int64
var in chan<- url.URL
var out <-chan url.URL

type File struct {
	Name  string    `json:"name"`
	Size  int64     `json:"size"`
	MTime time.Time `json:"mtime"`
	Path  string    `json:"path"`
	IsDir bool      `json:"-"`
}

func main() {
	if len(os.Args) != 2 {
		println("Usage: ./crawl <url>")
		os.Exit(1)
	}

	in, out = makeInfinite()

	// Log crawl throughput once per second.
	go func() {
		var visitedLast int64
		for range time.NewTicker(time.Second).C {
			visitedNow := atomic.LoadInt64(&visited)
			logrus.
				WithField("per_second", visitedNow-visitedLast).
				WithField("total", visitedNow).
				Info("Tick")
			visitedLast = visitedNow
		}
	}()

	base, err := url.Parse(os.Args[1])
	if err != nil {
		logrus.WithError(err).Fatal("Invalid base URL")
	}
	in <- *base

	wait.Add(nConns)
	for i := 0; i < nConns; i++ {
		go worker()
	}
	wait.Wait()
}

// worker consumes URLs from the queue: directories are listed and
// their links re-queued, files get a HEAD request for metadata.
func worker() {
	defer wait.Done()
	for u := range out {
		var fil File
		if strings.HasSuffix(u.Path, "/") {
			// Directory
			links, err := listDir(u, &fil)
			if err != nil {
				logrus.WithError(err).
					WithField("url", u.String()).
					Error("Failed getting dir")
				continue
			}
			for _, sub := range links {
				// Check the parse error before dereferencing the result.
				subref, err := url.Parse(sub)
				if err != nil {
					logrus.WithError(err).
						WithField("href", sub).
						Warn("Skipping unparsable link")
					continue
				}
				abs := *u.ResolveReference(subref)
				// TODO Check if host changed
				in <- abs
			}
		} else {
			// File
			err := fileInfo(u, &fil)
			if err != nil {
				logrus.WithError(err).
					WithField("url", u.String()).
					Error("Failed getting file")
				continue
			}
		}
	}
}

// listDir fetches a directory index page and extracts its links,
// filtering out navigation and sort links via the blacklists below.
func listDir(u url.URL, f *File) (links []string, err error) {
	f.IsDir = true
	u.Path = path.Clean(u.Path)
	// TODO Handle external links
	f.Name = path.Base(u.Path)
	f.Path = strings.TrimLeft(u.Path, "/")

	req := fasthttp.AcquireRequest()
	req.SetRequestURI(u.String())

	res := fasthttp.AcquireResponse()
	defer fasthttp.ReleaseResponse(res)

	err = client.Do(req, res)
	fasthttp.ReleaseRequest(req)
	if err != nil {
		return
	}

	doc := html.NewTokenizer(bytes.NewReader(res.Body()))

	var linkHref string
	var linkTexts []string
	for {
		tokenType := doc.Next()
		if tokenType == html.ErrorToken {
			break
		}
		token := doc.Token()

		switch tokenType {
		case html.StartTagToken:
			if token.DataAtom == atom.A {
				for _, attr := range token.Attr {
					if attr.Key == "href" {
						linkHref = attr.Val
						break
					}
				}
			}

		case html.TextToken:
			if linkHref != "" {
				linkTexts = append(linkTexts, token.Data)
			}

		case html.EndTagToken:
			if linkHref != "" && token.DataAtom == atom.A {
				// Copy params
				href := linkHref
				linkText := strings.Join(linkTexts, " ")

				// Reset params
				linkHref = ""
				linkTexts = nil

				// TODO Optimized decision tree
				for _, entry := range urlBlackList {
					if href == entry {
						goto nextToken
					}
				}
				for _, entry := range urlPartBlackList {
					if strings.Contains(href, entry) {
						goto nextToken
					}
				}
				for _, entry := range fileNameBlackList {
					if strings.Contains(linkText, entry) {
						goto nextToken
					}
				}

				links = append(links, href)
			}
		}

	nextToken:
		continue
	}

	atomic.AddInt64(&visited, 1)
	return
}

// fileInfo issues a HEAD request and fills in the file's metadata
// from the response headers.
func fileInfo(u url.URL, f *File) (err error) {
	f.IsDir = false
	u.Path = path.Clean(u.Path)
	f.Name = path.Base(u.Path)
	f.Path = strings.Trim(u.Path, "/")

	req := fasthttp.AcquireRequest()
	req.Header.SetMethod("HEAD")
	req.SetRequestURI(u.String())

	res := fasthttp.AcquireResponse()
	res.SkipBody = true
	defer fasthttp.ReleaseResponse(res)

	err = client.Do(req, res)
	fasthttp.ReleaseRequest(req)
	if err != nil {
		return
	}

	// TODO Parsing the raw header bytes is inefficient
	header := res.Header.Header()
	f.ParseHeader(header)

	atomic.AddInt64(&visited, 1)
	return nil
}

// ParseHeader scans raw response header bytes with a simple finite
// state machine and applies each key/value pair to the file:
// 0 = reading key, 1 = byte after ':', 2 = skipping the space,
// 3 = reading value, 4 = byte after '\r' (the '\n').
func (f *File) ParseHeader(h []byte) {
	var k1, k2 int
	var v1, v2 int

	state := 0
	for i, b := range h {
		switch state {
		case 0:
			if b == ':' {
				state = 1
				k2 = i
			} else if b == '\n' {
				// A line without a colon (e.g. the status line):
				// the next key starts after it.
				k1 = i + 1
			}
		case 1:
			state = 2
		case 2:
			state = 3
			v1 = i
		case 3:
			if b == '\r' {
				state = 4
			}
		case 4:
			state = 0
			v2 = i - 1

			key := string(h[k1:k2])
			val := string(h[v1:v2])
			k1 = i + 1

			f.applyHeader(key, val)
		}
	}
}

func (f *File) applyHeader(k, v string) {
	// Compare case-insensitively; fasthttp emits canonical header
	// names like "Content-Length".
	switch strings.ToLower(k) {
	case "content-length":
		size, err := strconv.ParseInt(v, 10, 64)
		if err != nil || size < 0 {
			break
		}
		f.Size = size

	case "last-modified":
		var err error
		f.MTime, err = time.Parse(time.RFC1123, v)
		if err == nil {
			break
		}
		f.MTime, err = time.Parse(time.RFC850, v)
		if err == nil {
			break
		}
		// TODO Parse asctime
		if len(v) >= 10 {
			f.MTime, _ = time.Parse("2006-01-02", v[:10])
		}
	}
}

// makeInfinite couples two channels through an unbounded in-memory
// queue, so producers never block on a fixed-size buffer.
func makeInfinite() (chan<- url.URL, <-chan url.URL) {
	in := make(chan url.URL)
	out := make(chan url.URL)
	// Set up in and out queues
	go func() {
		var inQueue []url.URL
		outCh := func() chan url.URL {
			// A nil channel disables its select case while the
			// queue is empty.
			if len(inQueue) == 0 {
				return nil
			}
			return out
		}
		for len(inQueue) > 0 || in != nil {
			if len(inQueue) == 0 {
				v, ok := <-in
				if !ok {
					in = nil
				} else {
					inQueue = append(inQueue, v)
				}
			} else {
				select {
				case v, ok := <-in:
					if !ok {
						in = nil
					} else {
						inQueue = append(inQueue, v)
					}
				case outCh() <- inQueue[0]:
					inQueue = inQueue[1:]
				}
			}
		}
		close(out)
	}()
	return in, out
}

var urlBlackList = [...]string{
	"",
	" ",
	".",
	"..",
	"/",
}

// Sort/navigation links emitted by common directory index pages.
var urlPartBlackList = [...]string{
	"?C=N&O=A",
	"?C=N&O=D",
	"?C=M&O=A",
	"?C=M&O=D",
	"?C=S&O=A",
	"?C=S&O=D",
	"?C=D&O=A",
	"?C=N;O=D",
	"?C=M;O=A",
	"?C=S;O=A",
	"?C=D;O=A",
	"?MA",
	"?SA",
	"?DA",
	"?ND",
	"?M=A",
	"?N=D",
	"?S=A",
	"?D=A",
}

var fileNameBlackList = [...]string{
	"Parent Directory",
	" Parent Directory",
	"../",
}