mirror of https://github.com/terorie/od-database-crawler.git (synced 2025-04-16 08:56:44 +00:00)
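
// This program crawls an open directory listing over HTTP: directory links
// are followed recursively, and file links are probed with HEAD requests to
// collect their size and modification time.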
package main

import (
	"bytes"
	"github.com/sirupsen/logrus"
	"github.com/valyala/fasthttp"
	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
	"net/url"
	"os"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"
)

const (
	nConns = 100
)

var client = fasthttp.Client{}
var wait sync.WaitGroup

// visited counts directory listings fetched so far (updated atomically).
var visited int64

// in feeds URLs into the unbounded queue; out hands them to the workers.
var in chan<- url.URL
var out <-chan url.URL

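// File holds the metadata recorded for a crawled file. Size and MTime are
// filled in by applyHeader.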
type File struct {
	Name  string    `json:"name"`
	Size  int64     `json:"size"`
	MTime time.Time `json:"mtime"`
	Path  string    `json:"path"`
	IsDir bool      `json:"-"`
}

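// main reads the start URL from the command line, starts a once-per-second
// progress logger, and launches nConns crawl workers.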
func main() {
	if len(os.Args) != 2 {
		println("Usage: ./crawl <url>")
		os.Exit(1)
	}

	in, out = makeInfinite()

	// Periodically log crawl progress.
	go func() {
		var visitedLast int64 = 0
		for range time.NewTicker(time.Second).C {
			visitedNow := atomic.LoadInt64(&visited)
			logrus.
				WithField("per_second", visitedNow - visitedLast).
				WithField("total", visitedNow).
				Info("Tick")
			visitedLast = visitedNow
		}
	}()

	base, err := url.Parse(os.Args[1])
	if err != nil {
		logrus.Fatal(err)
	}
	in <- *base
	wait.Add(nConns)
	for i := 0; i < nConns; i++ {
		go worker()
	}
	wait.Wait()
}

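// worker consumes URLs from the queue. URLs whose path ends in "/" are
// treated as directories and their links are queued again; everything else
// is treated as a file and probed with fileInfo.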
func worker() {
	for u := range out {
		if strings.HasSuffix(u.Path, "/") {
			// Dir
			links := listDir(u)
			for _, sub := range links {
				subrefi, err := url.Parse(sub)
				// TODO Print errors
				if err != nil { continue }
				subref := *subrefi
				abs := *u.ResolveReference(&subref)
				in <- abs
			}
		} else {
			// File
			var fil File
			err := fileInfo(u, &fil)
			// TODO Print errors
			if err != nil { continue }
		}
	}
	wait.Done()
}

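// listDir downloads a directory listing, extracts the href of every <a> tag
// and returns the links that survive the blacklists. Each successfully
// fetched listing also bumps the visited counter.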
func listDir(u url.URL) (links []string) {
	req := fasthttp.AcquireRequest()
	req.SetRequestURI(u.String())

	res := fasthttp.AcquireResponse()
	defer fasthttp.ReleaseResponse(res)

	err := client.Do(req, res)
	fasthttp.ReleaseRequest(req)

	if err != nil {
		logrus.Error(err)
		return
	}

	doc := html.NewTokenizer(bytes.NewReader(res.Body()))

	var linkHref string
	var linkTexts []string
	for {
		tokenType := doc.Next()
		token := doc.Token()
		if tokenType == html.ErrorToken {
			break
		}

		switch tokenType {
		case html.StartTagToken:
			if token.DataAtom == atom.A {
				for _, attr := range token.Attr {
					if attr.Key == "href" {
						linkHref = attr.Val
						break
					}
				}
			}

		case html.TextToken:
			if linkHref != "" {
				linkTexts = append(linkTexts, token.Data)
			}

		case html.EndTagToken:
			if linkHref != "" && token.DataAtom == atom.A {
				// Copy params
				href := linkHref
				linkText := strings.Join(linkTexts, " ")

				// Reset params
				linkHref = ""
				linkTexts = nil

				// TODO Optimized decision tree
				for _, entry := range urlBlackList {
					if href == entry {
						goto nextToken
					}
				}
				for _, entry := range urlPartBlackList {
					if strings.Contains(href, entry) {
						goto nextToken
					}
				}
				for _, entry := range fileNameBlackList {
					if strings.Contains(linkText, entry) {
						goto nextToken
					}
				}

				links = append(links, href)
			}
		}

	nextToken:
		// A label needs a statement to attach to; continuing here is the
		// same as falling off the end of the loop body.
		continue
	}

	atomic.AddInt64(&visited, 1)

	return
}

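// fileInfo sends a HEAD request for a file URL and fills f with whatever
// applyHeader can extract from the response headers.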
func fileInfo(u url.URL, f *File) (err error) {
	req := fasthttp.AcquireRequest()
	req.Header.SetMethod("HEAD")
	req.SetRequestURI(u.String())

	res := fasthttp.AcquireResponse()
	res.SkipBody = true
	defer fasthttp.ReleaseResponse(res)

	err = client.Do(req, res)
	fasthttp.ReleaseRequest(req)

	if err != nil { return }

	// TODO Inefficient af
	header := res.Header.Header()
	f.ParseHeader(header)

	return nil
}

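// ParseHeader walks a raw HTTP header block with a small state machine and
// hands each key/value pair to applyHeader. The status line gets glued onto
// the first extracted key, which therefore never matches a known header.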
func (f *File) ParseHeader(h []byte) {
	var k1, k2 int
	var v1, v2 int

	// Simple finite state machine:
	// 0: scanning the key, until ':'
	// 1: skipping the byte after ':' (usually a space)
	// 2: marking the start of the value
	// 3: scanning the value, until '\r'
	// 4: at '\n': emit the pair and start over
	state := 0
	for i, b := range h {
		switch state {
		case 0:
			if b == byte(':') {
				state = 1
				k2 = i
			}

		case 1:
			state = 2

		case 2:
			state = 3
			v1 = i

		case 3:
			if b == byte('\r') {
				state = 4
			}

		case 4:
			state = 0
			v2 = i - 1

			key := string(h[k1:k2])
			val := string(h[v1:v2])
			// The next key starts after this '\n'.
			k1 = i + 1

			f.applyHeader(key, val)
		}
	}
}

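// applyHeader stores the Content-Length and Last-Modified values of a single
// response header in f.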
func (f *File) applyHeader(k, v string) {
	// Keys arrive in canonical form (e.g. "Content-Length"), so normalize
	// them before matching.
	switch strings.ToLower(strings.TrimSpace(k)) {
	case "content-length":
		size, err := strconv.ParseInt(v, 10, 64)
		if err != nil { break }
		if size < 0 { break }
		f.Size = size

	case "last-modified":
		var err error
		f.MTime, err = time.Parse(time.RFC1123, v)
		if err == nil { break }
		f.MTime, err = time.Parse(time.RFC850, v)
		if err == nil { break }
		// TODO Parse asctime
		if len(v) >= 10 {
			// Guard against short values before slicing.
			f.MTime, err = time.Parse("2006-01-02", v[:10])
			if err == nil { break }
		}
	}
}

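// makeInfinite returns the two ends of an unbounded FIFO queue of URLs:
// values sent on the first channel are buffered in memory and later
// delivered on the second, so producers never block on slow consumers.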
func makeInfinite() (chan<- url.URL, <-chan url.URL) {
	in := make(chan url.URL)
	out := make(chan url.URL)
	// Set up in and out queues
	go func() {
		var inQueue []url.URL
		// outCh returns nil while the queue is empty, which disables the
		// send case of the select below.
		outCh := func() chan url.URL {
			if len(inQueue) == 0 {
				return nil
			}
			return out
		}
		for len(inQueue) > 0 || in != nil {
			if len(inQueue) == 0 {
				v, ok := <-in
				if !ok {
					in = nil
				} else {
					inQueue = append(inQueue, v)
				}
			} else {
				select {
				case v, ok := <-in:
					if !ok {
						in = nil
					} else {
						inQueue = append(inQueue, v)
					}
				case outCh() <- inQueue[0]:
					inQueue = inQueue[1:]
				}
			}
		}
		close(out)
	}()
	return in, out
}

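// urlBlackList holds href values that never point at new content
// (empty, self and parent references).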
var urlBlackList = [...]string{
	"",
	" ",
	".",
	"..",
	"/",
}

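// urlPartBlackList filters the column-sort links (?C=N&O=D and friends)
// that autoindex pages such as Apache mod_autoindex append to listings.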
var urlPartBlackList = [...]string{
	"?C=N&O=D",
	"?C=M&O=A",
	"?C=S&O=A",
	"?C=D&O=A",
	"?C=N;O=D",
	"?C=M;O=A",
	"?C=M&O=D",
	"?C=S;O=A",
	"?C=S&O=D",
	"?C=D;O=A",
	"?MA",
	"?SA",
	"?DA",
	"?ND",
	"?C=N&O=A",
	"?M=A",
	"?N=D",
	"?S=A",
	"?D=A",
}

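// fileNameBlackList filters links by their visible text, chiefly the
// "Parent Directory" entry of autoindex pages.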
var fileNameBlackList = [...]string{
	"Parent Directory",
	" Parent Directory",
	"../",
}