mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-18 18:06:45 +00:00
Simple queue crawler
This commit is contained in:
parent
dc816146cc
commit
f2d2b620fa
185
crawl.go
Normal file
185
crawl.go
Normal file
@ -0,0 +1,185 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/sirupsen/logrus"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
)
|
||||
|
||||
// nConns is the number of crawl workers that main starts.
const nConns = 100

var (
	// client is the HTTP client used for every directory listing request.
	// It carries an overall timeout (http.DefaultClient has none) so that a
	// server which stops responding cannot block a worker forever.
	client = &http.Client{Timeout: 60 * time.Second}

	// wait tracks the running workers; main blocks on it after starting them.
	wait sync.WaitGroup

	// visited counts the directory listings requested so far. It is read and
	// written with the sync/atomic functions only.
	visited int64

	// in and out are the two ends of the unbounded URL queue returned by
	// makeInfinite: discovered URLs are sent to in, workers receive from out.
	in  chan<- url.URL
	out <-chan url.URL
)
|
||||
|
||||
func main() {
|
||||
if len(os.Args) != 2 {
|
||||
println("Usage: ./crawl <url>")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
in, out = makeInfinite()
|
||||
|
||||
go func() {
|
||||
var visitedLast int64 = 0
|
||||
for range time.NewTicker(time.Second).C {
|
||||
visitedNow := atomic.LoadInt64(&visited)
|
||||
logrus.
|
||||
WithField("per_second", visitedNow - visitedLast).
|
||||
WithField("total", visitedNow).
|
||||
Info("Tick")
|
||||
visitedLast = visitedNow
|
||||
}
|
||||
}()
|
||||
|
||||
base, _ := url.Parse(os.Args[1])
|
||||
in <- *base
|
||||
wait.Add(nConns)
|
||||
for i := 0; i < nConns; i++ {
|
||||
go worker()
|
||||
}
|
||||
wait.Wait()
|
||||
}
|
||||
|
||||
func worker() {
|
||||
for u := range out {
|
||||
if strings.HasSuffix(u.Path, "/") {
|
||||
// Dir
|
||||
links := listDir(u)
|
||||
for _, sub := range links {
|
||||
subrefi, err := url.Parse(sub)
|
||||
subref := *subrefi
|
||||
if err != nil { continue }
|
||||
|
||||
in <- *u.ResolveReference(&subref)
|
||||
}
|
||||
} else {
|
||||
// File
|
||||
// TODO check file
|
||||
}
|
||||
}
|
||||
wait.Done()
|
||||
}
|
||||
|
||||
func listDir(u url.URL) (links []string) {
|
||||
//logrus.Infof("Visiting %s", u)
|
||||
atomic.AddInt64(&visited, 1)
|
||||
|
||||
res, err := client.Get(u.String())
|
||||
if err != nil {
|
||||
logrus.Error(err)
|
||||
return
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(res.Body)
|
||||
if err != nil {
|
||||
logrus.Error(err)
|
||||
return
|
||||
}
|
||||
|
||||
doc.Find("a[href]").Each(func(_ int, s *goquery.Selection) {
|
||||
href, _ := s.Attr("href")
|
||||
text := s.Text()
|
||||
|
||||
if href == "." {
|
||||
return
|
||||
}
|
||||
|
||||
for _, entry := range blackList {
|
||||
if strings.Contains(href, entry) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
for _, entry := range fileNameBlackList {
|
||||
if strings.Contains(text, entry) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
links = append(links, href)
|
||||
})
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// makeInfinite creates an unbounded FIFO queue of URLs and returns its send
// end and its receive end. A background goroutine buffers everything sent to
// the first channel in a slice, so senders only wait for that goroutine and
// never for a consumer. Values come out of the second channel in the order
// they were sent. After the send end is closed, the remaining buffered
// values are delivered and then the receive end is closed.
func makeInfinite() (chan<- url.URL, <-chan url.URL) {
	input := make(chan url.URL)
	output := make(chan url.URL)

	go func() {
		defer close(output)

		var pending []url.URL
		// Keep going while more input may arrive or buffered items remain.
		for input != nil || len(pending) > 0 {
			// A send on a nil channel is never selected, so the send case
			// below is only live while there is something to hand out.
			var (
				send chan url.URL
				head url.URL
			)
			if len(pending) > 0 {
				send, head = output, pending[0]
			}

			select {
			case item, open := <-input:
				if open {
					pending = append(pending, item)
				} else {
					// Input closed: disable this case and drain the buffer.
					input = nil
				}
			case send <- head:
				pending = pending[1:]
			}
		}
	}()

	return input, output
}
|
||||
|
||||
// blackList holds href substrings that listDir refuses to follow: a link is
// dropped when its href contains any of these entries.
// NOTE(review): these look like the column-sort query strings that
// auto-generated index pages attach to their header links — confirm before
// extending the list.
var blackList = [...]string{
	"?C=N&O=D",
	"?C=M&O=A",
	"?C=S&O=A",
	"?C=D&O=A",
	"?C=N;O=D",
	"?C=M;O=A",
	"?C=M&O=D",
	"?C=S;O=A",
	"?C=S&O=D",
	"?C=D;O=A",
	"?MA",
	"?SA",
	"?DA",
	"?ND",
	"?C=N&O=A",
	"?M=A",
	"?N=D",
	"?S=A",
	"?D=A",
}
|
||||
|
||||
// fileNameBlackList holds link-text substrings that listDir refuses to
// follow: an anchor is dropped when its visible text contains any of these
// entries.
var fileNameBlackList = [3]string{
	"Parent Directory",
	" Parent Directory",
	"../",
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user