mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-19 10:26:43 +00:00
More logs
This commit is contained in:
parent
76c8c13d49
commit
3fb4d4bde9
40
crawl.go
40
crawl.go
@ -8,6 +8,7 @@ import (
|
|||||||
"golang.org/x/net/html/atom"
|
"golang.org/x/net/html/atom"
|
||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
|
"path"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@ -66,36 +67,54 @@ func main() {
|
|||||||
|
|
||||||
func worker() {
|
func worker() {
|
||||||
for u := range out {
|
for u := range out {
|
||||||
|
// File
|
||||||
|
var fil File
|
||||||
if strings.HasSuffix(u.Path, "/") {
|
if strings.HasSuffix(u.Path, "/") {
|
||||||
// Dir
|
// Dir
|
||||||
links := listDir(u)
|
links, err := listDir(u, &fil)
|
||||||
|
if err != nil {
|
||||||
|
logrus.WithError(err).
|
||||||
|
WithField("url", u.String()).
|
||||||
|
Error("Failed getting dir")
|
||||||
|
continue
|
||||||
|
}
|
||||||
for _, sub := range links {
|
for _, sub := range links {
|
||||||
subrefi, err := url.Parse(sub)
|
subrefi, err := url.Parse(sub)
|
||||||
subref := *subrefi
|
subref := *subrefi
|
||||||
// TODO Print errors
|
// TODO Print errors
|
||||||
if err != nil { continue }
|
if err != nil { continue }
|
||||||
abs := *u.ResolveReference(&subref)
|
abs := *u.ResolveReference(&subref)
|
||||||
|
// TODO Check if host changed
|
||||||
in <- abs
|
in <- abs
|
||||||
}
|
}
|
||||||
|
//logrus.Infof("LISTED %s", u.Path)
|
||||||
} else {
|
} else {
|
||||||
// File
|
|
||||||
var fil File
|
|
||||||
err := fileInfo(u, &fil)
|
err := fileInfo(u, &fil)
|
||||||
// TODO Print errors
|
if err != nil {
|
||||||
if err != nil { continue }
|
logrus.WithError(err).
|
||||||
|
WithField("url", u.String()).
|
||||||
|
Error("Failed getting file")
|
||||||
|
continue
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
wait.Done()
|
wait.Done()
|
||||||
}
|
}
|
||||||
|
|
||||||
func listDir(u url.URL) (links []string) {
|
func listDir(u url.URL, f *File) (links []string, err error) {
|
||||||
|
f.IsDir = true
|
||||||
|
u.Path = path.Clean(u.Path)
|
||||||
|
// TODO Handle external links
|
||||||
|
f.Name = path.Base(u.Path)
|
||||||
|
f.Path = strings.TrimLeft(u.Path, "/")
|
||||||
|
|
||||||
req := fasthttp.AcquireRequest()
|
req := fasthttp.AcquireRequest()
|
||||||
req.SetRequestURI(u.String())
|
req.SetRequestURI(u.String())
|
||||||
|
|
||||||
res := fasthttp.AcquireResponse()
|
res := fasthttp.AcquireResponse()
|
||||||
defer fasthttp.ReleaseResponse(res)
|
defer fasthttp.ReleaseResponse(res)
|
||||||
|
|
||||||
err := client.Do(req, res)
|
err = client.Do(req, res)
|
||||||
fasthttp.ReleaseRequest(req)
|
fasthttp.ReleaseRequest(req)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -170,6 +189,11 @@ func listDir(u url.URL) (links []string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func fileInfo(u url.URL, f *File) (err error) {
|
func fileInfo(u url.URL, f *File) (err error) {
|
||||||
|
f.IsDir = false
|
||||||
|
u.Path = path.Clean(u.Path)
|
||||||
|
f.Name = path.Base(u.Path)
|
||||||
|
f.Path = strings.Trim(u.Path, "/")
|
||||||
|
|
||||||
req := fasthttp.AcquireRequest()
|
req := fasthttp.AcquireRequest()
|
||||||
req.Header.SetMethod("HEAD")
|
req.Header.SetMethod("HEAD")
|
||||||
req.SetRequestURI(u.String())
|
req.SetRequestURI(u.String())
|
||||||
@ -187,6 +211,8 @@ func fileInfo(u url.URL, f *File) (err error) {
|
|||||||
header := res.Header.Header()
|
header := res.Header.Header()
|
||||||
f.ParseHeader(header)
|
f.ParseHeader(header)
|
||||||
|
|
||||||
|
atomic.AddInt64(&visited, 1)
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user