mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-18 18:06:45 +00:00
File HEAD requests
This commit is contained in:
parent
2844d344ec
commit
d748be72cd
82
crawl.go
82
crawl.go
@ -1,12 +1,14 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"github.com/sirupsen/logrus"
|
||||
"github.com/valyala/fasthttp"
|
||||
"golang.org/x/net/html"
|
||||
"golang.org/x/net/html/atom"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
@ -17,7 +19,7 @@ const (
|
||||
nConns = 100
|
||||
)
|
||||
|
||||
var client = http.DefaultClient
|
||||
var client = fasthttp.Client{}
|
||||
var wait sync.WaitGroup
|
||||
|
||||
var visited int64
|
||||
@ -25,6 +27,14 @@ var visited int64
|
||||
var in chan<- url.URL
|
||||
var out <-chan url.URL
|
||||
|
||||
type File struct {
|
||||
Name string `json:"name"`
|
||||
Size int64 `json:"size"`
|
||||
MTime time.Time `json:"mtime"`
|
||||
Path string `json:"path"`
|
||||
IsDir bool `json:"-"`
|
||||
}
|
||||
|
||||
func main() {
|
||||
if len(os.Args) != 2 {
|
||||
println("Usage: ./crawl <url>")
|
||||
@ -62,27 +72,38 @@ func worker() {
|
||||
for _, sub := range links {
|
||||
subrefi, err := url.Parse(sub)
|
||||
subref := *subrefi
|
||||
// TODO Print errors
|
||||
if err != nil { continue }
|
||||
abs := *u.ResolveReference(&subref)
|
||||
in <- abs
|
||||
}
|
||||
} else {
|
||||
// File
|
||||
// TODO check file
|
||||
var fil File
|
||||
err := fileInfo(u, &fil)
|
||||
// TODO Print errors
|
||||
if err != nil { continue }
|
||||
}
|
||||
}
|
||||
wait.Done()
|
||||
}
|
||||
|
||||
func listDir(u url.URL) (links []string) {
|
||||
res, err := client.Get(u.String())
|
||||
req := fasthttp.AcquireRequest()
|
||||
req.SetRequestURI(u.String())
|
||||
|
||||
res := fasthttp.AcquireResponse()
|
||||
defer fasthttp.ReleaseResponse(res)
|
||||
|
||||
err := client.Do(req, res)
|
||||
fasthttp.ReleaseRequest(req)
|
||||
|
||||
if err != nil {
|
||||
logrus.Error(err)
|
||||
return
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
doc := html.NewTokenizer(res.Body)
|
||||
doc := html.NewTokenizer(bytes.NewReader(res.Body()))
|
||||
|
||||
var linkHref string
|
||||
var linkTexts []string
|
||||
@ -148,6 +169,55 @@ func listDir(u url.URL) (links []string) {
|
||||
return
|
||||
}
|
||||
|
||||
func fileInfo(u url.URL, f *File) (err error) {
|
||||
req := fasthttp.AcquireRequest()
|
||||
req.Header.SetMethod("HEAD")
|
||||
req.SetRequestURI(u.String())
|
||||
|
||||
res := fasthttp.AcquireResponse()
|
||||
res.SkipBody = true
|
||||
defer fasthttp.ReleaseResponse(res)
|
||||
|
||||
err = client.Do(req, res)
|
||||
fasthttp.ReleaseRequest(req)
|
||||
|
||||
if err != nil { return }
|
||||
|
||||
// TODO Inefficient af
|
||||
header := res.Header.String()
|
||||
|
||||
for _, line := range strings.Split(header, "\r\n") {
|
||||
if line == "" { continue }
|
||||
if strings.HasPrefix(line, "HTTP/1") { continue }
|
||||
|
||||
parts := strings.SplitN(line, ": ", 2)
|
||||
if len(parts) != 2 { continue }
|
||||
|
||||
k, v := parts[0], parts[1]
|
||||
k = strings.ToLower(k)
|
||||
|
||||
switch k {
|
||||
case "content-length":
|
||||
size, err := strconv.ParseInt(v, 10, 64)
|
||||
if err != nil { break }
|
||||
if size < 0 { break }
|
||||
f.Size = size
|
||||
|
||||
case "last-modified":
|
||||
var err error
|
||||
f.MTime, err = time.Parse(time.RFC1123, v)
|
||||
if err == nil { break }
|
||||
f.MTime, err = time.Parse(time.RFC850, v)
|
||||
if err == nil { break }
|
||||
// TODO Parse asctime
|
||||
f.MTime, err = time.Parse("2006-01-02", v[:10])
|
||||
if err == nil { break }
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func makeInfinite() (chan<- url.URL, <-chan url.URL) {
|
||||
in := make(chan url.URL)
|
||||
out := make(chan url.URL)
|
||||
|
Loading…
x
Reference in New Issue
Block a user