2018-12-18 15:31:09 +01:00

228 lines
4.1 KiB
Go

package main
import (
"bytes"
"crypto/tls"
"github.com/terorie/od-database-crawler/ds/redblackhash"
"github.com/terorie/od-database-crawler/fasturl"
"github.com/valyala/fasthttp"
"golang.org/x/crypto/blake2b"
"golang.org/x/net/html"
"net"
"path"
"strconv"
"strings"
"time"
)
// client is the package-wide fasthttp client used by GetDir and GetFile.
// TLS certificate verification is switched off (InsecureSkipVerify), so
// HTTPS hosts with invalid or self-signed certificates are still accepted.
var client = fasthttp.Client{
	TLSConfig: &tls.Config{InsecureSkipVerify: true},
}
func setDialTimeout(d time.Duration) {
client.Dial = func(addr string) (net.Conn, error) {
return fasthttp.DialTimeout(addr, d)
}
}
// setTimeout configures the shared client's I/O timeouts: the read timeout
// is set to d and the write timeout to half of d.
func setTimeout(d time.Duration) {
	client.ReadTimeout, client.WriteTimeout = d, d/2
}
// GetDir requests the URL held in j.UriStr and returns the links that
// ParseDir extracts from the response body.
//
// f is marked as a directory and named after the last element of j.Uri.Path.
// The configured user agent is sent when it is non-empty. A transport error
// or any status other than 200 OK is returned as err with nil links.
func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
	f.IsDir = true
	f.Name = path.Base(j.Uri.Path)

	request := fasthttp.AcquireRequest()
	if ua := config.UserAgent; ua != "" {
		request.Header.SetUserAgent(ua)
	}
	request.SetRequestURI(j.UriStr)

	response := fasthttp.AcquireResponse()
	defer fasthttp.ReleaseResponse(response)

	// The request object is handed back as soon as the round trip is over,
	// regardless of its outcome.
	err = client.Do(request, response)
	fasthttp.ReleaseRequest(request)

	if err == nil {
		err = checkStatusCode(response.StatusCode())
	}
	if err != nil {
		return nil, err
	}

	return ParseDir(response.Body(), &j.Uri)
}
// ParseDir tokenizes the HTML in body and returns the targets of its <a>
// elements that resolve to locations below baseUri.
//
// The href of an anchor is remembered at its opening tag and evaluated when
// a closing </a> is seen. An anchor is dropped when its href
//   - contains a '?',
//   - is one of "", " ", ".", ".." or "/",
//   - contains "../",
//   - cannot be resolved by baseUri.ParseRel, or
//   - resolves to a different scheme or host, to baseUri.Path itself, or to
//     a path that does not start with baseUri.Path.
//
// Tokenizing stops at the first html.ErrorToken. The returned error is
// always nil.
func ParseDir(body []byte, baseUri *fasturl.URL) (links []fasturl.URL, err error) {
	tokenizer := html.NewTokenizer(bytes.NewReader(body))

	// href of the most recently opened anchor; cleared at every </a>.
	var pendingHref string

	for {
		kind := tokenizer.Next()
		if kind == html.ErrorToken {
			return links, nil
		}
		if kind != html.StartTagToken && kind != html.EndTagToken {
			continue
		}

		tag, moreAttrs := tokenizer.TagName()
		if string(tag) != "a" {
			continue
		}

		if kind == html.StartTagToken {
			// Remember the first href attribute of this anchor.
			for moreAttrs {
				var key, val []byte
				key, val, moreAttrs = tokenizer.TagAttr()
				if string(key) == "href" {
					// TODO Check escape
					pendingHref = string(val)
					break
				}
			}
			continue
		}

		// Closing </a>: consume the remembered href.
		href := pendingHref
		pendingHref = ""

		if strings.IndexByte(href, '?') >= 0 {
			continue
		}
		if href == "" || href == " " || href == "." || href == ".." || href == "/" {
			continue
		}
		if strings.Contains(href, "../") {
			continue
		}

		var resolved fasturl.URL
		if parseErr := baseUri.ParseRel(&resolved, href); parseErr != nil {
			continue
		}

		sameOrigin := resolved.Scheme == baseUri.Scheme && resolved.Host == baseUri.Host
		strictlyBelow := resolved.Path != baseUri.Path &&
			strings.HasPrefix(resolved.Path, baseUri.Path)
		if !sameOrigin || !strictlyBelow {
			continue
		}

		links = append(links, resolved)
	}
}
// GetFile issues a HEAD request for u and fills f from the response headers.
//
// u is received by value and its path is cleaned with path.Clean before use.
// f is marked as a non-directory; f.Name becomes the last path element and
// f.Path the parent directory with surrounding slashes trimmed. After a
// 200 OK response, the content-length and last-modified headers are applied
// to f. A transport error or any other status is returned as err.
func GetFile(u fasturl.URL, f *File) (err error) {
	f.IsDir = false

	cleaned := path.Clean(u.Path)
	u.Path = cleaned
	f.Name = path.Base(cleaned)
	f.Path = strings.Trim(path.Dir(cleaned), "/")

	request := fasthttp.AcquireRequest()
	request.Header.SetMethod("HEAD")
	if ua := config.UserAgent; ua != "" {
		request.Header.SetUserAgent(ua)
	}
	request.SetRequestURI(u.String())

	response := fasthttp.AcquireResponse()
	response.SkipBody = true
	defer fasthttp.ReleaseResponse(response)

	// The request object is handed back as soon as the round trip is over,
	// regardless of its outcome.
	err = client.Do(request, response)
	fasthttp.ReleaseRequest(request)

	if err == nil {
		err = checkStatusCode(response.StatusCode())
	}
	if err != nil {
		return err
	}

	contentLength := string(response.Header.Peek("content-length"))
	lastModified := string(response.Header.Peek("last-modified"))
	f.applyContentLength(contentLength)
	f.applyLastModified(lastModified)
	return nil
}
// HashDir computes a BLAKE2b-256 digest over f.Name followed by the base
// name of every link path, in slice order, and returns it as a
// redblackhash.Key. The names are written back to back without a separator.
func (f *File) HashDir(links []fasturl.URL) (o redblackhash.Key) {
	// New256 is called with a nil key; its error result is discarded.
	hasher, _ := blake2b.New256(nil)

	hasher.Write([]byte(f.Name))
	for i := range links {
		hasher.Write([]byte(path.Base(links[i].Path)))
	}

	copy(o[:redblackhash.KeySize], hasher.Sum(nil))
	return o
}
// applyContentLength stores v in f.Size when v is a base-10 integer that
// fits in an int64 and is not negative. Any other value, including the
// empty string, leaves f.Size untouched.
func (f *File) applyContentLength(v string) {
	// ParseInt rejects the empty string, so no separate emptiness check is
	// required.
	if n, err := strconv.ParseInt(v, 10, 64); err == nil && n >= 0 {
		f.Size = n
	}
}
// applyLastModified parses a Last-Modified header value and, on success,
// stores it in f.MTime as Unix seconds.
//
// The following layouts are tried in order, and the first match wins:
//   - time.RFC1123 ("Mon, 02 Jan 2006 15:04:05 MST")
//   - time.RFC850  ("Monday, 02-Jan-06 15:04:05 MST")
//   - time.ANSIC   ("Mon Jan _2 15:04:05 2006", the asctime form)
//   - a bare "2006-01-02" date taken from the first ten bytes of v
//
// An empty or unparsable value leaves f.MTime untouched.
func (f *File) applyLastModified(v string) {
	if v == "" {
		return
	}

	for _, layout := range [...]string{time.RFC1123, time.RFC850, time.ANSIC} {
		if t, err := time.Parse(layout, v); err == nil {
			f.MTime = t.Unix()
			return
		}
	}

	// Date-only fallback. The length guard matters: slicing a value shorter
	// than the layout would panic with an out-of-range index.
	const dateLayout = "2006-01-02"
	if len(v) < len(dateLayout) {
		return
	}
	if t, err := time.Parse(dateLayout, v[:len(dateLayout)]); err == nil {
		f.MTime = t.Unix()
	}
}
// checkStatusCode returns nil for a 200 OK status and an *HttpError
// carrying the status code for anything else.
func checkStatusCode(status int) error {
	if status == fasthttp.StatusOK {
		return nil
	}
	return &HttpError{status}
}