mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-04 06:52:59 +00:00
228 lines
4.1 KiB
Go
228 lines
4.1 KiB
Go
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"crypto/tls"
|
|
"github.com/terorie/od-database-crawler/ds/redblackhash"
|
|
"github.com/terorie/od-database-crawler/fasturl"
|
|
"github.com/valyala/fasthttp"
|
|
"golang.org/x/crypto/blake2b"
|
|
"golang.org/x/net/html"
|
|
"net"
|
|
"path"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
var client = fasthttp.Client {
|
|
TLSConfig: &tls.Config{
|
|
InsecureSkipVerify: true,
|
|
},
|
|
}
|
|
|
|
func setDialTimeout(d time.Duration) {
|
|
client.Dial = func(addr string) (net.Conn, error) {
|
|
return fasthttp.DialTimeout(addr, d)
|
|
}
|
|
}
|
|
|
|
func setTimeout(d time.Duration) {
|
|
client.ReadTimeout = d
|
|
client.WriteTimeout = d / 2
|
|
}
|
|
|
|
func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
|
|
f.IsDir = true
|
|
f.Name = path.Base(j.Uri.Path)
|
|
|
|
req := fasthttp.AcquireRequest()
|
|
if config.UserAgent != "" {
|
|
req.Header.SetUserAgent(config.UserAgent)
|
|
}
|
|
req.SetRequestURI(j.UriStr)
|
|
|
|
res := fasthttp.AcquireResponse()
|
|
defer fasthttp.ReleaseResponse(res)
|
|
|
|
err = client.Do(req, res)
|
|
fasthttp.ReleaseRequest(req)
|
|
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
err = checkStatusCode(res.StatusCode())
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
body := res.Body()
|
|
return ParseDir(body, &j.Uri)
|
|
}
|
|
|
|
func ParseDir(body []byte, baseUri *fasturl.URL) (links []fasturl.URL, err error) {
|
|
doc := html.NewTokenizer(bytes.NewReader(body))
|
|
|
|
var linkHref string
|
|
for {
|
|
err = nil
|
|
|
|
tokenType := doc.Next()
|
|
if tokenType == html.ErrorToken {
|
|
break
|
|
}
|
|
|
|
switch tokenType {
|
|
case html.StartTagToken:
|
|
name, hasAttr := doc.TagName()
|
|
if len(name) == 1 && name[0] == 'a' {
|
|
for hasAttr {
|
|
var ks, vs []byte
|
|
ks, vs, hasAttr = doc.TagAttr()
|
|
if bytes.Equal(ks, []byte("href")) {
|
|
// TODO Check escape
|
|
linkHref = string(vs)
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
case html.EndTagToken:
|
|
name, _ := doc.TagName()
|
|
if len(name) == 1 && name[0] == 'a' {
|
|
// Copy params
|
|
href := linkHref
|
|
|
|
// Reset params
|
|
linkHref = ""
|
|
|
|
if strings.LastIndexByte(href, '?') != -1 {
|
|
continue
|
|
}
|
|
|
|
switch href {
|
|
case "", " ", ".", "..", "/":
|
|
continue
|
|
}
|
|
|
|
if strings.Contains(href, "../") {
|
|
continue
|
|
}
|
|
|
|
var link fasturl.URL
|
|
err = baseUri.ParseRel(&link, href)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
if link.Scheme != baseUri.Scheme ||
|
|
link.Host != baseUri.Host ||
|
|
link.Path == baseUri.Path ||
|
|
!strings.HasPrefix(link.Path, baseUri.Path) {
|
|
continue
|
|
}
|
|
|
|
links = append(links, link)
|
|
}
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
func GetFile(u fasturl.URL, f *File) (err error) {
|
|
f.IsDir = false
|
|
u.Path = path.Clean(u.Path)
|
|
f.Name = path.Base(u.Path)
|
|
f.Path = strings.Trim(path.Dir(u.Path), "/")
|
|
|
|
req := fasthttp.AcquireRequest()
|
|
req.Header.SetMethod("HEAD")
|
|
if config.UserAgent != "" {
|
|
req.Header.SetUserAgent(config.UserAgent)
|
|
}
|
|
req.SetRequestURI(u.String())
|
|
|
|
res := fasthttp.AcquireResponse()
|
|
res.SkipBody = true
|
|
defer fasthttp.ReleaseResponse(res)
|
|
|
|
err = client.Do(req, res)
|
|
fasthttp.ReleaseRequest(req)
|
|
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
err = checkStatusCode(res.StatusCode())
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
f.applyContentLength(string(res.Header.Peek("content-length")))
|
|
f.applyLastModified(string(res.Header.Peek("last-modified")))
|
|
|
|
return nil
|
|
}
|
|
|
|
func (f *File) HashDir(links []fasturl.URL) (o redblackhash.Key) {
|
|
h, _ := blake2b.New256(nil)
|
|
h.Write([]byte(f.Name))
|
|
for _, link := range links {
|
|
fileName := path.Base(link.Path)
|
|
h.Write([]byte(fileName))
|
|
}
|
|
sum := h.Sum(nil)
|
|
copy(o[:redblackhash.KeySize], sum)
|
|
return
|
|
}
|
|
|
|
func (f *File) applyContentLength(v string) {
|
|
if v == "" {
|
|
return
|
|
}
|
|
size, err := strconv.ParseInt(v, 10, 64)
|
|
if err != nil {
|
|
return
|
|
}
|
|
if size < 0 {
|
|
return
|
|
}
|
|
f.Size = size
|
|
}
|
|
|
|
// TODO Cleanup
|
|
func (f *File) applyLastModified(v string) {
|
|
if v == "" {
|
|
return
|
|
}
|
|
var t time.Time
|
|
var err error
|
|
t, err = time.Parse(time.RFC1123, v)
|
|
if err == nil {
|
|
f.MTime = t.Unix()
|
|
return
|
|
}
|
|
t, err = time.Parse(time.RFC850, v)
|
|
if err == nil {
|
|
f.MTime = t.Unix()
|
|
return
|
|
}
|
|
// TODO Parse asctime
|
|
t, err = time.Parse("2006-01-02", v[:10])
|
|
if err == nil {
|
|
f.MTime = t.Unix()
|
|
return
|
|
}
|
|
}
|
|
|
|
func checkStatusCode(status int) error {
|
|
switch status {
|
|
case fasthttp.StatusOK:
|
|
return nil
|
|
default:
|
|
return &HttpError{status}
|
|
}
|
|
}
|