diff --git a/crawl.go b/crawl.go index 0e999e1..805449a 100644 --- a/crawl.go +++ b/crawl.go @@ -2,14 +2,14 @@ package main import ( "bytes" - "encoding/base64" "fmt" "github.com/sirupsen/logrus" + "github.com/terorie/oddb-go/ds/redblackhash" + "github.com/terorie/oddb-go/fasturl" "github.com/valyala/fasthttp" "golang.org/x/crypto/blake2b" "golang.org/x/net/html" "golang.org/x/net/html/atom" - "net/url" "path" "strconv" "strings" @@ -18,12 +18,12 @@ import ( var client fasthttp.Client -func GetDir(j *Job, f *File) (links []url.URL, err error) { +func GetDir(j *Job, f *File) (links []fasturl.URL, err error) { f.IsDir = true f.Name = path.Base(j.Uri.Path) req := fasthttp.AcquireRequest() - req.SetRequestURI(j.Uri.String()) + req.SetRequestURI(j.UriStr) res := fasthttp.AcquireResponse() defer fasthttp.ReleaseResponse(res) @@ -94,11 +94,10 @@ func GetDir(j *Job, f *File) (links []url.URL, err error) { } } - subref, err := url.Parse(href) + var link fasturl.URL + err = j.Uri.ParseRel(&link, href) if err != nil { continue } - link := *j.Uri.ResolveReference(subref) - if link.Scheme != j.Uri.Scheme || link.Host != j.Uri.Host || link.Path == j.Uri.Path || @@ -116,7 +115,7 @@ func GetDir(j *Job, f *File) (links []url.URL, err error) { return } -func GetFile(u url.URL, f *File) (err error) { +func GetFile(u fasturl.URL, f *File) (err error) { f.IsDir = false u.Path = path.Clean(u.Path) f.Name = path.Base(u.Path) @@ -145,7 +144,7 @@ func GetFile(u url.URL, f *File) (err error) { return nil } -func (f *File) HashDir(links []url.URL) string { +func (f *File) HashDir(links []fasturl.URL) (o redblackhash.Key) { h, _ := blake2b.New256(nil) h.Write([]byte(f.Name)) for _, link := range links { @@ -153,8 +152,8 @@ func (f *File) HashDir(links []url.URL) string { h.Write([]byte(fileName)) } sum := h.Sum(nil) - b64sum := base64.StdEncoding.EncodeToString(sum) - return b64sum + copy(o[:redblackhash.KeySize], sum) + return } func (f *File) ParseHeader(h []byte) { diff --git 
a/ds/redblackhash/redblack.go b/ds/redblackhash/redblack.go new file mode 100644 index 0000000..921ee12 --- /dev/null +++ b/ds/redblackhash/redblack.go @@ -0,0 +1,521 @@ +// Copyright (c) 2015, Emir Pasic. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Modifications by terorie + +// Package redblackhash implements a red-black tree. +// +// Used by TreeSet and TreeMap. +// +// Structure is not thread safe. +// +// References: http://en.wikipedia.org/wiki/Red%E2%80%93black_tree +package redblackhash + +import ( + "fmt" +) + +const ( + black, red color = true, false + KeySize = 64 +) + +type color bool +type Key [KeySize]byte + +// Tree holds elements of the red-black tree +type Tree struct { + Root *Node + size int +} + +// Node is a single element within the tree +type Node struct { + Key Key + color color + Left *Node + Right *Node + Parent *Node +} + +func (k *Key) Compare(o *Key) int { + // TODO Assembly + /*for i := 0; i < KeySize / 8; i++ { + a := uint64(k[i+0] ) | + uint64(k[i+1] >> 8) | + uint64(k[i+2] >> 16) | + uint64(k[i+3] >> 24) | + uint64(k[i+4] >> 32) | + uint64(k[i+5] >> 40) | + uint64(k[i+6] >> 48) | + uint64(k[i+7] >> 56) + + b := uint64(o[i+0] ) | + uint64(o[i+1] >> 8) | + uint64(o[i+2] >> 16) | + uint64(o[i+3] >> 24) | + uint64(o[i+4] >> 32) | + uint64(o[i+5] >> 40) | + uint64(o[i+6] >> 48) | + uint64(o[i+7] >> 56) + + switch { + case a < b: + return -1 + case a > b: + return 1 + } + }*/ + for i := 0; i < KeySize; i++ { + switch { + case k[i] < o[i]: + return -1 + case k[i] > o[i]: + return 1 + } + } + return 0 +} + +// Put inserts key into the tree. +// If an equal key is already present it is overwritten and the size does not change. 
+func (tree *Tree) Put(key *Key) { + var insertedNode *Node + if tree.Root == nil { + // Empty tree: the new node becomes the root + tree.Root = &Node{Key: *key, color: red} + insertedNode = tree.Root + } else { + node := tree.Root + loop := true + for loop { + compare := key.Compare(&node.Key) + switch { + case compare == 0: + node.Key = *key + return + case compare < 0: + if node.Left == nil { + node.Left = &Node{Key: *key, color: red} + insertedNode = node.Left + loop = false + } else { + node = node.Left + } + case compare > 0: + if node.Right == nil { + node.Right = &Node{Key: *key, color: red} + insertedNode = node.Right + loop = false + } else { + node = node.Right + } + } + } + insertedNode.Parent = node + } + tree.insertCase1(insertedNode) + tree.size++ +} + +// Get searches the tree for the given key. +// The return value is true if the key was found, otherwise false. +// Keys are matched using Key.Compare. +func (tree *Tree) Get(key *Key) (found bool) { + node := tree.lookup(key) + return node != nil +} + +// Remove removes the node with the given key from the tree. +// It does nothing if the key is not present. 
+func (tree *Tree) Remove(key *Key) { + var child *Node + node := tree.lookup(key) + if node == nil { + return + } + if node.Left != nil && node.Right != nil { + pred := node.Left.maximumNode() + node.Key = pred.Key + node = pred + } + if node.Left == nil || node.Right == nil { + if node.Right == nil { + child = node.Left + } else { + child = node.Right + } + if node.color == black { + node.color = nodeColor(child) + tree.deleteCase1(node) + } + tree.replaceNode(node, child) + if node.Parent == nil && child != nil { + child.color = black + } + } + tree.size-- +} + +// Empty returns true if tree does not contain any nodes +func (tree *Tree) Empty() bool { + return tree.size == 0 +} + +// Size returns number of nodes in the tree. +func (tree *Tree) Size() int { + return tree.size +} + +// Left returns the left-most (min) node or nil if tree is empty. +func (tree *Tree) Left() *Node { + var parent *Node + current := tree.Root + for current != nil { + parent = current + current = current.Left + } + return parent +} + +// Right returns the right-most (max) node or nil if tree is empty. +func (tree *Tree) Right() *Node { + var parent *Node + current := tree.Root + for current != nil { + parent = current + current = current.Right + } + return parent +} + +// Floor finds the floor node of the input key and returns it, or nil if no floor is found. +// Second return parameter is true if floor was found, otherwise false. +// +// Floor node is defined as the largest node that is smaller than or equal to the given key. +// A floor node may not be found, either because the tree is empty, or because +// all nodes in the tree are larger than the given key. +// +// Keys are ordered using Key.Compare. 
+func (tree *Tree) Floor(key *Key) (floor *Node, found bool) { + found = false + node := tree.Root + for node != nil { + compare := key.Compare(&node.Key) + switch { + case compare == 0: + return node, true + case compare < 0: + node = node.Left + case compare > 0: + floor, found = node, true + node = node.Right + } + } + if found { + return floor, true + } + return nil, false +} + +// Ceiling finds the ceiling node of the input key and returns it, or nil if no ceiling is found. +// Second return parameter is true if ceiling was found, otherwise false. +// +// Ceiling node is defined as the smallest node that is larger than or equal to the given key. +// A ceiling node may not be found, either because the tree is empty, or because +// all nodes in the tree are smaller than the given key. +// +// Keys are ordered using Key.Compare. +func (tree *Tree) Ceiling(key *Key) (ceiling *Node, found bool) { + found = false + node := tree.Root + for node != nil { + compare := key.Compare(&node.Key) + switch { + case compare == 0: + return node, true + case compare < 0: + ceiling, found = node, true + node = node.Left + case compare > 0: + node = node.Right + } + } + if found { + return ceiling, true + } + return nil, false +} + +// Clear removes all nodes from the tree. 
+func (tree *Tree) Clear() { + tree.Root = nil + tree.size = 0 +} + +// String returns a string representation of container +func (tree *Tree) String() string { + str := "RedBlackTree\n" + if !tree.Empty() { + output(tree.Root, "", true, &str) + } + return str +} + +func (node *Node) String() string { + return fmt.Sprintf("%v", node.Key) +} + +func output(node *Node, prefix string, isTail bool, str *string) { + if node.Right != nil { + newPrefix := prefix + if isTail { + newPrefix += "│ " + } else { + newPrefix += " " + } + output(node.Right, newPrefix, false, str) + } + *str += prefix + if isTail { + *str += "└── " + } else { + *str += "┌── " + } + *str += node.String() + "\n" + if node.Left != nil { + newPrefix := prefix + if isTail { + newPrefix += " " + } else { + newPrefix += "│ " + } + output(node.Left, newPrefix, true, str) + } +} + +func (tree *Tree) lookup(key *Key) *Node { + node := tree.Root + for node != nil { + compare := key.Compare(&node.Key) + switch { + case compare == 0: + return node + case compare < 0: + node = node.Left + case compare > 0: + node = node.Right + } + } + return nil +} + +func (node *Node) grandparent() *Node { + if node != nil && node.Parent != nil { + return node.Parent.Parent + } + return nil +} + +func (node *Node) uncle() *Node { + if node == nil || node.Parent == nil || node.Parent.Parent == nil { + return nil + } + return node.Parent.sibling() +} + +func (node *Node) sibling() *Node { + if node == nil || node.Parent == nil { + return nil + } + if node == node.Parent.Left { + return node.Parent.Right + } + return node.Parent.Left +} + +func (tree *Tree) rotateLeft(node *Node) { + right := node.Right + tree.replaceNode(node, right) + node.Right = right.Left + if right.Left != nil { + right.Left.Parent = node + } + right.Left = node + node.Parent = right +} + +func (tree *Tree) rotateRight(node *Node) { + left := node.Left + tree.replaceNode(node, left) + node.Left = left.Right + if left.Right != nil { + left.Right.Parent = 
node + } + left.Right = node + node.Parent = left +} + +func (tree *Tree) replaceNode(old *Node, new *Node) { + if old.Parent == nil { + tree.Root = new + } else { + if old == old.Parent.Left { + old.Parent.Left = new + } else { + old.Parent.Right = new + } + } + if new != nil { + new.Parent = old.Parent + } +} + +func (tree *Tree) insertCase1(node *Node) { + if node.Parent == nil { + node.color = black + } else { + tree.insertCase2(node) + } +} + +func (tree *Tree) insertCase2(node *Node) { + if nodeColor(node.Parent) == black { + return + } + tree.insertCase3(node) +} + +func (tree *Tree) insertCase3(node *Node) { + uncle := node.uncle() + if nodeColor(uncle) == red { + node.Parent.color = black + uncle.color = black + node.grandparent().color = red + tree.insertCase1(node.grandparent()) + } else { + tree.insertCase4(node) + } +} + +func (tree *Tree) insertCase4(node *Node) { + grandparent := node.grandparent() + if node == node.Parent.Right && node.Parent == grandparent.Left { + tree.rotateLeft(node.Parent) + node = node.Left + } else if node == node.Parent.Left && node.Parent == grandparent.Right { + tree.rotateRight(node.Parent) + node = node.Right + } + tree.insertCase5(node) +} + +func (tree *Tree) insertCase5(node *Node) { + node.Parent.color = black + grandparent := node.grandparent() + grandparent.color = red + if node == node.Parent.Left && node.Parent == grandparent.Left { + tree.rotateRight(grandparent) + } else if node == node.Parent.Right && node.Parent == grandparent.Right { + tree.rotateLeft(grandparent) + } +} + +func (node *Node) maximumNode() *Node { + if node == nil { + return nil + } + for node.Right != nil { + node = node.Right + } + return node +} + +func (tree *Tree) deleteCase1(node *Node) { + if node.Parent == nil { + return + } + tree.deleteCase2(node) +} + +func (tree *Tree) deleteCase2(node *Node) { + sibling := node.sibling() + if nodeColor(sibling) == red { + node.Parent.color = red + sibling.color = black + if node == 
node.Parent.Left { + tree.rotateLeft(node.Parent) + } else { + tree.rotateRight(node.Parent) + } + } + tree.deleteCase3(node) +} + +func (tree *Tree) deleteCase3(node *Node) { + sibling := node.sibling() + if nodeColor(node.Parent) == black && + nodeColor(sibling) == black && + nodeColor(sibling.Left) == black && + nodeColor(sibling.Right) == black { + sibling.color = red + tree.deleteCase1(node.Parent) + } else { + tree.deleteCase4(node) + } +} + +func (tree *Tree) deleteCase4(node *Node) { + sibling := node.sibling() + if nodeColor(node.Parent) == red && + nodeColor(sibling) == black && + nodeColor(sibling.Left) == black && + nodeColor(sibling.Right) == black { + sibling.color = red + node.Parent.color = black + } else { + tree.deleteCase5(node) + } +} + +func (tree *Tree) deleteCase5(node *Node) { + sibling := node.sibling() + if node == node.Parent.Left && + nodeColor(sibling) == black && + nodeColor(sibling.Left) == red && + nodeColor(sibling.Right) == black { + sibling.color = red + sibling.Left.color = black + tree.rotateRight(sibling) + } else if node == node.Parent.Right && + nodeColor(sibling) == black && + nodeColor(sibling.Right) == red && + nodeColor(sibling.Left) == black { + sibling.color = red + sibling.Right.color = black + tree.rotateLeft(sibling) + } + tree.deleteCase6(node) +} + +func (tree *Tree) deleteCase6(node *Node) { + sibling := node.sibling() + sibling.color = nodeColor(node.Parent) + node.Parent.color = black + if node == node.Parent.Left && nodeColor(sibling.Right) == red { + sibling.Right.color = black + tree.rotateLeft(node.Parent) + } else if nodeColor(sibling.Left) == red { + sibling.Left.color = black + tree.rotateRight(node.Parent) + } +} + +func nodeColor(node *Node) color { + if node == nil { + return black + } + return node.color +} diff --git a/main.go b/main.go index f2951a9..2e15161 100644 --- a/main.go +++ b/main.go @@ -3,11 +3,11 @@ package main import ( "context" "github.com/sirupsen/logrus" + 
"github.com/terorie/oddb-go/fasturl" "github.com/urfave/cli" "log" "net/http" _ "net/http/pprof" - "net/url" "os" "strings" "time" @@ -55,12 +55,13 @@ func cmdCrawler(clic *cli.Context) error { if !strings.Contains(arg, "://") { arg = "http://" + arg } - u, err := url.Parse(arg) + var u fasturl.URL + err := u.Parse(arg) if !strings.HasSuffix(u.Path, "/") { u.Path += "/" } if err != nil { return err } - remotes[i] = &OD{ BaseUri: *u } + remotes[i] = &OD{ BaseUri: u } } c := context.Background() diff --git a/model.go b/model.go index 8bab19e..448da20 100644 --- a/model.go +++ b/model.go @@ -1,14 +1,15 @@ package main import ( - "net/url" + "github.com/terorie/oddb-go/ds/redblackhash" + "github.com/terorie/oddb-go/fasturl" "sync" "time" ) type Job struct { OD *OD - Uri url.URL + Uri fasturl.URL UriStr string Fails int LastError error @@ -16,11 +17,12 @@ type Job struct { type OD struct { Wait sync.WaitGroup - BaseUri url.URL - lock sync.Mutex + BaseUri fasturl.URL Files []File WCtx WorkerContext - Scanned sync.Map + Scanned redblackhash.Tree + + lock sync.Mutex } type File struct { @@ -30,3 +32,14 @@ type File struct { Path string `json:"path"` IsDir bool `json:"-"` } + +func (o *OD) LoadOrStoreKey(k *redblackhash.Key) (exists bool) { + o.lock.Lock() + defer o.lock.Unlock() + + exists = o.Scanned.Get(k) + if exists { return true } + + o.Scanned.Put(k) + return false +} diff --git a/worker.go b/worker.go index d9c52b6..bbc1b9e 100644 --- a/worker.go +++ b/worker.go @@ -3,7 +3,6 @@ package main import ( "github.com/sirupsen/logrus" "math" - "strings" "sync" "sync/atomic" "time" @@ -67,12 +66,13 @@ func (w WorkerContext) step(job Job) { } func DoJob(job *Job, f *File) (newJobs []Job, err error) { - if strings.HasSuffix(job.Uri.Path, "/") { + if len(job.Uri.Path) == 0 { return } + if job.Uri.Path[len(job.Uri.Path)-1] == '/' { // Load directory links, err := GetDir(job, f) if err != nil { logrus.WithError(err). - WithField("url", job.Uri.String()). 
+ WithField("url", job.UriStr). Error("Failed getting dir") return nil, err } @@ -81,15 +81,15 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) { hash := f.HashDir(links) // Skip symlinked dirs - if _, old := job.OD.Scanned.LoadOrStore(hash, true); old { + if job.OD.LoadOrStoreKey(&hash) { return nil, ErrKnown } for _, link := range links { // Skip already queued links - if _, old := job.OD.Scanned.LoadOrStore(link, true); old { - continue - } + //if _, old := job.OD.Scanned.LoadOrStore(link, true); old { + // continue + //} job.OD.Wait.Add(1) newJobs = append(newJobs, Job{ OD: job.OD, @@ -98,16 +98,18 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) { Fails: 0, }) } - logrus.WithFields(logrus.Fields{ - "url": job.UriStr, - "files": len(links), - }).Debug("Listed") + if config.Verbose { + logrus.WithFields(logrus.Fields{ + "url": job.UriStr, + "files": len(links), + }).Debug("Listed") + } } else { // Load file err := GetFile(job.Uri, f) if err != nil { logrus.WithError(err). - WithField("url", job.Uri.String()). + WithField("url", job.UriStr). Error("Failed getting file") return nil, err }