5 Commits
v1.0 ... v1.0.1

Author SHA1 Message Date
Richard Patel
8ed2cf3b93 Bump to v1.0.1 2018-11-18 14:49:07 +01:00
Richard Patel
f3620262fc Add log file support 2018-11-18 14:46:52 +01:00
Richard Patel
dc4e4212a0 Add freebsd to release.sh 2018-11-18 14:38:18 +01:00
Richard Patel
6e6a4edd27 Ignore all HTTP errors 2018-11-18 14:25:06 +01:00
Richard Patel
a71157b4d8 Add User-Agent parameter 2018-11-18 14:24:04 +01:00
7 changed files with 74 additions and 20 deletions

View File

@@ -1,9 +1,11 @@
package main
import (
"bufio"
"fmt"
"github.com/sirupsen/logrus"
"github.com/spf13/viper"
"io"
"os"
"strings"
"time"
@@ -17,6 +19,7 @@ var config struct {
ChunkSize int64
Retries int
Workers int
UserAgent string
Timeout time.Duration
Tasks int32
CrawlStats time.Duration
@@ -34,22 +37,26 @@ const (
ConfTasks = "crawl.tasks"
ConfRetries = "crawl.retries"
ConfWorkers = "crawl.connections"
ConfUserAgent = "crawl.user-agent"
ConfTimeout = "crawl.timeout"
ConfCrawlStats = "output.crawl_stats"
ConfAllocStats = "output.resource_stats"
ConfVerbose = "output.verbose"
ConfPrintHTTP = "output.http"
ConfLogFile = "output.log"
)
// prepareConfig registers a default value for every configuration key
// before the config file is read, so unset keys fall back to sane
// defaults instead of zero values.
func prepareConfig() {
	viper.SetDefault(ConfRetries, 5)
	viper.SetDefault(ConfWorkers, 2)
	viper.SetDefault(ConfTasks, 3)
	// Empty string means: send no User-Agent header at all.
	viper.SetDefault(ConfUserAgent, "")
	// gofmt: no spaces around * in constant expressions.
	viper.SetDefault(ConfTimeout, 10*time.Second)
	viper.SetDefault(ConfCrawlStats, 3*time.Second)
	viper.SetDefault(ConfAllocStats, 0)
	viper.SetDefault(ConfVerbose, false)
	viper.SetDefault(ConfPrintHTTP, false)
	// Empty string disables the log file entirely.
	viper.SetDefault(ConfLogFile, "")
	viper.SetDefault(ConfRecheck, 3*time.Second)
	viper.SetDefault(ConfChunkSize, "1 MB")
}
@@ -98,6 +105,8 @@ func readConfig() {
configOOB(ConfTasks, int(config.Tasks))
}
config.UserAgent = viper.GetString(ConfUserAgent)
config.Timeout = viper.GetDuration(ConfTimeout)
config.CrawlStats = viper.GetDuration(ConfCrawlStats)
@@ -109,6 +118,17 @@ func readConfig() {
logrus.SetLevel(logrus.DebugLevel)
}
if filePath := viper.GetString(ConfLogFile); filePath != "" {
f, err := os.OpenFile(filePath, os.O_CREATE | os.O_WRONLY | os.O_APPEND, 0644)
bufWriter := bufio.NewWriter(f)
if err != nil { panic(err) }
exitHooks.Add(func() {
bufWriter.Flush()
f.Close()
})
logrus.SetOutput(io.MultiWriter(os.Stdout, bufWriter))
}
config.PrintHTTP = viper.GetBool(ConfPrintHTTP)
}

View File

@@ -23,13 +23,20 @@ server:
output:
# Crawl statistics
crawl_stats: 1s
# CPU/RAM/Job queue stats
resource_stats: 10s
# More output? (Every listed dir)
verbose: false
# Print HTTP errors (Super spammy)
http: false
# Log file
# If empty, no log file is created.
log: crawler.log
# Crawler settings
crawl:
# Number of sites that can be processed at once
@@ -47,3 +54,7 @@ crawl:
# Time before discarding a network request
timeout: 10s
# Crawler User-Agent
# If empty, no User-Agent header is sent.
user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"

View File

@@ -25,6 +25,9 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
f.Name = path.Base(j.Uri.Path)
req := fasthttp.AcquireRequest()
if config.UserAgent != "" {
req.Header.SetUserAgent(config.UserAgent)
}
req.SetRequestURI(j.UriStr)
res := fasthttp.AcquireResponse()
@@ -120,6 +123,9 @@ func GetFile(u fasturl.URL, f *File) (err error) {
req := fasthttp.AcquireRequest()
req.Header.SetMethod("HEAD")
if config.UserAgent != "" {
req.Header.SetUserAgent(config.UserAgent)
}
req.SetRequestURI(u.String())
res := fasthttp.AcquireResponse()

20
main.go
View File

@@ -5,9 +5,6 @@ import (
"github.com/sirupsen/logrus"
"github.com/terorie/od-database-crawler/fasturl"
"github.com/urfave/cli"
"log"
"net/http"
_ "net/http/pprof"
"os"
"strings"
"sync/atomic"
@@ -17,7 +14,7 @@ import (
var app = cli.App {
Name: "od-database-crawler",
Usage: "OD-Database Go crawler",
Version: "1.0",
Version: "1.0.1",
BashComplete: cli.DefaultAppComplete,
Writer: os.Stdout,
Action: cmdBase,
@@ -29,28 +26,29 @@ var app = cli.App {
Action: cmdCrawler,
},
},
After: func(i *cli.Context) error {
exitHooks.Execute()
return nil
},
}
var exitHooks Hooks
// init registers all configuration defaults before any CLI command
// runs, so viper lookups in readConfig always find a value.
func init() {
prepareConfig()
}
// main prepares the output directory, loads configuration and hands
// control to the urfave/cli app.
func main() {
// NOTE(review): this pprof-style debug listener appears to be REMOVED
// by this diff (the "log", "net/http" and pprof imports are dropped in
// the hunk above) — this compare view strips +/- markers, so confirm
// against the actual commit before relying on it.
go func() {
log.Println(http.ListenAndServe("localhost:42069", nil))
}()
// Directory that crawl results are written into; 0755 so other tools
// can read it. A failure here is fatal — nothing can be saved.
err := os.MkdirAll("crawled", 0755)
if err != nil {
panic(err)
}
readConfig()
// Dispatches to cmdBase (default action) or the "crawl" subcommand.
app.Run(os.Args)
}
func cmdBase(_ *cli.Context) error {
readConfig()
// TODO Graceful shutdown
appCtx := context.Background()
forceCtx := context.Background()
@@ -107,8 +105,6 @@ func cmdBase(_ *cli.Context) error {
}
func cmdCrawler(clic *cli.Context) error {
readConfig()
if clic.NArg() != 1 {
cli.ShowCommandHelpAndExit(clic, "crawl", 1)
}

View File

@@ -18,3 +18,8 @@ name=${appname}-${tag}-mac
GOOS="darwin" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name
# Build, gzip-compress and announce the FreeBSD/amd64 release binary
# (new in this diff, mirroring the linux/mac/windows sections above).
name=${appname}-${tag}-freebsd
GOOS="freebsd" GOARCH="amd64" go build -ldflags="-s -w" -o $name
gzip -f $name
echo $name

22
util.go
View File

@@ -1,6 +1,9 @@
package main
import "fmt"
import (
"fmt"
"sync"
)
// https://programming.guide/go/formatting-byte-size-to-human-readable-format.html
func FormatByteCount(b uint64) string {
@@ -16,3 +19,20 @@ func FormatByteCount(b uint64) string {
return fmt.Sprintf("%.1f %ciB", float64(b)/float64(div), "KMGTPE"[exp])
}
}
// Hooks is a mutex-guarded list of callbacks that are executed, in the
// order they were added, when the program exits (via app.After).
type Hooks struct {
	m sync.Mutex // guards l
	l []func()
}

// Add appends hook to the list of exit callbacks. Safe for concurrent use.
func (h *Hooks) Add(hook func()) {
	h.m.Lock()
	h.l = append(h.l, hook)
	h.m.Unlock()
}

// Execute runs every registered hook in registration order.
// Fix: snapshot the list under the mutex before iterating — the
// original ranged over h.l without locking, which races with a
// concurrent Add (Add does take the lock, so the locking was one-sided).
func (h *Hooks) Execute() {
	h.m.Lock()
	hooks := make([]func(), len(h.l))
	copy(hooks, h.l)
	h.m.Unlock()
	for _, hook := range hooks {
		hook()
	}
}

View File

@@ -42,15 +42,11 @@ func (w WorkerContext) step(results chan<- File, job Job) {
if httpErr, ok := err.(*HttpError); ok {
switch httpErr.code {
case
fasthttp.StatusMovedPermanently,
fasthttp.StatusFound,
fasthttp.StatusUnauthorized,
fasthttp.StatusForbidden,
fasthttp.StatusNotFound:
return
case fasthttp.StatusTooManyRequests:
err = ErrRateLimit
default:
// Don't retry HTTP error codes
return
}
}