5 Commits
v1.0 ... v1.0.1

Author         SHA1        Message                    Date
Richard Patel  8ed2cf3b93  Bump to v1.0.1             2018-11-18 14:49:07 +01:00
Richard Patel  f3620262fc  Add log file support       2018-11-18 14:46:52 +01:00
Richard Patel  dc4e4212a0  Add freebsd to release.sh  2018-11-18 14:38:18 +01:00
Richard Patel  6e6a4edd27  Ignore all HTTP errors     2018-11-18 14:25:06 +01:00
Richard Patel  a71157b4d8  Add User-Agent parameter   2018-11-18 14:24:04 +01:00
7 changed files with 74 additions and 20 deletions

@@ -1,9 +1,11 @@
 package main

 import (
+	"bufio"
 	"fmt"
 	"github.com/sirupsen/logrus"
 	"github.com/spf13/viper"
+	"io"
 	"os"
 	"strings"
 	"time"
@@ -17,6 +19,7 @@ var config struct {
 	ChunkSize  int64
 	Retries    int
 	Workers    int
+	UserAgent  string
 	Timeout    time.Duration
 	Tasks      int32
 	CrawlStats time.Duration
@@ -34,22 +37,26 @@ const (
 	ConfTasks      = "crawl.tasks"
 	ConfRetries    = "crawl.retries"
 	ConfWorkers    = "crawl.connections"
+	ConfUserAgent  = "crawl.user-agent"
 	ConfTimeout    = "crawl.timeout"
 	ConfCrawlStats = "output.crawl_stats"
 	ConfAllocStats = "output.resource_stats"
 	ConfVerbose    = "output.verbose"
 	ConfPrintHTTP  = "output.http"
+	ConfLogFile    = "output.log"
 )

 func prepareConfig() {
 	viper.SetDefault(ConfRetries, 5)
 	viper.SetDefault(ConfWorkers, 2)
 	viper.SetDefault(ConfTasks, 3)
+	viper.SetDefault(ConfUserAgent, "")
 	viper.SetDefault(ConfTimeout, 10 * time.Second)
 	viper.SetDefault(ConfCrawlStats, 3 * time.Second)
 	viper.SetDefault(ConfAllocStats, 0)
 	viper.SetDefault(ConfVerbose, false)
 	viper.SetDefault(ConfPrintHTTP, false)
+	viper.SetDefault(ConfLogFile, "")
 	viper.SetDefault(ConfRecheck, 3 * time.Second)
 	viper.SetDefault(ConfChunkSize, "1 MB")
 }
@@ -98,6 +105,8 @@ func readConfig() {
 		configOOB(ConfTasks, int(config.Tasks))
 	}

+	config.UserAgent = viper.GetString(ConfUserAgent)
+
 	config.Timeout = viper.GetDuration(ConfTimeout)
 	config.CrawlStats = viper.GetDuration(ConfCrawlStats)
@@ -109,6 +118,17 @@ func readConfig() {
 		logrus.SetLevel(logrus.DebugLevel)
 	}

+	if filePath := viper.GetString(ConfLogFile); filePath != "" {
+		f, err := os.OpenFile(filePath, os.O_CREATE | os.O_WRONLY | os.O_APPEND, 0644)
+		bufWriter := bufio.NewWriter(f)
+		if err != nil { panic(err) }
+		exitHooks.Add(func() {
+			bufWriter.Flush()
+			f.Close()
+		})
+		logrus.SetOutput(io.MultiWriter(os.Stdout, bufWriter))
+	}
+
 	config.PrintHTTP = viper.GetBool(ConfPrintHTTP)
 }
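The log-file hunk above tees logrus output to stdout and a buffered file and relies on an exit hook to flush the buffer. A minimal, self-contained sketch of the same wiring, assuming logrus; openLogFile is an illustrative helper that is not part of the repository, and it checks the open error before building the writer:

package main

import (
	"bufio"
	"io"
	"os"

	"github.com/sirupsen/logrus"
)

// openLogFile mirrors the readConfig logic above: append to the file,
// tee log output to stdout, and return a cleanup func for an exit hook.
func openLogFile(path string) (cleanup func(), err error) {
	f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
	if err != nil {
		return nil, err
	}
	bufWriter := bufio.NewWriter(f)
	logrus.SetOutput(io.MultiWriter(os.Stdout, bufWriter))
	return func() {
		// Flush buffered lines before the file handle goes away.
		bufWriter.Flush()
		f.Close()
	}, nil
}

func main() {
	cleanup, err := openLogFile("crawler.log")
	if err != nil {
		panic(err)
	}
	defer cleanup()
	logrus.Info("log file support enabled")
}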

@@ -23,13 +23,20 @@ server:
 output:
   # Crawl statistics
   crawl_stats: 1s
+
   # CPU/RAM/Job queue stats
   resource_stats: 10s
+
   # More output? (Every listed dir)
   verbose: false
+
   # Print HTTP errors (Super spammy)
   http: false
+
+  # Log file
+  # If empty, no log file is created.
+  log: crawler.log

 # Crawler settings
 crawl:
   # Number of sites that can be processed at once
@@ -47,3 +54,7 @@ crawl:
   # Time before discarding a network request
   timeout: 10s
+
+  # Crawler User-Agent
+  # If empty, no User-Agent header is sent.
+  user-agent: "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"
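For readers starting from the YAML side, a hedged sketch of how these two keys are consumed, matching the viper calls in the configuration diff above; an empty string disables the corresponding feature. The file name config.yml and the printed messages are illustrative:

package main

import (
	"fmt"

	"github.com/spf13/viper"
)

func main() {
	viper.SetConfigFile("config.yml")
	if err := viper.ReadInConfig(); err != nil {
		panic(err)
	}

	// Empty values disable the corresponding feature.
	if logFile := viper.GetString("output.log"); logFile != "" {
		fmt.Println("logging to", logFile)
	}
	if ua := viper.GetString("crawl.user-agent"); ua != "" {
		fmt.Println("sending User-Agent:", ua)
	}
}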

@@ -25,6 +25,9 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) {
 	f.Name = path.Base(j.Uri.Path)

 	req := fasthttp.AcquireRequest()
+	if config.UserAgent != "" {
+		req.Header.SetUserAgent(config.UserAgent)
+	}
 	req.SetRequestURI(j.UriStr)

 	res := fasthttp.AcquireResponse()
@@ -120,6 +123,9 @@ func GetFile(u fasturl.URL, f *File) (err error) {
 	req := fasthttp.AcquireRequest()
 	req.Header.SetMethod("HEAD")
+	if config.UserAgent != "" {
+		req.Header.SetUserAgent(config.UserAgent)
+	}
 	req.SetRequestURI(u.String())

 	res := fasthttp.AcquireResponse()
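Both hunks guard the header with an emptiness check so no User-Agent is sent unless one is configured. A standalone sketch of that pattern with fasthttp (the userAgent value and example URL are placeholders):

package main

import (
	"fmt"

	"github.com/valyala/fasthttp"
)

func main() {
	userAgent := "Mozilla/5.0 (X11; od-database-crawler) Gecko/20100101 Firefox/52.0"

	req := fasthttp.AcquireRequest()
	defer fasthttp.ReleaseRequest(req)
	if userAgent != "" {
		// Only set the header when a User-Agent is configured,
		// mirroring the GetDir/GetFile change above.
		req.Header.SetUserAgent(userAgent)
	}
	req.SetRequestURI("http://example.com/")

	res := fasthttp.AcquireResponse()
	defer fasthttp.ReleaseResponse(res)

	if err := fasthttp.Do(req, res); err != nil {
		panic(err)
	}
	fmt.Println("status:", res.StatusCode())
}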

main.go

@@ -5,9 +5,6 @@ import (
 	"github.com/sirupsen/logrus"
 	"github.com/terorie/od-database-crawler/fasturl"
 	"github.com/urfave/cli"
-	"log"
-	"net/http"
-	_ "net/http/pprof"
 	"os"
 	"strings"
 	"sync/atomic"
@@ -17,7 +14,7 @@ import (
 var app = cli.App {
 	Name: "od-database-crawler",
 	Usage: "OD-Database Go crawler",
-	Version: "1.0",
+	Version: "1.0.1",
 	BashComplete: cli.DefaultAppComplete,
 	Writer: os.Stdout,
 	Action: cmdBase,
@@ -29,28 +26,29 @@ var app = cli.App {
 			Action: cmdCrawler,
 		},
 	},
+	After: func(i *cli.Context) error {
+		exitHooks.Execute()
+		return nil
+	},
 }

+var exitHooks Hooks
+
 func init() {
 	prepareConfig()
 }

 func main() {
-	go func() {
-		log.Println(http.ListenAndServe("localhost:42069", nil))
-	}()
-
 	err := os.MkdirAll("crawled", 0755)
 	if err != nil {
 		panic(err)
 	}

+	readConfig()
 	app.Run(os.Args)
 }

 func cmdBase(_ *cli.Context) error {
-	readConfig()
-
 	// TODO Graceful shutdown
 	appCtx := context.Background()
 	forceCtx := context.Background()
@@ -107,8 +105,6 @@ func cmdBase(_ *cli.Context) error {
 }

 func cmdCrawler(clic *cli.Context) error {
-	readConfig()
-
 	if clic.NArg() != 1 {
 		cli.ShowCommandHelpAndExit(clic, "crawl", 1)
 	}
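The new After field is what ties the exit hooks into the CLI lifecycle: urfave/cli invokes it after the selected action returns. A minimal sketch of the same pattern (the demo app name and printed messages are illustrative):

package main

import (
	"fmt"
	"os"

	"github.com/urfave/cli"
)

func main() {
	app := cli.App{
		Name: "after-hook-demo",
		Action: func(_ *cli.Context) error {
			fmt.Println("doing work")
			return nil
		},
		// After runs once the action returns, success or not,
		// which is where v1.0.1 executes its exit hooks.
		After: func(_ *cli.Context) error {
			fmt.Println("running exit hooks")
			return nil
		},
	}
	app.Run(os.Args)
}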

@@ -18,3 +18,8 @@ name=${appname}-${tag}-mac
 GOOS="darwin" GOARCH="amd64" go build -ldflags="-s -w" -o $name
 gzip -f $name
 echo $name
+
+name=${appname}-${tag}-freebsd
+GOOS="freebsd" GOARCH="amd64" go build -ldflags="-s -w" -o $name
+gzip -f $name
+echo $name

util.go

@@ -1,6 +1,9 @@
package main package main
import "fmt" import (
"fmt"
"sync"
)
// https://programming.guide/go/formatting-byte-size-to-human-readable-format.html // https://programming.guide/go/formatting-byte-size-to-human-readable-format.html
func FormatByteCount(b uint64) string { func FormatByteCount(b uint64) string {
@@ -16,3 +19,20 @@ func FormatByteCount(b uint64) string {
return fmt.Sprintf("%.1f %ciB", float64(b)/float64(div), "KMGTPE"[exp]) return fmt.Sprintf("%.1f %ciB", float64(b)/float64(div), "KMGTPE"[exp])
} }
} }
type Hooks struct {
m sync.Mutex
l []func()
}
func (h *Hooks) Add(hook func()) {
h.m.Lock()
h.l = append(h.l, hook)
h.m.Unlock()
}
func (h *Hooks) Execute() {
for _, hook := range h.l {
hook()
}
}
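Hooks is a small mutex-guarded list of cleanup callbacks. A standalone usage sketch, with the type copied from the hunk above and an illustrative main that registers and runs two hooks:

package main

import (
	"fmt"
	"sync"
)

// Hooks collects cleanup callbacks; Add may be called from multiple
// goroutines, Execute runs the callbacks in registration order.
type Hooks struct {
	m sync.Mutex
	l []func()
}

func (h *Hooks) Add(hook func()) {
	h.m.Lock()
	h.l = append(h.l, hook)
	h.m.Unlock()
}

func (h *Hooks) Execute() {
	for _, hook := range h.l {
		hook()
	}
}

func main() {
	var exitHooks Hooks
	exitHooks.Add(func() { fmt.Println("flush log file") })
	exitHooks.Add(func() { fmt.Println("close log file") })

	// In the crawler this runs from the cli.App After hook.
	exitHooks.Execute()
}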

@@ -42,15 +42,11 @@ func (w WorkerContext) step(results chan<- File, job Job) {
 	if httpErr, ok := err.(*HttpError); ok {
 		switch httpErr.code {
-		case
-			fasthttp.StatusMovedPermanently,
-			fasthttp.StatusFound,
-			fasthttp.StatusUnauthorized,
-			fasthttp.StatusForbidden,
-			fasthttp.StatusNotFound:
-			return
 		case fasthttp.StatusTooManyRequests:
 			err = ErrRateLimit
+		default:
+			// Don't retry HTTP error codes
+			return
 		}
 	}
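The reworked switch surfaces only 429 as a retryable rate-limit error and silently drops every other HTTP error code. A standalone sketch of that decision; HttpError and ErrRateLimit mimic the project's types, and the classify helper is illustrative:

package main

import (
	"errors"
	"fmt"

	"github.com/valyala/fasthttp"
)

// HttpError mimics the crawler's error type carrying a status code.
type HttpError struct{ code int }

func (e *HttpError) Error() string { return fmt.Sprintf("http status %d", e.code) }

var ErrRateLimit = errors.New("too many requests")

// classify returns ErrRateLimit for 429 and nil for every other HTTP
// error code, so the caller retries only when it was rate limited.
func classify(err error) error {
	var httpErr *HttpError
	if errors.As(err, &httpErr) {
		switch httpErr.code {
		case fasthttp.StatusTooManyRequests:
			return ErrRateLimit
		default:
			// Don't retry HTTP error codes
			return nil
		}
	}
	return err
}

func main() {
	fmt.Println(classify(&HttpError{code: fasthttp.StatusNotFound}))        // <nil>
	fmt.Println(classify(&HttpError{code: fasthttp.StatusTooManyRequests})) // too many requests
}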