package main

import (
	"context"
	"fmt"
	"github.com/sirupsen/logrus"
	"github.com/spf13/cobra"
	"github.com/spf13/viper"
	"github.com/terorie/od-database-crawler/fasturl"
	"os"
	"os/signal"
	"strings"
	"sync/atomic"
	"time"
)

var configFile string

var rootCmd = cobra.Command{
	Use:               "od-database-crawler",
	Version:           "1.2.2",
	Short:             "OD-Database Go crawler",
	Long:              helpText,
	PersistentPreRunE: preRun,
	PersistentPostRun: func(cmd *cobra.Command, args []string) {
		exitHooks.Execute()
	},
}

var serverCmd = cobra.Command{
	Use:   "server",
	Short: "Start crawl server",
	Long: "Connect to the OD-Database and contribute to the database\n" +
		"by crawling the web for open directories!",
	Run: cmdBase,
}
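
// Running the server (sketch; any config file or flags registered by
// prepareConfig are omitted here):
//
//	od-database-crawler server
//
// In this mode the crawler polls the OD-Database for new tasks on every
// config.Recheck tick and backs off for ConfCooldown when a fetch fails;
// see cmdBase below.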

var crawlCmd = cobra.Command{
	Use:   "crawl",
	Short: "Crawl a URL",
	Long: "Crawl the URL specified.\n" +
		"Results will not be uploaded to the database,\n" +
		"they're saved under crawled/0.json instead.\n" +
		"Primarily used for testing and benchmarking.",
	RunE: cmdCrawler,
	Args: cobra.ExactArgs(1),
}
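
// Crawling a single site (hypothetical URL, for illustration only):
//
//	od-database-crawler crawl http://example.com/pub/
//
// The result is saved as crawled/0.json (preRun creates the directory)
// and nothing is uploaded to the OD-Database.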

var exitHooks Hooks

func init() {
	rootCmd.AddCommand(&crawlCmd)
	rootCmd.AddCommand(&serverCmd)

	prepareConfig()
}

// preRun makes sure the output directories exist before any subcommand runs.
func preRun(cmd *cobra.Command, args []string) error {
	if err := os.MkdirAll("crawled", 0755); err != nil {
		panic(err)
	}

	if err := os.MkdirAll("queue", 0755); err != nil {
		panic(err)
	}

	return nil
}

func main() {
	err := rootCmd.Execute()
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

// cmdBase runs the "server" subcommand: it keeps fetching crawl tasks
// from the OD-Database and schedules them until a shutdown is requested.
func cmdBase(_ *cobra.Command, _ []string) {
	onlineMode = true
	readConfig()

	appCtx, soft := context.WithCancel(context.Background())
	forceCtx, hard := context.WithCancel(context.Background())
	go hardShutdown(forceCtx)
	go listenCtrlC(soft, hard)

	inRemotes := make(chan *OD)
	go Schedule(appCtx, inRemotes)

	ticker := time.NewTicker(config.Recheck)
	defer ticker.Stop()
	for {
		select {
		case <-appCtx.Done():
			goto shutdown
		case <-ticker.C:
			t, err := FetchTask()
			if err != nil {
				logrus.WithError(err).
					Error("Failed to get new task")
				if !sleep(viper.GetDuration(ConfCooldown), appCtx) {
					goto shutdown
				}
				continue
			}
			if t == nil {
				// No new task
				if atomic.LoadInt32(&numActiveTasks) == 0 {
					logrus.Info("Waiting …")
				}
				continue
			}

			var baseUri fasturl.URL
			err = baseUri.Parse(t.Url)
			if urlErr, ok := err.(*fasturl.Error); ok && urlErr.Err == fasturl.ErrUnknownScheme {
				// Not an error
				err = nil
				// TODO FTP crawler
				continue
			} else if err != nil {
				logrus.WithError(err).
					Error("Failed to parse task URL")
				time.Sleep(viper.GetDuration(ConfCooldown))
				continue
			}
			ScheduleTask(inRemotes, t, &baseUri)
		}
	}

shutdown:
	globalWait.Wait()
}

// cmdCrawler runs the "crawl" subcommand: it crawls a single URL in
// offline mode and stores the result under crawled/ instead of uploading it.
func cmdCrawler(_ *cobra.Command, args []string) error {
	onlineMode = false
	readConfig()

	arg := args[0]
	// https://github.com/golang/go/issues/19779
	if !strings.Contains(arg, "://") {
		arg = "http://" + arg
	}
	var u fasturl.URL
	err := u.Parse(arg)
	if err != nil {
		return err
	}
	if !strings.HasSuffix(u.Path, "/") {
		u.Path += "/"
	}

	// TODO Graceful shutdown
	forceCtx := context.Background()

	inRemotes := make(chan *OD)
	go Schedule(forceCtx, inRemotes)

	ticker := time.NewTicker(3 * time.Second)
	defer ticker.Stop()

	task := Task{
		WebsiteId: 0,
		Url:       u.String(),
	}
	ScheduleTask(inRemotes, &task, &u)

	// Wait for all jobs to finish
	globalWait.Wait()

	return nil
}

// listenCtrlC requests a soft shutdown on the first interrupt signal and
// a forced shutdown on the second.
func listenCtrlC(soft, hard context.CancelFunc) {
	// Buffered channel as recommended by signal.Notify, so a signal is not
	// dropped if the receiver is not ready the moment it arrives.
	c := make(chan os.Signal, 1)
	signal.Notify(c, os.Interrupt)

	<-c
	logrus.Info(">>> Shutting down crawler... <<<")
	soft()

	<-c
	logrus.Warning(">>> Force shutdown! <<<")
	hard()
}

// hardShutdown exits the process once the force-shutdown context is cancelled.
func hardShutdown(c context.Context) {
	<-c.Done()
	os.Exit(1)
}

// sleep waits for the given duration and reports whether it completed;
// it returns false early if the context is cancelled first.
func sleep(d time.Duration, c context.Context) bool {
	select {
	case <-time.After(d):
		return true
	case <-c.Done():
		return false
	}
}