diff --git a/go.mod b/go.mod index 23b468f..aaa2fba 100644 --- a/go.mod +++ b/go.mod @@ -4,10 +4,10 @@ require ( github.com/beeker1121/goque v2.0.1+incompatible github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db // indirect github.com/sirupsen/logrus v1.3.0 + github.com/spf13/cobra v0.0.3 github.com/spf13/viper v1.3.1 github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2 // indirect github.com/terorie/od-database-crawler v1.1.1 - github.com/urfave/cli v1.20.0 github.com/valyala/fasthttp v1.1.0 golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613 golang.org/x/net v0.0.0-20180911220305-26e67e76b6c3 diff --git a/go.sum b/go.sum index 6ea5db2..498b38a 100644 --- a/go.sum +++ b/go.sum @@ -29,6 +29,8 @@ github.com/spf13/afero v1.1.2 h1:m8/z1t7/fwjysjQRYbP0RD+bUIF/8tJwPdEZsI83ACI= github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= github.com/spf13/cast v1.3.0 h1:oget//CVOEoFewqQxwr0Ej5yjygnqGkvggSE/gB35Q8= github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= +github.com/spf13/cobra v0.0.3 h1:ZlrZ4XsMRm04Fr5pSFxBgfND2EBVa1nLpiy1stUsX/8= +github.com/spf13/cobra v0.0.3/go.mod h1:1l0Ry5zgKvJasoi3XT1TypsSe7PqH0Sj9dhYf7v3XqQ= github.com/spf13/jwalterweatherman v1.0.0 h1:XHEdyB+EcvlqZamSM4ZOMGlc93t6AcsBEu9Gc1vn7yk= github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= github.com/spf13/pflag v1.0.3 h1:zPAT6CGy6wXeQ7NtTnaTerfKOsV6V6F8agHXFiazDkg= @@ -42,8 +44,6 @@ github.com/syndtr/goleveldb v0.0.0-20181128100959-b001fa50d6b2/go.mod h1:Z4AUp2K github.com/terorie/od-database-crawler v1.1.1 h1:Ca+ZqbZX3rVWBR8SDRzvroyxjBtUs75MQXZ9YG0gqGo= github.com/terorie/od-database-crawler v1.1.1/go.mod h1:vVJ7pLkudrlUNp9qu24JCzQ8N6mFsrOmX1tPXr155DQ= github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= -github.com/urfave/cli v1.20.0 h1:fDqGv3UG/4jbVl/QkFwEdddtEDjh/5Ov6X+0B/3bPaw= -github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA= github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= github.com/valyala/fasthttp v1.1.0 h1:3BohG7mqwj4lq7PTX//7gLbUlzNvZSPmuHFnloXT0lw= diff --git a/help.go b/help.go new file mode 100644 index 0000000..2965c84 --- /dev/null +++ b/help.go @@ -0,0 +1,16 @@ +package main + +const helpText = +`HTTP crawler for the OD-Database + DB >> https://od-db.the-eye.eu << + Crawler >> https://github.com/terorie/od-database-crawler << + Config >> https://bit.ly/2MOAsUp + Server >> https://github.com/simon987/od-database << + +Quick start: + - get config file (config.yml in working dir) + - get OD-DB server ("server.url": Database URL + /api) + - get access token ("server.token": e.g. c010b6dd-20...) + - ./od-database-crawler server + +Questions? Discord @terorie#2664 / Telegram @terorie` diff --git a/main.go b/main.go index b896e50..9131c19 100644 --- a/main.go +++ b/main.go @@ -2,10 +2,11 @@ package main import ( "context" + "fmt" "github.com/sirupsen/logrus" + "github.com/spf13/cobra" "github.com/spf13/viper" "github.com/terorie/od-database-crawler/fasturl" - "github.com/urfave/cli" "os" "strings" "sync/atomic" @@ -14,47 +15,48 @@ import ( var configFile string -var app = cli.App { - Name: "od-database-crawler", - Usage: "OD-Database Go crawler", - Version: "1.1.1", - BashComplete: cli.DefaultAppComplete, - Writer: os.Stdout, - Action: cmdBase, - Commands: []cli.Command { - { - Name: "crawl", - Usage: "Crawl a list of URLs", - ArgsUsage: "", - Action: cmdCrawler, - }, - }, - Flags: []cli.Flag { - cli.StringFlag { - Name: "config", - EnvVar: "CONFIG", - Destination: &configFile, - }, - }, - Before: func(i *cli.Context) error { - if configFile != "" { - viper.SetConfigFile(configFile) - } - return nil - }, - After: func(i *cli.Context) error { +var rootCmd = cobra.Command { + Use: "od-database-crawler", + Version: "1.1.1", + Short: "OD-Database Go crawler", + Long: helpText, + PersistentPreRunE: preRun, + PersistentPostRun: func(cmd *cobra.Command, args []string) { exitHooks.Execute() - return nil }, } +var serverCmd = cobra.Command { + Use: "server", + Short: "Start crawl server", + Long: "Connect to the OD-Database and contribute to the database\n" + + "by crawling the web for open directories!", + Run: cmdBase, +} + +var crawlCmd = cobra.Command { + Use: "crawl", + Short: "Crawl an URL", + Long: "Crawl the URL specified.\n" + + "Results will not be uploaded to the database,\n" + + "they're saved under crawled/0.json instead.\n" + + "Primarily used for testing and benchmarking.", + RunE: cmdCrawler, + Args: cobra.ExactArgs(1), +} + var exitHooks Hooks func init() { + rootCmd.AddCommand(&crawlCmd) + rootCmd.AddCommand(&serverCmd) + + pf := rootCmd.PersistentFlags() + pf.StringVar(&configFile, "config", "", "Config file") prepareConfig() } -func main() { +func preRun(cmd *cobra.Command, args []string) error { if err := os.MkdirAll("crawled", 0755); err != nil { panic(err) } @@ -62,10 +64,22 @@ func main() { err != nil { panic(err) } readConfig() - app.Run(os.Args) + + if configFile != "" { + viper.SetConfigFile(configFile) + } + return nil } -func cmdBase(_ *cli.Context) error { +func main() { + err := rootCmd.Execute() + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } +} + +func cmdBase(_ *cobra.Command, _ []string) { // TODO Graceful shutdown appCtx := context.Background() forceCtx := context.Background() @@ -78,7 +92,7 @@ func cmdBase(_ *cli.Context) error { for { select { case <-appCtx.Done(): - return nil + return case <-ticker.C: t, err := FetchTask() if err != nil { @@ -117,16 +131,10 @@ func cmdBase(_ *cli.Context) error { ScheduleTask(inRemotes, t, &baseUri) } } - - return nil } -func cmdCrawler(clic *cli.Context) error { - if clic.NArg() != 1 { - cli.ShowCommandHelpAndExit(clic, "crawl", 1) - } - - arg := clic.Args()[0] +func cmdCrawler(_ *cobra.Command, args []string) error { + arg := args[0] // https://github.com/golang/go/issues/19779 if !strings.Contains(arg, "://") { arg = "http://" + arg