mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-16 08:56:44 +00:00
89 lines
1.5 KiB
Go
89 lines
1.5 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"github.com/sirupsen/logrus"
|
|
"github.com/terorie/oddb-go/fasturl"
|
|
"github.com/urfave/cli"
|
|
"log"
|
|
"net/http"
|
|
_ "net/http/pprof"
|
|
"os"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
var app = cli.App {
|
|
Name: "oddb-go",
|
|
Usage: "OD-Database Go crawler",
|
|
Version: "0.1",
|
|
BashComplete: cli.DefaultAppComplete,
|
|
Writer: os.Stdout,
|
|
Compiled: buildDate,
|
|
Commands: []cli.Command{
|
|
{
|
|
Name: "crawl",
|
|
Usage: "Crawl a list of URLs",
|
|
ArgsUsage: "[site, site, ...]",
|
|
Action: cmdCrawler,
|
|
},
|
|
},
|
|
}
|
|
|
|
func init() {
|
|
prepareConfig()
|
|
}
|
|
|
|
func main() {
|
|
go func() {
|
|
log.Println(http.ListenAndServe("localhost:42069", nil))
|
|
}()
|
|
app.Run(os.Args)
|
|
}
|
|
|
|
func cmdCrawler(clic *cli.Context) error {
|
|
readConfig()
|
|
|
|
if clic.NArg() == 0 {
|
|
cli.ShowCommandHelpAndExit(clic, "crawl", 1)
|
|
}
|
|
|
|
args := clic.Args()
|
|
remotes := make([]*OD, len(args))
|
|
for i, arg := range args {
|
|
// https://github.com/golang/go/issues/19779
|
|
if !strings.Contains(arg, "://") {
|
|
arg = "http://" + arg
|
|
}
|
|
var u fasturl.URL
|
|
err := u.Parse(arg)
|
|
if !strings.HasSuffix(u.Path, "/") {
|
|
u.Path += "/"
|
|
}
|
|
if err != nil { return err }
|
|
remotes[i] = &OD{ BaseUri: u }
|
|
}
|
|
|
|
c := context.Background()
|
|
|
|
inRemotes := make(chan *OD)
|
|
go Schedule(c, inRemotes)
|
|
|
|
for _, remote := range remotes {
|
|
globalWait.Add(1)
|
|
inRemotes <- remote
|
|
}
|
|
|
|
// Wait for all jobs to finish
|
|
globalWait.Wait()
|
|
|
|
logrus.Info("All dirs processed!")
|
|
|
|
return nil
|
|
}
|
|
|
|
// buildDate is the fixed timestamp 2018-10-28 17:10:00 UTC, used as the
// application's Compiled time.
var buildDate = time.Date(2018, time.October, 28, 17, 10, 0, 0, time.UTC)
|