Fix crawl descent

This commit is contained in:
Richard Patel 2018-10-28 03:06:18 +01:00
parent a507110787
commit 1c33346f45
No known key found for this signature in database
GPG Key ID: C268B2BBDA2ABECB
5 changed files with 31 additions and 13 deletions

View File

@@ -12,6 +12,7 @@ var config struct {
Retries int Retries int
Workers int Workers int
StatsInterval time.Duration StatsInterval time.Duration
Verbose bool
} }
const ( const (
@@ -20,12 +21,14 @@ const (
ConfRetries = "retries" ConfRetries = "retries"
ConfWorkers = "workers" ConfWorkers = "workers"
ConfStatsInterval = "stats_interval" ConfStatsInterval = "stats_interval"
ConfVerbose = "verbose"
) )
func prepareConfig() { func prepareConfig() {
viper.SetDefault(ConfRetries, 3) viper.SetDefault(ConfRetries, 3)
viper.SetDefault(ConfWorkers, 50) viper.SetDefault(ConfWorkers, 50)
viper.SetDefault(ConfStatsInterval, 3 * time.Second) viper.SetDefault(ConfStatsInterval, 3 * time.Second)
viper.SetDefault(ConfVerbose, false)
} }
func readConfig() { func readConfig() {
@@ -57,4 +60,9 @@ func readConfig() {
} }
config.StatsInterval = viper.GetDuration(ConfStatsInterval) config.StatsInterval = viper.GetDuration(ConfStatsInterval)
config.Verbose = viper.GetBool(ConfVerbose)
if config.Verbose {
logrus.SetLevel(logrus.DebugLevel)
}
} }

View File

@@ -1,3 +1,5 @@
server_url: localhost:6969 server_url: localhost:6969
token: abc token: abc
stats: 5s stats_interval: 1s
verbose: true
retries: 0

View File

@@ -27,15 +27,12 @@ func NewRemoteDir(u url.URL) *RemoteDir {
return &RemoteDir{ BaseUri: u } return &RemoteDir{ BaseUri: u }
} }
func GetDir(u url.URL, f *File) (links []url.URL, err error) { func GetDir(j *Job, f *File) (links []url.URL, err error) {
f.IsDir = true f.IsDir = true
u.Path = path.Clean(u.Path) f.Name = path.Base(j.Uri.Path)
// TODO Handle external links
f.Name = path.Base(u.Path)
f.Path = strings.TrimLeft(u.Path, "/")
req := fasthttp.AcquireRequest() req := fasthttp.AcquireRequest()
req.SetRequestURI(u.String()) req.SetRequestURI(j.Uri.String())
res := fasthttp.AcquireResponse() res := fasthttp.AcquireResponse()
defer fasthttp.ReleaseResponse(res) defer fasthttp.ReleaseResponse(res)
@@ -48,7 +45,8 @@ func GetDir(u url.URL, f *File) (links []url.URL, err error) {
return return
} }
doc := html.NewTokenizer(bytes.NewReader(res.Body())) body := res.Body()
doc := html.NewTokenizer(bytes.NewReader(body))
var linkHref string var linkHref string
var linkTexts []string var linkTexts []string
@@ -105,10 +103,12 @@ func GetDir(u url.URL, f *File) (links []url.URL, err error) {
subref, err := url.Parse(href) subref, err := url.Parse(href)
if err != nil { continue } if err != nil { continue }
link := *u.ResolveReference(subref) link := *j.Uri.ResolveReference(subref)
if link.Scheme != u.Scheme || if link.Scheme != j.Uri.Scheme ||
link.Host != u.Host { link.Host != j.Uri.Host ||
link.Path == j.Uri.Path ||
!strings.HasPrefix(link.Path, j.Uri.Path) {
continue continue
} }
@@ -119,6 +119,10 @@ func GetDir(u url.URL, f *File) (links []url.URL, err error) {
nextToken: nextToken:
} }
if len(links) == 0 {
println(string(body))
}
return return
} }

View File

@@ -14,7 +14,7 @@ func main() {
remotes := make(chan *RemoteDir) remotes := make(chan *RemoteDir)
go Schedule(c, remotes) go Schedule(c, remotes)
u, _ := url.Parse("http://mine.terorie.com:420/") u, _ := url.Parse("https://the-eye.eu/public/rom/")
remote := NewRemoteDir(*u) remote := NewRemoteDir(*u)
globalWait.Add(1) globalWait.Add(1)

View File

@@ -59,7 +59,7 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
// File // File
if strings.HasSuffix(job.Uri.Path, "/") { if strings.HasSuffix(job.Uri.Path, "/") {
// Dir // Dir
links, err := GetDir(job.Uri, f) links, err := GetDir(job, f)
if err != nil { if err != nil {
logrus.WithError(err). logrus.WithError(err).
WithField("url", job.Uri.String()). WithField("url", job.Uri.String()).
@@ -75,6 +75,10 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
Fails: 0, Fails: 0,
}) })
} }
logrus.WithFields(logrus.Fields{
"url": job.UriStr,
"files": len(links),
}).Debug("Listed")
} else { } else {
err := GetFile(job.Uri, f) err := GetFile(job.Uri, f)
if err != nil { if err != nil {