Fix crawl descent

This commit is contained in:
Richard Patel 2018-10-28 03:06:18 +01:00
parent a507110787
commit 1c33346f45
No known key found for this signature in database
GPG Key ID: C268B2BBDA2ABECB
5 changed files with 31 additions and 13 deletions

View File

@ -12,6 +12,7 @@ var config struct {
Retries int
Workers int
StatsInterval time.Duration
Verbose bool
}
const (
@ -20,12 +21,14 @@ const (
ConfRetries = "retries"
ConfWorkers = "workers"
ConfStatsInterval = "stats_interval"
ConfVerbose = "verbose"
)
func prepareConfig() {
viper.SetDefault(ConfRetries, 3)
viper.SetDefault(ConfWorkers, 50)
viper.SetDefault(ConfStatsInterval, 3 * time.Second)
viper.SetDefault(ConfVerbose, false)
}
func readConfig() {
@ -57,4 +60,9 @@ func readConfig() {
}
config.StatsInterval = viper.GetDuration(ConfStatsInterval)
config.Verbose = viper.GetBool(ConfVerbose)
if config.Verbose {
logrus.SetLevel(logrus.DebugLevel)
}
}

View File

@ -1,3 +1,5 @@
server_url: localhost:6969
token: abc
stats: 5s
stats_interval: 1s
verbose: true
retries: 0

View File

@ -27,15 +27,12 @@ func NewRemoteDir(u url.URL) *RemoteDir {
return &RemoteDir{ BaseUri: u }
}
func GetDir(u url.URL, f *File) (links []url.URL, err error) {
func GetDir(j *Job, f *File) (links []url.URL, err error) {
f.IsDir = true
u.Path = path.Clean(u.Path)
// TODO Handle external links
f.Name = path.Base(u.Path)
f.Path = strings.TrimLeft(u.Path, "/")
f.Name = path.Base(j.Uri.Path)
req := fasthttp.AcquireRequest()
req.SetRequestURI(u.String())
req.SetRequestURI(j.Uri.String())
res := fasthttp.AcquireResponse()
defer fasthttp.ReleaseResponse(res)
@ -48,7 +45,8 @@ func GetDir(u url.URL, f *File) (links []url.URL, err error) {
return
}
doc := html.NewTokenizer(bytes.NewReader(res.Body()))
body := res.Body()
doc := html.NewTokenizer(bytes.NewReader(body))
var linkHref string
var linkTexts []string
@ -105,10 +103,12 @@ func GetDir(u url.URL, f *File) (links []url.URL, err error) {
subref, err := url.Parse(href)
if err != nil { continue }
link := *u.ResolveReference(subref)
link := *j.Uri.ResolveReference(subref)
if link.Scheme != u.Scheme ||
link.Host != u.Host {
if link.Scheme != j.Uri.Scheme ||
link.Host != j.Uri.Host ||
link.Path == j.Uri.Path ||
!strings.HasPrefix(link.Path, j.Uri.Path) {
continue
}
@ -119,6 +119,10 @@ func GetDir(u url.URL, f *File) (links []url.URL, err error) {
nextToken:
}
if len(links) == 0 {
println(string(body))
}
return
}

View File

@ -14,7 +14,7 @@ func main() {
remotes := make(chan *RemoteDir)
go Schedule(c, remotes)
u, _ := url.Parse("http://mine.terorie.com:420/")
u, _ := url.Parse("https://the-eye.eu/public/rom/")
remote := NewRemoteDir(*u)
globalWait.Add(1)

View File

@ -59,7 +59,7 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
// File
if strings.HasSuffix(job.Uri.Path, "/") {
// Dir
links, err := GetDir(job.Uri, f)
links, err := GetDir(job, f)
if err != nil {
logrus.WithError(err).
WithField("url", job.Uri.String()).
@ -75,6 +75,10 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
Fails: 0,
})
}
logrus.WithFields(logrus.Fields{
"url": job.UriStr,
"files": len(links),
}).Debug("Listed")
} else {
err := GetFile(job.Uri, f)
if err != nil {