mirror of
https://github.com/terorie/od-database-crawler.git
synced 2025-04-18 18:06:45 +00:00
Fix crawl descent
This commit is contained in:
parent
a507110787
commit
1c33346f45
@ -12,6 +12,7 @@ var config struct {
|
||||
Retries int
|
||||
Workers int
|
||||
StatsInterval time.Duration
|
||||
Verbose bool
|
||||
}
|
||||
|
||||
const (
|
||||
@ -20,12 +21,14 @@ const (
|
||||
ConfRetries = "retries"
|
||||
ConfWorkers = "workers"
|
||||
ConfStatsInterval = "stats_interval"
|
||||
ConfVerbose = "verbose"
|
||||
)
|
||||
|
||||
// prepareConfig registers the viper default values for the crawler's
// settings: 3 retries, 50 workers, a 3-second stats interval, and
// verbose logging turned off.
func prepareConfig() {
|
||||
viper.SetDefault(ConfRetries, 3)
|
||||
viper.SetDefault(ConfWorkers, 50)
|
||||
viper.SetDefault(ConfStatsInterval, 3 * time.Second)
|
||||
viper.SetDefault(ConfVerbose, false)
|
||||
}
|
||||
|
||||
func readConfig() {
|
||||
@ -57,4 +60,9 @@ func readConfig() {
|
||||
}
|
||||
|
||||
config.StatsInterval = viper.GetDuration(ConfStatsInterval)
|
||||
|
||||
config.Verbose = viper.GetBool(ConfVerbose)
|
||||
if config.Verbose {
|
||||
logrus.SetLevel(logrus.DebugLevel)
|
||||
}
|
||||
}
|
||||
|
@ -1,3 +1,5 @@
|
||||
server_url: localhost:6969
|
||||
token: abc
|
||||
stats: 5s
|
||||
stats_interval: 1s
|
||||
verbose: true
|
||||
retries: 0
|
@ -27,15 +27,12 @@ func NewRemoteDir(u url.URL) *RemoteDir {
|
||||
return &RemoteDir{ BaseUri: u }
|
||||
}
|
||||
|
||||
func GetDir(u url.URL, f *File) (links []url.URL, err error) {
|
||||
func GetDir(j *Job, f *File) (links []url.URL, err error) {
|
||||
f.IsDir = true
|
||||
u.Path = path.Clean(u.Path)
|
||||
// TODO Handle external links
|
||||
f.Name = path.Base(u.Path)
|
||||
f.Path = strings.TrimLeft(u.Path, "/")
|
||||
f.Name = path.Base(j.Uri.Path)
|
||||
|
||||
req := fasthttp.AcquireRequest()
|
||||
req.SetRequestURI(u.String())
|
||||
req.SetRequestURI(j.Uri.String())
|
||||
|
||||
res := fasthttp.AcquireResponse()
|
||||
defer fasthttp.ReleaseResponse(res)
|
||||
@ -48,7 +45,8 @@ func GetDir(u url.URL, f *File) (links []url.URL, err error) {
|
||||
return
|
||||
}
|
||||
|
||||
doc := html.NewTokenizer(bytes.NewReader(res.Body()))
|
||||
body := res.Body()
|
||||
doc := html.NewTokenizer(bytes.NewReader(body))
|
||||
|
||||
var linkHref string
|
||||
var linkTexts []string
|
||||
@ -105,10 +103,12 @@ func GetDir(u url.URL, f *File) (links []url.URL, err error) {
|
||||
subref, err := url.Parse(href)
|
||||
if err != nil { continue }
|
||||
|
||||
link := *u.ResolveReference(subref)
|
||||
link := *j.Uri.ResolveReference(subref)
|
||||
|
||||
if link.Scheme != u.Scheme ||
|
||||
link.Host != u.Host {
|
||||
if link.Scheme != j.Uri.Scheme ||
|
||||
link.Host != j.Uri.Host ||
|
||||
link.Path == j.Uri.Path ||
|
||||
!strings.HasPrefix(link.Path, j.Uri.Path) {
|
||||
continue
|
||||
}
|
||||
|
||||
@ -119,6 +119,10 @@ func GetDir(u url.URL, f *File) (links []url.URL, err error) {
|
||||
nextToken:
|
||||
}
|
||||
|
||||
if len(links) == 0 {
|
||||
println(string(body))
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
|
2
main.go
2
main.go
@ -14,7 +14,7 @@ func main() {
|
||||
remotes := make(chan *RemoteDir)
|
||||
go Schedule(c, remotes)
|
||||
|
||||
u, _ := url.Parse("http://mine.terorie.com:420/")
|
||||
u, _ := url.Parse("https://the-eye.eu/public/rom/")
|
||||
remote := NewRemoteDir(*u)
|
||||
|
||||
globalWait.Add(1)
|
||||
|
@ -59,7 +59,7 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
|
||||
// File
|
||||
if strings.HasSuffix(job.Uri.Path, "/") {
|
||||
// Dir
|
||||
links, err := GetDir(job.Uri, f)
|
||||
links, err := GetDir(job, f)
|
||||
if err != nil {
|
||||
logrus.WithError(err).
|
||||
WithField("url", job.Uri.String()).
|
||||
@ -75,6 +75,10 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) {
|
||||
Fails: 0,
|
||||
})
|
||||
}
|
||||
logrus.WithFields(logrus.Fields{
|
||||
"url": job.UriStr,
|
||||
"files": len(links),
|
||||
}).Debug("Listed")
|
||||
} else {
|
||||
err := GetFile(job.Uri, f)
|
||||
if err != nil {
|
||||
|
Loading…
x
Reference in New Issue
Block a user