mirror of
				https://github.com/terorie/od-database-crawler.git
				synced 2025-10-28 04:06:51 +00:00 
			
		
		
		
	Timeout and results saving
This commit is contained in:
		
							parent
							
								
									a268c6dbcf
								
							
						
					
					
						commit
						ffde1a9e5d
					
				| @ -13,6 +13,7 @@ var config struct { | |||||||
| 	Token      string | 	Token      string | ||||||
| 	Retries    int | 	Retries    int | ||||||
| 	Workers    int | 	Workers    int | ||||||
|  | 	Timeout    time.Duration | ||||||
| 	Tasks      int32 | 	Tasks      int32 | ||||||
| 	CrawlStats time.Duration | 	CrawlStats time.Duration | ||||||
| 	AllocStats time.Duration | 	AllocStats time.Duration | ||||||
| @ -25,6 +26,7 @@ const ( | |||||||
| 	ConfTasks      = "crawl.tasks" | 	ConfTasks      = "crawl.tasks" | ||||||
| 	ConfRetries    = "crawl.retries" | 	ConfRetries    = "crawl.retries" | ||||||
| 	ConfWorkers    = "crawl.connections" | 	ConfWorkers    = "crawl.connections" | ||||||
|  | 	ConfTimeout    = "crawl.timeout" | ||||||
| 	ConfCrawlStats = "output.crawl_stats" | 	ConfCrawlStats = "output.crawl_stats" | ||||||
| 	ConfAllocStats = "output.resource_stats" | 	ConfAllocStats = "output.resource_stats" | ||||||
| 	ConfVerbose    = "output.verbose" | 	ConfVerbose    = "output.verbose" | ||||||
| @ -34,6 +36,7 @@ func prepareConfig() { | |||||||
| 	viper.SetDefault(ConfRetries, 5) | 	viper.SetDefault(ConfRetries, 5) | ||||||
| 	viper.SetDefault(ConfWorkers, 2) | 	viper.SetDefault(ConfWorkers, 2) | ||||||
| 	viper.SetDefault(ConfTasks, 3) | 	viper.SetDefault(ConfTasks, 3) | ||||||
|  | 	viper.SetDefault(ConfTimeout, 10 * time.Second) | ||||||
| 	viper.SetDefault(ConfCrawlStats, 3 * time.Second) | 	viper.SetDefault(ConfCrawlStats, 3 * time.Second) | ||||||
| 	viper.SetDefault(ConfAllocStats, 0) | 	viper.SetDefault(ConfAllocStats, 0) | ||||||
| 	viper.SetDefault(ConfVerbose, false) | 	viper.SetDefault(ConfVerbose, false) | ||||||
| @ -73,6 +76,8 @@ func readConfig() { | |||||||
| 		configOOB(ConfTasks, int(config.Tasks)) | 		configOOB(ConfTasks, int(config.Tasks)) | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	config.Timeout = viper.GetDuration(ConfTimeout) | ||||||
|  | 
 | ||||||
| 	config.CrawlStats = viper.GetDuration(ConfCrawlStats) | 	config.CrawlStats = viper.GetDuration(ConfCrawlStats) | ||||||
| 
 | 
 | ||||||
| 	config.AllocStats = viper.GetDuration(ConfAllocStats) | 	config.AllocStats = viper.GetDuration(ConfAllocStats) | ||||||
|  | |||||||
| @ -24,3 +24,5 @@ crawl: | |||||||
|   # How often to retry getting data |   # How often to retry getting data | ||||||
|   # from the site before giving up |   # from the site before giving up | ||||||
|   retries: 5 |   retries: 5 | ||||||
|  |   # Time before discarding a network request | ||||||
|  |   timeout: 10s | ||||||
|  | |||||||
							
								
								
									
										10
									
								
								crawl.go
									
									
									
									
									
								
							
							
						
						
									
										10
									
								
								crawl.go
									
									
									
									
									
								
							| @ -3,7 +3,6 @@ package main | |||||||
| import ( | import ( | ||||||
| 	"bytes" | 	"bytes" | ||||||
| 	"fmt" | 	"fmt" | ||||||
| 	"github.com/sirupsen/logrus" |  | ||||||
| 	"github.com/terorie/oddb-go/ds/redblackhash" | 	"github.com/terorie/oddb-go/ds/redblackhash" | ||||||
| 	"github.com/terorie/oddb-go/fasturl" | 	"github.com/terorie/oddb-go/fasturl" | ||||||
| 	"github.com/valyala/fasthttp" | 	"github.com/valyala/fasthttp" | ||||||
| @ -28,13 +27,10 @@ func GetDir(j *Job, f *File) (links []fasturl.URL, err error) { | |||||||
| 	res := fasthttp.AcquireResponse() | 	res := fasthttp.AcquireResponse() | ||||||
| 	defer fasthttp.ReleaseResponse(res) | 	defer fasthttp.ReleaseResponse(res) | ||||||
| 
 | 
 | ||||||
| 	err = client.Do(req, res) | 	err = client.DoTimeout(req, res, config.Timeout) | ||||||
| 	fasthttp.ReleaseRequest(req) | 	fasthttp.ReleaseRequest(req) | ||||||
| 
 | 
 | ||||||
| 	if err != nil { | 	if err != nil { return } | ||||||
| 		logrus.Error(err) |  | ||||||
| 		return |  | ||||||
| 	} |  | ||||||
| 
 | 
 | ||||||
| 	err = checkStatusCode(res.StatusCode()) | 	err = checkStatusCode(res.StatusCode()) | ||||||
| 	if err != nil { return } | 	if err != nil { return } | ||||||
| @ -129,7 +125,7 @@ func GetFile(u fasturl.URL, f *File) (err error) { | |||||||
| 	res.SkipBody = true | 	res.SkipBody = true | ||||||
| 	defer fasthttp.ReleaseResponse(res) | 	defer fasthttp.ReleaseResponse(res) | ||||||
| 
 | 
 | ||||||
| 	err = client.Do(req, res) | 	err = client.DoTimeout(req, res, config.Timeout) | ||||||
| 	fasthttp.ReleaseRequest(req) | 	fasthttp.ReleaseRequest(req) | ||||||
| 
 | 
 | ||||||
| 	if err != nil { return } | 	if err != nil { return } | ||||||
|  | |||||||
							
								
								
									
										8
									
								
								main.go
									
									
									
									
									
								
							
							
						
						
									
										8
									
								
								main.go
									
									
									
									
									
								
							| @ -61,7 +61,13 @@ func cmdCrawler(clic *cli.Context) error { | |||||||
| 			u.Path += "/" | 			u.Path += "/" | ||||||
| 		} | 		} | ||||||
| 		if err != nil { return err } | 		if err != nil { return err } | ||||||
| 		remotes[i] = &OD{ BaseUri: u } | 		remotes[i] = &OD { | ||||||
|  | 			Task: &Task{ | ||||||
|  | 				WebsiteId: 0, | ||||||
|  | 				Url: u.String(), | ||||||
|  | 			}, | ||||||
|  | 			BaseUri: u, | ||||||
|  | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	c := context.Background() | 	c := context.Background() | ||||||
|  | |||||||
							
								
								
									
										2
									
								
								model.go
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								model.go
									
									
									
									
									
								
							| @ -16,9 +16,9 @@ type Job struct { | |||||||
| } | } | ||||||
| 
 | 
 | ||||||
| type OD struct { | type OD struct { | ||||||
|  | 	Task    *Task | ||||||
| 	Wait    sync.WaitGroup | 	Wait    sync.WaitGroup | ||||||
| 	BaseUri fasturl.URL | 	BaseUri fasturl.URL | ||||||
| 	Files   []File |  | ||||||
| 	WCtx    WorkerContext | 	WCtx    WorkerContext | ||||||
| 	Scanned redblackhash.Tree | 	Scanned redblackhash.Tree | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										57
									
								
								scheduler.go
									
									
									
									
									
								
							
							
						
						
									
										57
									
								
								scheduler.go
									
									
									
									
									
								
							| @ -2,7 +2,11 @@ package main | |||||||
| 
 | 
 | ||||||
| import ( | import ( | ||||||
| 	"context" | 	"context" | ||||||
|  | 	"encoding/json" | ||||||
|  | 	"fmt" | ||||||
| 	"github.com/sirupsen/logrus" | 	"github.com/sirupsen/logrus" | ||||||
|  | 	"os" | ||||||
|  | 	"path" | ||||||
| 	"sync/atomic" | 	"sync/atomic" | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| @ -12,19 +16,17 @@ var totalBuffered int64 | |||||||
| func Schedule(c context.Context, remotes <-chan *OD) { | func Schedule(c context.Context, remotes <-chan *OD) { | ||||||
| 	go Stats(c) | 	go Stats(c) | ||||||
| 
 | 
 | ||||||
| 	for { | 	for remote := range remotes { | ||||||
| 		select { |  | ||||||
| 		case <-c.Done(): |  | ||||||
| 			return |  | ||||||
| 
 |  | ||||||
| 		case remote := <-remotes: |  | ||||||
| 		logrus.WithField("url", remote.BaseUri.String()). | 		logrus.WithField("url", remote.BaseUri.String()). | ||||||
| 			Info("Starting crawler") | 			Info("Starting crawler") | ||||||
| 
 | 
 | ||||||
|  | 		// Collect results | ||||||
|  | 		results := make(chan File) | ||||||
|  | 
 | ||||||
| 		// Spawn workers | 		// Spawn workers | ||||||
| 		remote.WCtx.in, remote.WCtx.out = makeJobBuffer(c) | 		remote.WCtx.in, remote.WCtx.out = makeJobBuffer(c) | ||||||
| 		for i := 0; i < config.Workers; i++ { | 		for i := 0; i < config.Workers; i++ { | ||||||
| 				go remote.WCtx.Worker() | 			go remote.WCtx.Worker(results) | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
| 		// Enqueue initial job | 		// Enqueue initial job | ||||||
| @ -37,12 +39,13 @@ func Schedule(c context.Context, remotes <-chan *OD) { | |||||||
| 		}) | 		}) | ||||||
| 
 | 
 | ||||||
| 		// Upload result when ready | 		// Upload result when ready | ||||||
| 			go remote.Watch() | 		go remote.Watch(results) | ||||||
| 		} |  | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func (r *OD) Watch() { | func (r *OD) Watch(results chan File) { | ||||||
|  | 	go r.Task.Collect(results) | ||||||
|  | 
 | ||||||
| 	// Wait for all jobs on remote to finish | 	// Wait for all jobs on remote to finish | ||||||
| 	r.Wait.Wait() | 	r.Wait.Wait() | ||||||
| 	close(r.WCtx.in) | 	close(r.WCtx.in) | ||||||
| @ -52,6 +55,40 @@ func (r *OD) Watch() { | |||||||
| 		Info("Crawler finished") | 		Info("Crawler finished") | ||||||
| 
 | 
 | ||||||
| 	globalWait.Done() | 	globalWait.Done() | ||||||
|  | 
 | ||||||
|  | 	close(results) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | func (t *Task) Collect(results chan File) { | ||||||
|  | 	err := t.collect(results) | ||||||
|  | 	if err != nil { | ||||||
|  | 		logrus.WithError(err). | ||||||
|  | 			Error("Failed saving crawl results") | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | func (t *Task) collect(results chan File) error { | ||||||
|  | 	err := os.MkdirAll("crawled", 0755) | ||||||
|  | 	if err != nil { return err } | ||||||
|  | 
 | ||||||
|  | 	f, err := os.OpenFile( | ||||||
|  | 		path.Join("crawled", fmt.Sprintf("%d.json", t.WebsiteId)), | ||||||
|  | 		os.O_CREATE | os.O_WRONLY | os.O_TRUNC, | ||||||
|  | 		0755, | ||||||
|  | 	) | ||||||
|  | 	if err != nil { return err } | ||||||
|  | 	defer f.Close() | ||||||
|  | 
 | ||||||
|  | 	for result := range results { | ||||||
|  | 		resJson, err := json.Marshal(result) | ||||||
|  | 		if err != nil { panic(err) } | ||||||
|  | 		_, err = f.Write(resJson) | ||||||
|  | 		if err != nil { return err } | ||||||
|  | 		_, err = f.Write([]byte{'\n'}) | ||||||
|  | 		if err != nil { return err } | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return nil | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func makeJobBuffer(c context.Context) (chan<- Job, <-chan Job) { | func makeJobBuffer(c context.Context) (chan<- Job, <-chan Job) { | ||||||
|  | |||||||
							
								
								
									
										10
									
								
								worker.go
									
									
									
									
									
								
							
							
						
						
									
										10
									
								
								worker.go
									
									
									
									
									
								
							| @ -19,13 +19,13 @@ type WorkerContext struct { | |||||||
| 	numRateLimits int | 	numRateLimits int | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func (w WorkerContext) Worker() { | func (w WorkerContext) Worker(results chan<- File) { | ||||||
| 	for job := range w.out { | 	for job := range w.out { | ||||||
| 		w.step(job) | 		w.step(results, job) | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func (w WorkerContext) step(job Job) { | func (w WorkerContext) step(results chan<- File, job Job) { | ||||||
| 	defer w.finishJob(&job) | 	defer w.finishJob(&job) | ||||||
| 
 | 
 | ||||||
| 	var f File | 	var f File | ||||||
| @ -64,7 +64,9 @@ func (w WorkerContext) step(job Job) { | |||||||
| 		w.queueJob(job) | 		w.queueJob(job) | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	job.OD.Files = append(job.OD.Files, f) | 	if !f.IsDir { | ||||||
|  | 		results <- f | ||||||
|  | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func DoJob(job *Job, f *File) (newJobs []Job, err error) { | func DoJob(job *Job, f *File) (newJobs []Job, err error) { | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user