mirror of
				https://github.com/terorie/od-database-crawler.git
				synced 2025-10-25 11:26:51 +00:00 
			
		
		
		
	Fix wait, add back crawl command
This commit is contained in:
		
							parent
							
								
									cc777bcaeb
								
							
						
					
					
						commit
						145d37f84a
					
				
							
								
								
									
										50
									
								
								main.go
									
									
									
									
									
								
							
							
						
						
									
										50
									
								
								main.go
									
									
									
									
									
								
							| @ -9,6 +9,7 @@ import ( | ||||
| 	"net/http" | ||||
| 	_ "net/http/pprof" | ||||
| 	"os" | ||||
| 	"strings" | ||||
| 	"sync/atomic" | ||||
| 	"time" | ||||
| ) | ||||
| @ -21,6 +22,14 @@ var app = cli.App { | ||||
| 	Writer:       os.Stdout, | ||||
| 	Compiled:     buildDate, | ||||
| 	Action:       cmdBase, | ||||
| 	Commands:     []cli.Command{ | ||||
| 		{ | ||||
| 			Name:      "crawl", | ||||
| 			Usage:     "Crawl a list of URLs", | ||||
| 			ArgsUsage: "<site>", | ||||
| 			Action:    cmdCrawler, | ||||
| 		}, | ||||
| 	}, | ||||
| } | ||||
| 
 | ||||
| func init() { | ||||
| @ -74,6 +83,7 @@ func cmdBase(clic *cli.Context) error { | ||||
| 				time.Sleep(30 * time.Second) | ||||
| 				continue | ||||
| 			} | ||||
| 			globalWait.Add(1) | ||||
| 			inRemotes <- &OD { | ||||
| 				Task: t, | ||||
| 				BaseUri: baseUri, | ||||
| @ -81,6 +91,46 @@ func cmdBase(clic *cli.Context) error { | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| func cmdCrawler(clic *cli.Context) error { | ||||
| 	readConfig() | ||||
| 
 | ||||
| 	if clic.NArg() != 1 { | ||||
| 		cli.ShowCommandHelpAndExit(clic, "crawl", 1) | ||||
| 	} | ||||
| 
 | ||||
| 	arg := clic.Args()[0] | ||||
| 	// https://github.com/golang/go/issues/19779 | ||||
| 	if !strings.Contains(arg, "://") { | ||||
| 		arg = "http://" + arg | ||||
| 	} | ||||
| 	var u fasturl.URL | ||||
| 	err := u.Parse(arg) | ||||
| 	if !strings.HasSuffix(u.Path, "/") { | ||||
| 		u.Path += "/" | ||||
| 	} | ||||
| 	if err != nil { return err } | ||||
| 
 | ||||
| 	// TODO Graceful shutdown | ||||
| 	forceCtx := context.Background() | ||||
| 
 | ||||
| 	inRemotes := make(chan *OD) | ||||
| 	go Schedule(forceCtx, inRemotes) | ||||
| 
 | ||||
| 	ticker := time.NewTicker(3 * time.Second) | ||||
| 	defer ticker.Stop() | ||||
| 
 | ||||
| 	globalWait.Add(1) | ||||
| 	inRemotes <- &OD { | ||||
| 		Task: &Task{ | ||||
| 			WebsiteId: 0, | ||||
| 			Url: u.String(), | ||||
| 		}, | ||||
| 		BaseUri: u, | ||||
| 	} | ||||
| 
 | ||||
| 	// Wait for all jobs to finish | ||||
| 	globalWait.Wait() | ||||
| 
 | ||||
|  | ||||
| @ -42,6 +42,7 @@ func Schedule(c context.Context, remotes <-chan *OD) { | ||||
| 		// Upload result when ready | ||||
| 		go remote.Watch(results) | ||||
| 
 | ||||
| 		// Sleep if max number of tasks are active | ||||
| 		for atomic.LoadInt32(&activeTasks) > config.Tasks { | ||||
| 			select { | ||||
| 			case <-c.Done(): | ||||
|  | ||||
| @ -105,7 +105,6 @@ func DoJob(job *Job, f *File) (newJobs []Job, err error) { | ||||
| 			} | ||||
| 			lastLink = uriStr | ||||
| 
 | ||||
| 			job.OD.Wait.Add(1) | ||||
| 			newJobs = append(newJobs, Job{ | ||||
| 				OD:     job.OD, | ||||
| 				Uri:    link, | ||||
| @ -143,11 +142,10 @@ func (w WorkerContext) queueJob(job Job) { | ||||
| 		} else { | ||||
| 			time.Sleep(time.Duration(math.Sqrt(float64(50 * w.numRateLimits))) * | ||||
| 				100 * time.Millisecond) | ||||
| 			w.in <- job | ||||
| 		} | ||||
| 	} else { | ||||
| 		w.in <- job | ||||
| 	} | ||||
| 
 | ||||
| 	w.in <- job | ||||
| } | ||||
| 
 | ||||
| func (w WorkerContext) finishJob(job *Job) { | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user