diff --git a/README.md b/README.md
index 9d57a86..0ed81a3 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,26 @@ RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)
 SUBMIT_FTP = False
 # Allow http(s) websites in /submit
 SUBMIT_HTTP = True
+
+# Number of re-crawl tasks to keep in the queue
+RECRAWL_POOL_SIZE = 10000
+# task_tracker API url
+TT_API = "http://localhost:3010"
+# task_tracker crawl project id
+TT_CRAWL_PROJECT = 3
+# task_tracker indexing project id
+TT_INDEX_PROJECT = 9
+# Number of threads to use for ES indexing
+INDEXER_THREADS = 4
+
+# ws_bucket API url
+WSB_API = "http://localhost:3020"
+# ws_bucket secret
+WSB_SECRET = "default_secret"
+# ws_bucket data directory
+WSB_PATH = "/mnt/data/github.com/simon987/ws_bucket/data"
+# od-database PostgreSQL connection string
+DB_CONN_STR = "dbname=od-database user=od-database password=xxx"
 ```
 
 ## Running the crawl server
diff --git a/tasks.py b/tasks.py
index 9e73027..f301531 100644
--- a/tasks.py
+++ b/tasks.py
@@ -77,10 +77,12 @@ class TaskManager:
         logger.info("Starting %s indexer threads " % (config.INDEXER_THREADS, ))
         for _ in range(config.INDEXER_THREADS):
             t = Thread(target=self._do_indexing)
+            t.setDaemon(True)
             self._indexer_threads.append(t)
             t.start()
 
         self._recrawl_thread = Thread(target=self._do_recrawl)
+        self._recrawl_thread.setDaemon(True)
         self._recrawl_thread.start()
 
     def _do_indexing(self):
@@ -131,7 +133,7 @@ class TaskManager:
     def _generate_crawling_tasks(self):
 
         # TODO: Insert more in-depth re-crawl logic here
-        websites_to_crawl = self.db.get_oldest_updated_websites(10000)
+        websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE)
 
         def recrawl(website: Website):
             crawl_task = Task(website.id, website.url,