From b8b531f5110daa7582bb3fa5ecfe07a859e62f20 Mon Sep 17 00:00:00 2001
From: simon987
Date: Sat, 6 Apr 2019 10:50:15 -0400
Subject: [PATCH] Move recrawl task to cron job

---
 common.py       |  1 +
 do_recrawl.py   |  4 ++++
 od-database.ini |  2 +-
 tasks.py        | 24 ++++++++++--------------
 4 files changed, 16 insertions(+), 15 deletions(-)
 create mode 100644 do_recrawl.py

diff --git a/common.py b/common.py
index 089f8d7..1b31d7f 100644
--- a/common.py
+++ b/common.py
@@ -26,6 +26,7 @@ logger.addHandler(file_handler)
 logger.addHandler(StreamHandler(sys.stdout))
 
 taskManager = TaskManager()
+taskManager.start_indexer_threads()
 searchEngine = ElasticSearchEngine("od-database")
 searchEngine.start_stats_scheduler()
 db = Database(config.DB_CONN_STR)
diff --git a/do_recrawl.py b/do_recrawl.py
new file mode 100644
index 0000000..bea906d
--- /dev/null
+++ b/do_recrawl.py
@@ -0,0 +1,4 @@
+from tasks import TaskManager
+
+tm = TaskManager()
+tm.do_recrawl()
diff --git a/od-database.ini b/od-database.ini
index 5dd1813..c1522e9 100644
--- a/od-database.ini
+++ b/od-database.ini
@@ -1,5 +1,5 @@
 [uwsgi]
-socket = 127.0.0.1:3031
+uwsgi-socket = 127.0.0.1:3031
 wsgi-file = uwsgi.py
 processes = 4
 threads = 4
diff --git a/tasks.py b/tasks.py
index 51ce6ca..cc7fb52 100644
--- a/tasks.py
+++ b/tasks.py
@@ -2,6 +2,7 @@ import json
 import logging
 import os
 import time
+from multiprocessing.pool import ThreadPool
 from threading import Thread
 from uuid import uuid4
 
@@ -71,8 +72,9 @@ class TaskManager:
         self.worker.request_access(config.TT_INDEX_PROJECT, True, False)
         self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)
 
-        self._indexer_threads = list()
+
+    def start_indexer_threads(self):
         logger.info("Starting %s indexer threads " % (config.INDEXER_THREADS, ))
         for _ in range(config.INDEXER_THREADS):
             t = Thread(target=self._do_indexing)
@@ -80,10 +82,6 @@
             self._indexer_threads.append(t)
             t.start()
 
-        self._recrawl_thread = Thread(target=self._do_recrawl)
-        self._recrawl_thread.setDaemon(True)
-        self._recrawl_thread.start()
-
     def _do_indexing(self):
         while True:
@@ -123,11 +121,9 @@
         self.db.update_website_date_if_exists(task.website_id)
 
-    def _do_recrawl(self):
-        while True:
-            logger.debug("Creating re-crawl tasks")
-            self._generate_crawling_tasks()
-            time.sleep(60 * 30)
+    def do_recrawl(self):
+        logger.debug("Creating re-crawl tasks")
+        self._generate_crawling_tasks()
 
     def _generate_crawling_tasks(self):
@@ -136,12 +132,12 @@
         def recrawl(website: Website):
             crawl_task = Task(website.id, website.url,
-                              priority=(int((time.time() - website.last_modified.timestamp()) / 3600))
-                              )
+                              priority=(int((time.time() - website.last_modified.timestamp()) / 3600)))
             self.queue_task(crawl_task)
 
-        for w in websites_to_crawl:
-            recrawl(w)
+        pool = ThreadPool(processes=30)
+        pool.map(func=recrawl, iterable=websites_to_crawl)
+        pool.close()
 
     def queue_task(self, task: Task):
         max_assign_time = 24 * 7 * 3600
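
Deployment note (not part of the patch): with the in-process recrawl thread removed, do_recrawl.py must be scheduled externally or no new crawl tasks will ever be queued. A minimal crontab sketch, keeping the 30-minute interval of the removed time.sleep(60 * 30) loop; the install path and interpreter name are assumptions, not part of this commit:

    # Assumed install path (/opt/od-database) and interpreter (python3);
    # adjust to the actual deployment. cd first so that tasks.py and the
    # config files resolve relative to the project root.
    # Re-create crawl tasks every 30 minutes, matching the interval of the
    # old in-process loop.
    */30 * * * * cd /opt/od-database && python3 do_recrawl.py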