Move recrawl task to cron job

commit b8b531f511 (parent 0c3d0b38e6)
simon987, 2019-04-06 10:50:15 -04:00
4 changed files with 16 additions and 15 deletions

@@ -26,6 +26,7 @@ logger.addHandler(file_handler)
 logger.addHandler(StreamHandler(sys.stdout))
 
 taskManager = TaskManager()
+taskManager.start_indexer_threads()
 searchEngine = ElasticSearchEngine("od-database")
 searchEngine.start_stats_scheduler()
 db = Database(config.DB_CONN_STR)

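With this change, `TaskManager()` no longer spawns any threads from its constructor; the web app must now call `start_indexer_threads()` explicitly. That is what lets the new `do_recrawl.py` script below create a `TaskManager` without also starting indexer threads.
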
do_recrawl.py (new file)

@@ -0,0 +1,4 @@
+from tasks import TaskManager
+
+tm = TaskManager()
+tm.do_recrawl()

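The crontab entry that invokes this script is not part of the commit. The loop removed from tasks.py below slept 30 minutes between batches (`time.sleep(60 * 30)`), so a hypothetical equivalent schedule, with the install path as a placeholder, could be:

    # Hypothetical crontab entry (not in this commit). The */30 interval
    # mirrors the time.sleep(60 * 30) loop that this commit removes from tasks.py.
    */30 * * * * cd /path/to/od-database && python do_recrawl.py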

@ -1,5 +1,5 @@
[uwsgi] [uwsgi]
socket = 127.0.0.1:3031 uwsgi-socket = 127.0.0.1:3031
wsgi-file = uwsgi.py wsgi-file = uwsgi.py
processes = 4 processes = 4
threads = 4 threads = 4

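In uWSGI, `socket` binds using whatever `protocol` option is in effect, while `uwsgi-socket` always speaks the native uwsgi protocol on that address; the rename presumably just pins the protocol explicitly.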

@@ -2,6 +2,7 @@ import json
 import logging
 import os
 import time
+from multiprocessing.pool import ThreadPool
 from threading import Thread
 from uuid import uuid4
@@ -71,8 +72,9 @@ class TaskManager:
         self.worker.request_access(config.TT_INDEX_PROJECT, True, False)
         self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)
         self._indexer_threads = list()
 
+    def start_indexer_threads(self):
         logger.info("Starting %s indexer threads " % (config.INDEXER_THREADS, ))
         for _ in range(config.INDEXER_THREADS):
             t = Thread(target=self._do_indexing)
@@ -80,10 +82,6 @@
             self._indexer_threads.append(t)
             t.start()
 
-        self._recrawl_thread = Thread(target=self._do_recrawl)
-        self._recrawl_thread.setDaemon(True)
-        self._recrawl_thread.start()
-
     def _do_indexing(self):
         while True:
@@ -123,11 +121,9 @@
         self.db.update_website_date_if_exists(task.website_id)
 
-    def _do_recrawl(self):
-        while True:
-            logger.debug("Creating re-crawl tasks")
-            self._generate_crawling_tasks()
-            time.sleep(60 * 30)
+    def do_recrawl(self):
+        logger.debug("Creating re-crawl tasks")
+        self._generate_crawling_tasks()
 
     def _generate_crawling_tasks(self):
@@ -136,12 +132,12 @@
         def recrawl(website: Website):
             crawl_task = Task(website.id, website.url,
-                              priority=(int((time.time() - website.last_modified.timestamp()) / 3600))
-                              )
+                              priority=(int((time.time() - website.last_modified.timestamp()) / 3600)))
             self.queue_task(crawl_task)
 
-        for w in websites_to_crawl:
-            recrawl(w)
+        pool = ThreadPool(processes=30)
+        pool.map(func=recrawl, iterable=websites_to_crawl)
+        pool.close()
 
     def queue_task(self, task: Task):
         max_assign_time = 24 * 7 * 3600
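
Two notes on the last hunks. `do_recrawl()` now builds a single batch and returns: `ThreadPool.map` blocks until the 30 worker threads have queued every website, and `pool.close()` lets them exit, so the cron-launched process terminates once the batch is queued. And the task priority is simply the site's age in hours, presumably so the stalest sites are served first:

    # Illustration only (not part of the commit): a site last modified
    # 48 hours (172 800 s) ago is queued at priority 48.
    int((time.time() - website.last_modified.timestamp()) / 3600)  # -> 48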