From 06ae89f4d2bb5fea425a1ef9a77984ae398d119d Mon Sep 17 00:00:00 2001
From: simon987
Date: Sat, 6 Apr 2019 09:07:17 -0400
Subject: [PATCH] Only queue http tasks (temp)

---
 database.py | 5 +++--
 tasks.py    | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/database.py b/database.py
index eeb06b4..96c7fe4 100644
--- a/database.py
+++ b/database.py
@@ -309,14 +309,15 @@ class Database:
 
             conn.commit()
 
-    def get_oldest_updated_websites(self, size: int):
+    def get_oldest_updated_websites(self, size: int, prefix: str):
 
         with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()
 
             cursor.execute("SELECT id, url, last_modified FROM website "
+                           "WHERE url LIKE %s "
                            "ORDER BY last_modified ASC LIMIT %s",
-                           (size,))
+                           (prefix + "%", size, ))
             return [Website(url=r[1],
                             website_id=r[0],
                             last_modified=r[2],
diff --git a/tasks.py b/tasks.py
index f301531..012235a 100644
--- a/tasks.py
+++ b/tasks.py
@@ -133,7 +133,7 @@ class TaskManager:
     def _generate_crawling_tasks(self):
 
         # TODO: Insert more in-depth re-crawl logic here
-        websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE)
+        websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE, prefix="http")
 
         def recrawl(website: Website):
             crawl_task = Task(website.id, website.url,
@@ -141,7 +141,7 @@
                               )
             self.queue_task(crawl_task)
 
-        pool = ThreadPool(processes=10)
+        pool = ThreadPool(processes=3)
         pool.map(func=recrawl, iterable=websites_to_crawl)
 
         pool.close()
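
A note on the new LIKE filter in get_oldest_updated_websites: the "%" wildcard is appended to the prefix in Python, so the complete pattern (e.g. "http%") travels as a single bound parameter and psycopg2 never interpolates it into the SQL string. A minimal standalone sketch of the same technique, assuming a hypothetical connection string and the website table from the patch:

    import psycopg2

    # Hypothetical DSN for illustration; the real value lives in Database.db_conn_str.
    DB_CONN_STR = "dbname=od_database user=od_database"

    def oldest_updated_urls(size: int, prefix: str):
        with psycopg2.connect(DB_CONN_STR) as conn:
            cursor = conn.cursor()
            # The wildcard is part of the bound value, not the query text,
            # so psycopg2 escapes the prefix safely.
            cursor.execute("SELECT id, url, last_modified FROM website "
                           "WHERE url LIKE %s "
                           "ORDER BY last_modified ASC LIMIT %s",
                           (prefix + "%", size))
            return cursor.fetchall()

Calling oldest_updated_urls(100, "http") matches both http:// and https:// URLs, which is presumably why the bare "http" prefix is passed in tasks.py.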
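
On the tasks.py side, ThreadPool comes from multiprocessing.pool and spawns threads despite its processes= keyword, which fits this I/O-bound queueing work; shrinking the pool from 10 to 3 workers simply throttles how quickly recrawl tasks are queued. A self-contained sketch of the same fan-out pattern, with a stand-in queue function (hypothetical, for illustration only):

    from multiprocessing.pool import ThreadPool

    def queue_task(url: str):
        # Stand-in for TaskManager.queue_task.
        print("queued", url)

    urls = ["http://example.com/a", "http://example.com/b", "http://example.com/c"]

    pool = ThreadPool(processes=3)  # three worker threads, as in the patch
    pool.map(func=queue_task, iterable=urls)
    pool.close()
    pool.join()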