Only queue http tasks (temp)

simon987 2019-04-06 09:07:17 -04:00
parent 310f343423
commit 06ae89f4d2
2 changed files with 5 additions and 4 deletions

@@ -309,14 +309,15 @@ class Database:
         conn.commit()

-    def get_oldest_updated_websites(self, size: int):
+    def get_oldest_updated_websites(self, size: int, prefix: str):
         with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()
             cursor.execute("SELECT id, url, last_modified FROM website "
+                           "WHERE url LIKE %s "
                            "ORDER BY last_modified ASC LIMIT %s",
-                           (size,))
+                           (prefix + "%", size, ))
             return [Website(url=r[1],
                             website_id=r[0],
                             last_modified=r[2],

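The new query, reduced to a self-contained sketch assuming a reachable PostgreSQL instance; DB_CONN_STR and get_oldest_updated_urls are placeholder names, while the SQL and the parameter tuple come straight from the diff:

    import psycopg2

    DB_CONN_STR = "dbname=od_database"  # placeholder connection string

    def get_oldest_updated_urls(prefix: str, size: int):
        with psycopg2.connect(DB_CONN_STR) as conn:
            cursor = conn.cursor()
            # The wildcard is appended in Python, so psycopg2 passes the
            # whole pattern as a single escaped parameter; LIKE 'http%'
            # matches both http:// and https:// URLs.
            cursor.execute("SELECT id, url, last_modified FROM website "
                           "WHERE url LIKE %s "
                           "ORDER BY last_modified ASC LIMIT %s",
                           (prefix + "%", size))
            return cursor.fetchall()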

@@ -133,7 +133,7 @@ class TaskManager:
     def _generate_crawling_tasks(self):

         # TODO: Insert more in-depth re-crawl logic here
-        websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE)
+        websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE, prefix="http")

         def recrawl(website: Website):
             crawl_task = Task(website.id, website.url,
@@ -141,7 +141,7 @@ class TaskManager:
                               )
             self.queue_task(crawl_task)

-        pool = ThreadPool(processes=10)
+        pool = ThreadPool(processes=3)
         pool.map(func=recrawl, iterable=websites_to_crawl)
         pool.close()
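
The queuing pattern from this hunk as a self-contained sketch; the recrawl body and the urls list are stand-ins for the real queue_task() call and database results, while the ThreadPool usage mirrors the diff:

    from multiprocessing.pool import ThreadPool

    def recrawl(url: str):
        # Stand-in for building a Task and calling queue_task()
        print("queuing crawl task for", url)

    urls = ["http://example.com/a/", "http://example.com/b/"]

    # A ThreadPool (threads, not processes) is enough here because the
    # work is I/O-bound; shrinking it from 10 to 3 workers throttles
    # how quickly tasks are pushed to the queue.
    pool = ThreadPool(processes=3)
    pool.map(func=recrawl, iterable=urls)
    pool.close()
    pool.join()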