From 06ae89f4d2bb5fea425a1ef9a77984ae398d119d Mon Sep 17 00:00:00 2001
From: simon987
Date: Sat, 6 Apr 2019 09:07:17 -0400
Subject: [PATCH] Only queue http tasks (temp)

---
 database.py | 5 +++--
 tasks.py    | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/database.py b/database.py
index eeb06b4..96c7fe4 100644
--- a/database.py
+++ b/database.py
@@ -309,14 +309,15 @@ class Database:
 
             conn.commit()
 
-    def get_oldest_updated_websites(self, size: int):
+    def get_oldest_updated_websites(self, size: int, prefix: str):
 
         with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()
 
             cursor.execute("SELECT id, url, last_modified FROM website "
+                           "WHERE url LIKE %s "
                            "ORDER BY last_modified ASC LIMIT %s",
-                           (size,))
+                           (prefix + "%", size, ))
             return [Website(url=r[1],
                             website_id=r[0],
                             last_modified=r[2],
diff --git a/tasks.py b/tasks.py
index f301531..012235a 100644
--- a/tasks.py
+++ b/tasks.py
@@ -133,7 +133,7 @@ class TaskManager:
     def _generate_crawling_tasks(self):
 
         # TODO: Insert more in-depth re-crawl logic here
-        websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE)
+        websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE, prefix="http")
 
         def recrawl(website: Website):
             crawl_task = Task(website.id, website.url,
@@ -141,7 +141,7 @@
                               )
             self.queue_task(crawl_task)
 
-        pool = ThreadPool(processes=10)
+        pool = ThreadPool(processes=3)
         pool.map(func=recrawl, iterable=websites_to_crawl)
 
         pool.close()
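
A note on the new LIKE filter in get_oldest_updated_websites: the "%" wildcard is appended to the prefix in Python, so the complete pattern (e.g. "http%") travels as a single bound parameter and psycopg2 never interpolates it into the SQL string. A minimal standalone sketch of the same technique, assuming a hypothetical connection string and the website table from the patch:

    import psycopg2

    # Hypothetical DSN for illustration; the real value lives in Database.db_conn_str.
    DB_CONN_STR = "dbname=od_database user=od_database"

    def oldest_updated_urls(size: int, prefix: str):
        with psycopg2.connect(DB_CONN_STR) as conn:
            cursor = conn.cursor()
            # The wildcard is part of the bound value, not the query text,
            # so psycopg2 escapes the prefix safely.
            cursor.execute("SELECT id, url, last_modified FROM website "
                           "WHERE url LIKE %s "
                           "ORDER BY last_modified ASC LIMIT %s",
                           (prefix + "%", size))
            return cursor.fetchall()

Calling oldest_updated_urls(100, "http") matches both http:// and https:// URLs, which is presumably why the bare "http" prefix is passed in tasks.py.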
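
On the tasks.py side, ThreadPool comes from multiprocessing.pool and spawns threads despite its processes= keyword, which fits this I/O-bound queueing work; shrinking the pool from 10 to 3 workers simply throttles how quickly recrawl tasks are queued. A self-contained sketch of the same fan-out pattern, with a stand-in queue function (hypothetical, for illustration only):

    from multiprocessing.pool import ThreadPool

    def queue_task(url: str):
        # Stand-in for TaskManager.queue_task.
        print("queued", url)

    urls = ["http://example.com/a", "http://example.com/b", "http://example.com/c"]

    pool = ThreadPool(processes=3)  # three worker threads, as in the patch
    pool.map(func=queue_task, iterable=urls)
    pool.close()
    pool.join()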