Mirror of https://github.com/simon987/od-database.git (synced 2025-04-10 14:06:45 +00:00)
Only queue http tasks (temp)
commit 06ae89f4d2 (parent 310f343423)
@@ -309,14 +309,15 @@ class Database:
 
         conn.commit()
 
-    def get_oldest_updated_websites(self, size: int):
+    def get_oldest_updated_websites(self, size: int, prefix: str):
 
         with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()
 
             cursor.execute("SELECT id, url, last_modified FROM website "
+                           "WHERE url LIKE %s "
                            "ORDER BY last_modified ASC LIMIT %s",
-                           (size,))
+                           (prefix + "%", size, ))
             return [Website(url=r[1],
                             website_id=r[0],
                             last_modified=r[2],
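Taken together, the patched query method reads roughly as the sketch below. This is a reconstruction from the hunk above, not the full source file: the closing "for r in cursor.fetchall()" and the Website import are assumptions, since the hunk is truncated after last_modified=r[2].

    import psycopg2
    # Website is the project's model class; its import is elided here.

    class Database:
        # Only the patched method is sketched; the rest of the class is elided.
        def get_oldest_updated_websites(self, size: int, prefix: str):
            with psycopg2.connect(self.db_conn_str) as conn:
                cursor = conn.cursor()
                # The prefix is turned into a LIKE pattern with a trailing wildcard,
                # so prefix="http" selects http:// and https:// URLs but not ftp://.
                cursor.execute("SELECT id, url, last_modified FROM website "
                               "WHERE url LIKE %s "
                               "ORDER BY last_modified ASC LIMIT %s",
                               (prefix + "%", size))
                # The trailing "for r in cursor.fetchall()" is assumed; the hunk
                # above cuts off after last_modified=r[2].
                return [Website(url=r[1],
                                website_id=r[0],
                                last_modified=r[2])
                        for r in cursor.fetchall()]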
tasks.py (4 changed lines)
@@ -133,7 +133,7 @@ class TaskManager:
     def _generate_crawling_tasks(self):
 
         # TODO: Insert more in-depth re-crawl logic here
-        websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE)
+        websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE, prefix="http")
 
         def recrawl(website: Website):
             crawl_task = Task(website.id, website.url,
@@ -141,7 +141,7 @@ class TaskManager:
                               )
             self.queue_task(crawl_task)
 
-        pool = ThreadPool(processes=10)
+        pool = ThreadPool(processes=3)
         pool.map(func=recrawl, iterable=websites_to_crawl)
         pool.close()
 
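With prefix="http" the query's LIKE pattern becomes 'http%', so both http:// and https:// websites still qualify for re-crawl while ftp:// ones are left out of the queue, which is what the commit title refers to. A hypothetical usage sketch (db is an already-constructed Database instance; the printed fields follow the Website model from the hunk above):

    # Hypothetical usage, assuming an already-constructed Database instance `db`.
    # LIKE 'http%' matches "http://..." and "https://..." but not "ftp://...",
    # so only http(s) websites are handed to the re-crawl pool.
    for website in db.get_oldest_updated_websites(10, prefix="http"):
        print(website.website_id, website.url, website.last_modified)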