Decentralised crawling should work in theory, plus a temporary fix for going further than the 10k results Elasticsearch allows by default

Simon
2018-06-21 19:44:27 -04:00
parent 098ad2be72
commit 14d384e366
9 changed files with 275 additions and 84 deletions
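
The Elasticsearch side of the "10k results" workaround lives in one of the other changed files and is not visible in the hunks below. For context only, the usual temporary fix is to raise the index.max_result_window setting above its 10 000 default; the sketch below shows that request, with the host, index name and helper being placeholders rather than code from this commit.

# Not part of this commit's diff: a minimal sketch of the common temporary
# workaround, raising index.max_result_window above the 10 000 default.
# Host, index name and limit are placeholders, not values from this repo.
import requests

ES_URL = "http://localhost:9200"   # assumed local Elasticsearch node
ES_INDEX = "files"                 # hypothetical index name

def raise_result_window(new_limit=100000):
    # max_result_window is a dynamic index setting, so a PUT to _settings
    # takes effect without reindexing.
    r = requests.put("{}/{}/_settings".format(ES_URL, ES_INDEX),
                     json={"index": {"max_result_window": new_limit}})
    r.raise_for_status()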


@@ -4,6 +4,7 @@ from urllib.parse import urlparse
 import os
 import bcrypt
 import uuid
+import task
 
 
 class InvalidQueryException(Exception):
@@ -277,6 +278,33 @@ class Database:
             cursor.execute("SELECT * FROM BlacklistedWebsite")
             return [BlacklistedWebsite(r[0], r[1]) for r in cursor.fetchall()]
 
+    def add_crawl_server(self, server: task.CrawlServer):
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            cursor.execute("INSERT INTO CrawlServer (url, name, slots, token) VALUES (?,?,?,?)",
+                           (server.url, server.name, server.slots, server.token))
+            conn.commit()
+
+    def remove_crawl_server(self, server_id):
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            cursor.execute("DELETE FROM CrawlServer WHERE id=?", (server_id, ))
+            conn.commit()
+
+    def get_crawl_servers(self) -> list:
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            cursor.execute("SELECT url, name, slots, token, id FROM CrawlServer")
+            return [task.CrawlServer(r[0], r[1], r[2], r[3], r[4]) for r in cursor.fetchall()]
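
The new add_crawl_server / remove_crawl_server / get_crawl_servers methods assume a CrawlServer table whose creation is not part of this hunk. The sketch below shows a schema that would satisfy the INSERT, DELETE and SELECT above; the column types and the helper function are assumptions, not code taken from the repo.

# Sketch of a CrawlServer table compatible with the queries above; the real
# schema lives in the project's init script, which this hunk does not show.
import sqlite3

def init_crawl_server_table(db_path):
    with sqlite3.connect(db_path) as conn:
        conn.execute("CREATE TABLE IF NOT EXISTS CrawlServer ("
                     "id INTEGER PRIMARY KEY, "   # matches DELETE ... WHERE id=?
                     "url TEXT, "
                     "name TEXT, "
                     "slots INTEGER, "
                     "token TEXT)")
        conn.commit()

With such a table in place, db.add_crawl_server(task.CrawlServer(url, name, slots, token)) would register a crawl server, and get_crawl_servers() returns the registered servers as task.CrawlServer objects, id included, assuming the CrawlServer constructor takes those arguments in that order.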