mirror of
				https://github.com/simon987/od-database.git
				synced 2025-10-25 19:56:51 +00:00 
			
		
		
		
	barebones crawl_server microservice
This commit is contained in:
		
							parent
							
								
									8421cc0885
								
							
						
					
					
						commit
						d849227798
					
				
							
								
								
									
										0
									
								
								crawl_server/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								crawl_server/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										73
									
								
								crawl_server/database.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										73
									
								
								crawl_server/database.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,73 @@ | ||||
| import os | ||||
| import json | ||||
| import sqlite3 | ||||
| 
 | ||||
| 
 | ||||
class Task:
    """A single crawl job queued for execution.

    ``callback_args`` is accepted as a JSON string and stored decoded
    as a dict (empty dict when absent).
    """

    def __init__(self, url: str, priority: int = 1, callback_type: str = None, callback_args: str = None):
        self.url = url
        self.priority = priority
        self.callback_type = callback_type
        if callback_args:
            self.callback_args = json.loads(callback_args)
        else:
            self.callback_args = {}

    def to_json(self):
        """Return a JSON-serialisable dict; callback_args is re-encoded to a string."""
        result = {}
        result["url"] = self.url
        result["priority"] = self.priority
        result["callback_type"] = self.callback_type
        result["callback_args"] = json.dumps(self.callback_args)
        return result
| 
 | ||||
| 
 | ||||
class TaskManagerDatabase:
    """Persistent task queue stored in a sqlite3 database file."""

    def __init__(self, db_path):
        self.db_path = db_path

        # Build the schema only for a brand-new database file.
        if not os.path.exists(db_path):
            self.init_database()

    def init_database(self):
        """Run the schema script (resolved relative to the working directory)."""
        with open("task_db_init.sql", "r") as f:
            init_script = f.read()

        with sqlite3.connect(self.db_path) as conn:
            conn.executescript(init_script)
            conn.commit()

    def pop_task(self):
        """Remove and return the best task (highest priority, then oldest id), or None."""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()

            cursor.execute("SELECT id, url, priority, callback_type, callback_args"
                           " FROM Queue ORDER BY priority DESC, Queue.id ASC LIMIT 1")
            row = cursor.fetchone()

            if row is None:
                return None

            # Delete before returning so the same task is not handed out twice.
            cursor.execute("DELETE FROM Queue WHERE id=?", (row[0],))
            conn.commit()
            return Task(row[1], row[2], row[3], row[4])

    def put_task(self, task: Task):
        """Insert a task; callback_args is serialised back to JSON for storage."""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()

            cursor.execute("INSERT INTO Queue (url, priority, callback_type, callback_args) VALUES (?,?,?,?)",
                           (task.url, task.priority, task.callback_type, json.dumps(task.callback_args)))
            conn.commit()

    def get_tasks(self):
        """Return every queued task without removing anything."""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()

            cursor.execute("SELECT * FROM Queue")
            rows = cursor.fetchall()

            return [Task(r[1], r[2], r[3], r[4]) for r in rows]
							
								
								
									
										40
									
								
								crawl_server/server.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										40
									
								
								crawl_server/server.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,40 @@ | ||||
from flask import Flask, request, abort, Response
import json
from crawl_server.task_manager import TaskManager, Task
app = Flask(__name__)

# Single task manager shared by all requests, backed by an on-disk queue.
tm = TaskManager("tm_db.sqlite3")


@app.route("/")
def hello():
    # Basic liveness endpoint.
    return "Hello World!"


@app.route("/task/")
def get_tasks():
    """Return all queued tasks as a JSON array."""
    json_str = json.dumps([task.to_json() for task in tm.get_tasks()])
    return Response(json_str, mimetype="application/json")


@app.route("/task/put", methods=["POST"])
def task_put():
    """Enqueue one task from a JSON body; 400 on missing fields or non-JSON."""

    if request.json:
        try:
            # All four fields are required; KeyError on any missing one.
            url = request.json["url"]
            priority = request.json["priority"]
            callback_type = request.json["callback_type"]
            callback_args = request.json["callback_args"]
        except KeyError:
            return abort(400)

        task = Task(url, priority, callback_type, callback_args)
        tm.put_task(task)
        return '{"ok": "true"}'

    return abort(400)


if __name__ == "__main__":
    app.run()
							
								
								
									
										73
									
								
								crawl_server/task_manager.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										73
									
								
								crawl_server/task_manager.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,73 @@ | ||||
| from crawl_server.database import TaskManagerDatabase, Task | ||||
| from multiprocessing import Pool | ||||
| from apscheduler.schedulers.background import BackgroundScheduler | ||||
| from enum import Enum | ||||
| from datetime import datetime | ||||
| from crawler.crawler import RemoteDirectoryCrawler | ||||
| 
 | ||||
| 
 | ||||
class TaskResultStatus(Enum):
    """Outcome of a completed crawl task."""
    SUCCESS = 0
    FAILURE = 1
| 
 | ||||
| 
 | ||||
class TaskResult:
    """Mutable record describing one task execution."""

    def __init__(self):
        # Outcome flag; stays None until the task has finished.
        self.status_code: TaskResultStatus = None
        # Number of files found by the crawl.
        self.file_count = 0
        # Set to datetime.utcnow() when the task starts / ends (see run_task).
        self.start_time = None
        self.end_time = None
        # Id of the crawled website, if known.
        self.website_id = None
| 
 | ||||
| 
 | ||||
class TaskManager:
    """Polls the sqlite task queue and executes tasks in a process pool.

    A BackgroundScheduler fires execute_queued_task every second; each
    popped task is dispatched to the pool asynchronously.
    """

    def __init__(self, db_path, max_processes=8):
        self.db = TaskManagerDatabase(db_path)
        self.pool = Pool(processes=max_processes)

        # NOTE(review): neither the scheduler nor the pool is ever shut down
        # explicitly — confirm their lifetime matches the owning process.
        scheduler = BackgroundScheduler()
        scheduler.add_job(self.execute_queued_task, "interval", seconds=1)
        scheduler.start()

    def put_task(self, task: Task):
        # Enqueue for later execution by the polling job.
        self.db.put_task(task)

    def get_tasks(self):
        # Snapshot of all queued (not yet started) tasks.
        return self.db.get_tasks()

    def execute_queued_task(self):
        """Scheduler callback: pop one task (if any) and hand it to the pool."""

        task = self.db.pop_task()
        if task:
            print("pooled " + task.url)
            # NOTE(review): no error_callback is given, so an exception raised
            # inside run_task is silently swallowed by the pool.
            self.pool.apply_async(
                TaskManager.run_task,
                args=(task, ),
                callback=TaskManager.task_complete
            )

    @staticmethod
    def run_task(task):
        """Run one crawl in a worker process and return its TaskResult."""
        result = TaskResult()
        result.start_time = datetime.utcnow()

        print("Starting task " + task.url)

        crawler = RemoteDirectoryCrawler(task.url, 10)
        crawler.crawl_directory()

        print("End task " + task.url)

        result.end_time = datetime.utcnow()

        # NOTE(review): status_code, file_count and website_id are never
        # filled in here — confirm whether that is intentional.
        return result

    @staticmethod
    def task_complete(result: TaskResult):
        """Pool callback invoked in the parent process when a task finishes."""
        print("Task done " + str(result))
        # todo save in db
| 
 | ||||
| 
 | ||||
| 
 | ||||
| @ -71,7 +71,7 @@ class RemoteDirectoryCrawler: | ||||
| 
 | ||||
|         try: | ||||
|             directory = RemoteDirectoryFactory.get_directory(self.url) | ||||
|             root_listing = directory.list_dir("/dl2/")  # todo get path | ||||
|             root_listing = directory.list_dir("/") | ||||
|             directory.close() | ||||
|         except TimeoutError: | ||||
|             return | ||||
|  | ||||
| @ -54,7 +54,7 @@ class HttpDirectory(RemoteDirectory): | ||||
|             if self._should_ignore(link): | ||||
|                 continue | ||||
| 
 | ||||
|             file_url = urljoin(path_url, link[1]) | ||||
|             file_url = urljoin(path_url, link.url) | ||||
|             path, file_name = os.path.split(file_url[len(self.base_url) - 1:]) | ||||
| 
 | ||||
|             if self._isdir(link): | ||||
|  | ||||
							
								
								
									
										14
									
								
								debug_put.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								debug_put.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,14 @@ | ||||
import requests
import json


# Manual smoke test: enqueue a single crawl task against a locally
# running crawl_server instance.
task = {
    "url": "http://124.158.108.137/ebooks/",
    "priority": 2,
    "callback_type": "",
    "callback_args": "{}"
}

r = requests.post("http://localhost:5000/task/put",
                  headers={"Content-Type": "application/json"},
                  data=json.dumps(task))
| @ -1,4 +1,5 @@ | ||||
| flask | ||||
| flask_testing | ||||
| requests | ||||
| bs4 | ||||
| validators | ||||
|  | ||||
							
								
								
									
										2
									
								
								task.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								task.py
									
									
									
									
									
								
							| @ -2,8 +2,6 @@ from apscheduler.schedulers.background import BackgroundScheduler | ||||
| import os | ||||
| from database import Website | ||||
| from multiprocessing import Value, Process | ||||
| from scrapy.crawler import CrawlerProcess | ||||
| from scrapy.utils.project import get_project_settings | ||||
| from database import Database | ||||
| from reddit_bot import RedditBot | ||||
| import praw | ||||
|  | ||||
							
								
								
									
										8
									
								
								task_db_init.sql
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								task_db_init.sql
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,8 @@ | ||||
| 
 | ||||
-- Pending crawl tasks; popped highest priority first, then FIFO by id.
CREATE TABLE Queue (
  id INTEGER PRIMARY KEY,
  url TEXT,                 -- root URL to crawl
  priority INTEGER,
  callback_type TEXT,
  callback_args TEXT        -- JSON-encoded arguments for the completion callback
);
| @ -1,190 +0,0 @@ | ||||
| from unittest import TestCase | ||||
| import sqlite3 | ||||
| from database import Database, File, Website, InvalidQueryException | ||||
| import os | ||||
| 
 | ||||
| 
 | ||||
class DatabaseTest(TestCase):
    """Integration tests for Database.

    Every test operates on a fresh test.sqlite3 scratch file, removed
    again in tearDown so tests stay independent.
    """

    def tearDown(self):
        # Clean up the scratch database between tests.
        if os.path.exists("test.sqlite3"):
            os.remove("test.sqlite3")

    def test_init_database_existing(self):
        # An already-existing (even empty) file must not be re-initialised.
        with open("test.sqlite3", "w"):
            pass

        Database("test.sqlite3")

        self.assertEqual(os.path.getsize("test.sqlite3"), 0)

    def test_init_database_new(self):

        Database("test.sqlite3")

        conn = sqlite3.connect("test.sqlite3")
        cur = conn.cursor()

        # Fix: execute() returns the (always truthy) cursor, so asserting on
        # it directly could never fail; assert on the fetched row instead.
        cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='Website'")
        self.assertIsNotNone(cur.fetchone())

        conn.close()

    def test_insert_website(self):

        db = Database("test.sqlite3")
        website_id = db.insert_website(Website("https://google.ca", "127.0.0.1", "firefox"))

        conn = sqlite3.connect("test.sqlite3")
        cursor = conn.cursor()
        cursor.execute("SELECT * FROM Website WHERE id=?", (website_id, ))

        db_website = cursor.fetchone()

        self.assertEqual(db_website[0], 1)
        self.assertEqual(db_website[1], "https://google.ca")
        self.assertEqual(db_website[2], "127.0.0.1")
        self.assertEqual(db_website[3], "firefox")
        self.assertIsNotNone(db_website[4])

    def test_insert_files(self):

        db = Database("test.sqlite3")
        website_id = db.insert_website(Website("", "", ""))
        db.insert_files([File(website_id, "/some/dir/", "text/plain", "file.txt", 1234)])

        conn = sqlite3.connect("test.sqlite3")
        cursor = conn.cursor()

        cursor.execute("SELECT * FROM File WHERE id=?", (1, ))
        db_file = cursor.fetchone()

        # The file row references its WebsitePath row by id.
        cursor.execute("SELECT * FROM WebsitePath WHERE id=?", (db_file[1], ))
        db_path = cursor.fetchone()

        self.assertEqual(db_file[0], 1)
        self.assertEqual(db_file[1], db_path[0])
        self.assertEqual(db_file[3], "file.txt")
        self.assertEqual(db_file[4], 1234)
        self.assertEqual(db_path[1], website_id)
        self.assertEqual(db_path[2], "/some/dir/")

    def test_import_json(self):

        db = Database("test.sqlite3")

        website_url = "http://google.ca/"
        logged_ip = "127.0.0.1"
        logged_useragent = "firefox"

        db.import_json("test/test_scan1.json", Website(website_url, logged_ip, logged_useragent))

        with sqlite3.connect("test.sqlite3") as conn:
            cursor = conn.cursor()

            cursor.execute("SELECT * FROM File WHERE name='Bleach - Chapter 001.cbz'")
            db_file1 = cursor.fetchone()

            self.assertEqual(db_file1[4], 8770750)

            cursor.execute("SELECT * FROM File WHERE name='Bleach - Chapter 007.cbz'")
            db_file2 = cursor.fetchone()

            self.assertEqual(db_file2[4], 3443820)

    def test_select_website(self):

        db = Database("test.sqlite3")

        website_id = db.insert_website(Website("https://simon987.net/", "127.0.0.1", "firefox"))

        website = db.get_website_by_url("https://simon987.net/")

        self.assertEqual(website.url, "https://simon987.net/")
        self.assertEqual(website.logged_ip, "127.0.0.1")
        self.assertEqual(website.logged_useragent, "firefox")
        self.assertEqual(website.id, website_id)
        self.assertIsNotNone(website.last_modified)

        self.assertIsNone(db.get_website_by_url("does not exist"))

    def test_enqueue(self):
        # Enqueueing the same website twice must store it only once.
        db = Database("test.sqlite3")

        web_id = db.insert_website(Website("https://simon987.net", "127.0.0.1", "firefox"))

        db.enqueue(web_id)
        db.enqueue(web_id)

        with sqlite3.connect("test.sqlite3") as conn:
            cursor = conn.cursor()

            cursor.execute("SELECT * FROM Queue")
            db_queued_website = cursor.fetchone()

            self.assertEqual(db_queued_website[0], 1)
            self.assertEqual(db_queued_website[1], web_id)
            self.assertIsNone(cursor.fetchone())

    def test_dequeue(self):
        # Dequeue is FIFO and returns None once the queue is drained.
        db = Database("test.sqlite3")

        web_id_1 = db.insert_website(Website("", "", ""))
        web_id_2 = db.insert_website(Website("", "", ""))

        db.enqueue(web_id_1)
        db.enqueue(web_id_2, "postid")

        self.assertEqual(db.dequeue()[0], web_id_1)
        self.assertEqual(db.dequeue()[1], "postid")
        self.assertEqual(db.dequeue(), None)
        self.assertEqual(db.dequeue(), None)

    def test_queue(self):
        # queue() returns full Website objects in enqueue order.
        db = Database("test.sqlite3")

        db.enqueue(db.insert_website(Website("w1", "i1", "a1")))
        db.enqueue(db.insert_website(Website("w2", "i2", "a2")))
        db.enqueue(db.insert_website(Website("w3", "i3", "a3")))

        queue = db.queue()

        self.assertEqual(queue[0].url, "w1")
        self.assertEqual(queue[1].logged_ip, "i2")
        self.assertEqual(queue[2].logged_useragent, "a3")
        self.assertIsNotNone(queue[2].last_modified)
        self.assertEqual(len(queue), 3)

    def test_get_website_by_id(self):

        db = Database("test.sqlite3")

        website_id = db.insert_website(Website("a", "b", "c"))

        website = db.get_website_by_id(website_id)

        self.assertEqual(website.id, website_id)
        self.assertEqual(website.url, "a")
        self.assertEqual(website.logged_ip, "b")
        self.assertEqual(website.logged_useragent, "c")
        self.assertIsNone(db.get_website_by_id(999))

    def test_search_handle_invalid_query(self):
        # Malformed search queries must raise rather than hit SQL.
        db = Database("test.sqlite3")

        with self.assertRaises(InvalidQueryException):
            db.search(";DROP DATABASE;")
        with self.assertRaises(InvalidQueryException):
            db.search("invalidCol:")
        with self.assertRaises(InvalidQueryException):
            db.search("*test*")

    def test_stats(self):
        # Smoke test only: just verify get_stats() does not raise.
        db = Database("test.sqlite3")

        db.get_stats()  # todo test
							
								
								
									
										53
									
								
								test/test_crawl_server.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										53
									
								
								test/test_crawl_server.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,53 @@ | ||||
| from flask_testing import LiveServerTestCase | ||||
| import os | ||||
| import json | ||||
| import requests | ||||
| from crawl_server.server import app | ||||
| from crawl_server.task_manager import TaskManager | ||||
| 
 | ||||
| 
 | ||||
class CrawlServerTest(LiveServerTestCase):
    """Live-server tests for the crawl_server HTTP API on port 9999."""

    headers = {
        "Content-Type": "application/json"
    }

    HOST = "http://localhost:9999"

    def create_app(self):
        # Required by LiveServerTestCase: return the app to serve.
        self.app = app
        app.config['LIVESERVER_PORT'] = 9999
        return app

    def test_put_only_accepts_json(self):
        # Without the JSON content type the endpoint must reject the body.
        payload = json.dumps({"url": "", "priority": 1, "callback_type": "", "callback_args": "{}"})
        r = requests.post(self.HOST + "/task/put", data=payload)
        self.assertEqual(400, r.status_code)

        r2 = requests.post(self.HOST + "/task/put", headers=self.headers, data=payload)
        self.assertEqual(200, r2.status_code)

    def test_put_task(self):
        # Round-trip: enqueue a task, then read it back from the queue listing.
        payload = json.dumps({
            "url": "a",
            "priority": 2,
            "callback_type": "c",
            "callback_args": '{"d": 4}'
        })

        requests.post(self.HOST + "/task/put", data=payload, headers=self.headers)

        # NOTE(review): the route is declared as "/task/" — this GET relies on
        # the server redirecting "/task"; confirm that is intended.
        r = requests.get(self.HOST + "/task")
        self.assertEqual(200, r.status_code)

        # Fix: dropped a leftover debug print(r.text) that cluttered test output.
        result = json.loads(r.text)[0]
        self.assertEqual(result["url"], "a")
        self.assertEqual(result["priority"], 2)
        self.assertEqual(result["callback_type"], "c")
        self.assertEqual(result["callback_args"], '{"d": 4}')
| 
 | ||||
| 
 | ||||
| @ -1,22 +0,0 @@ | ||||
| from unittest import TestCase | ||||
| from od_util import is_valid_url | ||||
| 
 | ||||
| 
 | ||||
class InputValidationTest(TestCase):
    """Sanity checks for od_util.is_valid_url."""

    def test_valid_url(self):
        # http/https with hostnames (incl. IDN) and 4–5 digit ports, all
        # ending in a trailing slash, must be accepted.
        for url in ("https://google.ca/",
                    "http://google.ca/",
                    "http://www.google.ca/",
                    "http://www.subdomain.google.ca/",
                    "http://mộtsốkýtựngẫunhiên.whatever/",
                    "http://simon987.net:1234/",
                    "http://simon987.net:12345/"):
            self.assertTrue(is_valid_url(url))

    def test_invalid_url(self):
        # Non-http schemes, bare hosts, missing trailing slash and missing
        # TLD must all be rejected.
        for url in ("ftp://simon987.net",
                    "git://simon987.net",
                    "simon987.net",
                    "http://simon987.net:8080",
                    "http://simon987/"):
            self.assertFalse(is_valid_url(url))
| @ -1,4 +0,0 @@ | ||||
| [ | ||||
| {"path": "/", "name": "Bleach - Chapter 001.cbz", "size": 8770750, "mime": "application/x-cbr"}, | ||||
| {"path": "/", "name": "Bleach - Chapter 007.cbz", "size": 3443820, "mime": "application/x-cbr"} | ||||
| ] | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user