barebones crawl_server microservice

This commit is contained in:
Simon 2018-06-11 19:00:43 -04:00
parent 8421cc0885
commit d849227798
14 changed files with 264 additions and 220 deletions

0
crawl_server/__init__.py Normal file
View File

73
crawl_server/database.py Normal file
View File

@ -0,0 +1,73 @@
import os
import json
import sqlite3
class Task:
    """A crawl job that is waiting in, or was read back from, the task queue.

    Attributes:
        url: Address handed to the crawler.
        priority: Integer ordering hint stored alongside the task.
        callback_type: Free-form callback identifier (may be None).
        callback_args: Callback arguments, always held as a decoded dict.
    """

    def __init__(self, url: str, priority: int = 1, callback_type: str = None, callback_args: str = None):
        """Create a task.

        Args:
            url: Address to crawl.
            priority: Ordering hint, defaults to 1.
            callback_type: Optional callback identifier.
            callback_args: Callback arguments, either as a JSON-encoded
                string (the form used for storage and transport) or as an
                already-decoded dict. Empty or missing values become ``{}``.

        Raises:
            ValueError: If ``callback_args`` is a string that is not valid JSON.
            TypeError: If ``callback_args`` is neither a string nor a dict.
        """
        self.url = url
        self.priority = priority
        self.callback_type = callback_type
        if isinstance(callback_args, dict):
            # Already decoded (e.g. sent as a nested JSON object); copy it so
            # the task does not share state with the caller's dict.
            self.callback_args = dict(callback_args)
        else:
            self.callback_args = json.loads(callback_args) if callback_args else {}

    def to_json(self):
        """Return a plain dict describing the task.

        ``callback_args`` is re-encoded as a JSON string, mirroring the form
        accepted by the constructor.
        """
        return {
            "url": self.url,
            "priority": self.priority,
            "callback_type": self.callback_type,
            "callback_args": json.dumps(self.callback_args),
        }
class TaskManagerDatabase:
def __init__(self, db_path):
self.db_path = db_path
if not os.path.exists(db_path):
self.init_database()
def init_database(self):
with open("task_db_init.sql", "r") as f:
init_script = f.read()
with sqlite3.connect(self.db_path) as conn:
conn.executescript(init_script)
conn.commit()
def pop_task(self):
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
cursor.execute("SELECT id, url, priority, callback_type, callback_args"
" FROM Queue ORDER BY priority DESC, Queue.id ASC LIMIT 1")
task = cursor.fetchone()
if task:
cursor.execute("DELETE FROM Queue WHERE id=?", (task[0],))
conn.commit()
return Task(task[1], task[2], task[3], task[4])
else:
return None
def put_task(self, task: Task):
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
cursor.execute("INSERT INTO Queue (url, priority, callback_type, callback_args) VALUES (?,?,?,?)",
(task.url, task.priority, task.callback_type, json.dumps(task.callback_args)))
conn.commit()
def get_tasks(self):
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
cursor.execute("SELECT * FROM Queue")
tasks = cursor.fetchall()
return [Task(t[1], t[2], t[3], t[4]) for t in tasks]

40
crawl_server/server.py Normal file
View File

@ -0,0 +1,40 @@
from flask import Flask, request, abort, Response
import json
from crawl_server.task_manager import TaskManager, Task
# Flask application object; the route handlers in this module register on it.
app = Flask(__name__)
# Module-wide task manager backed by the "tm_db.sqlite3" file. It is built at
# import time, so importing this module also constructs the TaskManager.
tm = TaskManager("tm_db.sqlite3")
@app.route("/")
def hello():
    """Answer requests for the root URL with a fixed greeting."""
    greeting = "Hello World!"
    return greeting
@app.route("/task/")
def get_tasks():
    """Return every task known to the task manager as a JSON array."""
    serialized = []
    for queued in tm.get_tasks():
        serialized.append(queued.to_json())

    body = json.dumps(serialized)
    return Response(body, mimetype="application/json")
@app.route("/task/put", methods=["POST"])
def task_put():
    """Queue a new task described by the JSON body of the request.

    The body must be a JSON object with the keys ``url``, ``priority``,
    ``callback_type`` and ``callback_args``. Anything else, including a
    non-JSON content type, an unparsable body, a missing key or
    ``callback_args`` that ``Task`` cannot decode, is answered with 400.

    Returns:
        A JSON response ``{"ok": "true"}`` once the task is stored.
    """
    # silent=True yields None instead of raising, so every kind of unusable
    # body funnels into the same 400 below.
    payload = request.get_json(silent=True)
    if not payload or not isinstance(payload, dict):
        return abort(400)

    try:
        url = payload["url"]
        priority = payload["priority"]
        callback_type = payload["callback_type"]
        callback_args = payload["callback_args"]
    except KeyError:
        return abort(400)

    try:
        task = Task(url, priority, callback_type, callback_args)
    except (TypeError, ValueError):
        # callback_args was not decodable JSON: a client error, not a 500.
        return abort(400)

    tm.put_task(task)
    # Same mimetype as the task listing endpoint, since the body is JSON.
    return Response('{"ok": "true"}', mimetype="application/json")
# Start Flask's built-in server only when this file is executed directly,
# not when it is imported (for example by the test suite).
if __name__ == "__main__":
    app.run()

View File

@ -0,0 +1,73 @@
from crawl_server.database import TaskManagerDatabase, Task
from multiprocessing import Pool
from apscheduler.schedulers.background import BackgroundScheduler
from enum import Enum
from datetime import datetime
from crawler.crawler import RemoteDirectoryCrawler
class TaskResultStatus(Enum):
    """Outcome codes that can be recorded on a TaskResult."""

    # Numeric values are part of the enum's contract; keep them stable.
    SUCCESS = 0
    FAILURE = 1
class TaskResult:
    """Summary of one executed crawl task.

    All fields start out empty and are filled in by whoever runs the task.
    """

    def __init__(self):
        # Outcome of the task; None until it has been decided.
        self.status_code: TaskResultStatus = None
        # Counter of files, starting at zero.
        self.file_count = 0
        # Timestamps taken around the task; None until set.
        self.start_time = None
        self.end_time = None
        # Identifier of the related website; None until set.
        self.website_id = None

    def __repr__(self):
        """Return a readable description, so logging a result shows its fields."""
        return (
            "TaskResult(status_code={!r}, file_count={!r}, start_time={!r}, "
            "end_time={!r}, website_id={!r})"
        ).format(self.status_code, self.file_count, self.start_time,
                 self.end_time, self.website_id)
class TaskManager:
    """Owns the task queue and feeds queued tasks to a pool of worker processes.

    A background scheduler calls ``execute_queued_task`` once per second;
    each call moves at most one task from the database to the process pool.
    """

    def __init__(self, db_path, max_processes=8):
        """Open the queue database, create the worker pool and start polling.

        Args:
            db_path: Path of the SQLite file holding the queue.
            max_processes: Number of worker processes in the pool.
        """
        self.db = TaskManagerDatabase(db_path)
        self.pool = Pool(processes=max_processes)

        # Keep a reference so the scheduler can be inspected or shut down
        # later instead of becoming unreachable after construction.
        self.scheduler = BackgroundScheduler()
        self.scheduler.add_job(self.execute_queued_task, "interval", seconds=1)
        self.scheduler.start()

    def put_task(self, task: Task):
        """Add ``task`` to the persistent queue."""
        self.db.put_task(task)

    def get_tasks(self):
        """Return the tasks that are currently queued."""
        return self.db.get_tasks()

    def execute_queued_task(self):
        """Pop the next queued task, if any, and submit it to the pool."""
        task = self.db.pop_task()

        if task:
            print("pooled " + task.url)

            self.pool.apply_async(
                TaskManager.run_task,
                args=(task, ),
                callback=TaskManager.task_complete,
                # Without this, an exception raised in the worker is stored on
                # an AsyncResult nobody reads and vanishes without a trace.
                error_callback=TaskManager.task_error
            )

    @staticmethod
    def run_task(task):
        """Crawl ``task.url`` in a worker process and return a TaskResult.

        Only the start and end timestamps of the result are filled in here.
        """
        result = TaskResult()
        result.start_time = datetime.utcnow()

        print("Starting task " + task.url)

        # NOTE(review): the meaning of the second argument (10) is defined by
        # RemoteDirectoryCrawler, which is not visible here - confirm.
        crawler = RemoteDirectoryCrawler(task.url, 10)
        crawler.crawl_directory()

        print("End task " + task.url)

        result.end_time = datetime.utcnow()

        return result

    @staticmethod
    def task_complete(result: TaskResult):
        """Pool callback invoked with the TaskResult of a finished task."""
        print("Task done " + str(result))
        # todo save in db

    @staticmethod
    def task_error(err):
        """Pool callback invoked with the exception raised by a failed task."""
        print("Task failed " + repr(err))

View File

@ -71,7 +71,7 @@ class RemoteDirectoryCrawler:
try:
directory = RemoteDirectoryFactory.get_directory(self.url)
root_listing = directory.list_dir("/dl2/") # todo get path
root_listing = directory.list_dir("/")
directory.close()
except TimeoutError:
return

View File

@ -54,7 +54,7 @@ class HttpDirectory(RemoteDirectory):
if self._should_ignore(link):
continue
file_url = urljoin(path_url, link[1])
file_url = urljoin(path_url, link.url)
path, file_name = os.path.split(file_url[len(self.base_url) - 1:])
if self._isdir(link):

14
debug_put.py Normal file
View File

@ -0,0 +1,14 @@
import requests
import json
# Manual helper: submits one hard-coded task to a crawl server that is
# expected to listen on localhost:5000.
task_fields = {
    "url": "http://124.158.108.137/ebooks/",
    "priority": 2,
    "callback_type": "",
    "callback_args": "{}"
}
payload = json.dumps(task_fields)

json_headers = {"Content-Type": "application/json"}

r = requests.post("http://localhost:5000/task/put", headers=json_headers, data=payload)

View File

@ -1,4 +1,5 @@
flask
flask_testing
requests
bs4
validators

View File

@ -2,8 +2,6 @@ from apscheduler.schedulers.background import BackgroundScheduler
import os
from database import Website
from multiprocessing import Value, Process
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from database import Database
from reddit_bot import RedditBot
import praw

8
task_db_init.sql Normal file
View File

@ -0,0 +1,8 @@
-- Queue: one row per pending crawl task.
--   id            integer primary key of the row
--   url           address of the task
--   priority      integer ordering hint
--   callback_type free-form text naming a callback
--   callback_args text column; the Python side writes JSON into it
CREATE TABLE Queue (
id INTEGER PRIMARY KEY,
url TEXT,
priority INTEGER,
callback_type TEXT,
callback_args TEXT
);

View File

@ -1,190 +0,0 @@
from unittest import TestCase
import sqlite3
from database import Database, File, Website, InvalidQueryException
import os
class DatabaseTest(TestCase):
    """Tests for ``Database``, run against a throw-away ``test.sqlite3`` file."""

    def tearDown(self):
        # Each test starts from a missing database file.
        if os.path.exists("test.sqlite3"):
            os.remove("test.sqlite3")

    def test_init_database_existing(self):
        """An already existing file must be left untouched by the constructor."""
        with open("test.sqlite3", "w"):
            pass

        Database("test.sqlite3")
        self.assertEqual(os.path.getsize("test.sqlite3"), 0)

    def test_init_database_new(self):
        """A missing file must be created with the Website table in it."""
        Database("test.sqlite3")

        conn = sqlite3.connect("test.sqlite3")
        try:
            cur = conn.cursor()
            cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='Website'")
            # execute() returns the cursor itself, which is always truthy;
            # the row has to be fetched to learn whether the table exists.
            self.assertIsNotNone(cur.fetchone())
        finally:
            conn.close()

    def test_insert_website(self):
        """insert_website stores every field and returns the new row id."""
        db = Database("test.sqlite3")
        website_id = db.insert_website(Website("https://google.ca", "127.0.0.1", "firefox"))

        conn = sqlite3.connect("test.sqlite3")
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT * FROM Website WHERE id=?", (website_id, ))
            db_website = cursor.fetchone()
        finally:
            conn.close()

        self.assertEqual(db_website[0], 1)
        self.assertEqual(db_website[1], "https://google.ca")
        self.assertEqual(db_website[2], "127.0.0.1")
        self.assertEqual(db_website[3], "firefox")
        self.assertIsNotNone(db_website[4])

    def test_insert_files(self):
        """insert_files creates the File row and the WebsitePath row it points to."""
        db = Database("test.sqlite3")
        website_id = db.insert_website(Website("", "", ""))
        db.insert_files([File(website_id, "/some/dir/", "text/plain", "file.txt", 1234)])

        conn = sqlite3.connect("test.sqlite3")
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT * FROM File WHERE id=?", (1, ))
            db_file = cursor.fetchone()

            cursor.execute("SELECT * FROM WebsitePath WHERE id=?", (db_file[1], ))
            db_path = cursor.fetchone()
        finally:
            conn.close()

        self.assertEqual(db_file[0], 1)
        self.assertEqual(db_file[1], db_path[0])
        self.assertEqual(db_file[3], "file.txt")
        self.assertEqual(db_file[4], 1234)
        self.assertEqual(db_path[1], website_id)
        self.assertEqual(db_path[2], "/some/dir/")

    def test_import_json(self):
        """import_json loads the files listed in the JSON scan fixture."""
        db = Database("test.sqlite3")

        website_url = "http://google.ca/"
        logged_ip = "127.0.0.1"
        logged_useragent = "firefox"

        db.import_json("test/test_scan1.json", Website(website_url, logged_ip, logged_useragent))

        with sqlite3.connect("test.sqlite3") as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT * FROM File WHERE name='Bleach - Chapter 001.cbz'")
            db_file1 = cursor.fetchone()

            self.assertEqual(db_file1[4], 8770750)

            cursor.execute("SELECT * FROM File WHERE name='Bleach - Chapter 007.cbz'")
            db_file2 = cursor.fetchone()

            self.assertEqual(db_file2[4], 3443820)

    def test_select_website(self):
        """get_website_by_url returns the stored website, or None when unknown."""
        db = Database("test.sqlite3")

        website_id = db.insert_website(Website("https://simon987.net/", "127.0.0.1", "firefox"))

        website = db.get_website_by_url("https://simon987.net/")

        self.assertEqual(website.url, "https://simon987.net/")
        self.assertEqual(website.logged_ip, "127.0.0.1")
        self.assertEqual(website.logged_useragent, "firefox")
        self.assertEqual(website.id, website_id)
        self.assertIsNotNone(website.last_modified)

        self.assertIsNone(db.get_website_by_url("does not exist"))

    def test_enqueue(self):
        """Enqueuing the same website twice leaves a single Queue row."""
        db = Database("test.sqlite3")

        web_id = db.insert_website(Website("https://simon987.net", "127.0.0.1", "firefox"))

        db.enqueue(web_id)
        db.enqueue(web_id)

        with sqlite3.connect("test.sqlite3") as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT * FROM Queue")
            db_queued_website = cursor.fetchone()

            self.assertEqual(db_queued_website[0], 1)
            self.assertEqual(db_queued_website[1], web_id)
            self.assertIsNone(cursor.fetchone())

    def test_dequeue(self):
        """dequeue returns entries in insertion order, then None once empty."""
        db = Database("test.sqlite3")

        web_id_1 = db.insert_website(Website("", "", ""))
        web_id_2 = db.insert_website(Website("", "", ""))

        db.enqueue(web_id_1)
        db.enqueue(web_id_2, "postid")

        self.assertEqual(db.dequeue()[0], web_id_1)
        self.assertEqual(db.dequeue()[1], "postid")
        self.assertEqual(db.dequeue(), None)
        self.assertEqual(db.dequeue(), None)

    def test_queue(self):
        """queue lists the enqueued websites in insertion order."""
        db = Database("test.sqlite3")

        db.enqueue(db.insert_website(Website("w1", "i1", "a1")))
        db.enqueue(db.insert_website(Website("w2", "i2", "a2")))
        db.enqueue(db.insert_website(Website("w3", "i3", "a3")))

        queue = db.queue()

        self.assertEqual(queue[0].url, "w1")
        self.assertEqual(queue[1].logged_ip, "i2")
        self.assertEqual(queue[2].logged_useragent, "a3")
        self.assertIsNotNone(queue[2].last_modified)
        self.assertEqual(len(queue), 3)

    def test_get_website_by_id(self):
        """get_website_by_id returns the stored website, or None when unknown."""
        db = Database("test.sqlite3")

        website_id = db.insert_website(Website("a", "b", "c"))

        website = db.get_website_by_id(website_id)

        self.assertEqual(website.id, website_id)
        self.assertEqual(website.url, "a")
        self.assertEqual(website.logged_ip, "b")
        self.assertEqual(website.logged_useragent, "c")

        self.assertIsNone(db.get_website_by_id(999))

    def test_search_handle_invalid_query(self):
        """Malformed search strings raise InvalidQueryException."""
        db = Database("test.sqlite3")

        with self.assertRaises(InvalidQueryException):
            db.search(";DROP DATABASE;")
        with self.assertRaises(InvalidQueryException):
            db.search("invalidCol:")
        with self.assertRaises(InvalidQueryException):
            db.search("*test*")

    def test_stats(self):
        """Smoke test: get_stats must run on an empty database."""
        db = Database("test.sqlite3")
        db.get_stats()  # todo test

53
test/test_crawl_server.py Normal file
View File

@ -0,0 +1,53 @@
from flask_testing import LiveServerTestCase
import os
import json
import requests
from crawl_server.server import app
from crawl_server.task_manager import TaskManager
class CrawlServerTest(LiveServerTestCase):
    """HTTP-level tests for the crawl server, run against a live Flask instance."""

    headers = {
        "Content-Type": "application/json"
    }

    # Must match the LIVESERVER_PORT configured in create_app().
    HOST = "http://localhost:9999"

    def create_app(self):
        """Return the application under test, bound to port 9999."""
        self.app = app
        app.config['LIVESERVER_PORT'] = 9999
        return app

    def test_put_only_accepts_json(self):
        """A body sent without the JSON content type is rejected with 400."""
        payload = json.dumps({"url": "", "priority": 1, "callback_type": "", "callback_args": "{}"})

        r = requests.post(self.HOST + "/task/put", data=payload)
        self.assertEqual(400, r.status_code)

        r2 = requests.post(self.HOST + "/task/put", headers=self.headers, data=payload)
        self.assertEqual(200, r2.status_code)

    def test_put_task(self):
        """A task that was put shows up, field for field, in the task listing."""
        payload = json.dumps({
            "url": "a",
            "priority": 2,
            "callback_type": "c",
            "callback_args": '{"d": 4}'
        })

        put_response = requests.post(self.HOST + "/task/put", data=payload, headers=self.headers)
        self.assertEqual(200, put_response.status_code)

        r = requests.get(self.HOST + "/task")
        self.assertEqual(200, r.status_code)

        # The queue is shared with the other tests and survives between runs,
        # so look the task up by URL rather than assuming it is listed first.
        # NOTE(review): the server's scheduler may already have dequeued the
        # task by the time it is listed - confirm whether that can happen here.
        matching = [queued for queued in json.loads(r.text) if queued["url"] == "a"]
        self.assertTrue(matching, "task 'a' was not found in the task listing")
        result = matching[0]

        self.assertEqual(result["url"], "a")
        self.assertEqual(result["priority"], 2)
        self.assertEqual(result["callback_type"], "c")
        self.assertEqual(result["callback_args"], '{"d": 4}')

View File

@ -1,22 +0,0 @@
from unittest import TestCase
from od_util import is_valid_url
class InputValidationTest(TestCase):
    """Checks which URLs ``is_valid_url`` accepts and which it rejects."""

    def test_valid_url(self):
        """Each of these URLs must be accepted."""
        accepted = (
            "https://google.ca/",
            "http://google.ca/",
            "http://www.google.ca/",
            "http://www.subdomain.google.ca/",
            "http://mộtsốkýtựngẫunhiên.whatever/",
            "http://simon987.net:1234/",
            "http://simon987.net:12345/",
        )
        for candidate in accepted:
            self.assertTrue(is_valid_url(candidate))

    def test_invalid_url(self):
        """Each of these URLs must be rejected."""
        rejected = (
            "ftp://simon987.net",
            "git://simon987.net",
            "simon987.net",
            "http://simon987.net:8080",
            "http://simon987/",
        )
        for candidate in rejected:
            self.assertFalse(is_valid_url(candidate))

View File

@ -1,4 +0,0 @@
[
{"path": "/", "name": "Bleach - Chapter 001.cbz", "size": 8770750, "mime": "application/x-cbr"},
{"path": "/", "name": "Bleach - Chapter 007.cbz", "size": 3443820, "mime": "application/x-cbr"}
]