Decentralised crawling should work in theory + temporary fix for going past the maximum of 10k results that Elasticsearch allows by default

Simon 2018-06-21 19:44:27 -04:00
parent 098ad2be72
commit 14d384e366
9 changed files with 275 additions and 84 deletions

app.py (36 lines changed)

@@ -8,7 +8,7 @@ from flask_recaptcha import ReCaptcha
 import od_util
 import config
 from flask_caching import Cache
-from task import TaskDispatcher, Task
+from task import TaskDispatcher, Task, CrawlServer
 from search.search import ElasticSearchEngine

 app = Flask(__name__)
@@ -349,8 +349,9 @@ def admin_dashboard():

         tokens = db.get_tokens()
         blacklist = db.get_blacklist()
+        crawl_servers = db.get_crawl_servers()

-        return render_template("dashboard.html", api_tokens=tokens, blacklist=blacklist)
+        return render_template("dashboard.html", api_tokens=tokens, blacklist=blacklist, crawl_servers=crawl_servers)
     else:
         return abort(403)
@@ -416,6 +417,37 @@ def admin_crawl_logs():
         return abort(403)


+@app.route("/crawl_server/add", methods=["POST"])
+def admin_add_crawl_server():
+    if "username" in session:
+        server = CrawlServer(
+            request.form.get("url"),
+            request.form.get("name"),
+            request.form.get("slots"),
+            request.form.get("token")
+        )
+        db.add_crawl_server(server)
+        flash("Added crawl server", "success")
+        return redirect("/dashboard")
+    else:
+        return abort(403)
+
+
+@app.route("/crawl_server/<int:server_id>/delete")
+def admin_delete_crawl_server(server_id):
+    if "username" in session:
+        db.remove_crawl_server(server_id)
+        flash("Deleted crawl server", "success")
+        return redirect("/dashboard")
+    else:
+        abort(403)
+
+
 if __name__ == '__main__':
     if config.USE_SSL:
         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
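
For reference, the two new admin routes are plain form posts guarded by the session check. A minimal sketch of registering a crawl server against a running instance; the host, port, field values, and the prior admin login are assumptions for illustration, not part of the diff:

    # Minimal sketch, assuming an od-database instance at http://localhost:12345 and a
    # requests.Session that has already been authenticated as an admin (the route only
    # checks that "username" is present in the Flask session and redirects to /dashboard).
    import requests

    session = requests.Session()
    # ... log the session in as an admin here ...

    session.post("http://localhost:12345/crawl_server/add", data={
        "url": "https://crawler1.example.com:5001",      # base URL of the crawl server API (assumed value)
        "name": "crawler #1",                            # display name shown on the dashboard (assumed value)
        "slots": 32,                                     # how many concurrent tasks the server should take
        "token": "<that server's CRAWL_SERVER_TOKEN>",   # sent back as "Authorization: Token ..." on every request
    })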

[file name not shown: remote directory crawler]

@@ -84,7 +84,7 @@ class RemoteDirectoryCrawler:
         self.crawled_paths = list()

     def crawl_directory(self, out_file: str) -> CrawlResult:
+        try:
             try:
                 directory = RemoteDirectoryFactory.get_directory(self.url)
                 path, root_listing = directory.list_dir("")
@@ -124,6 +124,8 @@ class RemoteDirectoryCrawler:
             file_writer_thread.join()

             return CrawlResult(files_written[0], "success")
+        except Exception as e:
+            return CrawlResult(0, str(e) + " \nType:" + str(type(e)))

     def _process_listings(self, url: str, in_q: Queue, files_q: Queue):

[file name not shown: crawl server Flask API]

@@ -7,15 +7,14 @@ import config
 app = Flask(__name__)
 auth = HTTPTokenAuth(scheme="Token")

-tokens = [config.CRAWL_SERVER_TOKEN]
+token = config.CRAWL_SERVER_TOKEN

 tm = TaskManager("tm_db.sqlite3", 32)


 @auth.verify_token
-def verify_token(token):
-    if token in tokens:
-        return True
+def verify_token(provided_token):
+    return token == provided_token


 @app.route("/task/")
@@ -99,4 +98,4 @@ def get_stats():

 if __name__ == "__main__":
-    app.run(port=5001, host="0.0.0.0")
+    app.run(port=config.CRAWL_SERVER_PORT, host="0.0.0.0", ssl_context="adhoc")
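
The server-side token check is now a straight comparison against the server's own config.CRAWL_SERVER_TOKEN, and the dispatcher sends that per-server token with every request (see task.py below). A minimal sketch of what such a request looks like on the wire; the host and token value are assumptions for illustration:

    # Minimal sketch, assuming a crawl server reachable at https://crawler1.example.com:5001
    # whose CRAWL_SERVER_TOKEN is "s3cret". verify=False mirrors how the dispatcher handles
    # the self-signed certificate from the "adhoc" TLS context the server now runs with.
    import requests

    r = requests.get(
        "https://crawler1.example.com:5001/task/",
        headers={
            "Content-Type": "application/json",
            "Authorization": "Token s3cret",  # HTTPTokenAuth(scheme="Token") expects this exact form
        },
        verify=False,
    )
    print(r.status_code)  # 401 if the token does not match, 200 with the queued tasks otherwise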

[file name not shown: Database class]

@@ -4,6 +4,7 @@ from urllib.parse import urlparse
 import os
 import bcrypt
 import uuid
+import task


 class InvalidQueryException(Exception):
@@ -277,6 +278,33 @@ class Database:
             cursor.execute("SELECT * FROM BlacklistedWebsite")
             return [BlacklistedWebsite(r[0], r[1]) for r in cursor.fetchall()]

+    def add_crawl_server(self, server: task.CrawlServer):
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            cursor.execute("INSERT INTO CrawlServer (url, name, slots, token) VALUES (?,?,?,?)",
+                           (server.url, server.name, server.slots, server.token))
+            conn.commit()
+
+    def remove_crawl_server(self, server_id):
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            cursor.execute("DELETE FROM CrawlServer WHERE id=?", (server_id, ))
+            conn.commit()
+
+    def get_crawl_servers(self) -> list:
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            cursor.execute("SELECT url, name, slots, token, id FROM CrawlServer")
+            return [task.CrawlServer(r[0], r[1], r[2], r[3], r[4]) for r in cursor.fetchall()]

[file name not shown: SQL schema]

@@ -23,3 +23,11 @@ CREATE TABLE BlacklistedWebsite (
     id INTEGER PRIMARY KEY NOT NULL,
     url TEXT
 );
+
+CREATE TABLE CrawlServer (
+    id INTEGER PRIMARY KEY NOT NULL,
+    url TEXT,
+    name TEXT,
+    token TEXT,
+    slots INTEGER
+)

[file name not shown: Python requirements]

@@ -15,3 +15,4 @@ python-dateutil
 flask_httpauth
 ujson
 timeout_decorator
+urllib3

[file name not shown: ElasticSearchEngine]

@@ -168,7 +168,7 @@ class ElasticSearchEngine(SearchEngine):
                     "path": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]}
                 }
             },
-            "size": per_page, "from": page * per_page}, index=self.index_name)
+            "size": per_page, "from": min(page * per_page, 10000 - per_page)}, index=self.index_name)

         return page
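
This is the "temporary fix" from the commit message: Elasticsearch rejects any page where from + size exceeds index.max_result_window (10 000 by default), so the offset is clamped to the deepest window that can still be served instead of letting the query fail. A minimal sketch of the clamping rule, assuming the default window size; the helper name is illustrative and does not exist in the codebase:

    # Minimal sketch, assuming Elasticsearch's default index.max_result_window of 10000.
    def clamp_search_offset(page: int, per_page: int, max_window: int = 10000) -> int:
        """Return a "from" offset such that from + size never exceeds max_window."""
        return min(page * per_page, max_window - per_page)

    # With 100 results per page, page 3 maps to offset 300 as usual, but page 150
    # is pulled back to offset 9900, the last window Elasticsearch will serve.
    assert clamp_search_offset(3, 100) == 300
    assert clamp_search_offset(150, 100) == 9900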

task.py (165 lines changed)

@@ -1,31 +1,41 @@
+import random
 from apscheduler.schedulers.background import BackgroundScheduler
 from search.search import ElasticSearchEngine
 from crawl_server.database import Task, TaskResult
 import requests
 from requests.exceptions import ConnectionError
 import json
-import config
-from database import Database
+import database
+from concurrent.futures import ThreadPoolExecutor
+import urllib3

+urllib3.disable_warnings()


 class CrawlServer:

-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": "Token " + config.CRAWL_SERVER_TOKEN,
-    }
-
-    def __init__(self, url, name):
+    def __init__(self, url, name, slots, token, server_id=None):
         self.url = url
         self.name = name
+        self.slots = slots
+        self.used_slots = 0
+        self.token = token
+        self.id = server_id
+
+    def _generate_headers(self):
+        return {
+            "Content-Type": "application/json",
+            "Authorization": "Token " + self.token,
+        }

     def queue_task(self, task: Task) -> bool:
         print("Sending task to crawl server " + self.url)
         try:
             payload = json.dumps(task.to_json())
-            r = requests.post(self.url + "/task/put", headers=CrawlServer.headers, data=payload)
-            print(r)
+            r = requests.post(self.url + "/task/put", headers=self._generate_headers(), data=payload, verify=False)
+            print(r)  # TODO: If the task could not be added, fallback to another server
             return r.status_code == 200
         except ConnectionError:
             return False
@@ -33,40 +43,63 @@ class CrawlServer:

     def fetch_completed_tasks(self) -> list:
         try:
-            r = requests.get(self.url + "/task/completed", headers=CrawlServer.headers)
+            r = requests.get(self.url + "/task/completed", headers=self._generate_headers(), verify=False)
+            if r.status_code != 200:
+                print("Problem while fetching completed tasks for '" + self.name + "': " + str(r.status_code))
+                print(r.text)
+                return []
             return [
                 TaskResult(r["status_code"], r["file_count"], r["start_time"], r["end_time"], r["website_id"])
                 for r in json.loads(r.text)]
         except ConnectionError:
-            print("Crawl server cannot be reached " + self.url)
+            print("Crawl server cannot be reached @ " + self.url)
             return []

-    def fetch_queued_tasks(self) -> list:
+    def fetch_queued_tasks(self):
         try:
-            r = requests.get(self.url + "/task/", headers=CrawlServer.headers)
+            r = requests.get(self.url + "/task/", headers=self._generate_headers(), verify=False)
+            if r.status_code != 200:
+                print("Problem while fetching queued tasks for '" + self.name + "' " + str(r.status_code))
+                print(r.text)
+                return None
             return [
                 Task(t["website_id"], t["url"], t["priority"], t["callback_type"], t["callback_args"])
                 for t in json.loads(r.text)
             ]
         except ConnectionError:
-            return []
+            return None

     def fetch_current_tasks(self):
         try:
-            r = requests.get(self.url + "/task/current", headers=CrawlServer.headers)
+            r = requests.get(self.url + "/task/current", headers=self._generate_headers(), verify=False)
+            if r.status_code != 200:
+                print("Problem while fetching current tasks for '" + self.name + "' " + str(r.status_code))
+                print(r.text)
+                return None
             return [
                 Task(t["website_id"], t["url"], t["priority"], t["callback_type"], t["callback_args"])
                 for t in json.loads(r.text)
             ]
         except ConnectionError:
-            return []
+            return None

     def fetch_website_files(self, website_id) -> str:
         try:
-            r = requests.get(self.url + "/file_list/" + str(website_id) + "/", stream=True, headers=CrawlServer.headers)
+            r = requests.get(self.url + "/file_list/" + str(website_id) + "/", stream=True,
+                             headers=self._generate_headers(), verify=False)
+            if r.status_code != 200:
+                print("Problem while fetching website files for '" + self.name + "': " + str(r.status_code))
+                print(r.text)
+                return ""
             for line in r.iter_lines(chunk_size=1024 * 256):
                 yield line
         except ConnectionError:
@@ -75,7 +108,8 @@ class CrawlServer:

     def free_website_files(self, website_id) -> bool:
         try:
-            r = requests.get(self.url + "/file_list/" + str(website_id) + "/free", headers=CrawlServer.headers)
+            r = requests.get(self.url + "/file_list/" + str(website_id) + "/free", headers=self._generate_headers(),
+                             verify=False)
             return r.status_code == 200
         except ConnectionError as e:
             print(e)
@@ -84,16 +118,29 @@ class CrawlServer:

     def fetch_crawl_logs(self):
         try:
-            r = requests.get(self.url + "/task/logs/", headers=CrawlServer.headers)
+            r = requests.get(self.url + "/task/logs/", headers=self._generate_headers(), verify=False)
+            if r.status_code != 200:
+                print("Problem while fetching crawl logs for '" + self.name + "': " + str(r.status_code))
+                print(r.text)
+                return []
             return [
-                TaskResult(r["status_code"], r["file_count"], r["start_time"], r["end_time"], r["website_id"], r["indexed_time"])
+                TaskResult(r["status_code"], r["file_count"], r["start_time"],
+                           r["end_time"], r["website_id"], r["indexed_time"])
                 for r in json.loads(r.text)]
         except ConnectionError:
             return []

     def fetch_stats(self):
         try:
-            r = requests.get(self.url + "/stats/", headers=CrawlServer.headers)
+            r = requests.get(self.url + "/stats/", headers=self._generate_headers(), verify=False)
+            if r.status_code != 200:
+                print("Problem while fetching stats for '" + self.name + "': " + str(r.status_code))
+                print(r.text)
+                return []
             return json.loads(r.text)
         except ConnectionError:
             return {}
@@ -107,16 +154,11 @@ class TaskDispatcher:
         scheduler.start()

         self.search = ElasticSearchEngine("od-database")
-        self.db = Database("db.sqlite3")
-
-        # TODO load from config
-        self.crawl_servers = [
-            CrawlServer("http://localhost:5001", "OVH_VPS_SSD2 #1"),
-        ]
+        self.db = database.Database("db.sqlite3")

     def check_completed_tasks(self):
-        for server in self.crawl_servers:
+        for server in self.db.get_crawl_servers():
             for task in server.fetch_completed_tasks():
                 print("Completed task")
                 # All files are overwritten
@@ -135,24 +177,63 @@ class TaskDispatcher:
         self._get_available_crawl_server().queue_task(task)

     def _get_available_crawl_server(self) -> CrawlServer:
-        # TODO: Load balancing & health check for crawl servers
-        return self.crawl_servers[0]
+        queued_tasks_by_server = self._get_current_tasks_by_server()
+        server_with_most_free_slots = None
+        most_free_slots = 0
+
+        for server in queued_tasks_by_server:
+            free_slots = server.slots - len(queued_tasks_by_server[server])
+            if free_slots > most_free_slots:
+                server_with_most_free_slots = server
+                most_free_slots = free_slots
+
+        if server_with_most_free_slots:
+            print("Dispatching task to '" +
+                  server_with_most_free_slots.name + "' " +
+                  str(most_free_slots) + " free out of " + str(server_with_most_free_slots.slots))
+
+        return self.db.get_crawl_servers()[0]

     def get_queued_tasks(self) -> list:
-        queued_tasks = []
+        queued_tasks_by_server = self._get_queued_tasks_by_server()
+        for queued_tasks in queued_tasks_by_server.values():
+            for task in queued_tasks:
+                yield task
+
-        for server in self.crawl_servers:
-            queued_tasks.extend(server.fetch_queued_tasks())
+    def _get_queued_tasks_by_server(self) -> dict:
+
+        queued_tasks = dict()
+        pool = ThreadPoolExecutor(max_workers=10)
+        crawl_servers = self.db.get_crawl_servers()
+        responses = list(pool.map(lambda server: server.fetch_queued_tasks(), crawl_servers))
+        pool.shutdown()
+
+        for i, server in enumerate(crawl_servers):
+            if responses[i] is not None:
+                queued_tasks[server] = responses[i]
+
         return queued_tasks

-    def get_current_tasks(self) -> list:
-        # TODO mem cache this
-        current_tasks = []
-        for server in self.crawl_servers:
-            current_tasks.extend(server.fetch_current_tasks())
+    def get_current_tasks(self):
+
+        current_tasks_by_server = self._get_current_tasks_by_server()
+        for current_tasks in current_tasks_by_server.values():
+            for task in current_tasks:
+                yield task
+
+    def _get_current_tasks_by_server(self) -> dict:
+
+        current_tasks = dict()
+        pool = ThreadPoolExecutor(max_workers=10)
+        crawl_servers = self.db.get_crawl_servers()
+        responses = list(pool.map(lambda s: s.fetch_current_tasks(), crawl_servers))
+        pool.shutdown()
+
+        for i, server in enumerate(crawl_servers):
+            if responses[i] is not None:
+                current_tasks[server] = responses[i]

         return current_tasks
@@ -160,7 +241,7 @@ class TaskDispatcher:

         task_logs = dict()

-        for server in self.crawl_servers:
+        for server in self.db.get_crawl_servers():
             task_logs[server.name] = server.fetch_crawl_logs()

         return task_logs
@@ -169,11 +250,9 @@ class TaskDispatcher:

         stats = dict()

-        for server in self.crawl_servers:
+        for server in self.db.get_crawl_servers():
             server_stats = server.fetch_stats()
             if server_stats:
                 stats[server.name] = server_stats

         return stats
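
The dispatcher now picks the crawl server with the most free slots, where free slots are the server's configured slot count minus the tasks it currently holds. A small worked example of that selection rule with stand-in data; the names and queue lengths are made up for illustration, while the real code in task.py operates on CrawlServer objects:

    # Minimal sketch of the free-slot rule with stand-in data instead of live servers.
    servers = {
        ("crawler #1", 32): [None] * 30,  # 30 tasks on a 32-slot server -> 2 free slots
        ("crawler #2", 16): [None] * 4,   # 4 tasks on a 16-slot server -> 12 free slots
    }

    best, most_free = None, 0
    for (name, slots), tasks in servers.items():
        free = slots - len(tasks)
        if free > most_free:
            best, most_free = name, free

    print(best, most_free)  # crawler #2 12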

[file name not shown: dashboard template]

@@ -7,6 +7,48 @@
         <div class="card-header">Dashboard</div>
         <div class="card-body">

+            <h3>Crawl servers</h3>
+            <table class="table table-striped">
+                <thead>
+                <tr>
+                    <th>Url</th>
+                    <th>Name</th>
+                    <th>Slots</th>
+                    <th>Action</th>
+                </tr>
+                </thead>
+                <tbody>
+                {% for server in crawl_servers %}
+                    <tr>
+                        <td>{{ server.url }}</td>
+                        <td>{{ server.name }}</td>
+                        <td>{{ server.slots }}</td>
+                        <td><a class="btn btn-danger" href="/crawl_server/{{ server.id }}/delete">Delete</a></td>
+                    </tr>
+                {% endfor %}
+                </tbody>
+            </table>
+
+            <form action="/crawl_server/add" method="post">
+                <div class="form-row">
+                    <div class="col col-md-3">
+                        <input class="form-control" name="url" placeholder="Url">
+                    </div>
+                    <div class="col col-md-3">
+                        <input class="form-control" name="name" placeholder="Name">
+                    </div>
+                    <div class="col col-md-2">
+                        <input class="form-control" name="token" placeholder="Token">
+                    </div>
+                    <div class="col col-md-2">
+                        <input class="form-control" name="slots" placeholder="Slots" type="number">
+                    </div>
+                    <div class="col col-md-2">
+                        <input type="submit" class="form-control btn btn-primary" value="Add server">
+                    </div>
+                </div>
+            </form>
+            <br>
+            <hr>
+
             <h3>API Keys</h3>
             <table class="table table-striped">
                 <thead>