diff --git a/api.py b/api.py
index 4498dd4..6e0f18f 100644
--- a/api.py
+++ b/api.py
@@ -16,50 +16,9 @@ uploadLock = Lock()
 
 
 def setup_api(app):
 
-    @app.route("/api/task/get", methods=["POST"])
-    def api_get_task():
-        token = request.form.get("token")
-        name = oddb.db.check_api_token(token)
-        accept_ftp = request.form.get("accept") == "ftp" if "accept" in request.form else False
-
-        if name:
-            task = oddb.db.pop_task(name, accept_ftp)
-            oddb.logger.debug("API get task from " + name)
-
-            if task:
-                oddb.logger.info("Assigning task " + str(task.to_json()) + " to " + name)
-            else:
-                oddb.logger.info("No queued tasks, creating a new one")
-
-                try:
-                    task = oddb.db.make_task_for_oldest(name)
-                except:
-                    oddb.logger.error("Couldn't create new task")
-                    abort(404)
-
-            return Response(str(task), mimetype="application/json")
-        else:
-            return abort(403)
-
-    @app.route("/api/task/cancel", methods=["POST"])
-    def api_cancel_task():
-        token = request.form.get("token")
-        name = oddb.db.check_api_token(token)
-
-        if name:
-            website_id = request.form.get("website_id") if "website_id" in request.form else None
-            if website_id:
-                oddb.logger.debug("API task cancel for " + str(website_id) + " by " + name)
-                oddb.db.delete_task(website_id)
-                return Response("cancelled task")
-            else:
-                abort(400)
-
-        else:
-            abort(403)
-
     @app.route("/api/task/complete", methods=["POST"])
     def api_complete_task():
+        # TODO: task_tracker
         token = request.form.get("token")
         name = oddb.db.check_api_token(token)
@@ -201,6 +160,7 @@ def setup_api(app):
 
         if name:
             url = request.form.get("url")
+            # TODO: task_tracker
             message, result = oddb.try_enqueue(url)
             oddb.logger.info("API try enqueue '" + url + "' by " + name + " (" + message + ")")
 
diff --git a/database.py b/database.py
index 71079c3..711fc7a 100644
--- a/database.py
+++ b/database.py
@@ -1,11 +1,10 @@
-import sqlite3
 import json
-import datetime
-from urllib.parse import urlparse
 import os
-import bcrypt
+import sqlite3
 import uuid
-import tasks
+from urllib.parse import urlparse
+
+import bcrypt
 
 
 class BlacklistedWebsite:
@@ -155,6 +154,7 @@ class Database:
 
     def make_task_for_oldest(self, assigned_crawler):
+        # TODO: task_tracker
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
             cursor.execute("INSERT INTO QUEUE (website_id, url, assigned_crawler) SELECT Website.id, Website.url, ? "
                            "FROM Website WHERE Website.id not in (SELECT website_id FROM Queue) "
@@ -326,47 +326,6 @@ class Database:
             cursor.execute("SELECT * FROM BlacklistedWebsite")
             return [BlacklistedWebsite(r[0], r[1]) for r in cursor.fetchall()]
 
-    def log_result(self, result):
-
-        with sqlite3.connect(self.db_path) as conn:
-
-            cursor = conn.cursor()
-
-            cursor.execute("INSERT INTO TaskResult "
-                           "(server, website_id, status_code, file_count, start_time, end_time) "
-                           "VALUES (?,?,?,?,?,?)",
-                           (result.server_id, result.website_id, result.status_code,
-                            result.file_count, result.start_time, result.end_time))
-            conn.commit()
-
-    def get_crawl_logs(self):
-
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-
-            cursor.execute("SELECT website_id, status_code, file_count, start_time, end_time, server "
-                           "FROM TaskResult ORDER BY end_time DESC")
-            return [tasks.TaskResult(r[1], r[2], r[3], r[4], r[0], str(r[5])) for r in cursor.fetchall()]
-
-    def get_stats_by_crawler(self):
-        stats = []
-        task_results = self.get_crawl_logs()
-
-        for crawler in self.get_tokens():
-            task_count = sum(1 for result in task_results if result.server_name == crawler.name)
-            if task_count > 0:
-                info = dict()
-                info["file_count"] = sum(result.file_count for result in task_results if result.server_name == crawler.name)
-                info["time"] = sum((result.end_time - result.start_time) for result in task_results if result.server_name == crawler.name)
-                info["task_count"] = task_count
-                info["time_avg"] = info["time"] / task_count
-                info["file_count_avg"] = info["file_count"] / task_count
-                stats.append((crawler.name, info))
-
-        stats.sort(key=lambda t: t[1]["file_count"], reverse=True)
-
-        return stats
-
     def log_search(self, remote_addr, forwarded_for, q, exts, page, blocked, results, took):
 
         with sqlite3.connect(self.db_path) as conn:
@@ -376,71 +335,3 @@ class Database:
                            "VALUES (?,?,?,?,?,?,?,?)",
                            (remote_addr, forwarded_for, q, ",".join(exts), page, blocked, results, took))
             conn.commit()
-
-    def put_task(self, task: Task, assigned_crawler=None) -> None:
-
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-
-            cursor.execute("INSERT INTO Queue (website_id, url, priority, callback_type, callback_args, assigned_crawler) "
-                           "VALUES (?,?,?,?,?,?)",
-                           (task.website_id, task.url, task.priority,
-                            task.callback_type, json.dumps(task.callback_args), assigned_crawler))
-            conn.commit()
-
-    def get_tasks(self) -> list:
-
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-
-            cursor.execute("SELECT website_id, url, priority, callback_type, callback_args FROM Queue "
-                           "WHERE assigned_crawler is NULL ")
-            db_tasks = cursor.fetchall()
-
-            return [Task(t[0], t[1], t[2], t[3], t[4]) for t in db_tasks]
-
-    def pop_task(self, name, ftp: bool) -> Task:
-
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-
-            cursor.execute("SELECT id, website_id, url, priority, callback_type, callback_args " +
-                           "FROM Queue WHERE assigned_crawler is NULL " +
-                           ("AND url LIKE 'ftp%' " if ftp else "AND url LIKE 'http%' ") +
-                           "ORDER BY priority DESC, Queue.id " +
-                           "ASC LIMIT 1")
-            task = cursor.fetchone()
-
-            if task:
-                cursor.execute("UPDATE Queue SET assigned_crawler=? WHERE id=?", (name, task[0],))
-                conn.commit()
-                return Task(task[1], task[2], task[3], task[4], task[5])
-            else:
-                return None
-
-    def delete_task(self, website_id):
-
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-            cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id, ))
-
-    def complete_task(self, website_id: int, name: str) -> Task:
-
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-
-            cursor.execute("SELECT id, website_id, url, priority, callback_type, callback_args FROM "
-                           "Queue WHERE website_id=?", (website_id, ))
-
-            task = cursor.fetchone()
-
-            if task:
-                cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id, ))
-                conn.commit()
-                return Task(task[1], task[2], task[3], task[4], task[5])
-            else:
-                return None
-
-
-
diff --git a/debug_put.py b/debug_put.py
deleted file mode 100644
index 8a2f7eb..0000000
--- a/debug_put.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import requests
-import json
-
-
-payload = json.dumps({
-    "token": "4eafc6ed-74b7-4f04-9d34-7f3e01201003",
-    "website_id": 3,
-    "url": "http://localhost:8000/",
-    "priority": 2,
-    "callback_type": "",
-    "callback_args": "{}"
-})
-
-r = requests.post("http://localhost/api/task/enqueue",
-                  headers={"Content-Type": "application/json"},
-                  data=payload)
-print(r)
-print(r.text)
diff --git a/init_script.sql b/init_script.sql
index 2774e08..c5de4a9 100644
--- a/init_script.sql
+++ b/init_script.sql
@@ -20,19 +20,6 @@ CREATE TABLE BlacklistedWebsite (
   url TEXT
 );
 
-CREATE TABLE TaskResult (
-  id INTEGER PRIMARY KEY,
-  server TEXT,
-  website_id INT,
-  status_code TEXT,
-  file_count INT,
-  start_time TIMESTAMP,
-  end_time TIMESTAMP,
-  indexed_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-
-  FOREIGN KEY (server) REFERENCES ApiClient(name)
-);
-
 CREATE TABLE ApiClient (
   name TEXT PRIMARY KEY NOT NULL,
   token TEXT NOT NULL
@@ -51,15 +38,3 @@ CREATE TABLE SearchLogEntry (
   results INT DEFAULT 0,
   took INT DEFAULT 0
 );
-
-CREATE TABLE Queue (
-  id INTEGER PRIMARY KEY,
-  website_id INTEGER,
-  url TEXT,
-  priority INTEGER,
-  callback_type TEXT,
-  callback_args TEXT,
-  assigned_crawler TEXT NULL DEFAULT NULL,
-
-  FOREIGN KEY (assigned_crawler) REFERENCES ApiClient(name)
-);
diff --git a/stress_test.py b/stress_test.py
deleted file mode 100644
index dec7f59..0000000
--- a/stress_test.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import os
-import json
-import shutil
-from search.search import ElasticSearchEngine
-from concurrent.futures import ThreadPoolExecutor
-import requests
-import random
-
-terms = requests.get("https://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain") \
-    .text.splitlines()
-exts = [
-    "zip", "exe", "mp3", "avi", "mp4", "rar", "7zip", "ogg", "m4a", "flac", "doc", "docx", "aac", "xls",
-    "cab", "txt", "c", "java", "class", "jar", "py", "cpp", "h", "png", "jpg", "jpeg", "ttf", "torrent",
-    "part", "blend", "3ds", "obj", "ico", "html", "css", "js", "ts", "ape", "asm", "nasm", "fasm", "o",
-    "so", "dll", "tar", "gz", "bin", "cad", "cmd", "bat", "sh", "md"
-]
-
-
-def dump_local_filesystem(root_dir: str):
-
-    docs = []
-
-    for root, dirs, files in os.walk(root_dir):
-
-        for filename in files:
-            full_path = os.path.join(root, filename)
-            stats = os.stat(full_path)
-
-            doc = dict()
-            doc["name"] = filename
-            doc["path"] = root
-            doc["mtime"] = stats.st_mtime
-            doc["size"] = stats.st_size
-
-            docs.append(doc)
-
-    with open("local_filesystem.json", "w") as f:
-        f.writelines(json.dumps(doc) + "\n" for doc in docs)
-
-
-def random_path():
-    return "/".join(random.choices(terms, k=random.randint(1, 5)))
-
-
-def random_file_name():
-    return random.choice(["_", " ", "-", ".", "#", ""]).\
-        join(random.choices(terms, k=random.randint(1, 3))) + "." + random.choice(exts)
-
-
-def get_random_file():
-
-    doc = dict()
-    doc["name"] = random_file_name()
-    doc["path"] = random_path()
-    doc["mtime"] = random.randint(0, 1000000000000)
-    doc["size"] = random.randint(-1, 1000000000)
-
-    return doc
-
-
-def dump_random_files(count=10):
-    with open("random_dump.json", "w") as f:
-        f.writelines(json.dumps(get_random_file()) + "\n" for _ in range(count))
-
-
-def index_file_list(path: str, website_id):
-
-    es = ElasticSearchEngine("od-database")
-    with open(path, "r") as f:
-        es.import_json(f.readlines(), website_id)
-
-
-def search(term=""):
-    requests.get("http://localhost/search?q=" + term, verify=False)
-    print(term)
-
-
-def random_searches(count=10000000, max_workers=1000):
-
-    pool = ThreadPoolExecutor(max_workers=max_workers)
-    pool.map(search, random.choices(terms, k=count))
-
-
-def make_wide_filesystem(count=100000):
-
-    shutil.rmtree("stress_test")
-    os.mkdir("stress_test")
-    for _ in range(count):
-        new_path = "stress_test/" + random.choice(terms)
-        if not os.path.exists(new_path):
-            os.mkdir(new_path)
-
-
-# dump_local_filesystem("/mnt/")
-# index_file_list("local_filesystem.json", 4)
-# random_searches(100000)
-# dump_random_files(20000 * 100000)
-# make_wide_filesystem(10000)
diff --git a/tasks.py b/tasks.py
index 3b221fa..a15ac28 100644
--- a/tasks.py
+++ b/tasks.py
@@ -90,5 +90,3 @@ class TaskManager:
 
         self.db.put_task(task)
         print("Queued task and made it available to crawlers: " + str(task.website_id))
-
-    def get_queued_tasks(self) -> list:
-        return self.db.get_tasks()
diff --git a/templates/crawl_logs.html b/templates/crawl_logs.html
deleted file mode 100644
index 6422df4..0000000
--- a/templates/crawl_logs.html
+++ /dev/null
@@ -1,36 +0,0 @@
-{% extends "layout.html" %}
-{% set title = "Crawl logs - OD-Database" %}
-
-{% block body %}
-
-    <table class="table">
-        <thead>
-        <tr>
-            <th>Crawler</th>
-            <th>Website</th>
-            <th>Status code</th>
-            <th>File count</th>
-            <th>Start</th>
-            <th>End</th>
-            <th>Delta</th>
-        </tr>
-        </thead>
-
-        <tbody>
-
-        {% for task_result in logs %}
-            <tr>
-                <td>{{ task_result.server_name }}</td>
-                <td>#{{ task_result.website_id }}</td>
-                <td>{{ task_result.status_code }}</td>
-                <td>{{ task_result.file_count }}</td>
-                <td>{{ task_result.start_time | int | datetime_format }}</td>
-                <td>{{ task_result.end_time | int | datetime_format }}</td>
-                <td>{{ ((task_result.end_time - task_result.start_time)) | int }} sec</td>
-            </tr>
-        {% endfor %}
-        </tbody>
-    </table>
-
-{% endblock body %}
diff --git a/templates/dashboard.html b/templates/dashboard.html
index c446368..1d1c2ff 100644
--- a/templates/dashboard.html
+++ b/templates/dashboard.html
@@ -7,9 +7,6 @@
         <div class="card-header">Dashboard</div>
         <div class="card-body">
 
-            <a href="/logs">Logs</a>
-            <br>
-
             <h5>API Keys</h5>
 
diff --git a/templates/stats.html b/templates/stats.html
index 73139dd..7ee2f6e 100644
--- a/templates/stats.html
+++ b/templates/stats.html
@@ -69,33 +69,6 @@
         </div>
     </div>
 
-    <div class="card">
-        <div class="card-header">Crawl server stats</div>
-        <div class="card-body">
-            <table class="table">
-                <thead>
-                <tr>
-                    <th>Server</th>
-                    <th>Tasks done</th>
-                    <th>Crawl time</th>
-                    <th>Crawl time avg.</th>
-                    <th>Files crawled</th>
-                    <th>Files crawled avg.</th>
-                </tr>
-                </thead>
-                <tbody>
-                {% for entry in crawl_server_stats %}
-                    {% set server, info = entry %}
-                    <tr>
-                        <td>{{ server }}</td>
-                        <td>{{ info.task_count }}</td>
-                        <td>{{ info.time | duration_format() }}</td>
-                        <td>{{ info.time_avg | duration_format() }}</td>
-                        <td>{{ info.file_count }}</td>
-                        <td>{{ "%.2f" % info.file_count_avg }}</td>
-                    </tr>
-                {% endfor %}
-                </tbody>
-            </table>
-        </div>
-    </div>
diff --git a/templates/submit.html b/templates/submit.html
index 55a97da..afc8697 100644
--- a/templates/submit.html
+++ b/templates/submit.html
@@ -62,34 +62,5 @@
             </div>
         </div>
     </div>
 
-    <br>
-
-    <div class="card">
-        <div class="card-header">Queued websites</div>
-        <div class="card-body">
-            <table class="table">
-                <thead>
-                <tr>
-                    <th>Url</th>
-                    <th>Priority</th>
-                    <th>Task type</th>
-                </tr>
-                </thead>
-                <tbody>
-                {% for task in queue %}
-                    <tr>
-                        <td>{{ task.url | truncate(70) }}</td>
-                        <td>{{ task.priority }}</td>
-                        <td>{{ task.callback_type if task.callback_type else "NORMAL" }}</td>
-                    </tr>
-                {% endfor %}
-                </tbody>
-            </table>
-        </div>
-    </div>
-
 {% endblock body %}
diff --git a/test/__init__.py b/test/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/test/files/apache_table.html b/test/files/apache_table.html
deleted file mode 100644
index 06e21c9..0000000
--- a/test/files/apache_table.html
+++ /dev/null
@@ -1,21 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
-<html>
- <head>
-  <title>Index of /Public/bootstrap</title>
- </head>
- <body>
-<h1>Index of /Public/bootstrap</h1>
-  <table>
-   <tr><th valign="top"><img src="/icons/blank.gif" alt="[ICO]"></th><th><a href="?C=N;O=D">Name</a></th><th><a href="?C=M;O=A">Last modified</a></th><th><a href="?C=S;O=A">Size</a></th><th><a href="?C=D;O=A">Description</a></th></tr>
-   <tr><th colspan="5"><hr></th></tr>
-   <tr><td valign="top"><img src="/icons/back.gif" alt="[PARENTDIR]"></td><td><a href="/Public/">Parent Directory</a></td><td>&nbsp;</td><td align="right">  - </td><td>&nbsp;</td></tr>
-   <tr><td valign="top"><img src="/icons/unknown.gif" alt="[   ]"></td><td><a href="bower.json">bower.json</a></td><td align="right">2017-04-05 01:45  </td><td align="right">1.0K</td><td>&nbsp;</td></tr>
-   <tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="css/">css/</a></td><td align="right">2017-09-07 18:03  </td><td align="right">  - </td><td>&nbsp;</td></tr>
-   <tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="image/">image/</a></td><td align="right">2017-09-07 18:03  </td><td align="right">  - </td><td>&nbsp;</td></tr>
-   <tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="js/">js/</a></td><td align="right">2017-09-07 18:03  </td><td align="right">  - </td><td>&nbsp;</td></tr>
-   <tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="less/">less/</a></td><td align="right">2017-09-07 18:03  </td><td align="right">  - </td><td>&nbsp;</td></tr>
-   <tr><td valign="top"><img src="/icons/unknown.gif" alt="[   ]"></td><td><a href="package.json">package.json</a></td><td align="right">2017-04-05 01:45  </td><td align="right"> 666 </td><td>&nbsp;</td></tr>
-   <tr><th colspan="5"><hr></th></tr>
-</table>
-</body></html>
diff --git a/test/files/lighttpd_table.html b/test/files/lighttpd_table.html
deleted file mode 100644
index fe58d9f..0000000
--- a/test/files/lighttpd_table.html
+++ /dev/null
@@ -1,47 +0,0 @@
-<?xml version="1.0" encoding="iso-8859-1"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
-<head>
-<title>Index of /gentoo/releases/</title>
-<style type="text/css">
-a, a:active {text-decoration: none; color: blue;}
-a:visited {color: #48468F;}
-a:hover, a:focus {text-decoration: underline; color: red;}
-body {background-color: #F5F5F5;}
-h2 {margin-bottom: 12px;}
-table {margin-left: 12px;}
-th, td { font: 90% monospace; text-align: left;}
-th { font-weight: bold; padding-right: 14px; padding-bottom: 3px;}
-td {padding-right: 14px;}
-td.s, th.s {text-align: right;}
-div.list { background-color: white; border-top: 1px solid #646464; border-bottom: 1px solid #646464; padding-top: 10px; padding-bottom: 14px;}
-div.foot { font: 90% monospace; color: #787878; padding-top: 4px;}
-</style>
-</head>
-<body>
-<h2>Index of /gentoo/releases/</h2>
-<div class="list">
-<table summary="Directory Listing" cellpadding="0" cellspacing="0">
-<thead><tr><th class="n">Name</th><th class="m">Last Modified</th><th class="s">Size</th><th class="t">Type</th></tr></thead>
-<tbody>
-<tr><td class="n"><a href="../">Parent Directory</a>/</td><td class="m">&nbsp;</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="alpha/">alpha</a>/</td><td class="m">2009-Aug-09 03:47:09</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="amd64/">amd64</a>/</td><td class="m">2017-Feb-09 18:50:44</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="arm/">arm</a>/</td><td class="m">2014-Apr-29 13:42:06</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="hppa/">hppa</a>/</td><td class="m">2014-Apr-29 13:42:12</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="ia64/">ia64</a>/</td><td class="m">2009-Aug-09 03:47:09</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="mips/">mips</a>/</td><td class="m">2011-Apr-28 23:38:14</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="ppc/">ppc</a>/</td><td class="m">2014-Apr-29 13:41:00</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="s390/">s390</a>/</td><td class="m">2014-Apr-29 13:41:06</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="sh/">sh</a>/</td><td class="m">2014-Apr-29 13:41:16</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="snapshots/">snapshots</a>/</td><td class="m">2009-Apr-16 05:08:17</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="sparc/">sparc</a>/</td><td class="m">2009-Aug-09 03:47:09</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="x86/">x86</a>/</td><td class="m">2016-Jul-04 21:14:19</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="README">README</a></td><td class="m">2014-Jun-22 05:18:43</td><td class="s">0.1K</td><td class="t">application/octet-stream</td></tr>
-<tr><td class="n"><a href="verify-digests.sh">verify-digests.sh</a></td><td class="m">2016-Jun-10 02:40:33</td><td class="s">4.5K</td><td class="t">application/octet-stream</td></tr>
-</tbody>
-</table>
-</div>
-<div class="foot">lighttpd/1.4.29</div>
-</body>
-</html>
diff --git a/test/files/nginx_pre.html b/test/files/nginx_pre.html
deleted file mode 100644
index 5bbd35e..0000000
--- a/test/files/nginx_pre.html
+++ /dev/null
@@ -1,11 +0,0 @@
-<html>
-<head><title>Index of /test/To process/Android nak newer/</title></head>
-<body bgcolor="white">
-<h1>Index of /test/To process/Android nak newer/</h1><hr><pre><a href="../">../</a>
-<a href="DCIM/">DCIM/</a>                                              31-Jul-2018 00:26                   -
-<a href="Pictures/">Pictures/</a>                                          31-Jul-2018 00:26                   -
-<a href="1529682937580.webm">1529682937580.webm</a>                                 25-Jun-2018 03:58             3768511
-<a href="1529716051300.webm">1529716051300.webm</a>                                 25-Jun-2018 04:01             3181867
-<a href="1529725898345.webm">1529725898345.webm</a>                                 25-Jun-2018 04:05             4138908
-</pre><hr></body>
-</html>
diff --git a/test/webserver.py b/test/webserver.py
deleted file mode 100644
index a3a1c14..0000000
--- a/test/webserver.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from flask import Flask, send_file
-
-app = Flask(__name__)
-
-
-@app.route("/test1/")
-def test1():
-    return send_file("files/apache_table.html")
-
-
-if __name__ == '__main__':
-    app.run("0.0.0.0", port=8888, threaded=True)
-
diff --git a/views.py b/views.py
index 58be38d..02e003e 100644
--- a/views.py
+++ b/views.py
@@ -48,8 +48,7 @@ def setup_views(app):
     @app.route("/stats")
     @cache.cached(120)
     def stats_page():
-        crawl_server_stats = db.get_stats_by_crawler()
-        return render_template("stats.html", crawl_server_stats=crawl_server_stats)
+        return render_template("stats.html")
 
     @app.route("/stats/json_chart")
     @cache.cached(240)
@@ -254,9 +253,7 @@ def setup_views(app):
 
     @app.route("/submit")
     def submit():
-        queued_websites = taskManager.get_queued_tasks()[:30]
-        return render_template("submit.html", queue=queued_websites, captcha=captcha,
-                               show_captcha=config.CAPTCHA_SUBMIT)
+        return render_template("submit.html", captcha=captcha, show_captcha=config.CAPTCHA_SUBMIT)
 
     def try_enqueue(url):
         url = os.path.join(url, "")
@@ -412,11 +409,3 @@ def setup_views(app):
         db.delete_token(token)
         flash("Deleted API token", "success")
         return redirect("/dashboard")
-
-    # TODO: pages scrolling
-    @app.route("/logs", methods=["GET"])
-    def admin_crawl_logs():
-        require_role("admin")
-        results = db.get_crawl_logs()
-
-        return render_template("crawl_logs.html", logs=results)