diff --git a/api.py b/api.py
index 4498dd4..6e0f18f 100644
--- a/api.py
+++ b/api.py
@@ -16,50 +16,9 @@ uploadLock = Lock()
 def setup_api(app):
-    @app.route("/api/task/get", methods=["POST"])
-    def api_get_task():
-        token = request.form.get("token")
-        name = oddb.db.check_api_token(token)
-        accept_ftp = request.form.get("accept") == "ftp" if "accept" in request.form else False
-
-        if name:
-            task = oddb.db.pop_task(name, accept_ftp)
-            oddb.logger.debug("API get task from " + name)
-
-            if task:
-                oddb.logger.info("Assigning task " + str(task.to_json()) + " to " + name)
-            else:
-                oddb.logger.info("No queued tasks, creating a new one")
-
-                try:
-                    task = oddb.db.make_task_for_oldest(name)
-                except:
-                    oddb.logger.error("Couldn't create new task")
-                    abort(404)
-
-            return Response(str(task), mimetype="application/json")
-        else:
-            return abort(403)
-
-    @app.route("/api/task/cancel", methods=["POST"])
-    def api_cancel_task():
-        token = request.form.get("token")
-        name = oddb.db.check_api_token(token)
-
-        if name:
-            website_id = request.form.get("website_id") if "website_id" in request.form else None
-            if website_id:
-                oddb.logger.debug("API task cancel for " + str(website_id) + " by " + name)
-                oddb.db.delete_task(website_id)
-                return Response("cancelled task")
-            else:
-                abort(400)
-
-        else:
-            abort(403)
-
     @app.route("/api/task/complete", methods=["POST"])
     def api_complete_task():
+        # TODO: task_tracker
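+        # (crawl results are expected to be reported to the external
+        # task_tracker service once the migration is complete)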
         token = request.form.get("token")
         name = oddb.db.check_api_token(token)
@@ -201,6 +160,7 @@ def setup_api(app):
         if name:
             url = request.form.get("url")
+            # TODO: task_tracker
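+            # (enqueueing is likewise expected to be handled by the external
+            # task_tracker service)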
             message, result = oddb.try_enqueue(url)
             oddb.logger.info("API try enqueue '" + url + "' by " + name + " (" + message + ")")
diff --git a/database.py b/database.py
index 71079c3..711fc7a 100644
--- a/database.py
+++ b/database.py
@@ -1,11 +1,10 @@
-import sqlite3
 import json
-import datetime
-from urllib.parse import urlparse
 import os
-import bcrypt
+import sqlite3
 import uuid
-import tasks
+from urllib.parse import urlparse
+
+import bcrypt
 
 
 class BlacklistedWebsite:
@@ -155,6 +154,7 @@ class Database:
     def make_task_for_oldest(self, assigned_crawler):
+        # TODO: task_tracker
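+        # (note: this INSERT still targets the Queue table, which this commit
+        # removes from init_script.sql; queue management is expected to move
+        # to the external task_tracker service)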
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
             cursor.execute("INSERT INTO QUEUE (website_id, url, assigned_crawler) SELECT Website.id, Website.url, ? FROM Website WHERE Website.id not in (SELECT website_id FROM Queue) "
@@ -326,47 +326,6 @@ class Database:
cursor.execute("SELECT * FROM BlacklistedWebsite")
return [BlacklistedWebsite(r[0], r[1]) for r in cursor.fetchall()]
-    def log_result(self, result):
-
-        with sqlite3.connect(self.db_path) as conn:
-
-            cursor = conn.cursor()
-
-            cursor.execute("INSERT INTO TaskResult "
-                           "(server, website_id, status_code, file_count, start_time, end_time) "
-                           "VALUES (?,?,?,?,?,?)",
-                           (result.server_id, result.website_id, result.status_code,
-                            result.file_count, result.start_time, result.end_time))
-            conn.commit()
-
-    def get_crawl_logs(self):
-
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-
-            cursor.execute("SELECT website_id, status_code, file_count, start_time, end_time, server "
-                           "FROM TaskResult ORDER BY end_time DESC")
-            return [tasks.TaskResult(r[1], r[2], r[3], r[4], r[0], str(r[5])) for r in cursor.fetchall()]
-
-    def get_stats_by_crawler(self):
-        stats = []
-        task_results = self.get_crawl_logs()
-
-        for crawler in self.get_tokens():
-            task_count = sum(1 for result in task_results if result.server_name == crawler.name)
-            if task_count > 0:
-                info = dict()
-                info["file_count"] = sum(result.file_count for result in task_results if result.server_name == crawler.name)
-                info["time"] = sum((result.end_time - result.start_time) for result in task_results if result.server_name == crawler.name)
-                info["task_count"] = task_count
-                info["time_avg"] = info["time"] / task_count
-                info["file_count_avg"] = info["file_count"] / task_count
-                stats.append((crawler.name, info))
-
-        stats.sort(key=lambda t: t[1]["file_count"], reverse=True)
-
-        return stats
-
     def log_search(self, remote_addr, forwarded_for, q, exts, page, blocked, results, took):
         with sqlite3.connect(self.db_path) as conn:
@@ -376,71 +335,3 @@ class Database:
"VALUES (?,?,?,?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page, blocked, results, took))
conn.commit()
-
-    def put_task(self, task: Task, assigned_crawler=None) -> None:
-
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-
-            cursor.execute("INSERT INTO Queue (website_id, url, priority, callback_type, callback_args, assigned_crawler) "
-                           "VALUES (?,?,?,?,?,?)",
-                           (task.website_id, task.url, task.priority,
-                            task.callback_type, json.dumps(task.callback_args), assigned_crawler))
-            conn.commit()
-
-    def get_tasks(self) -> list:
-
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-
-            cursor.execute("SELECT website_id, url, priority, callback_type, callback_args FROM Queue "
-                           "WHERE assigned_crawler is NULL ")
-            db_tasks = cursor.fetchall()
-
-            return [Task(t[0], t[1], t[2], t[3], t[4]) for t in db_tasks]
-
-    def pop_task(self, name, ftp: bool) -> Task:
-
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-
-            cursor.execute("SELECT id, website_id, url, priority, callback_type, callback_args " +
-                           "FROM Queue WHERE assigned_crawler is NULL " +
-                           ("AND url LIKE 'ftp%' " if ftp else "AND url LIKE 'http%' ") +
-                           "ORDER BY priority DESC, Queue.id " +
-                           "ASC LIMIT 1")
-            task = cursor.fetchone()
-
-            if task:
-                cursor.execute("UPDATE Queue SET assigned_crawler=? WHERE id=?", (name, task[0],))
-                conn.commit()
-                return Task(task[1], task[2], task[3], task[4], task[5])
-            else:
-                return None
-
-    def delete_task(self, website_id):
-
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-            cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id, ))
-
-    def complete_task(self, website_id: int, name: str) -> Task:
-
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-
-            cursor.execute("SELECT id, website_id, url, priority, callback_type, callback_args FROM "
-                           "Queue WHERE website_id=?", (website_id, ))
-
-            task = cursor.fetchone()
-
-            if task:
-                cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id, ))
-                conn.commit()
-                return Task(task[1], task[2], task[3], task[4], task[5])
-            else:
-                return None
-
-
-
-
diff --git a/debug_put.py b/debug_put.py
deleted file mode 100644
index 8a2f7eb..0000000
--- a/debug_put.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import requests
-import json
-
-
-payload = json.dumps({
-    "token": "4eafc6ed-74b7-4f04-9d34-7f3e01201003",
-    "website_id": 3,
-    "url": "http://localhost:8000/",
-    "priority": 2,
-    "callback_type": "",
-    "callback_args": "{}"
-})
-
-r = requests.post("http://localhost/api/task/enqueue",
-                  headers={"Content-Type": "application/json"},
-                  data=payload)
-print(r)
-print(r.text)
diff --git a/init_script.sql b/init_script.sql
index 2774e08..c5de4a9 100644
--- a/init_script.sql
+++ b/init_script.sql
@@ -20,19 +20,6 @@ CREATE TABLE BlacklistedWebsite (
     url TEXT
 );
 
-CREATE TABLE TaskResult (
-    id INTEGER PRIMARY KEY,
-    server TEXT,
-    website_id INT,
-    status_code TEXT,
-    file_count INT,
-    start_time TIMESTAMP,
-    end_time TIMESTAMP,
-    indexed_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-
-    FOREIGN KEY (server) REFERENCES ApiClient(name)
-);
-
 CREATE TABLE ApiClient (
     name TEXT PRIMARY KEY NOT NULL,
     token TEXT NOT NULL
@@ -51,15 +38,3 @@ CREATE TABLE SearchLogEntry (
     results INT DEFAULT 0,
     took INT DEFAULT 0
 );
-
-CREATE TABLE Queue (
-    id INTEGER PRIMARY KEY,
-    website_id INTEGER,
-    url TEXT,
-    priority INTEGER,
-    callback_type TEXT,
-    callback_args TEXT,
-    assigned_crawler TEXT NULL DEFAULT NULL,
-
-    FOREIGN KEY (assigned_crawler) REFERENCES ApiClient(name)
-);
diff --git a/stress_test.py b/stress_test.py
deleted file mode 100644
index dec7f59..0000000
--- a/stress_test.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import os
-import json
-import shutil
-from search.search import ElasticSearchEngine
-from concurrent.futures import ThreadPoolExecutor
-import requests
-import random
-
-terms = requests.get("https://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain") \
-    .text.splitlines()
-exts = [
-    "zip", "exe", "mp3", "avi", "mp4", "rar", "7zip", "ogg", "m4a", "flac", "doc", "docx", "aac", "xls",
-    "cab", "txt", "c", "java", "class", "jar", "py", "cpp", "h", "png", "jpg", "jpeg", "ttf", "torrent",
-    "part", "blend", "3ds", "obj", "ico", "html", "css", "js", "ts", "ape", "asm", "nasm", "fasm", "o",
-    "so", "dll", "tar", "gz", "bin", "cad", "cmd", "bat", "sh", "md"
-]
-
-
-def dump_local_filesystem(root_dir: str):
-
-    docs = []
-
-    for root, dirs, files in os.walk(root_dir):
-
-        for filename in files:
-            full_path = os.path.join(root, filename)
-            stats = os.stat(full_path)
-
-            doc = dict()
-            doc["name"] = filename
-            doc["path"] = root
-            doc["mtime"] = stats.st_mtime
-            doc["size"] = stats.st_size
-
-            docs.append(doc)
-
-    with open("local_filesystem.json", "w") as f:
-        f.writelines(json.dumps(doc) + "\n" for doc in docs)
-
-
-def random_path():
-    return "/".join(random.choices(terms, k=random.randint(1, 5)))
-
-
-def random_file_name():
-    return random.choice(["_", " ", "-", ".", "#", ""]).\
-        join(random.choices(terms, k=random.randint(1, 3))) + "." + random.choice(exts)
-
-
-def get_random_file():
-
-    doc = dict()
-    doc["name"] = random_file_name()
-    doc["path"] = random_path()
-    doc["mtime"] = random.randint(0, 1000000000000)
-    doc["size"] = random.randint(-1, 1000000000)
-
-    return doc
-
-
-def dump_random_files(count=10):
-    with open("random_dump.json", "w") as f:
-        f.writelines(json.dumps(get_random_file()) + "\n" for _ in range(count))
-
-
-def index_file_list(path: str, website_id):
-
-    es = ElasticSearchEngine("od-database")
-    with open(path, "r") as f:
-        es.import_json(f.readlines(), website_id)
-
-
-def search(term=""):
-    requests.get("http://localhost/search?q=" + term, verify=False)
-    print(term)
-
-
-def random_searches(count=10000000, max_workers=1000):
-
-    pool = ThreadPoolExecutor(max_workers=max_workers)
-    pool.map(search, random.choices(terms, k=count))
-
-
-def make_wide_filesystem(count=100000):
-
-    shutil.rmtree("stress_test")
-    os.mkdir("stress_test")
-    for _ in range(count):
-        new_path = "stress_test/" + random.choice(terms)
-        if not os.path.exists(new_path):
-            os.mkdir(new_path)
-
-
-# dump_local_filesystem("/mnt/")
-# index_file_list("local_filesystem.json", 4)
-# random_searches(100000)
-# dump_random_files(20000 * 100000)
-# make_wide_filesystem(10000)
diff --git a/tasks.py b/tasks.py
index 3b221fa..a15ac28 100644
--- a/tasks.py
+++ b/tasks.py
@@ -90,5 +90,3 @@ class TaskManager:
         self.db.put_task(task)
         print("Queued task and made it available to crawlers: " + str(task.website_id))
 
-    def get_queued_tasks(self) -> list:
-        return self.db.get_tasks()
diff --git a/templates/crawl_logs.html b/templates/crawl_logs.html
deleted file mode 100644
index 6422df4..0000000
--- a/templates/crawl_logs.html
+++ /dev/null
@@ -1,36 +0,0 @@
-{% extends "layout.html" %}
-{% set title = "Crawl logs - OD-Database" %}
-
-{% block body %}
-
-
-
-
-
- Crawler |
- Website |
- Status code |
- File count |
- Start |
- End |
- Delta |
-
-
-
-
- {% for task_result in logs %}
-
- {{ task_result.server_name }} |
- #{{ task_result.website_id }} |
- {{ task_result.status_code }} |
- {{ task_result.file_count }} |
- {{ task_result.start_time | int | datetime_format }} |
- {{ task_result.end_time | int | datetime_format }} |
- {{ ((task_result.end_time - task_result.start_time)) | int }} sec |
-
- {% endfor %}
-
-
-
-
-{% endblock body %}
diff --git a/templates/dashboard.html b/templates/dashboard.html
index c446368..1d1c2ff 100644
--- a/templates/dashboard.html
+++ b/templates/dashboard.html
@@ -7,9 +7,6 @@
-        <a href="/logs">Logs</a>
-        <br>
-
         <h3>API Keys</h3>
diff --git a/templates/stats.html b/templates/stats.html
index 73139dd..7ee2f6e 100644
--- a/templates/stats.html
+++ b/templates/stats.html
@@ -69,33 +69,6 @@
-
-    <h3>Crawl server stats</h3>
-    <table class="table">
-        <thead>
-        <tr>
-            <th>Server</th>
-            <th>Tasks done</th>
-            <th>Crawl time</th>
-            <th>Crawl time avg.</th>
-            <th>Files crawled</th>
-            <th>Files crawled avg.</th>
-        </tr>
-        </thead>
-        <tbody>
-        {% for entry in crawl_server_stats %}
-            {% set server, info = entry %}
-            <tr>
-                <td>{{ server }}</td>
-                <td>{{ info.task_count }}</td>
-                <td>{{ info.time | duration_format() }}</td>
-                <td>{{ info.time_avg | duration_format() }}</td>
-                <td>{{ info.file_count }}</td>
-                <td>{{ "%.2f" % info.file_count_avg }}</td>
-            </tr>
-        {% endfor %}
-        </tbody>
-    </table>
-
diff --git a/templates/submit.html b/templates/submit.html
index 55a97da..afc8697 100644
--- a/templates/submit.html
+++ b/templates/submit.html
@@ -62,34 +62,5 @@
-
-    <br>
-
-    <table class="table">
-        <thead>
-        <tr>
-            <th>Url</th>
-            <th>Priority</th>
-            <th>Task type</th>
-        </tr>
-        </thead>
-
-        <tbody>
-        {% for task in queue %}
-            <tr>
-                <td>{{ task.url | truncate(70) }}</td>
-                <td>{{ task.priority }}</td>
-                <td>{{ task.callback_type if task.callback_type else "NORMAL" }}</td>
-            </tr>
-        {% endfor %}
-        </tbody>
-    </table>
-
-
 {% endblock body %}
diff --git a/test/__init__.py b/test/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/test/files/apache_table.html b/test/files/apache_table.html
deleted file mode 100644
index 06e21c9..0000000
--- a/test/files/apache_table.html
+++ /dev/null
@@ -1,21 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
-<html>
- <head>
-  <title>Index of /Public/bootstrap</title>
- </head>
- <body>
-<h1>Index of /Public/bootstrap</h1>
-  <table>
-  </table>
-</body></html>
diff --git a/test/files/lighttpd_table.html b/test/files/lighttpd_table.html
deleted file mode 100644
index fe58d9f..0000000
--- a/test/files/lighttpd_table.html
+++ /dev/null
@@ -1,47 +0,0 @@
-<?xml version="1.0" encoding="iso-8859-1"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
-<head>
-<title>Index of /gentoo/releases/</title>
-</head>
-<body>
-<h2>Index of /gentoo/releases/</h2>
-<div class="list">
-<table summary="Directory Listing" cellpadding="0" cellspacing="0">
-<thead><tr><th class="n">Name</th><th class="m">Last Modified</th><th class="s">Size</th><th class="t">Type</th></tr></thead>
-<tbody>
-<tr><td class="n"><a href="../">Parent Directory</a>/</td><td class="m">&nbsp;</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="alpha/">alpha</a>/</td><td class="m">2009-Aug-09 03:47:09</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="amd64/">amd64</a>/</td><td class="m">2017-Feb-09 18:50:44</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="arm/">arm</a>/</td><td class="m">2014-Apr-29 13:42:06</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="hppa/">hppa</a>/</td><td class="m">2014-Apr-29 13:42:12</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="ia64/">ia64</a>/</td><td class="m">2009-Aug-09 03:47:09</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="mips/">mips</a>/</td><td class="m">2011-Apr-28 23:38:14</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="ppc/">ppc</a>/</td><td class="m">2014-Apr-29 13:41:00</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="s390/">s390</a>/</td><td class="m">2014-Apr-29 13:41:06</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="sh/">sh</a>/</td><td class="m">2014-Apr-29 13:41:16</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="snapshots/">snapshots</a>/</td><td class="m">2009-Apr-16 05:08:17</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="sparc/">sparc</a>/</td><td class="m">2009-Aug-09 03:47:09</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="x86/">x86</a>/</td><td class="m">2016-Jul-04 21:14:19</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
-<tr><td class="n"><a href="README">README</a></td><td class="m">2014-Jun-22 05:18:43</td><td class="s">0.1K</td><td class="t">application/octet-stream</td></tr>
-<tr><td class="n"><a href="verify-digests.sh">verify-digests.sh</a></td><td class="m">2016-Jun-10 02:40:33</td><td class="s">4.5K</td><td class="t">application/octet-stream</td></tr>
-</tbody>
-</table>
-</div>
-</body>
-</html>
diff --git a/test/files/nginx_pre.html b/test/files/nginx_pre.html
deleted file mode 100644
index 5bbd35e..0000000
--- a/test/files/nginx_pre.html
+++ /dev/null
@@ -1,11 +0,0 @@
-<html>
-<head><title>Index of /test/To process/Android nak newer/</title></head>
-<body bgcolor="white">
-<h1>Index of /test/To process/Android nak newer/</h1><hr><pre><a href="../">../</a>
-<a href="DCIM/">DCIM/</a>                                              31-Jul-2018 00:26                   -
-<a href="Pictures/">Pictures/</a>                                          31-Jul-2018 00:26                   -
-<a href="1529682937580.webm">1529682937580.webm</a>                                 25-Jun-2018 03:58             3768511
-<a href="1529716051300.webm">1529716051300.webm</a>                                 25-Jun-2018 04:01             3181867
-<a href="1529725898345.webm">1529725898345.webm</a>                                 25-Jun-2018 04:05             4138908
-</pre><hr></body>
-</html>
diff --git a/test/webserver.py b/test/webserver.py
deleted file mode 100644
index a3a1c14..0000000
--- a/test/webserver.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from flask import Flask, send_file
-
-app = Flask(__name__)
-
-
-@app.route("/test1/")
-def test1():
-    return send_file("files/apache_table.html")
-
-
-if __name__ == '__main__':
-    app.run("0.0.0.0", port=8888, threaded=True)
-
diff --git a/views.py b/views.py
index 58be38d..02e003e 100644
--- a/views.py
+++ b/views.py
@@ -48,8 +48,7 @@ def setup_views(app):
@app.route("/stats")
@cache.cached(120)
def stats_page():
- crawl_server_stats = db.get_stats_by_crawler()
- return render_template("stats.html", crawl_server_stats=crawl_server_stats)
+ return render_template("stats.html")
@app.route("/stats/json_chart")
@cache.cached(240)
@@ -254,9 +253,7 @@ def setup_views(app):
@app.route("/submit")
def submit():
- queued_websites = taskManager.get_queued_tasks()[:30]
- return render_template("submit.html", queue=queued_websites, captcha=captcha,
- show_captcha=config.CAPTCHA_SUBMIT)
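+        # (the queued-websites preview was removed along with the local task queue)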
+ return render_template("submit.html", captcha=captcha, show_captcha=config.CAPTCHA_SUBMIT)
 
     def try_enqueue(url):
         url = os.path.join(url, "")
@@ -412,11 +409,3 @@ def setup_views(app):
         db.delete_token(token)
         flash("Deleted API token", "success")
         return redirect("/dashboard")
-
-    # TODO: pages scrolling
-    @app.route("/logs", methods=["GET"])
-    def admin_crawl_logs():
-        require_role("admin")
-        results = db.get_crawl_logs()
-
-        return render_template("crawl_logs.html", logs=results)