Remove task tracking

This commit is contained in:
simon987 2019-03-09 13:26:05 -05:00
parent 6000e46ad7
commit 00e3fd7340
16 changed files with 9 additions and 499 deletions

44
api.py
View File

@ -16,50 +16,9 @@ uploadLock = Lock()
def setup_api(app):
@app.route("/api/task/get", methods=["POST"])
def api_get_task():
    """Hand a crawl task to the crawler identified by the posted API token.

    Form fields read: ``token`` and, optionally, ``accept`` (the value
    ``"ftp"`` requests an FTP task).

    Responds with ``str(task)`` served as ``application/json``, aborts with
    403 when ``check_api_token`` yields no name, and aborts with 404 when no
    task is queued and creating one fails.
    """
    token = request.form.get("token")
    name = oddb.db.check_api_token(token)
    # A missing "accept" field yields None, which already compares unequal
    # to "ftp", so no separate membership test is required.
    accept_ftp = request.form.get("accept") == "ftp"

    if not name:
        return abort(403)

    task = oddb.db.pop_task(name, accept_ftp)
    oddb.logger.debug("API get task from " + name)

    if task:
        oddb.logger.info("Assigning task " + str(task.to_json()) + " to " + name)
    else:
        oddb.logger.info("No queued tasks, creating a new one")
        try:
            task = oddb.db.make_task_for_oldest(name)
        except Exception:
            # Narrowed from a bare ``except:`` so that SystemExit and
            # KeyboardInterrupt are no longer turned into a 404.
            oddb.logger.error("Couldn't create new task")
            abort(404)

    return Response(str(task), mimetype="application/json")
@app.route("/api/task/cancel", methods=["POST"])
def api_cancel_task():
    """Delete the queued task of a website on behalf of an authenticated crawler.

    Form fields read: ``token`` and ``website_id``. Aborts with 403 when the
    token does not resolve to a name and with 400 when ``website_id`` is
    missing or empty.
    """
    token = request.form.get("token")
    name = oddb.db.check_api_token(token)
    if not name:
        abort(403)

    website_id = request.form.get("website_id")
    if not website_id:
        abort(400)

    oddb.logger.debug("API task cancel for " + str(website_id) + " by " + name)
    oddb.db.delete_task(website_id)
    return Response("cancelled task")
@app.route("/api/task/complete", methods=["POST"])
def api_complete_task():
# TODO: task_tracker
token = request.form.get("token")
name = oddb.db.check_api_token(token)
@ -201,6 +160,7 @@ def setup_api(app):
if name:
url = request.form.get("url")
# TODO: task_tracker
message, result = oddb.try_enqueue(url)
oddb.logger.info("API try enqueue '" + url + "' by " + name + " (" + message + ")")

View File

@ -1,11 +1,10 @@
import sqlite3
import json
import datetime
from urllib.parse import urlparse
import os
import bcrypt
import sqlite3
import uuid
import tasks
from urllib.parse import urlparse
import bcrypt
class BlacklistedWebsite:
@ -155,6 +154,7 @@ class Database:
def make_task_for_oldest(self, assigned_crawler):
# TODO: task_tracker
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
cursor.execute("INSERT INTO QUEUE (website_id, url, assigned_crawler) SELECT Website.id, Website.url, ? FROM Website WHERE Website.id not in (SELECT website_id FROM Queue) "
@ -326,47 +326,6 @@ class Database:
cursor.execute("SELECT * FROM BlacklistedWebsite")
return [BlacklistedWebsite(r[0], r[1]) for r in cursor.fetchall()]
def log_result(self, result):
    """Insert one crawl result row into the ``TaskResult`` table.

    :param result: object exposing ``server_id``, ``website_id``,
        ``status_code``, ``file_count``, ``start_time`` and ``end_time``
    """
    conn = sqlite3.connect(self.db_path)
    try:
        # ``with conn`` only scopes the transaction (commit on success,
        # rollback on error); the connection is closed in ``finally``.
        with conn:
            cursor = conn.cursor()
            cursor.execute("INSERT INTO TaskResult "
                           "(server, website_id, status_code, file_count, start_time, end_time) "
                           "VALUES (?,?,?,?,?,?)",
                           (result.server_id, result.website_id, result.status_code,
                            result.file_count, result.start_time, result.end_time))
    finally:
        conn.close()
def get_crawl_logs(self):
    """Return every ``TaskResult`` row as a ``tasks.TaskResult``, newest ``end_time`` first.

    Each row is passed to the constructor in the order status_code,
    file_count, start_time, end_time, website_id, str(server).
    """
    conn = sqlite3.connect(self.db_path)
    try:
        # ``with conn`` does not close the connection; ``finally`` does.
        with conn:
            cursor = conn.cursor()
            cursor.execute("SELECT website_id, status_code, file_count, start_time, end_time, server "
                           "FROM TaskResult ORDER BY end_time DESC")
            return [tasks.TaskResult(r[1], r[2], r[3], r[4], r[0], str(r[5])) for r in cursor.fetchall()]
    finally:
        conn.close()
def get_stats_by_crawler(self):
    """Aggregate crawl results per API client.

    Returns a list of ``(crawler_name, info)`` tuples, one per entry of
    ``self.get_tokens()`` that has at least one logged result, sorted by
    total file count (highest first). ``info`` holds ``file_count``,
    ``time``, ``task_count``, ``time_avg`` and ``file_count_avg``.
    """
    # Single pass over the logs instead of re-scanning the whole list
    # several times for every crawler.
    totals = {}
    for result in self.get_crawl_logs():
        entry = totals.setdefault(result.server_name,
                                  {"file_count": 0, "time": 0, "task_count": 0})
        entry["file_count"] += result.file_count
        entry["time"] += result.end_time - result.start_time
        entry["task_count"] += 1

    stats = []
    for crawler in self.get_tokens():
        aggregated = totals.get(crawler.name)
        if aggregated is None:
            # Results whose server has no token, and tokens without results,
            # are both left out of the report.
            continue
        info = dict(aggregated)
        info["time_avg"] = info["time"] / info["task_count"]
        info["file_count_avg"] = info["file_count"] / info["task_count"]
        stats.append((crawler.name, info))

    stats.sort(key=lambda t: t[1]["file_count"], reverse=True)
    return stats
def log_search(self, remote_addr, forwarded_for, q, exts, page, blocked, results, took):
with sqlite3.connect(self.db_path) as conn:
@ -376,71 +335,3 @@ class Database:
"VALUES (?,?,?,?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page, blocked, results, took))
conn.commit()
def put_task(self, task: Task, assigned_crawler=None) -> None:
    """Insert *task* into the ``Queue`` table.

    :param task: task whose ``website_id``, ``url``, ``priority``,
        ``callback_type`` and ``callback_args`` are stored;
        ``callback_args`` is serialized with ``json.dumps``
    :param assigned_crawler: value stored in the ``assigned_crawler`` column
        (``None`` by default)
    """
    conn = sqlite3.connect(self.db_path)
    try:
        # ``with conn`` commits or rolls back but leaves the connection open;
        # it is closed explicitly below.
        with conn:
            cursor = conn.cursor()
            cursor.execute("INSERT INTO Queue (website_id, url, priority, callback_type, callback_args, assigned_crawler) "
                           "VALUES (?,?,?,?,?,?)",
                           (task.website_id, task.url, task.priority,
                            task.callback_type, json.dumps(task.callback_args), assigned_crawler))
    finally:
        conn.close()
def get_tasks(self) -> list:
    """Return a ``Task`` for every ``Queue`` row whose ``assigned_crawler`` is NULL."""
    conn = sqlite3.connect(self.db_path)
    try:
        # ``with conn`` does not close the connection; ``finally`` does.
        with conn:
            cursor = conn.cursor()
            cursor.execute("SELECT website_id, url, priority, callback_type, callback_args FROM Queue "
                           "WHERE assigned_crawler is NULL ")
            db_tasks = cursor.fetchall()
    finally:
        conn.close()

    return [Task(t[0], t[1], t[2], t[3], t[4]) for t in db_tasks]
def pop_task(self, name, ftp: bool) -> Task:
    """Assign the best unassigned queue entry to crawler *name* and return it.

    Candidates are rows with a NULL ``assigned_crawler`` whose url starts
    with ``ftp`` (when *ftp* is true) or ``http`` (otherwise); the highest
    priority wins, ties going to the lowest ``Queue.id``.

    :return: the selected ``Task``, or ``None`` when no row matches
    """
    conn = sqlite3.connect(self.db_path)
    try:
        # ``with conn`` commits the UPDATE on exit but does not close the
        # connection; that happens in ``finally``.
        with conn:
            cursor = conn.cursor()
            # Only a fixed literal chosen by the boolean is concatenated into
            # the statement; no caller-supplied text reaches the SQL string.
            cursor.execute("SELECT id, website_id, url, priority, callback_type, callback_args " +
                           "FROM Queue WHERE assigned_crawler is NULL " +
                           ("AND url LIKE 'ftp%' " if ftp else "AND url LIKE 'http%' ") +
                           "ORDER BY priority DESC, Queue.id " +
                           "ASC LIMIT 1")
            task = cursor.fetchone()

            if not task:
                return None

            cursor.execute("UPDATE Queue SET assigned_crawler=? WHERE id=?", (name, task[0],))
            return Task(task[1], task[2], task[3], task[4], task[5])
    finally:
        conn.close()
def delete_task(self, website_id):
    """Delete every ``Queue`` row whose ``website_id`` equals *website_id*."""
    conn = sqlite3.connect(self.db_path)
    try:
        # Leaving ``with conn`` commits the DELETE; the connection itself is
        # closed in ``finally`` because the context manager does not do so.
        with conn:
            cursor = conn.cursor()
            cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id, ))
    finally:
        conn.close()
def complete_task(self, website_id: int, name: str) -> Task:
    """Remove the queue entries of *website_id* and return the first one as a ``Task``.

    :param website_id: website whose ``Queue`` rows are deleted
    :param name: accepted for interface compatibility; not used by this method
    :return: a ``Task`` built from the first matching row, or ``None`` when
        the website has no queued row
    """
    conn = sqlite3.connect(self.db_path)
    try:
        # ``with conn`` commits the DELETE on exit; closing is done in ``finally``.
        with conn:
            cursor = conn.cursor()
            cursor.execute("SELECT id, website_id, url, priority, callback_type, callback_args FROM "
                           "Queue WHERE website_id=?", (website_id, ))
            task = cursor.fetchone()

            if not task:
                return None

            cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id, ))
            return Task(task[1], task[2], task[3], task[4], task[5])
    finally:
        conn.close()

View File

@ -1,18 +0,0 @@
import requests
import json
# Manual smoke test: POST a JSON-encoded task description to the local
# /api/task/enqueue endpoint and print the response object and its body.
# NOTE(review): the token below is hard-coded; presumably a development-only
# credential -- confirm before reusing this script.
payload = json.dumps({
"token": "4eafc6ed-74b7-4f04-9d34-7f3e01201003",
"website_id": 3,
"url": "http://localhost:8000/",
"priority": 2,
"callback_type": "",
"callback_args": "{}"
})
# The body is sent as raw JSON text, hence the explicit Content-Type header.
r = requests.post("http://localhost/api/task/enqueue",
headers={"Content-Type": "application/json"},
data=payload)
print(r)
print(r.text)

View File

@ -20,19 +20,6 @@ CREATE TABLE BlacklistedWebsite (
url TEXT
);
-- One row per logged crawl result.
CREATE TABLE TaskResult (
id INTEGER PRIMARY KEY,
-- Name of the API client that produced the result (see the foreign key below).
server TEXT,
website_id INT,
status_code TEXT,
file_count INT,
start_time TIMESTAMP,
end_time TIMESTAMP,
-- Filled in automatically with the insertion time when not supplied.
indexed_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (server) REFERENCES ApiClient(name)
);
CREATE TABLE ApiClient (
name TEXT PRIMARY KEY NOT NULL,
token TEXT NOT NULL
@ -51,15 +38,3 @@ CREATE TABLE SearchLogEntry (
results INT DEFAULT 0,
took INT DEFAULT 0
);
-- Queue of crawl tasks.
CREATE TABLE Queue (
id INTEGER PRIMARY KEY,
website_id INTEGER,
url TEXT,
priority INTEGER,
callback_type TEXT,
callback_args TEXT,
-- NULL by default; otherwise the name of an ApiClient row (foreign key below).
assigned_crawler TEXT NULL DEFAULT NULL,
FOREIGN KEY (assigned_crawler) REFERENCES ApiClient(name)
);

View File

@ -1,98 +0,0 @@
import os
import json
import shutil
from search.search import ElasticSearchEngine
from concurrent.futures import ThreadPoolExecutor
import requests
import random
# Vocabulary for the random generators below: a word list downloaded over
# HTTP when this module is loaded, one term per line of the response body.
terms = requests.get("https://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain") \
.text.splitlines()
# File extensions that random_file_name() picks from.
exts = [
"zip", "exe", "mp3", "avi", "mp4", "rar", "7zip", "ogg", "m4a", "flac", "doc", "docx", "aac", "xls",
"cab", "txt", "c", "java", "class", "jar", "py", "cpp", "h", "png", "jpg", "jpeg", "ttf", "torrent",
"part", "blend", "3ds", "obj", "ico", "html", "css", "js", "ts", "ape", "asm", "nasm", "fasm", "o",
"so", "dll", "tar", "gz", "bin", "cad", "cmd", "bat", "sh", "md"
]
def dump_local_filesystem(root_dir: str):
    """Walk *root_dir* and write one JSON document per file to ``local_filesystem.json``.

    Each line of the output file is a JSON object with the keys ``name``
    (file name), ``path`` (directory containing it), ``mtime`` and ``size``.
    Entries that cannot be stat-ed are skipped.

    :param root_dir: directory tree to dump
    """
    docs = []

    for root, _dirs, files in os.walk(root_dir):
        for filename in files:
            full_path = os.path.join(root, filename)
            try:
                stats = os.stat(full_path)
            except OSError:
                # Dangling symlinks or files removed mid-walk would otherwise
                # abort the whole dump.
                continue

            docs.append({
                "name": filename,
                "path": root,
                "mtime": stats.st_mtime,
                "size": stats.st_size,
            })

    # Written only after the walk, so the dump file never lists itself
    # half-written when root_dir is the working directory.
    with open("local_filesystem.json", "w") as f:
        f.writelines(json.dumps(doc) + "\n" for doc in docs)
def random_path():
    """Build a slash-separated path from one to five random entries of ``terms``."""
    depth = random.randint(1, 5)
    segments = random.choices(terms, k=depth)
    return "/".join(segments)
def random_file_name():
    """Build a random file name: one to three terms joined by a random separator, plus an extension from ``exts``."""
    separator = random.choice(["_", " ", "-", ".", "#", ""])
    word_count = random.randint(1, 3)
    words = random.choices(terms, k=word_count)
    stem = separator.join(words)
    return stem + "." + random.choice(exts)
def get_random_file():
    """Return a file document (``name``, ``path``, ``mtime``, ``size``) filled with random values."""
    # Dict displays evaluate their values top to bottom, so the random
    # generators are consumed in the order name, path, mtime, size.
    return {
        "name": random_file_name(),
        "path": random_path(),
        "mtime": random.randint(0, 1000000000000),
        "size": random.randint(-1, 1000000000),
    }
def dump_random_files(count=10):
    """Write *count* random file documents to ``random_dump.json``, one JSON object per line."""
    with open("random_dump.json", "w") as out:
        for _ in range(count):
            line = json.dumps(get_random_file()) + "\n"
            out.write(line)
def index_file_list(path: str, website_id):
    """Feed the lines of the file at *path* to ``ElasticSearchEngine.import_json`` for *website_id*.

    The engine is created for the ``od-database`` index name.
    """
    engine = ElasticSearchEngine("od-database")
    with open(path, "r") as listing:
        lines = listing.readlines()
        engine.import_json(lines, website_id)
def search(term=""):
    """Send one GET request to the local ``/search`` endpoint for *term*, then print the term.

    :param term: search query, sent as the ``q`` query parameter
    """
    # Let requests build the query string so that terms containing
    # characters such as '&', '#', '+' or spaces are percent-encoded
    # instead of corrupting the URL.
    requests.get("http://localhost/search", params={"q": term}, verify=False)
    print(term)
def random_searches(count=10000000, max_workers=1000):
    """Run ``search`` for *count* random terms on a thread pool.

    Blocks until every submitted search has finished.

    :param count: number of random terms drawn from ``terms``
    :param max_workers: size of the thread pool
    """
    # The context manager shuts the pool down once all submitted work is
    # done, so its worker threads are not left behind.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Results are deliberately not consumed: as before, a failing
        # request does not propagate to the caller.
        pool.map(search, random.choices(terms, k=count))
def make_wide_filesystem(count=100000):
    """Recreate ``stress_test/`` and fill it with directories named after random terms.

    :param count: number of random terms drawn; repeated terms do not create
        a second directory, so the result may hold fewer than *count* entries
    """
    try:
        shutil.rmtree("stress_test")
    except FileNotFoundError:
        # First run: there is no previous tree to remove.
        pass
    os.mkdir("stress_test")

    for _ in range(count):
        new_path = "stress_test/" + random.choice(terms)
        if not os.path.exists(new_path):
            os.mkdir(new_path)
# dump_local_filesystem("/mnt/")
# index_file_list("local_filesystem.json", 4)
# random_searches(100000)
# dump_random_files(20000 * 100000)
# make_wide_filesystem(10000)

View File

@ -90,5 +90,3 @@ class TaskManager:
self.db.put_task(task)
print("Queued task and made it available to crawlers: " + str(task.website_id))
def get_queued_tasks(self) -> list:
    """Return the result of ``self.db.get_tasks()`` unchanged."""
    queued = self.db.get_tasks()
    return queued

View File

@ -1,36 +0,0 @@
{% extends "layout.html" %}
{% set title = "Crawl logs - OD-Database" %}
{% block body %}
<div class="container-fluid">
<table class="table table-striped">
<thead>
<tr>
<th>Crawler</th>
<th>Website</th>
<th>Status code</th>
<th>File count</th>
<th>Start</th>
<th>End</th>
<th>Delta</th>
</tr>
</thead>
<tbody>
{% for task_result in logs %}
<tr>
<td>{{ task_result.server_name }}</td>
<td><a href="/website/{{ task_result.website_id }}/">#{{ task_result.website_id }}</a></td>
<td>{{ task_result.status_code }}</td>
<td>{{ task_result.file_count }}</td>
<td>{{ task_result.start_time | int | datetime_format }}</td>
<td>{{ task_result.end_time | int | datetime_format }}</td>
<td>{{ ((task_result.end_time - task_result.start_time)) | int }} sec</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% endblock body %}

View File

@ -7,9 +7,6 @@
<div class="card-header">Dashboard</div>
<div class="card-body">
<a href="/logs">Logs</a>
<br>
<hr>
<h3>API Keys</h3>
<table class="table table-striped">
<thead>

View File

@ -69,33 +69,6 @@
</tr>
</tbody>
</table>
<h4>Crawl server stats</h4>
<table class="table table-striped">
<thead>
<tr>
<th>Server</th>
<th>Tasks done</th>
<th>Crawl time</th>
<th>Crawl time avg.</th>
<th>Files crawled</th>
<th>Files crawled avg.</th>
</tr>
</thead>
<tbody>
{% for entry in crawl_server_stats %}
{% set server, info = entry %}
<tr>
<td><b>{{ server }}</b></td>
<td class="td-numeric">{{ info.task_count }}</td>
<td class="td-numeric">{{ info.time | duration_format() }}</td>
<td class="td-numeric">{{ info.time_avg | duration_format() }}</td>
<td class="td-numeric">{{ info.file_count }}</td>
<td class="td-numeric">{{ "%.2f" % info.file_count_avg }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>

View File

@ -62,34 +62,5 @@
</p>
</div>
</div>
<div class="card">
<div class="card-header">Queued websites</div>
<div class="card-body">
<table class="table table-striped">
<thead>
<tr>
<th>Url</th>
<th>Priority</th>
<th>Task type</th>
</tr>
</thead>
<tbody>
{% for task in queue %}
<tr>
<td title="{{ task.url }}">{{ task.url | truncate(70) }}</td>
<td>{{ task.priority }}</td>
<td>{{ task.callback_type if task.callback_type else "NORMAL" }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
</div>
{% endblock body %}

View File

View File

@ -1,21 +0,0 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head>
<title>Index of /Public/bootstrap</title>
</head>
<body>
<h1>Index of /Public/bootstrap</h1>
<table>
<tr><th valign="top"><img src="/icons/blank.gif" alt="[ICO]"></th><th><a href="?C=N;O=D">Name</a></th><th><a href="?C=M;O=A">Last modified</a></th><th><a href="?C=S;O=A">Size</a></th><th><a href="?C=D;O=A">Description</a></th></tr>
<tr><th colspan="5"><hr></th></tr>
<tr><td valign="top"><img src="/icons/back.gif" alt="[PARENTDIR]"></td><td><a href="/Public/">Parent Directory</a> </td><td>&nbsp;</td><td align="right"> - </td><td>&nbsp;</td></tr>
<tr><td valign="top"><img src="/icons/unknown.gif" alt="[ ]"></td><td><a href="bower.json">bower.json</a> </td><td align="right">2017-04-05 01:45 </td><td align="right">1.0K</td><td>&nbsp;</td></tr>
<tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="css/">css/</a> </td><td align="right">2017-09-07 18:03 </td><td align="right"> - </td><td>&nbsp;</td></tr>
<tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="image/">image/</a> </td><td align="right">2017-09-07 18:03 </td><td align="right"> - </td><td>&nbsp;</td></tr>
<tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="js/">js/</a> </td><td align="right">2017-09-07 18:03 </td><td align="right"> - </td><td>&nbsp;</td></tr>
<tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="less/">less/</a> </td><td align="right">2017-09-07 18:03 </td><td align="right"> - </td><td>&nbsp;</td></tr>
<tr><td valign="top"><img src="/icons/unknown.gif" alt="[ ]"></td><td><a href="package.json">package.json</a> </td><td align="right">2017-04-05 01:45 </td><td align="right">666 </td><td>&nbsp;</td></tr>
<tr><th colspan="5"><hr></th></tr>
</table>
</body></html>

View File

@ -1,47 +0,0 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<title>Index of /gentoo/releases/</title>
<style type="text/css">
a, a:active {text-decoration: none; color: blue;}
a:visited {color: #48468F;}
a:hover, a:focus {text-decoration: underline; color: red;}
body {background-color: #F5F5F5;}
h2 {margin-bottom: 12px;}
table {margin-left: 12px;}
th, td { font: 90% monospace; text-align: left;}
th { font-weight: bold; padding-right: 14px; padding-bottom: 3px;}
td {padding-right: 14px;}
td.s, th.s {text-align: right;}
div.list { background-color: white; border-top: 1px solid #646464; border-bottom: 1px solid #646464; padding-top: 10px; padding-bottom: 14px;}
div.foot { font: 90% monospace; color: #787878; padding-top: 4px;}
</style>
</head>
<body>
<h2>Index of /gentoo/releases/</h2>
<div class="list">
<table summary="Directory Listing" cellpadding="0" cellspacing="0">
<thead><tr><th class="n">Name</th><th class="m">Last Modified</th><th class="s">Size</th><th class="t">Type</th></tr></thead>
<tbody>
<tr><td class="n"><a href="../">Parent Directory</a>/</td><td class="m">&nbsp;</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="alpha/">alpha</a>/</td><td class="m">2009-Aug-09 03:47:09</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="amd64/">amd64</a>/</td><td class="m">2017-Feb-09 18:50:44</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="arm/">arm</a>/</td><td class="m">2014-Apr-29 13:42:06</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="hppa/">hppa</a>/</td><td class="m">2014-Apr-29 13:42:12</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="ia64/">ia64</a>/</td><td class="m">2009-Aug-09 03:47:09</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="mips/">mips</a>/</td><td class="m">2011-Apr-28 23:38:14</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="ppc/">ppc</a>/</td><td class="m">2014-Apr-29 13:41:00</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="s390/">s390</a>/</td><td class="m">2014-Apr-29 13:41:06</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="sh/">sh</a>/</td><td class="m">2014-Apr-29 13:41:16</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="snapshots/">snapshots</a>/</td><td class="m">2009-Apr-16 05:08:17</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="sparc/">sparc</a>/</td><td class="m">2009-Aug-09 03:47:09</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="x86/">x86</a>/</td><td class="m">2016-Jul-04 21:14:19</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="README">README</a></td><td class="m">2014-Jun-22 05:18:43</td><td class="s">0.1K</td><td class="t">application/octet-stream</td></tr>
<tr><td class="n"><a href="verify-digests.sh">verify-digests.sh</a></td><td class="m">2016-Jun-10 02:40:33</td><td class="s">4.5K</td><td class="t">application/octet-stream</td></tr>
</tbody>
</table>
</div>
<div class="foot">lighttpd/1.4.29</div>
</body>
</html>

View File

@ -1,11 +0,0 @@
<html>
<head><title>Index of /test/To process/Android nak newer/</title></head>
<body bgcolor="white">
<h1>Index of /test/To process/Android nak newer/</h1><hr><pre><a href="../">../</a>
<a href="DCIM/">DCIM/</a> 31-Jul-2018 00:26 -
<a href="Pictures/">Pictures/</a> 31-Jul-2018 00:26 -
<a href="1529682937580.webm">1529682937580.webm</a> 25-Jun-2018 03:58 3768511
<a href="1529716051300.webm">1529716051300.webm</a> 25-Jun-2018 04:01 3181867
<a href="1529725898345.webm">1529725898345.webm</a> 25-Jun-2018 04:05 4138908
</pre><hr></body>
</html>

View File

@ -1,13 +0,0 @@
from flask import Flask, send_file
app = Flask(__name__)
@app.route("/test1/")
def test1():
    """Serve the ``files/apache_table.html`` fixture."""
    fixture = "files/apache_table.html"
    return send_file(fixture)
if __name__ == '__main__':
    # Listen on all interfaces, port 8888, with threaded request handling.
    app.run(host="0.0.0.0", port=8888, threaded=True)

View File

@ -48,8 +48,7 @@ def setup_views(app):
@app.route("/stats")
@cache.cached(120)
def stats_page():
crawl_server_stats = db.get_stats_by_crawler()
return render_template("stats.html", crawl_server_stats=crawl_server_stats)
return render_template("stats.html")
@app.route("/stats/json_chart")
@cache.cached(240)
@ -254,9 +253,7 @@ def setup_views(app):
@app.route("/submit")
def submit():
queued_websites = taskManager.get_queued_tasks()[:30]
return render_template("submit.html", queue=queued_websites, captcha=captcha,
show_captcha=config.CAPTCHA_SUBMIT)
return render_template("submit.html", captcha=captcha, show_captcha=config.CAPTCHA_SUBMIT)
def try_enqueue(url):
url = os.path.join(url, "")
@ -412,11 +409,3 @@ def setup_views(app):
db.delete_token(token)
flash("Deleted API token", "success")
return redirect("/dashboard")
# TODO: pages scrolling
@app.route("/logs", methods=["GET"])
def admin_crawl_logs():
    """Render ``crawl_logs.html`` with the crawl logs, after ``require_role("admin")``."""
    require_role("admin")
    return render_template("crawl_logs.html", logs=db.get_crawl_logs())