app.py small cleanup + some logging

This commit is contained in:
Simon 2018-11-17 11:53:41 -05:00
parent a6c421c4a6
commit d8df91a0d6
2 changed files with 124 additions and 169 deletions

111
app.py
View File

@ -1,4 +1,4 @@
from flask import Flask, render_template, redirect, request, flash, abort, Response, send_from_directory, session from flask import Flask, render_template, redirect, request, flash, abort, Response, session
from multiprocessing import Pool from multiprocessing import Pool
import json import json
from urllib.parse import urlparse from urllib.parse import urlparse
@ -16,11 +16,14 @@ from search.search import ElasticSearchEngine, InvalidQueryException
from callbacks import PostCrawlCallbackFactory from callbacks import PostCrawlCallbackFactory
app = Flask(__name__) app = Flask(__name__)
app.secret_key = config.FLASK_SECRET
# Disable flask logging # Disable flask logging
flaskLogger = logging.getLogger('werkzeug') flaskLogger = logging.getLogger('werkzeug')
flaskLogger.setLevel(logging.ERROR) flaskLogger.setLevel(logging.ERROR)
logger = logging.getLogger("default")
if config.CAPTCHA_SUBMIT or config.CAPTCHA_LOGIN: if config.CAPTCHA_SUBMIT or config.CAPTCHA_LOGIN:
recaptcha = ReCaptcha(app=app, recaptcha = ReCaptcha(app=app,
site_key=config.CAPTCHA_SITE_KEY, site_key=config.CAPTCHA_SITE_KEY,
@ -33,7 +36,7 @@ if config.CAPTCHA_SEARCH:
secret_key=config.CAPTCHA_S_SECRET_KEY) secret_key=config.CAPTCHA_S_SECRET_KEY)
else: else:
recaptcha_search = None recaptcha_search = None
app.secret_key = config.FLASK_SECRET
db = Database("db.sqlite3") db = Database("db.sqlite3")
cache = Cache(app, config={'CACHE_TYPE': 'simple'}) cache = Cache(app, config={'CACHE_TYPE': 'simple'})
app.jinja_env.globals.update(truncate_path=od_util.truncate_path) app.jinja_env.globals.update(truncate_path=od_util.truncate_path)
@ -61,17 +64,19 @@ def from_timestamp(value):
@app.route("/dl") @app.route("/dl")
@cache.cached(120)
def downloads(): def downloads():
try: try:
export_file_stats = os.stat("static/out.csv.lzma") export_file_stats = os.stat("static/out.csv.lzm4")
except FileNotFoundError: except FileNotFoundError:
print("No export file") logger.warning("No export file to display in /dl")
export_file_stats = None export_file_stats = None
return render_template("downloads.html", export_file_stats=export_file_stats) return render_template("downloads.html", export_file_stats=export_file_stats)
@app.route("/stats") @app.route("/stats")
@cache.cached(120)
def stats_page(): def stats_page():
crawl_server_stats = db.get_stats_by_crawler() crawl_server_stats = db.get_stats_by_crawler()
return render_template("stats.html", crawl_server_stats=crawl_server_stats) return render_template("stats.html", crawl_server_stats=crawl_server_stats)
@ -147,18 +152,7 @@ def random_website():
return redirect("/website/" + str(db.get_random_website_id())) return redirect("/website/" + str(db.get_random_website_id()))
@app.route("/website/redispatch_queued") ## TODO: move to DB
def admin_redispatch_queued():
if "username" in session:
count = taskManager.redispatch_queued()
flash("Re-dispatched " + str(count) + " tasks", "success")
return redirect("/dashboard")
else:
abort(404)
def get_empty_websites(): def get_empty_websites():
current_tasks = taskManager.get_queued_tasks() current_tasks = taskManager.get_queued_tasks()
@ -188,21 +182,6 @@ def admin_delete_empty_website():
abort(403) abort(403)
@app.route("/website/queue_empty")
def admin_queue_empty_websites():
if "username" in session:
for website_id in get_empty_websites():
website = db.get_website_by_id(website_id)
task = Task(website.id, website.url, 1)
taskManager.queue_task(task)
flash("Dispatched empty websites", "success")
return redirect("/dashboard")
else:
abort(403)
@app.route("/website/<int:website_id>/clear") @app.route("/website/<int:website_id>/clear")
def admin_clear_website(website_id): def admin_clear_website(website_id):
if "username" in session: if "username" in session:
@ -249,7 +228,6 @@ def admin_rescan_website(website_id):
@app.route("/search") @app.route("/search")
def search(): def search():
q = request.args.get("q") if "q" in request.args else "" q = request.args.get("q") if "q" in request.args else ""
sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score" sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score"
@ -305,7 +283,7 @@ def search():
except InvalidQueryException as e: except InvalidQueryException as e:
flash("<strong>Invalid query:</strong> " + str(e), "warning") flash("<strong>Invalid query:</strong> " + str(e), "warning")
blocked = True blocked = True
except Exception: except:
flash("Query failed, this could mean that the search server is overloaded or is not reachable. " flash("Query failed, this could mean that the search server is overloaded or is not reachable. "
"Please try again later", "danger") "Please try again later", "danger")
@ -336,6 +314,7 @@ def search():
@app.route("/contribute") @app.route("/contribute")
@cache.cached(600)
def contribute(): def contribute():
return render_template("contribute.html") return render_template("contribute.html")
@ -424,7 +403,7 @@ def enqueue_bulk():
if urls: if urls:
urls = urls.split() urls = urls.split()
if 0 < len(urls) <= 1000: if 0 < len(urls) <= 1000: # TODO: Load from config & adjust placeholder/messages?
pool = Pool(processes=6) pool = Pool(processes=6)
pool.map(func=check_url, iterable=urls) pool.map(func=check_url, iterable=urls)
@ -539,6 +518,7 @@ def admin_del_token():
return abort(403) return abort(403)
# TODO: pages scrolling
@app.route("/logs", methods=["GET"]) @app.route("/logs", methods=["GET"])
def admin_crawl_logs(): def admin_crawl_logs():
if "username" in session: if "username" in session:
@ -557,18 +537,23 @@ def api_get_task():
if name: if name:
task = db.pop_task(name, False) task = db.pop_task(name, False)
logger.debug("API get task from " + name)
if task: if task:
print("Assigning task " + str(task.website_id) + " to " + name) logger.info("Assigning task " + str(task.to_json()) + " to " + name)
else: else:
print("No queued tasks, creating new rescan task") logger.info("No queued tasks, creating a new one")
try:
website_id = db.get_oldest_website_id() website_id = db.get_oldest_website_id()
website = db.get_website_by_id(website_id) website = db.get_website_by_id(website_id)
task = Task(website_id, website.url) task = Task(website_id, website.url)
db.put_task(task) db.put_task(task)
task = db.pop_task(name, False) task = db.pop_task(name, False)
except:
logger.error("Couldn't create new task")
abort(404)
return Response(str(task), mimetype="application/json") return Response(str(task), mimetype="application/json")
else: else:
@ -583,6 +568,7 @@ def api_cancel_task():
if name: if name:
website_id = request.form.get("website_id") if "website_id" in request.form else None website_id = request.form.get("website_id") if "website_id" in request.form else None
if website_id: if website_id:
logger.debug("API task cancel for " + str(website_id) + " by " + name)
db.delete_task(website_id) db.delete_task(website_id)
return Response("cancelled task") return Response("cancelled task")
else: else:
@ -595,14 +581,15 @@ def api_cancel_task():
@app.route("/api/task/complete", methods=["POST"]) @app.route("/api/task/complete", methods=["POST"])
def api_complete_task(): def api_complete_task():
token = request.form.get("token") token = request.form.get("token")
tr = json.loads(request.form.get("result"))
print(tr)
task_result = TaskResult(tr["status_code"], tr["file_count"], tr["start_time"], tr["end_time"], tr["website_id"])
name = db.check_api_token(token) name = db.check_api_token(token)
if name: if name:
print("Task for " + str(task_result.website_id) + " completed by " + name) tr = json.loads(request.form.get("result"))
logger.debug("Task result: " + str(tr))
task_result = TaskResult(tr["status_code"], tr["file_count"], tr["start_time"], tr["end_time"],
tr["website_id"])
logger.info("Task for " + str(task_result.website_id) + " completed by " + name)
task = db.complete_task(task_result.website_id, name) task = db.complete_task(task_result.website_id, name)
if task: if task:
@ -623,7 +610,7 @@ def api_complete_task():
return "Successfully logged task result and indexed files" return "Successfully logged task result and indexed files"
else: else:
print("ERROR: " + name + " indicated that task for " + str(task_result.website_id) + logger.error("ERROR: " + name + " indicated that task for " + str(task_result.website_id) +
" was completed but there is no such task in the database.") " was completed but there is no such task in the database.")
return "No such task" return "No such task"
return abort(403) return abort(403)
@ -632,23 +619,25 @@ def api_complete_task():
@app.route("/api/task/upload", methods=["POST"]) @app.route("/api/task/upload", methods=["POST"])
def api_upload(): def api_upload():
token = request.form.get("token") token = request.form.get("token")
website_id = request.form.get("website_id")
name = db.check_api_token(token) name = db.check_api_token(token)
if name: if name:
website_id = request.form.get("website_id")
logger.debug("Result part upload for '" + str(website_id) + "' by " + name)
if "file_list" in request.files: if "file_list" in request.files:
file = request.files['file_list'] file = request.files['file_list']
filename = "./tmp/" + str(website_id) + ".json" filename = "./tmp/" + str(website_id) + ".json"
if os.path.exists(filename): if os.path.exists(filename):
print("Appending chunk to existing file...") logger.debug("Appending chunk to existing file...")
with open(filename, "ab") as f: with open(filename, "ab") as f:
f.write(file.stream.read()) f.write(file.stream.read())
else: else:
print("Saving temp file " + filename + " ...") logger.debug("Saving temp file " + filename + " ...")
file.save(filename) file.save(filename)
print("Done") logger.debug("Done saving temp file")
return "ok" return "ok"
else: else:
return abort(403) return abort(403)
@ -657,11 +646,12 @@ def api_upload():
@app.route("/api/website/by_url", methods=["GET"]) @app.route("/api/website/by_url", methods=["GET"])
def api_website_by_url(): def api_website_by_url():
token = request.args.get("token") token = request.args.get("token")
url = request.args.get("url")
name = db.check_api_token(token) name = db.check_api_token(token)
if name: if name:
url = request.args.get("url")
website = db.get_website_by_url(url) website = db.get_website_by_url(url)
logger.info("API get website by url '" + url + "' by " + name)
if website: if website:
return str(website.id) return str(website.id)
return abort(404) return abort(404)
@ -676,6 +666,7 @@ def api_website_is_blacklisted():
name = db.check_api_token(token) name = db.check_api_token(token)
if name: if name:
logger.info("API get website is blacklisted '" + url + "' by " + name)
return str(db.is_blacklisted(url)) return str(db.is_blacklisted(url))
else: else:
return abort(403) return abort(403)
@ -692,6 +683,7 @@ def api_add_website():
website_id = db.insert_website(Website(url, str(request.remote_addr + "_" + website_id = db.insert_website(Website(url, str(request.remote_addr + "_" +
request.headers.get("X-Forwarded-For", "")), request.headers.get("X-Forwarded-For", "")),
"API_CLIENT_" + name)) "API_CLIENT_" + name))
logger.info("API add website '" + url + "' by " + name + "(" + str(website_id) + ")")
return str(website_id) return str(website_id)
else: else:
return abort(403) return abort(403)
@ -715,6 +707,9 @@ def api_task_enqueue():
request.json["callback_type"], request.json["callback_type"],
json.dumps(request.json["callback_args"]) json.dumps(request.json["callback_args"])
) )
logger.info("API force enqueue by " + name + "\n(" + str(task.to_json()) + ")")
taskManager.queue_task(task) taskManager.queue_task(task)
return "" return ""
else: else:
@ -723,18 +718,16 @@ def api_task_enqueue():
@app.route("/api/task/try_enqueue", methods=["POST"]) @app.route("/api/task/try_enqueue", methods=["POST"])
def api_task_try_enqueue(): def api_task_try_enqueue():
try:
token = request.form.get("token") token = request.form.get("token")
url = request.form.get("url")
except KeyError:
return abort(400)
name = db.check_api_token(token) name = db.check_api_token(token)
if name: if name:
url = request.form.get("url")
message, result = try_enqueue(url) message, result = try_enqueue(url)
logger.info("API try enqueue '" + url + "' by " + name + " (" + message + ")")
return json.dumps({ return json.dumps({
"message": message, "message": message,
"result": result "result": result
@ -745,15 +738,11 @@ def api_task_try_enqueue():
@app.route("/api/website/random") @app.route("/api/website/random")
def api_random_website(): def api_random_website():
try:
token = request.json["token"] token = request.json["token"]
except KeyError:
return abort(400)
name = db.check_api_token(token) name = db.check_api_token(token)
if name: if name:
logger.info("API get random website by " + name)
return str(db.get_random_website_id()) return str(db.get_random_website_id())
else: else:
return abort(403) return abort(403)
@ -761,12 +750,7 @@ def api_random_website():
@app.route("/api/search", methods=["POST"]) @app.route("/api/search", methods=["POST"])
def api_search(): def api_search():
try:
token = request.json["token"] token = request.json["token"]
except KeyError:
return abort(400)
name = db.check_api_token(token) name = db.check_api_token(token)
if name: if name:
@ -784,14 +768,15 @@ def api_search():
) )
hits = db.join_website_on_search_result(hits) hits = db.join_website_on_search_result(hits)
logger.info("API search '" + request.json["query"] + "' by " + name)
return json.dumps(hits) return json.dumps(hits)
except InvalidQueryException as e: except InvalidQueryException as e:
logger.info("API search failed: " + str(e))
return str(e) return str(e)
else: else:
return abort(403) return abort(403)
if __name__ == '__main__': if __name__ == '__main__':
app.run("0.0.0.0", port=12345, threaded=True) app.run("0.0.0.0", port=12345, threaded=True)

View File

@ -82,40 +82,10 @@
<a class="btn btn-danger" href="/website/delete_empty">Delete websites with no associated files that are <a class="btn btn-danger" href="/website/delete_empty">Delete websites with no associated files that are
not queued</a> not queued</a>
<a class="btn btn-danger" href="/website/redispatch_queued">Re-dispatch queued tasks</a>
<a class="btn btn-danger" href="/website/queue_empty">Re-queue websites with no associated files</a>
<hr> <hr>
<a class="btn btn-info" href="/logout">Logout</a> <a class="btn btn-info" href="/logout">Logout</a>
</div> </div>
</div> </div>
</div> </div>
<script>
function changeSlots(id) {
let slotsElem = document.getElementById("slots-" + id);
let parent = slotsElem.parentNode;
let td = document.createElement("td");
let form = document.createElement("form");
form.setAttribute("action", "/crawl_server/" + id + "/update");
form.setAttribute("method", "post");
let slotsInput = document.createElement("input");
slotsInput.setAttribute("class", "form-control");
slotsInput.setAttribute("name", "slots");
form.appendChild(slotsInput);
td.appendChild(form);
parent.insertBefore(td, slotsElem);
slotsElem.remove();
slotsInput.focus();
slotsInput.addEventListener("focusout", function () {
form.submit();
});
}
</script>
{% endblock body %} {% endblock body %}