Logging and bugfix for http crawler

This commit is contained in:
Simon
2018-06-25 14:36:16 -04:00
parent 5fd00f22af
commit d7ce1670a8
8 changed files with 67 additions and 42 deletions

View File

@@ -1,6 +1,7 @@
from flask import Flask, request, abort, Response, send_file
from flask_httpauth import HTTPTokenAuth
import json
from crawl_server import logger
from crawl_server.task_manager import TaskManager, Task
import os
import config
@@ -35,11 +36,13 @@ def task_put():
priority = request.json["priority"]
callback_type = request.json["callback_type"]
callback_args = request.json["callback_args"]
except KeyError:
except KeyError as e:
logger.error("Invalid task put request from " + request.remote_addr + " missing key: " + str(e))
return abort(400)
task = Task(website_id, url, priority, callback_type, callback_args)
tm.put_task(task)
logger.info("Submitted new task to queue: " + str(task.to_json()))
return '{"ok": "true"}'
return abort(400)
@@ -49,6 +52,7 @@ def task_put():
@auth.login_required
def get_completed_tasks():
json_str = json.dumps([result.to_json() for result in tm.get_non_indexed_results()])
logger.debug("Webserver has requested list of newly completed tasks from " + request.remote_addr)
return Response(json_str, mimetype="application/json")
@@ -57,6 +61,7 @@ def get_completed_tasks():
def get_current_tasks():
current_tasks = tm.get_current_tasks()
logger.debug("Webserver has requested list of current tasks from " + request.remote_addr)
return json.dumps([t.to_json() for t in current_tasks])
@@ -66,8 +71,10 @@ def get_file_list(website_id):
file_name = "./crawled/" + str(website_id) + ".json"
if os.path.exists(file_name):
logger.info("Webserver requested file list of website with id" + str(website_id))
return send_file(file_name)
else:
logger.error("Webserver requested file list of non-existent or empty website with id: " + str(website_id))
return abort(404)
@@ -77,24 +84,19 @@ def free_file_list(website_id):
file_name = "./crawled/" + str(website_id) + ".json"
if os.path.exists(file_name):
os.remove(file_name)
logger.debug("Webserver indicated that the files for the website with id " +
str(website_id) + " are safe to delete")
return '{"ok": "true"}'
else:
return abort(404)
@app.route("/task/logs/")
@auth.login_required
def get_task_logs():
json_str = json.dumps([result.to_json() for result in tm.get_all_results()])
return Response(json_str, mimetype="application/json")
@app.route("/task/pop_all")
@auth.login_required
def pop_queued_tasks():
json_str = json.dumps([task.to_json() for task in tm.pop_tasks()])
logger.info("Webserver poped all queued tasks")
return Response(json_str, mimetype="application/json")