Files are indexed into ES when task is complete

This commit is contained in:
Simon
2018-06-12 15:45:00 -04:00
parent 6c912ea8c5
commit 1718bb91ca
7 changed files with 41 additions and 38 deletions

View File

@@ -118,9 +118,8 @@ class TaskManagerDatabase:
cursor.execute("SELECT status_code, file_count, start_time, end_time, website_id"
" FROM TaskResult WHERE indexed_time IS NULL")
db_result = cursor.fetchall()
print(len(db_result))
cursor.execute("UPDATE TaskResult SET indexed_time = CURRENT_TIMESTAMP WHERE indexed_time IS NULL")
cursor.execute("UPDATE TaskResult SET indexed_time=CURRENT_TIMESTAMP WHERE indexed_time IS NULL")
conn.commit()
return [TaskResult(r[0], r[1], r[2], r[3], r[4]) for r in db_result]

View File

@@ -1,4 +1,4 @@
from flask import Flask, request, abort, Response
from flask import Flask, request, abort, Response, send_from_directory
import json
from crawl_server.task_manager import TaskManager, Task, TaskResult
app = Flask(__name__)
@@ -45,5 +45,10 @@ def get_current_tasks():
return current_tasks
# HTTP endpoint: return the stored file list for a single website.
# The route converter parses <website_id> as an integer; the handler then
# asks Flask's send_from_directory to serve "<website_id>.json" from the
# "./crawled/" directory.
# NOTE(review): Flask documents a relative `directory` as being resolved
# against the application's root path, whereas the task manager appears to
# write crawl output to "./crawled/" relative to the working directory --
# confirm both resolve to the same folder in deployment.
# NOTE(review): later Flask releases renamed the `filename` keyword of
# send_from_directory to `path` -- check the pinned Flask version before
# upgrading.
@app.route("/file_list/<int:website_id>/")
def get_file_list(website_id):
return send_from_directory(directory="./crawled/", filename=str(website_id) + ".json")
# When this module is executed directly, start the Flask app on port 5001.
if __name__ == "__main__":
app.run(port=5001)

View File

@@ -0,0 +1,19 @@
-- Queue: one row per queued entry, identified by an integer primary key.
-- Each row carries the target website_id and url, a numeric priority, and
-- a callback described by a type and its arguments (both stored as text).
-- NOTE(review): presumably these rows are pending crawl tasks and
-- callback_args holds a serialized payload -- confirm against the code
-- that reads and writes this table.
CREATE TABLE Queue (
id INTEGER PRIMARY KEY,
website_id INTEGER,
url TEXT,
priority INTEGER,
callback_type TEXT,
callback_args TEXT
);
-- TaskResult: one row per logged task result (status code, file count,
-- start/end time) for a given website_id.
-- indexed_time defaults to NULL. The task manager database code selects
-- the rows WHERE indexed_time IS NULL and then stamps them with
-- CURRENT_TIMESTAMP, so a NULL value marks a result that has not yet been
-- handed out for indexing.
-- NOTE(review): the time columns are declared INT, but the application
-- sets end_time from datetime.utcnow() and indexed_time from
-- CURRENT_TIMESTAMP; if this is SQLite, those values are stored as text
-- despite the declared type -- confirm readers expect that format.
CREATE TABLE TaskResult (
id INTEGER PRIMARY KEY,
website_id INT,
status_code TEXT,
file_count INT,
start_time INT,
end_time INT,
indexed_time INT DEFAULT NULL
);

View File

@@ -55,14 +55,13 @@ class TaskManager:
print("Starting task " + task.url)
crawler = RemoteDirectoryCrawler(task.url, 100)
crawl_result = crawler.crawl_directory("crawled/" + str(task.website_id) + ".json")
crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
result.file_count = crawl_result.file_count
result.status_code = crawl_result.status_code
print("End task " + task.url)
result.end_time = datetime.utcnow()
print("End task " + task.url)
return dict(result=result, db_path=db_path)
@@ -77,6 +76,7 @@ class TaskManager:
db = TaskManagerDatabase(db_path)
db.log_result(result)
print("Logged result to DB")
@staticmethod
def task_error(err):