mirror of https://github.com/simon987/od-database.git (synced 2025-12-11 14:08:51 +00:00)

Files are indexed into ES when task is complete
@@ -118,9 +118,8 @@ class TaskManagerDatabase:
         cursor.execute("SELECT status_code, file_count, start_time, end_time, website_id"
                        " FROM TaskResult WHERE indexed_time IS NULL")
         db_result = cursor.fetchall()
-        print(len(db_result))

-        cursor.execute("UPDATE TaskResult SET indexed_time = CURRENT_TIMESTAMP WHERE indexed_time IS NULL")
+        cursor.execute("UPDATE TaskResult SET indexed_time=CURRENT_TIMESTAMP WHERE indexed_time IS NULL")
         conn.commit()

         return [TaskResult(r[0], r[1], r[2], r[3], r[4]) for r in db_result]

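The hunk above pairs with the commit message: completed TaskResult rows that have not yet been pushed to Elasticsearch are selected, then immediately stamped with indexed_time so each result is only picked up once. A minimal sketch of a consumer for these rows, assuming elasticsearch-py, an "od-database" index name, a get_non_indexed_results() wrapper around the SQL above, and that ./crawled/<website_id>.json is a JSON array of file documents (none of which is confirmed by this diff):

import json
from elasticsearch import Elasticsearch, helpers

def index_completed_tasks(db, es: Elasticsearch):
    # db is a TaskManagerDatabase; the method name and index name here are assumptions.
    for task_result in db.get_non_indexed_results():
        with open("./crawled/" + str(task_result.website_id) + ".json") as f:
            docs = json.load(f)  # assumed format: a JSON array of file documents
        actions = ({"_index": "od-database", "_source": doc} for doc in docs)
        helpers.bulk(es, actions)  # bulk-index every file of the completed task
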
@@ -1,4 +1,4 @@
-from flask import Flask, request, abort, Response
+from flask import Flask, request, abort, Response, send_from_directory
 import json
 from crawl_server.task_manager import TaskManager, Task, TaskResult
 app = Flask(__name__)

@@ -45,5 +45,10 @@ def get_current_tasks():
     return current_tasks


+@app.route("/file_list/<int:website_id>/")
+def get_file_list(website_id):
+    return send_from_directory(directory="./crawled/", filename=str(website_id) + ".json")
+
+
 if __name__ == "__main__":
     app.run(port=5001)

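The new /file_list/<int:website_id>/ route simply serves the raw crawl output written by the crawler. As a usage sketch (the client side is not part of this diff), another process could fetch a file list over HTTP, assuming the crawl server runs on localhost:5001 as in app.run(port=5001) above and that website 123 has been crawled:

import requests

# Hypothetical client call against the endpoint added above.
r = requests.get("http://localhost:5001/file_list/123/")
r.raise_for_status()
file_list = r.json()  # parsed contents of ./crawled/123.json, assuming it is valid JSON
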
crawl_server/task_db_init.sql (new file, 19 lines)
@@ -0,0 +1,19 @@
+
+CREATE TABLE Queue (
+    id INTEGER PRIMARY KEY,
+    website_id INTEGER,
+    url TEXT,
+    priority INTEGER,
+    callback_type TEXT,
+    callback_args TEXT
+);
+
+CREATE TABLE TaskResult (
+    id INTEGER PRIMARY KEY,
+    website_id INT,
+    status_code TEXT,
+    file_count INT,
+    start_time INT,
+    end_time INT,
+    indexed_time INT DEFAULT NULL
+);

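Queue holds pending crawl tasks and TaskResult stores one row per finished task; indexed_time stays NULL until the result has been picked up for Elasticsearch indexing (see the UPDATE in the first hunk). A sketch of writing a result into this schema with sqlite3, assuming epoch integers for the INT time columns (the repository's own log_result implementation is not shown in this diff):

import sqlite3

def log_result_sketch(db_path, result):
    # Column names come from the schema above; this helper itself is hypothetical.
    with sqlite3.connect(db_path) as conn:
        conn.execute(
            "INSERT INTO TaskResult (website_id, status_code, file_count, start_time, end_time) "
            "VALUES (?, ?, ?, ?, ?)",
            (result.website_id, result.status_code, result.file_count,
             int(result.start_time.timestamp()), int(result.end_time.timestamp())))
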
@@ -55,14 +55,13 @@ class TaskManager:
         print("Starting task " + task.url)

         crawler = RemoteDirectoryCrawler(task.url, 100)
-        crawl_result = crawler.crawl_directory("crawled/" + str(task.website_id) + ".json")
+        crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")

         result.file_count = crawl_result.file_count
         result.status_code = crawl_result.status_code

-        print("End task " + task.url)
-
         result.end_time = datetime.utcnow()
+        print("End task " + task.url)

         return dict(result=result, db_path=db_path)

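The method above returns dict(result=result, db_path=db_path) rather than the bare result, so the completion handling shown in the next hunk knows which database to open. A sketch of how such a task could be dispatched, assuming the usual multiprocessing.Pool apply_async pattern and a hypothetical execute_task method name (the actual pool wiring is outside this diff):

from multiprocessing import Pool

# Hypothetical dispatch; TaskManager.task_complete would receive the returned dict,
# TaskManager.task_error any raised exception.
pool = Pool(processes=2)
pool.apply_async(TaskManager.execute_task,
                 args=(task, db_path),
                 callback=TaskManager.task_complete,
                 error_callback=TaskManager.task_error)
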
@@ -77,6 +76,7 @@ class TaskManager:

         db = TaskManagerDatabase(db_path)
         db.log_result(result)
+        print("Logged result to DB")

     @staticmethod
     def task_error(err):