Mirror of https://github.com/simon987/od-database.git (synced 2025-04-19 18:36:44 +00:00)
Overwrite document on re-index, update website last_modified on task complete, delete website files on index complete
This commit is contained in:
parent 8486555426
commit e54609972c
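Taken together, the hunks below wire three steps into TaskDispatcher's completion loop: stale documents are deleted so the re-index overwrites them, the website's last_modified date is bumped, and the crawl server is told it may delete its copy of the file list. A minimal sketch of that ordering, using the method names introduced in this commit (the wrapper function itself is hypothetical and only illustrates the flow):

def handle_completed_task(server, search, db, task):
    # server: CrawlServer, search: ElasticSearchEngine, db: Database (see the hunks below)
    search.delete_docs(task.website_id)                  # all files are overwritten on re-index
    file_list = server.fetch_website_files(task.website_id)
    if file_list:
        search.import_json(file_list, task.website_id)
    db.update_website_date_if_exists(task.website_id)    # update last_modified on task completion
    server.free_website_files(task.website_id)           # file list is safe to delete once indexed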
@@ -72,6 +72,17 @@ def get_file_list(website_id):
         return abort(404)
 
 
+@app.route("/file_list/<int:website_id>/free")
+@auth.login_required
+def free_file_list(website_id):
+    file_name = "./crawled/" + str(website_id) + ".json"
+    if os.path.exists(file_name):
+        os.remove(file_name)
+        return '{"ok": "true"}'
+    else:
+        return abort(404)
+
+
 @app.route("/task/logs/")
 @auth.login_required
 def get_task_logs():
@@ -45,6 +45,14 @@ class Database:
             conn.executescript(init_script)
             conn.commit()
 
+    def update_website_date_if_exists(self, website_id):
+
+        with sqlite3.connect(self.db_path) as conn:
+
+            cursor = conn.cursor()
+            cursor.execute("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id=?", (website_id, ))
+            conn.commit()
+
     def insert_website(self, website: Website):
 
         with sqlite3.connect(self.db_path) as conn:
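A note on the "_if_exists" suffix: no explicit existence check is needed, because an UPDATE on an unknown id simply affects zero rows. A standalone sketch against an in-memory database (table and column names taken from the statement above; the schema here is illustrative only):

import sqlite3

with sqlite3.connect(":memory:") as conn:
    conn.execute("CREATE TABLE Website (id INTEGER PRIMARY KEY, last_modified TEXT)")
    conn.execute("INSERT INTO Website (id) VALUES (1)")
    missing = conn.execute("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id=?", (999,))
    print(missing.rowcount)   # 0 -- unknown website_id, nothing happens
    existing = conn.execute("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id=?", (1,))
    print(existing.rowcount)  # 1 -- timestamp updated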
@@ -3,9 +3,9 @@ import json
 
 
 payload = json.dumps({
-    "website_id": 123,
-    "url": "ftp://132.249.213.137",
-    # "url": "http://localhost:8000/",
+    "website_id": 3,
+    # "url": "ftp://132.249.213.137",
+    "url": "http://localhost:8000/",
     # "url": "http://ubuntu.mirrorservice.org/",
     "priority": 2,
     "callback_type": "",
@@ -91,6 +91,22 @@ class ElasticSearchEngine(SearchEngine):
     def ping(self):
         return self.es.ping()
 
+    def delete_docs(self, website_id):
+
+        try:
+            print("Deleting docs of " + str(website_id))
+            self.es.delete_by_query(body={
+                "query": {
+                    "constant_score": {
+                        "filter": {
+                            "term": {"website_id": website_id}
+                        }
+                    }
+                }
+            }, index=self.index_name)
+        except elasticsearch.exceptions.ConflictError:
+            print("Error: multiple delete tasks at the same time")
+
     def import_json(self, in_lines, website_id: int):
 
         import_every = 5000
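One hypothetical way to verify the overwrite behaviour (not part of the commit): count the documents matching the same term filter before and after delete_docs runs. The client setup and website id are placeholders:

import elasticsearch

es = elasticsearch.Elasticsearch()  # assumes a local node; adjust hosts as needed
query = {"query": {"term": {"website_id": 123}}}
print(es.count(index="od-database", body=query)["count"])  # non-zero before the delete
# ElasticSearchEngine("od-database").delete_docs(123) runs here
print(es.count(index="od-database", body=query)["count"])  # 0 once the deletes are refreshed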
@@ -270,7 +286,8 @@ class ElasticSearchEngine(SearchEngine):
         stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"]
         stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"]
         stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"]
-        stats["es_search_time_avg"] = stats["es_search_time"] / (stats["es_search_count"] if stats["es_search_count"] != 0 else 1)
+        stats["es_search_time_avg"] = stats["es_search_time"] / (
+            stats["es_search_count"] if stats["es_search_count"] != 0 else 1)
         stats["total_count"] = es_stats["indices"][self.index_name]["total"]["indexing"]["index_total"]
         stats["total_count_nonzero"] = total_stats["hits"]["total"]
         stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"]
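The only change in this hunk is a line wrap; the conditional itself is what keeps a freshly created index (no searches yet, so query_total is 0) from raising ZeroDivisionError:

query_time_in_millis, query_total = 0, 0  # values a brand-new index would report
avg = query_time_in_millis / (query_total if query_total != 0 else 1)
print(avg)  # 0.0 instead of a ZeroDivisionError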
task.py (19 changed lines)
@@ -5,6 +5,7 @@ import requests
 from requests.exceptions import ConnectionError
 import json
 import config
+from database import Database
 
 
 class CrawlServer:
@@ -71,6 +72,15 @@ class CrawlServer:
         except ConnectionError:
             return ""
 
+    def free_website_files(self, website_id) -> bool:
+
+        try:
+            r = requests.get(self.url + "/file_list/" + str(website_id) + "/free", headers=CrawlServer.headers)
+            return r.status_code == 200
+        except ConnectionError as e:
+            print(e)
+            return False
+
     def fetch_crawl_logs(self):
 
         try:
@@ -97,6 +107,7 @@ class TaskDispatcher:
         scheduler.start()
 
         self.search = ElasticSearchEngine("od-database")
+        self.db = Database("db.sqlite3")
 
         # TODO load from config
         self.crawl_servers = [
@@ -108,10 +119,18 @@ class TaskDispatcher:
         for server in self.crawl_servers:
             for task in server.fetch_completed_tasks():
                 print("Completed task")
+                # All files are overwritten
+                self.search.delete_docs(task.website_id)
                 file_list = server.fetch_website_files(task.website_id)
                 if file_list:
                     self.search.import_json(file_list, task.website_id)
 
+                # Update last_modified date for website
+                self.db.update_website_date_if_exists(task.website_id)
+
+                # File list is safe to delete once indexed
+                server.free_website_files(task.website_id)
+
     def dispatch_task(self, task: Task):
         self._get_available_crawl_server().queue_task(task)
 