mirror of
https://github.com/simon987/od-database.git
synced 2025-04-20 02:46:45 +00:00
Crawler no longer crashes when website has no files
This commit is contained in:
parent
e18ded7ac1
commit
112400886e
2
app.py
2
app.py
@@ -562,7 +562,7 @@ def api_complete_task():
|
|||||||
|
|
||||||
taskManager.complete_task(filename, task, task_result, name)
|
taskManager.complete_task(filename, task, task_result, name)
|
||||||
|
|
||||||
if os.path.exists(filename):
|
if filename and os.path.exists(filename):
|
||||||
os.remove(filename)
|
os.remove(filename)
|
||||||
|
|
||||||
# TODO: handle callback here
|
# TODO: handle callback here
|
||||||
|
@@ -51,13 +51,16 @@ class TaskManager:
|
|||||||
}
|
}
|
||||||
|
|
||||||
filename = "./crawled/" + str(task_result.website_id) + ".json"
|
filename = "./crawled/" + str(task_result.website_id) + ".json"
|
||||||
files = {
|
if os.path.exists(filename):
|
||||||
"file_list": open(filename)
|
files = {
|
||||||
}
|
"file_list": open(filename)
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
files = None
|
||||||
|
|
||||||
r = requests.post(config.SERVER_URL + "/task/complete", data=payload, files=files)
|
r = requests.post(config.SERVER_URL + "/task/complete", data=payload, files=files)
|
||||||
|
|
||||||
print("RESPONSE: " + r.text)
|
logger.info("RESPONSE: " + r.text)
|
||||||
|
|
||||||
if os.path.exists(filename):
|
if os.path.exists(filename):
|
||||||
os.remove(filename)
|
os.remove(filename)
|
||||||
@@ -89,17 +92,16 @@ class TaskManager:
|
|||||||
result.start_time = datetime.utcnow().timestamp()
|
result.start_time = datetime.utcnow().timestamp()
|
||||||
result.website_id = task.website_id
|
result.website_id = task.website_id
|
||||||
|
|
||||||
print("Starting task " + task.url)
|
logger.info("Starting task " + task.url)
|
||||||
|
|
||||||
crawler = RemoteDirectoryCrawler(task.url, config.CRAWL_SERVER_THREADS)
|
crawler = RemoteDirectoryCrawler(task.url, config.CRAWL_SERVER_THREADS)
|
||||||
crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
|
crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
|
||||||
del crawler
|
|
||||||
|
|
||||||
result.file_count = crawl_result.file_count
|
result.file_count = crawl_result.file_count
|
||||||
result.status_code = crawl_result.status_code
|
result.status_code = crawl_result.status_code
|
||||||
|
|
||||||
result.end_time = datetime.utcnow().timestamp()
|
result.end_time = datetime.utcnow().timestamp()
|
||||||
print("End task " + task.url)
|
logger.info("End task " + task.url)
|
||||||
|
|
||||||
return result, current_tasks
|
return result, current_tasks
|
||||||
|
|
||||||
@@ -113,9 +115,9 @@ class TaskManager:
|
|||||||
|
|
||||||
task_result, current_tasks = result
|
task_result, current_tasks = result
|
||||||
|
|
||||||
print("Task completed, sending result to server")
|
logger.info("Task completed, sending result to server")
|
||||||
print("Status code: " + task_result.status_code)
|
logger.info("Status code: " + task_result.status_code)
|
||||||
print("File count: " + str(task_result.file_count))
|
logger.info("File count: " + str(task_result.file_count))
|
||||||
|
|
||||||
TaskManager.push_result(task_result)
|
TaskManager.push_result(task_result)
|
||||||
|
|
||||||
|
4
tasks.py
4
tasks.py
@@ -63,9 +63,9 @@ class TaskManager:
|
|||||||
|
|
||||||
def complete_task(self, file_list, task, task_result, crawler_name):
|
def complete_task(self, file_list, task, task_result, crawler_name):
|
||||||
|
|
||||||
if file_list:
|
self.search.delete_docs(task_result.website_id)
|
||||||
self.search.delete_docs(task_result.website_id)
|
|
||||||
|
|
||||||
|
if file_list:
|
||||||
def iter_lines():
|
def iter_lines():
|
||||||
|
|
||||||
with open(file_list, "r") as f:
|
with open(file_list, "r") as f:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user