Upload crawl file lists in chunks through a new /api/task/upload route.

Review notes (text before the first "diff --git" line is ignored by patch tools):
  * The patch had lost its line breaks; they are restored here. Indentation and
    the position of blank context lines were reconstructed from the Python
    block structure and the hunk line counts - verify against the target files.
  * api_complete_task: pass None to complete_task when no file list was
    uploaded, as the removed inline-upload code did.
  * api_upload: reject a website_id that is empty or has a directory component
    before it is used to build a path under ./tmp.
  * crawl server: skip the upload when ./crawled/<id>.json does not exist, read
    it in binary mode, and build the upload payload once.
  * Hunk headers were recomputed for the new line counts. The blob ids on the
    "index" lines are those of the submitted patch and do not describe this
    post-image.

diff --git a/app.py b/app.py
index 1716814..2057574 100644
--- a/app.py
+++ b/app.py
@@ -562,15 +562,11 @@ def api_complete_task():
 
         if task:
 
-            if "file_list" in request.files:
-                file = request.files['file_list']
-                filename = "./tmp/" + str(task_result.website_id) + ".json"
-                print("Saving temp file " + filename + " ...")
-                file.save(filename)
-                print("Done")
-            else:
-                filename = None
-
+            # The file list now arrives through /api/task/upload. Pass None
+            # when nothing was uploaded, as the removed inline code did.
+            filename = "./tmp/" + str(task_result.website_id) + ".json"
+            if not os.path.exists(filename):
+                filename = None
             taskManager.complete_task(filename, task, task_result, name)
 
             if filename and os.path.exists(filename):
@@ -585,6 +581,48 @@ def api_complete_task():
             return "No such task"
 
 
+@app.route("/api/task/upload", methods=["POST"])
+def api_upload():
+    """Receive one chunk of a crawl server's file list.
+
+    Reads the ``token`` and ``website_id`` form fields and the optional
+    ``file_list`` file part, and stores the chunk in
+    ``./tmp/<website_id>.json``: saved as a new file when none exists,
+    appended otherwise.
+
+    Returns "ok"; aborts with 403 when check_api_token returns a falsy value
+    and with 400 when ``website_id`` is missing or is not a bare file name.
+    """
+    token = request.form.get("token")
+    website_id = request.form.get("website_id")
+    name = db.check_api_token(token)
+
+    if name:
+        # website_id is client-supplied and becomes part of a path: refuse
+        # values that are empty or carry a directory component.
+        if not website_id or os.path.basename(website_id) != website_id:
+            return abort(400)
+
+        if "file_list" in request.files:
+            file = request.files['file_list']
+
+            filename = "./tmp/" + str(website_id) + ".json"
+
+            # NOTE(review): any file already at this path is appended to,
+            # including one left behind by an earlier, unfinished upload.
+            if os.path.exists(filename):
+                print("Appending chunk to existing file...")
+                with open(filename, "ab") as f:
+                    f.write(file.stream.read())
+            else:
+                print("Saving temp file " + filename + " ...")
+                file.save(filename)
+                print("Done")
+        return "ok"
+    else:
+        return abort(403)
+
+
 @app.route("/api/website/by_url", methods=["GET"])
 def api_website_by_url():
     token = request.args.get("token")
diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py
index aa5f71d..e2e6266 100644
--- a/crawl_server/task_manager.py
+++ b/crawl_server/task_manager.py
@@ -45,21 +45,34 @@ class TaskManager:
 
 
         try:
+            logger.info("Uploading file list in small chunks")
+            filename = "./crawled/" + str(task_result.website_id) + ".json"
+            CHUNK_SIZE = 1000000 * 10
+            # Skip the upload when the crawl left no file list; the removed
+            # code likewise sent no file in that case.
+            if os.path.exists(filename):
+                upload_payload = {
+                    "token": config.API_TOKEN,
+                    "website_id": task_result.website_id
+                }
+                # Binary mode: chunks are sized in bytes and sent undecoded.
+                with open(filename, "rb") as f:
+                    chunk = f.read(CHUNK_SIZE)
+                    while chunk:
+                        files = {
+                            "file_list": chunk
+                        }
+
+                        r = requests.post(config.SERVER_URL + "/task/upload", data=upload_payload, files=files)
+                        logger.info("RESPONSE: " + r.text)
+                        chunk = f.read(CHUNK_SIZE)
+
             payload = {
                 "token": config.API_TOKEN,
                 "result": json.dumps(task_result.to_json())
             }
 
-            filename = "./crawled/" + str(task_result.website_id) + ".json"
-            if os.path.exists(filename):
-                files = {
-                    "file_list": open(filename)
-                }
-            else:
-                files = None
-
-            r = requests.post(config.SERVER_URL + "/task/complete", data=payload, files=files)
-
+            r = requests.post(config.SERVER_URL + "/task/complete", data=payload)
             logger.info("RESPONSE: " + r.text)
 
             if os.path.exists(filename):