From 458641654c80c0920d8449da8cd0661019c24f48 Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 8 Aug 2018 21:24:55 -0400 Subject: [PATCH 01/34] Minimal configuration for reddit comment callback --- app.py | 9 +++++-- callbacks.py | 27 ++++++++++++--------- crawl_server/reddit_bot.py => reddit_bot.py | 13 +++++----- 3 files changed, 29 insertions(+), 20 deletions(-) rename crawl_server/reddit_bot.py => reddit_bot.py (82%) diff --git a/app.py b/app.py index 0cb6bf2..27da164 100644 --- a/app.py +++ b/app.py @@ -13,6 +13,7 @@ import config from flask_caching import Cache from tasks import TaskManager, Task, TaskResult from search.search import ElasticSearchEngine +from callbacks import PostCrawlCallbackFactory app = Flask(__name__) if config.CAPTCHA_SUBMIT or config.CAPTCHA_LOGIN: @@ -572,7 +573,11 @@ def api_complete_task(): if filename and os.path.exists(filename): os.remove(filename) - # TODO: handle callback here + # Handle task callback + callback = PostCrawlCallbackFactory.get_callback(task) + if callback: + callback.run(task_result, searchEngine) + return "Successfully logged task result and indexed files" else: @@ -666,7 +671,7 @@ def api_task_enqueue(): request.json["url"], request.json["priority"], request.json["callback_type"], - request.json["callback_args"] + json.dumps(request.json["callback_args"]) ) taskManager.queue_task(task) return "" diff --git a/callbacks.py b/callbacks.py index 89bda6c..fec098a 100644 --- a/callbacks.py +++ b/callbacks.py @@ -1,6 +1,7 @@ -from tasks import Task -from crawl_server.reddit_bot import RedditBot +from tasks import Task, TaskResult +from reddit_bot import RedditBot import praw +from search.search import SearchEngine class PostCrawlCallback: @@ -8,7 +9,7 @@ class PostCrawlCallback: def __init__(self, task: Task): self.task = task - def run(self): + def run(self, task_result: TaskResult, search: SearchEngine): raise NotImplementedError @@ -36,26 +37,30 @@ class RedditCallback(PostCrawlCallback): user_agent='github.com/simon987/od-database (by /u/Hexahedr_n)') self.reddit_bot = RedditBot("crawled.txt", reddit) - def run(self): + def run(self, task_result: TaskResult, search: SearchEngine): raise NotImplementedError class RedditPostCallback(RedditCallback): - def run(self): + def run(self, task_result: TaskResult, search: SearchEngine): print("Reddit post callback for task " + str(self.task)) - pass class RedditCommentCallback(RedditCallback): - def run(self): - print("Reddit comment callback for task " + str(self.task)) - pass + def run(self, task_result: TaskResult, search: SearchEngine): + + comment_id = self.task.callback_args["comment_id"] + print("Replying to comment " + comment_id) + + stats = search.get_stats(self.task.website_id) + message = self.reddit_bot.get_comment(stats, self.task.website_id) + print(message) + self.reddit_bot.reply(self.reddit_bot.reddit.comment(comment_id), message) class DiscordCallback(PostCrawlCallback): - def run(self): + def run(self, task_result: TaskResult, search: SearchEngine): print("Discord callback for task " + str(self.task)) - pass diff --git a/crawl_server/reddit_bot.py b/reddit_bot.py similarity index 82% rename from crawl_server/reddit_bot.py rename to reddit_bot.py index bf3c3e4..bff336f 100644 --- a/crawl_server/reddit_bot.py +++ b/reddit_bot.py @@ -54,14 +54,13 @@ class RedditBot: @staticmethod def get_comment(stats: dict, website_id, message: str = ""): - comment = message + " \n" if len(message) > 0 else "" + comment = message + " \n" if message else "" - for stat in stats: - comment += stat + " 
\n" if len(stat) > 0 else "" - comment += RedditBot.format_stats(stats[stat]) + comment += RedditBot.format_stats(stats) - comment += "[Full Report](https://od-database.simon987.net/website/" + str(website_id) + "/)" - comment += " | [Link list](https://od-database.simon987.net/website/" + str(website_id) + "/links) \n" + comment += "[Full Report](https://od-db.the-eye.eu/website/" + str(website_id) + "/)" + comment += " | [Link list](https://od-db.the-eye.eu/website/" + str(website_id) + "/links)" + comment += " | [Source](https://github.com/simon987/od-database) \n" comment += "*** \n" comment += RedditBot.bottom_line @@ -74,7 +73,7 @@ class RedditBot: result += "File types | Count | Total Size\n" result += ":-- | :-- | :-- \n" counter = 0 - for mime in stats["mime_stats"]: + for mime in stats["ext_stats"]: result += mime[2] result += " | " + str(mime[1]) result += " | " + humanfriendly.format_size(mime[0]) + " \n" From 89e378ffd9610a3caa12924ed40f01023e391f89 Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 8 Aug 2018 22:41:25 -0400 Subject: [PATCH 02/34] Reddit comment callback is not an edit instead of a new comment --- callbacks.py | 4 ++-- reddit_bot.py | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/callbacks.py b/callbacks.py index fec098a..a238cdb 100644 --- a/callbacks.py +++ b/callbacks.py @@ -52,12 +52,12 @@ class RedditCommentCallback(RedditCallback): def run(self, task_result: TaskResult, search: SearchEngine): comment_id = self.task.callback_args["comment_id"] - print("Replying to comment " + comment_id) + print("Editing comment comment " + comment_id) stats = search.get_stats(self.task.website_id) message = self.reddit_bot.get_comment(stats, self.task.website_id) print(message) - self.reddit_bot.reply(self.reddit_bot.reddit.comment(comment_id), message) + self.reddit_bot.edit(self.reddit_bot.reddit.comment(comment_id), message) class DiscordCallback(PostCrawlCallback): diff --git a/reddit_bot.py b/reddit_bot.py index bff336f..26defae 100644 --- a/reddit_bot.py +++ b/reddit_bot.py @@ -41,17 +41,28 @@ class RedditBot: while True: try: - # Double check has_crawled if not self.has_crawled(reddit_obj.id): - reddit_obj.reply(comment) + reply = reddit_obj.reply(comment) self.log_crawl(reddit_obj.id) print("Reply to " + reddit_obj.id) + return reply break except Exception as e: print("Waiting 5 minutes: " + str(e)) time.sleep(300) continue + def edit(self, reddit_comment, new_message): + + while True: + try: + reddit_comment.edit(new_message) + print("Edit comment " + reddit_comment.id) + except Exception as e: + print("Waiting 5 minutes: " + str(e)) + time.sleep(300) + continue + @staticmethod def get_comment(stats: dict, website_id, message: str = ""): comment = message + " \n" if message else "" From 88166054ad397bc5e14f34bdd69a35b53fe56455 Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 8 Aug 2018 23:07:09 -0400 Subject: [PATCH 03/34] od_util can be used when od-database is a submodule --- od_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/od_util.py b/od_util.py index 00057e0..dc1e5e2 100644 --- a/od_util.py +++ b/od_util.py @@ -5,7 +5,7 @@ import os import validators import re from ftplib import FTP -import config +from . 
import config import urllib3 urllib3.disable_warnings() From f729b462f0dd7108fe27b1f81765147b6d7d8a7a Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 8 Aug 2018 23:31:50 -0400 Subject: [PATCH 04/34] od_util can be used when od-database is a submodule part 2 --- app.py | 1 - callbacks.py | 4 ++++ od_util.py | 7 ++++++- reddit_bot.py | 1 + 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/app.py b/app.py index 27da164..c8ba6f5 100644 --- a/app.py +++ b/app.py @@ -5,7 +5,6 @@ from urllib.parse import urlparse import os import time import datetime -import itertools from database import Database, Website, InvalidQueryException from flask_recaptcha import ReCaptcha import od_util diff --git a/callbacks.py b/callbacks.py index a238cdb..1965b82 100644 --- a/callbacks.py +++ b/callbacks.py @@ -2,6 +2,7 @@ from tasks import Task, TaskResult from reddit_bot import RedditBot import praw from search.search import SearchEngine +import json class PostCrawlCallback: @@ -9,6 +10,9 @@ class PostCrawlCallback: def __init__(self, task: Task): self.task = task + if self.task.callback_args: + self.task.callback_args = json.loads(self.task.callback_args) + def run(self, task_result: TaskResult, search: SearchEngine): raise NotImplementedError diff --git a/od_util.py b/od_util.py index dc1e5e2..e74f377 100644 --- a/od_util.py +++ b/od_util.py @@ -5,7 +5,12 @@ import os import validators import re from ftplib import FTP -from . import config + +# TODO: find a better way to do this +try: + from . import config +except ImportError: + import config import urllib3 urllib3.disable_warnings() diff --git a/reddit_bot.py b/reddit_bot.py index 26defae..3f15c5f 100644 --- a/reddit_bot.py +++ b/reddit_bot.py @@ -58,6 +58,7 @@ class RedditBot: try: reddit_comment.edit(new_message) print("Edit comment " + reddit_comment.id) + break except Exception as e: print("Waiting 5 minutes: " + str(e)) time.sleep(300) From 8ffd9179d2d09a657c003d797e9090d385e0580c Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 9 Aug 2018 14:26:22 -0400 Subject: [PATCH 05/34] Increased stats timeout value --- app.py | 33 +++++++++++++++++++++++++++++++++ callbacks.py | 4 +++- search/search.py | 18 +++++++++--------- 3 files changed, 45 insertions(+), 10 deletions(-) diff --git a/app.py b/app.py index c8ba6f5..b841b5c 100644 --- a/app.py +++ b/app.py @@ -716,5 +716,38 @@ def api_random_website(): return abort(403) +@app.route("/api/search", methods=["POST"]) +def api_search(): + + try: + token = request.json["token"] + except KeyError: + return abort(400) + + name = db.check_api_token(token) + + if name: + + try: + hits = searchEngine.search( + request.json["query"], + request.json["page"], request.json["per_page"], + request.json["sort_order"], + request.json["extensions"], + request.json["size_min"], request.json["size_max"], + request.json["match_all"], + request.json["fields"], + request.json["date_min"], request.json["date_max"] + ) + + hits = db.join_website_on_search_result(hits) + return json.dumps(hits) + + except InvalidQueryException as e: + return str(e) + else: + return abort(403) + + if __name__ == '__main__': app.run("0.0.0.0", port=12345, threaded=True) diff --git a/callbacks.py b/callbacks.py index 1965b82..f749ee1 100644 --- a/callbacks.py +++ b/callbacks.py @@ -59,7 +59,9 @@ class RedditCommentCallback(RedditCallback): print("Editing comment comment " + comment_id) stats = search.get_stats(self.task.website_id) - message = self.reddit_bot.get_comment(stats, self.task.website_id) + message = self.reddit_bot.get_comment(stats, 
self.task.website_id, + message="There you go! This website was crawled in `" + + str(int(task_result.end_time - task_result.start_time)) + "s`") print(message) self.reddit_bot.edit(self.reddit_bot.reddit.comment(comment_id), message) diff --git a/search/search.py b/search/search.py index fac8dd5..5c1efa8 100644 --- a/search/search.py +++ b/search/search.py @@ -311,7 +311,7 @@ class ElasticSearchEngine(SearchEngine): }, "size": 0 - }, index=self.index_name, request_timeout=120) + }, index=self.index_name, request_timeout=240) total_stats = self.es.search(body={ "query": { @@ -333,7 +333,7 @@ class ElasticSearchEngine(SearchEngine): }, "size": 0 - }, index=self.index_name, request_timeout=120) + }, index=self.index_name, request_timeout=241) size_and_date_histogram = self.es.search(body={ "query": { @@ -354,21 +354,21 @@ class ElasticSearchEngine(SearchEngine): "sizes": { "histogram": { "field": "size", - "interval": 50000000, # 50Mb - "min_doc_count": 100 + "interval": 100000000, # 100Mb + "min_doc_count": 500 } }, "dates": { "date_histogram": { "field": "mtime", "interval": "1y", - "min_doc_count": 100, + "min_doc_count": 500, "format": "yyyy" } } }, "size": 0 - }, index=self.index_name, request_timeout=120) + }, index=self.index_name, request_timeout=242) website_scatter = self.es.search(body={ "query": { @@ -384,7 +384,7 @@ class ElasticSearchEngine(SearchEngine): "websites": { "terms": { "field": "website_id", - "size": 500 # TODO: Figure out what size is appropriate + "size": 400 # TODO: Figure out what size is appropriate }, "aggs": { "size": { @@ -396,9 +396,9 @@ class ElasticSearchEngine(SearchEngine): } }, "size": 0 - }, index=self.index_name, request_timeout=120) + }, index=self.index_name, request_timeout=243) - es_stats = self.es.indices.stats(self.index_name, request_timeout=120) + es_stats = self.es.indices.stats(self.index_name, request_timeout=244) stats = dict() stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"] From ffeed4192e723ce991c2ec1c50f1db78f6928ad5 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 9 Aug 2018 16:19:21 -0400 Subject: [PATCH 06/34] Refresh index before reddit comment callback --- callbacks.py | 1 + search/search.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/callbacks.py b/callbacks.py index f749ee1..647c963 100644 --- a/callbacks.py +++ b/callbacks.py @@ -58,6 +58,7 @@ class RedditCommentCallback(RedditCallback): comment_id = self.task.callback_args["comment_id"] print("Editing comment comment " + comment_id) + search.refresh() # Make sure the newly indexed documents are available before commenting stats = search.get_stats(self.task.website_id) message = self.reddit_bot.get_comment(stats, self.task.website_id, message="There you go! 
This website was crawled in `" + diff --git a/search/search.py b/search/search.py index 5c1efa8..16439e2 100644 --- a/search/search.py +++ b/search/search.py @@ -31,6 +31,9 @@ class SearchEngine: def get_stats(self, website_id: int, subdir: str = None): raise NotImplementedError + def refresh(self): + raise NotImplementedError + class ElasticSearchEngine(SearchEngine): SORT_ORDERS = { @@ -460,3 +463,6 @@ class ElasticSearchEngine(SearchEngine): for website in websites: if website not in non_empty_websites: yield website + + def refresh(self): + self.es.indices.refresh(self.index_name) From 5a084cb8574252b4f10a6b8627fd95a80c8233ec Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 9 Aug 2018 17:12:43 -0400 Subject: [PATCH 07/34] Queue can be emptied more easily --- database.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/database.py b/database.py index 51edd67..948de9f 100644 --- a/database.py +++ b/database.py @@ -407,12 +407,12 @@ class Database: cursor = conn.cursor() cursor.execute("SELECT id, website_id, url, priority, callback_type, callback_args FROM " - "Queue WHERE website_id=? AND assigned_crawler=?", (website_id, name)) + "Queue WHERE website_id=?", (website_id, name)) task = cursor.fetchone() if task: - cursor.execute("DELETE FROM Queue WHERE website_id=? AND assigned_crawler=?", (website_id, name)) + cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id, name)) conn.commit() return Task(task[1], task[2], task[3], task[4], task[5]) else: From 42d858b62a33e2677313b41f01e33985e2215b10 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 9 Aug 2018 17:14:17 -0400 Subject: [PATCH 08/34] Queue can be emptied more easily pt.2 --- database.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/database.py b/database.py index 948de9f..252a30f 100644 --- a/database.py +++ b/database.py @@ -407,12 +407,12 @@ class Database: cursor = conn.cursor() cursor.execute("SELECT id, website_id, url, priority, callback_type, callback_args FROM " - "Queue WHERE website_id=?", (website_id, name)) + "Queue WHERE website_id=?", (website_id, )) task = cursor.fetchone() if task: - cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id, name)) + cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id, )) conn.commit() return Task(task[1], task[2], task[3], task[4], task[5]) else: From faeff701dee8b9660fd850a736593c40b4d89219 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 9 Aug 2018 18:33:35 -0400 Subject: [PATCH 09/34] Increased search timeout value --- reddit_bot.py | 2 +- search/search.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/reddit_bot.py b/reddit_bot.py index 3f15c5f..66f03f6 100644 --- a/reddit_bot.py +++ b/reddit_bot.py @@ -72,7 +72,7 @@ class RedditBot: comment += "[Full Report](https://od-db.the-eye.eu/website/" + str(website_id) + "/)" comment += " | [Link list](https://od-db.the-eye.eu/website/" + str(website_id) + "/links)" - comment += " | [Source](https://github.com/simon987/od-database) \n" + comment += " | [Source](https://github.com/simon987) \n" comment += "*** \n" comment += RedditBot.bottom_line diff --git a/search/search.py b/search/search.py index 16439e2..4631e53 100644 --- a/search/search.py +++ b/search/search.py @@ -214,7 +214,7 @@ class ElasticSearchEngine(SearchEngine): } }, "size": per_page, "from": min(page * per_page, 10000 - per_page)}, - index=self.index_name, request_timeout=30) + index=self.index_name, request_timeout=60) return page From 
a6b1d9cba3ba8236a93c72f3ea7eddcc4887c573 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 9 Aug 2018 21:43:07 -0400 Subject: [PATCH 10/34] More help when no search results --- templates/search.html | 1 + 1 file changed, 1 insertion(+) diff --git a/templates/search.html b/templates/search.html index 8bb560f..f9dbf7a 100644 --- a/templates/search.html +++ b/templates/search.html @@ -174,6 +174,7 @@
  • Try checking the 'Match any word' box for a broader search.
  • Make sure you don't include the file extension in your query (Use the appropriate field to filter file types)
+  • If you're searching for files in a particular website, use the website search page
From c94cf5b3131db5860303dd72c0a1e6c940fd673f Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 10 Aug 2018 11:46:16 -0400 Subject: [PATCH 11/34] Adjusted timeout values (again) --- search/search.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/search/search.py b/search/search.py index 4631e53..0249291 100644 --- a/search/search.py +++ b/search/search.py @@ -214,7 +214,7 @@ class ElasticSearchEngine(SearchEngine): } }, "size": per_page, "from": min(page * per_page, 10000 - per_page)}, - index=self.index_name, request_timeout=60) + index=self.index_name, request_timeout=35) return page @@ -232,7 +232,7 @@ class ElasticSearchEngine(SearchEngine): "ext_group": { "terms": { "field": "ext", - "size": 20 + "size": 12 }, "aggs": { "size": { @@ -249,7 +249,7 @@ class ElasticSearchEngine(SearchEngine): } }, "size": 0 - }, index=self.index_name, request_timeout=20) + }, index=self.index_name, request_timeout=30) stats = dict() stats["total_size"] = result["aggregations"]["total_size"]["value"] From c29af180c58ead26f16c8bf2f031d1e62c064575 Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 10 Aug 2018 12:46:40 -0400 Subject: [PATCH 12/34] Captcha for searches --- README.md | 3 + app.py | 134 +++++++++++++++++++++++------------------- templates/home.html | 18 ++++-- templates/search.html | 11 +++- 4 files changed, 100 insertions(+), 66 deletions(-) diff --git a/README.md b/README.md index a10a2bd..859d3e8 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,11 @@ Create `/config.py` and fill out the parameters. Sample config: # Leave default values for no CAPTCHAs CAPTCHA_LOGIN = False CAPTCHA_SUBMIT = False +CAPTCHA_SEARCH = False CAPTCHA_SITE_KEY = "" CAPTCHA_SECRET_KEY = "" +CAPTCHA_S_SITE_KEY = "" +CAPTCHA_S_SECRET_KEY = "" # Flask secret key for sessions FLASK_SECRET = "" diff --git a/app.py b/app.py index b841b5c..f99417c 100644 --- a/app.py +++ b/app.py @@ -21,6 +21,12 @@ if config.CAPTCHA_SUBMIT or config.CAPTCHA_LOGIN: secret_key=config.CAPTCHA_SECRET_KEY) else: recaptcha = None +if config.CAPTCHA_SEARCH: + recaptcha_search = ReCaptcha(app=app, + site_key=config.CAPTCHA_S_SITE_KEY, + secret_key=config.CAPTCHA_S_SECRET_KEY) +else: + recaptcha_search = None app.secret_key = config.FLASK_SECRET db = Database("db.sqlite3") cache = Cache(app, config={'CACHE_TYPE': 'simple'}) @@ -243,79 +249,86 @@ def admin_rescan_website(website_id): @app.route("/search") def search(): - q = request.args.get("q") if "q" in request.args else "" - sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score" - page = request.args.get("p") if "p" in request.args else "0" - page = int(page) if page.isdigit() else 0 + q = request.args.get("q") if "q" in request.args else "" + sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score" - per_page = request.args.get("per_page") if "per_page" in request.args else "50" - per_page = int(per_page) if per_page.isdigit() else "50" - per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50 + page = request.args.get("p") if "p" in request.args else "0" + page = int(page) if page.isdigit() else 0 - extensions = request.args.get("ext") if "ext" in request.args else None - extensions = [ext.strip().strip(".").lower() for ext in extensions.split(",")] if extensions else [] + per_page = request.args.get("per_page") if "per_page" in request.args else "50" + per_page = int(per_page) if per_page.isdigit() else "50" + per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50 - size_min = 
request.args.get("size_min") if "size_min" in request.args else "size_min" - size_min = int(size_min) if size_min.isdigit() else 0 - size_max = request.args.get("size_max") if "size_max" in request.args else "size_max" - size_max = int(size_max) if size_max.isdigit() else 0 + extensions = request.args.get("ext") if "ext" in request.args else None + extensions = [ext.strip().strip(".").lower() for ext in extensions.split(",")] if extensions else [] - date_min = request.args.get("date_min") if "date_min" in request.args else "date_min" - date_min = int(date_min) if date_min.isdigit() else 0 - date_max = request.args.get("date_max") if "date_max" in request.args else "date_max" - date_max = int(date_max) if date_max.isdigit() else 0 + size_min = request.args.get("size_min") if "size_min" in request.args else "size_min" + size_min = int(size_min) if size_min.isdigit() else 0 + size_max = request.args.get("size_max") if "size_max" in request.args else "size_max" + size_max = int(size_max) if size_max.isdigit() else 0 - match_all = "all" in request.args + date_min = request.args.get("date_min") if "date_min" in request.args else "date_min" + date_min = int(date_min) if date_min.isdigit() else 0 + date_max = request.args.get("date_max") if "date_max" in request.args else "date_max" + date_max = int(date_max) if date_max.isdigit() else 0 - field_name = "field_name" in request.args - field_trigram = "field_trigram" in request.args - field_path = "field_path" in request.args + match_all = "all" in request.args - if not field_name and not field_trigram and not field_path: - # If no fields are selected, search in all - field_name = field_path = field_trigram = True + field_name = "field_name" in request.args + field_trigram = "field_trigram" in request.args + field_path = "field_path" in request.args - fields = [] - if field_path: - fields.append("path") - if field_name: - fields.append("name^5") - if field_trigram: - fields.append("name.nGram^2") + if not field_name and not field_trigram and not field_path: + # If no fields are selected, search in all + field_name = field_path = field_trigram = True - if len(q) >= 3: + fields = [] + if field_path: + fields.append("path") + if field_name: + fields.append("name^5") + if field_trigram: + fields.append("name.nGram^2") - db.log_search(request.remote_addr, - request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None, - q, extensions, page) + if len(q) >= 3: - try: - hits = searchEngine.search(q, page, per_page, sort_order, - extensions, size_min, size_max, match_all, fields, date_min, date_max) - hits = db.join_website_on_search_result(hits) - except InvalidQueryException as e: - flash("Invalid query: " + str(e), "warning") - return redirect("/search") - except Exception: - flash("Query failed, this could mean that the search server is overloaded or is not reachable. 
" - "Please try again later", "danger") + response = request.args.get("g-recaptcha-response", "") + if not config.CAPTCHA_SEARCH or recaptcha_search.verify(response): + db.log_search(request.remote_addr, + request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None, + q, extensions, page) + + try: + hits = searchEngine.search(q, page, per_page, sort_order, + extensions, size_min, size_max, match_all, fields, date_min, date_max) + hits = db.join_website_on_search_result(hits) + except InvalidQueryException as e: + flash("Invalid query: " + str(e), "warning") + return redirect("/search") + except Exception: + flash("Query failed, this could mean that the search server is overloaded or is not reachable. " + "Please try again later", "danger") + hits = None + else: + flash("Error: Invalid captcha please try again", "danger") + hits = None + + else: hits = None - else: - hits = None - - return render_template("search.html", - results=hits, - q=q, - p=page, per_page=per_page, - sort_order=sort_order, - results_set=config.RESULTS_PER_PAGE, - extensions=",".join(extensions), - size_min=size_min, size_max=size_max, - match_all=match_all, - field_trigram=field_trigram, field_path=field_path, field_name=field_name, - date_min=date_min, date_max=date_max) + return render_template("search.html", + results=hits, + q=q, + p=page, per_page=per_page, + sort_order=sort_order, + results_set=config.RESULTS_PER_PAGE, + extensions=",".join(extensions), + size_min=size_min, size_max=size_max, + match_all=match_all, + field_trigram=field_trigram, field_path=field_path, field_name=field_name, + date_min=date_min, date_max=date_max, + show_captcha=config.CAPTCHA_SEARCH, recaptcha=recaptcha_search) @app.route("/contribute") @@ -331,7 +344,8 @@ def home(): stats["website_count"] = len(db.get_all_websites()) except: stats = {} - return render_template("home.html", stats=stats) + return render_template("home.html", stats=stats, + show_captcha=config.CAPTCHA_SEARCH, recaptcha=recaptcha_search) @app.route("/submit") diff --git a/templates/home.html b/templates/home.html index 33881e5..29d30b4 100644 --- a/templates/home.html +++ b/templates/home.html @@ -23,13 +23,23 @@
                     Search
 [home.html search form hunk, markup not preserved: when show_captcha is enabled, the Search button is wrapped in a {% if show_captcha %} reCAPTCHA block, with the plain button kept in the {% else %} branch]
diff --git a/templates/search.html b/templates/search.html index f9dbf7a..a6a9d34 100644 --- a/templates/search.html +++ b/templates/search.html @@ -9,7 +9,7 @@
                     Search
 [search page heading/form hunk, markup not preserved]
@@ -92,7 +92,14 @@
             {# Search button #}
 [search button hunk, markup not preserved: the submit button is wrapped in a {% if show_captcha %} reCAPTCHA block, with the plain button kept in the {% else %} branch]
From aab1abba5472f660514729aeb89ffa54c10654be Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 10 Aug 2018 15:24:43 -0400 Subject: [PATCH 13/34] Fixed websites link --- templates/layout.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/layout.html b/templates/layout.html index b6cefb3..2513392 100644 --- a/templates/layout.html +++ b/templates/layout.html @@ -32,7 +32,7 @@
 [layout.html navbar hunk, markup not preserved: fixes the websites link]
{# Size #} @@ -90,15 +91,16 @@ + {# Search button #}
{% if show_captcha %} - + {% else %} - + {% endif %}
@@ -163,12 +165,10 @@ {% if results["hits"]["total"] > (p + 1) * per_page %} - Next + {% endif %} {% if p > 0 %} - Previous + {% endif %} @@ -253,6 +253,16 @@ } }); + //Next button + function nextPage() { + document.getElementById("page").value = parseInt(document.getElementById("page").value) + 1; + document.getElementById("s").click(); + } + function prevPage() { + document.getElementById("page").value = parseInt(document.getElementById("page").value) - 1; + document.getElementById("s").click(); + } + From cc4c70f4004f7c70331a8c69d663d83a6706c2cf Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 11 Aug 2018 13:05:24 -0400 Subject: [PATCH 16/34] Request content is read all at once --- crawl_server/remote_http.py | 29 ++++++++++------------------- test/files/apache_table.html | 21 +++++++++++++++++++++ test/webserver.py | 13 +++++++++++++ 3 files changed, 44 insertions(+), 19 deletions(-) create mode 100644 test/files/apache_table.html create mode 100644 test/webserver.py diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 368aaac..37cf3c1 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -104,7 +104,7 @@ class HttpDirectory(RemoteDirectory): current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1] path_identifier = hashlib.md5(current_dir_name.encode()) path_url = urljoin(self.base_url, path, "") - body = self._stream_body(path_url) + body = self._fetch_body(path_url) anchors = self._parse_links(body) urls_to_request = [] @@ -176,19 +176,16 @@ class HttpDirectory(RemoteDirectory): logger.debug("TimeoutError - _request_file") raise TimeoutError - def _stream_body(self, url: str): + def _fetch_body(self, url: str): retries = HttpDirectory.MAX_RETRIES while retries > 0: try: - r = self.session.get(url, stream=True, timeout=HttpDirectory.TIMEOUT) - for chunk in r.iter_content(chunk_size=8192): - try: - yield chunk.decode(r.encoding if r.encoding else "utf-8", errors="ignore") - except LookupError: - # Unsupported encoding - yield chunk.decode("utf-8", errors="ignore") - r.close() - return + r = self.session.get(url, timeout=HttpDirectory.TIMEOUT) + try: + return r.content.decode(r.encoding if r.encoding else "utf-8", errors="ignore") + except LookupError: + # Unsupported encoding + return r.content.decode("utf-8", errors="ignore") except RequestException: self.session.close() retries -= 1 @@ -200,14 +197,8 @@ class HttpDirectory(RemoteDirectory): def _parse_links(body): parser = HTMLAnchorParser() - anchors = [] - - for chunk in body: - parser.feed(chunk) - for anchor in parser.anchors: - anchors.append(anchor) - - return anchors + parser.feed(body) + return parser.anchors @staticmethod def _isdir(link: Anchor): diff --git a/test/files/apache_table.html b/test/files/apache_table.html new file mode 100644 index 0000000..06e21c9 --- /dev/null +++ b/test/files/apache_table.html @@ -0,0 +1,21 @@ + + + + Index of /Public/bootstrap + + +

Index of /Public/bootstrap

[ICO]  Name                     Last modified      Size   Description

[PARENTDIR]Parent Directory   -  
[   ]bower.json 2017-04-05 01:45 1.0K 
[DIR]css/ 2017-09-07 18:03 -  
[DIR]image/ 2017-09-07 18:03 -  
[DIR]js/ 2017-09-07 18:03 -  
[DIR]less/ 2017-09-07 18:03 -  
[   ]package.json 2017-04-05 01:45 666  

+ + diff --git a/test/webserver.py b/test/webserver.py new file mode 100644 index 0000000..a3a1c14 --- /dev/null +++ b/test/webserver.py @@ -0,0 +1,13 @@ +from flask import Flask, send_file + +app = Flask(__name__) + + +@app.route("/test1/") +def test1(): + return send_file("files/apache_table.html") + + +if __name__ == '__main__': + app.run("0.0.0.0", port=8888, threaded=True) + From edede200f4f447003b3171354599a34468f4ba21 Mon Sep 17 00:00:00 2001 From: Simon Date: Sun, 12 Aug 2018 14:58:27 -0400 Subject: [PATCH 17/34] Decresed number of indexed documents per second --- search/search.py | 4 +-- test/files/lighttpd_table.html | 47 ++++++++++++++++++++++++++++++++++ test/files/nginx_pre.html | 11 ++++++++ 3 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 test/files/lighttpd_table.html create mode 100644 test/files/nginx_pre.html diff --git a/search/search.py b/search/search.py index 6465de9..fe7b85d 100644 --- a/search/search.py +++ b/search/search.py @@ -125,8 +125,8 @@ class ElasticSearchEngine(SearchEngine): def import_json(self, in_lines, website_id: int): - import_every = 1000 - cooldown_time = 1 + import_every = 400 + cooldown_time = 0.6 docs = [] diff --git a/test/files/lighttpd_table.html b/test/files/lighttpd_table.html new file mode 100644 index 0000000..fe58d9f --- /dev/null +++ b/test/files/lighttpd_table.html @@ -0,0 +1,47 @@ + + + + + Index of /gentoo/releases/ + + + +

Index of /gentoo/releases/

+
Name                      Last Modified       Size   Type
Parent Directory/ -  Directory
alpha/2009-Aug-09 03:47:09-  Directory
amd64/2017-Feb-09 18:50:44-  Directory
arm/2014-Apr-29 13:42:06-  Directory
hppa/2014-Apr-29 13:42:12-  Directory
ia64/2009-Aug-09 03:47:09-  Directory
mips/2011-Apr-28 23:38:14-  Directory
ppc/2014-Apr-29 13:41:00-  Directory
s390/2014-Apr-29 13:41:06-  Directory
sh/2014-Apr-29 13:41:16-  Directory
snapshots/2009-Apr-16 05:08:17-  Directory
sparc/2009-Aug-09 03:47:09-  Directory
x86/2016-Jul-04 21:14:19-  Directory
README2014-Jun-22 05:18:430.1Kapplication/octet-stream
verify-digests.sh2016-Jun-10 02:40:334.5Kapplication/octet-stream
+
+
lighttpd/1.4.29
+ + diff --git a/test/files/nginx_pre.html b/test/files/nginx_pre.html new file mode 100644 index 0000000..5bbd35e --- /dev/null +++ b/test/files/nginx_pre.html @@ -0,0 +1,11 @@ + +Index of /test/To process/Android nak newer/ + +

Index of /test/To process/Android nak newer/


../
+DCIM/                                              31-Jul-2018 00:26                   -
+Pictures/                                          31-Jul-2018 00:26                   -
+1529682937580.webm                                 25-Jun-2018 03:58             3768511
+1529716051300.webm                                 25-Jun-2018 04:01             3181867
+1529725898345.webm                                 25-Jun-2018 04:05             4138908
+

+ From 5c386707edf9b55bcb4bc8d6367ca949816cdcc3 Mon Sep 17 00:00:00 2001 From: Simon Date: Mon, 13 Aug 2018 14:03:22 -0400 Subject: [PATCH 18/34] Should fix import error --- od_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/od_util.py b/od_util.py index e74f377..3407fc9 100644 --- a/od_util.py +++ b/od_util.py @@ -9,7 +9,7 @@ from ftplib import FTP # TODO: find a better way to do this try: from . import config -except ImportError: +except (ImportError, SystemError): import config import urllib3 From c92f2f493782baf55b7c0cbc8ea89a40ae32b19e Mon Sep 17 00:00:00 2001 From: Simon Date: Tue, 14 Aug 2018 12:21:34 -0400 Subject: [PATCH 19/34] Should fix export problem --- app.py | 1 + search/search.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/app.py b/app.py index f99417c..8e97e13 100644 --- a/app.py +++ b/app.py @@ -36,6 +36,7 @@ app.jinja_env.globals.update(get_mime=od_util.get_category) taskManager = TaskManager() searchEngine = ElasticSearchEngine("od-database") +searchEngine.start_stats_scheduler() @app.template_filter("date_format") diff --git a/search/search.py b/search/search.py index fe7b85d..5555b03 100644 --- a/search/search.py +++ b/search/search.py @@ -50,12 +50,14 @@ class ElasticSearchEngine(SearchEngine): self.index_name = index_name self.es = elasticsearch.Elasticsearch() + if not self.es.indices.exists(self.index_name): + self.init() + + def start_stats_scheduler(self): scheduler = BackgroundScheduler() scheduler.add_job(self._generate_global_stats, "interval", seconds=60 * 120) scheduler.start() - if not self.es.indices.exists(self.index_name): - self.init() def init(self): print("Elasticsearch first time setup") From bbe8ed07a8d9221bd31242be42e642592a6c57a6 Mon Sep 17 00:00:00 2001 From: Simon Date: Tue, 14 Aug 2018 16:20:00 -0400 Subject: [PATCH 20/34] Reset page number on search --- search/search.py | 1 - templates/search.html | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/search/search.py b/search/search.py index 5555b03..90f0bd3 100644 --- a/search/search.py +++ b/search/search.py @@ -58,7 +58,6 @@ class ElasticSearchEngine(SearchEngine): scheduler.add_job(self._generate_global_stats, "interval", seconds=60 * 120) scheduler.start() - def init(self): print("Elasticsearch first time setup") if self.es.indices.exists(self.index_name): diff --git a/templates/search.html b/templates/search.html index e247850..869832b 100644 --- a/templates/search.html +++ b/templates/search.html @@ -96,7 +96,10 @@
{% if show_captcha %} - + {% else %} From 6d27cbca02c3b5280f899a704b5bf514d9e18b5e Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 15 Aug 2018 11:32:36 -0400 Subject: [PATCH 21/34] xz -> lzma for export --- app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app.py b/app.py index 8e97e13..b821f79 100644 --- a/app.py +++ b/app.py @@ -57,7 +57,7 @@ def from_timestamp(value): @app.route("/dl") def downloads(): try: - export_file_stats = os.stat("static/out.csv.xz") + export_file_stats = os.stat("static/out.csv.lzma") except FileNotFoundError: print("No export file") export_file_stats = None From a2327bac7c8e4818069f2b3254d321a50ba074d8 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 16 Aug 2018 13:13:34 -0400 Subject: [PATCH 22/34] Bug fix for pages buttons --- templates/search.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/search.html b/templates/search.html index 869832b..ee84b95 100644 --- a/templates/search.html +++ b/templates/search.html @@ -97,7 +97,7 @@ {% if show_captcha %} From 8f218f3c9dbd64b9508975e08e57ef5ab56d4ec1 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 16 Aug 2018 13:24:00 -0400 Subject: [PATCH 23/34] Bug fix for pages buttons pt.2 --- templates/search.html | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/templates/search.html b/templates/search.html index ee84b95..f3437d5 100644 --- a/templates/search.html +++ b/templates/search.html @@ -97,11 +97,10 @@ {% if show_captcha %} - + {% else %} {% endif %} @@ -259,11 +258,11 @@ //Next button function nextPage() { document.getElementById("page").value = parseInt(document.getElementById("page").value) + 1; - document.getElementById("s").click(); + grecaptcha.execute(); } function prevPage() { document.getElementById("page").value = parseInt(document.getElementById("page").value) - 1; - document.getElementById("s").click(); + grecaptcha.execute(); } From 85c3aa918dd703766080600ed2e33bd10cff782a Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 11:47:09 -0400 Subject: [PATCH 24/34] replaced requests by pycurl --- crawl_server/remote_http.py | 94 +++++++++++++++++++++++++++---------- requirements.txt | 4 +- search/search.py | 2 +- 3 files changed, 72 insertions(+), 28 deletions(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 37cf3c1..2d60728 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -1,14 +1,16 @@ +import pycurl +from io import BytesIO + from crawl_server import logger from urllib.parse import unquote, urljoin import os from html.parser import HTMLParser from itertools import repeat from crawl_server.crawler import RemoteDirectory, File -import requests -from requests.exceptions import RequestException from multiprocessing.pool import ThreadPool import config from dateutil.parser import parse as parse_date +from pycurl import Curl import hashlib import urllib3 @@ -94,10 +96,29 @@ class HttpDirectory(RemoteDirectory): def __init__(self, url): super().__init__(url) - self.session = requests.Session() - self.session.headers = HttpDirectory.HEADERS - self.session.verify = False - self.session.max_redirects = 1 + self.curl = None + self.curl_head = None + self.init_curl() + + def init_curl(self): + + self.curl = Curl() + self.curl.setopt(self.curl.SSL_VERIFYPEER, 0) + self.curl.setopt(self.curl.SSL_VERIFYHOST, 0) + self.curl.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT) + + self.curl_head = self._curl_handle() + + def _curl_handle(self): + + curl_head = Curl() + 
curl_head.setopt(self.curl.SSL_VERIFYPEER, 0) + curl_head.setopt(self.curl.SSL_VERIFYHOST, 0) + curl_head.setopt(pycurl.NOBODY, 1) + curl_head.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT) + + return curl_head + def list_dir(self, path): @@ -139,7 +160,8 @@ class HttpDirectory(RemoteDirectory): if len(urls_to_request) > 150: # Many urls, use multi-threaded solution pool = ThreadPool(processes=10) - files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request)) + handles = [self._curl_handle() for _ in range(len(urls_to_request))] + files = pool.starmap(self._request_file, zip(handles, urls_to_request, repeat(self.base_url))) pool.close() for file in files: if file: @@ -147,31 +169,38 @@ class HttpDirectory(RemoteDirectory): else: # Too few urls to create thread pool for url in urls_to_request: - file = self._request_file(url) + file = self._request_file(self.curl_head, url, self.base_url) if file: yield file - def _request_file(self, url): + @staticmethod + def _request_file(curl, url, base_url): retries = HttpDirectory.MAX_RETRIES while retries > 0: try: - r = self.session.head(url, allow_redirects=False, timeout=HttpDirectory.TIMEOUT) + raw_headers = BytesIO() + curl.setopt(pycurl.URL, url) + curl.setopt(pycurl.HEADERFUNCTION, raw_headers.write) + curl.perform() - stripped_url = url[len(self.base_url) - 1:] + stripped_url = url[len(base_url) - 1:] + headers = HttpDirectory._parse_dict_header(raw_headers.getvalue().decode("utf-8", errors="ignore")) + raw_headers.close() path, name = os.path.split(stripped_url) - date = r.headers.get("Last-Modified", "1970-01-01") + date = headers.get("Last-Modified", "1970-01-01") return File( path=unquote(path).strip("/"), name=unquote(name), - size=int(r.headers.get("Content-Length", -1)), + size=int(headers.get("Content-Length", -1)), mtime=int(parse_date(date).timestamp()), is_dir=False ) - except RequestException: - self.session.close() + except pycurl.error as e: + curl.close() retries -= 1 + raise e logger.debug("TimeoutError - _request_file") raise TimeoutError @@ -180,17 +209,19 @@ class HttpDirectory(RemoteDirectory): retries = HttpDirectory.MAX_RETRIES while retries > 0: try: - r = self.session.get(url, timeout=HttpDirectory.TIMEOUT) - try: - return r.content.decode(r.encoding if r.encoding else "utf-8", errors="ignore") - except LookupError: - # Unsupported encoding - return r.content.decode("utf-8", errors="ignore") - except RequestException: - self.session.close() - retries -= 1 + content = BytesIO() + self.curl.setopt(pycurl.URL, url) + self.curl.setopt(pycurl.WRITEDATA, content) + self.curl.perform() - logger.debug("TimeoutError - _stream_body") + return content.getvalue().decode("utf-8", errors="ignore") + except pycurl.error as e: + self.curl.close() + retries -= 1 + print(e) + raise e + + logger.debug("TimeoutError - _fetch_body") raise TimeoutError @staticmethod @@ -222,8 +253,19 @@ class HttpDirectory(RemoteDirectory): if "?" 
in link.href: return True + @staticmethod + def _parse_dict_header(raw): + headers = dict() + for line in raw.split("\r\n")[1:]: # Ignore first 'HTTP/1.0 200 OK' line + if line: + k, v = line.split(":", maxsplit=1) + headers[k.strip()] = v.strip() + + return headers + def close(self): - self.session.close() + self.curl.close() logger.debug("Closing HTTPRemoteDirectory for " + self.base_url) + self.init_curl() diff --git a/requirements.txt b/requirements.txt index 4bb0370..b5a81c6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,6 @@ flask_httpauth ujson urllib3 pyOpenSSL -pybloom-live \ No newline at end of file +pybloom-live +pycurl +lxml \ No newline at end of file diff --git a/search/search.py b/search/search.py index 90f0bd3..f65f539 100644 --- a/search/search.py +++ b/search/search.py @@ -127,7 +127,7 @@ class ElasticSearchEngine(SearchEngine): def import_json(self, in_lines, website_id: int): import_every = 400 - cooldown_time = 0.6 + cooldown_time = 0 docs = [] From 8dc8627f786a1e1dcfb9e69d88c86310ebdd3e64 Mon Sep 17 00:00:00 2001 From: Simon Fortier Date: Thu, 23 Aug 2018 11:51:48 -0400 Subject: [PATCH 25/34] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 859d3e8..b53d50a 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ Suggestions/concerns/PRs are welcome ## Installation Assuming you have Python 3 and git installed: ```bash +sudo apt install libssl-dev libcurl4-openssl-dev git clone https://github.com/simon987/od-database cd od-database sudo pip3 install -r requirements.txt From d42be56deed923ddb8c2d5a2467ee454e8cf09b8 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 11:59:13 -0400 Subject: [PATCH 26/34] More debug info --- crawl_server/task_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py index 841e991..cee1e17 100644 --- a/crawl_server/task_manager.py +++ b/crawl_server/task_manager.py @@ -63,7 +63,7 @@ class TaskManager: } r = requests.post(config.SERVER_URL + "/task/upload", data=payload, files=files) - logger.info("RESPONSE: " + r.text) + logger.info("RESPONSE: " + r.text + "<" + str(r.status_code) + ">") except Exception as e: logger.error("Exception while sending file_list chunk: " + str(e)) pass @@ -75,7 +75,7 @@ class TaskManager: } r = requests.post(config.SERVER_URL + "/task/complete", data=payload) - logger.info("RESPONSE: " + r.text) + logger.info("RESPONSE: " + r.text + "<" + str(r.status_code) + ">") if os.path.exists(filename): os.remove(filename) From 54b4d2d5b4817854976c0e3703fd2f6ee7c03547 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 12:02:07 -0400 Subject: [PATCH 27/34] removed debug lines --- crawl_server/remote_http.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 2d60728..7d73c8e 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -197,10 +197,9 @@ class HttpDirectory(RemoteDirectory): mtime=int(parse_date(date).timestamp()), is_dir=False ) - except pycurl.error as e: + except pycurl.error: curl.close() retries -= 1 - raise e logger.debug("TimeoutError - _request_file") raise TimeoutError @@ -215,11 +214,9 @@ class HttpDirectory(RemoteDirectory): self.curl.perform() return content.getvalue().decode("utf-8", errors="ignore") - except pycurl.error as e: + except pycurl.error: self.curl.close() retries -= 1 - print(e) - raise e 
logger.debug("TimeoutError - _fetch_body") raise TimeoutError From cadaf14c1b9a8e5ef2b5861b87ff0fc65b7dd6bf Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 12:12:23 -0400 Subject: [PATCH 28/34] Small bugfix --- crawl_server/remote_http.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 7d73c8e..3a99947 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -119,7 +119,6 @@ class HttpDirectory(RemoteDirectory): return curl_head - def list_dir(self, path): current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1] @@ -235,14 +234,14 @@ class HttpDirectory(RemoteDirectory): @staticmethod def _should_ignore(base_url, current_path, link: Anchor): - if urljoin(base_url, link.href) == urljoin(urljoin(base_url, current_path), "../"): + full_url = urljoin(base_url, link.href) + if full_url == urljoin(urljoin(base_url, current_path), "../") or full_url == base_url: return True if link.href.endswith(HttpDirectory.BLACK_LIST): return True # Ignore external links - full_url = urljoin(base_url, link.href) if not full_url.startswith(base_url): return True From 484a0baf9ddde256ca7745aa62612037c11696f8 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 12:37:27 -0400 Subject: [PATCH 29/34] Bugfix post-pycurl update --- crawl_server/remote_http.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 3a99947..f897840 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -179,13 +179,14 @@ class HttpDirectory(RemoteDirectory): while retries > 0: try: raw_headers = BytesIO() - curl.setopt(pycurl.URL, url) + curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore")) curl.setopt(pycurl.HEADERFUNCTION, raw_headers.write) curl.perform() stripped_url = url[len(base_url) - 1:] headers = HttpDirectory._parse_dict_header(raw_headers.getvalue().decode("utf-8", errors="ignore")) raw_headers.close() + curl.close() path, name = os.path.split(stripped_url) date = headers.get("Last-Modified", "1970-01-01") @@ -208,7 +209,7 @@ class HttpDirectory(RemoteDirectory): while retries > 0: try: content = BytesIO() - self.curl.setopt(pycurl.URL, url) + self.curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore")) self.curl.setopt(pycurl.WRITEDATA, content) self.curl.perform() From 6ffc43601b2b3a540f734248fd200ad8eb7ddf86 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 12:40:13 -0400 Subject: [PATCH 30/34] Bugfix post-pycurl update pt. 
2 --- crawl_server/remote_http.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index f897840..5e58bf1 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -162,6 +162,8 @@ class HttpDirectory(RemoteDirectory): handles = [self._curl_handle() for _ in range(len(urls_to_request))] files = pool.starmap(self._request_file, zip(handles, urls_to_request, repeat(self.base_url))) pool.close() + for handle in handles: + handle.close() for file in files: if file: yield file @@ -186,7 +188,6 @@ class HttpDirectory(RemoteDirectory): stripped_url = url[len(base_url) - 1:] headers = HttpDirectory._parse_dict_header(raw_headers.getvalue().decode("utf-8", errors="ignore")) raw_headers.close() - curl.close() path, name = os.path.split(stripped_url) date = headers.get("Last-Modified", "1970-01-01") From 25e1e5882830bb549f79adb71bfbf006d26c1ff0 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 12:46:00 -0400 Subject: [PATCH 31/34] Bugfix post-pycurl update pt. 3 (Sorry!) --- crawl_server/remote_http.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 5e58bf1..76fd1a6 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -109,11 +109,12 @@ class HttpDirectory(RemoteDirectory): self.curl_head = self._curl_handle() - def _curl_handle(self): + @staticmethod + def _curl_handle(): curl_head = Curl() - curl_head.setopt(self.curl.SSL_VERIFYPEER, 0) - curl_head.setopt(self.curl.SSL_VERIFYHOST, 0) + curl_head.setopt(pycurl.SSL_VERIFYPEER, 0) + curl_head.setopt(pycurl.SSL_VERIFYHOST, 0) curl_head.setopt(pycurl.NOBODY, 1) curl_head.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT) @@ -159,27 +160,25 @@ class HttpDirectory(RemoteDirectory): if len(urls_to_request) > 150: # Many urls, use multi-threaded solution pool = ThreadPool(processes=10) - handles = [self._curl_handle() for _ in range(len(urls_to_request))] - files = pool.starmap(self._request_file, zip(handles, urls_to_request, repeat(self.base_url))) + files = pool.starmap(self._request_file, zip(urls_to_request, repeat(self.base_url))) pool.close() - for handle in handles: - handle.close() for file in files: if file: yield file else: # Too few urls to create thread pool for url in urls_to_request: - file = self._request_file(self.curl_head, url, self.base_url) + file = self._request_file(url, self.base_url) if file: yield file @staticmethod - def _request_file(curl, url, base_url): + def _request_file(url, base_url): retries = HttpDirectory.MAX_RETRIES while retries > 0: try: + curl = HttpDirectory._curl_handle() raw_headers = BytesIO() curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore")) curl.setopt(pycurl.HEADERFUNCTION, raw_headers.write) From dff4125c9fb07a055c528e04418b9484fa4b7dd7 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 12:47:17 -0400 Subject: [PATCH 32/34] Bugfix post-pycurl update pt. 3 (Sorry!) 
--- crawl_server/remote_http.py | 1 - 1 file changed, 1 deletion(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 76fd1a6..d7e2716 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -198,7 +198,6 @@ class HttpDirectory(RemoteDirectory): is_dir=False ) except pycurl.error: - curl.close() retries -= 1 logger.debug("TimeoutError - _request_file") From faa9ac3ccb270aab349d0fcb773731dcf67c8274 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 12:48:15 -0400 Subject: [PATCH 33/34] Closing curl handle manually just to make sure --- crawl_server/remote_http.py | 1 + 1 file changed, 1 insertion(+) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index d7e2716..e916e10 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -190,6 +190,7 @@ class HttpDirectory(RemoteDirectory): path, name = os.path.split(stripped_url) date = headers.get("Last-Modified", "1970-01-01") + curl.close() return File( path=unquote(path).strip("/"), name=unquote(name), From 8b13de4a6be703422ff80d29e356b5cba417fabf Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 25 Aug 2018 16:46:53 -0400 Subject: [PATCH 34/34] Re-init curl handle on error --- crawl_server/remote_http.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index e916e10..11452c4 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -215,7 +215,7 @@ class HttpDirectory(RemoteDirectory): return content.getvalue().decode("utf-8", errors="ignore") except pycurl.error: - self.curl.close() + self.close() retries -= 1 logger.debug("TimeoutError - _fetch_body") @@ -262,7 +262,6 @@ class HttpDirectory(RemoteDirectory): def close(self): self.curl.close() - logger.debug("Closing HTTPRemoteDirectory for " + self.base_url) self.init_curl()