From 458641654c80c0920d8449da8cd0661019c24f48 Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 8 Aug 2018 21:24:55 -0400 Subject: [PATCH 01/34] Minimal configuration for reddit comment callback --- app.py | 9 +++++-- callbacks.py | 27 ++++++++++++--------- crawl_server/reddit_bot.py => reddit_bot.py | 13 +++++----- 3 files changed, 29 insertions(+), 20 deletions(-) rename crawl_server/reddit_bot.py => reddit_bot.py (82%) diff --git a/app.py b/app.py index 0cb6bf2..27da164 100644 --- a/app.py +++ b/app.py @@ -13,6 +13,7 @@ import config from flask_caching import Cache from tasks import TaskManager, Task, TaskResult from search.search import ElasticSearchEngine +from callbacks import PostCrawlCallbackFactory app = Flask(__name__) if config.CAPTCHA_SUBMIT or config.CAPTCHA_LOGIN: @@ -572,7 +573,11 @@ def api_complete_task(): if filename and os.path.exists(filename): os.remove(filename) - # TODO: handle callback here + # Handle task callback + callback = PostCrawlCallbackFactory.get_callback(task) + if callback: + callback.run(task_result, searchEngine) + return "Successfully logged task result and indexed files" else: @@ -666,7 +671,7 @@ def api_task_enqueue(): request.json["url"], request.json["priority"], request.json["callback_type"], - request.json["callback_args"] + json.dumps(request.json["callback_args"]) ) taskManager.queue_task(task) return "" diff --git a/callbacks.py b/callbacks.py index 89bda6c..fec098a 100644 --- a/callbacks.py +++ b/callbacks.py @@ -1,6 +1,7 @@ -from tasks import Task -from crawl_server.reddit_bot import RedditBot +from tasks import Task, TaskResult +from reddit_bot import RedditBot import praw +from search.search import SearchEngine class PostCrawlCallback: @@ -8,7 +9,7 @@ class PostCrawlCallback: def __init__(self, task: Task): self.task = task - def run(self): + def run(self, task_result: TaskResult, search: SearchEngine): raise NotImplementedError @@ -36,26 +37,30 @@ class RedditCallback(PostCrawlCallback): user_agent='github.com/simon987/od-database (by /u/Hexahedr_n)') self.reddit_bot = RedditBot("crawled.txt", reddit) - def run(self): + def run(self, task_result: TaskResult, search: SearchEngine): raise NotImplementedError class RedditPostCallback(RedditCallback): - def run(self): + def run(self, task_result: TaskResult, search: SearchEngine): print("Reddit post callback for task " + str(self.task)) - pass class RedditCommentCallback(RedditCallback): - def run(self): - print("Reddit comment callback for task " + str(self.task)) - pass + def run(self, task_result: TaskResult, search: SearchEngine): + + comment_id = self.task.callback_args["comment_id"] + print("Replying to comment " + comment_id) + + stats = search.get_stats(self.task.website_id) + message = self.reddit_bot.get_comment(stats, self.task.website_id) + print(message) + self.reddit_bot.reply(self.reddit_bot.reddit.comment(comment_id), message) class DiscordCallback(PostCrawlCallback): - def run(self): + def run(self, task_result: TaskResult, search: SearchEngine): print("Discord callback for task " + str(self.task)) - pass diff --git a/crawl_server/reddit_bot.py b/reddit_bot.py similarity index 82% rename from crawl_server/reddit_bot.py rename to reddit_bot.py index bf3c3e4..bff336f 100644 --- a/crawl_server/reddit_bot.py +++ b/reddit_bot.py @@ -54,14 +54,13 @@ class RedditBot: @staticmethod def get_comment(stats: dict, website_id, message: str = ""): - comment = message + " \n" if len(message) > 0 else "" + comment = message + " \n" if message else "" - for stat in stats: - comment += stat + " 
\n" if len(stat) > 0 else "" - comment += RedditBot.format_stats(stats[stat]) + comment += RedditBot.format_stats(stats) - comment += "[Full Report](https://od-database.simon987.net/website/" + str(website_id) + "/)" - comment += " | [Link list](https://od-database.simon987.net/website/" + str(website_id) + "/links) \n" + comment += "[Full Report](https://od-db.the-eye.eu/website/" + str(website_id) + "/)" + comment += " | [Link list](https://od-db.the-eye.eu/website/" + str(website_id) + "/links)" + comment += " | [Source](https://github.com/simon987/od-database) \n" comment += "*** \n" comment += RedditBot.bottom_line @@ -74,7 +73,7 @@ class RedditBot: result += "File types | Count | Total Size\n" result += ":-- | :-- | :-- \n" counter = 0 - for mime in stats["mime_stats"]: + for mime in stats["ext_stats"]: result += mime[2] result += " | " + str(mime[1]) result += " | " + humanfriendly.format_size(mime[0]) + " \n" From 89e378ffd9610a3caa12924ed40f01023e391f89 Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 8 Aug 2018 22:41:25 -0400 Subject: [PATCH 02/34] Reddit comment callback is not an edit instead of a new comment --- callbacks.py | 4 ++-- reddit_bot.py | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/callbacks.py b/callbacks.py index fec098a..a238cdb 100644 --- a/callbacks.py +++ b/callbacks.py @@ -52,12 +52,12 @@ class RedditCommentCallback(RedditCallback): def run(self, task_result: TaskResult, search: SearchEngine): comment_id = self.task.callback_args["comment_id"] - print("Replying to comment " + comment_id) + print("Editing comment comment " + comment_id) stats = search.get_stats(self.task.website_id) message = self.reddit_bot.get_comment(stats, self.task.website_id) print(message) - self.reddit_bot.reply(self.reddit_bot.reddit.comment(comment_id), message) + self.reddit_bot.edit(self.reddit_bot.reddit.comment(comment_id), message) class DiscordCallback(PostCrawlCallback): diff --git a/reddit_bot.py b/reddit_bot.py index bff336f..26defae 100644 --- a/reddit_bot.py +++ b/reddit_bot.py @@ -41,17 +41,28 @@ class RedditBot: while True: try: - # Double check has_crawled if not self.has_crawled(reddit_obj.id): - reddit_obj.reply(comment) + reply = reddit_obj.reply(comment) self.log_crawl(reddit_obj.id) print("Reply to " + reddit_obj.id) + return reply break except Exception as e: print("Waiting 5 minutes: " + str(e)) time.sleep(300) continue + def edit(self, reddit_comment, new_message): + + while True: + try: + reddit_comment.edit(new_message) + print("Edit comment " + reddit_comment.id) + except Exception as e: + print("Waiting 5 minutes: " + str(e)) + time.sleep(300) + continue + @staticmethod def get_comment(stats: dict, website_id, message: str = ""): comment = message + " \n" if message else "" From 88166054ad397bc5e14f34bdd69a35b53fe56455 Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 8 Aug 2018 23:07:09 -0400 Subject: [PATCH 03/34] od_util can be used when od-database is a submodule --- od_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/od_util.py b/od_util.py index 00057e0..dc1e5e2 100644 --- a/od_util.py +++ b/od_util.py @@ -5,7 +5,7 @@ import os import validators import re from ftplib import FTP -import config +from . 
import config import urllib3 urllib3.disable_warnings() From f729b462f0dd7108fe27b1f81765147b6d7d8a7a Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 8 Aug 2018 23:31:50 -0400 Subject: [PATCH 04/34] od_util can be used when od-database is a submodule part 2 --- app.py | 1 - callbacks.py | 4 ++++ od_util.py | 7 ++++++- reddit_bot.py | 1 + 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/app.py b/app.py index 27da164..c8ba6f5 100644 --- a/app.py +++ b/app.py @@ -5,7 +5,6 @@ from urllib.parse import urlparse import os import time import datetime -import itertools from database import Database, Website, InvalidQueryException from flask_recaptcha import ReCaptcha import od_util diff --git a/callbacks.py b/callbacks.py index a238cdb..1965b82 100644 --- a/callbacks.py +++ b/callbacks.py @@ -2,6 +2,7 @@ from tasks import Task, TaskResult from reddit_bot import RedditBot import praw from search.search import SearchEngine +import json class PostCrawlCallback: @@ -9,6 +10,9 @@ class PostCrawlCallback: def __init__(self, task: Task): self.task = task + if self.task.callback_args: + self.task.callback_args = json.loads(self.task.callback_args) + def run(self, task_result: TaskResult, search: SearchEngine): raise NotImplementedError diff --git a/od_util.py b/od_util.py index dc1e5e2..e74f377 100644 --- a/od_util.py +++ b/od_util.py @@ -5,7 +5,12 @@ import os import validators import re from ftplib import FTP -from . import config + +# TODO: find a better way to do this +try: + from . import config +except ImportError: + import config import urllib3 urllib3.disable_warnings() diff --git a/reddit_bot.py b/reddit_bot.py index 26defae..3f15c5f 100644 --- a/reddit_bot.py +++ b/reddit_bot.py @@ -58,6 +58,7 @@ class RedditBot: try: reddit_comment.edit(new_message) print("Edit comment " + reddit_comment.id) + break except Exception as e: print("Waiting 5 minutes: " + str(e)) time.sleep(300) From 8ffd9179d2d09a657c003d797e9090d385e0580c Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 9 Aug 2018 14:26:22 -0400 Subject: [PATCH 05/34] Increased stats timeout value --- app.py | 33 +++++++++++++++++++++++++++++++++ callbacks.py | 4 +++- search/search.py | 18 +++++++++--------- 3 files changed, 45 insertions(+), 10 deletions(-) diff --git a/app.py b/app.py index c8ba6f5..b841b5c 100644 --- a/app.py +++ b/app.py @@ -716,5 +716,38 @@ def api_random_website(): return abort(403) +@app.route("/api/search", methods=["POST"]) +def api_search(): + + try: + token = request.json["token"] + except KeyError: + return abort(400) + + name = db.check_api_token(token) + + if name: + + try: + hits = searchEngine.search( + request.json["query"], + request.json["page"], request.json["per_page"], + request.json["sort_order"], + request.json["extensions"], + request.json["size_min"], request.json["size_max"], + request.json["match_all"], + request.json["fields"], + request.json["date_min"], request.json["date_max"] + ) + + hits = db.join_website_on_search_result(hits) + return json.dumps(hits) + + except InvalidQueryException as e: + return str(e) + else: + return abort(403) + + if __name__ == '__main__': app.run("0.0.0.0", port=12345, threaded=True) diff --git a/callbacks.py b/callbacks.py index 1965b82..f749ee1 100644 --- a/callbacks.py +++ b/callbacks.py @@ -59,7 +59,9 @@ class RedditCommentCallback(RedditCallback): print("Editing comment comment " + comment_id) stats = search.get_stats(self.task.website_id) - message = self.reddit_bot.get_comment(stats, self.task.website_id) + message = self.reddit_bot.get_comment(stats, 
self.task.website_id, + message="There you go! This website was crawled in `" + + str(int(task_result.end_time - task_result.start_time)) + "s`") print(message) self.reddit_bot.edit(self.reddit_bot.reddit.comment(comment_id), message) diff --git a/search/search.py b/search/search.py index fac8dd5..5c1efa8 100644 --- a/search/search.py +++ b/search/search.py @@ -311,7 +311,7 @@ class ElasticSearchEngine(SearchEngine): }, "size": 0 - }, index=self.index_name, request_timeout=120) + }, index=self.index_name, request_timeout=240) total_stats = self.es.search(body={ "query": { @@ -333,7 +333,7 @@ class ElasticSearchEngine(SearchEngine): }, "size": 0 - }, index=self.index_name, request_timeout=120) + }, index=self.index_name, request_timeout=241) size_and_date_histogram = self.es.search(body={ "query": { @@ -354,21 +354,21 @@ class ElasticSearchEngine(SearchEngine): "sizes": { "histogram": { "field": "size", - "interval": 50000000, # 50Mb - "min_doc_count": 100 + "interval": 100000000, # 100Mb + "min_doc_count": 500 } }, "dates": { "date_histogram": { "field": "mtime", "interval": "1y", - "min_doc_count": 100, + "min_doc_count": 500, "format": "yyyy" } } }, "size": 0 - }, index=self.index_name, request_timeout=120) + }, index=self.index_name, request_timeout=242) website_scatter = self.es.search(body={ "query": { @@ -384,7 +384,7 @@ class ElasticSearchEngine(SearchEngine): "websites": { "terms": { "field": "website_id", - "size": 500 # TODO: Figure out what size is appropriate + "size": 400 # TODO: Figure out what size is appropriate }, "aggs": { "size": { @@ -396,9 +396,9 @@ class ElasticSearchEngine(SearchEngine): } }, "size": 0 - }, index=self.index_name, request_timeout=120) + }, index=self.index_name, request_timeout=243) - es_stats = self.es.indices.stats(self.index_name, request_timeout=120) + es_stats = self.es.indices.stats(self.index_name, request_timeout=244) stats = dict() stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"] From ffeed4192e723ce991c2ec1c50f1db78f6928ad5 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 9 Aug 2018 16:19:21 -0400 Subject: [PATCH 06/34] Refresh index before reddit comment callback --- callbacks.py | 1 + search/search.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/callbacks.py b/callbacks.py index f749ee1..647c963 100644 --- a/callbacks.py +++ b/callbacks.py @@ -58,6 +58,7 @@ class RedditCommentCallback(RedditCallback): comment_id = self.task.callback_args["comment_id"] print("Editing comment comment " + comment_id) + search.refresh() # Make sure the newly indexed documents are available before commenting stats = search.get_stats(self.task.website_id) message = self.reddit_bot.get_comment(stats, self.task.website_id, message="There you go! 
This website was crawled in `" + diff --git a/search/search.py b/search/search.py index 5c1efa8..16439e2 100644 --- a/search/search.py +++ b/search/search.py @@ -31,6 +31,9 @@ class SearchEngine: def get_stats(self, website_id: int, subdir: str = None): raise NotImplementedError + def refresh(self): + raise NotImplementedError + class ElasticSearchEngine(SearchEngine): SORT_ORDERS = { @@ -460,3 +463,6 @@ class ElasticSearchEngine(SearchEngine): for website in websites: if website not in non_empty_websites: yield website + + def refresh(self): + self.es.indices.refresh(self.index_name) From 5a084cb8574252b4f10a6b8627fd95a80c8233ec Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 9 Aug 2018 17:12:43 -0400 Subject: [PATCH 07/34] Queue can be emptied more easily --- database.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/database.py b/database.py index 51edd67..948de9f 100644 --- a/database.py +++ b/database.py @@ -407,12 +407,12 @@ class Database: cursor = conn.cursor() cursor.execute("SELECT id, website_id, url, priority, callback_type, callback_args FROM " - "Queue WHERE website_id=? AND assigned_crawler=?", (website_id, name)) + "Queue WHERE website_id=?", (website_id, name)) task = cursor.fetchone() if task: - cursor.execute("DELETE FROM Queue WHERE website_id=? AND assigned_crawler=?", (website_id, name)) + cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id, name)) conn.commit() return Task(task[1], task[2], task[3], task[4], task[5]) else: From 42d858b62a33e2677313b41f01e33985e2215b10 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 9 Aug 2018 17:14:17 -0400 Subject: [PATCH 08/34] Queue can be emptied more easily pt.2 --- database.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/database.py b/database.py index 948de9f..252a30f 100644 --- a/database.py +++ b/database.py @@ -407,12 +407,12 @@ class Database: cursor = conn.cursor() cursor.execute("SELECT id, website_id, url, priority, callback_type, callback_args FROM " - "Queue WHERE website_id=?", (website_id, name)) + "Queue WHERE website_id=?", (website_id, )) task = cursor.fetchone() if task: - cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id, name)) + cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id, )) conn.commit() return Task(task[1], task[2], task[3], task[4], task[5]) else: From faeff701dee8b9660fd850a736593c40b4d89219 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 9 Aug 2018 18:33:35 -0400 Subject: [PATCH 09/34] Increased search timeout value --- reddit_bot.py | 2 +- search/search.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/reddit_bot.py b/reddit_bot.py index 3f15c5f..66f03f6 100644 --- a/reddit_bot.py +++ b/reddit_bot.py @@ -72,7 +72,7 @@ class RedditBot: comment += "[Full Report](https://od-db.the-eye.eu/website/" + str(website_id) + "/)" comment += " | [Link list](https://od-db.the-eye.eu/website/" + str(website_id) + "/links)" - comment += " | [Source](https://github.com/simon987/od-database) \n" + comment += " | [Source](https://github.com/simon987) \n" comment += "*** \n" comment += RedditBot.bottom_line diff --git a/search/search.py b/search/search.py index 16439e2..4631e53 100644 --- a/search/search.py +++ b/search/search.py @@ -214,7 +214,7 @@ class ElasticSearchEngine(SearchEngine): } }, "size": per_page, "from": min(page * per_page, 10000 - per_page)}, - index=self.index_name, request_timeout=30) + index=self.index_name, request_timeout=60) return page From 
a6b1d9cba3ba8236a93c72f3ea7eddcc4887c573 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 9 Aug 2018 21:43:07 -0400 Subject: [PATCH 10/34] More help when no search results --- templates/search.html | 1 + 1 file changed, 1 insertion(+) diff --git a/templates/search.html b/templates/search.html index 8bb560f..f9dbf7a 100644 --- a/templates/search.html +++ b/templates/search.html @@ -174,6 +174,7 @@
  • Try checking the 'Match any word' box for a broader search.
  • Make sure you don't include the file extension in your query (Use the appropriate field to filter file types)
+  • If you're searching for files in a particular website, use the website search page
From c94cf5b3131db5860303dd72c0a1e6c940fd673f Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 10 Aug 2018 11:46:16 -0400 Subject: [PATCH 11/34] Adjusted timeout values (again) --- search/search.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/search/search.py b/search/search.py index 4631e53..0249291 100644 --- a/search/search.py +++ b/search/search.py @@ -214,7 +214,7 @@ class ElasticSearchEngine(SearchEngine): } }, "size": per_page, "from": min(page * per_page, 10000 - per_page)}, - index=self.index_name, request_timeout=60) + index=self.index_name, request_timeout=35) return page @@ -232,7 +232,7 @@ class ElasticSearchEngine(SearchEngine): "ext_group": { "terms": { "field": "ext", - "size": 20 + "size": 12 }, "aggs": { "size": { @@ -249,7 +249,7 @@ class ElasticSearchEngine(SearchEngine): } }, "size": 0 - }, index=self.index_name, request_timeout=20) + }, index=self.index_name, request_timeout=30) stats = dict() stats["total_size"] = result["aggregations"]["total_size"]["value"] From c29af180c58ead26f16c8bf2f031d1e62c064575 Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 10 Aug 2018 12:46:40 -0400 Subject: [PATCH 12/34] Captcha for searches --- README.md | 3 + app.py | 134 +++++++++++++++++++++++------------------- templates/home.html | 18 ++++-- templates/search.html | 11 +++- 4 files changed, 100 insertions(+), 66 deletions(-) diff --git a/README.md b/README.md index a10a2bd..859d3e8 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,11 @@ Create `/config.py` and fill out the parameters. Sample config: # Leave default values for no CAPTCHAs CAPTCHA_LOGIN = False CAPTCHA_SUBMIT = False +CAPTCHA_SEARCH = False CAPTCHA_SITE_KEY = "" CAPTCHA_SECRET_KEY = "" +CAPTCHA_S_SITE_KEY = "" +CAPTCHA_S_SECRET_KEY = "" # Flask secret key for sessions FLASK_SECRET = "" diff --git a/app.py b/app.py index b841b5c..f99417c 100644 --- a/app.py +++ b/app.py @@ -21,6 +21,12 @@ if config.CAPTCHA_SUBMIT or config.CAPTCHA_LOGIN: secret_key=config.CAPTCHA_SECRET_KEY) else: recaptcha = None +if config.CAPTCHA_SEARCH: + recaptcha_search = ReCaptcha(app=app, + site_key=config.CAPTCHA_S_SITE_KEY, + secret_key=config.CAPTCHA_S_SECRET_KEY) +else: + recaptcha_search = None app.secret_key = config.FLASK_SECRET db = Database("db.sqlite3") cache = Cache(app, config={'CACHE_TYPE': 'simple'}) @@ -243,79 +249,86 @@ def admin_rescan_website(website_id): @app.route("/search") def search(): - q = request.args.get("q") if "q" in request.args else "" - sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score" - page = request.args.get("p") if "p" in request.args else "0" - page = int(page) if page.isdigit() else 0 + q = request.args.get("q") if "q" in request.args else "" + sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score" - per_page = request.args.get("per_page") if "per_page" in request.args else "50" - per_page = int(per_page) if per_page.isdigit() else "50" - per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50 + page = request.args.get("p") if "p" in request.args else "0" + page = int(page) if page.isdigit() else 0 - extensions = request.args.get("ext") if "ext" in request.args else None - extensions = [ext.strip().strip(".").lower() for ext in extensions.split(",")] if extensions else [] + per_page = request.args.get("per_page") if "per_page" in request.args else "50" + per_page = int(per_page) if per_page.isdigit() else "50" + per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50 - size_min = 
request.args.get("size_min") if "size_min" in request.args else "size_min" - size_min = int(size_min) if size_min.isdigit() else 0 - size_max = request.args.get("size_max") if "size_max" in request.args else "size_max" - size_max = int(size_max) if size_max.isdigit() else 0 + extensions = request.args.get("ext") if "ext" in request.args else None + extensions = [ext.strip().strip(".").lower() for ext in extensions.split(",")] if extensions else [] - date_min = request.args.get("date_min") if "date_min" in request.args else "date_min" - date_min = int(date_min) if date_min.isdigit() else 0 - date_max = request.args.get("date_max") if "date_max" in request.args else "date_max" - date_max = int(date_max) if date_max.isdigit() else 0 + size_min = request.args.get("size_min") if "size_min" in request.args else "size_min" + size_min = int(size_min) if size_min.isdigit() else 0 + size_max = request.args.get("size_max") if "size_max" in request.args else "size_max" + size_max = int(size_max) if size_max.isdigit() else 0 - match_all = "all" in request.args + date_min = request.args.get("date_min") if "date_min" in request.args else "date_min" + date_min = int(date_min) if date_min.isdigit() else 0 + date_max = request.args.get("date_max") if "date_max" in request.args else "date_max" + date_max = int(date_max) if date_max.isdigit() else 0 - field_name = "field_name" in request.args - field_trigram = "field_trigram" in request.args - field_path = "field_path" in request.args + match_all = "all" in request.args - if not field_name and not field_trigram and not field_path: - # If no fields are selected, search in all - field_name = field_path = field_trigram = True + field_name = "field_name" in request.args + field_trigram = "field_trigram" in request.args + field_path = "field_path" in request.args - fields = [] - if field_path: - fields.append("path") - if field_name: - fields.append("name^5") - if field_trigram: - fields.append("name.nGram^2") + if not field_name and not field_trigram and not field_path: + # If no fields are selected, search in all + field_name = field_path = field_trigram = True - if len(q) >= 3: + fields = [] + if field_path: + fields.append("path") + if field_name: + fields.append("name^5") + if field_trigram: + fields.append("name.nGram^2") - db.log_search(request.remote_addr, - request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None, - q, extensions, page) + if len(q) >= 3: - try: - hits = searchEngine.search(q, page, per_page, sort_order, - extensions, size_min, size_max, match_all, fields, date_min, date_max) - hits = db.join_website_on_search_result(hits) - except InvalidQueryException as e: - flash("Invalid query: " + str(e), "warning") - return redirect("/search") - except Exception: - flash("Query failed, this could mean that the search server is overloaded or is not reachable. 
" - "Please try again later", "danger") + response = request.args.get("g-recaptcha-response", "") + if not config.CAPTCHA_SEARCH or recaptcha_search.verify(response): + db.log_search(request.remote_addr, + request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None, + q, extensions, page) + + try: + hits = searchEngine.search(q, page, per_page, sort_order, + extensions, size_min, size_max, match_all, fields, date_min, date_max) + hits = db.join_website_on_search_result(hits) + except InvalidQueryException as e: + flash("Invalid query: " + str(e), "warning") + return redirect("/search") + except Exception: + flash("Query failed, this could mean that the search server is overloaded or is not reachable. " + "Please try again later", "danger") + hits = None + else: + flash("Error: Invalid captcha please try again", "danger") + hits = None + + else: hits = None - else: - hits = None - - return render_template("search.html", - results=hits, - q=q, - p=page, per_page=per_page, - sort_order=sort_order, - results_set=config.RESULTS_PER_PAGE, - extensions=",".join(extensions), - size_min=size_min, size_max=size_max, - match_all=match_all, - field_trigram=field_trigram, field_path=field_path, field_name=field_name, - date_min=date_min, date_max=date_max) + return render_template("search.html", + results=hits, + q=q, + p=page, per_page=per_page, + sort_order=sort_order, + results_set=config.RESULTS_PER_PAGE, + extensions=",".join(extensions), + size_min=size_min, size_max=size_max, + match_all=match_all, + field_trigram=field_trigram, field_path=field_path, field_name=field_name, + date_min=date_min, date_max=date_max, + show_captcha=config.CAPTCHA_SEARCH, recaptcha=recaptcha_search) @app.route("/contribute") @@ -331,7 +344,8 @@ def home(): stats["website_count"] = len(db.get_all_websites()) except: stats = {} - return render_template("home.html", stats=stats) + return render_template("home.html", stats=stats, + show_captcha=config.CAPTCHA_SEARCH, recaptcha=recaptcha_search) @app.route("/submit") diff --git a/templates/home.html b/templates/home.html index 33881e5..29d30b4 100644 --- a/templates/home.html +++ b/templates/home.html @@ -23,13 +23,23 @@
                     Search
 [home.html search form hunk, markup not preserved: when show_captcha is enabled, the Search button is wrapped in a {% if show_captcha %} reCAPTCHA block, with the plain button kept in the {% else %} branch]
diff --git a/templates/search.html b/templates/search.html index f9dbf7a..a6a9d34 100644 --- a/templates/search.html +++ b/templates/search.html @@ -9,7 +9,7 @@
                     Search
 [search page heading/form hunk, markup not preserved]
@@ -92,7 +92,14 @@
             {# Search button #}
 [search button hunk, markup not preserved: the submit button is wrapped in a {% if show_captcha %} reCAPTCHA block, with the plain button kept in the {% else %} branch]
From aab1abba5472f660514729aeb89ffa54c10654be Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 10 Aug 2018 15:24:43 -0400 Subject: [PATCH 13/34] Fixed websites link --- templates/layout.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/layout.html b/templates/layout.html index b6cefb3..2513392 100644 --- a/templates/layout.html +++ b/templates/layout.html @@ -32,7 +32,7 @@
 [layout.html navbar hunk, markup not preserved: fixes the websites link]
{# Size #} @@ -90,15 +91,16 @@ + {# Search button #}
{% if show_captcha %} - + {% else %} - + {% endif %}
@@ -163,12 +165,10 @@ {% if results["hits"]["total"] > (p + 1) * per_page %} - Next + {% endif %} {% if p > 0 %} - Previous + {% endif %} @@ -253,6 +253,16 @@ } }); + //Next button + function nextPage() { + document.getElementById("page").value = parseInt(document.getElementById("page").value) + 1; + document.getElementById("s").click(); + } + function prevPage() { + document.getElementById("page").value = parseInt(document.getElementById("page").value) - 1; + document.getElementById("s").click(); + } + From cc4c70f4004f7c70331a8c69d663d83a6706c2cf Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 11 Aug 2018 13:05:24 -0400 Subject: [PATCH 16/34] Request content is read all at once --- crawl_server/remote_http.py | 29 ++++++++++------------------- test/files/apache_table.html | 21 +++++++++++++++++++++ test/webserver.py | 13 +++++++++++++ 3 files changed, 44 insertions(+), 19 deletions(-) create mode 100644 test/files/apache_table.html create mode 100644 test/webserver.py diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 368aaac..37cf3c1 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -104,7 +104,7 @@ class HttpDirectory(RemoteDirectory): current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1] path_identifier = hashlib.md5(current_dir_name.encode()) path_url = urljoin(self.base_url, path, "") - body = self._stream_body(path_url) + body = self._fetch_body(path_url) anchors = self._parse_links(body) urls_to_request = [] @@ -176,19 +176,16 @@ class HttpDirectory(RemoteDirectory): logger.debug("TimeoutError - _request_file") raise TimeoutError - def _stream_body(self, url: str): + def _fetch_body(self, url: str): retries = HttpDirectory.MAX_RETRIES while retries > 0: try: - r = self.session.get(url, stream=True, timeout=HttpDirectory.TIMEOUT) - for chunk in r.iter_content(chunk_size=8192): - try: - yield chunk.decode(r.encoding if r.encoding else "utf-8", errors="ignore") - except LookupError: - # Unsupported encoding - yield chunk.decode("utf-8", errors="ignore") - r.close() - return + r = self.session.get(url, timeout=HttpDirectory.TIMEOUT) + try: + return r.content.decode(r.encoding if r.encoding else "utf-8", errors="ignore") + except LookupError: + # Unsupported encoding + return r.content.decode("utf-8", errors="ignore") except RequestException: self.session.close() retries -= 1 @@ -200,14 +197,8 @@ class HttpDirectory(RemoteDirectory): def _parse_links(body): parser = HTMLAnchorParser() - anchors = [] - - for chunk in body: - parser.feed(chunk) - for anchor in parser.anchors: - anchors.append(anchor) - - return anchors + parser.feed(body) + return parser.anchors @staticmethod def _isdir(link: Anchor): diff --git a/test/files/apache_table.html b/test/files/apache_table.html new file mode 100644 index 0000000..06e21c9 --- /dev/null +++ b/test/files/apache_table.html @@ -0,0 +1,21 @@ + + + + Index of /Public/bootstrap + + +

Index of /Public/bootstrap

[ICO]  Name                     Last modified      Size   Description

[PARENTDIR]Parent Directory   -  
[   ]bower.json 2017-04-05 01:45 1.0K 
[DIR]css/ 2017-09-07 18:03 -  
[DIR]image/ 2017-09-07 18:03 -  
[DIR]js/ 2017-09-07 18:03 -  
[DIR]less/ 2017-09-07 18:03 -  
[   ]package.json 2017-04-05 01:45 666  

+ + diff --git a/test/webserver.py b/test/webserver.py new file mode 100644 index 0000000..a3a1c14 --- /dev/null +++ b/test/webserver.py @@ -0,0 +1,13 @@ +from flask import Flask, send_file + +app = Flask(__name__) + + +@app.route("/test1/") +def test1(): + return send_file("files/apache_table.html") + + +if __name__ == '__main__': + app.run("0.0.0.0", port=8888, threaded=True) + From edede200f4f447003b3171354599a34468f4ba21 Mon Sep 17 00:00:00 2001 From: Simon Date: Sun, 12 Aug 2018 14:58:27 -0400 Subject: [PATCH 17/34] Decresed number of indexed documents per second --- search/search.py | 4 +-- test/files/lighttpd_table.html | 47 ++++++++++++++++++++++++++++++++++ test/files/nginx_pre.html | 11 ++++++++ 3 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 test/files/lighttpd_table.html create mode 100644 test/files/nginx_pre.html diff --git a/search/search.py b/search/search.py index 6465de9..fe7b85d 100644 --- a/search/search.py +++ b/search/search.py @@ -125,8 +125,8 @@ class ElasticSearchEngine(SearchEngine): def import_json(self, in_lines, website_id: int): - import_every = 1000 - cooldown_time = 1 + import_every = 400 + cooldown_time = 0.6 docs = [] diff --git a/test/files/lighttpd_table.html b/test/files/lighttpd_table.html new file mode 100644 index 0000000..fe58d9f --- /dev/null +++ b/test/files/lighttpd_table.html @@ -0,0 +1,47 @@ + + + + + Index of /gentoo/releases/ + + + +

Index of /gentoo/releases/

+
Name                      Last Modified       Size   Type
Parent Directory/ -  Directory
alpha/2009-Aug-09 03:47:09-  Directory
amd64/2017-Feb-09 18:50:44-  Directory
arm/2014-Apr-29 13:42:06-  Directory
hppa/2014-Apr-29 13:42:12-  Directory
ia64/2009-Aug-09 03:47:09-  Directory
mips/2011-Apr-28 23:38:14-  Directory
ppc/2014-Apr-29 13:41:00-  Directory
s390/2014-Apr-29 13:41:06-  Directory
sh/2014-Apr-29 13:41:16-  Directory
snapshots/2009-Apr-16 05:08:17-  Directory
sparc/2009-Aug-09 03:47:09-  Directory
x86/2016-Jul-04 21:14:19-  Directory
README2014-Jun-22 05:18:430.1Kapplication/octet-stream
verify-digests.sh2016-Jun-10 02:40:334.5Kapplication/octet-stream
+
+
lighttpd/1.4.29
+ + diff --git a/test/files/nginx_pre.html b/test/files/nginx_pre.html new file mode 100644 index 0000000..5bbd35e --- /dev/null +++ b/test/files/nginx_pre.html @@ -0,0 +1,11 @@ + +Index of /test/To process/Android nak newer/ + +

Index of /test/To process/Android nak newer/


../
+DCIM/                                              31-Jul-2018 00:26                   -
+Pictures/                                          31-Jul-2018 00:26                   -
+1529682937580.webm                                 25-Jun-2018 03:58             3768511
+1529716051300.webm                                 25-Jun-2018 04:01             3181867
+1529725898345.webm                                 25-Jun-2018 04:05             4138908
+

+ From 5c386707edf9b55bcb4bc8d6367ca949816cdcc3 Mon Sep 17 00:00:00 2001 From: Simon Date: Mon, 13 Aug 2018 14:03:22 -0400 Subject: [PATCH 18/34] Should fix import error --- od_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/od_util.py b/od_util.py index e74f377..3407fc9 100644 --- a/od_util.py +++ b/od_util.py @@ -9,7 +9,7 @@ from ftplib import FTP # TODO: find a better way to do this try: from . import config -except ImportError: +except (ImportError, SystemError): import config import urllib3 From c92f2f493782baf55b7c0cbc8ea89a40ae32b19e Mon Sep 17 00:00:00 2001 From: Simon Date: Tue, 14 Aug 2018 12:21:34 -0400 Subject: [PATCH 19/34] Should fix export problem --- app.py | 1 + search/search.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/app.py b/app.py index f99417c..8e97e13 100644 --- a/app.py +++ b/app.py @@ -36,6 +36,7 @@ app.jinja_env.globals.update(get_mime=od_util.get_category) taskManager = TaskManager() searchEngine = ElasticSearchEngine("od-database") +searchEngine.start_stats_scheduler() @app.template_filter("date_format") diff --git a/search/search.py b/search/search.py index fe7b85d..5555b03 100644 --- a/search/search.py +++ b/search/search.py @@ -50,12 +50,14 @@ class ElasticSearchEngine(SearchEngine): self.index_name = index_name self.es = elasticsearch.Elasticsearch() + if not self.es.indices.exists(self.index_name): + self.init() + + def start_stats_scheduler(self): scheduler = BackgroundScheduler() scheduler.add_job(self._generate_global_stats, "interval", seconds=60 * 120) scheduler.start() - if not self.es.indices.exists(self.index_name): - self.init() def init(self): print("Elasticsearch first time setup") From bbe8ed07a8d9221bd31242be42e642592a6c57a6 Mon Sep 17 00:00:00 2001 From: Simon Date: Tue, 14 Aug 2018 16:20:00 -0400 Subject: [PATCH 20/34] Reset page number on search --- search/search.py | 1 - templates/search.html | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/search/search.py b/search/search.py index 5555b03..90f0bd3 100644 --- a/search/search.py +++ b/search/search.py @@ -58,7 +58,6 @@ class ElasticSearchEngine(SearchEngine): scheduler.add_job(self._generate_global_stats, "interval", seconds=60 * 120) scheduler.start() - def init(self): print("Elasticsearch first time setup") if self.es.indices.exists(self.index_name): diff --git a/templates/search.html b/templates/search.html index e247850..869832b 100644 --- a/templates/search.html +++ b/templates/search.html @@ -96,7 +96,10 @@
{% if show_captcha %} - + {% else %} From 6d27cbca02c3b5280f899a704b5bf514d9e18b5e Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 15 Aug 2018 11:32:36 -0400 Subject: [PATCH 21/34] xz -> lzma for export --- app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app.py b/app.py index 8e97e13..b821f79 100644 --- a/app.py +++ b/app.py @@ -57,7 +57,7 @@ def from_timestamp(value): @app.route("/dl") def downloads(): try: - export_file_stats = os.stat("static/out.csv.xz") + export_file_stats = os.stat("static/out.csv.lzma") except FileNotFoundError: print("No export file") export_file_stats = None From a2327bac7c8e4818069f2b3254d321a50ba074d8 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 16 Aug 2018 13:13:34 -0400 Subject: [PATCH 22/34] Bug fix for pages buttons --- templates/search.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/search.html b/templates/search.html index 869832b..ee84b95 100644 --- a/templates/search.html +++ b/templates/search.html @@ -97,7 +97,7 @@ {% if show_captcha %} From 8f218f3c9dbd64b9508975e08e57ef5ab56d4ec1 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 16 Aug 2018 13:24:00 -0400 Subject: [PATCH 23/34] Bug fix for pages buttons pt.2 --- templates/search.html | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/templates/search.html b/templates/search.html index ee84b95..f3437d5 100644 --- a/templates/search.html +++ b/templates/search.html @@ -97,11 +97,10 @@ {% if show_captcha %} - + {% else %} {% endif %} @@ -259,11 +258,11 @@ //Next button function nextPage() { document.getElementById("page").value = parseInt(document.getElementById("page").value) + 1; - document.getElementById("s").click(); + grecaptcha.execute(); } function prevPage() { document.getElementById("page").value = parseInt(document.getElementById("page").value) - 1; - document.getElementById("s").click(); + grecaptcha.execute(); } From 85c3aa918dd703766080600ed2e33bd10cff782a Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 11:47:09 -0400 Subject: [PATCH 24/34] replaced requests by pycurl --- crawl_server/remote_http.py | 94 +++++++++++++++++++++++++++---------- requirements.txt | 4 +- search/search.py | 2 +- 3 files changed, 72 insertions(+), 28 deletions(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 37cf3c1..2d60728 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -1,14 +1,16 @@ +import pycurl +from io import BytesIO + from crawl_server import logger from urllib.parse import unquote, urljoin import os from html.parser import HTMLParser from itertools import repeat from crawl_server.crawler import RemoteDirectory, File -import requests -from requests.exceptions import RequestException from multiprocessing.pool import ThreadPool import config from dateutil.parser import parse as parse_date +from pycurl import Curl import hashlib import urllib3 @@ -94,10 +96,29 @@ class HttpDirectory(RemoteDirectory): def __init__(self, url): super().__init__(url) - self.session = requests.Session() - self.session.headers = HttpDirectory.HEADERS - self.session.verify = False - self.session.max_redirects = 1 + self.curl = None + self.curl_head = None + self.init_curl() + + def init_curl(self): + + self.curl = Curl() + self.curl.setopt(self.curl.SSL_VERIFYPEER, 0) + self.curl.setopt(self.curl.SSL_VERIFYHOST, 0) + self.curl.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT) + + self.curl_head = self._curl_handle() + + def _curl_handle(self): + + curl_head = Curl() + 
curl_head.setopt(self.curl.SSL_VERIFYPEER, 0) + curl_head.setopt(self.curl.SSL_VERIFYHOST, 0) + curl_head.setopt(pycurl.NOBODY, 1) + curl_head.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT) + + return curl_head + def list_dir(self, path): @@ -139,7 +160,8 @@ class HttpDirectory(RemoteDirectory): if len(urls_to_request) > 150: # Many urls, use multi-threaded solution pool = ThreadPool(processes=10) - files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request)) + handles = [self._curl_handle() for _ in range(len(urls_to_request))] + files = pool.starmap(self._request_file, zip(handles, urls_to_request, repeat(self.base_url))) pool.close() for file in files: if file: @@ -147,31 +169,38 @@ class HttpDirectory(RemoteDirectory): else: # Too few urls to create thread pool for url in urls_to_request: - file = self._request_file(url) + file = self._request_file(self.curl_head, url, self.base_url) if file: yield file - def _request_file(self, url): + @staticmethod + def _request_file(curl, url, base_url): retries = HttpDirectory.MAX_RETRIES while retries > 0: try: - r = self.session.head(url, allow_redirects=False, timeout=HttpDirectory.TIMEOUT) + raw_headers = BytesIO() + curl.setopt(pycurl.URL, url) + curl.setopt(pycurl.HEADERFUNCTION, raw_headers.write) + curl.perform() - stripped_url = url[len(self.base_url) - 1:] + stripped_url = url[len(base_url) - 1:] + headers = HttpDirectory._parse_dict_header(raw_headers.getvalue().decode("utf-8", errors="ignore")) + raw_headers.close() path, name = os.path.split(stripped_url) - date = r.headers.get("Last-Modified", "1970-01-01") + date = headers.get("Last-Modified", "1970-01-01") return File( path=unquote(path).strip("/"), name=unquote(name), - size=int(r.headers.get("Content-Length", -1)), + size=int(headers.get("Content-Length", -1)), mtime=int(parse_date(date).timestamp()), is_dir=False ) - except RequestException: - self.session.close() + except pycurl.error as e: + curl.close() retries -= 1 + raise e logger.debug("TimeoutError - _request_file") raise TimeoutError @@ -180,17 +209,19 @@ class HttpDirectory(RemoteDirectory): retries = HttpDirectory.MAX_RETRIES while retries > 0: try: - r = self.session.get(url, timeout=HttpDirectory.TIMEOUT) - try: - return r.content.decode(r.encoding if r.encoding else "utf-8", errors="ignore") - except LookupError: - # Unsupported encoding - return r.content.decode("utf-8", errors="ignore") - except RequestException: - self.session.close() - retries -= 1 + content = BytesIO() + self.curl.setopt(pycurl.URL, url) + self.curl.setopt(pycurl.WRITEDATA, content) + self.curl.perform() - logger.debug("TimeoutError - _stream_body") + return content.getvalue().decode("utf-8", errors="ignore") + except pycurl.error as e: + self.curl.close() + retries -= 1 + print(e) + raise e + + logger.debug("TimeoutError - _fetch_body") raise TimeoutError @staticmethod @@ -222,8 +253,19 @@ class HttpDirectory(RemoteDirectory): if "?" 
in link.href: return True + @staticmethod + def _parse_dict_header(raw): + headers = dict() + for line in raw.split("\r\n")[1:]: # Ignore first 'HTTP/1.0 200 OK' line + if line: + k, v = line.split(":", maxsplit=1) + headers[k.strip()] = v.strip() + + return headers + def close(self): - self.session.close() + self.curl.close() logger.debug("Closing HTTPRemoteDirectory for " + self.base_url) + self.init_curl() diff --git a/requirements.txt b/requirements.txt index 4bb0370..b5a81c6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,6 @@ flask_httpauth ujson urllib3 pyOpenSSL -pybloom-live \ No newline at end of file +pybloom-live +pycurl +lxml \ No newline at end of file diff --git a/search/search.py b/search/search.py index 90f0bd3..f65f539 100644 --- a/search/search.py +++ b/search/search.py @@ -127,7 +127,7 @@ class ElasticSearchEngine(SearchEngine): def import_json(self, in_lines, website_id: int): import_every = 400 - cooldown_time = 0.6 + cooldown_time = 0 docs = [] From 8dc8627f786a1e1dcfb9e69d88c86310ebdd3e64 Mon Sep 17 00:00:00 2001 From: Simon Fortier Date: Thu, 23 Aug 2018 11:51:48 -0400 Subject: [PATCH 25/34] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 859d3e8..b53d50a 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ Suggestions/concerns/PRs are welcome ## Installation Assuming you have Python 3 and git installed: ```bash +sudo apt install libssl-dev libcurl4-openssl-dev git clone https://github.com/simon987/od-database cd od-database sudo pip3 install -r requirements.txt From d42be56deed923ddb8c2d5a2467ee454e8cf09b8 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 11:59:13 -0400 Subject: [PATCH 26/34] More debug info --- crawl_server/task_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py index 841e991..cee1e17 100644 --- a/crawl_server/task_manager.py +++ b/crawl_server/task_manager.py @@ -63,7 +63,7 @@ class TaskManager: } r = requests.post(config.SERVER_URL + "/task/upload", data=payload, files=files) - logger.info("RESPONSE: " + r.text) + logger.info("RESPONSE: " + r.text + "<" + str(r.status_code) + ">") except Exception as e: logger.error("Exception while sending file_list chunk: " + str(e)) pass @@ -75,7 +75,7 @@ class TaskManager: } r = requests.post(config.SERVER_URL + "/task/complete", data=payload) - logger.info("RESPONSE: " + r.text) + logger.info("RESPONSE: " + r.text + "<" + str(r.status_code) + ">") if os.path.exists(filename): os.remove(filename) From 54b4d2d5b4817854976c0e3703fd2f6ee7c03547 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 12:02:07 -0400 Subject: [PATCH 27/34] removed debug lines --- crawl_server/remote_http.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 2d60728..7d73c8e 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -197,10 +197,9 @@ class HttpDirectory(RemoteDirectory): mtime=int(parse_date(date).timestamp()), is_dir=False ) - except pycurl.error as e: + except pycurl.error: curl.close() retries -= 1 - raise e logger.debug("TimeoutError - _request_file") raise TimeoutError @@ -215,11 +214,9 @@ class HttpDirectory(RemoteDirectory): self.curl.perform() return content.getvalue().decode("utf-8", errors="ignore") - except pycurl.error as e: + except pycurl.error: self.curl.close() retries -= 1 - print(e) - raise e 
logger.debug("TimeoutError - _fetch_body") raise TimeoutError From cadaf14c1b9a8e5ef2b5861b87ff0fc65b7dd6bf Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 12:12:23 -0400 Subject: [PATCH 28/34] Small bugfix --- crawl_server/remote_http.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 7d73c8e..3a99947 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -119,7 +119,6 @@ class HttpDirectory(RemoteDirectory): return curl_head - def list_dir(self, path): current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1] @@ -235,14 +234,14 @@ class HttpDirectory(RemoteDirectory): @staticmethod def _should_ignore(base_url, current_path, link: Anchor): - if urljoin(base_url, link.href) == urljoin(urljoin(base_url, current_path), "../"): + full_url = urljoin(base_url, link.href) + if full_url == urljoin(urljoin(base_url, current_path), "../") or full_url == base_url: return True if link.href.endswith(HttpDirectory.BLACK_LIST): return True # Ignore external links - full_url = urljoin(base_url, link.href) if not full_url.startswith(base_url): return True From 484a0baf9ddde256ca7745aa62612037c11696f8 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 12:37:27 -0400 Subject: [PATCH 29/34] Bugfix post-pycurl update --- crawl_server/remote_http.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 3a99947..f897840 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -179,13 +179,14 @@ class HttpDirectory(RemoteDirectory): while retries > 0: try: raw_headers = BytesIO() - curl.setopt(pycurl.URL, url) + curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore")) curl.setopt(pycurl.HEADERFUNCTION, raw_headers.write) curl.perform() stripped_url = url[len(base_url) - 1:] headers = HttpDirectory._parse_dict_header(raw_headers.getvalue().decode("utf-8", errors="ignore")) raw_headers.close() + curl.close() path, name = os.path.split(stripped_url) date = headers.get("Last-Modified", "1970-01-01") @@ -208,7 +209,7 @@ class HttpDirectory(RemoteDirectory): while retries > 0: try: content = BytesIO() - self.curl.setopt(pycurl.URL, url) + self.curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore")) self.curl.setopt(pycurl.WRITEDATA, content) self.curl.perform() From 6ffc43601b2b3a540f734248fd200ad8eb7ddf86 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 12:40:13 -0400 Subject: [PATCH 30/34] Bugfix post-pycurl update pt. 
2 --- crawl_server/remote_http.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index f897840..5e58bf1 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -162,6 +162,8 @@ class HttpDirectory(RemoteDirectory): handles = [self._curl_handle() for _ in range(len(urls_to_request))] files = pool.starmap(self._request_file, zip(handles, urls_to_request, repeat(self.base_url))) pool.close() + for handle in handles: + handle.close() for file in files: if file: yield file @@ -186,7 +188,6 @@ class HttpDirectory(RemoteDirectory): stripped_url = url[len(base_url) - 1:] headers = HttpDirectory._parse_dict_header(raw_headers.getvalue().decode("utf-8", errors="ignore")) raw_headers.close() - curl.close() path, name = os.path.split(stripped_url) date = headers.get("Last-Modified", "1970-01-01") From 25e1e5882830bb549f79adb71bfbf006d26c1ff0 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 12:46:00 -0400 Subject: [PATCH 31/34] Bugfix post-pycurl update pt. 3 (Sorry!) --- crawl_server/remote_http.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 5e58bf1..76fd1a6 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -109,11 +109,12 @@ class HttpDirectory(RemoteDirectory): self.curl_head = self._curl_handle() - def _curl_handle(self): + @staticmethod + def _curl_handle(): curl_head = Curl() - curl_head.setopt(self.curl.SSL_VERIFYPEER, 0) - curl_head.setopt(self.curl.SSL_VERIFYHOST, 0) + curl_head.setopt(pycurl.SSL_VERIFYPEER, 0) + curl_head.setopt(pycurl.SSL_VERIFYHOST, 0) curl_head.setopt(pycurl.NOBODY, 1) curl_head.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT) @@ -159,27 +160,25 @@ class HttpDirectory(RemoteDirectory): if len(urls_to_request) > 150: # Many urls, use multi-threaded solution pool = ThreadPool(processes=10) - handles = [self._curl_handle() for _ in range(len(urls_to_request))] - files = pool.starmap(self._request_file, zip(handles, urls_to_request, repeat(self.base_url))) + files = pool.starmap(self._request_file, zip(urls_to_request, repeat(self.base_url))) pool.close() - for handle in handles: - handle.close() for file in files: if file: yield file else: # Too few urls to create thread pool for url in urls_to_request: - file = self._request_file(self.curl_head, url, self.base_url) + file = self._request_file(url, self.base_url) if file: yield file @staticmethod - def _request_file(curl, url, base_url): + def _request_file(url, base_url): retries = HttpDirectory.MAX_RETRIES while retries > 0: try: + curl = HttpDirectory._curl_handle() raw_headers = BytesIO() curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore")) curl.setopt(pycurl.HEADERFUNCTION, raw_headers.write) From dff4125c9fb07a055c528e04418b9484fa4b7dd7 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 12:47:17 -0400 Subject: [PATCH 32/34] Bugfix post-pycurl update pt. 3 (Sorry!) 
--- crawl_server/remote_http.py | 1 - 1 file changed, 1 deletion(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 76fd1a6..d7e2716 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -198,7 +198,6 @@ class HttpDirectory(RemoteDirectory): is_dir=False ) except pycurl.error: - curl.close() retries -= 1 logger.debug("TimeoutError - _request_file") From faa9ac3ccb270aab349d0fcb773731dcf67c8274 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 12:48:15 -0400 Subject: [PATCH 33/34] Closing curl handle manually just to make sure --- crawl_server/remote_http.py | 1 + 1 file changed, 1 insertion(+) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index d7e2716..e916e10 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -190,6 +190,7 @@ class HttpDirectory(RemoteDirectory): path, name = os.path.split(stripped_url) date = headers.get("Last-Modified", "1970-01-01") + curl.close() return File( path=unquote(path).strip("/"), name=unquote(name), From 8b13de4a6be703422ff80d29e356b5cba417fabf Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 25 Aug 2018 16:46:53 -0400 Subject: [PATCH 34/34] Re-init curl handle on error --- crawl_server/remote_http.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index e916e10..11452c4 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -215,7 +215,7 @@ class HttpDirectory(RemoteDirectory): return content.getvalue().decode("utf-8", errors="ignore") except pycurl.error: - self.curl.close() + self.close() retries -= 1 logger.debug("TimeoutError - _fetch_body") @@ -262,7 +262,6 @@ class HttpDirectory(RemoteDirectory): def close(self): self.curl.close() - logger.debug("Closing HTTPRemoteDirectory for " + self.base_url) self.init_curl()