From 8a73142ff8621972511df836b82c61d017515c00 Mon Sep 17 00:00:00 2001
From: Simon
Date: Mon, 18 Jun 2018 13:44:19 -0400
Subject: [PATCH] Support encodings other than UTF-8 and remove leftover debug code
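
Response bodies are no longer assumed to be UTF-8: _stream_body() now
decodes each chunk with the encoding that requests reports for the
response (r.encoding), and _parse_links() receives the decoded text
instead of raw bytes. The HEAD and GET calls run inside
warnings.catch_warnings() with simplefilter("ignore"), silencing
warnings emitted by the HTTP stack while crawling. The
gc.set_debug(gc.DEBUG_LEAK) leftovers from earlier leak hunting are
removed, and debug_put.py points at a different test directory.

For reference, a minimal sketch of the new decode path outside the
crawler. The fallback is this sketch's addition, not part of the patch:
requests leaves r.encoding as None when the response has no charset and
a non-text content type, and bytes.decode(None) would raise TypeError.

    import requests

    def stream_text(url, chunk_size=4096):
        # Stream the body and decode every chunk with the encoding
        # requests inferred from the Content-Type header.
        r = requests.get(url, stream=True, timeout=40)
        encoding = r.encoding or "utf-8"  # fallback added for the sketch
        try:
            for chunk in r.iter_content(chunk_size=chunk_size):
                # errors="ignore" also drops multibyte sequences that
                # happen to be split across chunk boundaries
                yield chunk.decode(encoding, errors="ignore")
        finally:
            r.close()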
---
 crawl_server/crawler.py      |  2 --
 crawl_server/remote_http.py  | 70 +++++++++++++++++++-----------------
 crawl_server/task_manager.py |  3 ---
 debug_put.py                 |  2 +-
 4 files changed, 38 insertions(+), 39 deletions(-)

diff --git a/crawl_server/crawler.py b/crawl_server/crawler.py
index 9d3177e..4acb78a 100644
--- a/crawl_server/crawler.py
+++ b/crawl_server/crawler.py
@@ -80,8 +80,6 @@ class RemoteDirectoryCrawler:

     def crawl_directory(self, out_file: str) -> CrawlResult:
-        import gc
-        gc.set_debug(gc.DEBUG_LEAK)
         try:
             directory = RemoteDirectoryFactory.get_directory(self.url)
             root_listing = directory.list_dir("")

diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py
index 99a6345..74e8003 100644
--- a/crawl_server/remote_http.py
+++ b/crawl_server/remote_http.py
@@ -1,4 +1,5 @@
 from urllib.parse import unquote, urljoin
+import warnings
 import os
 from html.parser import HTMLParser
 from itertools import repeat
@@ -118,44 +119,47 @@ class HttpDirectory(RemoteDirectory):


     def _request_file(self, url):
-        retries = HttpDirectory.MAX_RETRIES
-        while retries > 0:
-            try:
-                r = self.session.head(url, allow_redirects=False, timeout=40)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            retries = HttpDirectory.MAX_RETRIES
+            while retries > 0:
+                try:
+                    r = self.session.head(url, allow_redirects=False, timeout=40)

-                stripped_url = url[len(self.base_url) - 1:]
+                    stripped_url = url[len(self.base_url) - 1:]

-                path, name = os.path.split(stripped_url)
-                date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"
-                return File(
-                    path=unquote(path).strip("/"),
-                    name=unquote(name),
-                    size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
-                    mtime=int(parse_date(date).timestamp()),
-                    is_dir=False
-                )
-            except RequestException:
-                self.session.close()
-                retries -= 1
+                    path, name = os.path.split(stripped_url)
+                    date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"
+                    return File(
+                        path=unquote(path).strip("/"),
+                        name=unquote(name),
+                        size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
+                        mtime=int(parse_date(date).timestamp()),
+                        is_dir=False
+                    )
+                except RequestException:
+                    self.session.close()
+                    retries -= 1

-        return None
+            return None

     def _stream_body(self, url: str):
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            retries = HttpDirectory.MAX_RETRIES
+            while retries > 0:
+                try:
+                    r = self.session.get(url, stream=True, timeout=40)
+                    for chunk in r.iter_content(chunk_size=4096):
+                        yield chunk.decode(r.encoding, errors="ignore")
+                    r.close()
+                    del r
+                    break
+                except RequestException:
+                    self.session.close()
+                    retries -= 1

-        retries = HttpDirectory.MAX_RETRIES
-        while retries > 0:
-            try:
-                r = self.session.get(url, stream=True, timeout=40)
-                for chunk in r.iter_content(chunk_size=4096):
-                    yield chunk
-                r.close()
-                del r
-                break
-            except RequestException:
-                self.session.close()
-                retries -= 1
-
-        return None
+            return None

     @staticmethod
     def _parse_links(body):
@@ -163,7 +167,7 @@ class HttpDirectory(RemoteDirectory):
         parser = HTMLAnchorParser()

         for chunk in body:
-            parser.feed(chunk.decode("utf-8", errors="ignore"))
+            parser.feed(chunk)

         for anchor in parser.anchors:
             yield anchor
diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py
index df98b77..82b4687 100644
--- a/crawl_server/task_manager.py
+++ b/crawl_server/task_manager.py
@@ -53,9 +53,6 @@ class TaskManager:

     @staticmethod
     def run_task(task, db_path, current_tasks):
-        # import gc
-        # gc.set_debug(gc.DEBUG_LEAK)
-
         result = TaskResult()
         result.start_time = datetime.utcnow()
         result.website_id = task.website_id
diff --git a/debug_put.py b/debug_put.py
index 1e18546..450f4c9 100644
--- a/debug_put.py
+++ b/debug_put.py
@@ -4,7 +4,7 @@ import json

 payload = json.dumps({
     "website_id": 123,
-    "url": "http://liminaire.fr/TEXTES/",
+    "url": "https://computerarchive.org/files/computer/",
     # "url": "http://localhost:8000/",
     # "url": "http://ubuntu.mirrorservice.org/",
     "priority": 2,