Should fix memory usage problem when crawling (part three)

Simon 2018-06-16 20:32:50 -04:00
parent 86144935e3
commit 1283cc9599
3 changed files with 9 additions and 9 deletions

app.py
View File

@@ -83,9 +83,7 @@ def website_links(website_id):
     website = db.get_website_by_id(website_id)
     if website:
-        print("FIXME: website_links")
         links = searchEngine.get_link_list(website_id, website.url)
-        print(links)
         return Response("\n".join(links), mimetype="text/plain")
     else:
         abort(404)
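
For context, what survives in this route is straightforward: look up the website, fetch its link list, and return it newline-joined as plain text. A minimal, self-contained sketch of that shape is below; the route path, get_website_by_id and get_link_list are illustrative stand-ins for the project's db and searchEngine helpers, not the repository's actual code.

from flask import Flask, Response, abort

app = Flask(__name__)

# Hypothetical stand-ins for db.get_website_by_id and searchEngine.get_link_list
def get_website_by_id(website_id):
    return {"id": website_id, "url": "http://example.com/"} if website_id == 123 else None

def get_link_list(website_id, base_url):
    return [base_url + "a.txt", base_url + "b.txt"]

@app.route("/website/<int:website_id>/links")
def website_links(website_id):
    website = get_website_by_id(website_id)
    if website:
        links = get_link_list(website_id, website["url"])
        # No debug prints: the list is only materialised once, in the response body.
        return Response("\n".join(links), mimetype="text/plain")
    else:
        abort(404)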

View File

@@ -37,10 +37,10 @@ class HttpDirectory(RemoteDirectory):
         results = []
         path_url = os.path.join(self.base_url, path.strip("/"), "")
-        body = self._fetch_body(path_url)
+        body, encoding = self._fetch_body(path_url)
         if not body:
             return []
-        links = self._parse_links(body)
+        links = self._parse_links(body, encoding)
         urls_to_request = []
@@ -93,13 +93,13 @@ class HttpDirectory(RemoteDirectory):
         while retries > 0:
             try:
                 r = self.session.get(url)
-                return r.content
+                return r.content, r.encoding
             except RequestException:
                 retries -= 1
         return None
-    def _parse_links(self, body: bytes) -> list:
+    def _parse_links(self, body: bytes, encoding) -> list:
         result = list()
         try:
@@ -113,7 +113,7 @@ class HttpDirectory(RemoteDirectory):
             for link in links:
                 result.append((link.text, link.get("href")))
         except UnicodeDecodeError:
-            tree = etree.HTML(body.decode("utf-8", errors="ignore").encode("utf-8"), parser=self.parser)
+            tree = etree.HTML(body.decode(encoding, errors="ignore").encode("utf-8"), parser=self.parser)
             links = []
             try:
                 links = tree.findall(".//a/[@href]")
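
Taken together, these hunks change _fetch_body to hand back both the raw bytes and the encoding that requests detected, so _parse_links can decode with the server-reported charset instead of assuming UTF-8. A standalone sketch of that fetch-and-decode pattern, assuming only the requests library (the URL and retry count are illustrative):

import requests
from requests.exceptions import RequestException

def fetch_body(session: requests.Session, url: str, retries: int = 3):
    """Return (raw bytes, server-reported encoding), or None after repeated failures."""
    while retries > 0:
        try:
            r = session.get(url)
            # r.content is the undecoded payload; r.encoding is what requests
            # inferred from the Content-Type header (it may be None).
            return r.content, r.encoding
        except RequestException:
            retries -= 1
    return None

if __name__ == "__main__":
    fetched = fetch_body(requests.Session(), "http://example.com/")
    if fetched:
        body, encoding = fetched
        # Decode with the reported encoding when available, ignoring bytes
        # that do not decode, instead of hard-coding "utf-8".
        print(body.decode(encoding or "utf-8", errors="ignore")[:200])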

View File

@@ -4,12 +4,14 @@ import json
 payload = json.dumps({
     "website_id": 123,
-    "url": "ftp://ien11-3-88-183-194-246.fbx.proxad.net/",
+    "url": "http://alphamediazone.com/data/Movies1/",
+    # "url": "http://localhost:8000/",
     "priority": 2,
     "callback_type": "",
     "callback_args": "{}"
 })
 r = requests.post("http://localhost:5001/task/put",
-                  headers={"Content-Type": "application/json"},
+                  headers={"Content-Type": "application/json",
+                           "Authorization": "Token abc"},
                   data=payload)
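
The added "Authorization: Token abc" header suggests the task endpoint now expects a shared token; the server side of that check is not part of this commit. Purely as a hypothetical illustration (the route and token value mirror the test script, everything else is assumed), a Flask-style guard could look like this:

from flask import Flask, request, abort, jsonify

app = Flask(__name__)
API_TOKEN = "Token abc"  # hypothetical: mirrors what the test script sends

@app.route("/task/put", methods=["POST"])
def put_task():
    # Reject requests without the expected Authorization header.
    if request.headers.get("Authorization") != API_TOKEN:
        abort(403)
    task = request.get_json(force=True)
    # ... enqueue the crawl task here ...
    return jsonify({"ok": True, "website_id": task.get("website_id")})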