diff --git a/app.py b/app.py index 9fe8d3f..1a077ae 100644 --- a/app.py +++ b/app.py @@ -83,9 +83,7 @@ def website_links(website_id): website = db.get_website_by_id(website_id) if website: - print("FIXME: website_links") links = searchEngine.get_link_list(website_id, website.url) - print(links) return Response("\n".join(links), mimetype="text/plain") else: abort(404) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index c146b49..76ccc6d 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -37,10 +37,10 @@ class HttpDirectory(RemoteDirectory): results = [] path_url = os.path.join(self.base_url, path.strip("/"), "") - body = self._fetch_body(path_url) + body, encoding = self._fetch_body(path_url) if not body: return [] - links = self._parse_links(body) + links = self._parse_links(body, encoding) urls_to_request = [] @@ -93,13 +93,13 @@ class HttpDirectory(RemoteDirectory): while retries > 0: try: r = self.session.get(url) - return r.content + return r.content, r.encoding except RequestException: retries -= 1 return None - def _parse_links(self, body: bytes) -> list: + def _parse_links(self, body: bytes, encoding) -> list: result = list() try: @@ -113,7 +113,7 @@ class HttpDirectory(RemoteDirectory): for link in links: result.append((link.text, link.get("href"))) except UnicodeDecodeError: - tree = etree.HTML(body.decode("utf-8", errors="ignore").encode("utf-8"), parser=self.parser) + tree = etree.HTML(body.decode(encoding, errors="ignore").encode("utf-8"), parser=self.parser) links = [] try: links = tree.findall(".//a/[@href]") diff --git a/debug_put.py b/debug_put.py index ce20196..8dd789d 100644 --- a/debug_put.py +++ b/debug_put.py @@ -4,12 +4,14 @@ import json payload = json.dumps({ "website_id": 123, - "url": "ftp://ien11-3-88-183-194-246.fbx.proxad.net/", + "url": "http://alphamediazone.com/data/Movies1/", + # "url": "http://localhost:8000/", "priority": 2, "callback_type": "", "callback_args": "{}" }) r = requests.post("http://localhost:5001/task/put", - headers={"Content-Type": "application/json"}, + headers={"Content-Type": "application/json", + "Authorization": "Token abc"}, data=payload)