Should fix memory usage problem when crawling (part three)

Simon 2018-06-16 20:32:50 -04:00
parent 86144935e3
commit 1283cc9599
3 changed files with 9 additions and 9 deletions

app.py

@@ -83,9 +83,7 @@ def website_links(website_id):
     website = db.get_website_by_id(website_id)
     if website:
-        print("FIXME: website_links")
         links = searchEngine.get_link_list(website_id, website.url)
-        print(links)
         return Response("\n".join(links), mimetype="text/plain")
     else:
         abort(404)


@@ -37,10 +37,10 @@ class HttpDirectory(RemoteDirectory):
         results = []
         path_url = os.path.join(self.base_url, path.strip("/"), "")
-        body = self._fetch_body(path_url)
+        body, encoding = self._fetch_body(path_url)
         if not body:
             return []
-        links = self._parse_links(body)
+        links = self._parse_links(body, encoding)
         urls_to_request = []
@@ -93,13 +93,13 @@ class HttpDirectory(RemoteDirectory):
         while retries > 0:
             try:
                 r = self.session.get(url)
-                return r.content
+                return r.content, r.encoding
             except RequestException:
                 retries -= 1
         return None

-    def _parse_links(self, body: bytes) -> list:
+    def _parse_links(self, body: bytes, encoding) -> list:
         result = list()
         try:
@@ -113,7 +113,7 @@ class HttpDirectory(RemoteDirectory):
             for link in links:
                 result.append((link.text, link.get("href")))
         except UnicodeDecodeError:
-            tree = etree.HTML(body.decode("utf-8", errors="ignore").encode("utf-8"), parser=self.parser)
+            tree = etree.HTML(body.decode(encoding, errors="ignore").encode("utf-8"), parser=self.parser)
             links = []
             try:
                 links = tree.findall(".//a/[@href]")
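
The hunks above thread the response encoding from _fetch_body through to _parse_links, so directory listings that are not UTF-8 can still be decoded with the charset the server reported. A minimal standalone sketch of that flow, assuming requests and lxml as in the diff; the module-level names fetch_body/parse_links and the example URL are illustrative stand-ins for the class methods, not the project's actual API:

# Sketch of the encoding-aware fetch/parse flow from the diff above.
import requests
from requests.exceptions import RequestException
from lxml import etree


def fetch_body(session, url, retries=3):
    # Return (content, encoding) so the caller can decode with the charset
    # reported by the server instead of assuming UTF-8.
    while retries > 0:
        try:
            r = session.get(url)
            return r.content, r.encoding
        except RequestException:
            retries -= 1
    # The diff returns a bare None here; a tuple keeps the caller's
    # unpacking from raising TypeError on failed fetches.
    return None, None


def parse_links(body, encoding):
    # Decode lossily with the reported encoding, re-encode as UTF-8 and let
    # lxml parse it; this mirrors the fallback branch in the diff above.
    text = body.decode(encoding or "utf-8", errors="ignore")
    tree = etree.HTML(text.encode("utf-8"))
    if tree is None:
        return []
    return [(a.text, a.get("href")) for a in tree.findall(".//a[@href]")]


if __name__ == "__main__":
    session = requests.Session()
    body, encoding = fetch_body(session, "http://example.com/")
    print(parse_links(body, encoding) if body else [])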


@@ -4,12 +4,14 @@ import json

 payload = json.dumps({
     "website_id": 123,
-    "url": "ftp://ien11-3-88-183-194-246.fbx.proxad.net/",
+    "url": "http://alphamediazone.com/data/Movies1/",
     # "url": "http://localhost:8000/",
     "priority": 2,
     "callback_type": "",
     "callback_args": "{}"
 })

 r = requests.post("http://localhost:5001/task/put",
-                  headers={"Content-Type": "application/json"},
+                  headers={"Content-Type": "application/json",
+                           "Authorization": "Token abc"},
                   data=payload)
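
For reference, the same task-submission request wrapped in a small hypothetical helper so the API base URL and token are parameters rather than hard-coded; only the /task/put endpoint, the payload fields, and the "Token <value>" header format come from the script above, while the helper name and its signature are illustrative.

import json
import requests


def submit_task(api_url, token, website_id, url, priority=2):
    # Build the same JSON payload the test script sends.
    payload = json.dumps({
        "website_id": website_id,
        "url": url,
        "priority": priority,
        "callback_type": "",
        "callback_args": "{}"
    })
    r = requests.post(api_url + "/task/put",
                      headers={"Content-Type": "application/json",
                               "Authorization": "Token " + token},
                      data=payload)
    return r.status_code


if __name__ == "__main__":
    print(submit_task("http://localhost:5001", "abc", 123,
                      "http://alphamediazone.com/data/Movies1/"))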