Should fix memory usage problem when crawling (part three)

Simon 2018-06-16 20:32:50 -04:00
parent 86144935e3
commit 1283cc9599
3 changed files with 9 additions and 9 deletions

app.py
View File

@@ -83,9 +83,7 @@ def website_links(website_id):
     website = db.get_website_by_id(website_id)
     if website:
-        print("FIXME: website_links")
         links = searchEngine.get_link_list(website_id, website.url)
-        print(links)
         return Response("\n".join(links), mimetype="text/plain")
     else:
         abort(404)
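
For context, what survives in this route is straightforward: look up the website, fetch its link list, and return it newline-joined as plain text. A minimal, self-contained sketch of that shape is below; the route path, get_website_by_id and get_link_list are illustrative stand-ins for the project's db and searchEngine helpers, not the repository's actual code.

from flask import Flask, Response, abort

app = Flask(__name__)

# Hypothetical stand-ins for db.get_website_by_id and searchEngine.get_link_list
def get_website_by_id(website_id):
    return {"id": website_id, "url": "http://example.com/"} if website_id == 123 else None

def get_link_list(website_id, base_url):
    return [base_url + "a.txt", base_url + "b.txt"]

@app.route("/website/<int:website_id>/links")
def website_links(website_id):
    website = get_website_by_id(website_id)
    if website:
        links = get_link_list(website_id, website["url"])
        # No debug prints: the list is only materialised once, in the response body.
        return Response("\n".join(links), mimetype="text/plain")
    else:
        abort(404)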

View File

@@ -37,10 +37,10 @@ class HttpDirectory(RemoteDirectory):
         results = []
         path_url = os.path.join(self.base_url, path.strip("/"), "")
-        body = self._fetch_body(path_url)
+        body, encoding = self._fetch_body(path_url)
         if not body:
             return []
-        links = self._parse_links(body)
+        links = self._parse_links(body, encoding)
         urls_to_request = []
@@ -93,13 +93,13 @@ class HttpDirectory(RemoteDirectory):
         while retries > 0:
             try:
                 r = self.session.get(url)
-                return r.content
+                return r.content, r.encoding
             except RequestException:
                 retries -= 1
         return None
-    def _parse_links(self, body: bytes) -> list:
+    def _parse_links(self, body: bytes, encoding) -> list:
         result = list()
         try:
@@ -113,7 +113,7 @@ class HttpDirectory(RemoteDirectory):
             for link in links:
                 result.append((link.text, link.get("href")))
         except UnicodeDecodeError:
-            tree = etree.HTML(body.decode("utf-8", errors="ignore").encode("utf-8"), parser=self.parser)
+            tree = etree.HTML(body.decode(encoding, errors="ignore").encode("utf-8"), parser=self.parser)
             links = []
             try:
                 links = tree.findall(".//a/[@href]")
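
Taken together, these hunks change _fetch_body to hand back both the raw bytes and the encoding that requests detected, so _parse_links can decode with the server-reported charset instead of assuming UTF-8. A standalone sketch of that fetch-and-decode pattern, assuming only the requests library (the URL and retry count are illustrative):

import requests
from requests.exceptions import RequestException

def fetch_body(session: requests.Session, url: str, retries: int = 3):
    """Return (raw bytes, server-reported encoding), or None after repeated failures."""
    while retries > 0:
        try:
            r = session.get(url)
            # r.content is the undecoded payload; r.encoding is what requests
            # inferred from the Content-Type header (it may be None).
            return r.content, r.encoding
        except RequestException:
            retries -= 1
    return None

if __name__ == "__main__":
    fetched = fetch_body(requests.Session(), "http://example.com/")
    if fetched:
        body, encoding = fetched
        # Decode with the reported encoding when available, ignoring bytes
        # that do not decode, instead of hard-coding "utf-8".
        print(body.decode(encoding or "utf-8", errors="ignore")[:200])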

View File

@@ -4,12 +4,14 @@ import json
 payload = json.dumps({
     "website_id": 123,
-    "url": "ftp://ien11-3-88-183-194-246.fbx.proxad.net/",
+    "url": "http://alphamediazone.com/data/Movies1/",
+    # "url": "http://localhost:8000/",
     "priority": 2,
     "callback_type": "",
     "callback_args": "{}"
 })
 r = requests.post("http://localhost:5001/task/put",
-                  headers={"Content-Type": "application/json"},
+                  headers={"Content-Type": "application/json",
+                           "Authorization": "Token abc"},
                   data=payload)
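
The added "Authorization: Token abc" header suggests the task endpoint now expects a shared token; the server side of that check is not part of this commit. Purely as a hypothetical illustration (the route and token value mirror the test script, everything else is assumed), a Flask-style guard could look like this:

from flask import Flask, request, abort, jsonify

app = Flask(__name__)
API_TOKEN = "Token abc"  # hypothetical: mirrors what the test script sends

@app.route("/task/put", methods=["POST"])
def put_task():
    # Reject requests without the expected Authorization header.
    if request.headers.get("Authorization") != API_TOKEN:
        abort(403)
    task = request.get_json(force=True)
    # ... enqueue the crawl task here ...
    return jsonify({"ok": True, "website_id": task.get("website_id")})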