Should fix the memory usage problem when crawling

Simon
2018-06-14 23:36:54 -04:00
parent 9aed18c2d2
commit adb94cf326
6 changed files with 46 additions and 12 deletions


@@ -36,7 +36,7 @@ class HttpDirectory(RemoteDirectory):
     def __init__(self, url):
         super().__init__(url)
-        self.parser = etree.HTMLParser(collect_ids=False)
+        self.parser = etree.HTMLParser(collect_ids=False, encoding='utf-8')

     def list_dir(self, path) -> list:
         results = []
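
As a side note, here is a minimal sketch of what the reconfigured parser does (the sample markup is illustrative, not from the crawler): collect_ids=False stops lxml from building its hash table of XML IDs, and a fixed encoding skips charset detection on every page.

    from lxml import etree

    # collect_ids=False: skip building the ID->element hash table,
    # which saves memory across many large directory listings.
    # encoding='utf-8': decode input bytes directly, no detection pass.
    parser = etree.HTMLParser(collect_ids=False, encoding='utf-8')

    page = b'<html><body><a href="docs/">docs/</a><a href="a.iso">a.iso</a></body></html>'
    root = etree.fromstring(page, parser=parser)

    for anchor in root.iter('a'):
        print(anchor.get('href'))  # docs/  a.iso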
@@ -103,7 +103,7 @@ class HttpDirectory(RemoteDirectory):
         while retries > 0:
             try:
                 r = requests.get(url, headers=HttpDirectory.HEADERS)
-                return r.content
+                return r.text
             except RequestException:
                 retries -= 1
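
For the second hunk, a standalone sketch of the retry loop under assumed names (fetch_body, HEADERS, and the retry count are placeholders): requests' Response.content is the raw body bytes, while Response.text is the body decoded to a str using the detected encoding.

    import requests
    from requests.exceptions import RequestException

    HEADERS = {'User-Agent': 'crawler'}  # placeholder for HttpDirectory.HEADERS

    def fetch_body(url, retries=3):
        # Retry a few times on connection errors, then give up.
        while retries > 0:
            try:
                r = requests.get(url, headers=HEADERS)
                return r.text  # decoded str; r.content would be raw bytes
            except RequestException:
                retries -= 1
        return None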