Bug fixes with html parsing

Simon 2018-06-14 20:02:06 -04:00
parent f3c7b551d2
commit 81fde6cc30
5 changed files with 13 additions and 7 deletions

@@ -36,7 +36,7 @@ class HttpDirectory(RemoteDirectory):

     def __init__(self, url):
         super().__init__(url)
-        self.parser = etree.HTMLParser(collect_ids=False, encoding="utf-8")
+        self.parser = etree.HTMLParser(collect_ids=False)

     def list_dir(self, path) -> list:
         results = []

@@ -103,17 +103,21 @@ class HttpDirectory(RemoteDirectory):
         while retries > 0:
             try:
                 r = requests.get(url, headers=HttpDirectory.HEADERS)
-                return r.text
+                return r.content
             except RequestException:
                 retries -= 1
         return None

-    def _parse_links(self, body: str) -> set:
+    def _parse_links(self, body: bytes) -> set:
         result = set()

         tree = etree.HTML(body, parser=self.parser)
-        links = tree.findall(".//a/[@href]")
+        links = []
+        try:
+            links = tree.findall(".//a/[@href]")
+        except AttributeError:
+            pass

         for link in links:
             result.add(Link(link.text, link.get("href")))
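
This hunk switches the crawler from decoded text to raw bytes: r.content skips the charset guessing requests performs for r.text, dropping encoding="utf-8" lets lxml detect the encoding from the document itself, and the new try/except tolerates bodies for which etree.HTML produces no tree (calling .findall() on None raises AttributeError). A minimal, self-contained sketch of the fixed parsing path; the Link namedtuple and the URL here are stand-ins, not the project's actual definitions:

from collections import namedtuple

import requests
from lxml import etree

# Stand-in for the project's Link class; hashable, so it can live in a set.
Link = namedtuple("Link", ["text", "url"])

# No explicit encoding: with bytes input, lxml sniffs the charset itself.
parser = etree.HTMLParser(collect_ids=False)


def parse_links(body: bytes) -> set:
    result = set()
    tree = etree.HTML(body, parser=parser)
    links = []
    try:
        # etree.HTML() can yield None for empty or markup-free bodies,
        # in which case .findall() raises AttributeError.
        links = tree.findall(".//a/[@href]")
    except AttributeError:
        pass
    for link in links:
        result.add(Link(link.text, link.get("href")))
    return result


r = requests.get("http://example.com/")
print(parse_links(r.content))  # r.content is bytes; r.text would decode first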

@@ -9,7 +9,7 @@ auth = HTTPTokenAuth(scheme="Token")

 tokens = [config.CRAWL_SERVER_TOKEN]

-tm = TaskManager("tm_db.sqlite3", 8)
+tm = TaskManager("tm_db.sqlite3", 64)


 @auth.verify_token

@@ -54,7 +54,7 @@ class TaskManager:
             print("Starting task " + task.url)

-            crawler = RemoteDirectoryCrawler(task.url, 100)
+            crawler = RemoteDirectoryCrawler(task.url, 10)
             crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")

             result.file_count = crawl_result.file_count
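
Paired with the TaskManager pool bump above (8 to 64), this reads as a concurrency rebalance: many more crawl tasks in flight, each using far fewer threads. A hypothetical sketch of that shape using concurrent.futures; the project's actual TaskManager and RemoteDirectoryCrawler internals are not shown in this diff:

from concurrent.futures import ThreadPoolExecutor

MAX_CONCURRENT_TASKS = 64  # second argument to TaskManager (was 8)
THREADS_PER_CRAWL = 10     # second argument to RemoteDirectoryCrawler (was 100)


def crawl_directory(url: str) -> int:
    # Stand-in for RemoteDirectoryCrawler.crawl_directory(): each task
    # gets its own small worker pool instead of 100 threads apiece.
    with ThreadPoolExecutor(max_workers=THREADS_PER_CRAWL) as workers:
        return sum(workers.map(len, [url] * 4))


# Stand-in for the task pool: up to 64 crawls in flight at once.
with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_TASKS) as tasks:
    counts = list(tasks.map(crawl_directory, ["http://example.com/a", "http://example.com/b"]))
    print(counts)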

@@ -15,3 +15,4 @@ elasticsearch
 python-dateutil
 flask_httpauth
 ujson
+timeout_decorator
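
timeout_decorator is a new dependency whose call site is not shown in this diff; presumably it bounds long-running crawl steps. Basic usage of the package, which by default relies on SIGALRM and therefore only works from the main thread (use_signals=False selects a thread-based variant):

import time

import timeout_decorator


@timeout_decorator.timeout(5)  # abort the call after 5 seconds
def slow_step():
    time.sleep(10)


try:
    slow_step()
except timeout_decorator.TimeoutError:
    print("step timed out")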

@@ -91,7 +91,8 @@ class TaskDispatcher:
         for task in server.fetch_completed_tasks():
             print("Completed task")
             file_list = server.fetch_website_files(task.website_id)
-            self.search.import_json(file_list, task.website_id)
+            if file_list:
+                self.search.import_json(file_list, task.website_id)

     def dispatch_task(self, task: Task):
         self._get_available_crawl_server().queue_task(task)
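
The new guard presumably covers fetch_website_files() returning None or an empty list when a completed task produced no files, so import_json() is no longer called with nothing to import.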