mirror of
https://github.com/simon987/od-database.git
synced 2025-04-20 10:56:47 +00:00
Bug fixes with html parsing
This commit is contained in:
parent
f3c7b551d2
commit
81fde6cc30
@ -36,7 +36,7 @@ class HttpDirectory(RemoteDirectory):
|
|||||||
|
|
||||||
def __init__(self, url):
|
def __init__(self, url):
|
||||||
super().__init__(url)
|
super().__init__(url)
|
||||||
self.parser = etree.HTMLParser(collect_ids=False, encoding="utf-8")
|
self.parser = etree.HTMLParser(collect_ids=False)
|
||||||
|
|
||||||
def list_dir(self, path) -> list:
|
def list_dir(self, path) -> list:
|
||||||
results = []
|
results = []
|
||||||
@ -103,17 +103,21 @@ class HttpDirectory(RemoteDirectory):
|
|||||||
while retries > 0:
|
while retries > 0:
|
||||||
try:
|
try:
|
||||||
r = requests.get(url, headers=HttpDirectory.HEADERS)
|
r = requests.get(url, headers=HttpDirectory.HEADERS)
|
||||||
return r.text
|
return r.content
|
||||||
except RequestException:
|
except RequestException:
|
||||||
retries -= 1
|
retries -= 1
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _parse_links(self, body: str) -> set:
|
def _parse_links(self, body: bytes) -> set:
|
||||||
|
|
||||||
result = set()
|
result = set()
|
||||||
tree = etree.HTML(body, parser=self.parser)
|
tree = etree.HTML(body, parser=self.parser)
|
||||||
|
links = []
|
||||||
|
try:
|
||||||
links = tree.findall(".//a/[@href]")
|
links = tree.findall(".//a/[@href]")
|
||||||
|
except AttributeError:
|
||||||
|
pass
|
||||||
|
|
||||||
for link in links:
|
for link in links:
|
||||||
result.add(Link(link.text, link.get("href")))
|
result.add(Link(link.text, link.get("href")))
|
||||||
|
@ -9,7 +9,7 @@ auth = HTTPTokenAuth(scheme="Token")
|
|||||||
|
|
||||||
tokens = [config.CRAWL_SERVER_TOKEN]
|
tokens = [config.CRAWL_SERVER_TOKEN]
|
||||||
|
|
||||||
tm = TaskManager("tm_db.sqlite3", 8)
|
tm = TaskManager("tm_db.sqlite3", 64)
|
||||||
|
|
||||||
|
|
||||||
@auth.verify_token
|
@auth.verify_token
|
||||||
|
@ -54,7 +54,7 @@ class TaskManager:
|
|||||||
|
|
||||||
print("Starting task " + task.url)
|
print("Starting task " + task.url)
|
||||||
|
|
||||||
crawler = RemoteDirectoryCrawler(task.url, 100)
|
crawler = RemoteDirectoryCrawler(task.url, 10)
|
||||||
crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
|
crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
|
||||||
|
|
||||||
result.file_count = crawl_result.file_count
|
result.file_count = crawl_result.file_count
|
||||||
|
@ -15,3 +15,4 @@ elasticsearch
|
|||||||
python-dateutil
|
python-dateutil
|
||||||
flask_httpauth
|
flask_httpauth
|
||||||
ujson
|
ujson
|
||||||
|
timeout_decorator
|
||||||
|
1
task.py
1
task.py
@ -91,6 +91,7 @@ class TaskDispatcher:
|
|||||||
for task in server.fetch_completed_tasks():
|
for task in server.fetch_completed_tasks():
|
||||||
print("Completed task")
|
print("Completed task")
|
||||||
file_list = server.fetch_website_files(task.website_id)
|
file_list = server.fetch_website_files(task.website_id)
|
||||||
|
if file_list:
|
||||||
self.search.import_json(file_list, task.website_id)
|
self.search.import_json(file_list, task.website_id)
|
||||||
|
|
||||||
def dispatch_task(self, task: Task):
|
def dispatch_task(self, task: Task):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user