Mirror of https://github.com/simon987/od-database.git (synced 2025-12-11 14:08:51 +00:00)
More bugfixes for looping directories, some work on task dispatching
This commit is contained in:
@@ -87,8 +87,8 @@ class RemoteDirectoryCrawler:
|
||||
try:
|
||||
try:
|
||||
directory = RemoteDirectoryFactory.get_directory(self.url)
|
||||
path, root_listing = directory.list_dir("")
|
||||
self.crawled_paths.append(path)
|
||||
path_id, root_listing = directory.list_dir(urlparse(self.url).path)
|
||||
self.crawled_paths.append(path_id)
|
||||
directory.close()
|
||||
except TimeoutError:
|
||||
return CrawlResult(0, "timeout")
|
||||
|
||||
@@ -73,6 +73,11 @@ class HttpDirectory(RemoteDirectory):
|
||||
"?C=N&O=A",
|
||||
"?C=N&O=A"
|
||||
)
|
||||
FILE_NAME_BLACKLIST = (
|
||||
"Parent Directory",
|
||||
"../"
|
||||
|
||||
)
|
||||
MAX_RETRIES = 3
|
||||
|
||||
def __init__(self, url):
|
||||
@@ -114,8 +119,8 @@ class HttpDirectory(RemoteDirectory):
|
||||
urls_to_request.append(urljoin(path_url, anchor.href))
|
||||
|
||||
for file in self.request_files(urls_to_request):
|
||||
files.append(file)
|
||||
path_identifier.update(bytes(file))
|
||||
files.append(file)
|
||||
|
||||
return path_identifier.hexdigest(), files
|
||||
|
||||
@@ -197,7 +202,7 @@ class HttpDirectory(RemoteDirectory):
|
||||
|
||||
@staticmethod
|
||||
def _should_ignore(base_url, link: Anchor):
|
||||
if link.text == "../" or link.href == "../" or link.href == "./" or link.href == "" \
|
||||
if link.text in HttpDirectory.FILE_NAME_BLACKLIST or link.href in ("../", "./", "") \
|
||||
or link.href.endswith(HttpDirectory.BLACK_LIST):
|
||||
return True
|
||||
|
||||
|
||||
Reference in New Issue
Block a user