Added filter to check if a website can be scanned from its parent directory

This commit is contained in:
Simon
2018-07-10 10:14:23 -04:00
parent f226b82f5a
commit d138db8f06
4 changed files with 46 additions and 9 deletions

View File

@@ -92,6 +92,7 @@ class RemoteDirectoryCrawler:
if root_listing:
self.crawled_paths.append(path_id)
else:
logger.info("No files in root listing for " + self.url)
return CrawlResult(0, "empty")
directory.close()
except TimeoutError:

View File

@@ -105,6 +105,7 @@ class HttpDirectory(RemoteDirectory):
path_url = urljoin(self.base_url, path, "")
body = self._stream_body(path_url)
if not body:
logger.info("No body returned @ " + path_url)
return None, None
anchors = self._parse_links(body)