Added filter to check if a website can be scanned from its parent directory

Simon 2018-07-10 10:14:23 -04:00
parent f226b82f5a
commit d138db8f06
4 changed files with 46 additions and 9 deletions
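The new check builds on standard urllib.parse.urljoin behaviour, where joining a directory URL with "../" climbs one path level. A minimal illustration (not part of the commit; example.com is a placeholder):

from urllib.parse import urljoin

# Joining with "../" strips the last path segment (trailing slash matters):
urljoin("http://example.com/files/music/", "../")  # -> "http://example.com/files/"
urljoin("http://example.com/files/", "../")        # -> "http://example.com/"
urljoin("http://example.com/", "../")              # -> "http://example.com/" (already at root)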

app.py
@@ -322,8 +322,12 @@ def submit():
 def try_enqueue(url):
     url = os.path.join(url, "")
-    website = db.get_website_by_url(url)
+    url = od_util.get_top_directory(url)
+
+    if not od_util.is_valid_url(url):
+        return "<strong>Error:</strong> Invalid url. Make sure to include the appropriate scheme.", "warning"
+
+    website = db.get_website_by_url(url)
 
     if website:
         return "Website already exists", "danger"
@@ -331,9 +335,6 @@ def try_enqueue(url):
     if website:
         return "A parent directory of this url has already been posted", "danger"
 
-    if not od_util.is_valid_url(url):
-        return "<strong>Error:</strong> Invalid url. Make sure to include the appropriate scheme.", "danger"
-
     if db.is_blacklisted(url):
         return "<strong>Error:</strong> " \
                "Sorry, this website has been blacklisted. If you think " \


@@ -92,6 +92,7 @@ class RemoteDirectoryCrawler:
             if root_listing:
                 self.crawled_paths.append(path_id)
             else:
+                logger.info("No files in root listing for " + self.url)
                 return CrawlResult(0, "empty")
             directory.close()
         except TimeoutError:


@@ -105,6 +105,7 @@ class HttpDirectory(RemoteDirectory):
         path_url = urljoin(self.base_url, path, "")
         body = self._stream_body(path_url)
         if not body:
+            logger.info("No body returned @ " + path_url)
             return None, None
         anchors = self._parse_links(body)


@@ -180,7 +180,7 @@ def is_od(url):
         elif config.SUBMIT_HTTP:
             r = requests.get(url, timeout=30, allow_redirects=False, verify=False)
             if r.status_code != 200:
-                print("No redirects allowed!")
+                # print("No redirects allowed!")
                 return False
             soup = BeautifulSoup(r.text, "lxml")
@@ -189,20 +189,54 @@ def is_od(url):
             script_tags = len(list(soup.find_all("script")))
 
             if external_links > 11:
-                print("Too many external links!")
+                # print("Too many external links!")
                 return False
 
             if link_tags > 5:
-                print("Too many link tags!")
+                # print("Too many link tags!")
                 return False
 
             if script_tags > 7:
-                print("Too many script tags!")
+                # print("Too many script tags!")
                 return False
 
             return True
 
     except Exception as e:
-        print(e)
+        # print(e)
        return False
+
+
+def has_parent_dir(url):
+
+    parsed_url = urlparse(url)
+
+    if parsed_url.path == "/":
+        return False
+
+    parent_url = urljoin(url, "../")
+    try:
+        r = requests.get(parent_url, timeout=30, allow_redirects=False, verify=False)
+        if r.status_code != 200:
+            return False
+        soup = BeautifulSoup(r.text, "lxml")
+
+        for anchor in soup.find_all("a"):
+            if anchor.get("href") and anchor.get("href").endswith("/") and urljoin(parent_url, anchor.get("href")) == url:
+                # The parent page exists, and has a link to the child directory
+                return is_od(parent_url)
+    except:
+        return False
+
+    # Parent page exists, but does not have a link to the child directory
+    return False
+
+
+def get_top_directory(url):
+    if url.startswith("ftp://"):
+        return url
+
+    while has_parent_dir(url):
+        url = urljoin(url, "../")
+
+    return url
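As a rough usage sketch (not part of the commit; the URLs and directory layout below are hypothetical), the new helpers let the submission path climb to the topmost open directory before queueing:

import od_util

# Suppose http://example.com/files/music/ is submitted, the parent listing at
# http://example.com/files/ links back to "music/", and that parent page also
# looks like an open directory. has_parent_dir() then returns True, so
# get_top_directory() climbs one level and repeats until no scannable parent
# is found.
top = od_util.get_top_directory("http://example.com/files/music/")
# e.g. -> "http://example.com/files/"

# FTP URLs are returned unchanged:
od_util.get_top_directory("ftp://example.com/pub/")  # -> "ftp://example.com/pub/"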