Mirror of https://github.com/simon987/od-database.git (synced 2025-04-19 18:36:44 +00:00)
Added filter to check if a website can be scanned from its parent directory

This commit is contained in:
parent f226b82f5a
commit d138db8f06

app.py (9 changes)
@@ -322,8 +322,12 @@ def submit():
 def try_enqueue(url):
     url = os.path.join(url, "")
-    website = db.get_website_by_url(url)
+    url = od_util.get_top_directory(url)
+
+    if not od_util.is_valid_url(url):
+        return "<strong>Error:</strong> Invalid url. Make sure to include the appropriate scheme.", "warning"
+
+    website = db.get_website_by_url(url)
 
     if website:
         return "Website already exists", "danger"
 
@@ -331,9 +335,6 @@ def try_enqueue(url):
     if website:
         return "A parent directory of this url has already been posted", "danger"
 
-    if not od_util.is_valid_url(url):
-        return "<strong>Error:</strong> Invalid url. Make sure to include the appropriate scheme.", "danger"
-
     if db.is_blacklisted(url):
         return "<strong>Error:</strong> " \
                "Sorry, this website has been blacklisted. If you think " \
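Taken together, the two app.py hunks reorder try_enqueue(): the submitted URL is first normalized and walked up to its top crawlable directory, then validated (the validity error's severity flag changes from "danger" to "warning"), and only then checked against the database, so a child path of an already-indexed root no longer slips past the duplicate check. A minimal sketch of the normalization step, using only the standard library; the example URL is made up, and os.path.join(url, "") assumes POSIX path separators:

import os
from urllib.parse import urljoin

submitted = "http://example.com/pub/linux/isos"

# os.path.join(url, "") appends a trailing slash when one is missing,
# so ".../isos" and ".../isos/" map to the same database key.
url = os.path.join(submitted, "")
print(url)                   # http://example.com/pub/linux/isos/

# od_util.get_top_directory() then climbs one level at a time with
# urljoin(url, "../") for as long as has_parent_dir() returns True.
print(urljoin(url, "../"))   # http://example.com/pub/linux/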
@@ -92,6 +92,7 @@ class RemoteDirectoryCrawler:
             if root_listing:
                 self.crawled_paths.append(path_id)
             else:
+                logger.info("No files in root listing for " + self.url)
                 return CrawlResult(0, "empty")
             directory.close()
         except TimeoutError:
@@ -105,6 +105,7 @@ class HttpDirectory(RemoteDirectory):
         path_url = urljoin(self.base_url, path, "")
         body = self._stream_body(path_url)
         if not body:
+            logger.info("No body returned @ " + path_url)
             return None, None
         anchors = self._parse_links(body)
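Both crawl-server hunks add an info-level breadcrumb on the "empty result" paths, so a crawl that yields nothing is distinguishable from one that never got a listing at all. A rough sketch of what these lines produce, assuming a stock logging setup; the crawl server's real logger is configured elsewhere and is not part of this diff, and the logger name below is hypothetical:

import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("crawl_server")  # hypothetical name, stands in for the module's logger

logger.info("No files in root listing for " + "http://example.com/pub/")
# 2025-04-19 18:36:44,000 INFO No files in root listing for http://example.com/pub/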
od_util.py (44 changes)
@@ -180,7 +180,7 @@ def is_od(url):
     elif config.SUBMIT_HTTP:
         r = requests.get(url, timeout=30, allow_redirects=False, verify=False)
         if r.status_code != 200:
-            print("No redirects allowed!")
+            # print("No redirects allowed!")
             return False
         soup = BeautifulSoup(r.text, "lxml")
 
@@ -189,20 +189,54 @@ def is_od(url):
         script_tags = len(list(soup.find_all("script")))
 
         if external_links > 11:
-            print("Too many external links!")
+            # print("Too many external links!")
             return False
 
         if link_tags > 5:
-            print("Too many link tags!")
+            # print("Too many link tags!")
             return False
 
         if script_tags > 7:
-            print("Too many script tags!")
+            # print("Too many script tags!")
             return False
 
         return True
 
     except Exception as e:
-        print(e)
+        # print(e)
         return False
+
+
+def has_parent_dir(url):
+
+    parsed_url = urlparse(url)
+
+    if parsed_url.path == "/":
+        return False
+
+    parent_url = urljoin(url, "../")
+    try:
+        r = requests.get(parent_url, timeout=30, allow_redirects=False, verify=False)
+        if r.status_code != 200:
+            return False
+        soup = BeautifulSoup(r.text, "lxml")
+
+        for anchor in soup.find_all("a"):
+            if anchor.get("href") and anchor.get("href").endswith("/") and urljoin(parent_url, anchor.get("href")) == url:
+                # The parent page exists, and has a link to the child directory
+                return is_od(parent_url)
+
+    except:
+        return False
+
+    # Parent page exists, but does not have a link to the child directory
+    return False
+
+
+def get_top_directory(url):
+    if url.startswith("ftp://"):
+        return url
+
+    while has_parent_dir(url):
+        url = urljoin(url, "../")
+    return url
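has_parent_dir() accepts a parent only when the page one level up both answers with 200 and contains an anchor that resolves back to the child directory; get_top_directory() repeats that probe until it fails (FTP URLs are returned unchanged, since the HTML-listing check does not apply to them). A small self-contained illustration of the anchor test, with a made-up host and listing body standing in for what requests.get() would fetch:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "http://example.com/pub/linux/"     # child directory (trailing slash matters)
parent_url = urljoin(url, "../")          # http://example.com/pub/

# Stand-in for the parent page's body; the real code fetches it over HTTP.
html = '<a href="../">..</a> <a href="linux/">linux/</a> <a href="readme.txt">readme.txt</a>'

soup = BeautifulSoup(html, "lxml")
for anchor in soup.find_all("a"):
    href = anchor.get("href")
    # Same test as has_parent_dir(): a directory-style href that resolves to the child.
    if href and href.endswith("/") and urljoin(parent_url, href) == url:
        print("parent links back to the child:", href)   # linux/

Note that the equality test only matches because the child URL carries a trailing slash, which is exactly what the os.path.join(url, "") normalization in submit() guarantees.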