From f32f9ee3a6a17328c4a329fb3d742f296408d559 Mon Sep 17 00:00:00 2001
From: simon
Date: Tue, 6 Feb 2018 16:03:18 -0500
Subject: [PATCH] Fixed handling of a problematic type of OD

---
 crawler.py           |  3 ++-
 parser.py            | 42 +++++++++++++++++++++++-------------------
 problematic websites |  1 -
 3 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/crawler.py b/crawler.py
index 5e7ab7f..19a37d7 100644
--- a/crawler.py
+++ b/crawler.py
@@ -92,7 +92,8 @@ class Crawler:
 
 
 if __name__ == "__main__":
-    c = Crawler("http://cnqzu.com/library/Anarchy%20Folder/Computers/", True)
+    c = Crawler("https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/", True)
     c.crawl()
+    print(c.files)
     c.store_report("000008")
 
diff --git a/parser.py b/parser.py
index b6cb2aa..59d0c13 100644
--- a/parser.py
+++ b/parser.py
@@ -50,8 +50,10 @@ class PageParser:
 
     @staticmethod
     def should_save_link(text):
-        return text.lower().find("parent directory") == -1 and text != "Name" and text != "Last modified" and \
-               text != "Size" and text != "Description " and text != "Description" and text != "../"
+        # Check for None and empty text first, so .lower() is never called on None
+        return text is not None and text != "" and \
+               text.lower().find("parent directory") == -1 and text != "Name" and text != "Last modified" and \
+               text != "Size" and text != "Description " and text != "Description" and text != "../"
 
     @staticmethod
     def file_type(link):
@@ -170,28 +172,30 @@ class ApacheParser(PageParser):
 
             if len(row.find_all("th")) > 0:
                 continue
-            link = row.find("a")
+            links_in_row = row.find_all("a")
 
-            if link is None:
-                # Exited directory listing
-                return links
-            if PageParser.should_save_link(link.text):
+            for link in links_in_row:
+                if link is None:
+                    # Exited directory listing
+                    return links
 
-                target = link.get("href")
-                file_type = PageParser.file_type(target)
-                full_link = urljoin(base_url, target)
+                if PageParser.should_save_link(link.text):
 
-                if file_type == "f":
-                    extension = os.path.splitext(full_link)[1].strip(".")
+                    target = link.get("href")
+                    file_type = PageParser.file_type(target)
+                    full_link = urljoin(base_url, target)
 
-                    cols = row.find_all("td")
-                    for i in range(len(cols)):
-                        cols[i] = cols[i].string if cols[i].string is not None else "-"
-                    size = self.get_size(cols)
+                    if file_type == "f":
+                        extension = os.path.splitext(full_link)[1].strip(".")
 
-                    links[target] = dict(link=full_link, size=size, ext=extension, type=file_type)
-                else:
-                    links[target] = dict(link=full_link, type=file_type)
+                        cols = row.find_all("td")
+                        for i in range(len(cols)):
+                            cols[i] = cols[i].string if cols[i].string is not None else "-"
+                        size = self.get_size(cols)
+
+                        links[target] = dict(link=full_link, size=size, ext=extension, type=file_type)
+                    else:
+                        links[target] = dict(link=full_link, type=file_type)
 
         else:
             for link in soup.find_all("a"):
diff --git a/problematic websites b/problematic websites
index 8ff3245..0e7a9cc 100644
--- a/problematic websites
+++ b/problematic websites
@@ -3,7 +3,6 @@
 Breaks:
 http://cnqzu.com/library/Anarchy%20Folder/Computers/Hacking,%20Security/Practical%20Unix%20&%20Internet%20Security/ (points to html files with links)
 https://zeus.feralhosting.com/matt07211/Anime-OST/ (root is nginx, /Flac is Apache)
-https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/ (recursion problem)
 http://archive.scene.org/pub/resources/docs/bbs_finland/ (recursion problem)
 https://filepursuit.com/ (recursion problem - not an OD)
 https://drive.google.com/drive/folders/0BzylFUcRnoTZflc3Z3Z0eDFuV0Q4M2ROUlg5UWNWLWs3WDBtNFZZUkdqUmxqQm9nd0VPclk (Should be excluded)
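
Note (illustration, not part of the patch): below is a minimal, self-contained sketch of the row-parsing behaviour this change introduces, runnable against a made-up Apache-style listing. The SAMPLE_LISTING fragment, the inline file_type logic, and the omission of get_size/size handling are assumptions for demonstration only; the should_save_link guards and the all-links-per-row loop mirror the patch itself.

# Illustrative sketch only: SAMPLE_LISTING is a made-up Apache-style index
# fragment, and the directory/file decision below is a simplified stand-in
# for the real PageParser.file_type helper.
import os
from urllib.parse import urljoin

from bs4 import BeautifulSoup

SAMPLE_LISTING = """
<table>
  <tr><th>Name</th><th>Last modified</th><th>Size</th></tr>
  <tr><td><a href="../">Parent Directory</a></td><td></td><td>-</td></tr>
  <tr><td><a href="RFSim99.exe">RFSim99.exe</a>
          <a href="RFSim99.exe.md5">md5</a></td>
      <td>2018-02-06 16:03</td><td>452K</td></tr>
</table>
"""


def should_save_link(text):
    # None/empty guard first, so .lower() is never called on None
    return text is not None and text != "" and \
           text.lower().find("parent directory") == -1 and \
           text not in ("Name", "Last modified", "Size", "Description ", "Description", "../")


def parse_listing(html, base_url):
    links = {}
    soup = BeautifulSoup(html, "html.parser")
    for row in soup.find_all("tr"):
        if len(row.find_all("th")) > 0:
            continue  # header row
        # The core of the fix: visit every <a> in the row, not just the first
        for link in row.find_all("a"):
            if not should_save_link(link.text):
                continue
            target = link.get("href")
            full_link = urljoin(base_url, target)
            file_type = "d" if target.endswith("/") else "f"  # simplified stand-in
            if file_type == "f":
                extension = os.path.splitext(full_link)[1].strip(".")
                links[target] = dict(link=full_link, ext=extension, type=file_type)
            else:
                links[target] = dict(link=full_link, type=file_type)
    return links


if __name__ == "__main__":
    base = "https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/"
    for target, info in parse_listing(SAMPLE_LISTING, base).items():
        print(target, info)

With the previous row.find("a") logic, only the first link per row (RFSim99.exe) would be recorded; iterating row.find_all("a") also captures RFSim99.exe.md5, the kind of multi-link row this patch was written to handle.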