From f32f9ee3a6a17328c4a329fb3d742f296408d559 Mon Sep 17 00:00:00 2001
From: simon
Date: Tue, 6 Feb 2018 16:03:18 -0500
Subject: [PATCH] Fixed handling of a problematic type of OD

---
 crawler.py           |  3 ++-
 parser.py            | 42 +++++++++++++++++++++++-------------------
 problematic websites |  1 -
 3 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/crawler.py b/crawler.py
index 5e7ab7f..19a37d7 100644
--- a/crawler.py
+++ b/crawler.py
@@ -92,7 +92,8 @@ class Crawler:
 
 
 if __name__ == "__main__":
-    c = Crawler("http://cnqzu.com/library/Anarchy%20Folder/Computers/", True)
+    c = Crawler("https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/", True)
     c.crawl()
+    print(c.files)
     c.store_report("000008")
 
diff --git a/parser.py b/parser.py
index b6cb2aa..59d0c13 100644
--- a/parser.py
+++ b/parser.py
@@ -50,8 +50,10 @@ class PageParser:
 
     @staticmethod
     def should_save_link(text):
-        return text.lower().find("parent directory") == -1 and text != "Name" and text != "Last modified" and \
-               text != "Size" and text != "Description " and text != "Description" and text != "../"
+        # Check for None and empty text first, so .lower() is never called on None
+        return text is not None and text != "" and \
+               text.lower().find("parent directory") == -1 and text != "Name" and text != "Last modified" and \
+               text != "Size" and text != "Description " and text != "Description" and text != "../"
 
     @staticmethod
     def file_type(link):
@@ -170,28 +172,30 @@ class ApacheParser(PageParser):
 
             if len(row.find_all("th")) > 0:
                 continue
-            link = row.find("a")
+            links_in_row = row.find_all("a")
 
-            if link is None:
-                # Exited directory listing
-                return links
-            if PageParser.should_save_link(link.text):
+            for link in links_in_row:
+                if link is None:
+                    # Exited directory listing
+                    return links
 
-                target = link.get("href")
-                file_type = PageParser.file_type(target)
-                full_link = urljoin(base_url, target)
+                if PageParser.should_save_link(link.text):
 
-                if file_type == "f":
-                    extension = os.path.splitext(full_link)[1].strip(".")
+                    target = link.get("href")
+                    file_type = PageParser.file_type(target)
+                    full_link = urljoin(base_url, target)
 
-                    cols = row.find_all("td")
-                    for i in range(len(cols)):
-                        cols[i] = cols[i].string if cols[i].string is not None else "-"
-                    size = self.get_size(cols)
+                    if file_type == "f":
+                        extension = os.path.splitext(full_link)[1].strip(".")
 
-                    links[target] = dict(link=full_link, size=size, ext=extension, type=file_type)
-                else:
-                    links[target] = dict(link=full_link, type=file_type)
+                        cols = row.find_all("td")
+                        for i in range(len(cols)):
+                            cols[i] = cols[i].string if cols[i].string is not None else "-"
+                        size = self.get_size(cols)
+
+                        links[target] = dict(link=full_link, size=size, ext=extension, type=file_type)
+                    else:
+                        links[target] = dict(link=full_link, type=file_type)
 
         else:
             for link in soup.find_all("a"):
diff --git a/problematic websites b/problematic websites
index 8ff3245..0e7a9cc 100644
--- a/problematic websites
+++ b/problematic websites
@@ -3,7 +3,6 @@
 Breaks:
 http://cnqzu.com/library/Anarchy%20Folder/Computers/Hacking,%20Security/Practical%20Unix%20&%20Internet%20Security/ (points to html files with links)
 https://zeus.feralhosting.com/matt07211/Anime-OST/ (root is nginx, /Flac is Apache)
-https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/ (recursion problem)
 http://archive.scene.org/pub/resources/docs/bbs_finland/ (recursion problem)
 https://filepursuit.com/ (recursion problem - not an OD)
 https://drive.google.com/drive/folders/0BzylFUcRnoTZflc3Z3Z0eDFuV0Q4M2ROUlg5UWNWLWs3WDBtNFZZUkdqUmxqQm9nd0VPclk (Should be excluded)
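
Note (illustration, not part of the patch): below is a minimal, self-contained sketch of the row-parsing behaviour this change introduces, runnable against a made-up Apache-style listing. The SAMPLE_LISTING fragment, the inline file_type logic, and the omission of get_size/size handling are assumptions for demonstration only; the should_save_link guards and the all-links-per-row loop mirror the patch itself.

# Illustrative sketch only: SAMPLE_LISTING is a made-up Apache-style index
# fragment, and the directory/file decision below is a simplified stand-in
# for the real PageParser.file_type helper.
import os
from urllib.parse import urljoin

from bs4 import BeautifulSoup

SAMPLE_LISTING = """
<table>
  <tr><th>Name</th><th>Last modified</th><th>Size</th></tr>
  <tr><td><a href="../">Parent Directory</a></td><td></td><td>-</td></tr>
  <tr><td><a href="RFSim99.exe">RFSim99.exe</a>
          <a href="RFSim99.exe.md5">md5</a></td>
      <td>2018-02-06 16:03</td><td>452K</td></tr>
</table>
"""


def should_save_link(text):
    # None/empty guard first, so .lower() is never called on None
    return text is not None and text != "" and \
           text.lower().find("parent directory") == -1 and \
           text not in ("Name", "Last modified", "Size", "Description ", "Description", "../")


def parse_listing(html, base_url):
    links = {}
    soup = BeautifulSoup(html, "html.parser")
    for row in soup.find_all("tr"):
        if len(row.find_all("th")) > 0:
            continue  # header row
        # The core of the fix: visit every <a> in the row, not just the first
        for link in row.find_all("a"):
            if not should_save_link(link.text):
                continue
            target = link.get("href")
            full_link = urljoin(base_url, target)
            file_type = "d" if target.endswith("/") else "f"  # simplified stand-in
            if file_type == "f":
                extension = os.path.splitext(full_link)[1].strip(".")
                links[target] = dict(link=full_link, ext=extension, type=file_type)
            else:
                links[target] = dict(link=full_link, type=file_type)
    return links


if __name__ == "__main__":
    base = "https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/"
    for target, info in parse_listing(SAMPLE_LISTING, base).items():
        print(target, info)

With the previous row.find("a") logic, only the first link per row (RFSim99.exe) would be recorded; iterating row.find_all("a") also captures RFSim99.exe.md5, the kind of multi-link row this patch was written to handle.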