From 88cebae04727f4fe2b3205e64b90974c2585f03a Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 6 Feb 2018 17:10:11 -0500 Subject: [PATCH] Fixed incorrect size for some files --- crawler.py | 13 +++- parser.py | 26 +++++--- problematic websites | 16 ++--- spec/Parser_spec.py | 40 +++++++++++- spec/test.html | 35 +++++++++++ spec/test_apache6.html | 22 +++++++ spec/test_apache7.html | 135 +++++++++++++++++++++++++++++++++++++++++ 7 files changed, 267 insertions(+), 20 deletions(-) create mode 100644 spec/test.html create mode 100644 spec/test_apache6.html create mode 100644 spec/test_apache7.html diff --git a/crawler.py b/crawler.py index 19a37d7..0178282 100644 --- a/crawler.py +++ b/crawler.py @@ -92,8 +92,15 @@ class Crawler: if __name__ == "__main__": - c = Crawler("https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/", True) + c = Crawler("http://dl.apkhome.org/", True) c.crawl() - print(c.files) - c.store_report("000008") + + r = ReportBuilder(c.files, "http://dl.apkhome.org/") + print(r.get_total_size_formatted()) + + for f in c.files: + if f["size"] > 1000000: + print(f) + + c.store_report("000009") diff --git a/parser.py b/parser.py index 59d0c13..c81ac2f 100644 --- a/parser.py +++ b/parser.py @@ -1,6 +1,6 @@ import os import re -from urllib.parse import urljoin +from urllib.parse import urljoin, unquote import humanfriendly from bs4 import BeautifulSoup @@ -17,10 +17,16 @@ class PageParser: raise NotImplementedError() @staticmethod - def get_size_columns(cols): + def get_size_columns(cols, file_name): for i in range(len(cols)): + col_file_name = cols[i][0:cols[i].rfind("..>")] # Some file names could be truncated: 'long_file_..>' + file_name = unquote(file_name)[0:len(col_file_name)] + + if len(file_name) > 0 and file_name in col_file_name : + continue # Skip if cols[i] is file name to avoid file names like 100px*.jpg to be parsed as 100 PB + if i == len(cols) - 1: try: humanfriendly.parse_size(cols[i]) @@ -53,7 +59,7 @@ class PageParser: return text.lower().find("parent directory") == -1 and text != "Name" and text != "Last modified" and \ text != "Size" and text != "Description " and text != "Description" and text != "../" and text != "" and\ - text is not None + text is not None and text != ".." @staticmethod def file_type(link): @@ -72,11 +78,12 @@ class PageParser: return text - def get_size(self, cols): + def get_size(self, cols, file_name): # Figure out which column(s) is the size one - size_cols = self.get_size_columns(cols) + size_cols = self.get_size_columns(cols, file_name) if size_cols is not None: + col_start, col_end = size_cols self.size_unknown = False @@ -131,6 +138,7 @@ class NginxParser(PageParser): try: if PageParser.should_save_link(link.text): target = link.get("href") + short_file_name = os.path.split(target)[1] full_link = urljoin(base_url, target) file_type = PageParser.file_type(target) @@ -142,7 +150,7 @@ class NginxParser(PageParser): date_and_size = text[target_index:text.find("', '175289', 'kB', '2008/10/21', '09:00:02', '']) + result = self.parser.get_size_columns(['', '175289', 'kB', '2008/10/21', '09:00:02', ''], "") + result1 = self.parser.get_size_columns(['100pxfilename.jpg', '175289', 'kB', '2008/10/21', '09:00:02', ''], "100pxfilename.jpg") self.assertEqual(result, (1, 2)) + self.assertEqual(result1, (1, 2)) def test_link_count(self): @@ -172,4 +174,38 @@ class ApacheParserTest4(TestCase): result = self.parser.get_links(self.root_page, self.base_url) self.assertEqual(result["The.Big.Bang.Theory.S03E06.Football.fuer.Nerds.German.WS.DVDRip.XviD-DELiCiOUS.avi"]["size"], 175000000) - self.assertEqual(result["The.Big.Bang.Theory.S03E03.Sex.oder.Pralinen.German.WS.DVDRip.XviD-DELiCiOUS.avi"]["size"], 0) \ No newline at end of file + self.assertEqual(result["The.Big.Bang.Theory.S03E03.Sex.oder.Pralinen.German.WS.DVDRip.XviD-DELiCiOUS.avi"]["size"], 0) + + +class ApacheParserTest5(TestCase): + + def setUp(self): + self.parser = ApacheParser() + + root_page_file = open("test.html", "r") + self.root_page = root_page_file.read() + self.base_url = "http://archive.scene.org/pub/resources/docs/" + root_page_file.close() + + def test_link_size(self): + result = self.parser.get_links(self.root_page, self.base_url) + + self.assertEqual(result["17toilet.txt"]["size"], 12700) + self.assertEqual(result["288help.diz"]["size"], 9000) + + +class ApacheParserTest7(TestCase): + + def setUp(self): + self.parser = ApacheParser() + + root_page_file = open("test_apache7.html", "r") + self.root_page = root_page_file.read() + self.base_url = "http://www.serenitystreetnews.com/videos/feb 2013/" + root_page_file.close() + + def test_link_size(self): + result = self.parser.get_links(self.root_page, self.base_url) + + self.assertEqual(result["700%20Emerald%20Tablets%20Dark%20Brothers%20-%20YouTube.flv"]["size"], 145000000) + self.assertEqual(result["Economic%20Collapse%20Survival%20Map%20-%20Risk%20Analysis%20of%20best%20area%20in%20United%20States%20-%20YouTube.flv"]["size"], 28000000) \ No newline at end of file diff --git a/spec/test.html b/spec/test.html new file mode 100644 index 0000000..e13d5c9 --- /dev/null +++ b/spec/test.html @@ -0,0 +1,35 @@ + + + + + Index of /pub/resources/docs/ + + +

Index of /pub/resources/docs/

+
+ + + + + + + + + + + + + + + + + + + + + + +
NameLast ModifiedSizeType
../ -  Directory
bbs_denmark/2017-Oct-25 06:57:00-  Directory
bbs_finland/2017-Oct-25 06:56:02-  Directory
bbs_france/2016-Nov-29 13:35:22-  Directory
bbs_netherlands/2017-Oct-29 20:27:03-  Directory
bbs_scene/2017-Oct-01 17:44:08-  Directory
bbs_sweden/2017-Oct-29 20:28:57-  Directory
denthor/2001-Jul-03 08:19:10-  Directory
lsd_team/1999-Jul-06 04:02:05-  Directory
scenery/2013-Dec-25 19:07:35-  Directory
!!!!!!!!.txt2013-Jul-07 03:06:1020.6Ktext/plain
16xpktst.txt2015-Sep-02 06:00:1918.0Ktext/plain
17toilet.txt2015-Sep-02 06:21:3712.7Ktext/plain
1st_sysx.diz2004-Sep-21 02:44:070.3Kapplication/octet-stream
1st_sysx.zip2004-Sep-21 10:59:353.4Kapplication/zip
288help.diz2016-Nov-01 15:22:279.0Kapplication/octet-stream
288help.lha2016-Nov-01 15:22:2425.6Kapplication/x-lha
2d.txt2004-Oct-15 04:25:120.9Ktext/plain
+
+ + diff --git a/spec/test_apache6.html b/spec/test_apache6.html new file mode 100644 index 0000000..8374449 --- /dev/null +++ b/spec/test_apache6.html @@ -0,0 +1,22 @@ + + + +Index of /wp-content/uploads/2017/02 + + +

Index of /wp-content/uploads/2017/02

+ + + + + + + + + + + + + +
 NameLast modifiedSizeDescription

 Parent Directory   -  
 1280px-Vegetable_ris..>2018-01-23 05:44 64K 
 1280px-Vegetable_ris..>2018-01-23 05:44 43K 
 1280px-Vegetable_ris..>2018-01-23 05:44 192K 
 1280px-Vegetable_ris..>2018-01-23 05:44 46K 
 1280px-Vegetable_ris..>2018-01-23 05:44 201K 
 1280px-Vegetable_ris..>2018-01-23 05:44 54K 
 1280px-Vegetable_ris..>2017-08-29 18:04 62K 
 1280px-Vegetable_ris..>2017-08-29 18:04 159K 

+ diff --git a/spec/test_apache7.html b/spec/test_apache7.html new file mode 100644 index 0000000..4fedcfd --- /dev/null +++ b/spec/test_apache7.html @@ -0,0 +1,135 @@ + + + + Index of /videos/feb 2013 + + +

Index of /videos/feb 2013

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 NameLast modifiedSizeDescription

 Parent Directory   -  
 -The Mists of Avalon..>2013-02-07 12:40 13M 
 5 Secrets from a Bre..>2013-02-07 13:55 31M 
 8 day Process of Tra..>2013-02-07 18:30 102M 
 9. The Ring of Power..>2013-02-07 18:37 21M 
 10. The Ring of Powe..>2013-02-07 12:47 21M 
 11. The Ring of Powe..>2013-02-07 12:54 20M 
 689 Emerald 2 Thoth ..>2013-02-07 14:40 137M 
 694 Emerald 7 God of..>2013-02-07 15:19 123M 
 695 2012 Fear The ..>2013-02-07 16:04 142M 
 698 Emerald Tablet 6..>2013-02-07 17:14 227M 
 700 Emerald Tablets ..>2013-02-07 17:59 145M 
 2011 02 23 Hilton Mi..>2013-02-07 13:10 35M 
 2012 Lightworkers ar..>2013-02-07 13:45 107M 
 ABBA - Voulez-Vous (..>2013-02-07 19:59 174M 
 ABBA Dancing Queen ..>2013-02-07 19:01 39M 
 ABBA Greatest HITS-6..>2013-02-07 22:36 75M 
 ABBA Super Trouper -..>2013-02-07 23:19 138M 
 ABBA album - All son..>2013-02-07 20:06 20M 
 ABBA documentary. Th..>2013-02-07 22:12 361M 
 ANONYMOUS GETS OWNED..>2013-02-08 05:21 21M 
 ANONYMOUS REAL FEELI..>2013-02-08 05:24 3.6M 
 Alan Moore Art is M..>2013-02-07 23:23 12M 
 Alan Moore Language..>2013-02-07 23:33 34M 
 Alex Jones Real Wife..>2013-02-07 23:35 4.4M 
 Alexander the Great ..>2013-02-08 03:32 520M 
 Alexander the Great ..>2013-02-08 00:38 204M 
 Alien No truth about..>2013-02-08 03:34 4.8M 
 All I Ask of You - S..>2013-02-08 03:36 5.9M 
 All I Ask of You - Y..>2013-02-08 03:41 14M 
 Aly AJ. - Walking ..>2013-02-08 03:43 5.5M 
 Ameno-Era - YouTube.flv2013-02-08 03:47 12M 
 AmericanFreedomRadio..>2013-02-08 03:52 14M 
 A message from The O..>2013-02-07 18:38 4.8M 
 Ancient Knowledge of..>2013-02-08 04:25 101M 
 Angel of Music - You..>2013-02-08 04:29 11M 
 Angry People who use..>2013-02-08 05:03 104M 
 Anishnahbe Pakwejiga..>2013-02-08 05:04 3.5M 
 Ann Coulter Occupy ..>2013-02-08 05:12 22M 
 Anonymous An Open M..>2013-02-08 05:14 7.4M 
 Anonymous is a Joke...>2013-02-08 05:23 5.1M 
 Another Message with..>2013-02-08 05:48 73M 
 Anthony Robbins inte..>2013-02-08 05:52 12M 
 Anthony Robbins inte..>2013-02-08 05:56 11M 
 Apple Cider Vinegar ..>2013-02-08 06:20 74M 
 Best Exercise and Fo..>2013-02-08 07:02 6.9M 
 Best Of The Global R..>2013-02-08 09:27 436M 
 Be your own Heru Her..>2013-02-08 06:59 116M 
 Bobby Deep Venes -..>2013-02-08 09:34 21M 
 Bonnie Tyler - I Nee..>2013-02-08 09:39 15M 
 Buddha Bar Deepak ..>2013-02-08 09:48 27M 
 CĂ©line Dion - The Po..>2013-02-08 18:15 20M 
 Celtic Woman - A New..>2013-02-08 10:11 13M 
 Celtic Woman - Scarb..>2013-02-08 10:13 7.1M 
 Celtic Woman A New J..>2013-02-08 13:09 502M 
 Change Your Thoughts..>2013-02-08 15:35 417M 
 Charlotte Church - T..>2013-02-08 15:47 33M 
 Clash of the Gods M..>2013-02-08 17:16 241M 
 Clash of the Titans ..>2013-02-08 17:20 13M 
 Coffee Enema, Liver ..>2013-02-08 17:32 13M 
 Coffee Enema , Liver..>2013-02-08 17:27 18M 
 Crystals That Facili..>2013-02-08 17:57 66M 
 Cyndi Lauper - Girls..>2013-02-08 18:08 29M 
 Dale Carnegie How to..>2013-02-09 03:53 1.5G 
 Dark City Draco Be..>2013-02-09 04:14 64M 
 Deepak Chopra's secr..>2013-02-09 06:32 23M 
 Deepak Chopra - God ..>2013-02-09 05:14 1.5M 
 Deepak Chopra - Huma..>2013-02-09 05:23 30M 
 Deepak Chopra - Way ..>2013-02-09 06:25 210M 
 Deepak Chopra Phys..>2013-02-09 05:13 196M 
 Deva Premal ~ Love i..>2013-02-09 06:39 22M 
 ET 101 The Martians..>2013-02-09 07:35 21M 
 EarthQuakes, Tsunami..>2013-02-09 07:14 56M 
 Earth is run by Inse..>2013-02-09 06:57 63M 
 Economic Collapse Su..>2013-02-09 07:23 28M 
 Enoch The Name Ever..>2013-02-09 07:29 19M 
 FIRST NATIONS ( LAKO..>2013-02-09 09:08 27M 
 Fake 2013 RITUALS ..>2013-02-09 09:00 286M 
 Fox News says, OCCUP..>2013-02-09 09:16 29M 
 Freemasons Revealed...>2013-02-09 12:25 635M 
 Goddesses of Nature-..>2013-02-09 12:44 20M 
 Goddesses of Nature-..>2013-02-09 12:50 20M 
 God of War 2 (ALL Ti..>2013-02-09 12:32 22M 
 God of War II - Kill..>2013-02-09 12:38 22M 
 Grant Morrison THE d..>2013-02-09 13:29 131M 
 Healthcare - Deepak ..>2013-02-09 13:34 17M 
 Heart - Alone - YouT..>2013-02-09 13:41 22M 
 Heart - These Dreams..>2013-02-09 13:49 25M 
 Hermeticism 101 - Yo..>2013-02-09 14:12 80M 
 Hillsong- Here I Am ..>2013-02-09 14:24 37M 
 Holy Grail Gratitude..>2013-02-09 14:30 21M 
 Homosexuality - PT. ..>2013-02-09 14:44 47M 
 Homosexuality - PT. ..>2013-02-09 14:58 49M 
 How To Detoxify Your..>2013-02-09 15:20 12M 
 How To Talk To Your ..>2013-02-09 16:23 56M 
 How to Defeat the Il..>2013-02-09 15:06 27M 
 How to Detox Heavy M..>2013-02-09 15:16 33M 
 How to Meditate-Astr..>2013-02-09 16:06 90M 
 How to make Babi's U..>2013-02-09 15:39 66M 
 I Want to know what ..>2013-02-09 16:27 14M 
 I Want to know what ..>2013-02-09 16:32 14M 
 In The Beginning Vam..>2013-02-09 16:44 40M 
 Is Eating Garlic Har..>2013-02-09 17:02 60M 
 Isis The Divine Moth..>2013-02-09 17:13 38M 
 JAI KALI MAA! THE P..>2013-02-09 17:20 22M 
 Jedi Kitten with the..>2013-02-09 17:22 7.8M 
 Jedi Kitten with the..>2013-02-09 17:25 7.8M 
 Josh Reeves - Hidden..>2013-02-09 18:08 127M 
 Kali, The Dangerous ..>2013-02-09 18:18 11M 
 Kali Aarti (Jai Kali..>2013-02-09 18:13 16M 
 Kali Ma - The Black ..>2013-02-09 18:15 8.3M 
 Kathleen Wynne - Gay..>2013-02-09 18:29 37M 
 Katie Melua - Closes..>2013-02-09 18:40 36M 
 Krishna History or M..>2013-02-09 19:12 104M 
 Light-workers workin..>2013-02-09 19:26 45M 
 Little Lotte - YouTu..>2013-02-09 19:29 7.4M 
 Live in Love - YouTu..>2013-02-09 19:34 15M 
 Liver Detox with Cof..>2013-02-09 19:44 34M 
 Long Term Food Stora..>2013-02-09 20:09 81M 
 Love Never Hurt Anyo..>2013-02-09 20:12 8.0M 
 cabbage rolls - YouT..>2013-02-08 10:06 54M 
 josh groban and sara..>2013-02-09 17:30 18M 

+