diff --git a/crawler.py b/crawler.py index 19a37d7..0178282 100644 --- a/crawler.py +++ b/crawler.py @@ -92,8 +92,15 @@ class Crawler: if __name__ == "__main__": - c = Crawler("https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/", True) + c = Crawler("http://dl.apkhome.org/", True) c.crawl() - print(c.files) - c.store_report("000008") + + r = ReportBuilder(c.files, "http://dl.apkhome.org/") + print(r.get_total_size_formatted()) + + for f in c.files: + if f["size"] > 1000000: + print(f) + + c.store_report("000009") diff --git a/parser.py b/parser.py index 59d0c13..c81ac2f 100644 --- a/parser.py +++ b/parser.py @@ -1,6 +1,6 @@ import os import re -from urllib.parse import urljoin +from urllib.parse import urljoin, unquote import humanfriendly from bs4 import BeautifulSoup @@ -17,10 +17,16 @@ class PageParser: raise NotImplementedError() @staticmethod - def get_size_columns(cols): + def get_size_columns(cols, file_name): for i in range(len(cols)): + col_file_name = cols[i][0:cols[i].rfind("..>")] # Some file names could be truncated: 'long_file_..>' + file_name = unquote(file_name)[0:len(col_file_name)] + + if len(file_name) > 0 and file_name in col_file_name : + continue # Skip if cols[i] is file name to avoid file names like 100px*.jpg to be parsed as 100 PB + if i == len(cols) - 1: try: humanfriendly.parse_size(cols[i]) @@ -53,7 +59,7 @@ class PageParser: return text.lower().find("parent directory") == -1 and text != "Name" and text != "Last modified" and \ text != "Size" and text != "Description " and text != "Description" and text != "../" and text != "" and\ - text is not None + text is not None and text != ".." @staticmethod def file_type(link): @@ -72,11 +78,12 @@ class PageParser: return text - def get_size(self, cols): + def get_size(self, cols, file_name): # Figure out which column(s) is the size one - size_cols = self.get_size_columns(cols) + size_cols = self.get_size_columns(cols, file_name) if size_cols is not None: + col_start, col_end = size_cols self.size_unknown = False @@ -131,6 +138,7 @@ class NginxParser(PageParser): try: if PageParser.should_save_link(link.text): target = link.get("href") + short_file_name = os.path.split(target)[1] full_link = urljoin(base_url, target) file_type = PageParser.file_type(target) @@ -142,7 +150,7 @@ class NginxParser(PageParser): date_and_size = text[target_index:text.find("', '175289', 'kB', '2008/10/21', '09:00:02', '']) + result = self.parser.get_size_columns(['', '175289', 'kB', '2008/10/21', '09:00:02', ''], "") + result1 = self.parser.get_size_columns(['100pxfilename.jpg', '175289', 'kB', '2008/10/21', '09:00:02', ''], "100pxfilename.jpg") self.assertEqual(result, (1, 2)) + self.assertEqual(result1, (1, 2)) def test_link_count(self): @@ -172,4 +174,38 @@ class ApacheParserTest4(TestCase): result = self.parser.get_links(self.root_page, self.base_url) self.assertEqual(result["The.Big.Bang.Theory.S03E06.Football.fuer.Nerds.German.WS.DVDRip.XviD-DELiCiOUS.avi"]["size"], 175000000) - self.assertEqual(result["The.Big.Bang.Theory.S03E03.Sex.oder.Pralinen.German.WS.DVDRip.XviD-DELiCiOUS.avi"]["size"], 0) \ No newline at end of file + self.assertEqual(result["The.Big.Bang.Theory.S03E03.Sex.oder.Pralinen.German.WS.DVDRip.XviD-DELiCiOUS.avi"]["size"], 0) + + +class ApacheParserTest5(TestCase): + + def setUp(self): + self.parser = ApacheParser() + + root_page_file = open("test.html", "r") + self.root_page = root_page_file.read() + self.base_url = "http://archive.scene.org/pub/resources/docs/" + root_page_file.close() + + def test_link_size(self): + result = self.parser.get_links(self.root_page, self.base_url) + + self.assertEqual(result["17toilet.txt"]["size"], 12700) + self.assertEqual(result["288help.diz"]["size"], 9000) + + +class ApacheParserTest7(TestCase): + + def setUp(self): + self.parser = ApacheParser() + + root_page_file = open("test_apache7.html", "r") + self.root_page = root_page_file.read() + self.base_url = "http://www.serenitystreetnews.com/videos/feb 2013/" + root_page_file.close() + + def test_link_size(self): + result = self.parser.get_links(self.root_page, self.base_url) + + self.assertEqual(result["700%20Emerald%20Tablets%20Dark%20Brothers%20-%20YouTube.flv"]["size"], 145000000) + self.assertEqual(result["Economic%20Collapse%20Survival%20Map%20-%20Risk%20Analysis%20of%20best%20area%20in%20United%20States%20-%20YouTube.flv"]["size"], 28000000) \ No newline at end of file diff --git a/spec/test.html b/spec/test.html new file mode 100644 index 0000000..e13d5c9 --- /dev/null +++ b/spec/test.html @@ -0,0 +1,35 @@ + + +
+ +Name | Last Modified | Size | Type |
---|---|---|---|
../ | - | Directory | |
bbs_denmark/ | 2017-Oct-25 06:57:00 | - | Directory |
bbs_finland/ | 2017-Oct-25 06:56:02 | - | Directory |
bbs_france/ | 2016-Nov-29 13:35:22 | - | Directory |
bbs_netherlands/ | 2017-Oct-29 20:27:03 | - | Directory |
bbs_scene/ | 2017-Oct-01 17:44:08 | - | Directory |
bbs_sweden/ | 2017-Oct-29 20:28:57 | - | Directory |
denthor/ | 2001-Jul-03 08:19:10 | - | Directory |
lsd_team/ | 1999-Jul-06 04:02:05 | - | Directory |
scenery/ | 2013-Dec-25 19:07:35 | - | Directory |
!!!!!!!!.txt | 2013-Jul-07 03:06:10 | 20.6K | text/plain |
16xpktst.txt | 2015-Sep-02 06:00:19 | 18.0K | text/plain |
17toilet.txt | 2015-Sep-02 06:21:37 | 12.7K | text/plain |
1st_sysx.diz | 2004-Sep-21 02:44:07 | 0.3K | application/octet-stream |
1st_sysx.zip | 2004-Sep-21 10:59:35 | 3.4K | application/zip |
288help.diz | 2016-Nov-01 15:22:27 | 9.0K | application/octet-stream |
288help.lha | 2016-Nov-01 15:22:24 | 25.6K | application/x-lha |
2d.txt | 2004-Oct-15 04:25:12 | 0.9K | text/plain |
Name | Last modified | Size | Description | |
---|---|---|---|---|
Parent Directory | - | |||
1280px-Vegetable_ris..> | 2018-01-23 05:44 | 64K | ||
1280px-Vegetable_ris..> | 2018-01-23 05:44 | 43K | ||
1280px-Vegetable_ris..> | 2018-01-23 05:44 | 192K | ||
1280px-Vegetable_ris..> | 2018-01-23 05:44 | 46K | ||
1280px-Vegetable_ris..> | 2018-01-23 05:44 | 201K | ||
1280px-Vegetable_ris..> | 2018-01-23 05:44 | 54K | ||
1280px-Vegetable_ris..> | 2017-08-29 18:04 | 62K | ||
1280px-Vegetable_ris..> | 2017-08-29 18:04 | 159K | ||
Name | Last modified | Size | Description | |
---|---|---|---|---|
Parent Directory | - | |||
-The Mists of Avalon..> | 2013-02-07 12:40 | 13M | ||
5 Secrets from a Bre..> | 2013-02-07 13:55 | 31M | ||
8 day Process of Tra..> | 2013-02-07 18:30 | 102M | ||
9. The Ring of Power..> | 2013-02-07 18:37 | 21M | ||
10. The Ring of Powe..> | 2013-02-07 12:47 | 21M | ||
11. The Ring of Powe..> | 2013-02-07 12:54 | 20M | ||
689 Emerald 2 Thoth ..> | 2013-02-07 14:40 | 137M | ||
694 Emerald 7 God of..> | 2013-02-07 15:19 | 123M | ||
695 2012 Fear The ..> | 2013-02-07 16:04 | 142M | ||
698 Emerald Tablet 6..> | 2013-02-07 17:14 | 227M | ||
700 Emerald Tablets ..> | 2013-02-07 17:59 | 145M | ||
2011 02 23 Hilton Mi..> | 2013-02-07 13:10 | 35M | ||
2012 Lightworkers ar..> | 2013-02-07 13:45 | 107M | ||
ABBA - Voulez-Vous (..> | 2013-02-07 19:59 | 174M | ||
ABBA Dancing Queen ..> | 2013-02-07 19:01 | 39M | ||
ABBA Greatest HITS-6..> | 2013-02-07 22:36 | 75M | ||
ABBA Super Trouper -..> | 2013-02-07 23:19 | 138M | ||
ABBA album - All son..> | 2013-02-07 20:06 | 20M | ||
ABBA documentary. Th..> | 2013-02-07 22:12 | 361M | ||
ANONYMOUS GETS OWNED..> | 2013-02-08 05:21 | 21M | ||
ANONYMOUS REAL FEELI..> | 2013-02-08 05:24 | 3.6M | ||
Alan Moore Art is M..> | 2013-02-07 23:23 | 12M | ||
Alan Moore Language..> | 2013-02-07 23:33 | 34M | ||
Alex Jones Real Wife..> | 2013-02-07 23:35 | 4.4M | ||
Alexander the Great ..> | 2013-02-08 03:32 | 520M | ||
Alexander the Great ..> | 2013-02-08 00:38 | 204M | ||
Alien No truth about..> | 2013-02-08 03:34 | 4.8M | ||
All I Ask of You - S..> | 2013-02-08 03:36 | 5.9M | ||
All I Ask of You - Y..> | 2013-02-08 03:41 | 14M | ||
Aly AJ. - Walking ..> | 2013-02-08 03:43 | 5.5M | ||
Ameno-Era - YouTube.flv | 2013-02-08 03:47 | 12M | ||
AmericanFreedomRadio..> | 2013-02-08 03:52 | 14M | ||
A message from The O..> | 2013-02-07 18:38 | 4.8M | ||
Ancient Knowledge of..> | 2013-02-08 04:25 | 101M | ||
Angel of Music - You..> | 2013-02-08 04:29 | 11M | ||
Angry People who use..> | 2013-02-08 05:03 | 104M | ||
Anishnahbe Pakwejiga..> | 2013-02-08 05:04 | 3.5M | ||
Ann Coulter Occupy ..> | 2013-02-08 05:12 | 22M | ||
Anonymous An Open M..> | 2013-02-08 05:14 | 7.4M | ||
Anonymous is a Joke...> | 2013-02-08 05:23 | 5.1M | ||
Another Message with..> | 2013-02-08 05:48 | 73M | ||
Anthony Robbins inte..> | 2013-02-08 05:52 | 12M | ||
Anthony Robbins inte..> | 2013-02-08 05:56 | 11M | ||
Apple Cider Vinegar ..> | 2013-02-08 06:20 | 74M | ||
Best Exercise and Fo..> | 2013-02-08 07:02 | 6.9M | ||
Best Of The Global R..> | 2013-02-08 09:27 | 436M | ||
Be your own Heru Her..> | 2013-02-08 06:59 | 116M | ||
Bobby Deep Venes -..> | 2013-02-08 09:34 | 21M | ||
Bonnie Tyler - I Nee..> | 2013-02-08 09:39 | 15M | ||
Buddha Bar Deepak ..> | 2013-02-08 09:48 | 27M | ||
Céline Dion - The Po..> | 2013-02-08 18:15 | 20M | ||
Celtic Woman - A New..> | 2013-02-08 10:11 | 13M | ||
Celtic Woman - Scarb..> | 2013-02-08 10:13 | 7.1M | ||
Celtic Woman A New J..> | 2013-02-08 13:09 | 502M | ||
Change Your Thoughts..> | 2013-02-08 15:35 | 417M | ||
Charlotte Church - T..> | 2013-02-08 15:47 | 33M | ||
Clash of the Gods M..> | 2013-02-08 17:16 | 241M | ||
Clash of the Titans ..> | 2013-02-08 17:20 | 13M | ||
Coffee Enema, Liver ..> | 2013-02-08 17:32 | 13M | ||
Coffee Enema , Liver..> | 2013-02-08 17:27 | 18M | ||
Crystals That Facili..> | 2013-02-08 17:57 | 66M | ||
Cyndi Lauper - Girls..> | 2013-02-08 18:08 | 29M | ||
Dale Carnegie How to..> | 2013-02-09 03:53 | 1.5G | ||
Dark City Draco Be..> | 2013-02-09 04:14 | 64M | ||
Deepak Chopra's secr..> | 2013-02-09 06:32 | 23M | ||
Deepak Chopra - God ..> | 2013-02-09 05:14 | 1.5M | ||
Deepak Chopra - Huma..> | 2013-02-09 05:23 | 30M | ||
Deepak Chopra - Way ..> | 2013-02-09 06:25 | 210M | ||
Deepak Chopra Phys..> | 2013-02-09 05:13 | 196M | ||
Deva Premal ~ Love i..> | 2013-02-09 06:39 | 22M | ||
ET 101 The Martians..> | 2013-02-09 07:35 | 21M | ||
EarthQuakes, Tsunami..> | 2013-02-09 07:14 | 56M | ||
Earth is run by Inse..> | 2013-02-09 06:57 | 63M | ||
Economic Collapse Su..> | 2013-02-09 07:23 | 28M | ||
Enoch The Name Ever..> | 2013-02-09 07:29 | 19M | ||
FIRST NATIONS ( LAKO..> | 2013-02-09 09:08 | 27M | ||
Fake 2013 RITUALS ..> | 2013-02-09 09:00 | 286M | ||
Fox News says, OCCUP..> | 2013-02-09 09:16 | 29M | ||
Freemasons Revealed...> | 2013-02-09 12:25 | 635M | ||
Goddesses of Nature-..> | 2013-02-09 12:44 | 20M | ||
Goddesses of Nature-..> | 2013-02-09 12:50 | 20M | ||
God of War 2 (ALL Ti..> | 2013-02-09 12:32 | 22M | ||
God of War II - Kill..> | 2013-02-09 12:38 | 22M | ||
Grant Morrison THE d..> | 2013-02-09 13:29 | 131M | ||
Healthcare - Deepak ..> | 2013-02-09 13:34 | 17M | ||
Heart - Alone - YouT..> | 2013-02-09 13:41 | 22M | ||
Heart - These Dreams..> | 2013-02-09 13:49 | 25M | ||
Hermeticism 101 - Yo..> | 2013-02-09 14:12 | 80M | ||
Hillsong- Here I Am ..> | 2013-02-09 14:24 | 37M | ||
Holy Grail Gratitude..> | 2013-02-09 14:30 | 21M | ||
Homosexuality - PT. ..> | 2013-02-09 14:44 | 47M | ||
Homosexuality - PT. ..> | 2013-02-09 14:58 | 49M | ||
How To Detoxify Your..> | 2013-02-09 15:20 | 12M | ||
How To Talk To Your ..> | 2013-02-09 16:23 | 56M | ||
How to Defeat the Il..> | 2013-02-09 15:06 | 27M | ||
How to Detox Heavy M..> | 2013-02-09 15:16 | 33M | ||
How to Meditate-Astr..> | 2013-02-09 16:06 | 90M | ||
How to make Babi's U..> | 2013-02-09 15:39 | 66M | ||
I Want to know what ..> | 2013-02-09 16:27 | 14M | ||
I Want to know what ..> | 2013-02-09 16:32 | 14M | ||
In The Beginning Vam..> | 2013-02-09 16:44 | 40M | ||
Is Eating Garlic Har..> | 2013-02-09 17:02 | 60M | ||
Isis The Divine Moth..> | 2013-02-09 17:13 | 38M | ||
JAI KALI MAA! THE P..> | 2013-02-09 17:20 | 22M | ||
Jedi Kitten with the..> | 2013-02-09 17:22 | 7.8M | ||
Jedi Kitten with the..> | 2013-02-09 17:25 | 7.8M | ||
Josh Reeves - Hidden..> | 2013-02-09 18:08 | 127M | ||
Kali, The Dangerous ..> | 2013-02-09 18:18 | 11M | ||
Kali Aarti (Jai Kali..> | 2013-02-09 18:13 | 16M | ||
Kali Ma - The Black ..> | 2013-02-09 18:15 | 8.3M | ||
Kathleen Wynne - Gay..> | 2013-02-09 18:29 | 37M | ||
Katie Melua - Closes..> | 2013-02-09 18:40 | 36M | ||
Krishna History or M..> | 2013-02-09 19:12 | 104M | ||
Light-workers workin..> | 2013-02-09 19:26 | 45M | ||
Little Lotte - YouTu..> | 2013-02-09 19:29 | 7.4M | ||
Live in Love - YouTu..> | 2013-02-09 19:34 | 15M | ||
Liver Detox with Cof..> | 2013-02-09 19:44 | 34M | ||
Long Term Food Stora..> | 2013-02-09 20:09 | 81M | ||
Love Never Hurt Anyo..> | 2013-02-09 20:12 | 8.0M | ||
cabbage rolls - YouT..> | 2013-02-08 10:06 | 54M | ||
josh groban and sara..> | 2013-02-09 17:30 | 18M | ||