diff --git a/crawler.py b/crawler.py
index 0c898bc..e577422 100644
--- a/crawler.py
+++ b/crawler.py
@@ -2,19 +2,41 @@ import requests
from parser import NginxParser, ApacheParser
from reports import ReportSaver, ReportBuilder
-headers = {
- 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
-}
-
class Crawler:
- def __init__(self, url):
- self.parser = NginxParser()
+ def __init__(self, url, test_url):
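+ """If test_url is truthy, fetch base_url once to auto-detect the parser; otherwise self.parser is left as None."""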
self.files = []
self.base_url = url
+ if test_url:
+ # Fetch the page once to detect which directory-listing format it uses
+ r = requests.get(self.base_url, timeout=30)
+
+ parser_class = self.guess_parser(r.text, r.headers)
+ if parser_class is None:
+ raise ValueError("Could not detect listing format for " + self.base_url)
+ self.parser = parser_class()
+
+ print("Using " + self.parser.__class__.__name__ + " as parser")
+
+ else:
+ self.parser = None
+
+ @staticmethod
+ def guess_parser(text, headers):
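+ """Try each known parser against the page text and return the matching parser class, or None."""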
+
+ server = headers.get("Server", "")  # not used yet; detection currently relies on test-parsing the page
+
+ # try nginx
+ parser = NginxParser()
+ if parser.page_is_valid(text):
+ return NginxParser
+
+ # Try apache
+ parser = ApacheParser()
+ if parser.page_is_valid(text):
+ return ApacheParser
+
+ return None
+
def crawl(self, address=None):
if address is None:
@@ -53,6 +75,7 @@ class Crawler:
f.write(report_saver.to_link_list())
-c = Crawler("http://dl.upload8.in/files/Serial/Altered%20Carbon/")
-c.crawl()
-c.store_report("000002")
+if __name__ == "__main__":
+ c = Crawler("https://repo.zenk-security.com/", True)
+ c.crawl()
+ c.store_report("000007")
diff --git a/parser.py b/parser.py
index c30a6b6..e71a30a 100644
--- a/parser.py
+++ b/parser.py
@@ -1,14 +1,43 @@
-from bs4 import BeautifulSoup
-from urllib.parse import urljoin
import os
import re
+from urllib.parse import urljoin
+
import humanfriendly
+from bs4 import BeautifulSoup
class PageParser:
+
+ def __init__(self):
+ self.col_start = None
+ self.col_end = None
+ self.size_unknown = True
+
def get_links(self, text: str, base_url: str):
raise NotImplementedError()
+ @staticmethod
+ def get_size_columns(cols):
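+ """Return the (start, end) column indices holding the human-readable size, or None if no size column is found."""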
+
+ for i in range(len(cols)):
+
+ if i == len(cols) - 1:
+ try:
+ humanfriendly.parse_size(cols[i])
+ return tuple([i, i])
+ except humanfriendly.InvalidSize:
+ return None
+
+ try:
+ humanfriendly.parse_size(cols[i] + cols[i + 1])
+ return tuple([i, i + 1])
+ except humanfriendly.InvalidSize:
+ try:
+ humanfriendly.parse_size(cols[i])
+ return tuple([i, i])
+ except humanfriendly.InvalidSize:
+ continue
+
@staticmethod
def get_parser_type(headers):
"""Get appropriate parser type for a a server based on its header"""
@@ -26,26 +55,82 @@ class PageParser:
@staticmethod
def file_type(link):
- return "d" if link.endswith("/") else "f"
+
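+ # Links ending in "/" or starting with "?" (e.g. Apache's ?C=N;O=D sort links) are treated as directories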
+ if link.endswith("/") or link.startswith("?"):
+ return "d"
+ return "f"
+
+
+ @staticmethod
+ def clean_page(text):
+ text = text.replace("", "")
+
+ return text
+
+ def get_size(self, cols):
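+ """Parse a row's human-readable size into bytes, returning 0 when it cannot be determined."""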
+
+ # Figure out which column(s) is the size one
+ size_cols = self.get_size_columns(cols)
+ if size_cols is not None:
+ col_start, col_end = size_cols
+ self.size_unknown = False
+
+ size_human = cols[col_start] if col_start == col_end else cols[col_start] + cols[col_end]
+
+ try:
+ size = humanfriendly.parse_size(size_human)
+ except humanfriendly.InvalidSize:
+ size = 0
+ else:
+ size = 0
+
+ return size
class NginxParser(PageParser):
def get_links(self, text, base_url: str):
links = dict()
- soup = BeautifulSoup(text, "html.parser")
- # Handle weird character formats and tag names
- text = text.replace("
diff --git a/reddit_bot.py b/reddit_bot.py
--- a/reddit_bot.py
+++ b/reddit_bot.py
+ def pop(self):
+ if len(self.tasks) > 0:
+ t = self.tasks.pop()
+ self.update_file()
+ else:
+ t = None
+
+ return t
+
+ def update_file(self):
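+ """Write the current task list to the queue file as JSON."""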
+ with open(self.file, "w") as f:
+ json.dump(self.tasks, f, default=dumper)
+
+ def is_queued(self, post_id):
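+ """Return True if a task with the given post_id is already in the queue."""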
+
+ for task in self.tasks:
+ if task.post_id == post_id:
+ return True
+
+ return False
+
+
+def dumper(obj):
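+ """json.dump fallback: serialize objects via their __dict__."""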
+ return obj.__dict__
class RedditBot:
@@ -11,8 +65,7 @@ class RedditBot:
self.crawled = []
else:
with open(log_file, "r") as f:
- self.crawled = f.read().split("\n")
- self.crawled = list(filter(None, self.crawled))
+ self.crawled = list(filter(None, f.read().split("\n")))
def log_crawl(self, post_id):
diff --git a/reports.py b/reports.py
index 58607a2..b2cfa8e 100644
--- a/reports.py
+++ b/reports.py
@@ -91,6 +91,7 @@ class ReportSaver:
out["ext_sizes"] = self.builder.get_ext_sizes()
out["ext_sizes_formatted"] = self.builder.get_ext_sizes_formatted()
out["report_time"] = str(self.builder.report_time)
+ out["total_count"] = len(self.builder.files)
return json.dumps(out)
@@ -103,6 +104,7 @@ class ReportSaver:
out["ext_count"] = self.builder.get_ext_counts()
out["ext_sizes"] = self.builder.get_ext_sizes()
out["report_time"] = str(self.builder.report_time)
+ out["total_count"] = len(self.builder.files)
return json.dumps(out)
diff --git a/spec/Crawler_spec.py b/spec/Crawler_spec.py
index e69de29..22713f9 100644
--- a/spec/Crawler_spec.py
+++ b/spec/Crawler_spec.py
@@ -0,0 +1,32 @@
+from unittest import TestCase
+
+from parser import ApacheParser, NginxParser
+from crawler import Crawler
+
+
+class CrawlerTest(TestCase):
+
+ def test_guess_parser1(self):
+
+ with open("test_apache1.html", "r") as f:
+ text = f.read()
+
+ c = Crawler("http://some.website/", False)
+
+ self.assertEqual(c.guess_parser(text, {}), ApacheParser)
+
+ def test_guess_parser2(self):
+ with open("test_nginx1.html", "r") as f:
+ text = f.read()
+
+ c = Crawler("http://some.website", False)
+
+ self.assertEqual(c.guess_parser(text, {}), NginxParser)
+
+ def test_guess_parser3(self):
+ with open("test_invalid.html", "r") as f:
+ text = f.read()
+
+ c = Crawler("http://some.website", False)
+
+ self.assertEqual(c.guess_parser(text, {}), None)
\ No newline at end of file
diff --git a/spec/Parser_spec.py b/spec/Parser_spec.py
index 1c9122a..e952759 100644
--- a/spec/Parser_spec.py
+++ b/spec/Parser_spec.py
@@ -18,7 +18,7 @@ class NginxParserTest(TestCase):
def setUp(self):
self.parser = NginxParser()
- root_page_file = open("test_nginx_root.html", "r")
+ root_page_file = open("test_nginx1.html", "r")
self.root_page = root_page_file.read()
root_page_file.close()
@@ -57,7 +57,7 @@ class ApacheParserTest(TestCase):
def setUp(self):
self.parser = ApacheParser()
- root_page_file = open("test_apache_root.html", "r")
+ root_page_file = open("test_apache1.html", "r")
self.root_page = root_page_file.read()
root_page_file.close()
@@ -76,7 +76,7 @@ class ApacheParserTest(TestCase):
result = self.parser.get_links(self.root_page, "https://keisari.net/videos/")
self.assertEqual(result["happyday.mp4"]["size"], 772000)
- self.assertEqual(result["alex_räjähtää.mp4"]["size"], 715000)
+ self.assertEqual(result["alex_r%c3%a4j%c3%a4ht%c3%a4%c3%a4.mp4"]["size"], 715000)
def test_link_type(self):
result = self.parser.get_links(self.root_page, "https://keisari.net/videos/")
@@ -109,16 +109,67 @@ class ApacheParserTest2(TestCase):
def test_link_size(self):
result = self.parser.get_links(self.root_page, self.base_url)
- self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ020˜b.u‚æ‚Ý‚ª‚¦‚éƒTƒCƒ„l“`àIŒå‹ó‚̃‹[ƒcv.wmv"]["size"], 179721000)
- self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ225˜b.u‹‚¢‚ºƒ`ƒrƒbƒRII‚P‚W†‘å‹êíIHv.wmv"]["size"], 347507000)
+ self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ011˜b.u‰F’ˆˆê‚Ì‹íŽmƒTƒCƒ„l‚ß‚´‚ß‚éIv.wmv"]["size"], 232185000)
+ self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ019˜b.ud—͂ƂÌí‚¢Iƒoƒuƒ‹ƒXŒN‚ð‚‚©‚Ü‚¦‚ëv.wmv"]["size"], 185385000)
def test_link_type(self):
result = self.parser.get_links(self.root_page, self.base_url)
- self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ225˜b.u‹‚¢‚ºƒ`ƒrƒbƒRII‚P‚W†‘å‹êíIHv.wmv"]["type"], "f")
- self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z jpg/"]["type"], "d")
+ self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ011˜b.u‰F’ˆˆê‚Ì‹íŽmƒTƒCƒ„l‚ß‚´‚ß‚éIv.wmv"]["type"], "f")
+ self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z%20jpg/"]["type"], "d")
def test_link_extension(self):
result = self.parser.get_links(self.root_page, self.base_url)
- self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ225˜b.u‹‚¢‚ºƒ`ƒrƒbƒRII‚P‚W†‘å‹êíIHv.wmv"]["ext"], "wmv")
\ No newline at end of file
+ self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ011˜b.u‰F’ˆˆê‚Ì‹íŽmƒTƒCƒ„l‚ß‚´‚ß‚éIv.wmv"]["ext"], "wmv")
+
+
+class ApacheParserTest3(TestCase):
+
+ def setUp(self):
+ self.parser = ApacheParser()
+
+ root_page_file = open("test_apache3.html", "r")
+ self.root_page = root_page_file.read()
+ self.base_url = "http://files.duspectacle.com/mp3/Jardinets/"
+ root_page_file.close()
+
+ def test_link_count(self):
+
+ result = self.parser.get_links(self.root_page, self.base_url)
+
+ self.assertEqual(len(result), 21)
+
+ def test_link_size(self):
+ result = self.parser.get_links(self.root_page, self.base_url)
+
+ self.assertEqual(result["15%20Woodkid%20-%20Iron%20(Remix%20By%20Gucci%20Vump).mp3"]["size"], 9300000)
+ self.assertEqual(result["16%20Yellow%20Ostrich%20-%20WHALE.mp3"]["size"], 7100000)
+
+ def test_link_type(self):
+ result = self.parser.get_links(self.root_page, self.base_url)
+
+ self.assertEqual(result["15%20Woodkid%20-%20Iron%20(Remix%20By%20Gucci%20Vump).mp3"]["type"], "f")
+ self.assertEqual(result["01%20Jean%20Rochefort%20-%20Winnie%20et%20ses%20amis%20(introduction)/"]["type"], "d")
+
+ def test_link_extension(self):
+ result = self.parser.get_links(self.root_page, self.base_url)
+
+ self.assertEqual(result["15%20Woodkid%20-%20Iron%20(Remix%20By%20Gucci%20Vump).mp3"]["ext"], "mp3")
+
+
+class ApacheParserTest4(TestCase):
+
+ def setUp(self):
+ self.parser = ApacheParser()
+
+ root_page_file = open("test_apache4.html", "r")
+ self.root_page = root_page_file.read()
+ self.base_url = "http://jenserserver.no-ip.biz/movieserver/serien/bigbangtheorie/S3/"
+ root_page_file.close()
+
+ def test_link_size(self):
+ result = self.parser.get_links(self.root_page, self.base_url)
+
+ self.assertEqual(result["The.Big.Bang.Theory.S03E06.Football.fuer.Nerds.German.WS.DVDRip.XviD-DELiCiOUS.avi"]["size"], 175000000)
+ self.assertEqual(result["The.Big.Bang.Theory.S03E03.Sex.oder.Pralinen.German.WS.DVDRip.XviD-DELiCiOUS.avi"]["size"], 0)
\ No newline at end of file
diff --git a/spec/RedditBot_spec.py b/spec/RedditBot_spec.py
index ff0d667..a534fb0 100644
--- a/spec/RedditBot_spec.py
+++ b/spec/RedditBot_spec.py
@@ -1,5 +1,5 @@
from unittest import TestCase
-from reddit_bot import RedditBot
+from reddit_bot import RedditBot, TaskQueue, CrawTask
import os
@@ -33,3 +33,62 @@ class RedditBotTest(TestCase):
self.assertTrue(bot.has_crawled("000000"))
+class TaskQueueTest(TestCase):
+
+ def tearDown(self):
+ if os.path.isfile("task_queue_test.txt"):
+ os.remove("task_queue_test.txt")
+
+ def test_push_pop_test(self):
+
+ if os.path.isfile("task_queue_test.txt"):
+ os.remove("task_queue_test.txt")
+
+ tq = TaskQueue("task_queue_test.txt")
+ tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))
+
+ task1 = tq.pop()
+
+ self.assertEqual(tq.pop(), None)
+ self.assertEqual(task1.url, "http://awebsite.com/")
+ self.assertEqual(task1.post_id, "postid")
+
+ def test_persistence(self):
+
+ if os.path.isfile("task_queue_test.txt"):
+ os.remove("task_queue_test.txt")
+
+ tq = TaskQueue("task_queue_test.txt")
+ tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))
+
+ tq2 = TaskQueue("task_queue_test.txt")
+ task = tq2.pop()
+
+ self.assertEqual(task.url, "http://awebsite.com/")
+ self.assertEqual(task.post_id, "postid")
+
+ def test_multiple_tasks(self):
+ if os.path.isfile("task_queue_test.txt"):
+ os.remove("task_queue_test.txt")
+
+ tq = TaskQueue("task_queue_test.txt")
+
+ tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))
+ tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))
+ tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))
+
+ self.assertIsNotNone(tq.pop())
+ self.assertIsNotNone(tq.pop())
+ self.assertIsNotNone(tq.pop())
+ self.assertIsNone(tq.pop())
+
+ def test_is_queued(self):
+ if os.path.isfile("task_queue_test.txt"):
+ os.remove("task_queue_test.txt")
+
+ tq = TaskQueue("task_queue_test.txt")
+
+ tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))
+
+ self.assertTrue(tq.is_queued("postid"))
+ self.assertFalse(tq.is_queued("123456"))
\ No newline at end of file
diff --git a/spec/test_apache_root.html b/spec/test_apache1.html
similarity index 100%
rename from spec/test_apache_root.html
rename to spec/test_apache1.html
diff --git a/spec/test_apache3.html b/spec/test_apache3.html
new file mode 100644
index 0000000..eb551e8
--- /dev/null
+++ b/spec/test_apache3.html
@@ -0,0 +1,32 @@
+ Index of /mp3/Jardinets
+
+ Name                       Last modified      Size  Description
+ Parent Directory                                -
+ 01 Jean Rochefort - ..>    2017-12-04 16:33    -
+ 02 Krisma - Amore.mp3      2017-12-04 16:32   11M
+ 03 Bernard Estardy -..>    2017-12-04 16:32  3.5M
+ 04 Jamie Woon - Stre..>    2017-12-04 16:32  5.0M
+ 05 DyE - Fantasy.mp3       2017-12-04 16:33  6.9M
+ 06 Games - Planet Pa..>    2017-12-04 16:33  5.6M
+ 07 Yeasayer - Swallo..>    2017-12-04 16:33   11M
+ 08 Pacific! - Venus ..>    2017-12-04 16:32  5.7M
+ 09 Jacky Chalard - S..>    2017-12-04 16:33   11M
+ 10 Piry - Heroi Mode..>    2017-12-04 16:32  4.1M
+ 11 Bahamas - Bahamas..>    2017-12-04 16:32  7.9M
+ 12 Aeroplane - Fish ..>    2017-12-04 16:32  7.6M
+ 13 Discodeine - Sync..>    2017-12-04 16:33  6.8M
+ 14 Lykke Li - I Foll..>    2017-12-04 16:33  7.3M
+ 15 Woodkid - Iron (R..>    2017-12-04 16:33  9.3M
+ 16 Yellow Ostrich - ..>    2017-12-04 16:33  7.1M
+ 17 Connan Mockasin -..>    2017-12-04 16:32  6.3M
+ 18 Bruce Haack - May..>    2017-12-04 16:33  5.4M
+ cover-small.jpg            2017-12-04 16:32   97K
+ cover.jpg                  2017-12-04 16:33  466K
+ playlist.txt               2017-12-04 16:33   955
diff --git a/spec/test_apache4.html b/spec/test_apache4.html
new file mode 100644
index 0000000..07ed408
--- /dev/null
+++ b/spec/test_apache4.html
@@ -0,0 +1,38 @@
+ Index of /movieserver/serien/bigbangtheorie/S3