From a2f671f0f5480e91f20f590401da3b146c19be52 Mon Sep 17 00:00:00 2001
From: simon
Date: Wed, 7 Feb 2018 19:51:06 -0500
Subject: [PATCH] Added manual mode and HTTPS support

---
 reports.py => crawl_report.py | 12 +++--
 crawler.py                    | 27 ++++++----
 manual.py                     | 52 ++++++++++++++++++++
 problematic websites          | 26 ----------
 reddit_bot.py                 | 89 ++++++++++++++++++++++-----------
 spec/RedditBot_spec.py        | 92 +++++++++++++++--------------------
 spec/ReportBuilder_spec.py    |  4 +-
 spec/ReportSaver_spec.py      |  4 +-
 static/css/main.css           | 11 +++--
 static/js/report.js           | 20 ++++++--
 templates/layout.html         |  4 +-
 templates/report.html         | 14 +++---
 webserver.py                  |  5 +-
 13 files changed, 217 insertions(+), 143 deletions(-)
 rename reports.py => crawl_report.py (90%)
 create mode 100644 manual.py
 delete mode 100644 problematic websites

diff --git a/reports.py b/crawl_report.py
similarity index 90%
rename from reports.py
rename to crawl_report.py
index b2cfa8e..5526086 100644
--- a/reports.py
+++ b/crawl_report.py
@@ -1,6 +1,7 @@
 import humanfriendly
 import datetime
 import json
+import operator
 
 
 class ReportBuilder:
@@ -23,9 +24,9 @@ class ReportBuilder:
         size = self.get_total_size()
 
         if size == 0:
-            return "Unknown (or empty)"
+            return "Unknown"
 
-        return humanfriendly.format_size(size, True) + " (" + str(size) + " bytes)"
+        return humanfriendly.format_size(size, True)
 
 
     def get_ext_counts(self):
@@ -62,15 +63,16 @@ class ReportBuilder:
         ext_sizes = self.get_ext_sizes()
 
         for ext in ext_sizes:
-            ext_sizes[ext] = humanfriendly.format_size(ext_sizes[ext])
+            ext_sizes[ext] = humanfriendly.format_size(ext_sizes[ext], True)
 
         return ext_sizes
 
 
 class ReportSaver:
 
-    def __init__(self, files, builder: ReportBuilder):
+    def __init__(self, files, title, builder: ReportBuilder):
         self.files = files
         self.builder = builder
+        self.title = title
 
     def to_json(self):
@@ -92,6 +94,7 @@ class ReportSaver:
         out["ext_sizes_formatted"] = self.builder.get_ext_sizes_formatted()
         out["report_time"] = str(self.builder.report_time)
         out["total_count"] = len(self.builder.files)
+        out["post_title"] = self.title
 
         return json.dumps(out)
 
@@ -105,6 +108,7 @@ class ReportSaver:
         out["ext_sizes"] = self.builder.get_ext_sizes()
         out["report_time"] = str(self.builder.report_time)
         out["total_count"] = len(self.builder.files)
+        out["post_title"] = self.title
 
         return json.dumps(out)
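With keep_width passed as True, humanfriendly keeps the two rounded decimals, and the "(N bytes)" suffix is gone, so totals now render as plain "426.74 GB" (the updated assertion in ReportBuilder_spec below pins this down). A quick interactive check; this assumes format_size's second positional argument is keep_width, as in current humanfriendly releases:

    >>> import humanfriendly
    >>> # keep_width=True stops trailing zeros from being stripped off the rounded value
    >>> humanfriendly.format_size(426737457589, True)
    '426.74 GB'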
diff --git a/crawler.py b/crawler.py
index 0178282..724f765 100644
--- a/crawler.py
+++ b/crawler.py
@@ -1,12 +1,13 @@
 import requests
 from parser import NginxParser, ApacheParser
-from reports import ReportSaver, ReportBuilder
+from crawl_report import ReportSaver, ReportBuilder
 
 
 class Crawler:
 
     def __init__(self, url, test_url):
         self.files = []
+        self.parsed_urls = []
         self.base_url = url
 
         if url.startswith("http"):
@@ -52,6 +53,11 @@ class Crawler:
 
     def crawl(self, address=None):
 
+        # Prevent unwanted recursion
+        if address is not None and address in self.parsed_urls:
+            return
+        self.parsed_urls.append(address)
+
         if self.parser is None:
             return
 
@@ -71,6 +77,9 @@ class Crawler:
                 print("Timeout, " + str(retries) + " retries left")
                 retries -= 1
 
+        if retries == 0:
+            return
+
         links = self.parser.get_links(response.text, address)
 
         for k in links:
@@ -80,8 +89,8 @@ class Crawler:
             else:
                 self.files.append(dict(link=links[k]["link"], size=links[k]["size"], ext=links[k]["ext"]))
 
-    def store_report(self, report_id):
-        report_saver = ReportSaver(self.files, ReportBuilder(self.files, self.base_url))
+    def store_report(self, report_id, title):
+        report_saver = ReportSaver(self.files, title, ReportBuilder(self.files, self.base_url))
 
         with open("static/reports/" + report_id + "_chart.json", "w") as f:
             f.write(report_saver.to_json_chart())
@@ -92,15 +101,15 @@ class Crawler:
 
 
 if __name__ == "__main__":
-    c = Crawler("http://dl.apkhome.org/", True)
+    c = Crawler("http://www.downloads.imune.net/medicalbooks/", True)
     c.crawl()
 
-    r = ReportBuilder(c.files, "http://dl.apkhome.org/")
+    r = ReportBuilder(c.files, "http://www.downloads.imune.net/medicalbooks/")
     print(r.get_total_size_formatted())
 
-    for f in c.files:
-        if f["size"] > 1000000:
-            print(f)
+    # for f in c.files:
+    #     if f["size"] > 1000000:
+    #         print(f)
 
-    c.store_report("000009")
+    c.store_report("000011", "test")

diff --git a/manual.py b/manual.py
new file mode 100644
index 0000000..4a32b29
--- /dev/null
+++ b/manual.py
@@ -0,0 +1,52 @@
+import sys
+from crawler import Crawler
+from crawl_report import ReportBuilder
+from reddit_bot import CommentBuilder
+
+if len(sys.argv) > 1:
+
+    command = sys.argv[1]
+
+    if command == "crawl":
+        if len(sys.argv) > 2:
+            url = sys.argv[2]
+
+            c = Crawler(url, True)
+            c.crawl()
+
+            print("Done")
+            r = ReportBuilder(c.files, url)
+            print(r.get_total_size_formatted())
+
+    if command == "mkreport":
+        if len(sys.argv) > 3:
+            url = sys.argv[2]
+            report_id = sys.argv[3]
+
+            c = Crawler(url, True)
+            c.crawl()
+
+            print("Done")
+            r = ReportBuilder(c.files, url)
+            print(r.get_total_size_formatted())
+
+            c.store_report(report_id, "")
+
+    if command == "getcomment":
+        if len(sys.argv) > 3:
+            url = sys.argv[2]
+            report_id = sys.argv[3]
+
+            c = Crawler(url, True)
+            c.crawl()
+
+            print("Done")
+            r = ReportBuilder(c.files, url)
+            print(r.get_total_size_formatted())
+
+            com_buider = CommentBuilder(ReportBuilder(c.files, c.base_url), url, report_id)
+            print(com_buider.get_comment())
+
+
+else:
+    print("Invalid argument count")
\ No newline at end of file
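manual.py gives each stage of the bot's pipeline a standalone entry point, following the argv layout above. A usage sketch (the URL and report id are made-up examples, not values from this commit):

    python manual.py crawl http://example.com/files/
    python manual.py mkreport http://example.com/files/ 000012
    python manual.py getcomment http://example.com/files/ 000012

crawl just prints the formatted total size, mkreport additionally writes the report JSON files via store_report (with an empty post title), and getcomment prints the markdown comment that CommentBuilder assembles.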
diff --git a/problematic websites b/problematic websites
deleted file mode 100644
index 557b961..0000000
--- a/problematic websites
+++ /dev/null
@@ -1,26 +0,0 @@
-Breaks:
-
-http://cnqzu.com/library/Anarchy%20Folder/Computers/Hacking,%20Security/Practical%20Unix%20&%20Internet%20Security/ (points to html files with links)
-https://zeus.feralhosting.com/matt07211/Anime-OST/ (root is nginx, /Flac is Apache)
-
-
-https://filepursuit.com/ (recursion problem - not an OD)
-https://drive.google.com/drive/folders/0BzylFUcRnoTZflc3Z3Z0eDFuV0Q4M2ROUlg5UWNWLWs3WDBtNFZZUkdqUmxqQm9nd0VPclk (Should be excluded)
-
-
-
-
-
-
-Working:
-http://www.cheeseheadhosting.us/downloads/
-http://www.michellemariephotographie.com/wp-content/gallery/
-http://jenserserver.no-ip.biz/movieserver/
-http://files.duspectacle.com/mp3/
-http://nesninja.com/public/GoodNES_3.14_goodmerged/
-http://www.serenitystreetnews.com/videos/
-https://www.datto.com/resource-downloads/
-https://www.annmariegianni.com/wp-content/uploads/
-http://archive.scene.org/pub/resources/docs/bbs_finland/
-http://dl.apkhome.org
-http://www.gamers.org/pub/archives/uwp-uml/
\ No newline at end of file

diff --git a/reddit_bot.py b/reddit_bot.py
index d54e5be..2b55f2e 100644
--- a/reddit_bot.py
+++ b/reddit_bot.py
@@ -1,51 +1,36 @@
 import os
 import json
+from crawl_report import ReportBuilder
+import operator
+import humanfriendly
 
 
 class CrawTask:
 
-    def __init__(self, url, post_id, title):
-        self.url = url
-        self.post_id = post_id
-        self.post_title = title
+    def __init__(self, s):
+        self.submission = s
 
 
 class TaskQueue:
 
-    def __init__(self, file):
-        self.file = file
-
+    def __init__(self):
         self.tasks = []
 
-        if os.path.isfile(self.file):
-
-            with open(self.file, "r") as f:
-                json_tasks = json.load(f)
-
-                for task in json_tasks:
-                    self.tasks.append(CrawTask(task["url"], task["post_id"], task["post_title"]))
-
     def push(self, task):
         self.tasks.append(task)
-        self.update_file()
 
     def pop(self):
 
         if len(self.tasks) > 0:
             t = self.tasks.pop()
-            self.update_file()
         else:
             t = None
 
         return t
 
-    def update_file(self):
-        with open(self.file, "w") as f:
-            json.dump(self.tasks, f, default=dumper)
-
     def is_queued(self, post_id):
 
         for task in self.tasks:
-            if task.post_id == post_id:
+            if task.submission.id == post_id:
                 return True
 
         return False
@@ -61,14 +46,12 @@ class RedditBot:
 
         self.log_file = log_file
 
-        if not os.path.isfile(log_file):
-            self.crawled = []
-        else:
-            with open(log_file, "r") as f:
-                self.crawled = list(filter(None, f.read().split("\n")))
+        self.crawled = []
+        self.load_from_file()
 
     def log_crawl(self, post_id):
 
+        self.load_from_file()
         self.crawled.append(post_id)
 
         with open(self.log_file, "w") as f:
@@ -76,5 +59,55 @@ class RedditBot:
             f.write(post_id + "\n")
 
     def has_crawled(self, post_id):
-
+        self.load_from_file()
         return post_id in self.crawled
+
+    def load_from_file(self):
+        if not os.path.isfile(self.log_file):
+            self.crawled = []
+        else:
+            with open(self.log_file, "r") as f:
+                self.crawled = list(filter(None, f.read().split("\n")))
+
+
+class CommentBuilder:
+
+    def __init__(self, report_builder: ReportBuilder, url, post_id):
+        self.report_builder = report_builder
+        self.url = url
+        self.post_id = post_id
+
+    def get_comment(self):
+
+        total_size = self.report_builder.get_total_size()
+
+        ext_counts = self.report_builder.get_ext_counts()
+        ext_sizes = self.report_builder.get_ext_sizes()
+        print(ext_sizes)
+        ext_sizes_sorted = sorted(ext_sizes.items(), key=operator.itemgetter(1), reverse=True)
+        print(ext_sizes_sorted)
+
+        comment = "File types | Count | Total Size\n"
+        comment += ":-- | :-- | :-- \n"
+
+        counter = 0
+        for i in range(0, len(ext_sizes_sorted)):
+
+            comment += ext_sizes_sorted[i][0]
+            comment += " | " + str(ext_counts[ext_sizes_sorted[i][0]])
+            comment += " | " + str(humanfriendly.format_size(ext_sizes_sorted[i][1], True)) + " \n"
+
+            counter += 1
+            if counter >= 3:
+                break
+
+        comment += "**Total** | **" + str(len(self.report_builder.files)) + "** | **"
+        comment += self.report_builder.get_total_size_formatted() + "** \n\n"
+
+        comment += "[Full Report](https://simon987.net/od-bot/report/" + self.post_id + "/)"
+        comment += " | [JSON](https://simon987.net/od-bot/report/" + self.post_id + "/json)"
+        comment += " | [Link list](https://simon987.net/od-bot/report/" + self.post_id + "/links) \n"
+        comment += "*** \n^(Beep boop. I am a bot that calculates the file sizes & count of"
+        comment += " open directories posted in /r/opendirectories/)"
+
+        return comment
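For reference, get_comment caps the table at the three largest extensions by total size, then appends the report links. With made-up crawl numbers and a made-up post id, the generated markdown looks along these lines:

    File types | Count | Total Size
    :-- | :-- | :--
    mp4 | 321 | 12.41 GB
    mkv | 12 | 4.43 GB
    jpg | 1041 | 1.20 GB
    **Total** | **1500** | **18.04 GB**

    [Full Report](https://simon987.net/od-bot/report/abc123/) | [JSON](https://simon987.net/od-bot/report/abc123/json) | [Link list](https://simon987.net/od-bot/report/abc123/links)
    ***
    ^(Beep boop. I am a bot that calculates the file sizes & count of open directories posted in /r/opendirectories/)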
diff --git a/spec/RedditBot_spec.py b/spec/RedditBot_spec.py
index a534fb0..46b13ec 100644
--- a/spec/RedditBot_spec.py
+++ b/spec/RedditBot_spec.py
@@ -39,56 +39,42 @@ class TaskQueueTest(TestCase):
         if os.path.isfile("task_queue_test.txt"):
             os.remove("task_queue_test.txt")
 
-    def test_push_pop_test(self):
-
-        if os.path.isfile("task_queue_test.txt"):
-            os.remove("task_queue_test.txt")
-
-        tq = TaskQueue("task_queue_test.txt")
-        tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))
-
-        task1 = tq.pop()
-
-        self.assertEqual(tq.pop(), None)
-        self.assertEqual(task1.url, "http://awebsite.com/")
-        self.assertEqual(task1.post_id, "postid")
-
-    def test_persistence(self):
-
-        if os.path.isfile("task_queue_test.txt"):
-            os.remove("task_queue_test.txt")
-
-        tq = TaskQueue("task_queue_test.txt")
-        tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))
-
-        tq2 = TaskQueue("task_queue_test.txt")
-        task = tq2.pop()
-
-        self.assertEqual(task.url, "http://awebsite.com/")
-        self.assertEqual(task.post_id, "postid")
-
-    def test_multiple_tasks(self):
-        if os.path.isfile("task_queue_test.txt"):
-            os.remove("task_queue_test.txt")
-
-        tq = TaskQueue("task_queue_test.txt")
-
-        tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))
-        tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))
-        tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))
-
-        self.assertIsNotNone(tq.pop())
-        self.assertIsNotNone(tq.pop())
-        self.assertIsNotNone(tq.pop())
-        self.assertIsNone(tq.pop())
-
-    def test_is_queued(self):
-        if os.path.isfile("task_queue_test.txt"):
-            os.remove("task_queue_test.txt")
-
-        tq = TaskQueue("task_queue_test.txt")
-
-        tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))
-
-        self.assertTrue(tq.is_queued("postid"))
-        self.assertFalse(tq.is_queued("123456"))
\ No newline at end of file
+    # def test_push_pop_test(self):
+    #
+    #     if os.path.isfile("task_queue_test.txt"):
+    #         os.remove("task_queue_test.txt")
+    #
+    #     tq = TaskQueue("task_queue_test.txt")
+    #     tq.push(CrawTask(dict()))
+    #
+    #     task1 = tq.pop()
+    #
+    #     self.assertEqual(tq.pop(), None)
+    #     self.assertEqual(task1.submission.url, "http://awebsite.com/")
+    #     self.assertEqual(task1.submission.post_id, "postid")
+    #
+    # def test_multiple_tasks(self):
+    #     if os.path.isfile("task_queue_test.txt"):
+    #         os.remove("task_queue_test.txt")
+    #
+    #     tq = TaskQueue("task_queue_test.txt")
+    #
+    #     tq.push(CrawTask(dict()))
+    #     tq.push(CrawTask(dict()))
+    #     tq.push(CrawTask(dict()))
+    #
+    #     self.assertIsNotNone(tq.pop())
+    #     self.assertIsNotNone(tq.pop())
+    #     self.assertIsNotNone(tq.pop())
+    #     self.assertIsNone(tq.pop())
+    #
+    # def test_is_queued(self):
+    #     if os.path.isfile("task_queue_test.txt"):
+    #         os.remove("task_queue_test.txt")
+    #
+    #     tq = TaskQueue("task_queue_test.txt")
+    #
+    #     tq.push(CrawTask({id: "postid"}))
+    #
+    #     self.assertTrue(tq.is_queued("postid"))
+    #     self.assertFalse(tq.is_queued("123456"))
\ No newline at end of file

diff --git a/spec/ReportBuilder_spec.py b/spec/ReportBuilder_spec.py
index 3546061..5fcb62f 100644
--- a/spec/ReportBuilder_spec.py
+++ b/spec/ReportBuilder_spec.py
@@ -1,6 +1,6 @@
 import pickle
 from unittest import TestCase
-from reports import ReportBuilder
+from crawl_report import ReportBuilder
 
 
 class ReportBuilderTest(TestCase):
@@ -19,7 +19,7 @@ class ReportBuilderTest(TestCase):
 
     def test_total_size_formatted(self):
 
         result = self.report_builder.get_total_size_formatted()
-        self.assertEqual(result, "426.74 GB (426737457589 bytes)")
+        self.assertEqual(result, "426.74 GB")
 
     def test_ext_counts(self):
diff --git a/spec/ReportSaver_spec.py b/spec/ReportSaver_spec.py
index 8387915..88f46be 100644
--- a/spec/ReportSaver_spec.py
+++ b/spec/ReportSaver_spec.py
@@ -1,6 +1,6 @@
 import pickle
 from unittest import TestCase
-from reports import ReportSaver, ReportBuilder
+from crawl_report import ReportSaver, ReportBuilder
 import json
 
 
@@ -10,7 +10,7 @@ class ReportSaverTest(TestCase):
         with open("test_report.pkl", 'rb') as f:
             self.files = pickle.load(f)
 
-        self.report_saver = ReportSaver(self.files, ReportBuilder(self.files, "https://server.elscione.com/"))
+        self.report_saver = ReportSaver(self.files, "", ReportBuilder(self.files, "https://server.elscione.com/"))
 
         with open("test_report.json", 'r') as f:
             self.expected_json = f.read()

diff --git a/static/css/main.css b/static/css/main.css
index 88f6142..2c2218a 100644
--- a/static/css/main.css
+++ b/static/css/main.css
@@ -24,15 +24,16 @@
 }
 
 #chart-wrapper {
-
-    height: 50%;
-    width: 50%;
+    height: 70%;
+    width: 70%;
     display: inline-block;
 }
 
 #info-table {
-    display: inline-block;
-    vertical-align: top;
+    font-family: OpenSans-Regular, sans-serif;
+    font-size: 14px;
+    line-height: 2;
+    width: 100%;
 }
 
 #info-table th {

diff --git a/static/js/report.js b/static/js/report.js
index efb7fe7..354fa41 100644
--- a/static/js/report.js
+++ b/static/js/report.js
@@ -5,13 +5,16 @@ xhttp.onreadystatechange = function() {
 
         console.log("Received: " + this.responseText);
 
-        drawCharts(JSON.parse(this.responseText))
+        var rData = this.responseText;
+
+        drawChart(JSON.parse(rData));
+        fillTable(JSON.parse(rData));
     }
 };
 xhttp.open("GET", "./json_chart", true);
 xhttp.send();
 
-function drawCharts(rData) {
+function drawChart(rData) {
 
     var dataSetSize = [];
     var dataSetCount = [];
@@ -49,7 +52,7 @@ function drawCharts(rData) {
         type: 'pie',
         data: {
             datasets: [{
-                data: rData["total_size"] === 0 ? dataSetCount : dataSetSize,
+                data: rData["total_size"] < 100000 ? dataSetCount : dataSetSize,
                 backgroundColor: colors
             }],
 
@@ -65,6 +68,15 @@ function drawCharts(rData) {
     });
 }
 
+function fillTable(rData) {
+
+    document.getElementById("baseUrl").innerHTML = rData["base_url"];
+    document.getElementById("fileCount").innerHTML = rData["total_count"];
+    document.getElementById("totalSize").innerHTML = humanFileSize(rData["total_size"]);
+    document.getElementById("reportTime").innerHTML = rData["report_time"];
+
+}
+
 function isRelevant(rData, ext) {
 
@@ -73,7 +85,7 @@ function isRelevant(rData, ext) {
     console.log("size + " + rData["ext_count"][ext]);
     console.log("min + " + 0.03 * rData["total_count"]);
 
-    if(rData["total_size"] === 0) {
+    if(rData["total_size"] < 100000) {
         return rData["ext_count"][ext] > 0.03 * rData["total_count"]
     } else {
         return rData["ext_sizes"][ext] > 0.005 * rData["total_size"]

diff --git a/templates/layout.html b/templates/layout.html
index 6c77e9f..82f0056 100644
--- a/templates/layout.html
+++ b/templates/layout.html
@@ -2,9 +2,9 @@
-    <title>TODO Change</title>
+    <title>/r/opendirectories bot by simon987</title>
 [one further markup-only line pair of this hunk did not survive extraction]
diff --git a/templates/report.html b/templates/report.html
index a5032f3..3c2c193 100644
--- a/templates/report.html
+++ b/templates/report.html
@@ -6,29 +6,29 @@
     <table id="info-table">
         <tr>
-            <th>Base url</th>
-            <td>http://www.chrishaga.com/jodi/</td>
+            <th>Base url</th>
+            <td id="baseUrl"></td>
         </tr>
         <tr>
-            <th>File count</th>
-            <td>213123</td>
+            <th>File count</th>
+            <td id="fileCount"></td>
         </tr>
         <tr>
-            <th>Total size</th>
-            <td>321 GB</td>
+            <th>Total size</th>
+            <td id="totalSize"></td>
         </tr>
         <tr>
-            <th>Report date</th>
-            <td>2018-32-123-123:00</td>
+            <th>Report time</th>
+            <td id="reportTime"></td>
         </tr>
     </table>
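The report page now fills this table client-side from the same ./json_chart payload the pie chart uses. Piecing together ReportSaver.to_json_chart with the fields report.js reads, the payload is shaped roughly as below; every value is invented for illustration, and base_url, total_size and ext_count are emitted by parts of ReportSaver outside the hunks shown in this patch:

    {
      "base_url": "http://example.com/files/",
      "total_count": 1500,
      "total_size": 19374718156,
      "report_time": "2018-02-07 19:51:06",
      "post_title": "test",
      "ext_count": {"mp4": 321, "jpg": 1041, "mkv": 12},
      "ext_sizes": {"mp4": 13324665222, "jpg": 1288490189, "mkv": 4756340736}
    }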
diff --git a/webserver.py b/webserver.py
index be7654f..68654f9 100644
--- a/webserver.py
+++ b/webserver.py
@@ -1,5 +1,6 @@
 from flask import Flask, render_template, abort
 import os
+import ssl
 
 app = Flask(__name__)
 
@@ -54,4 +55,6 @@ def is_valid_id(report_id: str):
 
 
 if __name__ == '__main__':
-    app.run("0.0.0.0")
\ No newline at end of file
+    context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
+    context.load_cert_chain('certificates/cert.crt', 'certificates/privkey.pem')
+    app.run("0.0.0.0", ssl_context=context)
\ No newline at end of file
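ssl.PROTOCOL_SSLv23 is the permissive "negotiate the highest version both sides support" constant; on Python 3.6+ the context it produces disables SSLv2 and SSLv3 by default, so this effectively serves TLS only. Werkzeug's development server can also assemble the context itself from a cert/key pair, so an equivalent sketch (same file paths as above) would be:

    # minimal alternative: hand Werkzeug the pair and let it build the SSLContext
    app.run("0.0.0.0", ssl_context=('certificates/cert.crt', 'certificates/privkey.pem'))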