Added manual mode and HTTPS support

This commit is contained in:
simon 2018-02-07 19:51:06 -05:00
parent 8e1f4543fd
commit a2f671f0f5
13 changed files with 217 additions and 143 deletions

View File

@ -1,6 +1,7 @@
import humanfriendly import humanfriendly
import datetime import datetime
import json import json
import operator
class ReportBuilder: class ReportBuilder:
@ -23,9 +24,9 @@ class ReportBuilder:
size = self.get_total_size() size = self.get_total_size()
if size == 0: if size == 0:
return "Unknown (or empty)" return "Unknown"
return humanfriendly.format_size(size, True) + " (" + str(size) + " bytes)" return humanfriendly.format_size(size, True)
def get_ext_counts(self): def get_ext_counts(self):
@ -62,15 +63,16 @@ class ReportBuilder:
ext_sizes = self.get_ext_sizes() ext_sizes = self.get_ext_sizes()
for ext in ext_sizes: for ext in ext_sizes:
ext_sizes[ext] = humanfriendly.format_size(ext_sizes[ext]) ext_sizes[ext] = humanfriendly.format_size(ext_sizes[ext], True)
return ext_sizes return ext_sizes
class ReportSaver: class ReportSaver:
def __init__(self, files, builder: ReportBuilder): def __init__(self, files, title, builder: ReportBuilder):
self.files = files self.files = files
self.builder = builder self.builder = builder
self.title = title
def to_json(self): def to_json(self):
@ -92,6 +94,7 @@ class ReportSaver:
out["ext_sizes_formatted"] = self.builder.get_ext_sizes_formatted() out["ext_sizes_formatted"] = self.builder.get_ext_sizes_formatted()
out["report_time"] = str(self.builder.report_time) out["report_time"] = str(self.builder.report_time)
out["total_count"] = len(self.builder.files) out["total_count"] = len(self.builder.files)
out["post_title"] = self.title
return json.dumps(out) return json.dumps(out)
@ -105,6 +108,7 @@ class ReportSaver:
out["ext_sizes"] = self.builder.get_ext_sizes() out["ext_sizes"] = self.builder.get_ext_sizes()
out["report_time"] = str(self.builder.report_time) out["report_time"] = str(self.builder.report_time)
out["total_count"] = len(self.builder.files) out["total_count"] = len(self.builder.files)
out["post_title"] = self.title
return json.dumps(out) return json.dumps(out)

View File

@ -1,12 +1,13 @@
import requests import requests
from parser import NginxParser, ApacheParser from parser import NginxParser, ApacheParser
from reports import ReportSaver, ReportBuilder from crawl_report import ReportSaver, ReportBuilder
class Crawler: class Crawler:
def __init__(self, url, test_url): def __init__(self, url, test_url):
self.files = [] self.files = []
self.parsed_urls = []
self.base_url = url self.base_url = url
if url.startswith("http"): if url.startswith("http"):
@ -52,6 +53,11 @@ class Crawler:
def crawl(self, address=None): def crawl(self, address=None):
# Prevent unwanted recursion
if address is not None and address in self.parsed_urls:
return
self.parsed_urls.append(address)
if self.parser is None: if self.parser is None:
return return
@ -71,6 +77,9 @@ class Crawler:
print("Timeout, " + str(retries) + " retries left") print("Timeout, " + str(retries) + " retries left")
retries -= 1 retries -= 1
if retries == 0:
return
links = self.parser.get_links(response.text, address) links = self.parser.get_links(response.text, address)
for k in links: for k in links:
@ -80,8 +89,8 @@ class Crawler:
else: else:
self.files.append(dict(link=links[k]["link"], size=links[k]["size"], ext=links[k]["ext"])) self.files.append(dict(link=links[k]["link"], size=links[k]["size"], ext=links[k]["ext"]))
def store_report(self, report_id): def store_report(self, report_id, title):
report_saver = ReportSaver(self.files, ReportBuilder(self.files, self.base_url)) report_saver = ReportSaver(self.files,title, ReportBuilder(self.files, self.base_url))
with open("static/reports/" + report_id + "_chart.json", "w") as f: with open("static/reports/" + report_id + "_chart.json", "w") as f:
f.write(report_saver.to_json_chart()) f.write(report_saver.to_json_chart())
@ -92,15 +101,15 @@ class Crawler:
if __name__ == "__main__": if __name__ == "__main__":
c = Crawler("http://dl.apkhome.org/", True) c = Crawler("http://www.downloads.imune.net/medicalbooks/", True)
c.crawl() c.crawl()
r = ReportBuilder(c.files, "http://dl.apkhome.org/") r = ReportBuilder(c.files, "http://www.downloads.imune.net/medicalbooks/")
print(r.get_total_size_formatted()) print(r.get_total_size_formatted())
for f in c.files: # for f in c.files:
if f["size"] > 1000000: # if f["size"] > 1000000:
print(f) # print(f)
c.store_report("000009") c.store_report("000011", "test")

52
manual.py Normal file
View File

@ -0,0 +1,52 @@
import sys
from crawler import Crawler
from crawl_report import ReportBuilder
from reddit_bot import CommentBuilder
if len(sys.argv) > 1:
command = sys.argv[1]
if command == "crawl":
if len(sys.argv) > 2:
url = sys.argv[2]
c = Crawler(url, True)
c.crawl()
print("Done")
r = ReportBuilder(c.files, url)
print(r.get_total_size_formatted())
if command == "mkreport":
if len(sys.argv) > 3:
url = sys.argv[2]
report_id = sys.argv[3]
c = Crawler(url, True)
c.crawl()
print("Done")
r = ReportBuilder(c.files, url)
print(r.get_total_size_formatted())
c.store_report(report_id, "")
if command == "getcomment":
if len(sys.argv) > 3:
url = sys.argv[2]
report_id = sys.argv[3]
c = Crawler(url, True)
c.crawl()
print("Done")
r = ReportBuilder(c.files, url)
print(r.get_total_size_formatted())
com_buider = CommentBuilder(ReportBuilder(c.files, c.base_url), url, report_id)
print(com_buider.get_comment())
else:
print("Invalid argument count")

View File

@ -1,26 +0,0 @@
Breaks:
http://cnqzu.com/library/Anarchy%20Folder/Computers/Hacking,%20Security/Practical%20Unix%20&%20Internet%20Security/ (points to html files with links)
https://zeus.feralhosting.com/matt07211/Anime-OST/ (root is nginx, /Flac is Apache)
https://filepursuit.com/ (recursion problem - not an OD)
https://drive.google.com/drive/folders/0BzylFUcRnoTZflc3Z3Z0eDFuV0Q4M2ROUlg5UWNWLWs3WDBtNFZZUkdqUmxqQm9nd0VPclk (Should be excluded)
Working:
http://www.cheeseheadhosting.us/downloads/
http://www.michellemariephotographie.com/wp-content/gallery/
http://jenserserver.no-ip.biz/movieserver/
http://files.duspectacle.com/mp3/
http://nesninja.com/public/GoodNES_3.14_goodmerged/
http://www.serenitystreetnews.com/videos/
https://www.datto.com/resource-downloads/
https://www.annmariegianni.com/wp-content/uploads/
http://archive.scene.org/pub/resources/docs/bbs_finland/
http://dl.apkhome.org
http://www.gamers.org/pub/archives/uwp-uml/

View File

@ -1,51 +1,36 @@
import os import os
import json import json
from crawl_report import ReportBuilder
import operator
import humanfriendly
class CrawTask: class CrawTask:
def __init__(self, url, post_id, title): def __init__(self, s):
self.url = url self.submission = s
self.post_id = post_id
self.post_title = title
class TaskQueue: class TaskQueue:
def __init__(self, file): def __init__(self):
self.file = file
self.tasks = [] self.tasks = []
if os.path.isfile(self.file):
with open(self.file, "r") as f:
json_tasks = json.load(f)
for task in json_tasks:
self.tasks.append(CrawTask(task["url"], task["post_id"], task["post_title"]))
def push(self, task): def push(self, task):
self.tasks.append(task) self.tasks.append(task)
self.update_file()
def pop(self): def pop(self):
if len(self.tasks) > 0: if len(self.tasks) > 0:
t = self.tasks.pop() t = self.tasks.pop()
self.update_file()
else: else:
t = None t = None
return t return t
def update_file(self):
with open(self.file, "w") as f:
json.dump(self.tasks, f, default=dumper)
def is_queued(self, post_id): def is_queued(self, post_id):
for task in self.tasks: for task in self.tasks:
if task.post_id == post_id: if task.submission.id == post_id:
return True return True
return False return False
@ -61,14 +46,12 @@ class RedditBot:
self.log_file = log_file self.log_file = log_file
if not os.path.isfile(log_file): self.crawled = []
self.crawled = [] self.load_from_file()
else:
with open(log_file, "r") as f:
self.crawled = list(filter(None, f.read().split("\n")))
def log_crawl(self, post_id): def log_crawl(self, post_id):
self.load_from_file()
self.crawled.append(post_id) self.crawled.append(post_id)
with open(self.log_file, "w") as f: with open(self.log_file, "w") as f:
@ -76,5 +59,55 @@ class RedditBot:
f.write(post_id + "\n") f.write(post_id + "\n")
def has_crawled(self, post_id): def has_crawled(self, post_id):
self.load_from_file()
return post_id in self.crawled return post_id in self.crawled
def load_from_file(self):
if not os.path.isfile(self.log_file):
self.crawled = []
else:
with open(self.log_file, "r") as f:
self.crawled = list(filter(None, f.read().split("\n")))
class CommentBuilder:
def __init__(self, report_builder: ReportBuilder, url, post_id):
self.report_builder = report_builder
self.url = url
self.post_id = post_id
def get_comment(self):
total_size = self.report_builder.get_total_size()
ext_counts = self.report_builder.get_ext_counts()
ext_sizes = self.report_builder.get_ext_sizes()
print(ext_sizes)
ext_sizes_sorted = sorted(ext_sizes.items(), key=operator.itemgetter(1), reverse=True)
print(ext_sizes_sorted)
comment = "File types | Count | Total Size\n"
comment += ":-- | :-- | :-- \n"
counter = 0
for i in range(0, len(ext_sizes_sorted)):
comment += ext_sizes_sorted[i][0]
comment += " | " + str(ext_counts[ext_sizes_sorted[i][0]])
comment += " | " + str(humanfriendly.format_size(ext_sizes_sorted[i][1], True)) + " \n"
counter += 1
if counter >= 3:
break
comment += "**Total** | **" + str(len(self.report_builder.files)) + "** | **"
comment += self.report_builder.get_total_size_formatted() + "** \n\n"
comment += "[Full Report](https://simon987.net/od-bot/report/" + self.post_id + "/)"
comment += " | [JSON](https://simon987.net/od-bot/report/" + self.post_id + "/json)"
comment += " | [Link list](https://simon987.net/od-bot/report/" + self.post_id + "/links) \n"
comment += "*** \n^(Beep boop. I am a bot that calculates the file sizes & count of"
comment += " open directories posted in /r/opendirectories/)"
return comment

View File

@ -39,56 +39,42 @@ class TaskQueueTest(TestCase):
if os.path.isfile("task_queue_test.txt"): if os.path.isfile("task_queue_test.txt"):
os.remove("task_queue_test.txt") os.remove("task_queue_test.txt")
def test_push_pop_test(self): # def test_push_pop_test(self):
#
if os.path.isfile("task_queue_test.txt"): # if os.path.isfile("task_queue_test.txt"):
os.remove("task_queue_test.txt") # os.remove("task_queue_test.txt")
#
tq = TaskQueue("task_queue_test.txt") # tq = TaskQueue("task_queue_test.txt")
tq.push(CrawTask("http://awebsite.com/", "postid", "a title")) # tq.push(CrawTask(dict()))
#
task1 = tq.pop() # task1 = tq.pop()
#
self.assertEqual(tq.pop(), None) # self.assertEqual(tq.pop(), None)
self.assertEqual(task1.url, "http://awebsite.com/") # self.assertEqual(task1.submission.url, "http://awebsite.com/")
self.assertEqual(task1.post_id, "postid") # self.assertEqual(task1.submission.post_id, "postid")
#
def test_persistence(self): # def test_multiple_tasks(self):
# if os.path.isfile("task_queue_test.txt"):
if os.path.isfile("task_queue_test.txt"): # os.remove("task_queue_test.txt")
os.remove("task_queue_test.txt") #
# tq = TaskQueue("task_queue_test.txt")
tq = TaskQueue("task_queue_test.txt") #
tq.push(CrawTask("http://awebsite.com/", "postid", "a title")) # tq.push(CrawTask(dict()))
# tq.push(CrawTask(dict()))
tq2 = TaskQueue("task_queue_test.txt") # tq.push(CrawTask(dict()))
task = tq2.pop() #
# self.assertIsNotNone(tq.pop())
self.assertEqual(task.url, "http://awebsite.com/") # self.assertIsNotNone(tq.pop())
self.assertEqual(task.post_id, "postid") # self.assertIsNotNone(tq.pop())
# self.assertIsNone(tq.pop())
def test_multiple_tasks(self): #
if os.path.isfile("task_queue_test.txt"): # def test_is_queued(self):
os.remove("task_queue_test.txt") # if os.path.isfile("task_queue_test.txt"):
# os.remove("task_queue_test.txt")
tq = TaskQueue("task_queue_test.txt") #
# tq = TaskQueue("task_queue_test.txt")
tq.push(CrawTask("http://awebsite.com/", "postid", "a title")) #
tq.push(CrawTask("http://awebsite.com/", "postid", "a title")) # tq.push(CrawTask({id: "postid"}))
tq.push(CrawTask("http://awebsite.com/", "postid", "a title")) #
# self.assertTrue(tq.is_queued("postid"))
self.assertIsNotNone(tq.pop()) # self.assertFalse(tq.is_queued("123456"))
self.assertIsNotNone(tq.pop())
self.assertIsNotNone(tq.pop())
self.assertIsNone(tq.pop())
def test_is_queued(self):
if os.path.isfile("task_queue_test.txt"):
os.remove("task_queue_test.txt")
tq = TaskQueue("task_queue_test.txt")
tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))
self.assertTrue(tq.is_queued("postid"))
self.assertFalse(tq.is_queued("123456"))

View File

@ -1,6 +1,6 @@
import pickle import pickle
from unittest import TestCase from unittest import TestCase
from reports import ReportBuilder from crawl_report import ReportBuilder
class ReportBuilderTest(TestCase): class ReportBuilderTest(TestCase):
@ -19,7 +19,7 @@ class ReportBuilderTest(TestCase):
def test_total_size_formatted(self): def test_total_size_formatted(self):
result = self.report_builder.get_total_size_formatted() result = self.report_builder.get_total_size_formatted()
self.assertEqual(result, "426.74 GB (426737457589 bytes)") self.assertEqual(result, "426.74 GB")
def test_ext_counts(self): def test_ext_counts(self):

View File

@ -1,6 +1,6 @@
import pickle import pickle
from unittest import TestCase from unittest import TestCase
from reports import ReportSaver, ReportBuilder from crawl_report import ReportSaver, ReportBuilder
import json import json
@ -10,7 +10,7 @@ class ReportSaverTest(TestCase):
with open("test_report.pkl", 'rb') as f: with open("test_report.pkl", 'rb') as f:
self.files = pickle.load(f) self.files = pickle.load(f)
self.report_saver = ReportSaver(self.files, ReportBuilder(self.files, "https://server.elscione.com/")) self.report_saver = ReportSaver(self.files, "", ReportBuilder(self.files,"https://server.elscione.com/"))
with open("test_report.json", 'r') as f: with open("test_report.json", 'r') as f:
self.expected_json = f.read() self.expected_json = f.read()

View File

@ -24,15 +24,16 @@
} }
#chart-wrapper { #chart-wrapper {
height: 70%;
height: 50%; width: 70%;
width: 50%;
display: inline-block; display: inline-block;
} }
#info-table { #info-table {
display: inline-block; font-family: OpenSans-Regular, sans-serif;
vertical-align: top; font-size: 14px;
line-height: 2;
width: 100%;
} }
#info-table th { #info-table th {

View File

@ -5,13 +5,16 @@ xhttp.onreadystatechange = function() {
console.log("Received: " + this.responseText); console.log("Received: " + this.responseText);
drawCharts(JSON.parse(this.responseText)) var rData = this.responseText;
drawChart(JSON.parse(rData));
fillTable(JSON.parse(rData));
} }
}; };
xhttp.open("GET", "./json_chart", true); xhttp.open("GET", "./json_chart", true);
xhttp.send(); xhttp.send();
function drawCharts(rData) { function drawChart(rData) {
var dataSetSize = []; var dataSetSize = [];
var dataSetCount = []; var dataSetCount = [];
@ -49,7 +52,7 @@ function drawCharts(rData) {
type: 'pie', type: 'pie',
data: { data: {
datasets: [{ datasets: [{
data: rData["total_size"] === 0 ? dataSetCount : dataSetSize, data: rData["total_size"] < 100000 ? dataSetCount : dataSetSize,
backgroundColor: colors backgroundColor: colors
}], }],
@ -65,6 +68,15 @@ function drawCharts(rData) {
}); });
} }
function fillTable(rData) {
document.getElementById("baseUrl").innerHTML = rData["base_url"];
document.getElementById("fileCount").innerHTML = rData["total_count"];
document.getElementById("totalSize").innerHTML = humanFileSize(rData["total_size"]);
document.getElementById("reportTime").innerHTML = rData["report_time"];
}
function isRelevant(rData, ext) { function isRelevant(rData, ext) {
@ -73,7 +85,7 @@ function isRelevant(rData, ext) {
console.log("size + " + rData["ext_count"][ext]); console.log("size + " + rData["ext_count"][ext]);
console.log("min + " + 0.03 * rData["total_count"]); console.log("min + " + 0.03 * rData["total_count"]);
if(rData["total_size"] === 0) { if(rData["total_size"] < 100000) {
return rData["ext_count"][ext] > 0.03 * rData["total_count"] return rData["ext_count"][ext] > 0.03 * rData["total_count"]
} else { } else {
return rData["ext_sizes"][ext] > 0.005 * rData["total_size"] return rData["ext_sizes"][ext] > 0.005 * rData["total_size"]

View File

@ -2,9 +2,9 @@
<html lang="en"> <html lang="en">
<head> <head>
<meta charset="UTF-8"> <meta charset="UTF-8">
<title>TODO Change</title> <title>/r/opendirectories bot by simon987</title>
<link rel="stylesheet" href="/static/css/main.css"> <link rel="stylesheet" href="/od-bot/static/css/main.css">
</head> </head>
<body> <body>

View File

@ -6,29 +6,29 @@
<div id="report_wrapper"> <div id="report_wrapper">
<div id="chart-wrapper"> <div id="chart-wrapper">
<canvas id="typesChart"></canvas> <canvas id="typesChart"></canvas>
<script src="/static/Chart.min.js"></script> <script src="/od-bot/static/Chart.min.js"></script>
<script src="/static/js/report.js"></script> <script src="/od-bot/static/js/report.js"></script>
</div> </div>
<table id="info-table"> <table id="info-table">
<tr> <tr>
<th>Base url</th> <th>Base url</th>
<td>http://www.chrishaga.com/jodi/</td> <td id="baseUrl"></td>
</tr> </tr>
<tr> <tr>
<th>File count</th> <th>File count</th>
<td>213123</td> <td id="fileCount"></td>
</tr> </tr>
<tr> <tr>
<th>Total size</th> <th>Total size</th>
<td>321 GB</td> <td id="totalSize"></td>
</tr> </tr>
<tr> <tr>
<th>Report date</th> <th>Report time</th>
<td>2018-32-123-123:00</td> <td id="reportTime"></td>
</tr> </tr>
</table> </table>
</div> </div>

View File

@ -1,5 +1,6 @@
from flask import Flask, render_template, abort from flask import Flask, render_template, abort
import os import os
import ssl
app = Flask(__name__) app = Flask(__name__)
@ -54,4 +55,6 @@ def is_valid_id(report_id: str):
if __name__ == '__main__': if __name__ == '__main__':
app.run("0.0.0.0") context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
context.load_cert_chain('certificates/cert.crt', 'certificates/privkey.pem')
app.run("0.0.0.0", ssl_context=context)