Mirror of https://github.com/simon987/opendirectories-bot.git (synced 2025-04-10 14:06:44 +00:00)

Commit 23775ec126 (parent f3dc1445e4): Fixed bugs, enhanced parser

crawler.py | 43

@@ -2,19 +2,41 @@ import requests
from parser import NginxParser, ApacheParser
from reports import ReportSaver, ReportBuilder

headers = {
    'User-Agent': "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
}


class Crawler:

    def __init__(self, url):
        self.parser = NginxParser()
    def __init__(self, url, test_url):
        self.files = []
        self.base_url = url

        if test_url:
            # Test url
            r = requests.get(self.base_url, timeout=30)

            self.parser = self.guess_parser(r.text, r.headers)()

            print("Using " + self.parser.__class__.__name__ + " as parser")

        else:
            self.parser = None

    @staticmethod
    def guess_parser(text, headers):

        server = headers["Server"] if "Server" in headers else ""

        # try nginx
        parser = NginxParser()
        if parser.page_is_valid(text):
            return NginxParser

        # Try apache
        parser = ApacheParser()
        if parser.page_is_valid(text):
            return ApacheParser

        return None

    def crawl(self, address=None):

        if address is None:
@@ -53,6 +75,7 @@ class Crawler:
            f.write(report_saver.to_link_list())


c = Crawler("http://dl.upload8.in/files/Serial/Altered%20Carbon/")
c.crawl()
c.store_report("000002")
if __name__ == "__main__":
    c = Crawler("https://repo.zenk-security.com/", True)
    c.crawl()
    c.store_report("000007")
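
For context, the new constructor flag controls parser auto-detection: when it is truthy, the target page is fetched once with the headers above and each parser's page_is_valid() is tried in turn. A minimal usage sketch (the URL and report id below are placeholders, not taken from the commit):

    from crawler import Crawler

    c = Crawler("http://example.com/files/", True)  # True: fetch once and auto-detect Nginx vs Apache
    c.crawl()
    c.store_report("000001")                        # arbitrary report id
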
parser.py | 182

@@ -1,14 +1,43 @@
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
import re
from urllib.parse import urljoin

import humanfriendly
from bs4 import BeautifulSoup


class PageParser:

    def __init__(self):
        self.col_start = None
        self.col_end = None
        self.size_unknown = True

    def get_links(self, text: str, base_url: str):
        raise NotImplementedError()

    @staticmethod
    def get_size_columns(cols):

        for i in range(len(cols)):

            if i == len(cols) - 1:
                try:
                    humanfriendly.parse_size(cols[i])
                    return tuple([i, i])
                except humanfriendly.InvalidSize:
                    return None

            try:
                humanfriendly.parse_size(cols[i] + cols[i + 1])
                return tuple([i, i + 1])
            except humanfriendly.InvalidSize:
                try:
                    humanfriendly.parse_size(cols[i])
                    return tuple([i, i])
                except humanfriendly.InvalidSize:
                    continue

    @staticmethod
    def get_parser_type(headers):
        """Get appropriate parser type for a server based on its header"""
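
The column scan above leans on humanfriendly.parse_size(), which accepts values such as "3.5M" or bare byte counts but raises InvalidSize for dates and other tokens; adjacent columns are tried in pairs because some listings split the number and the unit into separate whitespace-delimited tokens. A small illustration (sample values only):

    import humanfriendly

    humanfriendly.parse_size("6.8M")            # 6800000
    humanfriendly.parse_size("955")             # 955 (bare byte count, as in the playlist.txt fixture row)
    humanfriendly.parse_size("175" + "M")       # 175000000 -- number and unit from two adjacent columns
    try:
        humanfriendly.parse_size("2014-05-16")  # a date column is not a size
    except humanfriendly.InvalidSize:
        pass                                    # move on to the next column
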
@@ -26,26 +55,82 @@ class PageParser:

    @staticmethod
    def file_type(link):
        return "d" if link.endswith("/") else "f"

        if link.endswith("/") or link.startswith("?"):
            return "d"
        return "f"


    @staticmethod
    def clean_page(text):
        text = text.replace("<A", "<a")
        text = text.replace("</A", "</a")
        # text = text.replace("&amp;", "&")
        text = text.replace("<hr>", "")

        return text

    def get_size(self, cols):

        # Figure out which column(s) is the size one
        size_cols = self.get_size_columns(cols)
        if size_cols is not None:
            col_start, col_end = size_cols
            self.size_unknown = False

            size_human = cols[col_start] if col_start == col_end else cols[col_start] + cols[col_end]

            try:
                size = humanfriendly.parse_size(size_human)
            except humanfriendly.InvalidSize:
                size = 0
        else:
            size = 0

        return size

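The updated file_type() also classifies hrefs that start with "?" (e.g. Apache's column-sort links such as "?C=N;O=D") as "d" instead of "f". A quick sketch of the rule in use (hrefs abridged from the test fixtures):

    from parser import PageParser

    for href in ["01%20Jean%20Rochefort/", "?C=N;O=D", "cover.jpg", "playlist.txt"]:
        print(href, "->", PageParser.file_type(href))
    # 01%20Jean%20Rochefort/ -> d, ?C=N;O=D -> d, cover.jpg -> f, playlist.txt -> f
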
class NginxParser(PageParser):
    def get_links(self, text, base_url: str):

        links = dict()
        soup = BeautifulSoup(text, "html.parser")

        # Handle weird character formats and tag names
        text = text.replace("<A", "<a")
        text = text.replace("</A", "</a")
        text = text.replace("&amp;", "&")
        text = self.clean_page(text)

        soup = BeautifulSoup(text, "html.parser")

        for link in soup.find("pre").find_all("a"):

            if link.text != "../":
                parsed_link = self.parse_link(link, text, base_url)
                if parsed_link is not None:
                    links[parsed_link[0]] = parsed_link[1]

        return links

    def page_is_valid(self, text):
        # Handle weird character formats and tag names
        text = self.clean_page(text)

        soup = BeautifulSoup(text, "html.parser")

        if soup.find("pre") is None:
            return False

        # try to parse a single link
        for link in soup.find("pre").find_all("a"):
            if PageParser.should_save_link(link.text):
                if self.parse_link(link, text, "") is None:
                    return False

        return True

    def parse_link(self, link, text, base_url):

        try:
            if PageParser.should_save_link(link.text):
                target = link.get("href")
                full_link = urljoin(base_url, target)
                file_type = PageParser.file_type(full_link)
                file_type = PageParser.file_type(target)

                if file_type == "f":
                    extension = os.path.splitext(full_link)[1].strip(".")
@@ -53,46 +138,30 @@ class NginxParser(PageParser):
                    # Parse size
                    target_index = text.find("</a", text.find(target))
                    date_and_size = text[target_index:text.find("<a", target_index)]
                    size = humanfriendly.parse_size(re.split("\s+", date_and_size)[3])

                    links[link.text] = dict(link=full_link, size=size, ext=extension, type=file_type)
                    cols = re.split("\s+", date_and_size)
                    size = self.get_size(cols)

                    return target, dict(link=full_link, size=size, ext=extension, type=file_type)
                else:
                    links[link.text] = dict(link=full_link, type=file_type)
                    return target, dict(link=full_link, type=file_type)
        except Exception as e:
            print("Couldn't parse link " + link.get("href") + str(e))
            raise e

        return links
        return None

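Instead of hard-coding column index 3, the size parsing above now splits everything between a link's closing </a> tag and the next <a> tag into whitespace-delimited columns and hands them to get_size(). A rough sketch of that slice on a made-up autoindex row (not from the repository's fixtures):

    import re

    # Hypothetical nginx autoindex fragment, for illustration only
    text = '<a href="video.mp4">video.mp4</a>    17-Jan-2017 18:52    6800000\n<a href="docs/">docs/</a>'
    target = "video.mp4"

    target_index = text.find("</a", text.find(target))
    date_and_size = text[target_index:text.find("<a", target_index)]
    print(re.split(r"\s+", date_and_size))
    # ['</a>', '17-Jan-2017', '18:52', '6800000', ''] -> get_size() keeps the first token(s) parse_size accepts
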
class ApacheParser(PageParser):

    def __init__(self):
        self.col_start = None
        self.col_end = None
        self.size_unknown = True

    def get_size_columns(self, cols):

        for i in range(len(cols) - 1):
            try:
                humanfriendly.parse_size(cols[i] + cols[i + 1])
                return tuple([i, i + 1])
            except humanfriendly.InvalidSize:
                try:
                    humanfriendly.parse_size(cols[i])
                    return tuple([i, i])
                except humanfriendly.InvalidSize:
                    continue

    def get_links(self, text, base_url: str):

        links = dict()
        soup = BeautifulSoup(text, "html.parser")

        # Handle weird character formats and tag names
        text = text.replace("<A", "<a")
        text = text.replace("</A", "</a")
        text = text.replace("&amp;", "&")

        text = self.clean_page(text)

        soup = BeautifulSoup(text, "html.parser")

        if soup.find("table"):

@@ -109,20 +178,20 @@ class ApacheParser(PageParser):
                if PageParser.should_save_link(link.text):

                    target = link.get("href")
                    file_type = PageParser.file_type(target)
                    full_link = urljoin(base_url, target)
                    file_type = PageParser.file_type(full_link)

                    if file_type == "f":
                        extension = os.path.splitext(full_link)[1].strip(".")

                        cols = row.find_all("td")
                        for i in range(len(cols)):
                            cols[i] = cols[i].string if cols[i].string is not None else ""
                            cols[i] = cols[i].string if cols[i].string is not None else "-"
                        size = self.get_size(cols)

                        links[link.text] = dict(link=full_link, size=size, ext=extension, type=file_type)
                        links[target] = dict(link=full_link, size=size, ext=extension, type=file_type)
                    else:
                        links[link.text] = dict(link=full_link, type=file_type)
                        links[target] = dict(link=full_link, type=file_type)
        else:

            for link in soup.find_all("a"):
@@ -131,36 +200,33 @@ class ApacheParser(PageParser):

                target = link.get("href")
                full_link = urljoin(base_url, target)
                file_type = PageParser.file_type(full_link)
                file_type = PageParser.file_type(target)

                if file_type == "f":
                    extension = os.path.splitext(full_link)[1].strip(".")

                    target_index = text.find("</a", text.find(target))
                    date_and_size = text[target_index:text.find("<a", target_index)]
                    date_and_size = text[target_index:text.find("<a", target_index)]  # in some cases we're looking for </pre instead
                    date_and_size = text[target_index:text.find("</pre", target_index)] if text.find("<a", target_index) == -1 else date_and_size

                    cols = re.split("\s+", date_and_size)
                    size = self.get_size(cols)

                    links[link.text] = dict(link=full_link, size=size, ext=extension, type=file_type)
                    links[target] = dict(link=full_link, size=size, ext=extension, type=file_type)
                else:
                    links[link.text] = dict(link=full_link, type=file_type)
                    links[target] = dict(link=full_link, type=file_type)

        return links

    def get_size(self, cols):
        if self.col_start is None:
            # Figure out which column(s) is the size one
            size_cols = self.get_size_columns(cols)
            if size_cols is not None:
                self.col_start, self.col_end = size_cols
                self.size_unknown = False
    def page_is_valid(self, text):

        try:
            links = self.get_links(text, "")
            print(links)
            return True
        except Exception as e:
            print("This is not recognised Apache open directory: " + str(e))


        if self.size_unknown:
            size = 0
        else:
            size_human = cols[self.col_start] if self.col_start == self.col_end else cols[self.col_start] + cols[self.col_end]
            size = humanfriendly.parse_size(size_human)
        return size

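In the table-driven branch, each row's <td> cells are flattened to their string content before the size columns are located; cells whose .string is None (for instance the icon cell, which only wraps an <img> tag) are now mapped to "-" instead of "" before being handed to get_size(). A rough illustration with a made-up row in the style of the new fixtures:

    from bs4 import BeautifulSoup

    row_html = ('<tr><td><img src="/icons/movie.gif" alt="[VID]"></td>'
                '<td><a href="episode.avi">episode.avi</a></td>'
                '<td align="right">2014-05-16 17:14</td>'
                '<td align="right">175M</td><td>&nbsp;</td></tr>')
    row = BeautifulSoup(row_html, "html.parser").find("tr")

    cols = row.find_all("td")
    for i in range(len(cols)):
        cols[i] = cols[i].string if cols[i].string is not None else "-"
    print(cols)  # ['-', 'episode.avi', '2014-05-16 17:14', '175M', '\xa0']
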
@@ -1,4 +1,58 @@
import os
import json


class CrawTask:

    def __init__(self, url, post_id, title):
        self.url = url
        self.post_id = post_id
        self.post_title = title


class TaskQueue:

    def __init__(self, file):
        self.file = file

        self.tasks = []

        if os.path.isfile(self.file):

            with open(self.file, "r") as f:
                json_tasks = json.load(f)

                for task in json_tasks:
                    self.tasks.append(CrawTask(task["url"], task["post_id"], task["post_title"]))

    def push(self, task):
        self.tasks.append(task)
        self.update_file()

    def pop(self):
        if len(self.tasks) > 0:
            t = self.tasks.pop()
            self.update_file()
        else:
            t = None

        return t

    def update_file(self):
        with open(self.file, "w") as f:
            json.dump(self.tasks, f, default=dumper)

    def is_queued(self, post_id):

        for task in self.tasks:
            if task.post_id == post_id:
                return True

        return False


def dumper(obj):
    return obj.__dict__


class RedditBot:
@@ -11,8 +65,7 @@ class RedditBot:
            self.crawled = []
        else:
            with open(log_file, "r") as f:
                self.crawled = f.read().split("\n")
                self.crawled = list(filter(None, self.crawled))
                self.crawled = list(filter(None, f.read().split("\n")))

    def log_crawl(self, post_id):

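Because update_file() serialises with default=dumper, the queue file is simply a JSON list of each CrawTask's attribute dict, which is exactly the shape __init__() reads back. A small round-trip sketch (CrawTask and dumper restated so it runs standalone; the task values are placeholders):

    import json

    class CrawTask:
        def __init__(self, url, post_id, title):
            self.url = url
            self.post_id = post_id
            self.post_title = title

    def dumper(obj):
        return obj.__dict__

    tasks = [CrawTask("http://example.com/dir/", "abc123", "an open directory")]
    print(json.dumps(tasks, default=dumper))
    # [{"url": "http://example.com/dir/", "post_id": "abc123", "post_title": "an open directory"}]
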
@@ -91,6 +91,7 @@ class ReportSaver:
        out["ext_sizes"] = self.builder.get_ext_sizes()
        out["ext_sizes_formatted"] = self.builder.get_ext_sizes_formatted()
        out["report_time"] = str(self.builder.report_time)
        out["total_count"] = len(self.builder.files)

        return json.dumps(out)

@@ -103,6 +104,7 @@ class ReportSaver:
        out["ext_count"] = self.builder.get_ext_counts()
        out["ext_sizes"] = self.builder.get_ext_sizes()
        out["report_time"] = str(self.builder.report_time)
        out["total_count"] = len(self.builder.files)

        return json.dumps(out)

@@ -0,0 +1,32 @@
from unittest import TestCase

from parser import ApacheParser, NginxParser
from crawler import Crawler


class CrawlerTest(TestCase):

    def test_guess_parser1(self):

        with open("test_apache1.html", "r") as f:
            text = f.read()

        c = Crawler("http://some.website/", False)

        self.assertEqual(c.guess_parser(text, {}), ApacheParser)

    def test_guess_parser2(self):
        with open("test_nginx1.html", "r") as f:
            text = f.read()

        c = Crawler("http://some.website", False)

        self.assertEqual(c.guess_parser(text, {}), NginxParser)

    def test_guess_parser3(self):
        with open("test_invalid.html", "r") as f:
            text = f.read()

        c = Crawler("http://some.website", False)

        self.assertEqual(c.guess_parser(text, {}), None)
@@ -18,7 +18,7 @@ class NginxParserTest(TestCase):
    def setUp(self):
        self.parser = NginxParser()

        root_page_file = open("test_nginx_root.html", "r")
        root_page_file = open("test_nginx1.html", "r")
        self.root_page = root_page_file.read()
        root_page_file.close()

@@ -57,7 +57,7 @@ class ApacheParserTest(TestCase):
    def setUp(self):
        self.parser = ApacheParser()

        root_page_file = open("test_apache_root.html", "r")
        root_page_file = open("test_apache1.html", "r")
        self.root_page = root_page_file.read()
        root_page_file.close()

@@ -76,7 +76,7 @@ class ApacheParserTest(TestCase):
        result = self.parser.get_links(self.root_page, "https://keisari.net/videos/")

        self.assertEqual(result["happyday.mp4"]["size"], 772000)
        self.assertEqual(result["alex_räjähtää.mp4"]["size"], 715000)
        self.assertEqual(result["alex_r%c3%a4j%c3%a4ht%c3%a4%c3%a4.mp4"]["size"], 715000)

    def test_link_type(self):
        result = self.parser.get_links(self.root_page, "https://keisari.net/videos/")
@@ -109,16 +109,67 @@ class ApacheParserTest2(TestCase):
    def test_link_size(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ020˜b.u‚æ‚Ý‚ª‚¦‚éƒTƒCƒ„l“`àIŒå‹ó‚̃‹[ƒcv.wmv"]["size"], 179721000)
        self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ225˜b.u‹‚¢‚ºƒ`ƒrƒbƒRII‚P‚W†‘å‹êíIHv.wmv"]["size"], 347507000)
        self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ011˜b.u‰F’ˆˆê‚Ì‹íŽmƒTƒCƒ„l‚ß‚´‚ß‚éIv.wmv"]["size"], 232185000)
        self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ019˜b.ud—Í‚Æ‚Ìí‚¢Iƒoƒuƒ‹ƒXŒN‚ð‚‚©‚Ü‚¦‚ëv.wmv"]["size"], 185385000)

    def test_link_type(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ225˜b.u‹‚¢‚ºƒ`ƒrƒbƒRII‚P‚W†‘å‹êíIHv.wmv"]["type"], "f")
        self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z jpg/"]["type"], "d")
        self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ011˜b.u‰F’ˆˆê‚Ì‹íŽmƒTƒCƒ„l‚ß‚´‚ß‚éIv.wmv"]["type"], "f")
        self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z%20jpg/"]["type"], "d")

    def test_link_extension(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ225˜b.u‹‚¢‚ºƒ`ƒrƒbƒRII‚P‚W†‘å‹êíIHv.wmv"]["ext"], "wmv")
        self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ011˜b.u‰F’ˆˆê‚Ì‹íŽmƒTƒCƒ„l‚ß‚´‚ß‚éIv.wmv"]["ext"], "wmv")


class ApacheParserTest3(TestCase):

    def setUp(self):
        self.parser = ApacheParser()

        root_page_file = open("test_apache3.html", "r")
        self.root_page = root_page_file.read()
        self.base_url = "http://files.duspectacle.com/mp3/Jardinets/"
        root_page_file.close()

    def test_link_count(self):

        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(len(result), 21)

    def test_link_size(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(result["15%20Woodkid%20-%20Iron%20(Remix%20By%20Gucci%20Vump).mp3"]["size"], 9300000)
        self.assertEqual(result["16%20Yellow%20Ostrich%20-%20WHALE.mp3"]["size"], 7100000)

    def test_link_type(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(result["15%20Woodkid%20-%20Iron%20(Remix%20By%20Gucci%20Vump).mp3"]["type"], "f")
        self.assertEqual(result["01%20Jean%20Rochefort%20-%20Winnie%20et%20ses%20amis%20(introduction)/"]["type"], "d")

    def test_link_extension(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(result["15%20Woodkid%20-%20Iron%20(Remix%20By%20Gucci%20Vump).mp3"]["ext"], "mp3")


class ApacheParserTest4(TestCase):

    def setUp(self):
        self.parser = ApacheParser()

        root_page_file = open("test_apache4.html", "r")
        self.root_page = root_page_file.read()
        self.base_url = "http://jenserserver.no-ip.biz/movieserver/serien/bigbangtheorie/S3/"
        root_page_file.close()

    def test_link_size(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(result["The.Big.Bang.Theory.S03E06.Football.fuer.Nerds.German.WS.DVDRip.XviD-DELiCiOUS.avi"]["size"], 175000000)
        self.assertEqual(result["The.Big.Bang.Theory.S03E03.Sex.oder.Pralinen.German.WS.DVDRip.XviD-DELiCiOUS.avi"]["size"], 0)
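
The reworked assertions reflect that get_links() now keys its results by the link's raw href (still percent-encoded) rather than by its display text, which is why the expected key changed from "alex_räjähtää.mp4" to "alex_r%c3%a4j%c3%a4ht%c3%a4%c3%a4.mp4". A small sketch of the difference using values from the fixture above:

    from urllib.parse import urljoin, unquote

    base = "https://keisari.net/videos/"
    href = "alex_r%c3%a4j%c3%a4ht%c3%a4%c3%a4.mp4"   # what the <a> tag's href attribute holds
    print(urljoin(base, href))   # https://keisari.net/videos/alex_r%c3%a4j%c3%a4ht%c3%a4%c3%a4.mp4
    print(unquote(href))         # alex_räjähtää.mp4 -- the human-readable link text
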
@@ -1,5 +1,5 @@
from unittest import TestCase
from reddit_bot import RedditBot
from reddit_bot import RedditBot, TaskQueue, CrawTask
import os


@@ -33,3 +33,62 @@ class RedditBotTest(TestCase):
        self.assertTrue(bot.has_crawled("000000"))


class TaskQueueTest(TestCase):

    def tearDown(self):
        if os.path.isfile("task_queue_test.txt"):
            os.remove("task_queue_test.txt")

    def test_push_pop_test(self):

        if os.path.isfile("task_queue_test.txt"):
            os.remove("task_queue_test.txt")

        tq = TaskQueue("task_queue_test.txt")
        tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))

        task1 = tq.pop()

        self.assertEqual(tq.pop(), None)
        self.assertEqual(task1.url, "http://awebsite.com/")
        self.assertEqual(task1.post_id, "postid")

    def test_persistence(self):

        if os.path.isfile("task_queue_test.txt"):
            os.remove("task_queue_test.txt")

        tq = TaskQueue("task_queue_test.txt")
        tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))

        tq2 = TaskQueue("task_queue_test.txt")
        task = tq2.pop()

        self.assertEqual(task.url, "http://awebsite.com/")
        self.assertEqual(task.post_id, "postid")

    def test_multiple_tasks(self):
        if os.path.isfile("task_queue_test.txt"):
            os.remove("task_queue_test.txt")

        tq = TaskQueue("task_queue_test.txt")

        tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))
        tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))
        tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))

        self.assertIsNotNone(tq.pop())
        self.assertIsNotNone(tq.pop())
        self.assertIsNotNone(tq.pop())
        self.assertIsNone(tq.pop())

    def test_is_queued(self):
        if os.path.isfile("task_queue_test.txt"):
            os.remove("task_queue_test.txt")

        tq = TaskQueue("task_queue_test.txt")

        tq.push(CrawTask("http://awebsite.com/", "postid", "a title"))

        self.assertTrue(tq.is_queued("postid"))
        self.assertFalse(tq.is_queued("123456"))

spec/test_apache3.html | 32 (new file)

@@ -0,0 +1,32 @@

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head>
<title>Index of /mp3/Jardinets</title>
</head>
<body>
<h1>Index of /mp3/Jardinets</h1>
<pre><img src="/__ovh_icons/blank.gif" alt="Icon "> <a href="?C=N;O=D">Name</a> <a href="?C=M;O=A">Last modified</a> <a href="?C=S;O=A">Size</a> <a href="?C=D;O=A">Description</a><hr><img src="/__ovh_icons/back.gif" alt="[PARENTDIR]"> <a href="/mp3/">Parent Directory</a> -
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="01%20Jean%20Rochefort%20-%20Winnie%20et%20ses%20amis%20(introduction)/">01 Jean Rochefort - ..></a> 2017-12-04 16:33 -
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="02%20Krisma%20-%20Amore.mp3">02 Krisma - Amore.mp3</a> 2017-12-04 16:32 11M
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="03%20Bernard%20Estardy%20-%20Cha%20Tatch%20Ka.mp3">03 Bernard Estardy -..></a> 2017-12-04 16:32 3.5M
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="04%20Jamie%20Woon%20-%20Street.mp3">04 Jamie Woon - Stre..></a> 2017-12-04 16:32 5.0M
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="05%20DyE%20-%20Fantasy.mp3">05 DyE - Fantasy.mp3</a> 2017-12-04 16:33 6.9M
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="06%20Games%20-%20Planet%20Party.mp3">06 Games - Planet Pa..></a> 2017-12-04 16:33 5.6M
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="07%20Yeasayer%20-%20Swallowing%20the%20Decibels.mp3">07 Yeasayer - Swallo..></a> 2017-12-04 16:33 11M
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="08%20Pacific!%20-%20Venus%20Rising.mp3">08 Pacific! - Venus ..></a> 2017-12-04 16:32 5.7M
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="09%20Jacky%20Chalard%20-%20Super%20Man%20-%20Super%20Cool%20(LP%20Version).mp3">09 Jacky Chalard - S..></a> 2017-12-04 16:33 11M
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="10%20Piry%20-%20Heroi%20Moderno.mp3">10 Piry - Heroi Mode..></a> 2017-12-04 16:32 4.1M
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="11%20Bahamas%20-%20Bahamas.mp3">11 Bahamas - Bahamas..></a> 2017-12-04 16:32 7.9M
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="12%20Aeroplane%20-%20Fish%20In%20The%20Sky.mp3">12 Aeroplane - Fish ..></a> 2017-12-04 16:32 7.6M
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="13%20Discodeine%20-%20Synchronize%20(feat%20Jarvis%20Cocker%20-%20radio%20edit).mp3">13 Discodeine - Sync..></a> 2017-12-04 16:33 6.8M
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="14%20Lykke%20Li%20-%20I%20Follow%20Rivers%20(the%20Magician%20Remix).mp3">14 Lykke Li - I Foll..></a> 2017-12-04 16:33 7.3M
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="15%20Woodkid%20-%20Iron%20(Remix%20By%20Gucci%20Vump).mp3">15 Woodkid - Iron (R..></a> 2017-12-04 16:33 9.3M
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="16%20Yellow%20Ostrich%20-%20WHALE.mp3">16 Yellow Ostrich - ..></a> 2017-12-04 16:33 7.1M
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="17%20Connan%20Mockasin%20-%20Unicorn%20in%20Uniform.mp3">17 Connan Mockasin -..></a> 2017-12-04 16:32 6.3M
<img src="/__ovh_icons/sound2.gif" alt="[SND]"> <a href="18%20Bruce%20Haack%20-%20Maybe%20This%20Song.mp3">18 Bruce Haack - May..></a> 2017-12-04 16:33 5.4M
<img src="/__ovh_icons/image2.gif" alt="[IMG]"> <a href="cover-small.jpg">cover-small.jpg</a> 2017-12-04 16:32 97K
<img src="/__ovh_icons/image2.gif" alt="[IMG]"> <a href="cover.jpg">cover.jpg</a> 2017-12-04 16:33 466K
<img src="/__ovh_icons/text.gif" alt="[TXT]"> <a href="playlist.txt">playlist.txt</a> 2017-12-04 16:33 955
<hr></pre>
</body></html>

spec/test_apache4.html | 38 (new file)

@@ -0,0 +1,38 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head>
<title>Index of /movieserver/serien/bigbangtheorie/S3</title>
</head>
<body>
<h1>Index of /movieserver/serien/bigbangtheorie/S3</h1>
<table>
<tr><th valign="top"><img src="/icons/blank.gif" alt="[ICO]"></th><th><a href="?C=N;O=D">Name</a></th><th><a href="?C=M;O=A">Last modified</a></th><th><a href="?C=S;O=A">Size</a></th><th><a href="?C=D;O=A">Description</a></th></tr>
<tr><th colspan="5"><hr></th></tr>
<tr><td valign="top"><img src="/icons/back.gif" alt="[PARENTDIR]"></td><td><a href="/movieserver/serien/bigbangtheorie/">Parent Directory</a></td><td> </td><td align="right"> - </td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E01.Der.Nordpol.Plan.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E01.Der.Nordpol.Plan.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2017-01-17 18:52 </td><td align="right">6.8M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E02.Die.Grillenwette.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E02.Die.Grillenwette.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:14 </td><td align="right">175M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E03.Sex.oder.Pralinen.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E03.Sex.oder.Pralinen.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2017-01-17 19:38 </td><td align="right"> 0 </td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E04.Fuer.ihn.oder.mit.ihm.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E04.Fuer.ihn.oder.mit.ihm.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:16 </td><td align="right">175M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E05.Der.Mann.der.seine.Omi.liebte.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E05.Der.Mann.der.seine.Omi.liebte.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:17 </td><td align="right">174M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E06.Football.fuer.Nerds.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E06.Football.fuer.Nerds.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:17 </td><td align="right">175M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E07.Der.Gitarrist.auf.der.Couch.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E07.Der.Gitarrist.auf.der.Couch.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:18 </td><td align="right">175M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E08.Das.Suppentattoo.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E08.Das.Suppentattoo.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:18 </td><td align="right">174M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E09.Die.Racheformel.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E09.Die.Racheformel.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:20 </td><td align="right">174M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E10.Das.Gorilla.Projekt.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E10.Das.Gorilla.Projekt.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:20 </td><td align="right">174M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E11.Maedels.an.der.Bar.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E11.Maedels.an.der.Bar.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:21 </td><td align="right">175M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E12.Howards.Phasen.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E12.Howards.Phasen.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:21 </td><td align="right">175M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E13.Terror.in.der.Oestadt.der.Rosen.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E13.Terror.in.der.Oestadt.der.Rosen.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:22 </td><td align="right">175M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E14.Fast.wie.Einstein.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E14.Fast.wie.Einstein.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:23 </td><td align="right">175M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E15.Freiflug.nach.Genf.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E15.Freiflug.nach.Genf.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:24 </td><td align="right">175M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E16.Sheldon.pro.se.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E16.Sheldon.pro.se.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:24 </td><td align="right">175M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E17.Die.Herren.des.Rings.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E17.Die.Herren.des.Rings.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:25 </td><td align="right">175M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E18.Die.dunkle.Seite.des.Mondes.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E18.Die.dunkle.Seite.des.Mondes.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:25 </td><td align="right">175M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E19.Das.L.Wort.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E19.Das.L.Wort.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:27 </td><td align="right">175M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E20.Spaghetti.mit.Wuerstchen.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E20.Spaghetti.mit.Wuerstchen.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:27 </td><td align="right">175M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E21.Vierer.ohne.Sheldon.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E21.Vierer.ohne.Sheldon.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:28 </td><td align="right">175M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E22.Die.Wahrheit.ueber.den.Fahrstuhl.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E22.Die.Wahrheit.ueber.den.Fahrstuhl.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:28 </td><td align="right">175M</td><td> </td></tr>
<tr><td valign="top"><img src="/icons/movie.gif" alt="[VID]"></td><td><a href="The.Big.Bang.Theory.S03E23.Nie.mehr.dumme.Typen.German.WS.DVDRip.XviD-DELiCiOUS.avi">The.Big.Bang.Theory.S03E23.Nie.mehr.dumme.Typen.German.WS.DVDRip.XviD-DELiCiOUS.avi</a></td><td align="right">2014-05-16 17:29 </td><td align="right">174M</td><td> </td></tr>
<tr><th colspan="5"><hr></th></tr>
</table>
<address>Apache/2.4.10 (Debian) Server at jenserserver.no-ip.biz Port 80</address>
</body></html>

@@ -23,7 +23,7 @@ function drawCharts(rData) {

    for(var ext in rData["ext_sizes"]) {
        //Ignore file sizes below 0.5%
        if (rData["ext_sizes"][ext] < 0.005 * rData["total_size"]) {
        if (!isRelevant(rData, ext)) {

            otherSize += rData["ext_sizes"][ext];
            otherCount += rData["ext_count"][ext];
@@ -40,6 +40,7 @@ function drawCharts(rData) {
        colors.push(getRandomColor());
        labels.push("other x" + otherCount + " (" + humanFileSize(otherSize) + ")");
        dataSetSize.push(otherSize);
        dataSetCount.push(otherCount);
    }

    var ctx = document.getElementById('typesChart').getContext('2d');
@@ -64,6 +65,23 @@ function drawCharts(rData) {
    });
}


function isRelevant(rData, ext) {

    console.log("Checking + " + ext);
    console.log("total + " + rData["total_size"]);
    console.log("size + " + rData["ext_count"][ext]);
    console.log("min + " + 0.03 * rData["total_count"]);

    if(rData["total_size"] === 0) {
        return rData["ext_count"][ext] > 0.03 * rData["total_count"]
    } else {
        return rData["ext_sizes"][ext] > 0.005 * rData["total_size"]
    }


}

/**
 * https://stackoverflow.com/questions/1484506
 */
@@ -80,6 +98,11 @@ function getRandomColor() {
 * https://stackoverflow.com/questions/10420352
 */
function humanFileSize(bytes) {

    if(bytes === 0) {
        return "? B"
    }

    var thresh = 1000;
    if(Math.abs(bytes) < thresh) {
        return bytes + ' B';

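The chart code now delegates the "does this extension get its own slice?" decision to isRelevant(), which falls back to file counts (via the new total_count report field) when total_size is 0, i.e. when no sizes could be parsed. A Python restatement of the rule, with made-up numbers:

    def is_relevant(ext_size, ext_count, total_size, total_count):
        if total_size == 0:
            # no parseable sizes: keep extensions that make up more than 3% of all files
            return ext_count > 0.03 * total_count
        # otherwise ignore extensions below 0.5% of the total size
        return ext_size > 0.005 * total_size

    print(is_relevant(ext_size=0, ext_count=40, total_size=0, total_count=1000))        # True
    print(is_relevant(ext_size=10**6, ext_count=5, total_size=10**9, total_count=1000)) # False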