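"""Crawler for open directory listings.

Probes a directory index over HTTP, guesses which parser to use (nginx or
Apache directory listings), recursively follows sub-directories under the
base URL, and writes the collected file list out as JSON / plain-text reports.
"""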

import os

import requests

from parsing import NginxParser, ApacheParser
from crawl_report import ReportSaver, ReportBuilder


class Crawler:

    def __init__(self, url, test_url):
        self.files = []
        self.parsed_urls = []
        self.base_url = os.path.join(url, '')  # Normalize: ensure a trailing slash

        if url.startswith("http"):
            if test_url:
                # Probe the base URL once to pick a suitable parser
                try:
                    r = requests.get(self.base_url, timeout=10)  # todo change to 30

                    if r.status_code == 200:
                        parser_class = self.guess_parser(r.text, r.headers)

                        if parser_class is None:
                            print("Couldn't find a suitable parser")
                            self.parser = None
                        else:
                            self.parser = parser_class()
                            print("Using " + self.parser.__class__.__name__ + " as parser")
                    else:
                        print("Couldn't connect (" + str(r.status_code) + ")")
                        self.parser = None

                except (requests.exceptions.ReadTimeout,
                        requests.exceptions.ConnectTimeout,
                        requests.exceptions.ConnectionError):
                    print("Timed out / Connection refused")
                    self.parser = None
            else:
                print("Using ApacheParser by default because test_url was set to False")
                self.parser = ApacheParser()  # Default parser
        else:
            print("Invalid URL scheme")
            self.parser = None

    @staticmethod
    def guess_parser(text, headers):

        server = headers["Server"] if "Server" in headers else ""  # Currently unused

        # Try nginx
        parser = NginxParser()
        if parser.page_is_valid(text):
            return NginxParser

        # Try apache
        parser = ApacheParser()
        if parser.page_is_valid(text):
            return ApacheParser

        return None
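
    # Note: the "Server" header captured in guess_parser() is currently unused.
    # A possible refinement (an assumption, not something the original code
    # does) would be to consult it before probing the page contents, e.g.:
    #
    #     if "nginx" in server.lower():
    #         return NginxParser
    #     if "apache" in server.lower():
    #         return ApacheParser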

    def crawl(self, address=None):

        if self.parser is None:
            return

        if address is None:
            address = self.base_url

        # Prevent unwanted recursion
        if address in self.parsed_urls:
            return
        self.parsed_urls.append(address)

        if not address.startswith(self.base_url):
            print("Skipping " + address + " because it does not match " + self.base_url)
            return

        retries = 20
        while retries >= 0:
            try:
                response = requests.get(address, timeout=10)
                break
            except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError):
                print("Timeout, " + str(retries) + " retries left")
                retries -= 1

        if retries < 0:
            # Every attempt failed; response was never assigned
            return

        links = self.parser.get_links(response.text, address)

        for k in links:
            if links[k]["type"] == "d":
                # Recurse into sub-directories
                print(links[k]["link"])
                self.crawl(links[k]["link"])
            else:
                self.files.append(dict(link=links[k]["link"], size=links[k]["size"], ext=links[k]["ext"]))

    def store_report(self, report_id, title):
        report_saver = ReportSaver(self.files, title, ReportBuilder(self.files, self.base_url))

        with open("static/reports/" + report_id + "_chart.json", "w") as f:
            f.write(report_saver.to_json_chart())
        with open("static/reports/" + report_id + ".json", "w") as f:
            f.write(report_saver.to_json())
        with open("static/reports/" + report_id + ".txt", "w") as f:
            f.write(report_saver.to_link_list())


if __name__ == "__main__":
    c = Crawler("http://www.downloads.imune.net/medicalbooks/", True)
    c.crawl()

    r = ReportBuilder(c.files, "http://www.downloads.imune.net/medicalbooks/")
    print(r.get_total_size_formatted())

    # for f in c.files:
    #     if f["size"] > 1000000:
    #         print(f)

    c.store_report("000011", "test")
|