Mirror of https://github.com/simon987/od-database.git (synced 2025-04-19 10:26:44 +00:00)

Merge remote-tracking branch 'origin/master'

This commit is contained in: commit 85437b1ef9
@@ -5,6 +5,7 @@ Suggestions/concerns/PRs are welcome
 ## Installation
 Assuming you have Python 3 and git installed:
 ```bash
+sudo apt install libssl-dev libcurl4-openssl-dev
 git clone https://github.com/simon987/od-database
 cd od-database
 sudo pip3 install -r requirements.txt
@@ -14,8 +15,11 @@ Create `/config.py` and fill out the parameters. Sample config:
 # Leave default values for no CAPTCHAs
 CAPTCHA_LOGIN = False
 CAPTCHA_SUBMIT = False
+CAPTCHA_SEARCH = False
 CAPTCHA_SITE_KEY = ""
 CAPTCHA_SECRET_KEY = ""
+CAPTCHA_S_SITE_KEY = ""
+CAPTCHA_S_SECRET_KEY = ""
 
 # Flask secret key for sessions
 FLASK_SECRET = ""
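Note: for reference, a filled-in `/config.py` covering only the keys visible in the hunk above might look like the following sketch. Values are placeholders, and the real sample config in the README contains additional parameters not shown here.

```python
# Hypothetical example values -- only the keys shown in the README hunk above.
CAPTCHA_LOGIN = False
CAPTCHA_SUBMIT = False
CAPTCHA_SEARCH = True                          # enable the new search CAPTCHA
CAPTCHA_SITE_KEY = "<recaptcha site key>"
CAPTCHA_SECRET_KEY = "<recaptcha secret key>"
CAPTCHA_S_SITE_KEY = "<search recaptcha site key>"
CAPTCHA_S_SECRET_KEY = "<search recaptcha secret key>"

# Flask secret key for sessions
FLASK_SECRET = "<random string>"
```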
app.py (64 changed lines)
@@ -5,7 +5,6 @@ from urllib.parse import urlparse
 import os
 import time
 import datetime
-import itertools
 from database import Database, Website, InvalidQueryException
 from flask_recaptcha import ReCaptcha
 import od_util
@@ -13,6 +12,7 @@ import config
 from flask_caching import Cache
 from tasks import TaskManager, Task, TaskResult
 from search.search import ElasticSearchEngine
+from callbacks import PostCrawlCallbackFactory
 
 app = Flask(__name__)
 if config.CAPTCHA_SUBMIT or config.CAPTCHA_LOGIN:
@@ -21,6 +21,12 @@ if config.CAPTCHA_SUBMIT or config.CAPTCHA_LOGIN:
                          secret_key=config.CAPTCHA_SECRET_KEY)
 else:
     recaptcha = None
+if config.CAPTCHA_SEARCH:
+    recaptcha_search = ReCaptcha(app=app,
+                                 site_key=config.CAPTCHA_S_SITE_KEY,
+                                 secret_key=config.CAPTCHA_S_SECRET_KEY)
+else:
+    recaptcha_search = None
 app.secret_key = config.FLASK_SECRET
 db = Database("db.sqlite3")
 cache = Cache(app, config={'CACHE_TYPE': 'simple'})
@@ -30,6 +36,7 @@ app.jinja_env.globals.update(get_mime=od_util.get_category)
 
 taskManager = TaskManager()
 searchEngine = ElasticSearchEngine("od-database")
+searchEngine.start_stats_scheduler()
 
 
 @app.template_filter("date_format")
@@ -50,7 +57,7 @@ def from_timestamp(value):
 @app.route("/dl")
 def downloads():
     try:
-        export_file_stats = os.stat("static/out.csv.xz")
+        export_file_stats = os.stat("static/out.csv.lzma")
     except FileNotFoundError:
         print("No export file")
         export_file_stats = None
@@ -236,6 +243,7 @@ def admin_rescan_website(website_id):
 
 @app.route("/search")
 def search():
 
     q = request.args.get("q") if "q" in request.args else ""
     sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score"
 
@@ -279,6 +287,8 @@ def search():
 
     if len(q) >= 3:
 
+        response = request.args.get("g-recaptcha-response", "")
+        if not config.CAPTCHA_SEARCH or recaptcha_search.verify(response):
             db.log_search(request.remote_addr,
                           request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None,
                           q, extensions, page)
@@ -294,6 +304,9 @@ def search():
             flash("Query failed, this could mean that the search server is overloaded or is not reachable. "
                   "Please try again later", "danger")
             hits = None
+        else:
+            flash("<strong>Error:</strong> Invalid captcha please try again", "danger")
+            hits = None
 
     else:
         hits = None
@@ -308,7 +321,8 @@ def search():
                            size_min=size_min, size_max=size_max,
                            match_all=match_all,
                            field_trigram=field_trigram, field_path=field_path, field_name=field_name,
-                           date_min=date_min, date_max=date_max)
+                           date_min=date_min, date_max=date_max,
+                           show_captcha=config.CAPTCHA_SEARCH, recaptcha=recaptcha_search)
 
 
 @app.route("/contribute")
@@ -324,7 +338,8 @@ def home():
         stats["website_count"] = len(db.get_all_websites())
     except:
         stats = {}
-    return render_template("home.html", stats=stats)
+    return render_template("home.html", stats=stats,
+                           show_captcha=config.CAPTCHA_SEARCH, recaptcha=recaptcha_search)
 
 
 @app.route("/submit")
@@ -565,7 +580,11 @@ def api_complete_task():
         if filename and os.path.exists(filename):
             os.remove(filename)
 
-        # TODO: handle callback here
+        # Handle task callback
+        callback = PostCrawlCallbackFactory.get_callback(task)
+        if callback:
+            callback.run(task_result, searchEngine)
+
         return "Successfully logged task result and indexed files"
 
     else:
@@ -659,7 +678,7 @@ def api_task_enqueue():
                 request.json["url"],
                 request.json["priority"],
                 request.json["callback_type"],
-                request.json["callback_args"]
+                json.dumps(request.json["callback_args"])
                 )
     taskManager.queue_task(task)
     return ""
@@ -705,5 +724,38 @@ def api_random_website():
         return abort(403)
 
 
+@app.route("/api/search", methods=["POST"])
+def api_search():
+
+    try:
+        token = request.json["token"]
+    except KeyError:
+        return abort(400)
+
+    name = db.check_api_token(token)
+
+    if name:
+
+        try:
+            hits = searchEngine.search(
+                request.json["query"],
+                request.json["page"], request.json["per_page"],
+                request.json["sort_order"],
+                request.json["extensions"],
+                request.json["size_min"], request.json["size_max"],
+                request.json["match_all"],
+                request.json["fields"],
+                request.json["date_min"], request.json["date_max"]
+            )
+
+            hits = db.join_website_on_search_result(hits)
+            return json.dumps(hits)
+
+        except InvalidQueryException as e:
+            return str(e)
+    else:
+        return abort(403)
+
+
 if __name__ == '__main__':
     app.run("0.0.0.0", port=12345, threaded=True)
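Note: a minimal sketch of how a client could call the new `/api/search` endpoint, assuming the app runs on localhost:12345. The field names are taken from the handler above; the token, query values, and field list are illustrative only.

```python
import requests

# Hypothetical client for the /api/search endpoint added above.
payload = {
    "token": "some-api-token",          # must be accepted by db.check_api_token()
    "query": "ubuntu iso",
    "page": 0, "per_page": 50,
    "sort_order": "score",
    "extensions": ["iso"],
    "size_min": 0, "size_max": 10 ** 12,
    "match_all": True,
    "fields": ["name", "path"],         # illustrative; actual field list depends on the search engine
    "date_min": 0, "date_max": 2 ** 31,
}
r = requests.post("http://localhost:12345/api/search", json=payload)
print(r.json())  # hits joined with website info, as returned by json.dumps(hits)
```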
callbacks.py (34 changed lines)
@@ -1,6 +1,8 @@
-from tasks import Task
-from crawl_server.reddit_bot import RedditBot
+from tasks import Task, TaskResult
+from reddit_bot import RedditBot
 import praw
+from search.search import SearchEngine
+import json
 
 
 class PostCrawlCallback:
@@ -8,7 +10,10 @@ class PostCrawlCallback:
     def __init__(self, task: Task):
         self.task = task
 
-    def run(self):
+        if self.task.callback_args:
+            self.task.callback_args = json.loads(self.task.callback_args)
+
+    def run(self, task_result: TaskResult, search: SearchEngine):
         raise NotImplementedError
 
 
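Note: a small sketch of the `callback_args` round trip implied by these changes. The web app serializes the dict with `json.dumps()` when the task is enqueued (see the `api_task_enqueue` hunk above), and `PostCrawlCallback.__init__` restores it with `json.loads()` so callbacks such as `RedditCommentCallback` can read keys like `comment_id`. Values are illustrative only.

```python
import json

# On the web-app side: callback_args arrives as a dict and is stored as a JSON string.
callback_args = {"comment_id": "e4ab12"}   # hypothetical Reddit comment id
stored = json.dumps(callback_args)          # what ends up in the task queue

# On the callback side: PostCrawlCallback.__init__ turns it back into a dict.
restored = json.loads(stored)
assert restored["comment_id"] == "e4ab12"
```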
@@ -36,26 +41,33 @@ class RedditCallback(PostCrawlCallback):
                              user_agent='github.com/simon987/od-database (by /u/Hexahedr_n)')
         self.reddit_bot = RedditBot("crawled.txt", reddit)
 
-    def run(self):
+    def run(self, task_result: TaskResult, search: SearchEngine):
         raise NotImplementedError
 
 
 class RedditPostCallback(RedditCallback):
 
-    def run(self):
+    def run(self, task_result: TaskResult, search: SearchEngine):
         print("Reddit post callback for task " + str(self.task))
-        pass
 
 
 class RedditCommentCallback(RedditCallback):
 
-    def run(self):
-        print("Reddit comment callback for task " + str(self.task))
-        pass
+    def run(self, task_result: TaskResult, search: SearchEngine):
+        comment_id = self.task.callback_args["comment_id"]
+        print("Editing comment comment " + comment_id)
+
+        search.refresh()  # Make sure the newly indexed documents are available before commenting
+        stats = search.get_stats(self.task.website_id)
+        message = self.reddit_bot.get_comment(stats, self.task.website_id,
+                                              message="There you go! This website was crawled in `" +
+                                                      str(int(task_result.end_time - task_result.start_time)) + "s`")
+        print(message)
+        self.reddit_bot.edit(self.reddit_bot.reddit.comment(comment_id), message)
 
 
 class DiscordCallback(PostCrawlCallback):
 
-    def run(self):
+    def run(self, task_result: TaskResult, search: SearchEngine):
         print("Discord callback for task " + str(self.task))
-        pass
@@ -1,14 +1,16 @@
+import pycurl
+from io import BytesIO
+
 from crawl_server import logger
 from urllib.parse import unquote, urljoin
 import os
 from html.parser import HTMLParser
 from itertools import repeat
 from crawl_server.crawler import RemoteDirectory, File
-import requests
-from requests.exceptions import RequestException
 from multiprocessing.pool import ThreadPool
 import config
 from dateutil.parser import parse as parse_date
+from pycurl import Curl
 import hashlib
 
 import urllib3
@@ -94,17 +96,36 @@ class HttpDirectory(RemoteDirectory):
 
     def __init__(self, url):
         super().__init__(url)
-        self.session = requests.Session()
-        self.session.headers = HttpDirectory.HEADERS
-        self.session.verify = False
-        self.session.max_redirects = 1
+        self.curl = None
+        self.curl_head = None
+        self.init_curl()
+
+    def init_curl(self):
+
+        self.curl = Curl()
+        self.curl.setopt(self.curl.SSL_VERIFYPEER, 0)
+        self.curl.setopt(self.curl.SSL_VERIFYHOST, 0)
+        self.curl.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT)
+
+        self.curl_head = self._curl_handle()
+
+    @staticmethod
+    def _curl_handle():
+
+        curl_head = Curl()
+        curl_head.setopt(pycurl.SSL_VERIFYPEER, 0)
+        curl_head.setopt(pycurl.SSL_VERIFYHOST, 0)
+        curl_head.setopt(pycurl.NOBODY, 1)
+        curl_head.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT)
+
+        return curl_head
 
     def list_dir(self, path):
 
         current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1]
         path_identifier = hashlib.md5(current_dir_name.encode())
         path_url = urljoin(self.base_url, path, "")
-        body = self._stream_body(path_url)
+        body = self._fetch_body(path_url)
         anchors = self._parse_links(body)
 
         urls_to_request = []
@@ -139,7 +160,7 @@ class HttpDirectory(RemoteDirectory):
         if len(urls_to_request) > 150:
             # Many urls, use multi-threaded solution
             pool = ThreadPool(processes=10)
-            files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request))
+            files = pool.starmap(self._request_file, zip(urls_to_request, repeat(self.base_url)))
             pool.close()
             for file in files:
                 if file:
@@ -147,67 +168,65 @@ class HttpDirectory(RemoteDirectory):
         else:
             # Too few urls to create thread pool
             for url in urls_to_request:
-                file = self._request_file(url)
+                file = self._request_file(url, self.base_url)
                 if file:
                     yield file
 
-    def _request_file(self, url):
+    @staticmethod
+    def _request_file(url, base_url):
 
         retries = HttpDirectory.MAX_RETRIES
         while retries > 0:
             try:
-                r = self.session.head(url, allow_redirects=False, timeout=HttpDirectory.TIMEOUT)
+                curl = HttpDirectory._curl_handle()
+                raw_headers = BytesIO()
+                curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore"))
+                curl.setopt(pycurl.HEADERFUNCTION, raw_headers.write)
+                curl.perform()
 
-                stripped_url = url[len(self.base_url) - 1:]
+                stripped_url = url[len(base_url) - 1:]
+                headers = HttpDirectory._parse_dict_header(raw_headers.getvalue().decode("utf-8", errors="ignore"))
+                raw_headers.close()
 
                 path, name = os.path.split(stripped_url)
-                date = r.headers.get("Last-Modified", "1970-01-01")
+                date = headers.get("Last-Modified", "1970-01-01")
+                curl.close()
                 return File(
                     path=unquote(path).strip("/"),
                     name=unquote(name),
-                    size=int(r.headers.get("Content-Length", -1)),
+                    size=int(headers.get("Content-Length", -1)),
                     mtime=int(parse_date(date).timestamp()),
                     is_dir=False
                 )
-            except RequestException:
-                self.session.close()
+            except pycurl.error:
                 retries -= 1
 
         logger.debug("TimeoutError - _request_file")
         raise TimeoutError
 
-    def _stream_body(self, url: str):
+    def _fetch_body(self, url: str):
         retries = HttpDirectory.MAX_RETRIES
         while retries > 0:
             try:
-                r = self.session.get(url, stream=True, timeout=HttpDirectory.TIMEOUT)
-                for chunk in r.iter_content(chunk_size=8192):
-                    try:
-                        yield chunk.decode(r.encoding if r.encoding else "utf-8", errors="ignore")
-                    except LookupError:
-                        # Unsupported encoding
-                        yield chunk.decode("utf-8", errors="ignore")
-                r.close()
-                return
-            except RequestException:
-                self.session.close()
+                content = BytesIO()
+                self.curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore"))
+                self.curl.setopt(pycurl.WRITEDATA, content)
+                self.curl.perform()
+
+                return content.getvalue().decode("utf-8", errors="ignore")
+            except pycurl.error:
+                self.close()
                 retries -= 1
 
-        logger.debug("TimeoutError - _stream_body")
+        logger.debug("TimeoutError - _fetch_body")
         raise TimeoutError
 
     @staticmethod
     def _parse_links(body):
 
         parser = HTMLAnchorParser()
-        anchors = []
-
-        for chunk in body:
-            parser.feed(chunk)
-            for anchor in parser.anchors:
-                anchors.append(anchor)
-
-        return anchors
+        parser.feed(body)
+        return parser.anchors
 
     @staticmethod
     def _isdir(link: Anchor):
@@ -216,14 +235,14 @@ class HttpDirectory(RemoteDirectory):
     @staticmethod
     def _should_ignore(base_url, current_path, link: Anchor):
 
-        if urljoin(base_url, link.href) == urljoin(urljoin(base_url, current_path), "../"):
+        full_url = urljoin(base_url, link.href)
+        if full_url == urljoin(urljoin(base_url, current_path), "../") or full_url == base_url:
             return True
 
         if link.href.endswith(HttpDirectory.BLACK_LIST):
             return True
 
         # Ignore external links
-        full_url = urljoin(base_url, link.href)
         if not full_url.startswith(base_url):
             return True
 
@@ -231,8 +250,18 @@ class HttpDirectory(RemoteDirectory):
         if "?" in link.href:
             return True
 
+    @staticmethod
+    def _parse_dict_header(raw):
+        headers = dict()
+        for line in raw.split("\r\n")[1:]:  # Ignore first 'HTTP/1.0 200 OK' line
+            if line:
+                k, v = line.split(":", maxsplit=1)
+                headers[k.strip()] = v.strip()
+
+        return headers
+
     def close(self):
-        self.session.close()
-        logger.debug("Closing HTTPRemoteDirectory for " + self.base_url)
+        self.curl.close()
+        self.init_curl()
 
 
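Note: a quick illustration of what `_parse_dict_header` does with the raw header block captured through pycurl's `HEADERFUNCTION`. The sample response text is made up; the parsing logic mirrors the method added above.

```python
raw = ("HTTP/1.1 200 OK\r\n"
       "Content-Length: 1024\r\n"
       "Last-Modified: Tue, 31 Jul 2018 00:26:00 GMT\r\n"
       "\r\n")

# Mirrors HttpDirectory._parse_dict_header: skip the status line, split each
# remaining header on the first ':'.
headers = {}
for line in raw.split("\r\n")[1:]:
    if line:
        k, v = line.split(":", maxsplit=1)
        headers[k.strip()] = v.strip()

print(headers["Content-Length"])   # '1024'
print(headers["Last-Modified"])    # 'Tue, 31 Jul 2018 00:26:00 GMT'
```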
@@ -63,7 +63,7 @@ class TaskManager:
             }
 
             r = requests.post(config.SERVER_URL + "/task/upload", data=payload, files=files)
-            logger.info("RESPONSE: " + r.text)
+            logger.info("RESPONSE: " + r.text + "<" + str(r.status_code) + ">")
         except Exception as e:
             logger.error("Exception while sending file_list chunk: " + str(e))
             pass
@@ -75,7 +75,7 @@ class TaskManager:
         }
 
         r = requests.post(config.SERVER_URL + "/task/complete", data=payload)
-        logger.info("RESPONSE: " + r.text)
+        logger.info("RESPONSE: " + r.text + "<" + str(r.status_code) + ">")
 
         if os.path.exists(filename):
             os.remove(filename)
@@ -407,12 +407,12 @@ class Database:
         cursor = conn.cursor()
 
         cursor.execute("SELECT id, website_id, url, priority, callback_type, callback_args FROM "
-                       "Queue WHERE website_id=? AND assigned_crawler=?", (website_id, name))
+                       "Queue WHERE website_id=?", (website_id, ))
 
         task = cursor.fetchone()
 
         if task:
-            cursor.execute("DELETE FROM Queue WHERE website_id=? AND assigned_crawler=?", (website_id, name))
+            cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id, ))
             conn.commit()
             return Task(task[1], task[2], task[3], task[4], task[5])
         else:
@@ -5,7 +5,12 @@ import os
 import validators
 import re
 from ftplib import FTP
-import config
+
+# TODO: find a better way to do this
+try:
+    from . import config
+except (ImportError, SystemError):
+    import config
 
 import urllib3
 urllib3.disable_warnings()
|
@ -41,11 +41,23 @@ class RedditBot:
|
|||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
# Double check has_crawled
|
|
||||||
if not self.has_crawled(reddit_obj.id):
|
if not self.has_crawled(reddit_obj.id):
|
||||||
reddit_obj.reply(comment)
|
reply = reddit_obj.reply(comment)
|
||||||
self.log_crawl(reddit_obj.id)
|
self.log_crawl(reddit_obj.id)
|
||||||
print("Reply to " + reddit_obj.id)
|
print("Reply to " + reddit_obj.id)
|
||||||
|
return reply
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
print("Waiting 5 minutes: " + str(e))
|
||||||
|
time.sleep(300)
|
||||||
|
continue
|
||||||
|
|
||||||
|
def edit(self, reddit_comment, new_message):
|
||||||
|
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
reddit_comment.edit(new_message)
|
||||||
|
print("Edit comment " + reddit_comment.id)
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Waiting 5 minutes: " + str(e))
|
print("Waiting 5 minutes: " + str(e))
|
||||||
@ -54,14 +66,13 @@ class RedditBot:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_comment(stats: dict, website_id, message: str = ""):
|
def get_comment(stats: dict, website_id, message: str = ""):
|
||||||
comment = message + " \n" if len(message) > 0 else ""
|
comment = message + " \n" if message else ""
|
||||||
|
|
||||||
for stat in stats:
|
comment += RedditBot.format_stats(stats)
|
||||||
comment += stat + " \n" if len(stat) > 0 else ""
|
|
||||||
comment += RedditBot.format_stats(stats[stat])
|
|
||||||
|
|
||||||
comment += "[Full Report](https://od-database.simon987.net/website/" + str(website_id) + "/)"
|
comment += "[Full Report](https://od-db.the-eye.eu/website/" + str(website_id) + "/)"
|
||||||
comment += " | [Link list](https://od-database.simon987.net/website/" + str(website_id) + "/links) \n"
|
comment += " | [Link list](https://od-db.the-eye.eu/website/" + str(website_id) + "/links)"
|
||||||
|
comment += " | [Source](https://github.com/simon987) \n"
|
||||||
comment += "*** \n"
|
comment += "*** \n"
|
||||||
comment += RedditBot.bottom_line
|
comment += RedditBot.bottom_line
|
||||||
|
|
||||||
@ -74,7 +85,7 @@ class RedditBot:
|
|||||||
result += "File types | Count | Total Size\n"
|
result += "File types | Count | Total Size\n"
|
||||||
result += ":-- | :-- | :-- \n"
|
result += ":-- | :-- | :-- \n"
|
||||||
counter = 0
|
counter = 0
|
||||||
for mime in stats["mime_stats"]:
|
for mime in stats["ext_stats"]:
|
||||||
result += mime[2]
|
result += mime[2]
|
||||||
result += " | " + str(mime[1])
|
result += " | " + str(mime[1])
|
||||||
result += " | " + humanfriendly.format_size(mime[0]) + " \n"
|
result += " | " + humanfriendly.format_size(mime[0]) + " \n"
|
@@ -17,3 +17,5 @@ ujson
 urllib3
 pyOpenSSL
 pybloom-live
+pycurl
+lxml
@@ -31,6 +31,9 @@ class SearchEngine:
     def get_stats(self, website_id: int, subdir: str = None):
         raise NotImplementedError
 
+    def refresh(self):
+        raise NotImplementedError
+
 
 class ElasticSearchEngine(SearchEngine):
     SORT_ORDERS = {
@@ -47,13 +50,14 @@ class ElasticSearchEngine(SearchEngine):
         self.index_name = index_name
        self.es = elasticsearch.Elasticsearch()
 
-        scheduler = BackgroundScheduler()
-        scheduler.add_job(self._generate_global_stats, "interval", seconds=60 * 15)
-        scheduler.start()
-
         if not self.es.indices.exists(self.index_name):
             self.init()
 
+    def start_stats_scheduler(self):
+        scheduler = BackgroundScheduler()
+        scheduler.add_job(self._generate_global_stats, "interval", seconds=60 * 120)
+        scheduler.start()
+
     def init(self):
         print("Elasticsearch first time setup")
         if self.es.indices.exists(self.index_name):
@@ -122,8 +126,8 @@ class ElasticSearchEngine(SearchEngine):
 
     def import_json(self, in_lines, website_id: int):
 
-        import_every = 1000
-        cooldown_time = 1
+        import_every = 400
+        cooldown_time = 0
 
         docs = []
 
@@ -211,7 +215,7 @@ class ElasticSearchEngine(SearchEngine):
                     }
                 },
                 "size": per_page, "from": min(page * per_page, 10000 - per_page)},
-            index=self.index_name, request_timeout=30)
+            index=self.index_name, request_timeout=35)
 
         return page
 
@@ -229,7 +233,7 @@ class ElasticSearchEngine(SearchEngine):
                 "ext_group": {
                     "terms": {
                         "field": "ext",
-                        "size": 20
+                        "size": 12
                     },
                     "aggs": {
                         "size": {
@@ -246,7 +250,7 @@ class ElasticSearchEngine(SearchEngine):
                 }
             },
             "size": 0
-        }, index=self.index_name, request_timeout=20)
+        }, index=self.index_name, request_timeout=30)
 
         stats = dict()
         stats["total_size"] = result["aggregations"]["total_size"]["value"]
@@ -311,7 +315,7 @@ class ElasticSearchEngine(SearchEngine):
             },
             "size": 0
 
-        }, index=self.index_name, request_timeout=120)
+        }, index=self.index_name, request_timeout=240)
 
         total_stats = self.es.search(body={
             "query": {
@@ -333,7 +337,7 @@ class ElasticSearchEngine(SearchEngine):
             },
             "size": 0
 
-        }, index=self.index_name, request_timeout=120)
+        }, index=self.index_name, request_timeout=241)
 
         size_and_date_histogram = self.es.search(body={
             "query": {
@@ -354,21 +358,21 @@ class ElasticSearchEngine(SearchEngine):
                 "sizes": {
                     "histogram": {
                         "field": "size",
-                        "interval": 50000000,  # 50Mb
-                        "min_doc_count": 100
+                        "interval": 100000000,  # 100Mb
+                        "min_doc_count": 500
                     }
                 },
                 "dates": {
                     "date_histogram": {
                         "field": "mtime",
                         "interval": "1y",
-                        "min_doc_count": 100,
+                        "min_doc_count": 500,
                         "format": "yyyy"
                     }
                 }
             },
             "size": 0
-        }, index=self.index_name, request_timeout=120)
+        }, index=self.index_name, request_timeout=242)
 
         website_scatter = self.es.search(body={
             "query": {
@@ -384,7 +388,7 @@ class ElasticSearchEngine(SearchEngine):
                 "websites": {
                     "terms": {
                         "field": "website_id",
-                        "size": 500  # TODO: Figure out what size is appropriate
+                        "size": 400  # TODO: Figure out what size is appropriate
                    },
                     "aggs": {
                         "size": {
@@ -396,9 +400,9 @@ class ElasticSearchEngine(SearchEngine):
                 }
             },
             "size": 0
-        }, index=self.index_name, request_timeout=120)
+        }, index=self.index_name, request_timeout=243)
 
-        es_stats = self.es.indices.stats(self.index_name, request_timeout=120)
+        es_stats = self.es.indices.stats(self.index_name, request_timeout=244)
 
         stats = dict()
         stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"]
@@ -460,3 +464,6 @@ class ElasticSearchEngine(SearchEngine):
         for website in websites:
             if website not in non_empty_websites:
                 yield website
+
+    def refresh(self):
+        self.es.indices.refresh(self.index_name)
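Note: the new `refresh()` method exists because Elasticsearch is near-real-time: documents written by the bulk import only become visible to searches and aggregations after the index is refreshed. `RedditCommentCallback.run` calls `search.refresh()` before `get_stats()` for this reason. A minimal sketch of the same call, assuming a local Elasticsearch instance and the index name used elsewhere in this commit:

```python
import elasticsearch

es = elasticsearch.Elasticsearch()

# Force newly indexed documents to become searchable before computing stats;
# without this, an aggregation run right after a crawl may miss the fresh files.
es.indices.refresh("od-database")
```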
@@ -23,13 +23,23 @@
     <div class="card">
         <div class="card-header">Search</div>
         <div class="card-body">
-            <form action="/search">
+            <form action="/search" id="sfrm">
 
-                <div class="form-group">
+                <div class="form-row">
+                    <div class="col-md-11">
                         <input class="form-control" name="q" id="q" placeholder="Query">
                     </div>
+                    <div class="col-md-1">
+                        {% if show_captcha %}
+                            <script>function f(token) {document.getElementById("sfrm").submit();}</script>
+                            <script src="https://www.google.com/recaptcha/api.js" async defer></script>
+                            <button class="g-recaptcha btn btn-primary btn-shadow" data-sitekey="6LcCXWkUAAAAAJo2NR9_m09Obn5YmDrI97sGrr2f" data-callback="f">Search</button>
+                        {% else %}
+                            <input class="btn btn-primary btn-shadow" type="submit" value="Search nocap">
+                        {% endif %}
+                    </div>
+                </div>
 
-                <input class="btn btn-primary btn-shadow" type="submit" value="Search">
             </form>
         </div>
     </div>
@@ -32,7 +32,7 @@
     <div class="collapse navbar-collapse" id="navbarSupportedContent">
         <ul class="navbar-nav mr-auto">
             <li class="nav-item">
-                <a class="nav-link {{ "active" if current_page == "website" else "" }}" href="/website">Websites</a>
+                <a class="nav-link {{ "active" if current_page == "website" else "" }}" href="/website/">Websites</a>
             </li>
             <li class="nav-item">
                 <a class="nav-link {{ "active" if current_page == "submit" else "" }}" href="/submit">Submit website</a>
@@ -9,7 +9,7 @@
     <div class="card">
         <div class="card-header">Search</div>
         <div class="card-body">
-            <form action="/search">
+            <form action="/search" id="sfrm">
 
                 <div class="form-row">
 
@@ -22,6 +22,7 @@
                         </div>
                     </div>
                     <input class="form-control" name="q" id="q" placeholder="Query" value="{{ q }}">
+                    <input type="hidden" name="p" id="page" value="{{ p }}">
                 </div>
             </div>
             {# Size #}
@@ -90,9 +91,19 @@
                 </select>
             </div>
 
 
             {# Search button #}
             <div class="form-group col-md-7">
-                <input class="btn btn-primary btn-shadow" type="submit" value="Search" style="float: right">
+                {% if show_captcha %}
+                    <script>function f(token) {
+                        document.getElementById("sfrm").submit();
+                    }</script>
+                    <script src="https://www.google.com/recaptcha/api.js" async defer></script>
+                    <button id="s" class="g-recaptcha btn btn-primary btn-shadow" data-sitekey="6LcCXWkUAAAAAJo2NR9_m09Obn5YmDrI97sGrr2f" data-callback="f" style="float: right" onclick="document.getElementById('page').value = 0">Search</button>
+                {% else %}
+                    <input id="s" class="btn btn-primary btn-shadow" type="submit" value="Search" style="float: right">
+                {% endif %}
             </div>
         </div>
 
@@ -156,12 +167,10 @@
             </table>
         </div>
         {% if results["hits"]["total"] > (p + 1) * per_page %}
-            <a href="/search?q={{ q }}&p={{ p + 1 }}&sort_order={{ sort_order }}&per_page={{ per_page }}&ext={{ extensions }}&size_min={{ size_min }}&size_max={{ size_max }}{{ "&field_path" if field_path else "" }}{{ "&field_name" if field_name else "" }}{{ "&field_trigram" if field_trigram else "" }}&date_min={{ date_min }}&date_max={{ date_max }}"
-               class="btn btn-primary" style="float: right">Next</a>
+            <button class="btn btn-primary" onclick="nextPage()" style="float: right">Next</button>
         {% endif %}
         {% if p > 0 %}
-            <a href="/search?q={{ q }}&p={{ p - 1 }}&sort_order={{ sort_order }}&per_page={{ per_page }}&ext={{ extensions }}&size_min={{ size_min }}&size_max={{ size_max }}{{ "&field_path" if field_path else "" }}{{ "&field_name" if field_name else "" }}{{ "&field_trigram" if field_trigram else "" }}&date_min={{ date_min }}&date_max={{ date_max }}"
-               class="btn btn-primary">Previous</a>
+            <button class="btn btn-primary" onclick="prevPage()">Previous</button>
         {% endif %}
 
     </div>
@@ -174,6 +183,7 @@
     <ul>
         <li>Try checking the 'Match any word' box for a broader search.</li>
         <li>Make sure you don't include the file extension in your query (Use the appropriate field to filter file types)</li>
+        <li>If you're searching for files in a particular website, use the <a href="/website">website search page</a></li>
     </ul>
 </div>
 
@@ -245,6 +255,16 @@
         }
     });
 
+    //Next button
+    function nextPage() {
+        document.getElementById("page").value = parseInt(document.getElementById("page").value) + 1;
+        grecaptcha.execute();
+    }
+    function prevPage() {
+        document.getElementById("page").value = parseInt(document.getElementById("page").value) - 1;
+        grecaptcha.execute();
+    }
+
     </script>
 </div>
 
test/files/apache_table.html (new file, 21 lines)
@@ -0,0 +1,21 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
+<html>
+<head>
+<title>Index of /Public/bootstrap</title>
+</head>
+<body>
+<h1>Index of /Public/bootstrap</h1>
+<table>
+<tr><th valign="top"><img src="/icons/blank.gif" alt="[ICO]"></th><th><a href="?C=N;O=D">Name</a></th><th><a href="?C=M;O=A">Last modified</a></th><th><a href="?C=S;O=A">Size</a></th><th><a href="?C=D;O=A">Description</a></th></tr>
+<tr><th colspan="5"><hr></th></tr>
+<tr><td valign="top"><img src="/icons/back.gif" alt="[PARENTDIR]"></td><td><a href="/Public/">Parent Directory</a> </td><td> </td><td align="right"> - </td><td> </td></tr>
+<tr><td valign="top"><img src="/icons/unknown.gif" alt="[ ]"></td><td><a href="bower.json">bower.json</a> </td><td align="right">2017-04-05 01:45 </td><td align="right">1.0K</td><td> </td></tr>
+<tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="css/">css/</a> </td><td align="right">2017-09-07 18:03 </td><td align="right"> - </td><td> </td></tr>
+<tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="image/">image/</a> </td><td align="right">2017-09-07 18:03 </td><td align="right"> - </td><td> </td></tr>
+<tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="js/">js/</a> </td><td align="right">2017-09-07 18:03 </td><td align="right"> - </td><td> </td></tr>
+<tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="less/">less/</a> </td><td align="right">2017-09-07 18:03 </td><td align="right"> - </td><td> </td></tr>
+<tr><td valign="top"><img src="/icons/unknown.gif" alt="[ ]"></td><td><a href="package.json">package.json</a> </td><td align="right">2017-04-05 01:45 </td><td align="right">666 </td><td> </td></tr>
+<tr><th colspan="5"><hr></th></tr>
+</table>
+</body></html>
test/files/lighttpd_table.html (new file, 47 lines)
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+<head>
+<title>Index of /gentoo/releases/</title>
+<style type="text/css">
+a, a:active {text-decoration: none; color: blue;}
+a:visited {color: #48468F;}
+a:hover, a:focus {text-decoration: underline; color: red;}
+body {background-color: #F5F5F5;}
+h2 {margin-bottom: 12px;}
+table {margin-left: 12px;}
+th, td { font: 90% monospace; text-align: left;}
+th { font-weight: bold; padding-right: 14px; padding-bottom: 3px;}
+td {padding-right: 14px;}
+td.s, th.s {text-align: right;}
+div.list { background-color: white; border-top: 1px solid #646464; border-bottom: 1px solid #646464; padding-top: 10px; padding-bottom: 14px;}
+div.foot { font: 90% monospace; color: #787878; padding-top: 4px;}
+</style>
+</head>
+<body>
+<h2>Index of /gentoo/releases/</h2>
+<div class="list">
+<table summary="Directory Listing" cellpadding="0" cellspacing="0">
+<thead><tr><th class="n">Name</th><th class="m">Last Modified</th><th class="s">Size</th><th class="t">Type</th></tr></thead>
+<tbody>
+<tr><td class="n"><a href="../">Parent Directory</a>/</td><td class="m"> </td><td class="s">- </td><td class="t">Directory</td></tr>
+<tr><td class="n"><a href="alpha/">alpha</a>/</td><td class="m">2009-Aug-09 03:47:09</td><td class="s">- </td><td class="t">Directory</td></tr>
+<tr><td class="n"><a href="amd64/">amd64</a>/</td><td class="m">2017-Feb-09 18:50:44</td><td class="s">- </td><td class="t">Directory</td></tr>
+<tr><td class="n"><a href="arm/">arm</a>/</td><td class="m">2014-Apr-29 13:42:06</td><td class="s">- </td><td class="t">Directory</td></tr>
+<tr><td class="n"><a href="hppa/">hppa</a>/</td><td class="m">2014-Apr-29 13:42:12</td><td class="s">- </td><td class="t">Directory</td></tr>
+<tr><td class="n"><a href="ia64/">ia64</a>/</td><td class="m">2009-Aug-09 03:47:09</td><td class="s">- </td><td class="t">Directory</td></tr>
+<tr><td class="n"><a href="mips/">mips</a>/</td><td class="m">2011-Apr-28 23:38:14</td><td class="s">- </td><td class="t">Directory</td></tr>
+<tr><td class="n"><a href="ppc/">ppc</a>/</td><td class="m">2014-Apr-29 13:41:00</td><td class="s">- </td><td class="t">Directory</td></tr>
+<tr><td class="n"><a href="s390/">s390</a>/</td><td class="m">2014-Apr-29 13:41:06</td><td class="s">- </td><td class="t">Directory</td></tr>
+<tr><td class="n"><a href="sh/">sh</a>/</td><td class="m">2014-Apr-29 13:41:16</td><td class="s">- </td><td class="t">Directory</td></tr>
+<tr><td class="n"><a href="snapshots/">snapshots</a>/</td><td class="m">2009-Apr-16 05:08:17</td><td class="s">- </td><td class="t">Directory</td></tr>
+<tr><td class="n"><a href="sparc/">sparc</a>/</td><td class="m">2009-Aug-09 03:47:09</td><td class="s">- </td><td class="t">Directory</td></tr>
+<tr><td class="n"><a href="x86/">x86</a>/</td><td class="m">2016-Jul-04 21:14:19</td><td class="s">- </td><td class="t">Directory</td></tr>
+<tr><td class="n"><a href="README">README</a></td><td class="m">2014-Jun-22 05:18:43</td><td class="s">0.1K</td><td class="t">application/octet-stream</td></tr>
+<tr><td class="n"><a href="verify-digests.sh">verify-digests.sh</a></td><td class="m">2016-Jun-10 02:40:33</td><td class="s">4.5K</td><td class="t">application/octet-stream</td></tr>
+</tbody>
+</table>
+</div>
+<div class="foot">lighttpd/1.4.29</div>
+</body>
+</html>
test/files/nginx_pre.html (new file, 11 lines)
@@ -0,0 +1,11 @@
+<html>
+<head><title>Index of /test/To process/Android nak newer/</title></head>
+<body bgcolor="white">
+<h1>Index of /test/To process/Android nak newer/</h1><hr><pre><a href="../">../</a>
+<a href="DCIM/">DCIM/</a> 31-Jul-2018 00:26 -
+<a href="Pictures/">Pictures/</a> 31-Jul-2018 00:26 -
+<a href="1529682937580.webm">1529682937580.webm</a> 25-Jun-2018 03:58 3768511
+<a href="1529716051300.webm">1529716051300.webm</a> 25-Jun-2018 04:01 3181867
+<a href="1529725898345.webm">1529725898345.webm</a> 25-Jun-2018 04:05 4138908
+</pre><hr></body>
+</html>
test/webserver.py (new file, 13 lines)
@@ -0,0 +1,13 @@
+from flask import Flask, send_file
+
+app = Flask(__name__)
+
+
+@app.route("/test1/")
+def test1():
+    return send_file("files/apache_table.html")
+
+
+if __name__ == '__main__':
+    app.run("0.0.0.0", port=8888, threaded=True)