Merge remote-tracking branch 'origin/master'

Simon 2018-09-06 19:46:56 -04:00
commit 85437b1ef9
17 changed files with 404 additions and 160 deletions

View File

@@ -5,6 +5,7 @@ Suggestions/concerns/PRs are welcome
 ## Installation
 Assuming you have Python 3 and git installed:
 ```bash
+sudo apt install libssl-dev libcurl4-openssl-dev
 git clone https://github.com/simon987/od-database
 cd od-database
 sudo pip3 install -r requirements.txt
@@ -14,8 +15,11 @@ Create `/config.py` and fill out the parameters. Sample config:
 # Leave default values for no CAPTCHAs
 CAPTCHA_LOGIN = False
 CAPTCHA_SUBMIT = False
+CAPTCHA_SEARCH = False
 CAPTCHA_SITE_KEY = ""
 CAPTCHA_SECRET_KEY = ""
+CAPTCHA_S_SITE_KEY = ""
+CAPTCHA_S_SECRET_KEY = ""
 # Flask secret key for sessions
 FLASK_SECRET = ""

app.py (64 changed lines)
View File

@@ -5,7 +5,6 @@ from urllib.parse import urlparse
 import os
 import time
 import datetime
-import itertools
 from database import Database, Website, InvalidQueryException
 from flask_recaptcha import ReCaptcha
 import od_util
@@ -13,6 +12,7 @@ import config
 from flask_caching import Cache
 from tasks import TaskManager, Task, TaskResult
 from search.search import ElasticSearchEngine
+from callbacks import PostCrawlCallbackFactory
 
 app = Flask(__name__)
 
 if config.CAPTCHA_SUBMIT or config.CAPTCHA_LOGIN:
@@ -21,6 +21,12 @@ if config.CAPTCHA_SUBMIT or config.CAPTCHA_LOGIN:
                          secret_key=config.CAPTCHA_SECRET_KEY)
 else:
     recaptcha = None
+
+if config.CAPTCHA_SEARCH:
+    recaptcha_search = ReCaptcha(app=app,
+                                 site_key=config.CAPTCHA_S_SITE_KEY,
+                                 secret_key=config.CAPTCHA_S_SECRET_KEY)
+else:
+    recaptcha_search = None
+
 app.secret_key = config.FLASK_SECRET
 db = Database("db.sqlite3")
 cache = Cache(app, config={'CACHE_TYPE': 'simple'})
@@ -30,6 +36,7 @@ app.jinja_env.globals.update(get_mime=od_util.get_category)
 taskManager = TaskManager()
 searchEngine = ElasticSearchEngine("od-database")
+searchEngine.start_stats_scheduler()
 
 @app.template_filter("date_format")
@@ -50,7 +57,7 @@ def from_timestamp(value):
 @app.route("/dl")
 def downloads():
     try:
-        export_file_stats = os.stat("static/out.csv.xz")
+        export_file_stats = os.stat("static/out.csv.lzma")
     except FileNotFoundError:
         print("No export file")
         export_file_stats = None
@@ -236,6 +243,7 @@ def admin_rescan_website(website_id):
 @app.route("/search")
 def search():
     q = request.args.get("q") if "q" in request.args else ""
+    sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score"
@@ -279,6 +287,8 @@ def search():
     if len(q) >= 3:
+        response = request.args.get("g-recaptcha-response", "")
+        if not config.CAPTCHA_SEARCH or recaptcha_search.verify(response):
             db.log_search(request.remote_addr,
                           request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None,
                           q, extensions, page)
@@ -294,6 +304,9 @@ def search():
                 flash("Query failed, this could mean that the search server is overloaded or is not reachable. "
                       "Please try again later", "danger")
                 hits = None
+        else:
+            flash("<strong>Error:</strong> Invalid captcha please try again", "danger")
+            hits = None
     else:
         hits = None
@@ -308,7 +321,8 @@ def search():
                            size_min=size_min, size_max=size_max,
                            match_all=match_all,
                            field_trigram=field_trigram, field_path=field_path, field_name=field_name,
-                           date_min=date_min, date_max=date_max)
+                           date_min=date_min, date_max=date_max,
+                           show_captcha=config.CAPTCHA_SEARCH, recaptcha=recaptcha_search)
 
 @app.route("/contribute")
@@ -324,7 +338,8 @@ def home():
         stats["website_count"] = len(db.get_all_websites())
     except:
         stats = {}
-    return render_template("home.html", stats=stats)
+    return render_template("home.html", stats=stats,
+                           show_captcha=config.CAPTCHA_SEARCH, recaptcha=recaptcha_search)
 
 @app.route("/submit")
@@ -565,7 +580,11 @@ def api_complete_task():
         if filename and os.path.exists(filename):
             os.remove(filename)
 
-        # TODO: handle callback here
+        # Handle task callback
+        callback = PostCrawlCallbackFactory.get_callback(task)
+        if callback:
+            callback.run(task_result, searchEngine)
 
         return "Successfully logged task result and indexed files"
     else:
@@ -659,7 +678,7 @@ def api_task_enqueue():
             request.json["url"],
             request.json["priority"],
             request.json["callback_type"],
-            request.json["callback_args"]
+            json.dumps(request.json["callback_args"])
         )
         taskManager.queue_task(task)
         return ""
@@ -705,5 +724,38 @@ def api_random_website():
         return abort(403)
 
+@app.route("/api/search", methods=["POST"])
+def api_search():
+
+    try:
+        token = request.json["token"]
+    except KeyError:
+        return abort(400)
+
+    name = db.check_api_token(token)
+    if name:
+
+        try:
+            hits = searchEngine.search(
+                request.json["query"],
+                request.json["page"], request.json["per_page"],
+                request.json["sort_order"],
+                request.json["extensions"],
+                request.json["size_min"], request.json["size_max"],
+                request.json["match_all"],
+                request.json["fields"],
+                request.json["date_min"], request.json["date_max"]
+            )
+
+            hits = db.join_website_on_search_result(hits)
+            return json.dumps(hits)
+
+        except InvalidQueryException as e:
+            return str(e)
+    else:
+        return abort(403)
+
 if __name__ == '__main__':
     app.run("0.0.0.0", port=12345, threaded=True)

View File

@@ -1,6 +1,8 @@
-from tasks import Task
-from crawl_server.reddit_bot import RedditBot
+from tasks import Task, TaskResult
+from reddit_bot import RedditBot
 import praw
+from search.search import SearchEngine
+import json
 
 
 class PostCrawlCallback:
@@ -8,7 +10,10 @@ class PostCrawlCallback:
     def __init__(self, task: Task):
         self.task = task
 
+        if self.task.callback_args:
+            self.task.callback_args = json.loads(self.task.callback_args)
+
-    def run(self):
+    def run(self, task_result: TaskResult, search: SearchEngine):
         raise NotImplementedError
@@ -36,26 +41,33 @@ class RedditCallback(PostCrawlCallback):
                              user_agent='github.com/simon987/od-database (by /u/Hexahedr_n)')
         self.reddit_bot = RedditBot("crawled.txt", reddit)
 
-    def run(self):
+    def run(self, task_result: TaskResult, search: SearchEngine):
         raise NotImplementedError
 
 
 class RedditPostCallback(RedditCallback):
 
-    def run(self):
+    def run(self, task_result: TaskResult, search: SearchEngine):
         print("Reddit post callback for task " + str(self.task))
-        pass
 
 
 class RedditCommentCallback(RedditCallback):
 
-    def run(self):
-        print("Reddit comment callback for task " + str(self.task))
-        pass
+    def run(self, task_result: TaskResult, search: SearchEngine):
+
+        comment_id = self.task.callback_args["comment_id"]
+        print("Editing comment comment " + comment_id)
+
+        search.refresh()  # Make sure the newly indexed documents are available before commenting
+        stats = search.get_stats(self.task.website_id)
+        message = self.reddit_bot.get_comment(stats, self.task.website_id,
+                                              message="There you go! This website was crawled in `" +
+                                                      str(int(task_result.end_time - task_result.start_time)) + "s`")
+        print(message)
+        self.reddit_bot.edit(self.reddit_bot.reddit.comment(comment_id), message)
 
 
 class DiscordCallback(PostCrawlCallback):
 
-    def run(self):
+    def run(self, task_result: TaskResult, search: SearchEngine):
         print("Discord callback for task " + str(self.task))
-        pass
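`PostCrawlCallbackFactory.get_callback(task)`, which app.py now imports, is not part of this diff. Presumably it just maps `task.callback_type` to one of the classes above, along these lines; the type strings below are hypothetical and the real mapping may differ.

```python
# Sketch only; the real factory lives alongside the callback classes above
# and is not shown in this commit.
class PostCrawlCallbackFactory:

    @staticmethod
    def get_callback(task: Task):
        # Hypothetical callback_type values
        callbacks = {
            "reddit_post": RedditPostCallback,
            "reddit_comment": RedditCommentCallback,
            "discord": DiscordCallback,
        }
        callback_class = callbacks.get(task.callback_type)
        return callback_class(task) if callback_class else None
```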

View File

@@ -1,14 +1,16 @@
+import pycurl
+from io import BytesIO
 from crawl_server import logger
 from urllib.parse import unquote, urljoin
 import os
 from html.parser import HTMLParser
 from itertools import repeat
 from crawl_server.crawler import RemoteDirectory, File
-import requests
-from requests.exceptions import RequestException
 from multiprocessing.pool import ThreadPool
 import config
 from dateutil.parser import parse as parse_date
+from pycurl import Curl
 import hashlib
 import urllib3
@@ -94,17 +96,36 @@ class HttpDirectory(RemoteDirectory):
     def __init__(self, url):
         super().__init__(url)
-        self.session = requests.Session()
-        self.session.headers = HttpDirectory.HEADERS
-        self.session.verify = False
-        self.session.max_redirects = 1
+        self.curl = None
+        self.curl_head = None
+        self.init_curl()
+
+    def init_curl(self):
+        self.curl = Curl()
+        self.curl.setopt(self.curl.SSL_VERIFYPEER, 0)
+        self.curl.setopt(self.curl.SSL_VERIFYHOST, 0)
+        self.curl.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT)
+        self.curl_head = self._curl_handle()
+
+    @staticmethod
+    def _curl_handle():
+        curl_head = Curl()
+        curl_head.setopt(pycurl.SSL_VERIFYPEER, 0)
+        curl_head.setopt(pycurl.SSL_VERIFYHOST, 0)
+        curl_head.setopt(pycurl.NOBODY, 1)
+        curl_head.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT)
+        return curl_head
 
     def list_dir(self, path):
         current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1]
         path_identifier = hashlib.md5(current_dir_name.encode())
         path_url = urljoin(self.base_url, path, "")
-        body = self._stream_body(path_url)
+        body = self._fetch_body(path_url)
         anchors = self._parse_links(body)
 
         urls_to_request = []
@@ -139,7 +160,7 @@ class HttpDirectory(RemoteDirectory):
         if len(urls_to_request) > 150:
             # Many urls, use multi-threaded solution
             pool = ThreadPool(processes=10)
-            files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request))
+            files = pool.starmap(self._request_file, zip(urls_to_request, repeat(self.base_url)))
             pool.close()
             for file in files:
                 if file:
@@ -147,67 +168,65 @@ class HttpDirectory(RemoteDirectory):
         else:
             # Too few urls to create thread pool
             for url in urls_to_request:
-                file = self._request_file(url)
+                file = self._request_file(url, self.base_url)
                 if file:
                     yield file
 
-    def _request_file(self, url):
+    @staticmethod
+    def _request_file(url, base_url):
         retries = HttpDirectory.MAX_RETRIES
         while retries > 0:
             try:
-                r = self.session.head(url, allow_redirects=False, timeout=HttpDirectory.TIMEOUT)
+                curl = HttpDirectory._curl_handle()
+                raw_headers = BytesIO()
+                curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore"))
+                curl.setopt(pycurl.HEADERFUNCTION, raw_headers.write)
+                curl.perform()
 
-                stripped_url = url[len(self.base_url) - 1:]
+                stripped_url = url[len(base_url) - 1:]
+                headers = HttpDirectory._parse_dict_header(raw_headers.getvalue().decode("utf-8", errors="ignore"))
+                raw_headers.close()
 
                 path, name = os.path.split(stripped_url)
-                date = r.headers.get("Last-Modified", "1970-01-01")
+                date = headers.get("Last-Modified", "1970-01-01")
+                curl.close()
                 return File(
                     path=unquote(path).strip("/"),
                     name=unquote(name),
-                    size=int(r.headers.get("Content-Length", -1)),
+                    size=int(headers.get("Content-Length", -1)),
                     mtime=int(parse_date(date).timestamp()),
                     is_dir=False
                 )
-            except RequestException:
-                self.session.close()
+            except pycurl.error:
                 retries -= 1
                 logger.debug("TimeoutError - _request_file")
         raise TimeoutError
 
-    def _stream_body(self, url: str):
+    def _fetch_body(self, url: str):
         retries = HttpDirectory.MAX_RETRIES
         while retries > 0:
             try:
-                r = self.session.get(url, stream=True, timeout=HttpDirectory.TIMEOUT)
-                for chunk in r.iter_content(chunk_size=8192):
-                    try:
-                        yield chunk.decode(r.encoding if r.encoding else "utf-8", errors="ignore")
-                    except LookupError:
-                        # Unsupported encoding
-                        yield chunk.decode("utf-8", errors="ignore")
-                r.close()
-                return
-            except RequestException:
-                self.session.close()
+                content = BytesIO()
+                self.curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore"))
+                self.curl.setopt(pycurl.WRITEDATA, content)
+                self.curl.perform()
+                return content.getvalue().decode("utf-8", errors="ignore")
+            except pycurl.error:
+                self.close()
                 retries -= 1
-                logger.debug("TimeoutError - _stream_body")
+                logger.debug("TimeoutError - _fetch_body")
         raise TimeoutError
 
     @staticmethod
     def _parse_links(body):
         parser = HTMLAnchorParser()
-        anchors = []
-
-        for chunk in body:
-            parser.feed(chunk)
-            for anchor in parser.anchors:
-                anchors.append(anchor)
-
-        return anchors
+        parser.feed(body)
+        return parser.anchors
 
     @staticmethod
     def _isdir(link: Anchor):
@@ -216,14 +235,14 @@ class HttpDirectory(RemoteDirectory):
     @staticmethod
     def _should_ignore(base_url, current_path, link: Anchor):
-        if urljoin(base_url, link.href) == urljoin(urljoin(base_url, current_path), "../"):
+        full_url = urljoin(base_url, link.href)
+        if full_url == urljoin(urljoin(base_url, current_path), "../") or full_url == base_url:
             return True
 
         if link.href.endswith(HttpDirectory.BLACK_LIST):
             return True
 
         # Ignore external links
-        full_url = urljoin(base_url, link.href)
         if not full_url.startswith(base_url):
             return True
@@ -231,8 +250,18 @@ class HttpDirectory(RemoteDirectory):
         if "?" in link.href:
             return True
 
+    @staticmethod
+    def _parse_dict_header(raw):
+        headers = dict()
+        for line in raw.split("\r\n")[1:]:  # Ignore first 'HTTP/1.0 200 OK' line
+            if line:
+                k, v = line.split(":", maxsplit=1)
+                headers[k.strip()] = v.strip()
+        return headers
+
     def close(self):
-        self.session.close()
-        logger.debug("Closing HTTPRemoteDirectory for " + self.base_url)
+        self.curl.close()
+        self.init_curl()

View File

@@ -63,7 +63,7 @@ class TaskManager:
             }
             r = requests.post(config.SERVER_URL + "/task/upload", data=payload, files=files)
-            logger.info("RESPONSE: " + r.text)
+            logger.info("RESPONSE: " + r.text + "<" + str(r.status_code) + ">")
         except Exception as e:
             logger.error("Exception while sending file_list chunk: " + str(e))
             pass
@@ -75,7 +75,7 @@ class TaskManager:
         }
         r = requests.post(config.SERVER_URL + "/task/complete", data=payload)
-        logger.info("RESPONSE: " + r.text)
+        logger.info("RESPONSE: " + r.text + "<" + str(r.status_code) + ">")
 
         if os.path.exists(filename):
             os.remove(filename)

View File

@@ -407,12 +407,12 @@ class Database:
         cursor = conn.cursor()
         cursor.execute("SELECT id, website_id, url, priority, callback_type, callback_args FROM "
-                       "Queue WHERE website_id=? AND assigned_crawler=?", (website_id, name))
+                       "Queue WHERE website_id=?", (website_id, ))
         task = cursor.fetchone()
 
         if task:
-            cursor.execute("DELETE FROM Queue WHERE website_id=? AND assigned_crawler=?", (website_id, name))
+            cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id, ))
             conn.commit()
             return Task(task[1], task[2], task[3], task[4], task[5])
         else:

View File

@@ -5,7 +5,12 @@ import os
 import validators
 import re
 from ftplib import FTP
-import config
+
+# TODO: find a better way to do this
+try:
+    from . import config
+except (ImportError, SystemError):
+    import config
 import urllib3
 
 urllib3.disable_warnings()

View File

@@ -41,11 +41,23 @@ class RedditBot:
         while True:
             try:
+                # Double check has_crawled
                 if not self.has_crawled(reddit_obj.id):
-                    reddit_obj.reply(comment)
+                    reply = reddit_obj.reply(comment)
                     self.log_crawl(reddit_obj.id)
                     print("Reply to " + reddit_obj.id)
+                    return reply
+                break
+            except Exception as e:
+                print("Waiting 5 minutes: " + str(e))
+                time.sleep(300)
+                continue
+
+    def edit(self, reddit_comment, new_message):
+
+        while True:
+            try:
+                reddit_comment.edit(new_message)
+                print("Edit comment " + reddit_comment.id)
                 break
             except Exception as e:
                 print("Waiting 5 minutes: " + str(e))
@@ -54,14 +66,13 @@ class RedditBot:
     @staticmethod
     def get_comment(stats: dict, website_id, message: str = ""):
-        comment = message + " \n" if len(message) > 0 else ""
-        for stat in stats:
-            comment += stat + " \n" if len(stat) > 0 else ""
-            comment += RedditBot.format_stats(stats[stat])
+        comment = message + " \n" if message else ""
+        comment += RedditBot.format_stats(stats)
 
-        comment += "[Full Report](https://od-database.simon987.net/website/" + str(website_id) + "/)"
-        comment += " | [Link list](https://od-database.simon987.net/website/" + str(website_id) + "/links) \n"
+        comment += "[Full Report](https://od-db.the-eye.eu/website/" + str(website_id) + "/)"
+        comment += " | [Link list](https://od-db.the-eye.eu/website/" + str(website_id) + "/links)"
+        comment += " | [Source](https://github.com/simon987) \n"
         comment += "*** \n"
         comment += RedditBot.bottom_line
@@ -74,7 +85,7 @@ class RedditBot:
         result += "File types | Count | Total Size\n"
         result += ":-- | :-- | :-- \n"
         counter = 0
-        for mime in stats["mime_stats"]:
+        for mime in stats["ext_stats"]:
             result += mime[2]
             result += " | " + str(mime[1])
             result += " | " + humanfriendly.format_size(mime[0]) + " \n"

View File

@@ -17,3 +17,5 @@ ujson
 urllib3
 pyOpenSSL
 pybloom-live
+pycurl
+lxml

View File

@@ -31,6 +31,9 @@ class SearchEngine:
     def get_stats(self, website_id: int, subdir: str = None):
         raise NotImplementedError
 
+    def refresh(self):
+        raise NotImplementedError
+
 
 class ElasticSearchEngine(SearchEngine):
     SORT_ORDERS = {
@@ -47,13 +50,14 @@ class ElasticSearchEngine(SearchEngine):
         self.index_name = index_name
         self.es = elasticsearch.Elasticsearch()
-        scheduler = BackgroundScheduler()
-        scheduler.add_job(self._generate_global_stats, "interval", seconds=60 * 15)
-        scheduler.start()
 
         if not self.es.indices.exists(self.index_name):
             self.init()
 
+    def start_stats_scheduler(self):
+        scheduler = BackgroundScheduler()
+        scheduler.add_job(self._generate_global_stats, "interval", seconds=60 * 120)
+        scheduler.start()
+
     def init(self):
         print("Elasticsearch first time setup")
         if self.es.indices.exists(self.index_name):
@@ -122,8 +126,8 @@ class ElasticSearchEngine(SearchEngine):
     def import_json(self, in_lines, website_id: int):
 
-        import_every = 1000
-        cooldown_time = 1
+        import_every = 400
+        cooldown_time = 0
 
         docs = []
@@ -211,7 +215,7 @@ class ElasticSearchEngine(SearchEngine):
                 }
             },
             "size": per_page, "from": min(page * per_page, 10000 - per_page)},
-            index=self.index_name, request_timeout=30)
+            index=self.index_name, request_timeout=35)
 
         return page
@@ -229,7 +233,7 @@ class ElasticSearchEngine(SearchEngine):
                 "ext_group": {
                     "terms": {
                         "field": "ext",
-                        "size": 20
+                        "size": 12
                     },
                     "aggs": {
                         "size": {
@@ -246,7 +250,7 @@ class ElasticSearchEngine(SearchEngine):
                 }
             },
             "size": 0
-        }, index=self.index_name, request_timeout=20)
+        }, index=self.index_name, request_timeout=30)
 
         stats = dict()
         stats["total_size"] = result["aggregations"]["total_size"]["value"]
@@ -311,7 +315,7 @@ class ElasticSearchEngine(SearchEngine):
            },
            "size": 0
-        }, index=self.index_name, request_timeout=120)
+        }, index=self.index_name, request_timeout=240)
 
         total_stats = self.es.search(body={
             "query": {
@@ -333,7 +337,7 @@ class ElasticSearchEngine(SearchEngine):
            },
            "size": 0
-        }, index=self.index_name, request_timeout=120)
+        }, index=self.index_name, request_timeout=241)
 
         size_and_date_histogram = self.es.search(body={
             "query": {
@@ -354,21 +358,21 @@ class ElasticSearchEngine(SearchEngine):
                 "sizes": {
                     "histogram": {
                         "field": "size",
-                        "interval": 50000000,  # 50Mb
-                        "min_doc_count": 100
+                        "interval": 100000000,  # 100Mb
+                        "min_doc_count": 500
                     }
                 },
                 "dates": {
                     "date_histogram": {
                         "field": "mtime",
                         "interval": "1y",
-                        "min_doc_count": 100,
+                        "min_doc_count": 500,
                         "format": "yyyy"
                     }
                 }
            },
            "size": 0
-        }, index=self.index_name, request_timeout=120)
+        }, index=self.index_name, request_timeout=242)
 
         website_scatter = self.es.search(body={
             "query": {
@@ -384,7 +388,7 @@ class ElasticSearchEngine(SearchEngine):
                 "websites": {
                     "terms": {
                         "field": "website_id",
-                        "size": 500  # TODO: Figure out what size is appropriate
+                        "size": 400  # TODO: Figure out what size is appropriate
                     },
                     "aggs": {
                         "size": {
@@ -396,9 +400,9 @@ class ElasticSearchEngine(SearchEngine):
                 }
            },
            "size": 0
-        }, index=self.index_name, request_timeout=120)
+        }, index=self.index_name, request_timeout=243)
 
-        es_stats = self.es.indices.stats(self.index_name, request_timeout=120)
+        es_stats = self.es.indices.stats(self.index_name, request_timeout=244)
 
         stats = dict()
         stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"]
@@ -460,3 +464,6 @@ class ElasticSearchEngine(SearchEngine):
         for website in websites:
             if website not in non_empty_websites:
                 yield website
+
+    def refresh(self):
+        self.es.indices.refresh(self.index_name)
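Note that the stats job no longer starts in `__init__`: callers now opt in with `start_stats_scheduler()`, presumably so the engine can be instantiated (for example by the crawl server or in tests) without spawning the background job. The app.py hunk earlier in this commit does exactly this:

```python
# From the app.py hunk above
searchEngine = ElasticSearchEngine("od-database")
searchEngine.start_stats_scheduler()  # runs _generate_global_stats every 60 * 120 s (2 hours)
```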

View File

@@ -23,13 +23,23 @@
             <div class="card">
                 <div class="card-header">Search</div>
                 <div class="card-body">
-                    <form action="/search">
-                        <div class="form-group">
+                    <form action="/search" id="sfrm">
+                        <div class="form-row">
+                            <div class="col-md-11">
                                 <input class="form-control" name="q" id="q" placeholder="Query">
                             </div>
+                            <div class="col-md-1">
+                                {% if show_captcha %}
+                                    <script>function f(token) {document.getElementById("sfrm").submit();}</script>
+                                    <script src="https://www.google.com/recaptcha/api.js" async defer></script>
+                                    <button class="g-recaptcha btn btn-primary btn-shadow" data-sitekey="6LcCXWkUAAAAAJo2NR9_m09Obn5YmDrI97sGrr2f" data-callback="f">Search</button>
+                                {% else %}
+                                    <input class="btn btn-primary btn-shadow" type="submit" value="Search nocap">
+                                {% endif %}
+                            </div>
+                        </div>
-                        <input class="btn btn-primary btn-shadow" type="submit" value="Search">
                     </form>
                 </div>
             </div>

View File

@@ -32,7 +32,7 @@
         <div class="collapse navbar-collapse" id="navbarSupportedContent">
             <ul class="navbar-nav mr-auto">
                 <li class="nav-item">
-                    <a class="nav-link {{ "active" if current_page == "website" else "" }}" href="/website">Websites</a>
+                    <a class="nav-link {{ "active" if current_page == "website" else "" }}" href="/website/">Websites</a>
                 </li>
                 <li class="nav-item">
                     <a class="nav-link {{ "active" if current_page == "submit" else "" }}" href="/submit">Submit website</a>

View File

@@ -9,7 +9,7 @@
     <div class="card">
         <div class="card-header">Search</div>
         <div class="card-body">
-            <form action="/search">
+            <form action="/search" id="sfrm">
                 <div class="form-row">
@@ -22,6 +22,7 @@
                         </div>
                     </div>
                     <input class="form-control" name="q" id="q" placeholder="Query" value="{{ q }}">
+                    <input type="hidden" name="p" id="page" value="{{ p }}">
                 </div>
             </div>
             {# Size #}
@@ -90,9 +91,19 @@
                     </select>
                 </div>
                 {# Search button #}
                 <div class="form-group col-md-7">
-                    <input class="btn btn-primary btn-shadow" type="submit" value="Search" style="float: right">
+                    {% if show_captcha %}
+                        <script>function f(token) {
+                            document.getElementById("sfrm").submit();
+                        }</script>
+                        <script src="https://www.google.com/recaptcha/api.js" async defer></script>
+                        <button id="s" class="g-recaptcha btn btn-primary btn-shadow" data-sitekey="6LcCXWkUAAAAAJo2NR9_m09Obn5YmDrI97sGrr2f" data-callback="f" style="float: right" onclick="document.getElementById('page').value = 0">Search</button>
+                    {% else %}
+                        <input id="s" class="btn btn-primary btn-shadow" type="submit" value="Search" style="float: right">
+                    {% endif %}
                 </div>
             </div>
@@ -156,12 +167,10 @@
         </table>
     </div>
     {% if results["hits"]["total"] > (p + 1) * per_page %}
-        <a href="/search?q={{ q }}&p={{ p + 1 }}&sort_order={{ sort_order }}&per_page={{ per_page }}&ext={{ extensions }}&size_min={{ size_min }}&size_max={{ size_max }}{{ "&field_path" if field_path else "" }}{{ "&field_name" if field_name else "" }}{{ "&field_trigram" if field_trigram else "" }}&date_min={{ date_min }}&date_max={{ date_max }}"
-           class="btn btn-primary" style="float: right">Next</a>
+        <button class="btn btn-primary" onclick="nextPage()" style="float: right">Next</button>
     {% endif %}
     {% if p > 0 %}
-        <a href="/search?q={{ q }}&p={{ p - 1 }}&sort_order={{ sort_order }}&per_page={{ per_page }}&ext={{ extensions }}&size_min={{ size_min }}&size_max={{ size_max }}{{ "&field_path" if field_path else "" }}{{ "&field_name" if field_name else "" }}{{ "&field_trigram" if field_trigram else "" }}&date_min={{ date_min }}&date_max={{ date_max }}"
-           class="btn btn-primary">Previous</a>
+        <button class="btn btn-primary" onclick="prevPage()">Previous</button>
     {% endif %}
 </div>
@@ -174,6 +183,7 @@
     <ul>
         <li>Try checking the 'Match any word' box for a broader search.</li>
         <li>Make sure you don't include the file extension in your query (Use the appropriate field to filter file types)</li>
+        <li>If you're searching for files in a particular website, use the <a href="/website">website search page</a></li>
     </ul>
 </div>
@@ -245,6 +255,16 @@
         }
     });
 
+    //Next button
+    function nextPage() {
+        document.getElementById("page").value = parseInt(document.getElementById("page").value) + 1;
+        grecaptcha.execute();
+    }
+    function prevPage() {
+        document.getElementById("page").value = parseInt(document.getElementById("page").value) - 1;
+        grecaptcha.execute();
+    }
 </script>
 </div>

View File

@@ -0,0 +1,21 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head>
<title>Index of /Public/bootstrap</title>
</head>
<body>
<h1>Index of /Public/bootstrap</h1>
<table>
<tr><th valign="top"><img src="/icons/blank.gif" alt="[ICO]"></th><th><a href="?C=N;O=D">Name</a></th><th><a href="?C=M;O=A">Last modified</a></th><th><a href="?C=S;O=A">Size</a></th><th><a href="?C=D;O=A">Description</a></th></tr>
<tr><th colspan="5"><hr></th></tr>
<tr><td valign="top"><img src="/icons/back.gif" alt="[PARENTDIR]"></td><td><a href="/Public/">Parent Directory</a> </td><td>&nbsp;</td><td align="right"> - </td><td>&nbsp;</td></tr>
<tr><td valign="top"><img src="/icons/unknown.gif" alt="[ ]"></td><td><a href="bower.json">bower.json</a> </td><td align="right">2017-04-05 01:45 </td><td align="right">1.0K</td><td>&nbsp;</td></tr>
<tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="css/">css/</a> </td><td align="right">2017-09-07 18:03 </td><td align="right"> - </td><td>&nbsp;</td></tr>
<tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="image/">image/</a> </td><td align="right">2017-09-07 18:03 </td><td align="right"> - </td><td>&nbsp;</td></tr>
<tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="js/">js/</a> </td><td align="right">2017-09-07 18:03 </td><td align="right"> - </td><td>&nbsp;</td></tr>
<tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="less/">less/</a> </td><td align="right">2017-09-07 18:03 </td><td align="right"> - </td><td>&nbsp;</td></tr>
<tr><td valign="top"><img src="/icons/unknown.gif" alt="[ ]"></td><td><a href="package.json">package.json</a> </td><td align="right">2017-04-05 01:45 </td><td align="right">666 </td><td>&nbsp;</td></tr>
<tr><th colspan="5"><hr></th></tr>
</table>
</body></html>

View File

@@ -0,0 +1,47 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<title>Index of /gentoo/releases/</title>
<style type="text/css">
a, a:active {text-decoration: none; color: blue;}
a:visited {color: #48468F;}
a:hover, a:focus {text-decoration: underline; color: red;}
body {background-color: #F5F5F5;}
h2 {margin-bottom: 12px;}
table {margin-left: 12px;}
th, td { font: 90% monospace; text-align: left;}
th { font-weight: bold; padding-right: 14px; padding-bottom: 3px;}
td {padding-right: 14px;}
td.s, th.s {text-align: right;}
div.list { background-color: white; border-top: 1px solid #646464; border-bottom: 1px solid #646464; padding-top: 10px; padding-bottom: 14px;}
div.foot { font: 90% monospace; color: #787878; padding-top: 4px;}
</style>
</head>
<body>
<h2>Index of /gentoo/releases/</h2>
<div class="list">
<table summary="Directory Listing" cellpadding="0" cellspacing="0">
<thead><tr><th class="n">Name</th><th class="m">Last Modified</th><th class="s">Size</th><th class="t">Type</th></tr></thead>
<tbody>
<tr><td class="n"><a href="../">Parent Directory</a>/</td><td class="m">&nbsp;</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="alpha/">alpha</a>/</td><td class="m">2009-Aug-09 03:47:09</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="amd64/">amd64</a>/</td><td class="m">2017-Feb-09 18:50:44</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="arm/">arm</a>/</td><td class="m">2014-Apr-29 13:42:06</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="hppa/">hppa</a>/</td><td class="m">2014-Apr-29 13:42:12</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="ia64/">ia64</a>/</td><td class="m">2009-Aug-09 03:47:09</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="mips/">mips</a>/</td><td class="m">2011-Apr-28 23:38:14</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="ppc/">ppc</a>/</td><td class="m">2014-Apr-29 13:41:00</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="s390/">s390</a>/</td><td class="m">2014-Apr-29 13:41:06</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="sh/">sh</a>/</td><td class="m">2014-Apr-29 13:41:16</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="snapshots/">snapshots</a>/</td><td class="m">2009-Apr-16 05:08:17</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="sparc/">sparc</a>/</td><td class="m">2009-Aug-09 03:47:09</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="x86/">x86</a>/</td><td class="m">2016-Jul-04 21:14:19</td><td class="s">- &nbsp;</td><td class="t">Directory</td></tr>
<tr><td class="n"><a href="README">README</a></td><td class="m">2014-Jun-22 05:18:43</td><td class="s">0.1K</td><td class="t">application/octet-stream</td></tr>
<tr><td class="n"><a href="verify-digests.sh">verify-digests.sh</a></td><td class="m">2016-Jun-10 02:40:33</td><td class="s">4.5K</td><td class="t">application/octet-stream</td></tr>
</tbody>
</table>
</div>
<div class="foot">lighttpd/1.4.29</div>
</body>
</html>

test/files/nginx_pre.html (new file, 11 lines)
View File

@@ -0,0 +1,11 @@
<html>
<head><title>Index of /test/To process/Android nak newer/</title></head>
<body bgcolor="white">
<h1>Index of /test/To process/Android nak newer/</h1><hr><pre><a href="../">../</a>
<a href="DCIM/">DCIM/</a> 31-Jul-2018 00:26 -
<a href="Pictures/">Pictures/</a> 31-Jul-2018 00:26 -
<a href="1529682937580.webm">1529682937580.webm</a> 25-Jun-2018 03:58 3768511
<a href="1529716051300.webm">1529716051300.webm</a> 25-Jun-2018 04:01 3181867
<a href="1529725898345.webm">1529725898345.webm</a> 25-Jun-2018 04:05 4138908
</pre><hr></body>
</html>

test/webserver.py (new file, 13 lines)
View File

@@ -0,0 +1,13 @@
from flask import Flask, send_file

app = Flask(__name__)


@app.route("/test1/")
def test1():
    return send_file("files/apache_table.html")


if __name__ == '__main__':
    app.run("0.0.0.0", port=8888, threaded=True)