diff --git a/README.md b/README.md index a10a2bd..b53d50a 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ Suggestions/concerns/PRs are welcome ## Installation Assuming you have Python 3 and git installed: ```bash +sudo apt install libssl-dev libcurl4-openssl-dev git clone https://github.com/simon987/od-database cd od-database sudo pip3 install -r requirements.txt @@ -14,8 +15,11 @@ Create `/config.py` and fill out the parameters. Sample config: # Leave default values for no CAPTCHAs CAPTCHA_LOGIN = False CAPTCHA_SUBMIT = False +CAPTCHA_SEARCH = False CAPTCHA_SITE_KEY = "" CAPTCHA_SECRET_KEY = "" +CAPTCHA_S_SITE_KEY = "" +CAPTCHA_S_SECRET_KEY = "" # Flask secret key for sessions FLASK_SECRET = "" diff --git a/app.py b/app.py index 75f8aec..3b2b622 100644 --- a/app.py +++ b/app.py @@ -5,7 +5,6 @@ from urllib.parse import urlparse import os import time import datetime -import itertools from database import Database, Website, InvalidQueryException from flask_recaptcha import ReCaptcha import od_util @@ -13,6 +12,7 @@ import config from flask_caching import Cache from tasks import TaskManager, Task, TaskResult from search.search import ElasticSearchEngine +from callbacks import PostCrawlCallbackFactory app = Flask(__name__) if config.CAPTCHA_SUBMIT or config.CAPTCHA_LOGIN: @@ -21,6 +21,12 @@ if config.CAPTCHA_SUBMIT or config.CAPTCHA_LOGIN: secret_key=config.CAPTCHA_SECRET_KEY) else: recaptcha = None +if config.CAPTCHA_SEARCH: + recaptcha_search = ReCaptcha(app=app, + site_key=config.CAPTCHA_S_SITE_KEY, + secret_key=config.CAPTCHA_S_SECRET_KEY) +else: + recaptcha_search = None app.secret_key = config.FLASK_SECRET db = Database("db.sqlite3") cache = Cache(app, config={'CACHE_TYPE': 'simple'}) @@ -30,6 +36,7 @@ app.jinja_env.globals.update(get_mime=od_util.get_category) taskManager = TaskManager() searchEngine = ElasticSearchEngine("od-database") +searchEngine.start_stats_scheduler() @app.template_filter("date_format") @@ -50,7 +57,7 @@ def from_timestamp(value): @app.route("/dl") def downloads(): try: - export_file_stats = os.stat("static/out.csv.xz") + export_file_stats = os.stat("static/out.csv.lzma") except FileNotFoundError: print("No export file") export_file_stats = None @@ -236,79 +243,86 @@ def admin_rescan_website(website_id): @app.route("/search") def search(): - q = request.args.get("q") if "q" in request.args else "" - sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score" - page = request.args.get("p") if "p" in request.args else "0" - page = int(page) if page.isdigit() else 0 + q = request.args.get("q") if "q" in request.args else "" + sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score" - per_page = request.args.get("per_page") if "per_page" in request.args else "50" - per_page = int(per_page) if per_page.isdigit() else "50" - per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50 + page = request.args.get("p") if "p" in request.args else "0" + page = int(page) if page.isdigit() else 0 - extensions = request.args.get("ext") if "ext" in request.args else None - extensions = [ext.strip().strip(".").lower() for ext in extensions.split(",")] if extensions else [] + per_page = request.args.get("per_page") if "per_page" in request.args else "50" + per_page = int(per_page) if per_page.isdigit() else "50" + per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50 - size_min = request.args.get("size_min") if "size_min" in request.args else "size_min" - size_min = int(size_min) if 
size_min.isdigit() else 0 - size_max = request.args.get("size_max") if "size_max" in request.args else "size_max" - size_max = int(size_max) if size_max.isdigit() else 0 + extensions = request.args.get("ext") if "ext" in request.args else None + extensions = [ext.strip().strip(".").lower() for ext in extensions.split(",")] if extensions else [] - date_min = request.args.get("date_min") if "date_min" in request.args else "date_min" - date_min = int(date_min) if date_min.isdigit() else 0 - date_max = request.args.get("date_max") if "date_max" in request.args else "date_max" - date_max = int(date_max) if date_max.isdigit() else 0 + size_min = request.args.get("size_min") if "size_min" in request.args else "size_min" + size_min = int(size_min) if size_min.isdigit() else 0 + size_max = request.args.get("size_max") if "size_max" in request.args else "size_max" + size_max = int(size_max) if size_max.isdigit() else 0 - match_all = "all" in request.args + date_min = request.args.get("date_min") if "date_min" in request.args else "date_min" + date_min = int(date_min) if date_min.isdigit() else 0 + date_max = request.args.get("date_max") if "date_max" in request.args else "date_max" + date_max = int(date_max) if date_max.isdigit() else 0 - field_name = "field_name" in request.args - field_trigram = "field_trigram" in request.args - field_path = "field_path" in request.args + match_all = "all" in request.args - if not field_name and not field_trigram and not field_path: - # If no fields are selected, search in all - field_name = field_path = field_trigram = True + field_name = "field_name" in request.args + field_trigram = "field_trigram" in request.args + field_path = "field_path" in request.args - fields = [] - if field_path: - fields.append("path") - if field_name: - fields.append("name^5") - if field_trigram: - fields.append("name.nGram^2") + if not field_name and not field_trigram and not field_path: + # If no fields are selected, search in all + field_name = field_path = field_trigram = True - if len(q) >= 3: + fields = [] + if field_path: + fields.append("path") + if field_name: + fields.append("name^5") + if field_trigram: + fields.append("name.nGram^2") - db.log_search(request.remote_addr, - request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None, - q, extensions, page) + if len(q) >= 3: - try: - hits = searchEngine.search(q, page, per_page, sort_order, - extensions, size_min, size_max, match_all, fields, date_min, date_max) - hits = db.join_website_on_search_result(hits) - except InvalidQueryException as e: - flash("Invalid query: " + str(e), "warning") - return redirect("/search") - except Exception: - flash("Query failed, this could mean that the search server is overloaded or is not reachable. " - "Please try again later", "danger") + response = request.args.get("g-recaptcha-response", "") + if not config.CAPTCHA_SEARCH or recaptcha_search.verify(response): + db.log_search(request.remote_addr, + request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None, + q, extensions, page) + + try: + hits = searchEngine.search(q, page, per_page, sort_order, + extensions, size_min, size_max, match_all, fields, date_min, date_max) + hits = db.join_website_on_search_result(hits) + except InvalidQueryException as e: + flash("Invalid query: " + str(e), "warning") + return redirect("/search") + except Exception: + flash("Query failed, this could mean that the search server is overloaded or is not reachable. 
" + "Please try again later", "danger") + hits = None + else: + flash("Error: Invalid captcha please try again", "danger") + hits = None + + else: hits = None - else: - hits = None - - return render_template("search.html", - results=hits, - q=q, - p=page, per_page=per_page, - sort_order=sort_order, - results_set=config.RESULTS_PER_PAGE, - extensions=",".join(extensions), - size_min=size_min, size_max=size_max, - match_all=match_all, - field_trigram=field_trigram, field_path=field_path, field_name=field_name, - date_min=date_min, date_max=date_max) + return render_template("search.html", + results=hits, + q=q, + p=page, per_page=per_page, + sort_order=sort_order, + results_set=config.RESULTS_PER_PAGE, + extensions=",".join(extensions), + size_min=size_min, size_max=size_max, + match_all=match_all, + field_trigram=field_trigram, field_path=field_path, field_name=field_name, + date_min=date_min, date_max=date_max, + show_captcha=config.CAPTCHA_SEARCH, recaptcha=recaptcha_search) @app.route("/contribute") @@ -324,7 +338,8 @@ def home(): stats["website_count"] = len(db.get_all_websites()) except: stats = {} - return render_template("home.html", stats=stats) + return render_template("home.html", stats=stats, + show_captcha=config.CAPTCHA_SEARCH, recaptcha=recaptcha_search) @app.route("/submit") @@ -565,7 +580,11 @@ def api_complete_task(): if filename and os.path.exists(filename): os.remove(filename) - # TODO: handle callback here + # Handle task callback + callback = PostCrawlCallbackFactory.get_callback(task) + if callback: + callback.run(task_result, searchEngine) + return "Successfully logged task result and indexed files" else: @@ -659,7 +678,7 @@ def api_task_enqueue(): request.json["url"], request.json["priority"], request.json["callback_type"], - request.json["callback_args"] + json.dumps(request.json["callback_args"]) ) taskManager.queue_task(task) return "" @@ -705,5 +724,38 @@ def api_random_website(): return abort(403) +@app.route("/api/search", methods=["POST"]) +def api_search(): + + try: + token = request.json["token"] + except KeyError: + return abort(400) + + name = db.check_api_token(token) + + if name: + + try: + hits = searchEngine.search( + request.json["query"], + request.json["page"], request.json["per_page"], + request.json["sort_order"], + request.json["extensions"], + request.json["size_min"], request.json["size_max"], + request.json["match_all"], + request.json["fields"], + request.json["date_min"], request.json["date_max"] + ) + + hits = db.join_website_on_search_result(hits) + return json.dumps(hits) + + except InvalidQueryException as e: + return str(e) + else: + return abort(403) + + if __name__ == '__main__': app.run("0.0.0.0", port=12345, threaded=True) diff --git a/callbacks.py b/callbacks.py index 89bda6c..647c963 100644 --- a/callbacks.py +++ b/callbacks.py @@ -1,6 +1,8 @@ -from tasks import Task -from crawl_server.reddit_bot import RedditBot +from tasks import Task, TaskResult +from reddit_bot import RedditBot import praw +from search.search import SearchEngine +import json class PostCrawlCallback: @@ -8,7 +10,10 @@ class PostCrawlCallback: def __init__(self, task: Task): self.task = task - def run(self): + if self.task.callback_args: + self.task.callback_args = json.loads(self.task.callback_args) + + def run(self, task_result: TaskResult, search: SearchEngine): raise NotImplementedError @@ -36,26 +41,33 @@ class RedditCallback(PostCrawlCallback): user_agent='github.com/simon987/od-database (by /u/Hexahedr_n)') self.reddit_bot = RedditBot("crawled.txt", 
reddit) - def run(self): + def run(self, task_result: TaskResult, search: SearchEngine): raise NotImplementedError class RedditPostCallback(RedditCallback): - def run(self): + def run(self, task_result: TaskResult, search: SearchEngine): print("Reddit post callback for task " + str(self.task)) - pass class RedditCommentCallback(RedditCallback): - def run(self): - print("Reddit comment callback for task " + str(self.task)) - pass + def run(self, task_result: TaskResult, search: SearchEngine): + + comment_id = self.task.callback_args["comment_id"] + print("Editing comment comment " + comment_id) + + search.refresh() # Make sure the newly indexed documents are available before commenting + stats = search.get_stats(self.task.website_id) + message = self.reddit_bot.get_comment(stats, self.task.website_id, + message="There you go! This website was crawled in `" + + str(int(task_result.end_time - task_result.start_time)) + "s`") + print(message) + self.reddit_bot.edit(self.reddit_bot.reddit.comment(comment_id), message) class DiscordCallback(PostCrawlCallback): - def run(self): + def run(self, task_result: TaskResult, search: SearchEngine): print("Discord callback for task " + str(self.task)) - pass diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 368aaac..11452c4 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -1,14 +1,16 @@ +import pycurl +from io import BytesIO + from crawl_server import logger from urllib.parse import unquote, urljoin import os from html.parser import HTMLParser from itertools import repeat from crawl_server.crawler import RemoteDirectory, File -import requests -from requests.exceptions import RequestException from multiprocessing.pool import ThreadPool import config from dateutil.parser import parse as parse_date +from pycurl import Curl import hashlib import urllib3 @@ -94,17 +96,36 @@ class HttpDirectory(RemoteDirectory): def __init__(self, url): super().__init__(url) - self.session = requests.Session() - self.session.headers = HttpDirectory.HEADERS - self.session.verify = False - self.session.max_redirects = 1 + self.curl = None + self.curl_head = None + self.init_curl() + + def init_curl(self): + + self.curl = Curl() + self.curl.setopt(self.curl.SSL_VERIFYPEER, 0) + self.curl.setopt(self.curl.SSL_VERIFYHOST, 0) + self.curl.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT) + + self.curl_head = self._curl_handle() + + @staticmethod + def _curl_handle(): + + curl_head = Curl() + curl_head.setopt(pycurl.SSL_VERIFYPEER, 0) + curl_head.setopt(pycurl.SSL_VERIFYHOST, 0) + curl_head.setopt(pycurl.NOBODY, 1) + curl_head.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT) + + return curl_head def list_dir(self, path): current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1] path_identifier = hashlib.md5(current_dir_name.encode()) path_url = urljoin(self.base_url, path, "") - body = self._stream_body(path_url) + body = self._fetch_body(path_url) anchors = self._parse_links(body) urls_to_request = [] @@ -139,7 +160,7 @@ class HttpDirectory(RemoteDirectory): if len(urls_to_request) > 150: # Many urls, use multi-threaded solution pool = ThreadPool(processes=10) - files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request)) + files = pool.starmap(self._request_file, zip(urls_to_request, repeat(self.base_url))) pool.close() for file in files: if file: @@ -147,67 +168,65 @@ class HttpDirectory(RemoteDirectory): else: # Too few urls to create thread pool for url in urls_to_request: - file = self._request_file(url) 
+ file = self._request_file(url, self.base_url) if file: yield file - def _request_file(self, url): + @staticmethod + def _request_file(url, base_url): retries = HttpDirectory.MAX_RETRIES while retries > 0: try: - r = self.session.head(url, allow_redirects=False, timeout=HttpDirectory.TIMEOUT) + curl = HttpDirectory._curl_handle() + raw_headers = BytesIO() + curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore")) + curl.setopt(pycurl.HEADERFUNCTION, raw_headers.write) + curl.perform() - stripped_url = url[len(self.base_url) - 1:] + stripped_url = url[len(base_url) - 1:] + headers = HttpDirectory._parse_dict_header(raw_headers.getvalue().decode("utf-8", errors="ignore")) + raw_headers.close() path, name = os.path.split(stripped_url) - date = r.headers.get("Last-Modified", "1970-01-01") + date = headers.get("Last-Modified", "1970-01-01") + curl.close() return File( path=unquote(path).strip("/"), name=unquote(name), - size=int(r.headers.get("Content-Length", -1)), + size=int(headers.get("Content-Length", -1)), mtime=int(parse_date(date).timestamp()), is_dir=False ) - except RequestException: - self.session.close() + except pycurl.error: retries -= 1 logger.debug("TimeoutError - _request_file") raise TimeoutError - def _stream_body(self, url: str): + def _fetch_body(self, url: str): retries = HttpDirectory.MAX_RETRIES while retries > 0: try: - r = self.session.get(url, stream=True, timeout=HttpDirectory.TIMEOUT) - for chunk in r.iter_content(chunk_size=8192): - try: - yield chunk.decode(r.encoding if r.encoding else "utf-8", errors="ignore") - except LookupError: - # Unsupported encoding - yield chunk.decode("utf-8", errors="ignore") - r.close() - return - except RequestException: - self.session.close() + content = BytesIO() + self.curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore")) + self.curl.setopt(pycurl.WRITEDATA, content) + self.curl.perform() + + return content.getvalue().decode("utf-8", errors="ignore") + except pycurl.error: + self.close() retries -= 1 - logger.debug("TimeoutError - _stream_body") + logger.debug("TimeoutError - _fetch_body") raise TimeoutError @staticmethod def _parse_links(body): parser = HTMLAnchorParser() - anchors = [] - - for chunk in body: - parser.feed(chunk) - for anchor in parser.anchors: - anchors.append(anchor) - - return anchors + parser.feed(body) + return parser.anchors @staticmethod def _isdir(link: Anchor): @@ -216,14 +235,14 @@ class HttpDirectory(RemoteDirectory): @staticmethod def _should_ignore(base_url, current_path, link: Anchor): - if urljoin(base_url, link.href) == urljoin(urljoin(base_url, current_path), "../"): + full_url = urljoin(base_url, link.href) + if full_url == urljoin(urljoin(base_url, current_path), "../") or full_url == base_url: return True if link.href.endswith(HttpDirectory.BLACK_LIST): return True # Ignore external links - full_url = urljoin(base_url, link.href) if not full_url.startswith(base_url): return True @@ -231,8 +250,18 @@ class HttpDirectory(RemoteDirectory): if "?" 
in link.href: return True + @staticmethod + def _parse_dict_header(raw): + headers = dict() + for line in raw.split("\r\n")[1:]: # Ignore first 'HTTP/1.0 200 OK' line + if line: + k, v = line.split(":", maxsplit=1) + headers[k.strip()] = v.strip() + + return headers + def close(self): - self.session.close() - logger.debug("Closing HTTPRemoteDirectory for " + self.base_url) + self.curl.close() + self.init_curl() diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py index 841e991..cee1e17 100644 --- a/crawl_server/task_manager.py +++ b/crawl_server/task_manager.py @@ -63,7 +63,7 @@ class TaskManager: } r = requests.post(config.SERVER_URL + "/task/upload", data=payload, files=files) - logger.info("RESPONSE: " + r.text) + logger.info("RESPONSE: " + r.text + "<" + str(r.status_code) + ">") except Exception as e: logger.error("Exception while sending file_list chunk: " + str(e)) pass @@ -75,7 +75,7 @@ class TaskManager: } r = requests.post(config.SERVER_URL + "/task/complete", data=payload) - logger.info("RESPONSE: " + r.text) + logger.info("RESPONSE: " + r.text + "<" + str(r.status_code) + ">") if os.path.exists(filename): os.remove(filename) diff --git a/database.py b/database.py index 51edd67..252a30f 100644 --- a/database.py +++ b/database.py @@ -407,12 +407,12 @@ class Database: cursor = conn.cursor() cursor.execute("SELECT id, website_id, url, priority, callback_type, callback_args FROM " - "Queue WHERE website_id=? AND assigned_crawler=?", (website_id, name)) + "Queue WHERE website_id=?", (website_id, )) task = cursor.fetchone() if task: - cursor.execute("DELETE FROM Queue WHERE website_id=? AND assigned_crawler=?", (website_id, name)) + cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id, )) conn.commit() return Task(task[1], task[2], task[3], task[4], task[5]) else: diff --git a/od_util.py b/od_util.py index 00057e0..3407fc9 100644 --- a/od_util.py +++ b/od_util.py @@ -5,7 +5,12 @@ import os import validators import re from ftplib import FTP -import config + +# TODO: find a better way to do this +try: + from . 
import config +except (ImportError, SystemError): + import config import urllib3 urllib3.disable_warnings() diff --git a/crawl_server/reddit_bot.py b/reddit_bot.py similarity index 71% rename from crawl_server/reddit_bot.py rename to reddit_bot.py index bf3c3e4..66f03f6 100644 --- a/crawl_server/reddit_bot.py +++ b/reddit_bot.py @@ -41,11 +41,23 @@ class RedditBot: while True: try: - # Double check has_crawled if not self.has_crawled(reddit_obj.id): - reddit_obj.reply(comment) + reply = reddit_obj.reply(comment) self.log_crawl(reddit_obj.id) print("Reply to " + reddit_obj.id) + return reply + break + except Exception as e: + print("Waiting 5 minutes: " + str(e)) + time.sleep(300) + continue + + def edit(self, reddit_comment, new_message): + + while True: + try: + reddit_comment.edit(new_message) + print("Edit comment " + reddit_comment.id) break except Exception as e: print("Waiting 5 minutes: " + str(e)) @@ -54,14 +66,13 @@ class RedditBot: @staticmethod def get_comment(stats: dict, website_id, message: str = ""): - comment = message + " \n" if len(message) > 0 else "" + comment = message + " \n" if message else "" - for stat in stats: - comment += stat + " \n" if len(stat) > 0 else "" - comment += RedditBot.format_stats(stats[stat]) + comment += RedditBot.format_stats(stats) - comment += "[Full Report](https://od-database.simon987.net/website/" + str(website_id) + "/)" - comment += " | [Link list](https://od-database.simon987.net/website/" + str(website_id) + "/links) \n" + comment += "[Full Report](https://od-db.the-eye.eu/website/" + str(website_id) + "/)" + comment += " | [Link list](https://od-db.the-eye.eu/website/" + str(website_id) + "/links)" + comment += " | [Source](https://github.com/simon987) \n" comment += "*** \n" comment += RedditBot.bottom_line @@ -74,7 +85,7 @@ class RedditBot: result += "File types | Count | Total Size\n" result += ":-- | :-- | :-- \n" counter = 0 - for mime in stats["mime_stats"]: + for mime in stats["ext_stats"]: result += mime[2] result += " | " + str(mime[1]) result += " | " + humanfriendly.format_size(mime[0]) + " \n" diff --git a/requirements.txt b/requirements.txt index 4bb0370..b5a81c6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,6 @@ flask_httpauth ujson urllib3 pyOpenSSL -pybloom-live \ No newline at end of file +pybloom-live +pycurl +lxml \ No newline at end of file diff --git a/search/search.py b/search/search.py index fac8dd5..f65f539 100644 --- a/search/search.py +++ b/search/search.py @@ -31,6 +31,9 @@ class SearchEngine: def get_stats(self, website_id: int, subdir: str = None): raise NotImplementedError + def refresh(self): + raise NotImplementedError + class ElasticSearchEngine(SearchEngine): SORT_ORDERS = { @@ -47,13 +50,14 @@ class ElasticSearchEngine(SearchEngine): self.index_name = index_name self.es = elasticsearch.Elasticsearch() - scheduler = BackgroundScheduler() - scheduler.add_job(self._generate_global_stats, "interval", seconds=60 * 15) - scheduler.start() - if not self.es.indices.exists(self.index_name): self.init() + def start_stats_scheduler(self): + scheduler = BackgroundScheduler() + scheduler.add_job(self._generate_global_stats, "interval", seconds=60 * 120) + scheduler.start() + def init(self): print("Elasticsearch first time setup") if self.es.indices.exists(self.index_name): @@ -122,8 +126,8 @@ class ElasticSearchEngine(SearchEngine): def import_json(self, in_lines, website_id: int): - import_every = 1000 - cooldown_time = 1 + import_every = 400 + cooldown_time = 0 docs = [] @@ -211,7 +215,7 @@ 
class ElasticSearchEngine(SearchEngine): } }, "size": per_page, "from": min(page * per_page, 10000 - per_page)}, - index=self.index_name, request_timeout=30) + index=self.index_name, request_timeout=35) return page @@ -229,7 +233,7 @@ class ElasticSearchEngine(SearchEngine): "ext_group": { "terms": { "field": "ext", - "size": 20 + "size": 12 }, "aggs": { "size": { @@ -246,7 +250,7 @@ class ElasticSearchEngine(SearchEngine): } }, "size": 0 - }, index=self.index_name, request_timeout=20) + }, index=self.index_name, request_timeout=30) stats = dict() stats["total_size"] = result["aggregations"]["total_size"]["value"] @@ -311,7 +315,7 @@ class ElasticSearchEngine(SearchEngine): }, "size": 0 - }, index=self.index_name, request_timeout=120) + }, index=self.index_name, request_timeout=240) total_stats = self.es.search(body={ "query": { @@ -333,7 +337,7 @@ class ElasticSearchEngine(SearchEngine): }, "size": 0 - }, index=self.index_name, request_timeout=120) + }, index=self.index_name, request_timeout=241) size_and_date_histogram = self.es.search(body={ "query": { @@ -354,21 +358,21 @@ class ElasticSearchEngine(SearchEngine): "sizes": { "histogram": { "field": "size", - "interval": 50000000, # 50Mb - "min_doc_count": 100 + "interval": 100000000, # 100Mb + "min_doc_count": 500 } }, "dates": { "date_histogram": { "field": "mtime", "interval": "1y", - "min_doc_count": 100, + "min_doc_count": 500, "format": "yyyy" } } }, "size": 0 - }, index=self.index_name, request_timeout=120) + }, index=self.index_name, request_timeout=242) website_scatter = self.es.search(body={ "query": { @@ -384,7 +388,7 @@ class ElasticSearchEngine(SearchEngine): "websites": { "terms": { "field": "website_id", - "size": 500 # TODO: Figure out what size is appropriate + "size": 400 # TODO: Figure out what size is appropriate }, "aggs": { "size": { @@ -396,9 +400,9 @@ class ElasticSearchEngine(SearchEngine): } }, "size": 0 - }, index=self.index_name, request_timeout=120) + }, index=self.index_name, request_timeout=243) - es_stats = self.es.indices.stats(self.index_name, request_timeout=120) + es_stats = self.es.indices.stats(self.index_name, request_timeout=244) stats = dict() stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"] @@ -460,3 +464,6 @@ class ElasticSearchEngine(SearchEngine): for website in websites: if website not in non_empty_websites: yield website + + def refresh(self): + self.es.indices.refresh(self.index_name) diff --git a/templates/home.html b/templates/home.html index 33881e5..29d30b4 100644 --- a/templates/home.html +++ b/templates/home.html @@ -23,13 +23,23 @@
Search
-
+ -
- +
+
+ +
+
+ {% if show_captcha %} + + + + {% else %} + + {% endif %} +
-
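The `home.html` hunk above wraps the search form in a `{% if show_captcha %}` guard, matching the new `CAPTCHA_SEARCH` flag that `app.py` now passes to the template. The same flag gates the reworked `/search` route on the server side; since that route is hard to follow in diff form, here is a condensed sketch of the check it adds (`captcha_ok()` is an illustrative helper name, not part of the diff, which inlines this logic):

```python
# Minimal sketch of the search-CAPTCHA gate added to app.py (assumes the same
# config.py keys the README now documents: CAPTCHA_SEARCH, CAPTCHA_S_SITE_KEY, ...).
from flask import Flask, request
from flask_recaptcha import ReCaptcha
import config

app = Flask(__name__)

# Second ReCaptcha instance, created only when search CAPTCHAs are enabled.
recaptcha_search = (ReCaptcha(app=app,
                              site_key=config.CAPTCHA_S_SITE_KEY,
                              secret_key=config.CAPTCHA_S_SECRET_KEY)
                    if config.CAPTCHA_SEARCH else None)


def captcha_ok(req) -> bool:
    """Allow a query when CAPTCHA_SEARCH is off, or when the posted token verifies."""
    if not config.CAPTCHA_SEARCH:
        return True
    # The widget submits its token as `g-recaptcha-response`.
    return recaptcha_search.verify(req.args.get("g-recaptcha-response", ""))
```

In the actual route, `db.log_search()` and `searchEngine.search()` only run when this check passes; otherwise the user gets an "Invalid captcha" flash message and no hits.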
diff --git a/templates/layout.html b/templates/layout.html index b6cefb3..2513392 100644 --- a/templates/layout.html +++ b/templates/layout.html @@ -32,7 +32,7 @@ {# Size #} @@ -90,9 +91,19 @@
+ {# Search button #}
- + + {% if show_captcha %} + + + + {% else %} + + {% endif %}
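The `layout.html` hunk above applies the same `show_captcha` guard to the main search button. For scripted queries, the companion change in `app.py` adds a token-authenticated `POST /api/search` endpoint that skips the widget entirely; a minimal client sketch (the host and token are placeholders, and the field names are the ones `api_search()` reads from the JSON body):

```python
import requests

API_URL = "https://example.com/api/search"  # placeholder; use your own deployment
API_TOKEN = "..."                           # an API token known to that instance

payload = {
    "token": API_TOKEN,
    "query": "ubuntu iso",
    "page": 0,
    "per_page": 50,
    "sort_order": "score",
    "extensions": ["iso"],                         # [] to accept any extension
    "size_min": 0, "size_max": 0,
    "match_all": True,
    "fields": ["path", "name^5", "name.nGram^2"],  # same fields the web UI queries
    "date_min": 0, "date_max": 0,
}

r = requests.post(API_URL, json=payload)
# api_search() returns json.dumps(hits) on success, or a plain error message
# when the query is rejected as invalid.
print(r.text)
```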
@@ -156,12 +167,10 @@ {% if results["hits"]["total"] > (p + 1) * per_page %} - Next + {% endif %} {% if p > 0 %} - Previous + {% endif %} @@ -174,6 +183,7 @@ @@ -245,6 +255,16 @@ } }); + //Next button + function nextPage() { + document.getElementById("page").value = parseInt(document.getElementById("page").value) + 1; + grecaptcha.execute(); + } + function prevPage() { + document.getElementById("page").value = parseInt(document.getElementById("page").value) - 1; + grecaptcha.execute(); + } + diff --git a/test/files/apache_table.html b/test/files/apache_table.html new file mode 100644 index 0000000..06e21c9 --- /dev/null +++ b/test/files/apache_table.html @@ -0,0 +1,21 @@ + + + + Index of /Public/bootstrap + + +

Index of /Public/bootstrap

+ + + + + + + + + + + +
[ICO]  Name  Last modified  Size  Description

[PARENTDIR]Parent Directory   -  
[   ]bower.json 2017-04-05 01:45 1.0K 
[DIR]css/ 2017-09-07 18:03 -  
[DIR]image/ 2017-09-07 18:03 -  
[DIR]js/ 2017-09-07 18:03 -  
[DIR]less/ 2017-09-07 18:03 -  
[   ]package.json 2017-04-05 01:45 666  

+ + diff --git a/test/files/lighttpd_table.html b/test/files/lighttpd_table.html new file mode 100644 index 0000000..fe58d9f --- /dev/null +++ b/test/files/lighttpd_table.html @@ -0,0 +1,47 @@ + + + + + Index of /gentoo/releases/ + + + +

Index of /gentoo/releases/

+
+ + + + + + + + + + + + + + + + + + + +
Name               Last Modified          Size  Type
Parent Directory/                         -     Directory
alpha/             2009-Aug-09 03:47:09   -     Directory
amd64/             2017-Feb-09 18:50:44   -     Directory
arm/               2014-Apr-29 13:42:06   -     Directory
hppa/              2014-Apr-29 13:42:12   -     Directory
ia64/              2009-Aug-09 03:47:09   -     Directory
mips/              2011-Apr-28 23:38:14   -     Directory
ppc/               2014-Apr-29 13:41:00   -     Directory
s390/              2014-Apr-29 13:41:06   -     Directory
sh/                2014-Apr-29 13:41:16   -     Directory
snapshots/         2009-Apr-16 05:08:17   -     Directory
sparc/             2009-Aug-09 03:47:09   -     Directory
x86/               2016-Jul-04 21:14:19   -     Directory
README             2014-Jun-22 05:18:43   0.1K  application/octet-stream
verify-digests.sh  2016-Jun-10 02:40:33   4.5K  application/octet-stream
+
+
lighttpd/1.4.29
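Listings like the ones in these fixtures only provide the link targets; file metadata comes from HTTP headers, and with pycurl (unlike requests) the header block arrives as raw bytes through `HEADERFUNCTION`. The new `_parse_dict_header()` and `_request_file()` turn that block into `Content-Length` and `Last-Modified` values; a self-contained sketch of the same idea (the sample header block below is made up for illustration):

```python
from io import BytesIO
from dateutil.parser import parse as parse_date  # same alias remote_http.py uses

RAW = (b"HTTP/1.1 200 OK\r\n"
       b"Content-Length: 1017\r\n"
       b"Last-Modified: Wed, 05 Apr 2017 01:45:00 GMT\r\n"
       b"\r\n")


def parse_dict_header(raw: str) -> dict:
    """Same approach as HttpDirectory._parse_dict_header(): skip the status line,
    then split each 'Name: value' pair on the first colon only."""
    headers = {}
    for line in raw.split("\r\n")[1:]:
        if line:
            k, v = line.split(":", maxsplit=1)
            headers[k.strip()] = v.strip()
    return headers


buf = BytesIO(RAW)  # stands in for the buffer handed to pycurl via HEADERFUNCTION
headers = parse_dict_header(buf.getvalue().decode("utf-8", errors="ignore"))
size = int(headers.get("Content-Length", -1))
mtime = int(parse_date(headers.get("Last-Modified", "1970-01-01")).timestamp())
print(size, mtime)
```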
+ + diff --git a/test/files/nginx_pre.html b/test/files/nginx_pre.html new file mode 100644 index 0000000..5bbd35e --- /dev/null +++ b/test/files/nginx_pre.html @@ -0,0 +1,11 @@ + +Index of /test/To process/Android nak newer/ + +

Index of /test/To process/Android nak newer/


../
+DCIM/                                              31-Jul-2018 00:26                   -
+Pictures/                                          31-Jul-2018 00:26                   -
+1529682937580.webm                                 25-Jun-2018 03:58             3768511
+1529716051300.webm                                 25-Jun-2018 04:01             3181867
+1529725898345.webm                                 25-Jun-2018 04:05             4138908
+
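The new `test/webserver.py` at the end of this diff serves `files/apache_table.html` on `http://localhost:8888/test1/`, which makes it easy to exercise the pycurl fetch path that replaces requests in `crawl_server/remote_http.py`. A rough, self-contained sketch of that flow; it mirrors `_fetch_body()` and `_parse_links()` rather than importing them, and `AnchorCollector` only loosely imitates the project's `HTMLAnchorParser`:

```python
# Start the fixture server first:  python test/webserver.py
from html.parser import HTMLParser
from io import BytesIO

import pycurl


class AnchorCollector(HTMLParser):
    """Collect href attributes, loosely mirroring HTMLAnchorParser in remote_http.py."""

    def __init__(self):
        super().__init__()
        self.anchors = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.anchors.append(value)


buf = BytesIO()
curl = pycurl.Curl()
curl.setopt(pycurl.URL, b"http://localhost:8888/test1/")
curl.setopt(pycurl.SSL_VERIFYPEER, 0)   # mirrors the options HttpDirectory sets
curl.setopt(pycurl.SSL_VERIFYHOST, 0)
curl.setopt(pycurl.TIMEOUT, 25)         # an arbitrary timeout for the local server
curl.setopt(pycurl.WRITEDATA, buf)      # same BytesIO pattern _fetch_body() now uses
curl.perform()
curl.close()

collector = AnchorCollector()
collector.feed(buf.getvalue().decode("utf-8", errors="ignore"))
print(collector.anchors)  # should include entries like "css/", "js/" and "bower.json"
```

Buffering the whole page with `WRITEDATA` is also what lets `_fetch_body()` return one decoded string instead of the chunked streaming loop the requests-based `_stream_body()` needed.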

+ diff --git a/test/webserver.py b/test/webserver.py new file mode 100644 index 0000000..a3a1c14 --- /dev/null +++ b/test/webserver.py @@ -0,0 +1,13 @@ +from flask import Flask, send_file + +app = Flask(__name__) + + +@app.route("/test1/") +def test1(): + return send_file("files/apache_table.html") + + +if __name__ == '__main__': + app.run("0.0.0.0", port=8888, threaded=True) +