diff --git a/README.md b/README.md
index a10a2bd..b53d50a 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@ Suggestions/concerns/PRs are welcome
## Installation
Assuming you have Python 3 and git installed:
```bash
+sudo apt install libssl-dev libcurl4-openssl-dev
git clone https://github.com/simon987/od-database
cd od-database
sudo pip3 install -r requirements.txt
@@ -14,8 +15,11 @@ Create `/config.py` and fill out the parameters. Sample config:
# Leave default values for no CAPTCHAs
CAPTCHA_LOGIN = False
CAPTCHA_SUBMIT = False
+CAPTCHA_SEARCH = False
CAPTCHA_SITE_KEY = ""
CAPTCHA_SECRET_KEY = ""
+CAPTCHA_S_SITE_KEY = ""
+CAPTCHA_S_SECRET_KEY = ""
# Flask secret key for sessions
FLASK_SECRET = ""
diff --git a/app.py b/app.py
index 75f8aec..3b2b622 100644
--- a/app.py
+++ b/app.py
@@ -5,7 +5,6 @@ from urllib.parse import urlparse
import os
import time
import datetime
-import itertools
from database import Database, Website, InvalidQueryException
from flask_recaptcha import ReCaptcha
import od_util
@@ -13,6 +12,7 @@ import config
from flask_caching import Cache
from tasks import TaskManager, Task, TaskResult
from search.search import ElasticSearchEngine
+from callbacks import PostCrawlCallbackFactory
app = Flask(__name__)
if config.CAPTCHA_SUBMIT or config.CAPTCHA_LOGIN:
@@ -21,6 +21,12 @@ if config.CAPTCHA_SUBMIT or config.CAPTCHA_LOGIN:
secret_key=config.CAPTCHA_SECRET_KEY)
else:
recaptcha = None
+if config.CAPTCHA_SEARCH:
+ recaptcha_search = ReCaptcha(app=app,
+ site_key=config.CAPTCHA_S_SITE_KEY,
+ secret_key=config.CAPTCHA_S_SECRET_KEY)
+else:
+ recaptcha_search = None
app.secret_key = config.FLASK_SECRET
db = Database("db.sqlite3")
cache = Cache(app, config={'CACHE_TYPE': 'simple'})
@@ -30,6 +36,7 @@ app.jinja_env.globals.update(get_mime=od_util.get_category)
taskManager = TaskManager()
searchEngine = ElasticSearchEngine("od-database")
+searchEngine.start_stats_scheduler()
@app.template_filter("date_format")
@@ -50,7 +57,7 @@ def from_timestamp(value):
@app.route("/dl")
def downloads():
try:
- export_file_stats = os.stat("static/out.csv.xz")
+ export_file_stats = os.stat("static/out.csv.lzma")
except FileNotFoundError:
print("No export file")
export_file_stats = None
@@ -236,79 +243,86 @@ def admin_rescan_website(website_id):
@app.route("/search")
def search():
- q = request.args.get("q") if "q" in request.args else ""
- sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score"
- page = request.args.get("p") if "p" in request.args else "0"
- page = int(page) if page.isdigit() else 0
+ q = request.args.get("q") if "q" in request.args else ""
+ sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score"
- per_page = request.args.get("per_page") if "per_page" in request.args else "50"
- per_page = int(per_page) if per_page.isdigit() else "50"
- per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50
+ page = request.args.get("p") if "p" in request.args else "0"
+ page = int(page) if page.isdigit() else 0
- extensions = request.args.get("ext") if "ext" in request.args else None
- extensions = [ext.strip().strip(".").lower() for ext in extensions.split(",")] if extensions else []
+ per_page = request.args.get("per_page") if "per_page" in request.args else "50"
+ per_page = int(per_page) if per_page.isdigit() else "50"
+ per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50
- size_min = request.args.get("size_min") if "size_min" in request.args else "size_min"
- size_min = int(size_min) if size_min.isdigit() else 0
- size_max = request.args.get("size_max") if "size_max" in request.args else "size_max"
- size_max = int(size_max) if size_max.isdigit() else 0
+ extensions = request.args.get("ext") if "ext" in request.args else None
+ extensions = [ext.strip().strip(".").lower() for ext in extensions.split(",")] if extensions else []
- date_min = request.args.get("date_min") if "date_min" in request.args else "date_min"
- date_min = int(date_min) if date_min.isdigit() else 0
- date_max = request.args.get("date_max") if "date_max" in request.args else "date_max"
- date_max = int(date_max) if date_max.isdigit() else 0
+ size_min = request.args.get("size_min") if "size_min" in request.args else "size_min"
+ size_min = int(size_min) if size_min.isdigit() else 0
+ size_max = request.args.get("size_max") if "size_max" in request.args else "size_max"
+ size_max = int(size_max) if size_max.isdigit() else 0
- match_all = "all" in request.args
+ date_min = request.args.get("date_min") if "date_min" in request.args else "date_min"
+ date_min = int(date_min) if date_min.isdigit() else 0
+ date_max = request.args.get("date_max") if "date_max" in request.args else "date_max"
+ date_max = int(date_max) if date_max.isdigit() else 0
- field_name = "field_name" in request.args
- field_trigram = "field_trigram" in request.args
- field_path = "field_path" in request.args
+ match_all = "all" in request.args
- if not field_name and not field_trigram and not field_path:
- # If no fields are selected, search in all
- field_name = field_path = field_trigram = True
+ field_name = "field_name" in request.args
+ field_trigram = "field_trigram" in request.args
+ field_path = "field_path" in request.args
- fields = []
- if field_path:
- fields.append("path")
- if field_name:
- fields.append("name^5")
- if field_trigram:
- fields.append("name.nGram^2")
+ if not field_name and not field_trigram and not field_path:
+ # If no fields are selected, search in all
+ field_name = field_path = field_trigram = True
- if len(q) >= 3:
+ fields = []
+ if field_path:
+ fields.append("path")
+ if field_name:
+ fields.append("name^5")
+ if field_trigram:
+ fields.append("name.nGram^2")
- db.log_search(request.remote_addr,
- request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None,
- q, extensions, page)
+ if len(q) >= 3:
- try:
- hits = searchEngine.search(q, page, per_page, sort_order,
- extensions, size_min, size_max, match_all, fields, date_min, date_max)
- hits = db.join_website_on_search_result(hits)
- except InvalidQueryException as e:
- flash("Invalid query: " + str(e), "warning")
- return redirect("/search")
- except Exception:
- flash("Query failed, this could mean that the search server is overloaded or is not reachable. "
- "Please try again later", "danger")
+ response = request.args.get("g-recaptcha-response", "")
+ if not config.CAPTCHA_SEARCH or recaptcha_search.verify(response):
+ db.log_search(request.remote_addr,
+ request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None,
+ q, extensions, page)
+
+ try:
+ hits = searchEngine.search(q, page, per_page, sort_order,
+ extensions, size_min, size_max, match_all, fields, date_min, date_max)
+ hits = db.join_website_on_search_result(hits)
+ except InvalidQueryException as e:
+ flash("Invalid query: " + str(e), "warning")
+ return redirect("/search")
+ except Exception:
+ flash("Query failed, this could mean that the search server is overloaded or is not reachable. "
+ "Please try again later", "danger")
+ hits = None
+ else:
+                flash("Error: Invalid captcha, please try again", "danger")
+ hits = None
+
+ else:
hits = None
- else:
- hits = None
-
- return render_template("search.html",
- results=hits,
- q=q,
- p=page, per_page=per_page,
- sort_order=sort_order,
- results_set=config.RESULTS_PER_PAGE,
- extensions=",".join(extensions),
- size_min=size_min, size_max=size_max,
- match_all=match_all,
- field_trigram=field_trigram, field_path=field_path, field_name=field_name,
- date_min=date_min, date_max=date_max)
+ return render_template("search.html",
+ results=hits,
+ q=q,
+ p=page, per_page=per_page,
+ sort_order=sort_order,
+ results_set=config.RESULTS_PER_PAGE,
+ extensions=",".join(extensions),
+ size_min=size_min, size_max=size_max,
+ match_all=match_all,
+ field_trigram=field_trigram, field_path=field_path, field_name=field_name,
+ date_min=date_min, date_max=date_max,
+ show_captcha=config.CAPTCHA_SEARCH, recaptcha=recaptcha_search)
@app.route("/contribute")
@@ -324,7 +338,8 @@ def home():
stats["website_count"] = len(db.get_all_websites())
except:
stats = {}
- return render_template("home.html", stats=stats)
+ return render_template("home.html", stats=stats,
+ show_captcha=config.CAPTCHA_SEARCH, recaptcha=recaptcha_search)
@app.route("/submit")
@@ -565,7 +580,11 @@ def api_complete_task():
if filename and os.path.exists(filename):
os.remove(filename)
- # TODO: handle callback here
+ # Handle task callback
+ callback = PostCrawlCallbackFactory.get_callback(task)
+ if callback:
+ callback.run(task_result, searchEngine)
+
return "Successfully logged task result and indexed files"
else:
@@ -659,7 +678,7 @@ def api_task_enqueue():
request.json["url"],
request.json["priority"],
request.json["callback_type"],
- request.json["callback_args"]
+ json.dumps(request.json["callback_args"])
)
taskManager.queue_task(task)
return ""
@@ -705,5 +724,38 @@ def api_random_website():
return abort(403)
+@app.route("/api/search", methods=["POST"])
+def api_search():
+
+ try:
+ token = request.json["token"]
+ except KeyError:
+ return abort(400)
+
+ name = db.check_api_token(token)
+
+ if name:
+
+ try:
+ hits = searchEngine.search(
+ request.json["query"],
+ request.json["page"], request.json["per_page"],
+ request.json["sort_order"],
+ request.json["extensions"],
+ request.json["size_min"], request.json["size_max"],
+ request.json["match_all"],
+ request.json["fields"],
+ request.json["date_min"], request.json["date_max"]
+ )
+
+ hits = db.join_website_on_search_result(hits)
+ return json.dumps(hits)
+
+ except InvalidQueryException as e:
+ return str(e)
+ else:
+ return abort(403)
+
+
if __name__ == '__main__':
app.run("0.0.0.0", port=12345, threaded=True)
diff --git a/callbacks.py b/callbacks.py
index 89bda6c..647c963 100644
--- a/callbacks.py
+++ b/callbacks.py
@@ -1,6 +1,8 @@
-from tasks import Task
-from crawl_server.reddit_bot import RedditBot
+from tasks import Task, TaskResult
+from reddit_bot import RedditBot
import praw
+from search.search import SearchEngine
+import json
class PostCrawlCallback:
@@ -8,7 +10,10 @@ class PostCrawlCallback:
def __init__(self, task: Task):
self.task = task
- def run(self):
+ if self.task.callback_args:
+ self.task.callback_args = json.loads(self.task.callback_args)
+
+ def run(self, task_result: TaskResult, search: SearchEngine):
raise NotImplementedError
@@ -36,26 +41,33 @@ class RedditCallback(PostCrawlCallback):
user_agent='github.com/simon987/od-database (by /u/Hexahedr_n)')
self.reddit_bot = RedditBot("crawled.txt", reddit)
- def run(self):
+ def run(self, task_result: TaskResult, search: SearchEngine):
raise NotImplementedError
class RedditPostCallback(RedditCallback):
- def run(self):
+ def run(self, task_result: TaskResult, search: SearchEngine):
print("Reddit post callback for task " + str(self.task))
- pass
class RedditCommentCallback(RedditCallback):
- def run(self):
- print("Reddit comment callback for task " + str(self.task))
- pass
+ def run(self, task_result: TaskResult, search: SearchEngine):
+
+ comment_id = self.task.callback_args["comment_id"]
+ print("Editing comment comment " + comment_id)
+
+ search.refresh() # Make sure the newly indexed documents are available before commenting
+ stats = search.get_stats(self.task.website_id)
+ message = self.reddit_bot.get_comment(stats, self.task.website_id,
+ message="There you go! This website was crawled in `" +
+ str(int(task_result.end_time - task_result.start_time)) + "s`")
+ print(message)
+ self.reddit_bot.edit(self.reddit_bot.reddit.comment(comment_id), message)
class DiscordCallback(PostCrawlCallback):
- def run(self):
+ def run(self, task_result: TaskResult, search: SearchEngine):
print("Discord callback for task " + str(self.task))
- pass
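
Since `/api/task/enqueue` now stores `callback_args` as a JSON string and `PostCrawlCallback.__init__` decodes it, structured arguments survive the round trip to the crawl server. A hedged sketch of an enqueue payload for the Reddit comment callback: the route path is inferred from the view-function name, and the `token`, `website_id`, and `"reddit_comment"` callback type are assumptions; `comment_id` is the key `RedditCommentCallback.run()` actually reads.

```python
import requests

# Sketch only: token/website_id and the "reddit_comment" callback_type value
# are assumptions; callback_args is JSON-encoded server-side by api_task_enqueue().
payload = {
    "token": "YOUR_API_TOKEN",
    "website_id": 123,
    "url": "http://example.com/files/",
    "priority": 1,
    "callback_type": "reddit_comment",
    "callback_args": {"comment_id": "e4abc12"},
}
requests.post("http://localhost:12345/api/task/enqueue", json=payload)
```
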
diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py
index 368aaac..11452c4 100644
--- a/crawl_server/remote_http.py
+++ b/crawl_server/remote_http.py
@@ -1,14 +1,16 @@
+import pycurl
+from io import BytesIO
+
from crawl_server import logger
from urllib.parse import unquote, urljoin
import os
from html.parser import HTMLParser
from itertools import repeat
from crawl_server.crawler import RemoteDirectory, File
-import requests
-from requests.exceptions import RequestException
from multiprocessing.pool import ThreadPool
import config
from dateutil.parser import parse as parse_date
+from pycurl import Curl
import hashlib
import urllib3
@@ -94,17 +96,36 @@ class HttpDirectory(RemoteDirectory):
def __init__(self, url):
super().__init__(url)
- self.session = requests.Session()
- self.session.headers = HttpDirectory.HEADERS
- self.session.verify = False
- self.session.max_redirects = 1
+ self.curl = None
+ self.curl_head = None
+ self.init_curl()
+
+ def init_curl(self):
+
+ self.curl = Curl()
+ self.curl.setopt(self.curl.SSL_VERIFYPEER, 0)
+ self.curl.setopt(self.curl.SSL_VERIFYHOST, 0)
+ self.curl.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT)
+
+ self.curl_head = self._curl_handle()
+
+ @staticmethod
+ def _curl_handle():
+
+ curl_head = Curl()
+ curl_head.setopt(pycurl.SSL_VERIFYPEER, 0)
+ curl_head.setopt(pycurl.SSL_VERIFYHOST, 0)
+ curl_head.setopt(pycurl.NOBODY, 1)
+ curl_head.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT)
+
+ return curl_head
def list_dir(self, path):
current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1]
path_identifier = hashlib.md5(current_dir_name.encode())
path_url = urljoin(self.base_url, path, "")
- body = self._stream_body(path_url)
+ body = self._fetch_body(path_url)
anchors = self._parse_links(body)
urls_to_request = []
@@ -139,7 +160,7 @@ class HttpDirectory(RemoteDirectory):
if len(urls_to_request) > 150:
# Many urls, use multi-threaded solution
pool = ThreadPool(processes=10)
- files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request))
+ files = pool.starmap(self._request_file, zip(urls_to_request, repeat(self.base_url)))
pool.close()
for file in files:
if file:
@@ -147,67 +168,65 @@ class HttpDirectory(RemoteDirectory):
else:
# Too few urls to create thread pool
for url in urls_to_request:
- file = self._request_file(url)
+ file = self._request_file(url, self.base_url)
if file:
yield file
- def _request_file(self, url):
+ @staticmethod
+ def _request_file(url, base_url):
retries = HttpDirectory.MAX_RETRIES
while retries > 0:
try:
- r = self.session.head(url, allow_redirects=False, timeout=HttpDirectory.TIMEOUT)
+ curl = HttpDirectory._curl_handle()
+ raw_headers = BytesIO()
+ curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore"))
+ curl.setopt(pycurl.HEADERFUNCTION, raw_headers.write)
+ curl.perform()
- stripped_url = url[len(self.base_url) - 1:]
+ stripped_url = url[len(base_url) - 1:]
+ headers = HttpDirectory._parse_dict_header(raw_headers.getvalue().decode("utf-8", errors="ignore"))
+ raw_headers.close()
path, name = os.path.split(stripped_url)
- date = r.headers.get("Last-Modified", "1970-01-01")
+ date = headers.get("Last-Modified", "1970-01-01")
+ curl.close()
return File(
path=unquote(path).strip("/"),
name=unquote(name),
- size=int(r.headers.get("Content-Length", -1)),
+ size=int(headers.get("Content-Length", -1)),
mtime=int(parse_date(date).timestamp()),
is_dir=False
)
- except RequestException:
- self.session.close()
+ except pycurl.error:
retries -= 1
logger.debug("TimeoutError - _request_file")
raise TimeoutError
- def _stream_body(self, url: str):
+ def _fetch_body(self, url: str):
retries = HttpDirectory.MAX_RETRIES
while retries > 0:
try:
- r = self.session.get(url, stream=True, timeout=HttpDirectory.TIMEOUT)
- for chunk in r.iter_content(chunk_size=8192):
- try:
- yield chunk.decode(r.encoding if r.encoding else "utf-8", errors="ignore")
- except LookupError:
- # Unsupported encoding
- yield chunk.decode("utf-8", errors="ignore")
- r.close()
- return
- except RequestException:
- self.session.close()
+ content = BytesIO()
+ self.curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore"))
+ self.curl.setopt(pycurl.WRITEDATA, content)
+ self.curl.perform()
+
+ return content.getvalue().decode("utf-8", errors="ignore")
+ except pycurl.error:
+ self.close()
retries -= 1
- logger.debug("TimeoutError - _stream_body")
+ logger.debug("TimeoutError - _fetch_body")
raise TimeoutError
@staticmethod
def _parse_links(body):
parser = HTMLAnchorParser()
- anchors = []
-
- for chunk in body:
- parser.feed(chunk)
- for anchor in parser.anchors:
- anchors.append(anchor)
-
- return anchors
+ parser.feed(body)
+ return parser.anchors
@staticmethod
def _isdir(link: Anchor):
@@ -216,14 +235,14 @@ class HttpDirectory(RemoteDirectory):
@staticmethod
def _should_ignore(base_url, current_path, link: Anchor):
- if urljoin(base_url, link.href) == urljoin(urljoin(base_url, current_path), "../"):
+ full_url = urljoin(base_url, link.href)
+ if full_url == urljoin(urljoin(base_url, current_path), "../") or full_url == base_url:
return True
if link.href.endswith(HttpDirectory.BLACK_LIST):
return True
# Ignore external links
- full_url = urljoin(base_url, link.href)
if not full_url.startswith(base_url):
return True
@@ -231,8 +250,18 @@ class HttpDirectory(RemoteDirectory):
if "?" in link.href:
return True
+ @staticmethod
+ def _parse_dict_header(raw):
+ headers = dict()
+ for line in raw.split("\r\n")[1:]: # Ignore first 'HTTP/1.0 200 OK' line
+ if line:
+ k, v = line.split(":", maxsplit=1)
+ headers[k.strip()] = v.strip()
+
+ return headers
+
def close(self):
- self.session.close()
- logger.debug("Closing HTTPRemoteDirectory for " + self.base_url)
+ self.curl.close()
+ self.init_curl()
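
The header handling above relies on pycurl writing raw response headers into a buffer via `HEADERFUNCTION`, then splitting them in `_parse_dict_header`. A standalone sketch of the same idiom (the example.com URL is a placeholder):

```python
import pycurl
from io import BytesIO

raw_headers = BytesIO()
curl = pycurl.Curl()
curl.setopt(pycurl.URL, "http://example.com/")
curl.setopt(pycurl.NOBODY, 1)                       # HEAD-style request, no body download
curl.setopt(pycurl.HEADERFUNCTION, raw_headers.write)
curl.perform()
curl.close()

# First line is the status line ("HTTP/1.1 200 OK"); the rest are "Key: value" pairs
for line in raw_headers.getvalue().decode(errors="ignore").split("\r\n")[1:]:
    if line:
        k, v = line.split(":", maxsplit=1)
        print(k.strip(), "=>", v.strip())
```
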
diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py
index 841e991..cee1e17 100644
--- a/crawl_server/task_manager.py
+++ b/crawl_server/task_manager.py
@@ -63,7 +63,7 @@ class TaskManager:
}
r = requests.post(config.SERVER_URL + "/task/upload", data=payload, files=files)
- logger.info("RESPONSE: " + r.text)
+ logger.info("RESPONSE: " + r.text + "<" + str(r.status_code) + ">")
except Exception as e:
logger.error("Exception while sending file_list chunk: " + str(e))
pass
@@ -75,7 +75,7 @@ class TaskManager:
}
r = requests.post(config.SERVER_URL + "/task/complete", data=payload)
- logger.info("RESPONSE: " + r.text)
+ logger.info("RESPONSE: " + r.text + "<" + str(r.status_code) + ">")
if os.path.exists(filename):
os.remove(filename)
diff --git a/database.py b/database.py
index 51edd67..252a30f 100644
--- a/database.py
+++ b/database.py
@@ -407,12 +407,12 @@ class Database:
cursor = conn.cursor()
cursor.execute("SELECT id, website_id, url, priority, callback_type, callback_args FROM "
- "Queue WHERE website_id=? AND assigned_crawler=?", (website_id, name))
+ "Queue WHERE website_id=?", (website_id, ))
task = cursor.fetchone()
if task:
- cursor.execute("DELETE FROM Queue WHERE website_id=? AND assigned_crawler=?", (website_id, name))
+ cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id, ))
conn.commit()
return Task(task[1], task[2], task[3], task[4], task[5])
else:
diff --git a/od_util.py b/od_util.py
index 00057e0..3407fc9 100644
--- a/od_util.py
+++ b/od_util.py
@@ -5,7 +5,12 @@ import os
import validators
import re
from ftplib import FTP
-import config
+
+# TODO: find a better way to do this
+try:
+ from . import config
+except (ImportError, SystemError):
+ import config
import urllib3
urllib3.disable_warnings()
diff --git a/crawl_server/reddit_bot.py b/reddit_bot.py
similarity index 71%
rename from crawl_server/reddit_bot.py
rename to reddit_bot.py
index bf3c3e4..66f03f6 100644
--- a/crawl_server/reddit_bot.py
+++ b/reddit_bot.py
@@ -41,11 +41,23 @@ class RedditBot:
while True:
try:
- # Double check has_crawled
if not self.has_crawled(reddit_obj.id):
- reddit_obj.reply(comment)
+ reply = reddit_obj.reply(comment)
self.log_crawl(reddit_obj.id)
print("Reply to " + reddit_obj.id)
+ return reply
+ break
+ except Exception as e:
+ print("Waiting 5 minutes: " + str(e))
+ time.sleep(300)
+ continue
+
+ def edit(self, reddit_comment, new_message):
+
+ while True:
+ try:
+ reddit_comment.edit(new_message)
+ print("Edit comment " + reddit_comment.id)
break
except Exception as e:
print("Waiting 5 minutes: " + str(e))
@@ -54,14 +66,13 @@ class RedditBot:
@staticmethod
def get_comment(stats: dict, website_id, message: str = ""):
- comment = message + " \n" if len(message) > 0 else ""
+ comment = message + " \n" if message else ""
- for stat in stats:
- comment += stat + " \n" if len(stat) > 0 else ""
- comment += RedditBot.format_stats(stats[stat])
+ comment += RedditBot.format_stats(stats)
- comment += "[Full Report](https://od-database.simon987.net/website/" + str(website_id) + "/)"
- comment += " | [Link list](https://od-database.simon987.net/website/" + str(website_id) + "/links) \n"
+ comment += "[Full Report](https://od-db.the-eye.eu/website/" + str(website_id) + "/)"
+ comment += " | [Link list](https://od-db.the-eye.eu/website/" + str(website_id) + "/links)"
+ comment += " | [Source](https://github.com/simon987) \n"
comment += "*** \n"
comment += RedditBot.bottom_line
@@ -74,7 +85,7 @@ class RedditBot:
result += "File types | Count | Total Size\n"
result += ":-- | :-- | :-- \n"
counter = 0
- for mime in stats["mime_stats"]:
+ for mime in stats["ext_stats"]:
result += mime[2]
result += " | " + str(mime[1])
result += " | " + humanfriendly.format_size(mime[0]) + " \n"
diff --git a/requirements.txt b/requirements.txt
index 4bb0370..b5a81c6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,4 +16,6 @@ flask_httpauth
ujson
urllib3
pyOpenSSL
-pybloom-live
\ No newline at end of file
+pybloom-live
+pycurl
+lxml
\ No newline at end of file
diff --git a/search/search.py b/search/search.py
index fac8dd5..f65f539 100644
--- a/search/search.py
+++ b/search/search.py
@@ -31,6 +31,9 @@ class SearchEngine:
def get_stats(self, website_id: int, subdir: str = None):
raise NotImplementedError
+ def refresh(self):
+ raise NotImplementedError
+
class ElasticSearchEngine(SearchEngine):
SORT_ORDERS = {
@@ -47,13 +50,14 @@ class ElasticSearchEngine(SearchEngine):
self.index_name = index_name
self.es = elasticsearch.Elasticsearch()
- scheduler = BackgroundScheduler()
- scheduler.add_job(self._generate_global_stats, "interval", seconds=60 * 15)
- scheduler.start()
-
if not self.es.indices.exists(self.index_name):
self.init()
+ def start_stats_scheduler(self):
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(self._generate_global_stats, "interval", seconds=60 * 120)
+ scheduler.start()
+
def init(self):
print("Elasticsearch first time setup")
if self.es.indices.exists(self.index_name):
@@ -122,8 +126,8 @@ class ElasticSearchEngine(SearchEngine):
def import_json(self, in_lines, website_id: int):
- import_every = 1000
- cooldown_time = 1
+ import_every = 400
+ cooldown_time = 0
docs = []
@@ -211,7 +215,7 @@ class ElasticSearchEngine(SearchEngine):
}
},
"size": per_page, "from": min(page * per_page, 10000 - per_page)},
- index=self.index_name, request_timeout=30)
+ index=self.index_name, request_timeout=35)
return page
@@ -229,7 +233,7 @@ class ElasticSearchEngine(SearchEngine):
"ext_group": {
"terms": {
"field": "ext",
- "size": 20
+ "size": 12
},
"aggs": {
"size": {
@@ -246,7 +250,7 @@ class ElasticSearchEngine(SearchEngine):
}
},
"size": 0
- }, index=self.index_name, request_timeout=20)
+ }, index=self.index_name, request_timeout=30)
stats = dict()
stats["total_size"] = result["aggregations"]["total_size"]["value"]
@@ -311,7 +315,7 @@ class ElasticSearchEngine(SearchEngine):
},
"size": 0
- }, index=self.index_name, request_timeout=120)
+ }, index=self.index_name, request_timeout=240)
total_stats = self.es.search(body={
"query": {
@@ -333,7 +337,7 @@ class ElasticSearchEngine(SearchEngine):
},
"size": 0
- }, index=self.index_name, request_timeout=120)
+ }, index=self.index_name, request_timeout=241)
size_and_date_histogram = self.es.search(body={
"query": {
@@ -354,21 +358,21 @@ class ElasticSearchEngine(SearchEngine):
"sizes": {
"histogram": {
"field": "size",
- "interval": 50000000, # 50Mb
- "min_doc_count": 100
+ "interval": 100000000, # 100Mb
+ "min_doc_count": 500
}
},
"dates": {
"date_histogram": {
"field": "mtime",
"interval": "1y",
- "min_doc_count": 100,
+ "min_doc_count": 500,
"format": "yyyy"
}
}
},
"size": 0
- }, index=self.index_name, request_timeout=120)
+ }, index=self.index_name, request_timeout=242)
website_scatter = self.es.search(body={
"query": {
@@ -384,7 +388,7 @@ class ElasticSearchEngine(SearchEngine):
"websites": {
"terms": {
"field": "website_id",
- "size": 500 # TODO: Figure out what size is appropriate
+ "size": 400 # TODO: Figure out what size is appropriate
},
"aggs": {
"size": {
@@ -396,9 +400,9 @@ class ElasticSearchEngine(SearchEngine):
}
},
"size": 0
- }, index=self.index_name, request_timeout=120)
+ }, index=self.index_name, request_timeout=243)
- es_stats = self.es.indices.stats(self.index_name, request_timeout=120)
+ es_stats = self.es.indices.stats(self.index_name, request_timeout=244)
stats = dict()
stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"]
@@ -460,3 +464,6 @@ class ElasticSearchEngine(SearchEngine):
for website in websites:
if website not in non_empty_websites:
yield website
+
+ def refresh(self):
+ self.es.indices.refresh(self.index_name)
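
Because the stats scheduler no longer starts inside the constructor, embedders must opt in explicitly, which is what `app.py` now does. A minimal sketch of the new usage:

```python
from search.search import ElasticSearchEngine

searchEngine = ElasticSearchEngine("od-database")
searchEngine.start_stats_scheduler()  # regenerate global stats on an interval (every 2 hours)
searchEngine.refresh()                # make newly indexed documents searchable (used by the Reddit callback)
```
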
diff --git a/templates/home.html b/templates/home.html
index 33881e5..29d30b4 100644
--- a/templates/home.html
+++ b/templates/home.html
@@ -23,13 +23,23 @@
diff --git a/templates/layout.html b/templates/layout.html
index b6cefb3..2513392 100644
--- a/templates/layout.html
+++ b/templates/layout.html
@@ -32,7 +32,7 @@
-
- Websites
+ Websites
-
Submit website
diff --git a/templates/search.html b/templates/search.html
index 8bb560f..f3437d5 100644
--- a/templates/search.html
+++ b/templates/search.html
@@ -9,7 +9,7 @@
{# Size #}
@@ -90,9 +91,19 @@
+
{# Search button #}
-
+
+ {% if show_captcha %}
+
+
+
+ {% else %}
+
+ {% endif %}
@@ -156,12 +167,10 @@
{% if results["hits"]["total"] > (p + 1) * per_page %}
- Next
+
{% endif %}
{% if p > 0 %}
- Previous
+
{% endif %}
@@ -174,6 +183,7 @@
- Try checking the 'Match any word' box for a broader search.
- Make sure you don't include the file extension in your query (Use the appropriate field to filter file types)
+ - If you're searching for files in a particular website, use the website search page
@@ -245,6 +255,16 @@
}
});
+    // Pagination: adjust the page field, then re-run the reCAPTCHA before submitting
+ function nextPage() {
+ document.getElementById("page").value = parseInt(document.getElementById("page").value) + 1;
+ grecaptcha.execute();
+ }
+ function prevPage() {
+ document.getElementById("page").value = parseInt(document.getElementById("page").value) - 1;
+ grecaptcha.execute();
+ }
+
diff --git a/test/files/apache_table.html b/test/files/apache_table.html
new file mode 100644
index 0000000..06e21c9
--- /dev/null
+++ b/test/files/apache_table.html
@@ -0,0 +1,21 @@
+
+
+
+ Index of /Public/bootstrap
+
+
+Index of /Public/bootstrap
+
+
+
diff --git a/test/files/lighttpd_table.html b/test/files/lighttpd_table.html
new file mode 100644
index 0000000..fe58d9f
--- /dev/null
+++ b/test/files/lighttpd_table.html
@@ -0,0 +1,47 @@
+
+
+
+
+ Index of /gentoo/releases/
+
+
+
+Index of /gentoo/releases/
+
+
+ Name | Last Modified | Size | Type |
+
+ Parent Directory/ | | - | Directory |
+ alpha/ | 2009-Aug-09 03:47:09 | - | Directory |
+ amd64/ | 2017-Feb-09 18:50:44 | - | Directory |
+ arm/ | 2014-Apr-29 13:42:06 | - | Directory |
+ hppa/ | 2014-Apr-29 13:42:12 | - | Directory |
+ ia64/ | 2009-Aug-09 03:47:09 | - | Directory |
+ mips/ | 2011-Apr-28 23:38:14 | - | Directory |
+ ppc/ | 2014-Apr-29 13:41:00 | - | Directory |
+ s390/ | 2014-Apr-29 13:41:06 | - | Directory |
+ sh/ | 2014-Apr-29 13:41:16 | - | Directory |
+ snapshots/ | 2009-Apr-16 05:08:17 | - | Directory |
+ sparc/ | 2009-Aug-09 03:47:09 | - | Directory |
+ x86/ | 2016-Jul-04 21:14:19 | - | Directory |
+ README | 2014-Jun-22 05:18:43 | 0.1K | application/octet-stream |
+ verify-digests.sh | 2016-Jun-10 02:40:33 | 4.5K | application/octet-stream |
+
+
+
+
+
+
diff --git a/test/files/nginx_pre.html b/test/files/nginx_pre.html
new file mode 100644
index 0000000..5bbd35e
--- /dev/null
+++ b/test/files/nginx_pre.html
@@ -0,0 +1,11 @@
+
+Index of /test/To process/Android nak newer/
+
+Index of /test/To process/Android nak newer/
../
+DCIM/ 31-Jul-2018 00:26 -
+Pictures/ 31-Jul-2018 00:26 -
+1529682937580.webm 25-Jun-2018 03:58 3768511
+1529716051300.webm 25-Jun-2018 04:01 3181867
+1529725898345.webm 25-Jun-2018 04:05 4138908
+
+
diff --git a/test/webserver.py b/test/webserver.py
new file mode 100644
index 0000000..a3a1c14
--- /dev/null
+++ b/test/webserver.py
@@ -0,0 +1,13 @@
+from flask import Flask, send_file
+
+app = Flask(__name__)
+
+
+@app.route("/test1/")
+def test1():
+ return send_file("files/apache_table.html")
+
+
+if __name__ == '__main__':
+ app.run("0.0.0.0", port=8888, threaded=True)
+