diff --git a/README.md b/README.md index e9d9aeb..33af441 100644 --- a/README.md +++ b/README.md @@ -27,10 +27,7 @@ Create `/config.py` and fill out the parameters. Sample config: CAPTCHA_LOGIN = False CAPTCHA_SUBMIT = False CAPTCHA_SEARCH = False -CAPTCHA_SITE_KEY = "" -CAPTCHA_SECRET_KEY = "" -CAPTCHA_S_SITE_KEY = "" -CAPTCHA_S_SECRET_KEY = "" +CAPTCHA_EVERY = 10 # Flask secret key for sessions FLASK_SECRET = "" diff --git a/__init__.py b/__init__.py index 308da22..25a3e0b 100644 --- a/__init__.py +++ b/__init__.py @@ -3,12 +3,3 @@ from logging import FileHandler, StreamHandler import sys -logger = logging.getLogger("default") -logger.setLevel(logging.DEBUG) - -formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s') -file_handler = FileHandler("oddb.log") -file_handler.setFormatter(formatter) -logger.addHandler(file_handler) -logger.addHandler(StreamHandler(sys.stdout)) - diff --git a/api.py b/api.py new file mode 100644 index 0000000..b848db5 --- /dev/null +++ b/api.py @@ -0,0 +1,265 @@ +import json +import os +from threading import Lock + +from flask import request, abort, Response, send_file, session + +import common as oddb +import captcha +from callbacks import PostCrawlCallbackFactory +from database import Task, Website +from search.search import InvalidQueryException +from tasks import TaskResult + +uploadLock = Lock() + + +def setup_api(app): + @app.route("/api/task/get", methods=["POST"]) + def api_get_task(): + token = request.form.get("token") + name = oddb.db.check_api_token(token) + accept_ftp = request.form.get("accept") == "ftp" if "accept" in request.form else False + + if name: + task = oddb.db.pop_task(name, accept_ftp) + oddb.logger.debug("API get task from " + name) + + if task: + oddb.logger.info("Assigning task " + str(task.to_json()) + " to " + name) + else: + oddb.logger.info("No queued tasks, creating a new one") + + try: + website_id = oddb.db.get_oldest_website_id() + website = oddb.db.get_website_by_id(website_id) + task = Task(website_id, website.url) + oddb.db.put_task(task) + + task = oddb.db.pop_task(name, accept_ftp) + except: + oddb.logger.error("Couldn't create new task") + abort(404) + + return Response(str(task), mimetype="application/json") + else: + return abort(403) + + @app.route("/api/task/cancel", methods=["POST"]) + def api_cancel_task(): + token = request.form.get("token") + name = oddb.db.check_api_token(token) + + if name: + website_id = request.form.get("website_id") if "website_id" in request.form else None + if website_id: + oddb.logger.debug("API task cancel for " + str(website_id) + " by " + name) + oddb.db.delete_task(website_id) + return Response("cancelled task") + else: + abort(400) + + else: + abort(403) + + @app.route("/api/task/complete", methods=["POST"]) + def api_complete_task(): + token = request.form.get("token") + name = oddb.db.check_api_token(token) + + if name: + tr = json.loads(request.form.get("result")) + oddb.logger.debug("Task result: " + str(tr)) + task_result = TaskResult(tr["status_code"], tr["file_count"], tr["start_time"], tr["end_time"], + tr["website_id"]) + + oddb.logger.info("Task for " + str(task_result.website_id) + " completed by " + name) + task = oddb.db.complete_task(task_result.website_id, name) + + if task: + + filename = "./tmp/" + str(task_result.website_id) + ".json" + if not os.path.exists(filename): + filename = None + oddb.taskManager.complete_task(filename, task, task_result, name) + + if filename and os.path.exists(filename): + os.remove(filename) + + # Handle task callback 
+ callback = PostCrawlCallbackFactory.get_callback(task) + if callback: + callback.run(task_result, oddb.search) + + return "Successfully logged task result and indexed files" + + else: + oddb.logger.error("ERROR: " + name + " indicated that task for " + str(task_result.website_id) + + " was completed but there is no such task in the database.") + return "No such task" + return abort(403) + + @app.route("/api/task/upload", methods=["POST"]) + def api_upload(): + token = request.form.get("token") + name = oddb.db.check_api_token(token) + + if name: + website_id = request.form.get("website_id") + oddb.logger.debug("Result part upload for '" + str(website_id) + "' by " + name) + + if "file_list" in request.files: + file = request.files['file_list'] + + filename = "./tmp/" + str(website_id) + ".json" + + # Read the file into memory cuz if the request fails + # no file is corrupted. + buf = file.stream.read() + + # Write to file (create if not exists) when + # everything read successfully. + with uploadLock: + with open(filename, "a+b") as f: + f.write(buf) + + oddb.logger.debug("Written chunk to file") + return "ok" + else: + return abort(403) + + @app.route("/api/website/by_url", methods=["GET"]) + def api_website_by_url(): + token = request.args.get("token") + name = oddb.db.check_api_token(token) + + if name: + url = request.args.get("url") + website = oddb.db.get_website_by_url(url) + oddb.logger.info("API get website by url '" + url + "' by " + name) + if website: + return str(website.id) + return abort(404) + else: + return abort(403) + + @app.route("/api/website/blacklisted", methods=["GET"]) + def api_website_is_blacklisted(): + token = request.args.get("token") + url = request.args.get("url") + name = oddb.db.check_api_token(token) + + if name: + oddb.logger.info("API get website is blacklisted '" + url + "' by " + name) + return str(oddb.db.is_blacklisted(url)) + else: + return abort(403) + + @app.route("/api/website/add", methods=["GET"]) + def api_add_website(): + token = request.args.get("token") + url = request.args.get("url") + + name = oddb.db.check_api_token(token) + if name: + + website_id = oddb.db.insert_website(Website(url, str(request.remote_addr + "_" + + request.headers.get("X-Forwarded-For", "")), + "API_CLIENT_" + name)) + oddb.logger.info("API add website '" + url + "' by " + name + "(" + str(website_id) + ")") + return str(website_id) + else: + return abort(403) + + @app.route("/api/task/force_enqueue", methods=["POST"]) + def api_task_enqueue(): + try: + token = request.json["token"] + except KeyError: + return abort(400) + + name = oddb.db.check_api_token(token) + + if name: + + task = Task( + request.json["website_id"], + request.json["url"], + request.json["priority"], + request.json["callback_type"], + json.dumps(request.json["callback_args"]) + ) + + oddb.logger.info("API force enqueue by " + name + "\n(" + str(task.to_json()) + ")") + + oddb.taskManager.queue_task(task) + return "" + else: + return abort(403) + + @app.route("/api/task/try_enqueue", methods=["POST"]) + def api_task_try_enqueue(): + token = request.form.get("token") + name = oddb.db.check_api_token(token) + + if name: + + url = request.form.get("url") + message, result = oddb.try_enqueue(url) + + oddb.logger.info("API try enqueue '" + url + "' by " + name + " (" + message + ")") + + return json.dumps({ + "message": message, + "result": result + }) + else: + return abort(403) + + @app.route("/api/website/random") + def api_random_website(): + token = request.json["token"] + name = 
oddb.db.check_api_token(token) + + if name: + oddb.logger.info("API get random website by " + name) + return str(oddb.db.get_random_website_id()) + else: + return abort(403) + + @app.route("/api/search", methods=["POST"]) + def api_search(): + token = request.json["token"] + name = oddb.db.check_api_token(token) + + if name: + + try: + hits = oddb.searchEngine.search( + request.json["query"], + request.json["page"], request.json["per_page"], + request.json["sort_order"], + request.json["extensions"], + request.json["size_min"], request.json["size_max"], + request.json["match_all"], + request.json["fields"], + request.json["date_min"], request.json["date_max"] + ) + + hits = oddb.db.join_website_on_search_result(hits) + oddb.logger.info("API search '" + request.json["query"] + "' by " + name) + return json.dumps(hits) + + except InvalidQueryException as e: + oddb.logger.info("API search failed: " + str(e)) + return str(e) + else: + return abort(403) + + @app.route("/cap", methods=["GET"]) + def cap(): + word = captcha.make_captcha() + session["cap"] = word + + return send_file(captcha.get_path(word), cache_timeout=0) + diff --git a/app.py b/app.py index 11b981a..9c5e11b 100644 --- a/app.py +++ b/app.py @@ -1,824 +1,16 @@ -from flask import Flask, render_template, redirect, request, flash, abort, Response, session -from multiprocessing import Pool -import json -from urllib.parse import urlparse -import logging -import os -import time -import datetime -from database import Database, Website -from flask_recaptcha import ReCaptcha -import od_util -import config -from flask_caching import Cache -from tasks import TaskManager, Task, TaskResult -from search.search import ElasticSearchEngine, InvalidQueryException -from callbacks import PostCrawlCallbackFactory -from threading import Lock +from flask import Flask -uploadLock = Lock() +import api +import config +import views +import template_filters app = Flask(__name__) app.secret_key = config.FLASK_SECRET +template_filters.setup_template_filters(app) -# Disable flask logging -flaskLogger = logging.getLogger('werkzeug') -flaskLogger.setLevel(logging.ERROR) - -logger = logging.getLogger("default") - -if config.CAPTCHA_SUBMIT or config.CAPTCHA_LOGIN: - recaptcha = ReCaptcha(app=app, - site_key=config.CAPTCHA_SITE_KEY, - secret_key=config.CAPTCHA_SECRET_KEY) -else: - recaptcha = None -if config.CAPTCHA_SEARCH: - recaptcha_search = ReCaptcha(app=app, - site_key=config.CAPTCHA_S_SITE_KEY, - secret_key=config.CAPTCHA_S_SECRET_KEY) -else: - recaptcha_search = None - -db = Database("db.sqlite3") -cache = Cache(app, config={'CACHE_TYPE': 'simple'}) -app.jinja_env.globals.update(truncate_path=od_util.truncate_path) -app.jinja_env.globals.update(get_color=od_util.get_color) -app.jinja_env.globals.update(get_mime=od_util.get_category) - -taskManager = TaskManager() -searchEngine = ElasticSearchEngine("od-database") -searchEngine.start_stats_scheduler() - - -@app.template_filter("date_format") -def date_format(value, format='%Y-%m-%d'): - return time.strftime(format, time.gmtime(value)) - - -@app.template_filter("datetime_format") -def datetime_format(value, format='%Y-%m-%d %H:%M:%S'): - return time.strftime(format, time.gmtime(value)) - - -@app.template_filter("duration_format") -def duration_format(value): - delay = datetime.timedelta(seconds=value) - if (delay.days > 0): - out = str(delay).replace(" days, ", ":") - else: - out = str(delay) - out_ar = out.split(':') - out_ar = ["%02d" % (int(float(x))) for x in out_ar] - out = ":".join(out_ar) - return out - 
- -@app.template_filter("from_timestamp") -def from_timestamp(value): - return datetime.datetime.fromtimestamp(value) - - -@app.route("/dl") -@cache.cached(120) -def downloads(): - # Get content of downloads directory - dl_dir = "static/downloads/" - dir_content = os.listdir(dl_dir) - - # Make paths relative to working directory - # Only allow csv files - files = [ - (name, os.path.join(dl_dir, name)) - for name in dir_content - if name.find(".csv") != -1 - ] - - # Stat files - # Remove any dirs placed accidentally - files = [ - (f, full, os.stat(full)) - for f, full in files - if os.path.isfile(full) - ] - - if len(files) == 0: - logger.warning("No export file to display in /dl") - - return render_template("downloads.html", export_file_stats=files) - - -@app.route("/stats") -@cache.cached(120) -def stats_page(): - crawl_server_stats = db.get_stats_by_crawler() - return render_template("stats.html", crawl_server_stats=crawl_server_stats) - - -@app.route("/stats/json_chart") -@cache.cached(240) -def stats_json(): - stats = searchEngine.get_global_stats() - if stats: - db.join_website_on_stats(stats) - return Response(json.dumps(stats), mimetype="application/json") - return abort(500) - - -@app.route("/website//") -def website_info(website_id): - website = db.get_website_by_id(website_id) - - if website: - return render_template("website.html", website=website) - else: - abort(404) - - -@app.route("/website//json_chart") -@cache.memoize(60) -def website_json_chart(website_id): - website = db.get_website_by_id(website_id) - - if website: - stats = searchEngine.get_stats(website_id) - stats["base_url"] = website.url - stats["report_time"] = website.last_modified - return Response(json.dumps(stats), mimetype="application/json") - else: - abort(404) - - -@app.route("/website//links") -def website_links(website_id): - website = db.get_website_by_id(website_id) - - if website: - links = searchEngine.get_link_list(website_id, website.url) - return Response("\n".join(links), mimetype="text/plain") - else: - abort(404) - - -@app.route("/website/") -def websites(): - page = int(request.args.get("p")) if "p" in request.args else 0 - url = request.args.get("url") if "url" in request.args else "" - if url: - parsed_url = urlparse(url) - if parsed_url.scheme: - search_term = (parsed_url.scheme + "://" + parsed_url.netloc) - else: - flash("Sorry, I was not able to parse this url format. 
" - "Make sure you include the appropriate scheme (http/https/ftp)", "warning") - search_term = "" - else: - search_term = url - - return render_template("websites.html", - websites=db.get_websites(50, page, search_term), - p=page, url=search_term, per_page=50) - - -@app.route("/website/random") -def random_website(): - return redirect("/website/" + str(db.get_random_website_id())) - - -## TODO: move to DB -def get_empty_websites(): - current_tasks = taskManager.get_queued_tasks() - - queued_websites = [task.website_id for task in current_tasks] - all_websites = db.get_all_websites() - non_queued_websites = list(set(all_websites).difference(queued_websites)) - - return searchEngine.are_empty(non_queued_websites) - - -@app.route("/website/delete_empty") -def admin_delete_empty_website(): - """Delete websites with no associated files that are not queued""" - - if "username" in session: - - empty_websites = get_empty_websites() - - for website in empty_websites: - # db.delete_website(website) - pass - - flash("Deleted: " + repr(list(empty_websites)), "success") - return redirect("/dashboard") - - else: - abort(403) - - -@app.route("/website//clear") -def admin_clear_website(website_id): - if "username" in session: - - searchEngine.delete_docs(website_id) - flash("Cleared all documents associated with this website", "success") - return redirect("/website/" + str(website_id)) - else: - abort(403) - - -@app.route("/website//delete") -def admin_delete_website(website_id): - if "username" in session: - - searchEngine.delete_docs(website_id) - db.delete_website(website_id) - flash("Deleted website " + str(website_id), "success") - return redirect("/website/") - - else: - abort(403) - - -@app.route("/website//rescan") -def admin_rescan_website(website_id): - if "username" in session: - - website = db.get_website_by_id(website_id) - - if website: - priority = request.args.get("priority") if "priority" in request.args else 1 - task = Task(website_id, website.url, priority) - taskManager.queue_task(task) - - flash("Enqueued rescan task", "success") - else: - flash("Website does not exist", "danger") - return redirect("/website/" + str(website_id)) - - else: - abort(403) - - -@app.route("/search") -def search(): - q = request.args.get("q") if "q" in request.args else "" - sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score" - - page = request.args.get("p") if "p" in request.args else "0" - page = int(page) if page.isdigit() else 0 - - per_page = request.args.get("per_page") if "per_page" in request.args else "50" - per_page = int(per_page) if per_page.isdigit() else "50" - per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50 - - extensions = request.args.get("ext") if "ext" in request.args else None - extensions = [ext.strip().strip(".").lower() for ext in extensions.split(",")] if extensions else [] - - size_min = request.args.get("size_min") if "size_min" in request.args else "size_min" - size_min = int(size_min) if size_min.isdigit() else 0 - size_max = request.args.get("size_max") if "size_max" in request.args else "size_max" - size_max = int(size_max) if size_max.isdigit() else 0 - - date_min = request.args.get("date_min") if "date_min" in request.args else "date_min" - date_min = int(date_min) if date_min.isdigit() else 0 - date_max = request.args.get("date_max") if "date_max" in request.args else "date_max" - date_max = int(date_max) if date_max.isdigit() else 0 - - match_all = "all" in request.args - - field_name = "field_name" in request.args - 
field_trigram = "field_trigram" in request.args - field_path = "field_path" in request.args - - if not field_name and not field_trigram and not field_path: - # If no fields are selected, search in all - field_name = field_path = field_trigram = True - - fields = [] - if field_path: - fields.append("path") - if field_name: - fields.append("name^5") - if field_trigram: - fields.append("name.nGram^2") - - if len(q) >= 3: - - blocked = False - hits = None - response = request.args.get("g-recaptcha-response", "") - if not config.CAPTCHA_SEARCH or recaptcha_search.verify(response): - - try: - hits = searchEngine.search(q, page, per_page, sort_order, - extensions, size_min, size_max, match_all, fields, date_min, date_max) - hits = db.join_website_on_search_result(hits) - except InvalidQueryException as e: - flash("Invalid query: " + str(e), "warning") - blocked = True - except: - flash("Query failed, this could mean that the search server is overloaded or is not reachable. " - "Please try again later", "danger") - - results = hits["hits"]["total"] if hits else -1 - took = hits["took"] if hits else -1 - forwarded_for = request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None - - logger.info("SEARCH '{}' [res={}, t={}, p={}x{}, ext={}] by {}{}" - .format(q, results, took, page, per_page, str(extensions), - request.remote_addr, "_" + forwarded_for if forwarded_for else "")) - - db.log_search(request.remote_addr, forwarded_for, q, extensions, page, blocked, results, took) - if blocked: - return redirect("/search") - else: - flash("Error: Invalid captcha please try again", "danger") - - else: - hits = None - - return render_template("search.html", - results=hits, - q=q, - p=page, per_page=per_page, - sort_order=sort_order, - results_set=config.RESULTS_PER_PAGE, - extensions=",".join(extensions), - size_min=size_min, size_max=size_max, - match_all=match_all, - field_trigram=field_trigram, field_path=field_path, field_name=field_name, - date_min=date_min, date_max=date_max, - show_captcha=config.CAPTCHA_SEARCH, recaptcha=recaptcha_search) - - -@app.route("/contribute") -@cache.cached(600) -def contribute(): - return render_template("contribute.html") - - -@app.route("/") -@cache.cached(240) -def home(): - try: - stats = searchEngine.get_global_stats() - stats["website_count"] = len(db.get_all_websites()) - except: - stats = {} - return render_template("home.html", stats=stats, - show_captcha=config.CAPTCHA_SEARCH, recaptcha=recaptcha_search) - - -@app.route("/submit") -def submit(): - queued_websites = taskManager.get_queued_tasks()[:30] - return render_template("submit.html", queue=queued_websites, recaptcha=recaptcha, - show_captcha=config.CAPTCHA_SUBMIT) - - -def try_enqueue(url): - url = os.path.join(url, "") - url = od_util.get_top_directory(url) - - if not od_util.is_valid_url(url): - return "Error: Invalid url. Make sure to include the appropriate scheme.", "warning" - - website = db.get_website_by_url(url) - if website: - return "Website already exists", "danger" - - website = db.website_exists(url) - if website: - return "A parent directory of this url has already been posted", "danger" - - if db.is_blacklisted(url): - return "Error: " \ - "Sorry, this website has been blacklisted. If you think " \ - "this is an error, please contact me.", "danger" - - if not od_util.is_od(url): - return "Error:" \ - "The anti-spam algorithm determined that the submitted url is not " \ - "an open directory or the server is not responding. 
If you think " \ - "this is an error, please contact me.", "danger" - - website_id = db.insert_website(Website(url, str(request.remote_addr + "_" + - request.headers.get("X-Forwarded-For", "")), - request.user_agent)) - - task = Task(website_id, url, priority=1) - taskManager.queue_task(task) - - return "The website has been added to the queue", "success" - - -@app.route("/enqueue", methods=["POST"]) -def enqueue(): - if not config.CAPTCHA_SUBMIT or recaptcha.verify(): - - url = os.path.join(request.form.get("url"), "") - message, msg_type = try_enqueue(url) - flash(message, msg_type) - - return redirect("/submit") - - else: - flash("Error: Invalid captcha please try again", "danger") - return redirect("/submit") - - -def check_url(url): - url = os.path.join(url, "") - try_enqueue(url) - return None - - -@app.route("/enqueue_bulk", methods=["POST"]) -def enqueue_bulk(): - if not config.CAPTCHA_SUBMIT or recaptcha.verify(): - - urls = request.form.get("urls") - if urls: - urls = urls.split() - - if 0 < len(urls) <= 1000: # TODO: Load from config & adjust placeholder/messages? - - pool = Pool(processes=6) - pool.map(func=check_url, iterable=urls) - pool.close() - - flash("Submitted websites to the queue", "success") - - return redirect("/submit") - - else: - flash("Too few or too many urls, please submit 1-10 urls", "danger") - return redirect("/submit") - else: - flash("Too few or too many urls, please submit 1-10 urls", "danger") - return redirect("/submit") - else: - flash("Error: Invalid captcha please try again", "danger") - return redirect("/submit") - - -@app.route("/admin") -def admin_login_form(): - if "username" in session: - return redirect("/dashboard") - return render_template("admin.html", recaptcha=recaptcha, show_captcha=config.CAPTCHA_LOGIN) - - -@app.route("/login", methods=["POST"]) -def admin_login(): - if not config.CAPTCHA_LOGIN or recaptcha.verify(): - - username = request.form.get("username") - password = request.form.get("password") - - if db.check_login(username, password): - session["username"] = username - flash("Logged in", "success") - return redirect("/dashboard") - - flash("Invalid username/password combo", "danger") - return redirect("/admin") - - else: - flash("Invalid captcha", "danger") - return redirect("/admin") - - -@app.route("/logout") -def admin_logout(): - session.clear() - flash("Logged out", "info") - return redirect("/") - - -@app.route("/dashboard") -def admin_dashboard(): - if "username" in session: - - tokens = db.get_tokens() - blacklist = db.get_blacklist() - - return render_template("dashboard.html", api_tokens=tokens, blacklist=blacklist) - else: - return abort(403) - - -@app.route("/blacklist/add", methods=["POST"]) -def admin_blacklist_add(): - if "username" in session: - - url = request.form.get("url") - db.add_blacklist_website(url) - flash("Added item to blacklist", "success") - return redirect("/dashboard") - - else: - return abort(403) - - -@app.route("/blacklist//delete") -def admin_blacklist_remove(blacklist_id): - if "username" in session: - db.remove_blacklist_website(blacklist_id) - flash("Removed blacklist item", "success") - return redirect("/dashboard") - - -@app.route("/generate_token", methods=["POST"]) -def admin_generate_token(): - if "username" in session: - - description = request.form.get("description") - - db.generate_api_token(description) - flash("Generated API token", "success") - - return redirect("/dashboard") - else: - return abort(403) - - -@app.route("/del_token", methods=["POST"]) -def admin_del_token(): - 
if "username" in session: - - token = request.form.get("token") - - db.delete_token(token) - flash("Deleted API token", "success") - return redirect("/dashboard") - else: - return abort(403) - - -# TODO: pages scrolling -@app.route("/logs", methods=["GET"]) -def admin_crawl_logs(): - if "username" in session: - - results = db.get_crawl_logs() - - return render_template("crawl_logs.html", logs=results) - else: - return abort(403) - - -@app.route("/api/task/get", methods=["POST"]) -def api_get_task(): - token = request.form.get("token") - name = db.check_api_token(token) - accept_ftp = request.form.get("accept") == "ftp" if "accept" in request.form else False - - if name: - task = db.pop_task(name, accept_ftp) - logger.debug("API get task from " + name) - - if task: - logger.info("Assigning task " + str(task.to_json()) + " to " + name) - else: - logger.info("No queued tasks, creating a new one") - - try: - website_id = db.get_oldest_website_id() - website = db.get_website_by_id(website_id) - task = Task(website_id, website.url) - db.put_task(task) - - task = db.pop_task(name, accept_ftp) - except: - logger.error("Couldn't create new task") - abort(404) - - return Response(str(task), mimetype="application/json") - else: - return abort(403) - - -@app.route("/api/task/cancel", methods=["POST"]) -def api_cancel_task(): - token = request.form.get("token") - name = db.check_api_token(token) - - if name: - website_id = request.form.get("website_id") if "website_id" in request.form else None - if website_id: - logger.debug("API task cancel for " + str(website_id) + " by " + name) - db.delete_task(website_id) - return Response("cancelled task") - else: - abort(400) - - else: - abort(403) - - -@app.route("/api/task/complete", methods=["POST"]) -def api_complete_task(): - token = request.form.get("token") - name = db.check_api_token(token) - - if name: - tr = json.loads(request.form.get("result")) - logger.debug("Task result: " + str(tr)) - task_result = TaskResult(tr["status_code"], tr["file_count"], tr["start_time"], tr["end_time"], - tr["website_id"]) - - logger.info("Task for " + str(task_result.website_id) + " completed by " + name) - task = db.complete_task(task_result.website_id, name) - - if task: - - filename = "./tmp/" + str(task_result.website_id) + ".json" - if not os.path.exists(filename): - filename = None - taskManager.complete_task(filename, task, task_result, name) - - if filename and os.path.exists(filename): - os.remove(filename) - - # Handle task callback - callback = PostCrawlCallbackFactory.get_callback(task) - if callback: - callback.run(task_result, search) - - return "Successfully logged task result and indexed files" - - else: - logger.error("ERROR: " + name + " indicated that task for " + str(task_result.website_id) + - " was completed but there is no such task in the database.") - return "No such task" - return abort(403) - - -@app.route("/api/task/upload", methods=["POST"]) -def api_upload(): - token = request.form.get("token") - name = db.check_api_token(token) - - if name: - website_id = request.form.get("website_id") - logger.debug("Result part upload for '" + str(website_id) + "' by " + name) - - if "file_list" in request.files: - file = request.files['file_list'] - - filename = "./tmp/" + str(website_id) + ".json" - - # Read the file into memory cuz if the request fails - # no file is corrupted. - buf = file.stream.read() - - # Write to file (create if not exists) when - # everything read successfully. 
- with uploadLock: - with open(filename, "a+b") as f: - f.write(buf) - - logger.debug("Written chunk to file") - return "ok" - else: - return abort(403) - - -@app.route("/api/website/by_url", methods=["GET"]) -def api_website_by_url(): - token = request.args.get("token") - name = db.check_api_token(token) - - if name: - url = request.args.get("url") - website = db.get_website_by_url(url) - logger.info("API get website by url '" + url + "' by " + name) - if website: - return str(website.id) - return abort(404) - else: - return abort(403) - - -@app.route("/api/website/blacklisted", methods=["GET"]) -def api_website_is_blacklisted(): - token = request.args.get("token") - url = request.args.get("url") - name = db.check_api_token(token) - - if name: - logger.info("API get website is blacklisted '" + url + "' by " + name) - return str(db.is_blacklisted(url)) - else: - return abort(403) - - -@app.route("/api/website/add", methods=["GET"]) -def api_add_website(): - token = request.args.get("token") - url = request.args.get("url") - - name = db.check_api_token(token) - if name: - - website_id = db.insert_website(Website(url, str(request.remote_addr + "_" + - request.headers.get("X-Forwarded-For", "")), - "API_CLIENT_" + name)) - logger.info("API add website '" + url + "' by " + name + "(" + str(website_id) + ")") - return str(website_id) - else: - return abort(403) - - -@app.route("/api/task/force_enqueue", methods=["POST"]) -def api_task_enqueue(): - try: - token = request.json["token"] - except KeyError: - return abort(400) - - name = db.check_api_token(token) - - if name: - - task = Task( - request.json["website_id"], - request.json["url"], - request.json["priority"], - request.json["callback_type"], - json.dumps(request.json["callback_args"]) - ) - - logger.info("API force enqueue by " + name + "\n(" + str(task.to_json()) + ")") - - taskManager.queue_task(task) - return "" - else: - return abort(403) - - -@app.route("/api/task/try_enqueue", methods=["POST"]) -def api_task_try_enqueue(): - token = request.form.get("token") - name = db.check_api_token(token) - - if name: - - url = request.form.get("url") - message, result = try_enqueue(url) - - logger.info("API try enqueue '" + url + "' by " + name + " (" + message + ")") - - return json.dumps({ - "message": message, - "result": result - }) - else: - return abort(403) - - -@app.route("/api/website/random") -def api_random_website(): - token = request.json["token"] - name = db.check_api_token(token) - - if name: - logger.info("API get random website by " + name) - return str(db.get_random_website_id()) - else: - return abort(403) - - -@app.route("/api/search", methods=["POST"]) -def api_search(): - token = request.json["token"] - name = db.check_api_token(token) - - if name: - - try: - hits = searchEngine.search( - request.json["query"], - request.json["page"], request.json["per_page"], - request.json["sort_order"], - request.json["extensions"], - request.json["size_min"], request.json["size_max"], - request.json["match_all"], - request.json["fields"], - request.json["date_min"], request.json["date_max"] - ) - - hits = db.join_website_on_search_result(hits) - logger.info("API search '" + request.json["query"] + "' by " + name) - return json.dumps(hits) - - except InvalidQueryException as e: - logger.info("API search failed: " + str(e)) - return str(e) - else: - return abort(403) - +views.setup_views(app) +api.setup_api(app) if __name__ == '__main__': app.run("0.0.0.0", port=12345, threaded=True) diff --git a/captcha.py b/captcha.py new file mode 
100644
index 0000000..9f1c8ae
--- /dev/null
+++ b/captcha.py
@@ -0,0 +1,92 @@
+import os
+import random
+
+import numpy
+import pylab
+from PIL import Image, ImageDraw, ImageFont
+import mpl_toolkits.mplot3d.axes3d as axes3d
+import io
+from wand.image import Image as WImage
+from flask import request, session
+
+import config
+from common import logger
+
+SIZE = (60, 20)
+with open("words.txt") as f:
+    WORDS = f.read().splitlines(keepends=False)
+
+
+def get_code():
+
+    if "cap_remaining" in session and session["cap_remaining"] > 0:
+        return """
+        You will not be asked to complete a captcha for the next {} pages
+        """.format(session["cap_remaining"])
+
+    return """
+
+ cap + +
+    """
+
+
+def get_path(word):
+    return "captchas/{}.png".format(word)
+
+
+def verify():
+    if "cap_remaining" in session and session["cap_remaining"] > 0:
+        session["cap_remaining"] -= 1
+        return True
+
+    attempt = request.form.get("cap") if "cap" in request.form else (
+        request.args.get("cap") if "cap" in request.args else ""
+    )
+
+    if "cap" in session and session["cap"] == attempt:
+        session["cap_remaining"] = config.CAPTCHA_EVERY
+        return True
+    return False
+
+
+def make_captcha():
+    word = random.choice(WORDS)
+    path = get_path(word)
+
+    logger.info("generating CAPTCHA: " + word)
+
+    if os.path.exists(path):
+        os.remove(path)
+
+    image = Image.new('L', SIZE, 255)
+    image_draw = ImageDraw.Draw(image)
+    font = ImageFont.truetype("static/Hack-Regular.ttf", 12)
+
+    image_draw.text((5, 3), word, font=font)
+
+    x, y = numpy.meshgrid(range(SIZE[0]), range(SIZE[1]))
+    z = 1 - numpy.asarray(image) / 255
+
+    fig = pylab.figure()
+    ax = axes3d.Axes3D(fig)
+    ax.plot_wireframe(x, -y, z, rstride=1, cstride=1)
+    ax.set_zlim((0, 20))
+    ax.set_axis_off()
+    pylab.close(fig)
+
+    buf = io.BytesIO()
+    fig.savefig(buf, dpi=150)
+    buf.seek(0)
+    image.close()
+
+    with WImage(blob=buf.read()) as img:
+        img.trim()
+        img.save(filename=path)
+
+    return word
+
+
+if __name__ == "__main__":
+    make_captcha()
diff --git a/common.py b/common.py
new file mode 100644
index 0000000..92dd0fa
--- /dev/null
+++ b/common.py
@@ -0,0 +1,33 @@
+from logging import FileHandler, StreamHandler
+
+import sys
+
+from database import Database
+from search.search import ElasticSearchEngine
+from tasks import TaskManager
+import logging
+from flask import session, abort
+
+# Disable flask logging
+flaskLogger = logging.getLogger('werkzeug')
+flaskLogger.setLevel(logging.ERROR)
+
+logger = logging.getLogger("default")
+logger.setLevel(logging.DEBUG)
+
+formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
+file_handler = FileHandler("oddb.log")
+file_handler.setFormatter(formatter)
+logger.addHandler(file_handler)
+logger.addHandler(StreamHandler(sys.stdout))
+
+taskManager = TaskManager()
+searchEngine = ElasticSearchEngine("od-database")
+searchEngine.start_stats_scheduler()
+db = Database("db.sqlite3")
+
+
+def require_role(role: str):
+
+    if db.get_user_role(session.get("username", None)) != role:
+        abort(403)
diff --git a/database.py b/database.py
index aaa1e65..54431cb 100644
--- a/database.py
+++ b/database.py
@@ -8,7 +8,6 @@
 import uuid
 
 import tasks
 
-
 class BlacklistedWebsite:
     def __init__(self, blacklist_id, url):
         self.id = blacklist_id
@@ -182,6 +181,18 @@ class Database:
             return bcrypt.checkpw(password.encode(), db_user[0])
         return False
 
+    def get_user_role(self, username: str):
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+
+            cursor.execute("SELECT role FROM Admin WHERE username=?", (username, ))
+
+            db_user = cursor.fetchone()
+
+            if db_user:
+                return db_user[0]
+            return False
+
     def generate_login(self, username, password) -> None:
         with sqlite3.connect(self.db_path) as conn:
@@ -189,7 +200,7 @@
 
             hashed_pw = bcrypt.hashpw(password.encode(), bcrypt.gensalt(12))
 
-            cursor.execute("INSERT INTO Admin (username, password) VALUES (?,?)", (username, hashed_pw))
+            cursor.execute("INSERT INTO Admin (username, password, role) VALUES (?,?, 'admin')", (username, hashed_pw))
             conn.commit()
 
     def check_api_token(self, token) -> str:
diff --git a/init_script.sql b/init_script.sql
index cfebb42..2774e08 100644
--- a/init_script.sql
+++ b/init_script.sql
@@ -11,7 +11,8 @@ CREATE TABLE
Website ( CREATE TABLE Admin ( username TEXT PRIMARY KEY NOT NULL, - password TEXT + password TEXT, + role TEXT ); CREATE TABLE BlacklistedWebsite ( diff --git a/requirements.txt b/requirements.txt index b5a81c6..def8a5f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,6 @@ flask_testing requests bs4 validators -flask_recaptcha Flask-Caching praw humanfriendly @@ -18,4 +17,7 @@ urllib3 pyOpenSSL pybloom-live pycurl -lxml \ No newline at end of file +lxml +pillow +Wand +numpy \ No newline at end of file diff --git a/restore.py b/restore.py new file mode 100644 index 0000000..e7b50fd --- /dev/null +++ b/restore.py @@ -0,0 +1,25 @@ +from search.search import ElasticSearchEngine +import ujson + +es = ElasticSearchEngine("od-database") +es.reset() + +with open("dump.json", "r") as f: + + buffer = list() + index_every = 10000 + + for line in f: + try: + doc = ujson.loads(line)["_source"] + buffer.append(doc) + + if len(buffer) >= index_every: + es._index(buffer) + buffer.clear() + + except Exception as e: + print("ERROR: " + str(e)) + + es._index(buffer) + diff --git a/search/search.py b/search/search.py index 4c207d4..3060e13 100644 --- a/search/search.py +++ b/search/search.py @@ -75,12 +75,13 @@ class ElasticSearchEngine(SearchEngine): self.es.indices.create(index=self.index_name) self.es.indices.close(index=self.index_name) - # File names and paths + # Index settings self.es.indices.put_settings(body={ "analysis": { "tokenizer": { "my_nGram_tokenizer": { - "type": "nGram", "min_gram": 3, "max_gram": 3} + "type": "nGram", "min_gram": 3, "max_gram": 3 + } } }}, index=self.index_name) self.es.indices.put_settings(body={ @@ -93,16 +94,16 @@ class ElasticSearchEngine(SearchEngine): } }}, index=self.index_name) - # Mappings - self.es.indices.put_mapping(body={"properties": { - "path": {"analyzer": "standard", "type": "text"}, - "name": {"analyzer": "standard", "type": "text", - "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}}, - "mtime": {"type": "date", "format": "epoch_second"}, - "size": {"type": "long"}, - "website_id": {"type": "integer"}, - "ext": {"type": "keyword"} - }}, doc_type="file", index=self.index_name) + self.es.indices.put_mapping(body={ + "properties": { + "path": {"analyzer": "standard", "type": "text"}, + "name": {"analyzer": "standard", "type": "text", + "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}}, + "mtime": {"type": "date", "format": "epoch_second"}, + "size": {"type": "long"}, + "website_id": {"type": "integer"}, + "ext": {"type": "keyword"}, + }}, doc_type="file", index=self.index_name) self.es.indices.open(index=self.index_name) @@ -120,9 +121,9 @@ class ElasticSearchEngine(SearchEngine): to_delete = helpers.scan(query={ "query": { - "term": {"website_id": website_id} + "match_all": {} } - }, scroll="1m", client=self.es, index=self.index_name, request_timeout=120) + }, scroll="1m", client=self.es, index=self.index_name, request_timeout=120, routing=website_id) buf = [] counter = 0 @@ -130,12 +131,12 @@ class ElasticSearchEngine(SearchEngine): buf.append(doc) counter += 1 - if counter >= 400: - self._delete(buf) + if counter >= 10000: + self._delete(buf, website_id) buf.clear() counter = 0 if counter > 0: - self._delete(buf) + self._delete(buf, website_id) break except Exception as e: @@ -144,9 +145,10 @@ class ElasticSearchEngine(SearchEngine): logger.debug("Done deleting for " + str(website_id)) - def _delete(self, docs): + def _delete(self, docs, website_id): bulk_string = self.create_bulk_delete_string(docs) - result = 
self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file", request_timeout=30) + result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file", request_timeout=30, + routing=website_id) if result["errors"]: logger.error("Error in ES bulk delete: \n" + result["errors"]) @@ -154,7 +156,7 @@ class ElasticSearchEngine(SearchEngine): def import_json(self, in_lines, website_id: int): - import_every = 400 + import_every = 10000 cooldown_time = 0 docs = [] @@ -183,7 +185,8 @@ class ElasticSearchEngine(SearchEngine): try: logger.debug("Indexing " + str(len(docs)) + " docs") bulk_string = ElasticSearchEngine.create_bulk_index_string(docs) - self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file", request_timeout=30) + self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file", request_timeout=30, + routing=docs[0]["website_id"]) break except Exception as e: logger.error("Error in _index: " + str(e) + ", retrying") @@ -293,7 +296,7 @@ class ElasticSearchEngine(SearchEngine): } }, "size": 0 - }, index=self.index_name, request_timeout=30) + }, index=self.index_name, request_timeout=30, routing=website_id) stats = dict() stats["total_size"] = result["aggregations"]["total_size"]["value"] @@ -311,11 +314,10 @@ class ElasticSearchEngine(SearchEngine): "includes": ["path", "name", "ext"] }, "query": { - "term": { - "website_id": website_id} + "match_all": {} } }, - index=self.index_name, request_timeout=20) + index=self.index_name, request_timeout=20, routing=website_id) for hit in hits: src = hit["_source"] yield base_url + src["path"] + ("/" if src["path"] != "" else "") + src["name"] + \ @@ -431,7 +433,7 @@ class ElasticSearchEngine(SearchEngine): "websites": { "terms": { "field": "website_id", - "size": 400 # TODO: Figure out what size is appropriate + "size": 600 # TODO: Figure out what size is appropriate }, "aggs": { "size": { @@ -451,7 +453,8 @@ class ElasticSearchEngine(SearchEngine): stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"] stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"] stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"] - stats["es_search_time_avg"] = stats["es_search_time"] / (stats["es_search_count"] if stats["es_search_count"] != 0 else 1) + stats["es_search_time_avg"] = stats["es_search_time"] / ( + stats["es_search_count"] if stats["es_search_count"] != 0 else 1) stats["total_count"] = total_stats["hits"]["total"] stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"] @@ -479,34 +482,5 @@ class ElasticSearchEngine(SearchEngine): } }, scroll="1m", client=self.es, index=self.index_name, request_timeout=60) - def are_empty(self, websites): - result = self.es.search(body={ - "query": { - "bool": { - "filter": { - "terms": { - "website_id": websites - }, - } - } - }, - "aggs": { - "websites": { - "terms": { - "field": "website_id", - "size": 100000, - "min_doc_count": 1 - } - } - }, - "size": 0 - }, index=self.index_name, request_timeout=30) - - non_empty_websites = [bucket["key"] for bucket in result["aggregations"]["websites"]["buckets"]] - - for website in websites: - if website not in non_empty_websites: - yield website - def refresh(self): self.es.indices.refresh(self.index_name) diff --git a/static/Hack-Regular.ttf b/static/Hack-Regular.ttf new file mode 100644 index 0000000..92a90cb Binary files /dev/null and b/static/Hack-Regular.ttf differ diff --git 
a/template_filters.py b/template_filters.py new file mode 100644 index 0000000..788e95d --- /dev/null +++ b/template_filters.py @@ -0,0 +1,34 @@ +import datetime +import time +import od_util + + +def setup_template_filters(app): + + app.jinja_env.globals.update(truncate_path=od_util.truncate_path) + app.jinja_env.globals.update(get_color=od_util.get_color) + app.jinja_env.globals.update(get_mime=od_util.get_category) + + @app.template_filter("date_format") + def date_format(value, format='%Y-%m-%d'): + return time.strftime(format, time.gmtime(value)) + + @app.template_filter("datetime_format") + def datetime_format(value, format='%Y-%m-%d %H:%M:%S'): + return time.strftime(format, time.gmtime(value)) + + @app.template_filter("duration_format") + def duration_format(value): + delay = datetime.timedelta(seconds=value) + if delay.days > 0: + out = str(delay).replace(" days, ", ":") + else: + out = str(delay) + out_ar = out.split(':') + out_ar = ["%02d" % (int(float(x))) for x in out_ar] + out = ":".join(out_ar) + return out + + @app.template_filter("from_timestamp") + def from_timestamp(value): + return datetime.datetime.fromtimestamp(value) diff --git a/templates/admin.html b/templates/admin.html index 4681097..a6be27f 100644 --- a/templates/admin.html +++ b/templates/admin.html @@ -16,7 +16,7 @@ {% if show_captcha %} - {{ recaptcha.get_code()|safe }} + {{ captcha.get_code()|safe }} {% endif %} diff --git a/templates/home.html b/templates/home.html index 8d4b27c..2638212 100644 --- a/templates/home.html +++ b/templates/home.html @@ -30,20 +30,12 @@
- {% if show_captcha %} - - - - {% else %} - - {% endif %} +
- + {% if show_captcha %} + {{ captcha.get_code()|safe }} + {% endif %} diff --git a/templates/search.html b/templates/search.html index c70acf3..bd434dc 100755 --- a/templates/search.html +++ b/templates/search.html @@ -110,21 +110,13 @@ {# Search button #}
- {% if show_captcha %} - - - - {% else %} - - {% endif %} +
+ {% if show_captcha %} + {{ captcha.get_code()|safe }} + {% endif %} @@ -280,20 +272,12 @@ //Next button function nextPage() { document.getElementById("page").value = parseInt(document.getElementById("page").value) + 1; - {% if show_captcha %} - grecaptcha.execute(); - {% else %} - document.getElementById("sfrm").submit() - {% endif %} + document.getElementById("sfrm").submit(); } function prevPage() { document.getElementById("page").value = parseInt(document.getElementById("page").value) - 1; - {% if show_captcha %} - grecaptcha.execute(); - {% else %} - document.getElementById("sfrm").submit() - {% endif %} + document.getElementById("sfrm").submit(); } diff --git a/templates/submit.html b/templates/submit.html index a09170d..55a97da 100644 --- a/templates/submit.html +++ b/templates/submit.html @@ -3,15 +3,17 @@ {% set current_page = "submit" %} {% block body %} -
+
@@ -24,16 +26,12 @@
-
- {% if show_captcha %} -
- {{ recaptcha.get_code()|safe }} -
- {% endif %} -
- + {% if show_captcha %} +
+ {{ captcha.get_code()|safe }}
-
+ {% endif %} +
@@ -41,18 +39,17 @@ {# Bulk #}
- +
-
- {% if show_captcha %} -
- {{ recaptcha.get_code()|safe }} -
- {% endif %} -
- + {% if show_captcha %} +
+ {{ captcha.get_code()|safe }}
-
+ {% endif %} + +
@@ -71,26 +68,26 @@
Queued websites
- - - - - - - - +
UrlPriorityTask type
+ + + + + + + - - {% for task in queue %} - - - - - - {% endfor %} - + + {% for task in queue %} + + + + + + {% endfor %} + -
UrlPriorityTask type
{{ task.url | truncate(70)}}{{ task.priority }}{{ task.callback_type if task.callback_type else "NORMAL" }}
{{ task.url | truncate(70) }}{{ task.priority }}{{ task.callback_type if task.callback_type else "NORMAL" }}
+
diff --git a/views.py b/views.py new file mode 100644 index 0000000..58be38d --- /dev/null +++ b/views.py @@ -0,0 +1,422 @@ +import json +import os +from multiprocessing.pool import Pool +from urllib.parse import urlparse + +from flask import render_template, redirect, request, flash, abort, Response, session +from flask_caching import Cache + +import config +import od_util +from common import db, taskManager, searchEngine, logger, require_role +from database import Task, Website +from search.search import InvalidQueryException +import captcha + + +def setup_views(app): + cache = Cache(app, config={'CACHE_TYPE': 'simple'}) + + @app.route("/dl") + @cache.cached(120) + def downloads(): + # Get content of downloads directory + dl_dir = "static/downloads/" + dir_content = os.listdir(dl_dir) + + # Make paths relative to working directory + # Only allow csv files + files = [ + (name, os.path.join(dl_dir, name)) + for name in dir_content + if name.find(".csv") != -1 + ] + + # Stat files + # Remove any dirs placed accidentally + files = [ + (f, full, os.stat(full)) + for f, full in files + if os.path.isfile(full) + ] + + if len(files) == 0: + logger.warning("No export file to display in /dl") + + return render_template("downloads.html", export_file_stats=files) + + @app.route("/stats") + @cache.cached(120) + def stats_page(): + crawl_server_stats = db.get_stats_by_crawler() + return render_template("stats.html", crawl_server_stats=crawl_server_stats) + + @app.route("/stats/json_chart") + @cache.cached(240) + def stats_json(): + stats = searchEngine.get_global_stats() + if stats: + db.join_website_on_stats(stats) + return Response(json.dumps(stats), mimetype="application/json") + return abort(500) + + @app.route("/website//") + def website_info(website_id): + website = db.get_website_by_id(website_id) + + if website: + return render_template("website.html", website=website) + else: + abort(404) + + @app.route("/website//json_chart") + @cache.memoize(60) + def website_json_chart(website_id): + website = db.get_website_by_id(website_id) + + if website: + stats = searchEngine.get_stats(website_id) + stats["base_url"] = website.url + stats["report_time"] = website.last_modified + return Response(json.dumps(stats), mimetype="application/json") + else: + abort(404) + + @app.route("/website//links") + def website_links(website_id): + website = db.get_website_by_id(website_id) + + if website: + links = searchEngine.get_link_list(website_id, website.url) + return Response("\n".join(links), mimetype="text/plain") + else: + abort(404) + + @app.route("/website/") + def websites(): + page = int(request.args.get("p")) if "p" in request.args else 0 + url = request.args.get("url") if "url" in request.args else "" + if url: + parsed_url = urlparse(url) + if parsed_url.scheme: + search_term = (parsed_url.scheme + "://" + parsed_url.netloc) + else: + flash("Sorry, I was not able to parse this url format. 
" + "Make sure you include the appropriate scheme (http/https/ftp)", "warning") + search_term = "" + else: + search_term = url + + return render_template("websites.html", + websites=db.get_websites(50, page, search_term), + p=page, url=search_term, per_page=50) + + @app.route("/website/random") + def random_website(): + return redirect("/website/" + str(db.get_random_website_id())) + + @app.route("/website//clear") + def admin_clear_website(website_id): + require_role("admin") + + searchEngine.delete_docs(website_id) + flash("Cleared all documents associated with this website", "success") + return redirect("/website/" + str(website_id)) + + @app.route("/website//delete") + def admin_delete_website(website_id): + require_role("admin") + + searchEngine.delete_docs(website_id) + db.delete_website(website_id) + flash("Deleted website " + str(website_id), "success") + return redirect("/website/") + + @app.route("/website//rescan") + def admin_rescan_website(website_id): + require_role("admin") + website = db.get_website_by_id(website_id) + + if website: + priority = request.args.get("priority") if "priority" in request.args else 1 + task = Task(website_id, website.url, priority) + taskManager.queue_task(task) + + flash("Enqueued rescan task", "success") + else: + flash("Website does not exist", "danger") + return redirect("/website/" + str(website_id)) + + @app.route("/search") + def search(): + q = request.args.get("q") if "q" in request.args else "" + sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score" + + page = request.args.get("p") if "p" in request.args else "0" + page = int(page) if page.isdigit() else 0 + + per_page = request.args.get("per_page") if "per_page" in request.args else "50" + per_page = int(per_page) if per_page.isdigit() else "50" + per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50 + + extensions = request.args.get("ext") if "ext" in request.args else None + extensions = [ext.strip().strip(".").lower() for ext in extensions.split(",")] if extensions else [] + + size_min = request.args.get("size_min") if "size_min" in request.args else "size_min" + size_min = int(size_min) if size_min.isdigit() else 0 + size_max = request.args.get("size_max") if "size_max" in request.args else "size_max" + size_max = int(size_max) if size_max.isdigit() else 0 + + date_min = request.args.get("date_min") if "date_min" in request.args else "date_min" + date_min = int(date_min) if date_min.isdigit() else 0 + date_max = request.args.get("date_max") if "date_max" in request.args else "date_max" + date_max = int(date_max) if date_max.isdigit() else 0 + + match_all = "all" in request.args + + field_name = "field_name" in request.args + field_trigram = "field_trigram" in request.args + field_path = "field_path" in request.args + + if not field_name and not field_trigram and not field_path: + # If no fields are selected, search in all + field_name = field_path = field_trigram = True + + fields = [] + if field_path: + fields.append("path") + if field_name: + fields.append("name^5") + if field_trigram: + fields.append("name.nGram^2") + + if len(q) >= 3: + + blocked = False + hits = None + if not config.CAPTCHA_SEARCH or captcha.verify(): + + try: + hits = searchEngine.search(q, page, per_page, sort_order, + extensions, size_min, size_max, match_all, fields, date_min, date_max) + hits = db.join_website_on_search_result(hits) + except InvalidQueryException as e: + flash("Invalid query: " + str(e), "warning") + blocked = True + except: + flash("Query 
failed, this could mean that the search server is overloaded or is not reachable. " + "Please try again later", "danger") + + results = hits["hits"]["total"] if hits else -1 + took = hits["took"] if hits else -1 + forwarded_for = request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None + + logger.info("SEARCH '{}' [res={}, t={}, p={}x{}, ext={}] by {}{}" + .format(q, results, took, page, per_page, str(extensions), + request.remote_addr, "_" + forwarded_for if forwarded_for else "")) + + db.log_search(request.remote_addr, forwarded_for, q, extensions, page, blocked, results, took) + if blocked: + return redirect("/search") + else: + flash("Error: Invalid captcha please try again", "danger") + + else: + hits = None + + return render_template("search.html", + results=hits, + q=q, + p=page, per_page=per_page, + sort_order=sort_order, + results_set=config.RESULTS_PER_PAGE, + extensions=",".join(extensions), + size_min=size_min, size_max=size_max, + match_all=match_all, + field_trigram=field_trigram, field_path=field_path, field_name=field_name, + date_min=date_min, date_max=date_max, + show_captcha=config.CAPTCHA_SEARCH, captcha=captcha) + + @app.route("/contribute") + @cache.cached(600) + def contribute(): + return render_template("contribute.html") + + @app.route("/") + def home(): + try: + stats = searchEngine.get_global_stats() + stats["website_count"] = len(db.get_all_websites()) + except: + stats = {} + return render_template("home.html", stats=stats, + show_captcha=config.CAPTCHA_SEARCH, captcha=captcha) + + @app.route("/submit") + def submit(): + queued_websites = taskManager.get_queued_tasks()[:30] + return render_template("submit.html", queue=queued_websites, captcha=captcha, + show_captcha=config.CAPTCHA_SUBMIT) + + def try_enqueue(url): + url = os.path.join(url, "") + url = od_util.get_top_directory(url) + + if not od_util.is_valid_url(url): + return "Error: Invalid url. Make sure to include the appropriate scheme.", "warning" + + website = db.get_website_by_url(url) + if website: + return "Website already exists", "danger" + + website = db.website_exists(url) + if website: + return "A parent directory of this url has already been posted", "danger" + + if db.is_blacklisted(url): + return "Error: " \ + "Sorry, this website has been blacklisted. If you think " \ + "this is an error, please contact me.", "danger" + + if not od_util.is_od(url): + return "Error:" \ + "The anti-spam algorithm determined that the submitted url is not " \ + "an open directory or the server is not responding. 
If you think " \ + "this is an error, please contact me.", "danger" + + website_id = db.insert_website(Website(url, str(request.remote_addr + "_" + + request.headers.get("X-Forwarded-For", "")), + request.user_agent)) + + task = Task(website_id, url, priority=1) + taskManager.queue_task(task) + + return "The website has been added to the queue", "success" + + @app.route("/enqueue", methods=["POST"]) + def enqueue(): + if not config.CAPTCHA_SUBMIT or captcha.verify(): + + url = os.path.join(request.form.get("url"), "") + message, msg_type = try_enqueue(url) + flash(message, msg_type) + + return redirect("/submit") + + else: + flash("Error: Invalid captcha please try again", "danger") + return redirect("/submit") + + def check_url(url): + url = os.path.join(url, "") + try_enqueue(url) + return None + + @app.route("/enqueue_bulk", methods=["POST"]) + def enqueue_bulk(): + if not config.CAPTCHA_SUBMIT or captcha.verify(): + + urls = request.form.get("urls") + if urls: + urls = urls.split() + + if 0 < len(urls) <= 1000: # TODO: Load from config & adjust placeholder/messages? + + pool = Pool(processes=6) + pool.map(func=check_url, iterable=urls) + pool.close() + + flash("Submitted websites to the queue", "success") + + return redirect("/submit") + + else: + flash("Too few or too many urls, please submit 1-10 urls", "danger") + return redirect("/submit") + else: + flash("Too few or too many urls, please submit 1-10 urls", "danger") + return redirect("/submit") + else: + flash("Error: Invalid captcha please try again", "danger") + return redirect("/submit") + + @app.route("/admin") + def admin_login_form(): + if "username" in session: + return redirect("/dashboard") + return render_template("admin.html", captcha=captcha, show_captcha=config.CAPTCHA_LOGIN) + + @app.route("/login", methods=["POST"]) + def admin_login(): + if not config.CAPTCHA_LOGIN or captcha.verify(): + + username = request.form.get("username") + password = request.form.get("password") + + if db.check_login(username, password): + session["username"] = username + flash("Logged in", "success") + return redirect("/dashboard") + + flash("Invalid username/password combo", "danger") + return redirect("/admin") + + else: + flash("Invalid captcha", "danger") + return redirect("/admin") + + @app.route("/logout") + def admin_logout(): + session.clear() + flash("Logged out", "info") + return redirect("/") + + @app.route("/dashboard") + def admin_dashboard(): + require_role("admin") + tokens = db.get_tokens() + blacklist = db.get_blacklist() + + return render_template("dashboard.html", api_tokens=tokens, blacklist=blacklist) + + @app.route("/blacklist/add", methods=["POST"]) + def admin_blacklist_add(): + require_role("admin") + url = request.form.get("url") + db.add_blacklist_website(url) + flash("Added item to blacklist", "success") + return redirect("/dashboard") + + @app.route("/blacklist//delete") + def admin_blacklist_remove(blacklist_id): + require_role("admin") + db.remove_blacklist_website(blacklist_id) + flash("Removed blacklist item", "success") + return redirect("/dashboard") + + @app.route("/generate_token", methods=["POST"]) + def admin_generate_token(): + require_role("admin") + description = request.form.get("description") + + db.generate_api_token(description) + flash("Generated API token", "success") + + return redirect("/dashboard") + + @app.route("/del_token", methods=["POST"]) + def admin_del_token(): + require_role("admin") + token = request.form.get("token") + + db.delete_token(token) + flash("Deleted API token", 
"success") + return redirect("/dashboard") + + # TODO: pages scrolling + @app.route("/logs", methods=["GET"]) + def admin_crawl_logs(): + require_role("admin") + results = db.get_crawl_logs() + + return render_template("crawl_logs.html", logs=results)