From 0227684a535ead707ec8ff044a99c7baebf44297 Mon Sep 17 00:00:00 2001
From: Simon
Date: Sun, 15 Jul 2018 21:21:57 -0400
Subject: [PATCH] Added API commands

---
 app.py                |  78 +++++++++++++++++++--
 debug_put.py          |  11 ++-
 queue_reddit_links.py | 157 ------------------------------------------
 stress_test.py        |   6 +-
 templates/home.html   |   2 +-
 templates/stats.html  |   4 +-
 6 files changed, 85 insertions(+), 173 deletions(-)
 delete mode 100644 queue_reddit_links.py

diff --git a/app.py b/app.py
index ed7c34a..c4021db 100644
--- a/app.py
+++ b/app.py
@@ -334,8 +334,9 @@ def home():
 
 
 @app.route("/submit")
 def submit():
-    queued_websites = taskManager.get_queued_tasks()
-    return render_template("submit.html", queue=queued_websites, recaptcha=recaptcha, show_captcha=config.CAPTCHA_SUBMIT)
+    queued_websites = taskManager.get_queued_tasks()[30:]
+    return render_template("submit.html", queue=queued_websites, recaptcha=recaptcha,
+                           show_captcha=config.CAPTCHA_SUBMIT)
 
 
 def try_enqueue(url):
@@ -364,9 +365,11 @@ def try_enqueue(url):
                "an open directory or the server is not responding. If you think " \
                "this is an error, please contact me.", "danger"
 
-    web_id = db.insert_website(Website(url, str(request.remote_addr), str(request.user_agent)))
+    website_id = db.insert_website(Website(url, str(request.remote_addr + "|" +
+                                                    request.headers.get("X-Forwarded-For", "")),
+                                           request.user_agent))
 
-    task = Task(web_id, url, priority=1)
+    task = Task(website_id, url, priority=1)
     taskManager.queue_task(task)
 
     return "The website has been added to the queue", "success"
@@ -582,5 +585,72 @@ def api_complete_task():
         return "No such task"
 
 
+@app.route("/api/website/by_url", methods=["GET"])
+def api_website_by_url():
+    token = request.args.get("token")
+    url = request.args.get("url")
+    name = db.check_api_token(token)
+
+    if name:
+        website = db.get_website_by_url(url)
+        if website:
+            return str(website.id)
+        return abort(404)
+    else:
+        return abort(403)
+
+
+@app.route("/api/website/blacklisted", methods=["GET"])
+def api_website_is_blacklisted():
+    token = request.args.get("token")
+    url = request.args.get("url")
+    name = db.check_api_token(token)
+
+    if name:
+        return str(db.is_blacklisted(url))
+    else:
+        return abort(403)
+
+
+@app.route("/api/website/add", methods=["GET"])
+def api_add_website():
+    token = request.args.get("token")
+    url = request.args.get("url")
+
+    name = db.check_api_token(token)
+    if name:
+
+        website_id = db.insert_website(Website(url, str(request.remote_addr + "|" +
+                                                        request.headers.get("X-Forwarded-For", "")),
+                                               "API_CLIENT_" + name))
+        return str(website_id)
+    else:
+        return abort(403)
+
+
+@app.route("/api/task/enqueue", methods=["POST"])
+def api_task_enqueue():
+    try:
+        token = request.json["token"]
+    except KeyError:
+        return abort(400)
+
+    name = db.check_api_token(token)
+
+    if name:
+
+        task = Task(
+            request.json["website_id"],
+            request.json["url"],
+            request.json["priority"],
+            request.json["callback_type"],
+            request.json["callback_args"]
+        )
+        taskManager.queue_task(task)
+        return ""
+    else:
+        return abort(403)
+
+
 if __name__ == '__main__':
     app.run("0.0.0.0", port=12345, threaded=True)
diff --git a/debug_put.py b/debug_put.py
index f137acb..8a2f7eb 100644
--- a/debug_put.py
+++ b/debug_put.py
@@ -3,17 +3,16 @@ import json
 
 payload = json.dumps({
+    "token": "4eafc6ed-74b7-4f04-9d34-7f3e01201003",
     "website_id": 3,
-    # "url": "ftp://132.249.213.137",
     "url": "http://localhost:8000/",
-    # "url": "http://dlst18.xyz/
-    # dl/vip/film/",
     "priority": 2,
     "callback_type": "",
     "callback_args": "{}"
 })
-r = requests.post("http://localhost:5001/task/put", - headers={"Content-Type": "application/json", - "Authorization": "Token abc"}, +r = requests.post("http://localhost/api/task/enqueue", + headers={"Content-Type": "application/json"}, data=payload) +print(r) +print(r.text) diff --git a/queue_reddit_links.py b/queue_reddit_links.py deleted file mode 100644 index c5cdb95..0000000 --- a/queue_reddit_links.py +++ /dev/null @@ -1,157 +0,0 @@ -import praw -from crawl_server.reddit_bot import RedditBot -from search.search import ElasticSearchEngine -from database import Database, Website -import od_util -import os -import re - -chars_to_remove_from_comment = re.compile("[\[\]\\\()]+") -reddit = praw.Reddit('opendirectories-bot', - user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)') -db = Database("db.sqlite3") -search = ElasticSearchEngine("od-database") -subreddit = reddit.subreddit("opendirectories") -# subreddit = reddit.subreddit("test") -bot = RedditBot("crawled.txt", reddit) - -submissions = [] - - -def handle_exact_repost(website_id, reddit_obj): - stats = search.get_stats(website_id) - comment = bot.get_comment({"": stats}, website_id, - "I already scanned this website on " + website.last_modified + " UTC") - print(comment) - print("Exact repost!") - bot.reply(reddit_obj, comment) - - -def handle_subdir_repost(website_id, reddit_obj): - - website = db.get_website_by_id(website_id) - message = "I already scanned a parent directory of this website on " + website.last_modified + " UTC" - stats = db.get_website_stats(website_id) - tables = {"Parent directory:": stats} - - subdir = url[len(website.url):] - subdir_stats = db.get_subdir_stats(website_id, subdir) - if subdir_stats["total_size"] <= 0: - message += " but I couldn't calculate the size of this subdirectory." - else: - tables["Subdirectory `/" + subdir + "`:"] = subdir_stats - comment = bot.get_comment(tables, website_id, message) - print(comment) - print("Subdir repost!") - bot.reply(reddit_obj, comment) - - -# Check comments -for comment in subreddit.comments(limit=50): - - if not bot.has_crawled(comment): - text = chars_to_remove_from_comment.sub(" ", comment.body).strip() - if text.startswith("u/opendirectories-bot") or text.startswith("/u/opendirectories-bot"): - lines = text.split() - if len(lines) > 1: - url = os.path.join(lines[1], "") # Add trailing slash - scanned = db.website_has_been_scanned(url) - - website = db.get_website_by_url(url) - - if website: - if not scanned: - # in progress - print(url) - print("In progress") - continue - handle_exact_repost(website.id, comment) - continue - - website_id = db.website_exists(url) - if website_id: - if not scanned: - print("Parent in progress") - continue - handle_subdir_repost(website_id, comment) - continue - - if not od_util.is_valid_url(url): - print("Skipping reddit comment: Invalid url") - bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately it seems that the link you " - "provided: `" + url + "` is not valid. Make sure that you include the" - "'`http(s)://` prefix. \n") - continue - - if od_util.is_blacklisted(url): - print("Skipping reddit comment: blacklisted") - bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately my programmer has " - "blacklisted this website. 
If you think that this is an error, please " - "[contact him](https://old.reddit.com/message/compose?to=Hexahedr_n)") - continue - - if not od_util.is_od(url): - print("Skipping reddit comment: Not an OD") - print(url) - bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately it seems that the link you " - "provided: `" + url + "` does not point to an open directory. This could also" - " mean that the website is not responding (in which case, feel free to retry in " - "a few minutes). If you think that this is an error, please " - "[contact my programmer](https://old.reddit.com/message/compose?to=Hexahedr_n)") - continue - - web_id = db.insert_website(Website(url, "localhost", "reddit_bot")) - db.enqueue(web_id, reddit_comment_id=comment.id, priority=2) # Medium priority for reddit comments - print("Queued comment post: " + str(web_id)) - - -# Check posts -for submission in subreddit.new(limit=3): - submissions.append(submission) - - -for s in submissions: - - if not s.is_self: - if not bot.has_crawled(s.id): - - url = os.path.join(s.url, "") # add trailing slash - scanned = db.website_has_been_scanned(url) - - website = db.get_website_by_url(url) - - if website: - if not scanned: - print(url) - print("In progress") - continue - handle_exact_repost(website.id, s) - continue - - website_id = db.website_exists(url) - if website_id: - if not scanned: - print("Parent in progress") - continue - handle_subdir_repost(website_id, s) - continue - - if not od_util.is_valid_url(url): - print("Skipping reddit post: Invalid url") - bot.log_crawl(s.id) - continue - - if od_util.is_blacklisted(url): - print("Skipping reddit post: blacklisted") - bot.log_crawl(s.id) - continue - - if not od_util.is_od(url): - print("Skipping reddit post: Not an OD") - print(url) - bot.log_crawl(s.id) - continue - - web_id = db.insert_website(Website(url, "localhost", "reddit_bot")) - db.enqueue(web_id, reddit_post_id=s.id, priority=3) # Higher priority for reddit posts - print("Queued reddit post: " + str(web_id)) diff --git a/stress_test.py b/stress_test.py index aff5d18..dec7f59 100644 --- a/stress_test.py +++ b/stress_test.py @@ -1,7 +1,6 @@ import os import json import shutil -import sys from search.search import ElasticSearchEngine from concurrent.futures import ThreadPoolExecutor import requests @@ -16,6 +15,7 @@ exts = [ "so", "dll", "tar", "gz", "bin", "cad", "cmd", "bat", "sh", "md" ] + def dump_local_filesystem(root_dir: str): docs = [] @@ -71,7 +71,7 @@ def index_file_list(path: str, website_id): def search(term=""): - requests.get("http://localhost/?&sort_order=score&per_page=100q=" + term, verify=False) + requests.get("http://localhost/search?q=" + term, verify=False) print(term) @@ -91,7 +91,7 @@ def make_wide_filesystem(count=100000): os.mkdir(new_path) -dump_local_filesystem("/mnt/") +# dump_local_filesystem("/mnt/") # index_file_list("local_filesystem.json", 4) # random_searches(100000) # dump_random_files(20000 * 100000) diff --git a/templates/home.html b/templates/home.html index df2a5fa..33881e5 100644 --- a/templates/home.html +++ b/templates/home.html @@ -12,7 +12,7 @@
         {{ stats["total_count"] }} files totalling ~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} websites
     {% else %}
-        We're currently experiencing a high volume of traffic. The search function
+        We are currently experiencing a high volume of traffic. The search function
         may be unresponsive.
     {% endif %}
diff --git a/templates/stats.html b/templates/stats.html
index b90f0c1..9f9e04f 100644
--- a/templates/stats.html
+++ b/templates/stats.html
@@ -96,7 +96,7 @@
         Crawl time average
         {% for server in crawl_server_stats %}
-            {{ crawl_server_stats[server].time_avg|round(2) }}s per task
+            {{ crawl_server_stats[server].time_avg|round(2) }}s
         {% endfor %}
@@ -108,7 +108,7 @@
         Files crawled average
         {% for server in crawl_server_stats %}
-            {{ crawl_server_stats[server].file_count_avg | round(2) }} per task
+            {{ crawl_server_stats[server].file_count_avg | round(2) }}
         {% endfor %}
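
Usage note: the endpoints introduced in app.py above can be exercised with a short client script along the following lines. This is only a sketch, not part of the patch: the base URL and the target directory URL are placeholders, the token reuses the example value from debug_put.py, and error handling is omitted. The GET routes read token and url from the query string, while /api/task/enqueue reads the same JSON shape as debug_put.py.

import requests

BASE = "http://localhost"                        # assumed server address
TOKEN = "4eafc6ed-74b7-4f04-9d34-7f3e01201003"   # placeholder token (same example value as debug_put.py)
URL = "http://example.com/files/"                # hypothetical open directory

# /api/website/add records the website and returns its numeric id as plain text
# (403 if the token is not recognized).
website_id = requests.get(BASE + "/api/website/add",
                          params={"token": TOKEN, "url": URL}).text

# /api/website/by_url looks up an existing website id (404 if unknown).
lookup = requests.get(BASE + "/api/website/by_url",
                      params={"token": TOKEN, "url": URL})

# /api/website/blacklisted returns str(db.is_blacklisted(url)) as plain text.
blacklisted = requests.get(BASE + "/api/website/blacklisted",
                           params={"token": TOKEN, "url": URL}).text

# /api/task/enqueue takes a JSON body; a missing "token" key returns 400,
# an unrecognized token returns 403.
r = requests.post(BASE + "/api/task/enqueue", json={
    "token": TOKEN,
    "website_id": int(website_id),
    "url": URL,
    "priority": 2,
    "callback_type": "",
    "callback_args": "{}"
})
print(website_id, lookup.status_code, blacklisted, r.status_code)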