From 0227684a535ead707ec8ff044a99c7baebf44297 Mon Sep 17 00:00:00 2001
From: Simon
Date: Sun, 15 Jul 2018 21:21:57 -0400
Subject: [PATCH] Added API commands

---
 app.py                |  78 +++++++++++++++++++--
 debug_put.py          |  11 ++-
 queue_reddit_links.py | 157 ------------------------------------------
 stress_test.py        |   6 +-
 templates/home.html   |   2 +-
 templates/stats.html  |   4 +-
 6 files changed, 85 insertions(+), 173 deletions(-)
 delete mode 100644 queue_reddit_links.py

diff --git a/app.py b/app.py
index ed7c34a..c4021db 100644
--- a/app.py
+++ b/app.py
@@ -334,8 +334,9 @@ def home():
 
 
 @app.route("/submit")
 def submit():
-    queued_websites = taskManager.get_queued_tasks()
-    return render_template("submit.html", queue=queued_websites, recaptcha=recaptcha, show_captcha=config.CAPTCHA_SUBMIT)
+    queued_websites = taskManager.get_queued_tasks()[30:]
+    return render_template("submit.html", queue=queued_websites, recaptcha=recaptcha,
+                           show_captcha=config.CAPTCHA_SUBMIT)
 
 
 def try_enqueue(url):
@@ -364,9 +365,11 @@ def try_enqueue(url):
                "an open directory or the server is not responding. If you think " \
                "this is an error, please contact me.", "danger"
 
-    web_id = db.insert_website(Website(url, str(request.remote_addr), str(request.user_agent)))
+    website_id = db.insert_website(Website(url, str(request.remote_addr + "|" +
+                                                    request.headers.get("X-Forwarded-For", "")),
+                                           request.user_agent))
 
-    task = Task(web_id, url, priority=1)
+    task = Task(website_id, url, priority=1)
     taskManager.queue_task(task)
 
     return "The website has been added to the queue", "success"
@@ -582,5 +585,72 @@ def api_complete_task():
         return "No such task"
 
 
+@app.route("/api/website/by_url", methods=["GET"])
+def api_website_by_url():
+    token = request.args.get("token")
+    url = request.args.get("url")
+    name = db.check_api_token(token)
+
+    if name:
+        website = db.get_website_by_url(url)
+        if website:
+            return str(website.id)
+        return abort(404)
+    else:
+        return abort(403)
+
+
+@app.route("/api/website/blacklisted", methods=["GET"])
+def api_website_is_blacklisted():
+    token = request.args.get("token")
+    url = request.args.get("url")
+    name = db.check_api_token(token)
+
+    if name:
+        return str(db.is_blacklisted(url))
+    else:
+        return abort(403)
+
+
+@app.route("/api/website/add", methods=["GET"])
+def api_add_website():
+    token = request.args.get("token")
+    url = request.args.get("url")
+
+    name = db.check_api_token(token)
+    if name:
+
+        website_id = db.insert_website(Website(url, str(request.remote_addr + "|" +
+                                                        request.headers.get("X-Forwarded-For", "")),
+                                               "API_CLIENT_" + name))
+        return str(website_id)
+    else:
+        return abort(403)
+
+
+@app.route("/api/task/enqueue", methods=["POST"])
+def api_task_enqueue():
+    try:
+        token = request.json["token"]
+    except KeyError:
+        return abort(400)
+
+    name = db.check_api_token(token)
+
+    if name:
+
+        task = Task(
+            request.json["website_id"],
+            request.json["url"],
+            request.json["priority"],
+            request.json["callback_type"],
+            request.json["callback_args"]
+        )
+        taskManager.queue_task(task)
+        return ""
+    else:
+        return abort(403)
+
+
 if __name__ == '__main__':
     app.run("0.0.0.0", port=12345, threaded=True)
diff --git a/debug_put.py b/debug_put.py
index f137acb..8a2f7eb 100644
--- a/debug_put.py
+++ b/debug_put.py
@@ -3,17 +3,16 @@ import json
 
 payload = json.dumps({
+    "token": "4eafc6ed-74b7-4f04-9d34-7f3e01201003",
     "website_id": 3,
-    # "url": "ftp://132.249.213.137",
     "url": "http://localhost:8000/",
-    # "url": "http://dlst18.xyz/
-    # dl/vip/film/",
     "priority": 2,
     "callback_type": "",
     "callback_args": "{}"
 })
-r = requests.post("http://localhost:5001/task/put", - headers={"Content-Type": "application/json", - "Authorization": "Token abc"}, +r = requests.post("http://localhost/api/task/enqueue", + headers={"Content-Type": "application/json"}, data=payload) +print(r) +print(r.text) diff --git a/queue_reddit_links.py b/queue_reddit_links.py deleted file mode 100644 index c5cdb95..0000000 --- a/queue_reddit_links.py +++ /dev/null @@ -1,157 +0,0 @@ -import praw -from crawl_server.reddit_bot import RedditBot -from search.search import ElasticSearchEngine -from database import Database, Website -import od_util -import os -import re - -chars_to_remove_from_comment = re.compile("[\[\]\\\()]+") -reddit = praw.Reddit('opendirectories-bot', - user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)') -db = Database("db.sqlite3") -search = ElasticSearchEngine("od-database") -subreddit = reddit.subreddit("opendirectories") -# subreddit = reddit.subreddit("test") -bot = RedditBot("crawled.txt", reddit) - -submissions = [] - - -def handle_exact_repost(website_id, reddit_obj): - stats = search.get_stats(website_id) - comment = bot.get_comment({"": stats}, website_id, - "I already scanned this website on " + website.last_modified + " UTC") - print(comment) - print("Exact repost!") - bot.reply(reddit_obj, comment) - - -def handle_subdir_repost(website_id, reddit_obj): - - website = db.get_website_by_id(website_id) - message = "I already scanned a parent directory of this website on " + website.last_modified + " UTC" - stats = db.get_website_stats(website_id) - tables = {"Parent directory:": stats} - - subdir = url[len(website.url):] - subdir_stats = db.get_subdir_stats(website_id, subdir) - if subdir_stats["total_size"] <= 0: - message += " but I couldn't calculate the size of this subdirectory." - else: - tables["Subdirectory `/" + subdir + "`:"] = subdir_stats - comment = bot.get_comment(tables, website_id, message) - print(comment) - print("Subdir repost!") - bot.reply(reddit_obj, comment) - - -# Check comments -for comment in subreddit.comments(limit=50): - - if not bot.has_crawled(comment): - text = chars_to_remove_from_comment.sub(" ", comment.body).strip() - if text.startswith("u/opendirectories-bot") or text.startswith("/u/opendirectories-bot"): - lines = text.split() - if len(lines) > 1: - url = os.path.join(lines[1], "") # Add trailing slash - scanned = db.website_has_been_scanned(url) - - website = db.get_website_by_url(url) - - if website: - if not scanned: - # in progress - print(url) - print("In progress") - continue - handle_exact_repost(website.id, comment) - continue - - website_id = db.website_exists(url) - if website_id: - if not scanned: - print("Parent in progress") - continue - handle_subdir_repost(website_id, comment) - continue - - if not od_util.is_valid_url(url): - print("Skipping reddit comment: Invalid url") - bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately it seems that the link you " - "provided: `" + url + "` is not valid. Make sure that you include the" - "'`http(s)://` prefix. \n") - continue - - if od_util.is_blacklisted(url): - print("Skipping reddit comment: blacklisted") - bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately my programmer has " - "blacklisted this website. 
If you think that this is an error, please " - "[contact him](https://old.reddit.com/message/compose?to=Hexahedr_n)") - continue - - if not od_util.is_od(url): - print("Skipping reddit comment: Not an OD") - print(url) - bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately it seems that the link you " - "provided: `" + url + "` does not point to an open directory. This could also" - " mean that the website is not responding (in which case, feel free to retry in " - "a few minutes). If you think that this is an error, please " - "[contact my programmer](https://old.reddit.com/message/compose?to=Hexahedr_n)") - continue - - web_id = db.insert_website(Website(url, "localhost", "reddit_bot")) - db.enqueue(web_id, reddit_comment_id=comment.id, priority=2) # Medium priority for reddit comments - print("Queued comment post: " + str(web_id)) - - -# Check posts -for submission in subreddit.new(limit=3): - submissions.append(submission) - - -for s in submissions: - - if not s.is_self: - if not bot.has_crawled(s.id): - - url = os.path.join(s.url, "") # add trailing slash - scanned = db.website_has_been_scanned(url) - - website = db.get_website_by_url(url) - - if website: - if not scanned: - print(url) - print("In progress") - continue - handle_exact_repost(website.id, s) - continue - - website_id = db.website_exists(url) - if website_id: - if not scanned: - print("Parent in progress") - continue - handle_subdir_repost(website_id, s) - continue - - if not od_util.is_valid_url(url): - print("Skipping reddit post: Invalid url") - bot.log_crawl(s.id) - continue - - if od_util.is_blacklisted(url): - print("Skipping reddit post: blacklisted") - bot.log_crawl(s.id) - continue - - if not od_util.is_od(url): - print("Skipping reddit post: Not an OD") - print(url) - bot.log_crawl(s.id) - continue - - web_id = db.insert_website(Website(url, "localhost", "reddit_bot")) - db.enqueue(web_id, reddit_post_id=s.id, priority=3) # Higher priority for reddit posts - print("Queued reddit post: " + str(web_id)) diff --git a/stress_test.py b/stress_test.py index aff5d18..dec7f59 100644 --- a/stress_test.py +++ b/stress_test.py @@ -1,7 +1,6 @@ import os import json import shutil -import sys from search.search import ElasticSearchEngine from concurrent.futures import ThreadPoolExecutor import requests @@ -16,6 +15,7 @@ exts = [ "so", "dll", "tar", "gz", "bin", "cad", "cmd", "bat", "sh", "md" ] + def dump_local_filesystem(root_dir: str): docs = [] @@ -71,7 +71,7 @@ def index_file_list(path: str, website_id): def search(term=""): - requests.get("http://localhost/?&sort_order=score&per_page=100q=" + term, verify=False) + requests.get("http://localhost/search?q=" + term, verify=False) print(term) @@ -91,7 +91,7 @@ def make_wide_filesystem(count=100000): os.mkdir(new_path) -dump_local_filesystem("/mnt/") +# dump_local_filesystem("/mnt/") # index_file_list("local_filesystem.json", 4) # random_searches(100000) # dump_random_files(20000 * 100000) diff --git a/templates/home.html b/templates/home.html index df2a5fa..33881e5 100644 --- a/templates/home.html +++ b/templates/home.html @@ -12,7 +12,7 @@
         {{ stats["total_count"] }} files totalling ~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} websites
     {% else %}
-        We're currently experiencing a high volume of traffic. The search function
+        We are currently experiencing a high volume of traffic. The search function
         may be unresponsive.
     {% endif %}
diff --git a/templates/stats.html b/templates/stats.html
index b90f0c1..9f9e04f 100644
--- a/templates/stats.html
+++ b/templates/stats.html
@@ -96,7 +96,7 @@
         Crawl time average
         {% for server in crawl_server_stats %}
-            {{ crawl_server_stats[server].time_avg|round(2) }}s per task
+            {{ crawl_server_stats[server].time_avg|round(2) }}s
         {% endfor %}
@@ -108,7 +108,7 @@
         Files crawled average
         {% for server in crawl_server_stats %}
-            {{ crawl_server_stats[server].file_count_avg | round(2) }} per task
+            {{ crawl_server_stats[server].file_count_avg | round(2) }}
         {% endfor %}
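
Usage note: the endpoints introduced in app.py above can be exercised with a short client script along the following lines. This is only a sketch, not part of the patch: the base URL and the target directory URL are placeholders, the token reuses the example value from debug_put.py, and error handling is omitted. The GET routes read token and url from the query string, while /api/task/enqueue reads the same JSON shape as debug_put.py.

import requests

BASE = "http://localhost"                        # assumed server address
TOKEN = "4eafc6ed-74b7-4f04-9d34-7f3e01201003"   # placeholder token (same example value as debug_put.py)
URL = "http://example.com/files/"                # hypothetical open directory

# /api/website/add records the website and returns its numeric id as plain text
# (403 if the token is not recognized).
website_id = requests.get(BASE + "/api/website/add",
                          params={"token": TOKEN, "url": URL}).text

# /api/website/by_url looks up an existing website id (404 if unknown).
lookup = requests.get(BASE + "/api/website/by_url",
                      params={"token": TOKEN, "url": URL})

# /api/website/blacklisted returns str(db.is_blacklisted(url)) as plain text.
blacklisted = requests.get(BASE + "/api/website/blacklisted",
                           params={"token": TOKEN, "url": URL}).text

# /api/task/enqueue takes a JSON body; a missing "token" key returns 400,
# an unrecognized token returns 403.
r = requests.post(BASE + "/api/task/enqueue", json={
    "token": TOKEN,
    "website_id": int(website_id),
    "url": URL,
    "priority": 2,
    "callback_type": "",
    "callback_args": "{}"
})
print(website_id, lookup.status_code, blacklisted, r.status_code)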