Added API commands

Simon 2018-07-15 21:21:57 -04:00
parent 8a19fa0ce7
commit 0227684a53
6 changed files with 85 additions and 173 deletions

app.py

@@ -334,8 +334,9 @@ def home():
 @app.route("/submit")
 def submit():
-    queued_websites = taskManager.get_queued_tasks()
-    return render_template("submit.html", queue=queued_websites, recaptcha=recaptcha, show_captcha=config.CAPTCHA_SUBMIT)
+    queued_websites = taskManager.get_queued_tasks()[30:]
+    return render_template("submit.html", queue=queued_websites, recaptcha=recaptcha,
+                           show_captcha=config.CAPTCHA_SUBMIT)


 def try_enqueue(url):
@@ -364,9 +365,11 @@ def try_enqueue(url):
                "an open directory or the server is not responding. If you think " \
                "this is an error, please <a href='/contribute'>contact me</a>.", "danger"

-    web_id = db.insert_website(Website(url, str(request.remote_addr), str(request.user_agent)))
-    task = Task(web_id, url, priority=1)
+    website_id = db.insert_website(Website(url, str(request.remote_addr + "|" +
+                                                    request.headers.get("X-Forwarded-For", "")),
+                                           request.user_agent))
+    task = Task(website_id, url, priority=1)
     taskManager.queue_task(task)

     return "The website has been added to the queue", "success"
@@ -582,5 +585,72 @@ def api_complete_task():
         return "No such task"


+@app.route("/api/website/by_url", methods=["GET"])
+def api_website_by_url():
+    token = request.args.get("token")
+    url = request.args.get("url")
+    name = db.check_api_token(token)
+
+    if name:
+        website = db.get_website_by_url(url)
+        if website:
+            return str(website.id)
+        return abort(404)
+    else:
+        return abort(403)
+
+
+@app.route("/api/website/blacklisted", methods=["GET"])
+def api_website_is_blacklisted():
+    token = request.args.get("token")
+    url = request.args.get("url")
+    name = db.check_api_token(token)
+
+    if name:
+        return str(db.is_blacklisted(url))
+    else:
+        return abort(403)
+
+
+@app.route("/api/website/add", methods=["GET"])
+def api_add_website():
+    token = request.args.get("token")
+    url = request.args.get("url")
+    name = db.check_api_token(token)
+
+    if name:
+        website_id = db.insert_website(Website(url, str(request.remote_addr + "|" +
+                                                        request.headers.get("X-Forwarded-For", "")),
+                                               "API_CLIENT_" + name))
+        return str(website_id)
+    else:
+        return abort(403)
+
+
+@app.route("/api/task/enqueue", methods=["POST"])
+def api_task_enqueue():
+    try:
+        token = request.json["token"]
+    except KeyError:
+        return abort(400)
+
+    name = db.check_api_token(token)
+
+    if name:
+        task = Task(
+            request.json["website_id"],
+            request.json["url"],
+            request.json["priority"],
+            request.json["callback_type"],
+            request.json["callback_args"]
+        )
+        taskManager.queue_task(task)
+        return ""
+    else:
+        return abort(403)
+
+
 if __name__ == '__main__':
     app.run("0.0.0.0", port=12345, threaded=True)
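The new read-only endpoints take the token and url as query parameters and reply with plain text. A minimal sketch of how a client might call them, assuming the app is reachable on port 12345 (as in app.run() above) and that the token has been registered so db.check_api_token() accepts it:

import requests

BASE = "http://localhost:12345"  # assumed base URL; adjust if the app sits behind a proxy
TOKEN = "replace-with-a-registered-api-token"  # placeholder token

# /api/website/by_url returns the website id as plain text,
# 404 if the URL is unknown, 403 if the token is rejected.
r = requests.get(BASE + "/api/website/by_url",
                 params={"token": TOKEN, "url": "http://example.com/files/"})
print(r.status_code, r.text)

# /api/website/blacklisted returns the blacklist check result as text.
r = requests.get(BASE + "/api/website/blacklisted",
                 params={"token": TOKEN, "url": "http://example.com/files/"})
print(r.status_code, r.text)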


@@ -3,17 +3,16 @@ import json

 payload = json.dumps({
+    "token": "4eafc6ed-74b7-4f04-9d34-7f3e01201003",
     "website_id": 3,
     # "url": "ftp://132.249.213.137",
     "url": "http://localhost:8000/",
     # "url": "http://dlst18.xyz/dl/vip/film/",
     "priority": 2,
     "callback_type": "",
     "callback_args": "{}"
 })

-r = requests.post("http://localhost:5001/task/put",
-                  headers={"Content-Type": "application/json",
-                           "Authorization": "Token abc"},
+r = requests.post("http://localhost/api/task/enqueue",
+                  headers={"Content-Type": "application/json"},
                   data=payload)
 print(r)
 print(r.text)
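Building on the test script above, a possible end-to-end flow is to register a website through /api/website/add and feed the returned id into /api/task/enqueue. This is only a sketch: the base URL and the example directory URL are assumptions, and the token must be one that db.check_api_token() recognizes.

import json
import requests

BASE = "http://localhost:12345"  # assumed; the test above goes through http://localhost instead
TOKEN = "4eafc6ed-74b7-4f04-9d34-7f3e01201003"  # token from the payload above; must exist in the database
url = "http://example.com/files/"  # hypothetical open directory

# /api/website/add returns the new website id as plain text
website_id = int(requests.get(BASE + "/api/website/add",
                              params={"token": TOKEN, "url": url}).text)

# /api/task/enqueue expects a JSON body and returns an empty body on success
r = requests.post(BASE + "/api/task/enqueue",
                  headers={"Content-Type": "application/json"},
                  data=json.dumps({
                      "token": TOKEN,
                      "website_id": website_id,
                      "url": url,
                      "priority": 2,
                      "callback_type": "",
                      "callback_args": "{}"
                  }))
print(r.status_code)  # 403 if the token is rejected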


@@ -1,157 +0,0 @@
import praw
from crawl_server.reddit_bot import RedditBot
from search.search import ElasticSearchEngine
from database import Database, Website
import od_util
import os
import re

chars_to_remove_from_comment = re.compile("[\[\]\\\()]+")

reddit = praw.Reddit('opendirectories-bot',
                     user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)')
db = Database("db.sqlite3")
search = ElasticSearchEngine("od-database")
subreddit = reddit.subreddit("opendirectories")
# subreddit = reddit.subreddit("test")
bot = RedditBot("crawled.txt", reddit)

submissions = []


def handle_exact_repost(website_id, reddit_obj):
    stats = search.get_stats(website_id)
    comment = bot.get_comment({"": stats}, website_id,
                              "I already scanned this website on " + website.last_modified + " UTC")
    print(comment)
    print("Exact repost!")
    bot.reply(reddit_obj, comment)


def handle_subdir_repost(website_id, reddit_obj):
    website = db.get_website_by_id(website_id)
    message = "I already scanned a parent directory of this website on " + website.last_modified + " UTC"
    stats = db.get_website_stats(website_id)
    tables = {"Parent directory:": stats}

    subdir = url[len(website.url):]
    subdir_stats = db.get_subdir_stats(website_id, subdir)
    if subdir_stats["total_size"] <= 0:
        message += " but I couldn't calculate the size of this subdirectory."
    else:
        tables["Subdirectory `/" + subdir + "`:"] = subdir_stats
    comment = bot.get_comment(tables, website_id, message)
    print(comment)
    print("Subdir repost!")
    bot.reply(reddit_obj, comment)


# Check comments
for comment in subreddit.comments(limit=50):
    if not bot.has_crawled(comment):
        text = chars_to_remove_from_comment.sub(" ", comment.body).strip()
        if text.startswith("u/opendirectories-bot") or text.startswith("/u/opendirectories-bot"):
            lines = text.split()
            if len(lines) > 1:
                url = os.path.join(lines[1], "")  # Add trailing slash
                scanned = db.website_has_been_scanned(url)
                website = db.get_website_by_url(url)

                if website:
                    if not scanned:
                        # in progress
                        print(url)
                        print("In progress")
                        continue
                    handle_exact_repost(website.id, comment)
                    continue

                website_id = db.website_exists(url)
                if website_id:
                    if not scanned:
                        print("Parent in progress")
                        continue
                    handle_subdir_repost(website_id, comment)
                    continue

                if not od_util.is_valid_url(url):
                    print("Skipping reddit comment: Invalid url")
                    bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately it seems that the link you "
                                       "provided: `" + url + "` is not valid. Make sure that you include the"
                                       "'`http(s)://` prefix. \n")
                    continue

                if od_util.is_blacklisted(url):
                    print("Skipping reddit comment: blacklisted")
                    bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately my programmer has "
                                       "blacklisted this website. If you think that this is an error, please "
                                       "[contact him](https://old.reddit.com/message/compose?to=Hexahedr_n)")
                    continue

                if not od_util.is_od(url):
                    print("Skipping reddit comment: Not an OD")
                    print(url)
                    bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately it seems that the link you "
                                       "provided: `" + url + "` does not point to an open directory. This could also"
                                       " mean that the website is not responding (in which case, feel free to retry in "
                                       "a few minutes). If you think that this is an error, please "
                                       "[contact my programmer](https://old.reddit.com/message/compose?to=Hexahedr_n)")
                    continue

                web_id = db.insert_website(Website(url, "localhost", "reddit_bot"))
                db.enqueue(web_id, reddit_comment_id=comment.id, priority=2)  # Medium priority for reddit comments
                print("Queued comment post: " + str(web_id))

# Check posts
for submission in subreddit.new(limit=3):
    submissions.append(submission)

for s in submissions:
    if not s.is_self:
        if not bot.has_crawled(s.id):
            url = os.path.join(s.url, "")  # add trailing slash
            scanned = db.website_has_been_scanned(url)
            website = db.get_website_by_url(url)

            if website:
                if not scanned:
                    print(url)
                    print("In progress")
                    continue
                handle_exact_repost(website.id, s)
                continue

            website_id = db.website_exists(url)
            if website_id:
                if not scanned:
                    print("Parent in progress")
                    continue
                handle_subdir_repost(website_id, s)
                continue

            if not od_util.is_valid_url(url):
                print("Skipping reddit post: Invalid url")
                bot.log_crawl(s.id)
                continue

            if od_util.is_blacklisted(url):
                print("Skipping reddit post: blacklisted")
                bot.log_crawl(s.id)
                continue

            if not od_util.is_od(url):
                print("Skipping reddit post: Not an OD")
                print(url)
                bot.log_crawl(s.id)
                continue

            web_id = db.insert_website(Website(url, "localhost", "reddit_bot"))
            db.enqueue(web_id, reddit_post_id=s.id, priority=3)  # Higher priority for reddit posts
            print("Queued reddit post: " + str(web_id))


@@ -1,7 +1,6 @@
 import os
 import json
 import shutil
-import sys
 from search.search import ElasticSearchEngine
 from concurrent.futures import ThreadPoolExecutor
 import requests
@@ -16,6 +15,7 @@ exts = [
     "so", "dll", "tar", "gz", "bin", "cad", "cmd", "bat", "sh", "md"
 ]


 def dump_local_filesystem(root_dir: str):
     docs = []
@@ -71,7 +71,7 @@ def index_file_list(path: str, website_id):

 def search(term=""):
-    requests.get("http://localhost/?&sort_order=score&per_page=100q=" + term, verify=False)
+    requests.get("http://localhost/search?q=" + term, verify=False)
     print(term)
@@ -91,7 +91,7 @@ def make_wide_filesystem(count=100000):
         os.mkdir(new_path)


-dump_local_filesystem("/mnt/")
+# dump_local_filesystem("/mnt/")
 # index_file_list("local_filesystem.json", 4)
 # random_searches(100000)
 # dump_random_files(20000 * 100000)


@@ -12,7 +12,7 @@
         <p class="lead">{{ stats["total_count"] }} files totalling
             ~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} websites</p>
     {% else %}
-        <p class="lead">We're currently experiencing a high volume of traffic. The search function
+        <p class="lead">We are currently experiencing a high volume of traffic. The search function
             may be unresponsive.</p>
     {% endif %}
     <p></p>


@@ -96,7 +96,7 @@
     <tr>
         <th>Crawl time average</th>
         {% for server in crawl_server_stats %}
-            <td>{{ crawl_server_stats[server].time_avg|round(2) }}s per task</td>
+            <td>{{ crawl_server_stats[server].time_avg|round(2) }}s</td>
        {% endfor %}
    </tr>
    <tr>
@@ -108,7 +108,7 @@
     <tr>
         <th>Files crawled average</th>
         {% for server in crawl_server_stats %}
-            <td>{{ crawl_server_stats[server].file_count_avg | round(2) }} per task</td>
+            <td>{{ crawl_server_stats[server].file_count_avg | round(2) }}</td>
        {% endfor %}
    </tr>
 </tbody>