mirror of https://github.com/simon987/od-database.git
Added API commands

parent 8a19fa0ce7
commit 0227684a53
app.py (78 changed lines)

@@ -334,8 +334,9 @@ def home():
 
 @app.route("/submit")
 def submit():
-    queued_websites = taskManager.get_queued_tasks()
-    return render_template("submit.html", queue=queued_websites, recaptcha=recaptcha, show_captcha=config.CAPTCHA_SUBMIT)
+    queued_websites = taskManager.get_queued_tasks()[30:]
+    return render_template("submit.html", queue=queued_websites, recaptcha=recaptcha,
+                           show_captcha=config.CAPTCHA_SUBMIT)
 
 
 def try_enqueue(url):
@@ -364,9 +365,11 @@ def try_enqueue(url):
                "an open directory or the server is not responding. If you think " \
                "this is an error, please <a href='/contribute'>contact me</a>.", "danger"
 
-    web_id = db.insert_website(Website(url, str(request.remote_addr), str(request.user_agent)))
+    website_id = db.insert_website(Website(url, str(request.remote_addr + "|" +
+                                                     request.headers.get("X-Forwarded-For", "")),
+                                           request.user_agent))
 
-    task = Task(web_id, url, priority=1)
+    task = Task(website_id, url, priority=1)
     taskManager.queue_task(task)
 
     return "The website has been added to the queue", "success"
@@ -582,5 +585,72 @@ def api_complete_task():
         return "No such task"
 
 
+@app.route("/api/website/by_url", methods=["GET"])
+def api_website_by_url():
+    token = request.args.get("token")
+    url = request.args.get("url")
+    name = db.check_api_token(token)
+
+    if name:
+        website = db.get_website_by_url(url)
+        if website:
+            return str(website.id)
+        return abort(404)
+    else:
+        return abort(403)
+
+
+@app.route("/api/website/blacklisted", methods=["GET"])
+def api_website_is_blacklisted():
+    token = request.args.get("token")
+    url = request.args.get("url")
+    name = db.check_api_token(token)
+
+    if name:
+        return str(db.is_blacklisted(url))
+    else:
+        return abort(403)
+
+
+@app.route("/api/website/add", methods=["GET"])
+def api_add_website():
+    token = request.args.get("token")
+    url = request.args.get("url")
+
+    name = db.check_api_token(token)
+    if name:
+
+        website_id = db.insert_website(Website(url, str(request.remote_addr + "|" +
+                                                         request.headers.get("X-Forwarded-For", "")),
+                                               "API_CLIENT_" + name))
+        return str(website_id)
+    else:
+        return abort(403)
+
+
+@app.route("/api/task/enqueue", methods=["POST"])
+def api_task_enqueue():
+    try:
+        token = request.json["token"]
+    except KeyError:
+        return abort(400)
+
+    name = db.check_api_token(token)
+
+    if name:
+
+        task = Task(
+            request.json["website_id"],
+            request.json["url"],
+            request.json["priority"],
+            request.json["callback_type"],
+            request.json["callback_args"]
+        )
+        taskManager.queue_task(task)
+        return ""
+    else:
+        return abort(403)
+
+
 if __name__ == '__main__':
     app.run("0.0.0.0", port=12345, threaded=True)
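For reference, a minimal client sketch for the three new GET endpoints added above. It is not part of the commit; the base URL and token are placeholders, and the response parsing assumes the plain-text bodies returned by the routes (a website id, or "True"/"False" for the blacklist check).

import requests

BASE = "http://localhost"   # assumed od-database instance
TOKEN = "<api-token>"       # assumed token registered via db.check_api_token()

def website_id_by_url(url):
    # /api/website/by_url returns the website id as plain text; 404 if unknown, 403 on a bad token
    r = requests.get(BASE + "/api/website/by_url", params={"token": TOKEN, "url": url})
    return int(r.text) if r.status_code == 200 else None

def website_is_blacklisted(url):
    # /api/website/blacklisted returns the blacklist check result as plain text
    r = requests.get(BASE + "/api/website/blacklisted", params={"token": TOKEN, "url": url})
    return r.status_code == 200 and r.text == "True"

def add_website(url):
    # /api/website/add inserts the website and returns the new id as plain text
    r = requests.get(BASE + "/api/website/add", params={"token": TOKEN, "url": url})
    return int(r.text) if r.status_code == 200 else None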
debug_put.py (11 changed lines)

@@ -3,17 +3,16 @@ import json
 
 
 payload = json.dumps({
+    "token": "4eafc6ed-74b7-4f04-9d34-7f3e01201003",
     "website_id": 3,
-    # "url": "ftp://132.249.213.137",
     "url": "http://localhost:8000/",
-    # "url": "http://dlst18.xyz/
-    # dl/vip/film/",
     "priority": 2,
     "callback_type": "",
     "callback_args": "{}"
 })
 
-r = requests.post("http://localhost:5001/task/put",
-                  headers={"Content-Type": "application/json",
-                           "Authorization": "Token abc"},
+r = requests.post("http://localhost/api/task/enqueue",
+                  headers={"Content-Type": "application/json"},
                   data=payload)
+print(r)
+print(r.text)
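The updated script targets the new /api/task/enqueue route, which aborts with 400 when the JSON body has no token field, 403 when the token is not recognized, and returns an empty 200 body on success. A small sketch of checking those outcomes client-side, with placeholder URL and token:

import json
import requests

payload = json.dumps({"token": "<api-token>", "website_id": 3, "url": "http://localhost:8000/",
                      "priority": 2, "callback_type": "", "callback_args": "{}"})

r = requests.post("http://localhost/api/task/enqueue",
                  headers={"Content-Type": "application/json"}, data=payload)
if r.status_code == 400:
    print("Request body has no token field")   # the endpoint only guards the token lookup
elif r.status_code == 403:
    print("API token was rejected")
elif r.ok:
    print("Task queued")                        # success returns an empty body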
@ -1,157 +0,0 @@
|
|||||||
import praw
|
|
||||||
from crawl_server.reddit_bot import RedditBot
|
|
||||||
from search.search import ElasticSearchEngine
|
|
||||||
from database import Database, Website
|
|
||||||
import od_util
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
|
|
||||||
chars_to_remove_from_comment = re.compile("[\[\]\\\()]+")
|
|
||||||
reddit = praw.Reddit('opendirectories-bot',
|
|
||||||
user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)')
|
|
||||||
db = Database("db.sqlite3")
|
|
||||||
search = ElasticSearchEngine("od-database")
|
|
||||||
subreddit = reddit.subreddit("opendirectories")
|
|
||||||
# subreddit = reddit.subreddit("test")
|
|
||||||
bot = RedditBot("crawled.txt", reddit)
|
|
||||||
|
|
||||||
submissions = []
|
|
||||||
|
|
||||||
|
|
||||||
def handle_exact_repost(website_id, reddit_obj):
|
|
||||||
stats = search.get_stats(website_id)
|
|
||||||
comment = bot.get_comment({"": stats}, website_id,
|
|
||||||
"I already scanned this website on " + website.last_modified + " UTC")
|
|
||||||
print(comment)
|
|
||||||
print("Exact repost!")
|
|
||||||
bot.reply(reddit_obj, comment)
|
|
||||||
|
|
||||||
|
|
||||||
def handle_subdir_repost(website_id, reddit_obj):
|
|
||||||
|
|
||||||
website = db.get_website_by_id(website_id)
|
|
||||||
message = "I already scanned a parent directory of this website on " + website.last_modified + " UTC"
|
|
||||||
stats = db.get_website_stats(website_id)
|
|
||||||
tables = {"Parent directory:": stats}
|
|
||||||
|
|
||||||
subdir = url[len(website.url):]
|
|
||||||
subdir_stats = db.get_subdir_stats(website_id, subdir)
|
|
||||||
if subdir_stats["total_size"] <= 0:
|
|
||||||
message += " but I couldn't calculate the size of this subdirectory."
|
|
||||||
else:
|
|
||||||
tables["Subdirectory `/" + subdir + "`:"] = subdir_stats
|
|
||||||
comment = bot.get_comment(tables, website_id, message)
|
|
||||||
print(comment)
|
|
||||||
print("Subdir repost!")
|
|
||||||
bot.reply(reddit_obj, comment)
|
|
||||||
|
|
||||||
|
|
||||||
# Check comments
|
|
||||||
for comment in subreddit.comments(limit=50):
|
|
||||||
|
|
||||||
if not bot.has_crawled(comment):
|
|
||||||
text = chars_to_remove_from_comment.sub(" ", comment.body).strip()
|
|
||||||
if text.startswith("u/opendirectories-bot") or text.startswith("/u/opendirectories-bot"):
|
|
||||||
lines = text.split()
|
|
||||||
if len(lines) > 1:
|
|
||||||
url = os.path.join(lines[1], "") # Add trailing slash
|
|
||||||
scanned = db.website_has_been_scanned(url)
|
|
||||||
|
|
||||||
website = db.get_website_by_url(url)
|
|
||||||
|
|
||||||
if website:
|
|
||||||
if not scanned:
|
|
||||||
# in progress
|
|
||||||
print(url)
|
|
||||||
print("In progress")
|
|
||||||
continue
|
|
||||||
handle_exact_repost(website.id, comment)
|
|
||||||
continue
|
|
||||||
|
|
||||||
website_id = db.website_exists(url)
|
|
||||||
if website_id:
|
|
||||||
if not scanned:
|
|
||||||
print("Parent in progress")
|
|
||||||
continue
|
|
||||||
handle_subdir_repost(website_id, comment)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not od_util.is_valid_url(url):
|
|
||||||
print("Skipping reddit comment: Invalid url")
|
|
||||||
bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately it seems that the link you "
|
|
||||||
"provided: `" + url + "` is not valid. Make sure that you include the"
|
|
||||||
"'`http(s)://` prefix. \n")
|
|
||||||
continue
|
|
||||||
|
|
||||||
if od_util.is_blacklisted(url):
|
|
||||||
print("Skipping reddit comment: blacklisted")
|
|
||||||
bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately my programmer has "
|
|
||||||
"blacklisted this website. If you think that this is an error, please "
|
|
||||||
"[contact him](https://old.reddit.com/message/compose?to=Hexahedr_n)")
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not od_util.is_od(url):
|
|
||||||
print("Skipping reddit comment: Not an OD")
|
|
||||||
print(url)
|
|
||||||
bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately it seems that the link you "
|
|
||||||
"provided: `" + url + "` does not point to an open directory. This could also"
|
|
||||||
" mean that the website is not responding (in which case, feel free to retry in "
|
|
||||||
"a few minutes). If you think that this is an error, please "
|
|
||||||
"[contact my programmer](https://old.reddit.com/message/compose?to=Hexahedr_n)")
|
|
||||||
continue
|
|
||||||
|
|
||||||
web_id = db.insert_website(Website(url, "localhost", "reddit_bot"))
|
|
||||||
db.enqueue(web_id, reddit_comment_id=comment.id, priority=2) # Medium priority for reddit comments
|
|
||||||
print("Queued comment post: " + str(web_id))
|
|
||||||
|
|
||||||
|
|
||||||
# Check posts
|
|
||||||
for submission in subreddit.new(limit=3):
|
|
||||||
submissions.append(submission)
|
|
||||||
|
|
||||||
|
|
||||||
for s in submissions:
|
|
||||||
|
|
||||||
if not s.is_self:
|
|
||||||
if not bot.has_crawled(s.id):
|
|
||||||
|
|
||||||
url = os.path.join(s.url, "") # add trailing slash
|
|
||||||
scanned = db.website_has_been_scanned(url)
|
|
||||||
|
|
||||||
website = db.get_website_by_url(url)
|
|
||||||
|
|
||||||
if website:
|
|
||||||
if not scanned:
|
|
||||||
print(url)
|
|
||||||
print("In progress")
|
|
||||||
continue
|
|
||||||
handle_exact_repost(website.id, s)
|
|
||||||
continue
|
|
||||||
|
|
||||||
website_id = db.website_exists(url)
|
|
||||||
if website_id:
|
|
||||||
if not scanned:
|
|
||||||
print("Parent in progress")
|
|
||||||
continue
|
|
||||||
handle_subdir_repost(website_id, s)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not od_util.is_valid_url(url):
|
|
||||||
print("Skipping reddit post: Invalid url")
|
|
||||||
bot.log_crawl(s.id)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if od_util.is_blacklisted(url):
|
|
||||||
print("Skipping reddit post: blacklisted")
|
|
||||||
bot.log_crawl(s.id)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not od_util.is_od(url):
|
|
||||||
print("Skipping reddit post: Not an OD")
|
|
||||||
print(url)
|
|
||||||
bot.log_crawl(s.id)
|
|
||||||
continue
|
|
||||||
|
|
||||||
web_id = db.insert_website(Website(url, "localhost", "reddit_bot"))
|
|
||||||
db.enqueue(web_id, reddit_post_id=s.id, priority=3) # Higher priority for reddit posts
|
|
||||||
print("Queued reddit post: " + str(web_id))
|
|
(file name not shown in this view)

@@ -1,7 +1,6 @@
 import os
 import json
 import shutil
-import sys
 from search.search import ElasticSearchEngine
 from concurrent.futures import ThreadPoolExecutor
 import requests
@@ -16,6 +15,7 @@ exts = [
     "so", "dll", "tar", "gz", "bin", "cad", "cmd", "bat", "sh", "md"
 ]
 
+
 def dump_local_filesystem(root_dir: str):
 
     docs = []
@@ -71,7 +71,7 @@ def index_file_list(path: str, website_id):
 
 
 def search(term=""):
-    requests.get("http://localhost/?&sort_order=score&per_page=100q=" + term, verify=False)
+    requests.get("http://localhost/search?q=" + term, verify=False)
     print(term)
 
 
@@ -91,7 +91,7 @@ def make_wide_filesystem(count=100000):
     os.mkdir(new_path)
 
 
-dump_local_filesystem("/mnt/")
+# dump_local_filesystem("/mnt/")
 # index_file_list("local_filesystem.json", 4)
 # random_searches(100000)
 # dump_random_files(20000 * 100000)
(template — file name not shown in this view)

@@ -12,7 +12,7 @@
     <p class="lead">{{ stats["total_count"] }} files totalling
         ~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} websites</p>
     {% else %}
-    <p class="lead">We're currently experiencing a high volume of traffic. The search function
+    <p class="lead">We are currently experiencing a high volume of traffic. The search function
         may be unresponsive.</p>
     {% endif %}
     <p></p>
@@ -96,7 +96,7 @@
     <tr>
         <th>Crawl time average</th>
         {% for server in crawl_server_stats %}
-            <td>{{ crawl_server_stats[server].time_avg|round(2) }}s per task</td>
+            <td>{{ crawl_server_stats[server].time_avg|round(2) }}s</td>
         {% endfor %}
     </tr>
     <tr>
@@ -108,7 +108,7 @@
     <tr>
         <th>Files crawled average</th>
         {% for server in crawl_server_stats %}
-            <td>{{ crawl_server_stats[server].file_count_avg | round(2) }} per task</td>
+            <td>{{ crawl_server_stats[server].file_count_avg | round(2) }}</td>
         {% endfor %}
     </tr>
 </tbody>