mirror of https://github.com/simon987/od-database.git
synced 2025-11-03 22:46:52 +00:00
	Added API commands
This commit is contained in:
parent 8a19fa0ce7
commit 0227684a53
app.py (78 changed lines)
@@ -334,8 +334,9 @@ def home():
 
 @app.route("/submit")
 def submit():
-    queued_websites = taskManager.get_queued_tasks()
-    return render_template("submit.html", queue=queued_websites, recaptcha=recaptcha, show_captcha=config.CAPTCHA_SUBMIT)
+    queued_websites = taskManager.get_queued_tasks()[30:]
+    return render_template("submit.html", queue=queued_websites, recaptcha=recaptcha,
+                           show_captcha=config.CAPTCHA_SUBMIT)
 
 
 def try_enqueue(url):
@@ -364,9 +365,11 @@ def try_enqueue(url):
                "an open directory or the server is not responding. If you think " \
                "this is an error, please <a href='/contribute'>contact me</a>.", "danger"
 
-    web_id = db.insert_website(Website(url, str(request.remote_addr), str(request.user_agent)))
+    website_id = db.insert_website(Website(url, str(request.remote_addr + "|" +
+                                                    request.headers.get("X-Forwarded-For", "")),
+                                           request.user_agent))
 
-    task = Task(web_id, url, priority=1)
+    task = Task(website_id, url, priority=1)
     taskManager.queue_task(task)
 
     return "The website has been added to the queue", "success"
@@ -582,5 +585,72 @@ def api_complete_task():
             return "No such task"
 
 
+@app.route("/api/website/by_url", methods=["GET"])
+def api_website_by_url():
+    token = request.args.get("token")
+    url = request.args.get("url")
+    name = db.check_api_token(token)
+
+    if name:
+        website = db.get_website_by_url(url)
+        if website:
+            return str(website.id)
+        return abort(404)
+    else:
+        return abort(403)
+
+
+@app.route("/api/website/blacklisted", methods=["GET"])
+def api_website_is_blacklisted():
+    token = request.args.get("token")
+    url = request.args.get("url")
+    name = db.check_api_token(token)
+
+    if name:
+        return str(db.is_blacklisted(url))
+    else:
+        return abort(403)
+
+
+@app.route("/api/website/add", methods=["GET"])
+def api_add_website():
+    token = request.args.get("token")
+    url = request.args.get("url")
+
+    name = db.check_api_token(token)
+    if name:
+
+        website_id = db.insert_website(Website(url, str(request.remote_addr + "|" +
+                                                        request.headers.get("X-Forwarded-For", "")),
+                                               "API_CLIENT_" + name))
+        return str(website_id)
+    else:
+        return abort(403)
+
+
+@app.route("/api/task/enqueue", methods=["POST"])
+def api_task_enqueue():
+    try:
+        token = request.json["token"]
+    except KeyError:
+        return abort(400)
+
+    name = db.check_api_token(token)
+
+    if name:
+
+        task = Task(
+            request.json["website_id"],
+            request.json["url"],
+            request.json["priority"],
+            request.json["callback_type"],
+            request.json["callback_args"]
+        )
+        taskManager.queue_task(task)
+        return ""
+    else:
+        return abort(403)
+
+
 if __name__ == '__main__':
     app.run("0.0.0.0", port=12345, threaded=True)
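
Below is a minimal client sketch for the three new GET endpoints, added here for reference only; it is not part of the commit. The base URL and token value are placeholders, and the response conventions are read off the handlers above (plain-text bodies, 403 for a rejected token, 404 when no website matches).

import requests

BASE = "http://localhost"  # placeholder; wherever od-database is served
TOKEN = "00000000-0000-0000-0000-000000000000"  # placeholder API token

# /api/website/by_url returns the website id as plain text,
# 404 if no website matches the URL, 403 if the token is rejected.
r = requests.get(BASE + "/api/website/by_url",
                 params={"token": TOKEN, "url": "http://example.com/"})
if r.status_code == 200:
    website_id = int(r.text)

# /api/website/blacklisted returns str() of the database check,
# presumably "True" or "False".
r = requests.get(BASE + "/api/website/blacklisted",
                 params={"token": TOKEN, "url": "http://example.com/"})
blacklisted = r.text == "True"

# /api/website/add registers the website and returns its new id as plain
# text; "API_CLIENT_" + the token owner's name is recorded in place of a
# user agent.
r = requests.get(BASE + "/api/website/add",
                 params={"token": TOKEN, "url": "http://example.com/"})
new_id = int(r.text)

Note that these endpoints carry the token as a query parameter, so an ordinary access log on the server would capture it; the POST endpoint carries its token in the JSON body instead.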
							
								
								
									
debug_put.py (11 changed lines)
@@ -3,17 +3,16 @@ import json
 
 
 payload = json.dumps({
     "token": "4eafc6ed-74b7-4f04-9d34-7f3e01201003",
     "website_id": 3,
     # "url": "ftp://132.249.213.137",
     "url": "http://localhost:8000/",
     # "url": "http://dlst18.xyz/dl/vip/film/",
     "priority": 2,
     "callback_type": "",
     "callback_args": "{}"
 })
 
-r = requests.post("http://localhost:5001/task/put",
-                  headers={"Content-Type": "application/json",
-                           "Authorization": "Token abc"},
+r = requests.post("http://localhost/api/task/enqueue",
+                  headers={"Content-Type": "application/json"},
                   data=payload)
 print(r)
 print(r.text)
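
debug_put.py exercises the new POST endpoint end to end. As a companion sketch (again not part of the commit), the status codes implied by api_task_enqueue() can be handled like this; the token value is a placeholder:

import json
import requests

payload = json.dumps({"token": "PLACEHOLDER", "website_id": 3,
                      "url": "http://localhost:8000/", "priority": 2,
                      "callback_type": "", "callback_args": "{}"})

r = requests.post("http://localhost/api/task/enqueue",
                  headers={"Content-Type": "application/json"},
                  data=payload)

if r.status_code == 400:
    print("Bad request: the JSON body has no 'token' field")  # abort(400)
elif r.status_code == 403:
    print("Token rejected by db.check_api_token()")           # abort(403)
else:
    print("Task queued")  # the handler returns an empty body on success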
(deleted file, 157 lines; file name not shown in this view)

@@ -1,157 +0,0 @@
-import praw
-from crawl_server.reddit_bot import RedditBot
-from search.search import ElasticSearchEngine
-from database import Database, Website
-import od_util
-import os
-import re
-
-chars_to_remove_from_comment = re.compile("[\[\]\\\()]+")
-reddit = praw.Reddit('opendirectories-bot',
-                     user_agent='github.com/simon987/od-database v1.0  (by /u/Hexahedr_n)')
-db = Database("db.sqlite3")
-search = ElasticSearchEngine("od-database")
-subreddit = reddit.subreddit("opendirectories")
-# subreddit = reddit.subreddit("test")
-bot = RedditBot("crawled.txt", reddit)
-
-submissions = []
-
-
-def handle_exact_repost(website_id, reddit_obj):
-    stats = search.get_stats(website_id)
-    comment = bot.get_comment({"": stats}, website_id,
-                              "I already scanned this website on " + website.last_modified + " UTC")
-    print(comment)
-    print("Exact repost!")
-    bot.reply(reddit_obj, comment)
-
-
-def handle_subdir_repost(website_id, reddit_obj):
-
-    website = db.get_website_by_id(website_id)
-    message = "I already scanned a parent directory of this website on " + website.last_modified + " UTC"
-    stats = db.get_website_stats(website_id)
-    tables = {"Parent directory:": stats}
-
-    subdir = url[len(website.url):]
-    subdir_stats = db.get_subdir_stats(website_id, subdir)
-    if subdir_stats["total_size"] <= 0:
-        message += " but I couldn't calculate the size of this subdirectory."
-    else:
-        tables["Subdirectory `/" + subdir + "`:"] = subdir_stats
-    comment = bot.get_comment(tables, website_id, message)
-    print(comment)
-    print("Subdir repost!")
-    bot.reply(reddit_obj, comment)
-
-
-# Check comments
-for comment in subreddit.comments(limit=50):
-
-    if not bot.has_crawled(comment):
-        text = chars_to_remove_from_comment.sub(" ", comment.body).strip()
-        if text.startswith("u/opendirectories-bot") or text.startswith("/u/opendirectories-bot"):
-            lines = text.split()
-            if len(lines) > 1:
-                url = os.path.join(lines[1], "")  # Add trailing slash
-                scanned = db.website_has_been_scanned(url)
-
-                website = db.get_website_by_url(url)
-
-                if website:
-                    if not scanned:
-                        # in progress
-                        print(url)
-                        print("In progress")
-                        continue
-                    handle_exact_repost(website.id, comment)
-                    continue
-
-                website_id = db.website_exists(url)
-                if website_id:
-                    if not scanned:
-                        print("Parent in progress")
-                        continue
-                    handle_subdir_repost(website_id, comment)
-                    continue
-
-                if not od_util.is_valid_url(url):
-                    print("Skipping reddit comment: Invalid url")
-                    bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately it seems that the link you "
-                                       "provided: `" + url + "` is not valid. Make sure that you include the"
-                                       "'`http(s)://` prefix.    \n")
-                    continue
-
-                if od_util.is_blacklisted(url):
-                    print("Skipping reddit comment: blacklisted")
-                    bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately my programmer has "
-                                       "blacklisted this website. If you think that this is an error, please "
-                                       "[contact him](https://old.reddit.com/message/compose?to=Hexahedr_n)")
-                    continue
-
-                if not od_util.is_od(url):
-                    print("Skipping reddit comment: Not an OD")
-                    print(url)
-                    bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately it seems that the link you "
-                                       "provided: `" + url + "` does not point to an open directory. This could also"
-                                       " mean that the website is not responding (in which case, feel free to retry in "
-                                       "a few minutes). If you think that this is an error, please "
-                                       "[contact my programmer](https://old.reddit.com/message/compose?to=Hexahedr_n)")
-                    continue
-
-                web_id = db.insert_website(Website(url, "localhost", "reddit_bot"))
-                db.enqueue(web_id, reddit_comment_id=comment.id, priority=2)  # Medium priority for reddit comments
-                print("Queued comment post: " + str(web_id))
-
-
-# Check posts
-for submission in subreddit.new(limit=3):
-    submissions.append(submission)
-
-
-for s in submissions:
-
-    if not s.is_self:
-        if not bot.has_crawled(s.id):
-
-            url = os.path.join(s.url, "")  # add trailing slash
-            scanned = db.website_has_been_scanned(url)
-
-            website = db.get_website_by_url(url)
-
-            if website:
-                if not scanned:
-                    print(url)
-                    print("In progress")
-                    continue
-                handle_exact_repost(website.id, s)
-                continue
-
-            website_id = db.website_exists(url)
-            if website_id:
-                if not scanned:
-                    print("Parent in progress")
-                    continue
-                handle_subdir_repost(website_id, s)
-                continue
-
-            if not od_util.is_valid_url(url):
-                print("Skipping reddit post: Invalid url")
-                bot.log_crawl(s.id)
-                continue
-
-            if od_util.is_blacklisted(url):
-                print("Skipping reddit post: blacklisted")
-                bot.log_crawl(s.id)
-                continue
-
-            if not od_util.is_od(url):
-                print("Skipping reddit post: Not an OD")
-                print(url)
-                bot.log_crawl(s.id)
-                continue
-
-            web_id = db.insert_website(Website(url, "localhost", "reddit_bot"))
-            db.enqueue(web_id, reddit_post_id=s.id, priority=3)  # Higher priority for reddit posts
-            print("Queued reddit post: " + str(web_id))
(stress-test script; file name not shown in this view)

@@ -1,7 +1,6 @@
 import os
 import json
 import shutil
-import sys
 from search.search import ElasticSearchEngine
 from concurrent.futures import ThreadPoolExecutor
 import requests
@@ -16,6 +15,7 @@ exts = [
     "so", "dll", "tar", "gz", "bin", "cad", "cmd", "bat", "sh", "md"
 ]
 
+
 def dump_local_filesystem(root_dir: str):
 
     docs = []
@@ -71,7 +71,7 @@ def index_file_list(path: str, website_id):
 
 
 def search(term=""):
-    requests.get("http://localhost/?&sort_order=score&per_page=100q=" + term, verify=False)
+    requests.get("http://localhost/search?q=" + term, verify=False)
     print(term)
 
 
@@ -91,7 +91,7 @@ def make_wide_filesystem(count=100000):
             os.mkdir(new_path)
 
 
-dump_local_filesystem("/mnt/")
+# dump_local_filesystem("/mnt/")
 # index_file_list("local_filesystem.json", 4)
 # random_searches(100000)
 # dump_random_files(20000 * 100000)
(template file; file name not shown in this view)

@@ -12,7 +12,7 @@
                 <p class="lead">{{ stats["total_count"] }} files totalling
                     ~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} websites</p>
             {% else %}
-                <p class="lead">We're currently experiencing a high volume of traffic. The search function
+                <p class="lead">We are currently experiencing a high volume of traffic. The search function
                 may be unresponsive.</p>
             {% endif %}
             <p></p>
@@ -96,7 +96,7 @@
                     <tr>
                         <th>Crawl time average</th>
                         {% for server in crawl_server_stats %}
-                            <td>{{ crawl_server_stats[server].time_avg|round(2) }}s per task</td>
+                            <td>{{ crawl_server_stats[server].time_avg|round(2) }}s</td>
                         {% endfor %}
                     </tr>
                     <tr>
@@ -108,7 +108,7 @@
                     <tr>
                         <th>Files crawled average</th>
                         {% for server in crawl_server_stats %}
-                            <td>{{ crawl_server_stats[server].file_count_avg | round(2) }} per task</td>
+                            <td>{{ crawl_server_stats[server].file_count_avg | round(2) }}</td>
                         {% endfor %}
                     </tr>
                     </tbody>