commit b3e5d05262b166a3a300214a8f57a80f404d9135
Author: Simon
Date:   Thu Aug 9 15:02:14 2018 -0400

    Initial commit

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..e3bd571
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "od_database"]
+	path = od_database
+	url = https://github.com/simon987/od-database
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..d6d90c4
--- /dev/null
+++ b/config.py
@@ -0,0 +1,2 @@
+API_KEY = ""
+API_URL = "https://od-db.the-eye.eu/api/"
diff --git a/od_db_client.py b/od_db_client.py
new file mode 100644
index 0000000..f3d5e00
--- /dev/null
+++ b/od_db_client.py
@@ -0,0 +1,147 @@
+import requests
+import json
+import re
+import humanfriendly
+import time
+
+from od_database.od_util import truncate_path
+
+
+class OdDatabase:
+
+    def __init__(self, url, token):
+
+        self.url = url
+        self.token = token
+
+    @staticmethod
+    def perform_request(url, method="GET", data=None, json_data=None):
+
+        try:
+            if json_data:
+                return requests.request("POST", url, data=json_data, headers={"Content-Type": "application/json"})
+            else:
+                return requests.request(method, url, data=data)
+        except Exception as e:
+            print(e)
+            return None
+
+    def website_by_url(self, url):
+
+        r = self.perform_request(self.url + "website/by_url?token=" + self.token + "&url=" + url)
+        if not r or r.status_code != 200:
+            return None
+
+        return int(r.text)
+
+    def website_is_blacklisted(self, url):
+
+        r = self.perform_request(self.url + "website/blacklisted?token=" + self.token + "&url=" + url)
+        if not r or r.status_code != 200:
+            return False
+        return r.text == "True"
+
+    def add_website(self, url):
+
+        r = self.perform_request(self.url + "website/add?token=" + self.token + "&url=" + url)
+        if not r or r.status_code != 200:
+            return None
+        return int(r.text)
+
+    def enqueue(self, website_id=None, url=None, priority=1, callback_type="",
+                callback_args=""):
+
+        data = json.dumps({
+            "token": self.token,
+            "website_id": website_id,
+            "url": url,
+            "priority": priority,
+            "callback_type": callback_type,
+            "callback_args": callback_args
+        })
+        r = self.perform_request(self.url + "task/force_enqueue", json_data=data)
+
+        if not r or r.status_code != 200:
+            return False
+        return True
+
+    def search(self, q, p, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min, date_max):
+
+        data = json.dumps({
+            "token": self.token,
+            "query": q,
+            "page": p,
+            "per_page": per_page,
+            "sort_order": sort_order,
+            "extensions": extensions,
+            "size_min": size_min,
+            "size_max": size_max,
+            "match_all": match_all,
+            "fields": fields,
+            "date_min": date_min,
+            "date_max": date_max
+        })
+
+        r = self.perform_request(self.url + "search", json_data=data)
+
+        if not r or r.status_code != 200:
+            return None
+        return json.loads(r.text)
+
+    @staticmethod
+    def format_search_hits(hits, query):
+
+        message = str(hits["hits"]["total"]) + " matches found in " + str(hits["took"]) + "ms for query `" + query + "`: \n\n"
+        message += "File | Size | Date \n"
+        message += ":-- | :-- | --: \n"
+
+        for hit in hits["hits"]["hits"]:
+            src = hit["_source"]
+
+            # File name highlight
+            if "name" in hit["highlight"]:
+                hl_name = format_highlight(hit["highlight"]["name"][0])
+            elif "name.nGram" in hit["highlight"]:
+                hl_name = format_highlight(hit["highlight"]["name.nGram"][0])
+            else:
+                hl_name = src["name"]
+
+            # Path highlight
+            if "path" in hit["highlight"]:
+                hl_path = format_highlight(hit["highlight"]["path"][0])
+            else:
+                hl_path = src["path"]
+            hl_path = truncate_path(hl_path, 65)
+            hl_path += "/" if hl_path else ""
+
+            message += "[" + src["website_url"] + "](https://od-db.the-eye.eu/website/" + str(src["website_id"]) + "/)" + hl_path
+            message += hl_name + ("." if src["ext"] else "") + src["ext"] + "| "
+            message += humanfriendly.format_size(src["size"]) + " | "
+            message += time.strftime("%Y-%m-%d", time.gmtime(src["mtime"])) + " \n"
+
+        message += "\n[More results for this query](https://the-eye.eu/search?q=" + query + ") |" \
+                   " [OD-Database](https://od-db.the-eye.eu/)"
+
+        return message
+
+    def get_stats(self, website_id):
+        r = self.perform_request(self.url + "../website/" + str(website_id) + "/json_chart")
+        return json.loads(r.text)
+
+
+def format_highlight(text):
+    # Convert the search backend's highlight tags into markdown bold.
+    # NOTE(review): these four patterns were recovered with empty groups "()",
+    # which would make re.sub insert '**' between every character; "<mark>"
+    # tags are assumed here - confirm against the od-database highlighter.
+
+    text = re.sub(r"(<mark>)\s+", " **", text)
+    text = re.sub(r"(<mark>)", "**", text)
+    text = re.sub(r"\s+(</mark>)", "** ", text)
+    text = re.sub(r"(</mark>)", "**", text)
+
+    return text
+
+
+
+
diff --git a/run.py b/run.py
new file mode 100644
index 0000000..6627ed4
--- /dev/null
+++ b/run.py
@@ -0,0 +1,133 @@
+import praw
+from od_database.reddit_bot import RedditBot
+import re
+import os
+import json
+from od_db_client import OdDatabase
+from od_database.od_util import get_top_directory, is_od, is_valid_url
+import shlex
+import config
+
+
+PATTERN = re.compile("[\[\]\\\()]+")
+od_db_client = OdDatabase(config.API_URL, config.API_KEY)
+
+
+def process_comment(comment, bot):
+
+    text = PATTERN.sub(" ", comment.body).strip()
+
+    if text.startswith("u/opendirectories-bot") or text.startswith("/u/opendirectories-bot"):
+        lines = shlex.split(text,)
+        if len(lines) > 1:
+            text = lines[1]
+            if text.startswith("?"):
+                process_query(comment, bot, text[1:])
+            else:
+                process_url(comment, bot, text)
+
+
+def process_query(comment, bot, query):
+
+    print("Search query '" + query + "'")
+
+    hits = od_db_client.search(
+        query, 0, 10,
+        "score", [],
+        0, 0,
+        False, ["path", "name^5", "name.nGram^2"],
+        0, 0
+    )
+    message = od_db_client.format_search_hits(hits, query) + "\n*** \n" + bot.bottom_line
+    print(message)
+    bot.reply(comment, message)
+
+
+def process_url(comment, bot, url):
+
+    url = os.path.join(url, "")  # Add trailing slash
+
+    if not is_valid_url(url):
+        print("Url is invalid")
+        handle_invalid_url(comment, bot, url)
+        return  # BUG FIX: invalid URLs previously fell through to the checks below
+
+    if od_db_client.website_is_blacklisted(url):
+        print("Website is blacklisted")
+        handle_blacklisted(comment, bot)
+        return
+
+    url = get_top_directory(url)
+    website_id = od_db_client.website_by_url(url)
+
+    if not website_id:
+        print("Website does not exist")
+
+        if not is_od(url):
+            print("Website is not an od")
+            handle_non_od_website(comment, bot, url)
+            return
+
+        handle_new_website(comment, bot, url)
+    else:
+        print("Website already exists")
+        handle_existing_website(comment, bot, website_id)
+
+
+def handle_invalid_url(comment, bot, url):
+    bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately it seems that the link you provided: `" +
+              url + "` is not valid. Make sure that you include the `http(s)://` prefix. \n*** \n" + bot.bottom_line)
+
+
+def handle_blacklisted(comment, bot):
+    bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately my programmer has blacklisted this website."
+              " If you think that this is an error, please "
+              "[contact him](https://old.reddit.com/message/compose?to=Hexahedr_n) \n*** \n" + bot.bottom_line)
+
+
+def handle_non_od_website(comment, bot, url):
+    bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately it seems that the link you provided: `" +
+              url + "` does not point to an open directory. This could also mean that the website is not responding "
+              "(in which case, feel free to retry in a few minutes). If you think that this is an error, please "
+              "[contact my programmer](https://old.reddit.com/message/compose?to=Hexahedr_n) \n*** \n" +
+              bot.bottom_line)
+
+
+def handle_new_website(comment, bot, url):
+
+    website_id = od_db_client.add_website(url)
+    if website_id:
+        reply = bot.reply(comment, "Hello, " + str(comment.author) + ". This website was added to od-database and will "
+                          "be processed as soon as a crawl server is available. Thank you for your "
+                          "contribution to the database! \nI will edit this comment when the website has"
+                          " been crawled and indexed. Website id is `" + str(website_id) + "`. \n*** \n" +
+                          bot.bottom_line)
+
+        od_db_client.enqueue(website_id=website_id, url=url, priority=2, callback_type="reddit_comment",
+                             callback_args=json.dumps({
+                                 "comment_id": reply.id
+                             }))
+    else:
+        print("Could not create new website")
+
+
+def handle_existing_website(comment, bot, website_id):
+
+    stats = od_db_client.get_stats(website_id)
+    message_header = "Hello, " + str(comment.author) + ". This website was crawled and indexed by od-database at `" + \
+                     stats["report_time"] + "`. "
+
+    message = bot.get_comment(stats, website_id, message_header)
+    print(message)
+    bot.reply(comment, message)
+
+
+if __name__ == "__main__":
+    reddit = praw.Reddit('opendirectories-bot',
+                         user_agent='github.com/simon987/opendirectories-bot-new (by /u/Hexahedr_n)')
+    bot = RedditBot("processed.txt", reddit)
+    subreddit = reddit.subreddit("test")
+
+    for comment in subreddit.comments(limit=50):
+        if not bot.has_crawled(comment):
+            process_comment(comment, bot)