mirror of
https://github.com/simon987/opendirectories-bot-2.git
synced 2025-04-10 14:06:41 +00:00
Initial commit
This commit is contained in:
commit
b3e5d05262
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
[submodule "od_database"]
|
||||||
|
path = od_database
|
||||||
|
url = https://github.com/simon987/od-database
|
0
__init__.py
Normal file
0
__init__.py
Normal file
2
config.py
Normal file
2
config.py
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
# od-database API key; must be filled in before running the bot.
API_KEY = ""
# Base url of the od-database HTTP API (trailing slash expected by the client).
API_URL = "https://od-db.the-eye.eu/api/"
|
143
od_db_client.py
Normal file
143
od_db_client.py
Normal file
@ -0,0 +1,143 @@
|
|||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import humanfriendly
|
||||||
|
import time
|
||||||
|
|
||||||
|
from od_database.od_util import truncate_path
|
||||||
|
|
||||||
|
|
||||||
|
class OdDatabase:
    """Thin HTTP client for the od-database API.

    All traffic goes through :meth:`perform_request`, which returns the raw
    ``requests`` response or ``None`` on any error; the higher-level methods
    translate failures into ``None``/``False`` so callers never see network
    exceptions.
    """

    def __init__(self, url, token):
        # url: API base url (with trailing slash); token: API key sent with
        # every request.
        self.url = url
        self.token = token

    @staticmethod
    def perform_request(url, method="GET", data=None, json_data=None):
        """Issue an HTTP request; return the response, or None on failure.

        When ``json_data`` is supplied the request is always a POST with a
        JSON content type, regardless of ``method`` — all JSON endpoints of
        the API are POST-only.
        """
        try:
            if json_data:
                return requests.request("POST", url, data=json_data, headers={"Content-Type": "application/json"})
            else:
                return requests.request(method, url, data=data)
        except Exception as e:
            # Best-effort boundary: report the problem and hand the caller
            # None rather than propagating the exception.
            print(e)
            return None

    def website_by_url(self, url):
        """Return the website id for `url`, or None if unknown or on error."""
        # NOTE(review): token/url are not percent-encoded; an url containing
        # '&' or '#' would corrupt the query string — confirm server behavior.
        r = self.perform_request(self.url + "website/by_url?token=" + self.token + "&url=" + url)
        if not r or r.status_code != 200:
            return None

        return int(r.text)

    def website_is_blacklisted(self, url):
        """Return True when the API reports `url` as blacklisted.

        Any failure (network error, non-200) is treated as "not blacklisted".
        """
        r = self.perform_request(self.url + "website/blacklisted?token=" + self.token + "&url=" + url)
        if not r or r.status_code != 200:
            return False
        return r.text == "True"

    def add_website(self, url):
        """Register a new website; return its id, or None on failure."""
        r = self.perform_request(self.url + "website/add?token=" + self.token + "&url=" + url)
        if not r or r.status_code != 200:
            return None
        return int(r.text)

    def enqueue(self, website_id=None, url=None, priority=1, callback_type="", callback_args=""):
        """Force-enqueue a crawl task; return True on success.

        callback_type/callback_args let the crawler notify this bot when the
        task completes (see run.py's reddit_comment callback).
        """
        data = json.dumps({
            "token": self.token,
            "website_id": website_id,
            "url": url,
            "priority": priority,
            "callback_type": callback_type,
            "callback_args": callback_args
        })
        r = self.perform_request(self.url + "task/force_enqueue", json_data=data)

        if not r or r.status_code != 200:
            return False
        return True

    def search(self, q, p, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min, date_max):
        """Run a search query; return the parsed JSON hits, or None on failure."""
        data = json.dumps({
            "token": self.token,
            "query": q,
            "page": p,
            "per_page": per_page,
            "sort_order": sort_order,
            "extensions": extensions,
            "size_min": size_min,
            "size_max": size_max,
            "match_all": match_all,
            "fields": fields,
            "date_min": date_min,
            "date_max": date_max
        })

        r = self.perform_request(self.url + "search", json_data=data)

        if not r or r.status_code != 200:
            return None
        return json.loads(r.text)

    @staticmethod
    def format_search_hits(hits, query):
        """Render elasticsearch-style `hits` as a reddit markdown table."""
        message = str(hits["hits"]["total"]) + " matches found in " + str(hits["took"]) + "ms for query `" + query + "`: \n\n"
        message += "File | Size | Date \n"
        message += ":-- | :-- | --: \n"

        for hit in hits["hits"]["hits"]:
            src = hit["_source"]

            # File name highlight
            if "name" in hit["highlight"]:
                hl_name = format_highlight(hit["highlight"]["name"][0])
            elif "name.nGram" in hit["highlight"]:
                hl_name = format_highlight(hit["highlight"]["name.nGram"][0])
            else:
                hl_name = src["name"]

            # Path highlight
            if "path" in hit["highlight"]:
                hl_path = format_highlight(hit["highlight"]["path"][0])
            else:
                hl_path = src["path"]
            hl_path = truncate_path(hl_path, 65)
            hl_path += "/" if hl_path else ""

            message += "[" + src["website_url"] + "](https://od-db.the-eye.eu/website/" + str(src["website_id"]) + "/)" + hl_path
            message += hl_name + ("." if src["ext"] else "") + src["ext"] + "| "
            message += humanfriendly.format_size(src["size"]) + " | "
            message += time.strftime("%Y-%m-%d", time.gmtime(src["mtime"])) + " \n"

        message += "\n[More results for this query](https://the-eye.eu/search?q=" + query + ") |" \
                   " [OD-Database](https://od-db.the-eye.eu/)"

        return message

    def get_stats(self, website_id):
        """Return the crawl report JSON for a website, or None on failure."""
        r = self.perform_request(self.url + "../website/" + str(website_id) + "/json_chart")
        # Guard added: this previously dereferenced r.text unconditionally and
        # crashed with AttributeError whenever the request failed.
        if not r or r.status_code != 200:
            return None
        return json.loads(r.text)
||||||
|
|
||||||
|
|
||||||
|
def format_highlight(text):
    """Convert elasticsearch ``<mark>`` highlight tags to markdown bold.

    Whitespace adjacent to a tag is moved outside the ``**`` markers, because
    ``** foo **`` does not render as bold on reddit.
    """
    # The whitespace-aware substitutions must run FIRST. In the original
    # ordering the plain replacements ran first and consumed every tag,
    # leaving the whitespace rules as dead code.
    text = re.sub(r"(<mark>)\s+", " **", text)
    text = re.sub(r"(<mark>)", "**", text)
    text = re.sub(r"\s+(</mark>)", "** ", text)
    text = re.sub(r"(</mark>)", "**", text)

    return text
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
133
run.py
Normal file
133
run.py
Normal file
@ -0,0 +1,133 @@
|
|||||||
|
import praw
|
||||||
|
from od_database.reddit_bot import RedditBot
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from od_db_client import OdDatabase
|
||||||
|
from od_database.od_util import get_top_directory, is_od, is_valid_url
|
||||||
|
import shlex
|
||||||
|
import config
|
||||||
|
|
||||||
|
|
||||||
|
# Strips reddit markdown escape/link characters ('[', ']', '\', '(', ')')
# from comment bodies before parsing the command.
PATTERN = re.compile("[\[\]\\\()]+")
# Shared od-database API client, configured from config.py.
od_db_client = OdDatabase(config.API_URL, config.API_KEY)
|
||||||
|
|
||||||
|
|
||||||
|
def process_comment(comment, bot):
    """Dispatch a bot mention to a search query ("?...") or an url submission."""
    body = PATTERN.sub(" ", comment.body).strip()

    # The mention may be written with or without the leading slash.
    if not body.startswith(("u/opendirectories-bot", "/u/opendirectories-bot")):
        return

    tokens = shlex.split(body)
    if len(tokens) <= 1:
        # Bare mention with no argument — nothing to do.
        return

    argument = tokens[1]
    if argument.startswith("?"):
        process_query(comment, bot, argument[1:])
    else:
        process_url(comment, bot, argument)
|
||||||
|
|
||||||
|
|
||||||
|
def process_query(comment, bot, query):
    """Search od-database for `query` and reply with a formatted result table."""
    print("Search query '" + query + "'")

    # First page, 10 results, ranked by score; file-name matches are boosted.
    boosted_fields = ["path", "name^5", "name.nGram^2"]
    hits = od_db_client.search(query, 0, 10, "score", [], 0, 0, False, boosted_fields, 0, 0)

    reply_text = od_db_client.format_search_hits(hits, query) + "\n*** \n" + bot.bottom_line
    print(reply_text)
    bot.reply(comment, reply_text)
|
||||||
|
|
||||||
|
|
||||||
|
def process_url(comment, bot, url):
    """Validate `url`, then either register it with od-database or reply with
    the existing crawl report if it is already indexed.
    """
    url = os.path.join(url, "")  # Add trailing slash

    if not is_valid_url(url):
        print("Url is invalid")
        handle_invalid_url(comment, bot, url)
        # BUG FIX: previously fell through and kept processing the invalid
        # url (blacklist check, crawl, etc.) after replying that it was bad.
        return

    if od_db_client.website_is_blacklisted(url):
        print("Website is blacklisted")
        handle_blacklisted(comment, bot)
        return

    url = get_top_directory(url)
    website_id = od_db_client.website_by_url(url)

    if not website_id:
        print("Website does not exist")

        if not is_od(url):
            print("Website is not an od")
            handle_non_od_website(comment, bot, url)
            return

        handle_new_website(comment, bot, url)
    else:
        print("Website already exists")
        handle_existing_website(comment, bot, website_id)
|
||||||
|
|
||||||
|
|
||||||
|
def handle_invalid_url(comment, bot, url):
    """Reply explaining that the submitted link is not a valid url."""
    message = ("Hello, " + str(comment.author) +
               ". Unfortunately it seems that the link you provided: `" + url +
               "` is not valid. Make sure that you include the `http(s)://` prefix. \n*** \n" +
               bot.bottom_line)
    bot.reply(comment, message)
|
||||||
|
|
||||||
|
|
||||||
|
def handle_blacklisted(comment, bot):
    """Reply that this website has been manually blacklisted by the operator."""
    message = ("Hello, " + str(comment.author) +
               ". Unfortunately my programmer has blacklisted this website."
               " If you think that this is an error, please "
               "[contact him](https://old.reddit.com/message/compose?to=Hexahedr_n) \n*** \n" +
               bot.bottom_line)
    bot.reply(comment, message)
|
||||||
|
|
||||||
|
|
||||||
|
def handle_non_od_website(comment, bot, url):
    """Reply that the link does not appear to point to an open directory."""
    message = ("Hello, " + str(comment.author) +
               ". Unfortunately it seems that the link you provided: `" + url +
               "` does not point to an open directory. This could also mean that the website is not responding "
               "(in which case, feel free to retry in a few minutes). If you think that this is an error, please "
               "[contact my programmer](https://old.reddit.com/message/compose?to=Hexahedr_n) \n*** \n" +
               bot.bottom_line)
    bot.reply(comment, message)
|
||||||
|
|
||||||
|
|
||||||
|
def handle_new_website(comment, bot, url):
    """Register `url` with od-database, acknowledge the submitter, and queue a
    crawl whose callback will edit the acknowledgement comment once indexed."""
    website_id = od_db_client.add_website(url)
    if not website_id:
        print("Could not create new website")
        return

    message = ("Hello, " + str(comment.author) + ". This website was added to od-database and will "
               "be processed as soon as a crawl server is available. Thank you for your "
               "contribution to the database! \nI will edit this comment when the website has"
               " been crawled and indexed. Website id is `" + str(website_id) + "`. \n*** \n"
               + bot.bottom_line)
    reply = bot.reply(comment, message)

    # The crawler calls back with this comment id so the reply can be updated.
    od_db_client.enqueue(website_id=website_id, url=url, priority=2, callback_type="reddit_comment",
                         callback_args=json.dumps({
                             "comment_id": reply.id
                         }))
|
||||||
|
|
||||||
|
|
||||||
|
def handle_existing_website(comment, bot, website_id):
    """Reply with the stored crawl report for an already-indexed website."""
    stats = od_db_client.get_stats(website_id)

    header = ("Hello, " + str(comment.author) +
              ". This website was crawled and indexed by od-database at `" +
              stats["report_time"] + "`. ")

    message = bot.get_comment(stats, website_id, header)
    print(message)
    bot.reply(comment, message)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Credentials come from the 'opendirectories-bot' site section of praw.ini.
    reddit = praw.Reddit('opendirectories-bot',
                         user_agent='github.com/simon987/opendirectories-bot-new (by /u/Hexahedr_n)')
    # NOTE(review): RedditBot appears to track already-handled comment ids in
    # processed.txt (see has_crawled below) — confirm in od_database.reddit_bot.
    bot = RedditBot("processed.txt", reddit)
    subreddit = reddit.subreddit("test")

    # Poll the 50 most recent comments and handle any not yet processed.
    for comment in subreddit.comments(limit=50):
        if not bot.has_crawled(comment):
            process_comment(comment, bot)
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user