opendirectories-bot-2/run.py

import praw
from od_database.reddit_bot import RedditBot
import re
import os
import json
from od_db_client import OdDatabase
from od_database.od_util import get_top_directory, is_od, is_valid_url
import shlex
import config


PATTERN = re.compile("[\[\]\\\()]+")
od_db_client = OdDatabase(config.API_URL, config.API_KEY)


def process_comment(comment, bot):

    text = PATTERN.sub(" ", comment.body).strip()

    if text.startswith("u/opendirectories-bot") or text.startswith("/u/opendirectories-bot"):
        try:
            lines = shlex.split(text)
        except ValueError:
            return
        if len(lines) > 1:
            if lines[1] == "https://www.reddit.com/u/opendirectories-bot" and len(lines) > 2:
                text = lines[2]
            else:
                text = lines[1]

            if text.startswith("?"):
                process_query(comment, bot, text[1:])
            else:
                process_url(comment, bot, text)


def process_query(comment, bot, query):

    print("Search query '" + query + "'")

    hits = od_db_client.search(
        query, 0, 6,
        "score", [],
        0, 0,
        False, ["path", "name^5", "name.nGram^2"],
        0, 0
    )
    message = od_db_client.format_search_hits(hits, query) + "\n***    \n" + bot.bottom_line
    print(message)
    bot.reply(comment, message)


def process_url(comment, bot, url):

    url = os.path.join(url, "")  # Add trailing slash

    if not is_valid_url(url):
        print("Url is invalid")
        handle_invalid_url(comment, bot, url)
        return

    if od_db_client.website_is_blacklisted(url):
        print("Website is blacklisted")
        handle_blacklisted(comment, bot)
        return

    url = get_top_directory(url)
    website_id = od_db_client.website_by_url(url)

    if not website_id:
        print("Website does not exist")

        if not is_od(url):
            print("Website is not an od")
            handle_non_od_website(comment, bot, url)
            return

        handle_new_website(comment, bot, url)
    else:
        print("Website already exists")
        handle_existing_website(comment, bot, website_id)


def handle_invalid_url(comment, bot, url):
    bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately it seems that the link you provided: `" +
              url + "` is not valid. Make sure that you include the `http(s)://` prefix.    \n***    \n" + bot.bottom_line)


def handle_blacklisted(comment, bot):
    bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately my programmer has blacklisted this website."
                       " If you think that this is an error, please "
                       "[contact him](https://old.reddit.com/message/compose?to=Hexahedr_n)    \n***    \n" + bot.bottom_line)


def handle_non_od_website(comment, bot, url):
    bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately it seems that the link you provided: `" +
              url + "` does not point to an open directory. This could also mean that the website is not responding "
                    "(in which case, feel free to retry in a few minutes). If you think that this is an error, please "
                    "[contact my programmer](https://old.reddit.com/message/compose?to=Hexahedr_n)    \n***    \n" +
              bot.bottom_line)


def handle_new_website(comment, bot, url):

    website_id = od_db_client.add_website(url)
    if website_id:
        reply = bot.reply(comment, "Hello, " + str(comment.author) + ". This website was added to od-database and will "
                                   "be processed as soon as a crawl server is available. Thank you for your "
                                   "contribution to the database!    \nI will edit this comment when the website has"
                                   " been crawled and indexed. Website id is `" + str(website_id) + "`.    \n***    \n"
                                   + bot.bottom_line)

        od_db_client.enqueue(website_id=website_id, url=url, priority=2, callback_type="reddit_comment",
                             callback_args=json.dumps({
                                 "comment_id": reply.id
                             }))
    else:
        print("Could not create new website")


def handle_existing_website(comment, bot, website_id):

    stats = od_db_client.get_stats(website_id)
    message_header = "Hello, " + str(comment.author) + ". This website was crawled and indexed by od-database at `" + \
                     stats["report_time"] + "`. "

    message = bot.get_comment(stats, website_id, message_header)
    print(message)
    bot.reply(comment, message)


def process_post(submission):
    print("Checking new post with url " + submission.url)
    url = os.path.join(submission.url, "")

    if not is_valid_url(url):
        print("Url is invalid")
        return

    if od_db_client.website_is_blacklisted(url):
        print("Website is blacklisted")
        return

    url = get_top_directory(url)
    website_id = od_db_client.website_by_url(url)

    if not website_id:
        print("Website does not exist")

        if not is_od(url):
            print("Website is not an od")
            return

        handle_new_website(submission, bot, url)
    else:
        print("Website already exists")
        handle_existing_website(submission, bot, website_id)


if __name__ == "__main__":
    reddit = praw.Reddit('opendirectories-bot',
                         user_agent='github.com/simon987/opendirectories-bot-2 (by /u/Hexahedr_n)')
    bot = RedditBot("processed.txt", reddit)
    subreddit = reddit.subreddit("test")

    # Check comments
    for comment in subreddit.comments(limit=50):
        if not bot.has_crawled(comment):
            process_comment(comment, bot)

    # Check submissions
    for submission in subreddit.new(limit=3):
        if not submission.is_self and not bot.has_crawled(submission):
            process_post(submission)