From 41ba6a35a420acea12b43ea4f9db0296787f009a Mon Sep 17 00:00:00 2001
From: simon987
Date: Sat, 6 Apr 2019 19:25:49 -0400
Subject: [PATCH] Add mass import utility

---
 api.py              |  2 ++
 common.py           |  1 -
 jenkins/Jenkinsfile |  1 +
 jenkins/deploy.sh   | 12 ++++++++--
 mass_import.py      | 58 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 71 insertions(+), 3 deletions(-)
 create mode 100644 mass_import.py

diff --git a/api.py b/api.py
index 9d10c84..862a979 100644
--- a/api.py
+++ b/api.py
@@ -5,11 +5,13 @@ from flask import request, abort, send_file, session
 
 import captcha
 import common as oddb
+from common import taskManager
 from database import Website
 from search.search import InvalidQueryException
 
 
 def setup_api(app):
+    taskManager.start_indexer_threads()
 
     @app.route("/api/website/by_url", methods=["GET"])
     def api_website_by_url():
diff --git a/common.py b/common.py
index 1b31d7f..089f8d7 100644
--- a/common.py
+++ b/common.py
@@ -26,7 +26,6 @@ logger.addHandler(file_handler)
 logger.addHandler(StreamHandler(sys.stdout))
 
 taskManager = TaskManager()
-taskManager.start_indexer_threads()
 searchEngine = ElasticSearchEngine("od-database")
 searchEngine.start_stats_scheduler()
 db = Database(config.DB_CONN_STR)
diff --git a/jenkins/Jenkinsfile b/jenkins/Jenkinsfile
index 6447197..5631447 100644
--- a/jenkins/Jenkinsfile
+++ b/jenkins/Jenkinsfile
@@ -39,6 +39,7 @@ pipeline {
                 sshPut remote: remote, from: 'uwsgi.py', into: 'od-database'
                 sshPut remote: remote, from: 'views.py', into: 'od-database'
                 sshPut remote: remote, from: 'config.py', into: 'od-database'
+                sshPut remote: remote, from: 'mass_import.py', into: 'od-database'
                 sshPut remote: remote, from: 'do_recrawl.py', into: 'od-database'
                 sshPut remote: remote, from: 'od-database.ini', into: 'od-database'
                 sshPut remote: remote, from: 'jenkins/deploy.sh', into: 'od-database'
diff --git a/jenkins/deploy.sh b/jenkins/deploy.sh
index 06cdc77..b02801e 100755
--- a/jenkins/deploy.sh
+++ b/jenkins/deploy.sh
@@ -16,9 +16,17 @@ screen -S oddb_web -d -m bash -c "cd ${ODDBROOT} && source env/bin/activate && u
 sleep 1
 screen -list
 
-echo "Installing crontab"
+echo "Installing crontabs"
 absolute_dir=$(cd ${ODDBROOT} && pwd)
+
+# Re-crawl dirs
 command="bash -c \"cd '${absolute_dir}' && source env/bin/activate && python do_recrawl.py >> recrawl_logs.txt\""
-job="*/10 * * * * \"$command\""
+job="*/10 * * * * $command"
+echo "$job"
+cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -
+
+# Cleanup captchas
+command="bash -c \"cd '${absolute_dir}' && rm captchas/*.png\""
+job="*/60 * * * * $command"
 echo "$job"
 cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -
diff --git a/mass_import.py b/mass_import.py
new file mode 100644
index 0000000..d6b6838
--- /dev/null
+++ b/mass_import.py
@@ -0,0 +1,58 @@
+import fileinput
+import os
+from multiprocessing.pool import Pool
+
+import od_util
+from common import db, taskManager
+from database import Website
+from tasks import Task
+
+urls = (line for line in fileinput.input())
+
+
+def try_enqueue(url):
+    url = os.path.join(url, "")
+    url = od_util.get_top_directory(url)
+
+    if not od_util.is_valid_url(url):
+        return "Error: Invalid url. Make sure to include the appropriate scheme."
+
+    website = db.get_website_by_url(url)
+    if website:
+        return "Website already exists"
+
+    website = db.website_exists(url)
+    if website:
+        return "A parent directory of this url has already been posted"
+
+    if db.is_blacklisted(url):
+        return "Error: " \
+               "Sorry, this website has been blacklisted. If you think " \
+               "this is an error, please contact me."
+
+    if not od_util.is_od(url):
+        return "Error: " \
+               "The anti-spam algorithm determined that the submitted url is not " \
+               "an open directory or the server is not responding. If you think " \
+               "this is an error, please contact me."
+
+    website_id = db.insert_website(Website(url, "localhost", "mass_import.py"))
+
+    task = Task(website_id, url, priority=2)
+    taskManager.queue_task(task)
+
+    return "The website has been added to the queue"
+
+
+def check_url(url):
+    url = os.path.join(url.strip(), "")
+    try:
+        print(try_enqueue(url))
+    except Exception:
+        pass
+    return None
+
+
+pool = Pool(processes=50)
+pool.map(func=check_url, iterable=urls)
+pool.close()
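
Notes:

Moving the taskManager.start_indexer_threads() call out of common.py (import
time) and into setup_api() means the indexer threads are only started by the
web app; mass_import.py can then import db and taskManager from common without
starting them as a side effect.

Because mass_import.py reads its input with fileinput.input(), it takes one
URL per line, either from file names passed as arguments or from stdin. A run
on the deploy host might look like the following (urls.txt is a placeholder
name, not part of the patch):

    cd od-database && source env/bin/activate
    python mass_import.py urls.txt            # read URLs from a file
    cat urls.txt | python mass_import.py      # or pipe them in on stdin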
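
The crontab entries in deploy.sh are installed with a self-deduplicating
pattern: the current crontab is filtered through fgrep -i -v to drop any line
containing the command before the new job line is appended, so re-running the
deploy never stacks duplicate entries. A minimal standalone sketch of the same
pattern, with an illustrative command in place of the real ones:

    command='/usr/bin/true'
    job="*/10 * * * * $command"
    cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -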