diff --git a/api.py b/api.py
index 9d10c84..862a979 100644
--- a/api.py
+++ b/api.py
@@ -5,11 +5,15 @@ from flask import request, abort, send_file, session
 
 import captcha
 import common as oddb
+from common import taskManager
 from database import Website
 from search.search import InvalidQueryException
 
 
 def setup_api(app):
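+    # Indexer threads now start with the web app rather than at import time in
+    # common.py, so scripts that import common (e.g. mass_import.py) don't spawn them.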
+    taskManager.start_indexer_threads()
 
     @app.route("/api/website/by_url", methods=["GET"])
     def api_website_by_url():
diff --git a/common.py b/common.py
index 1b31d7f..089f8d7 100644
--- a/common.py
+++ b/common.py
@@ -26,7 +26,6 @@ logger.addHandler(file_handler)
 logger.addHandler(StreamHandler(sys.stdout))
 
 taskManager = TaskManager()
-taskManager.start_indexer_threads()
 searchEngine = ElasticSearchEngine("od-database")
 searchEngine.start_stats_scheduler()
 db = Database(config.DB_CONN_STR)
diff --git a/jenkins/Jenkinsfile b/jenkins/Jenkinsfile
index 6447197..5631447 100644
--- a/jenkins/Jenkinsfile
+++ b/jenkins/Jenkinsfile
@@ -39,6 +39,7 @@ pipeline {
 sshPut remote: remote, from: 'uwsgi.py', into: 'od-database'
 sshPut remote: remote, from: 'views.py', into: 'od-database'
 sshPut remote: remote, from: 'config.py', into: 'od-database'
+sshPut remote: remote, from: 'mass_import.py', into: 'od-database'
 sshPut remote: remote, from: 'do_recrawl.py', into: 'od-database'
 sshPut remote: remote, from: 'od-database.ini', into: 'od-database'
 sshPut remote: remote, from: 'jenkins/deploy.sh', into: 'od-database'
diff --git a/jenkins/deploy.sh b/jenkins/deploy.sh
index 06cdc77..b02801e 100755
--- a/jenkins/deploy.sh
+++ b/jenkins/deploy.sh
@@ -16,9 +16,18 @@ screen -S oddb_web -d -m bash -c "cd ${ODDBROOT} && source env/bin/activate && u
 sleep 1
 screen -list
 
-echo "Installing crontab"
+echo "Installing crontabs"
 absolute_dir=$(cd ${ODDBROOT} && pwd)
+
+# Re-crawl dirs
command="bash -c \"cd '${absolute_dir}' && source env/bin/activate && python do_recrawl.py >> recrawl_logs.txt\""
-job="*/10 * * * * \"$command\""
+job="*/10 * * * * $command"
+echo "$job"
+cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -
+
+# Cleanup captchas
+command="bash -c \"cd '${absolute_dir}' && rm captchas/*.png\""
+job="*/60 * * * * $command"
echo "$job"
cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -
diff --git a/mass_import.py b/mass_import.py
new file mode 100644
index 0000000..d6b6838
--- /dev/null
+++ b/mass_import.py
@@ -0,0 +1,66 @@
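+# Bulk-import open directories: read one url per line from the files given as
+# arguments (or stdin), validate each, and queue a crawl task for those that pass.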
+import fileinput
+import os
+from multiprocessing.pool import Pool
+
+import od_util
+from common import db, taskManager
+from database import Website
+from tasks import Task
+
+
+def try_enqueue(url):
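+    # Normalize: ensure a trailing slash, then reduce to the top-level directory.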
+    url = os.path.join(url, "")
+    url = od_util.get_top_directory(url)
+
+    if not od_util.is_valid_url(url):
+        return "Error: Invalid url. Make sure to include the appropriate scheme."
+
+    website = db.get_website_by_url(url)
+    if website:
+        return "Website already exists"
+
+    website = db.website_exists(url)
+    if website:
+        return "A parent directory of this url has already been posted"
+
+    if db.is_blacklisted(url):
+        return "Error: " \
+               "Sorry, this website has been blacklisted. If you think " \
+               "this is an error, please contact me."
+
+    if not od_util.is_od(url):
+        return "Error: " \
+               "The anti-spam algorithm determined that the submitted url is not " \
+               "an open directory or the server is not responding. If you think " \
+               "this is an error, please contact me."
+
+    website_id = db.insert_website(Website(url, "localhost", "mass_import.py"))
+
+    task = Task(website_id, url, priority=2)
+    taskManager.queue_task(task)
+
+    return "The website has been added to the queue"
+
+
+def check_url(url):
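+    # Strip the trailing newline that fileinput leaves on each line.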
+    url = os.path.join(url.strip(), "")
+    try:
+        print(try_enqueue(url))
+    except Exception as e:
+        print("Error while processing %s: %s" % (url, e))
+    return None
+
+
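+# Entry-point guard: multiprocessing may spawn (re-import) this module in worker
+# processes, so the url input and the Pool are only created when run as a script.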
+if __name__ == "__main__":
+    urls = (line for line in fileinput.input())
+
+    pool = Pool(processes=50)
+    pool.map(func=check_url, iterable=urls)
+    pool.close()
+    pool.join()