Add mass import utility

This commit is contained in:
simon987 2019-04-06 19:25:49 -04:00
parent 2c7e71cde1
commit 41ba6a35a4
5 changed files with 71 additions and 3 deletions

2
api.py
View File

@ -5,11 +5,13 @@ from flask import request, abort, send_file, session
import captcha import captcha
import common as oddb import common as oddb
from common import taskManager
from database import Website from database import Website
from search.search import InvalidQueryException from search.search import InvalidQueryException
def setup_api(app): def setup_api(app):
taskManager.start_indexer_threads()
@app.route("/api/website/by_url", methods=["GET"]) @app.route("/api/website/by_url", methods=["GET"])
def api_website_by_url(): def api_website_by_url():

View File

@ -26,7 +26,6 @@ logger.addHandler(file_handler)
logger.addHandler(StreamHandler(sys.stdout)) logger.addHandler(StreamHandler(sys.stdout))
taskManager = TaskManager() taskManager = TaskManager()
taskManager.start_indexer_threads()
searchEngine = ElasticSearchEngine("od-database") searchEngine = ElasticSearchEngine("od-database")
searchEngine.start_stats_scheduler() searchEngine.start_stats_scheduler()
db = Database(config.DB_CONN_STR) db = Database(config.DB_CONN_STR)

1
jenkins/Jenkinsfile vendored
View File

@ -39,6 +39,7 @@ pipeline {
sshPut remote: remote, from: 'uwsgi.py', into: 'od-database' sshPut remote: remote, from: 'uwsgi.py', into: 'od-database'
sshPut remote: remote, from: 'views.py', into: 'od-database' sshPut remote: remote, from: 'views.py', into: 'od-database'
sshPut remote: remote, from: 'config.py', into: 'od-database' sshPut remote: remote, from: 'config.py', into: 'od-database'
sshPut remote: remote, from: 'mass_import.py', into: 'od-database'
sshPut remote: remote, from: 'do_recrawl.py', into: 'od-database' sshPut remote: remote, from: 'do_recrawl.py', into: 'od-database'
sshPut remote: remote, from: 'od-database.ini', into: 'od-database' sshPut remote: remote, from: 'od-database.ini', into: 'od-database'
sshPut remote: remote, from: 'jenkins/deploy.sh', into: 'od-database' sshPut remote: remote, from: 'jenkins/deploy.sh', into: 'od-database'

View File

@ -16,9 +16,17 @@ screen -S oddb_web -d -m bash -c "cd ${ODDBROOT} && source env/bin/activate && u
sleep 1 sleep 1
screen -list screen -list
echo "Installing crontab" echo "Installing crontabs"
absolute_dir=$(cd ${ODDBROOT} && pwd) absolute_dir=$(cd ${ODDBROOT} && pwd)
# Re-crawl dirs
command="bash -c \"cd '${absolute_dir}' && source env/bin/activate && python do_recrawl.py >> recrawl_logs.txt\"" command="bash -c \"cd '${absolute_dir}' && source env/bin/activate && python do_recrawl.py >> recrawl_logs.txt\""
job="*/10 * * * * \"$command\"" job="*/10 * * * * $command"
echo "$job"
cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -
# Cleanup captchas
command="bash -c \"cd '${absolute_dir}' && rm captchas/*.png\""
job="*/60 * * * * $command"
echo "$job" echo "$job"
cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab - cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -

58
mass_import.py Normal file
View File

@ -0,0 +1,58 @@
import fileinput
import os
from multiprocessing.pool import Pool
import od_util
from common import db, taskManager
from database import Website
from tasks import Task
# Lazily yield the raw input lines. fileinput.input() reads the files named
# on the command line, or standard input when none are given; nothing is
# read until the generator is consumed.
urls = (raw_line for raw_line in fileinput.input())
def try_enqueue(url):
    """Run the submission checks on ``url`` and queue a crawl task if they pass.

    A trailing path separator is appended to the url when missing and the
    result is passed through ``od_util.get_top_directory``. The checks then
    run in order, and the first one that fails determines the return value.
    When every check passes, a ``Website`` row is inserted and a ``Task``
    with priority 2 is handed to the task manager.

    :param url: url of the candidate website
    :return: a message describing the outcome (some messages contain HTML)
    """
    top_url = od_util.get_top_directory(os.path.join(url, ""))

    if not od_util.is_valid_url(top_url):
        return "<strong>Error:</strong> Invalid url. Make sure to include the appropriate scheme."

    # Exact url already known.
    if db.get_website_by_url(top_url):
        return "Website already exists"

    # Second lookup; per the message below it matches a parent directory.
    if db.website_exists(top_url):
        return "A parent directory of this url has already been posted"

    if db.is_blacklisted(top_url):
        return (
            "<strong>Error:</strong> "
            "Sorry, this website has been blacklisted. If you think "
            "this is an error, please <a href='/contribute'>contact me</a>."
        )

    if not od_util.is_od(top_url):
        return (
            "<strong>Error:</strong>"
            "The anti-spam algorithm determined that the submitted url is not "
            "an open directory or the server is not responding. If you think "
            "this is an error, please <a href='/contribute'>contact me</a>."
        )

    # All checks passed: register the website, then schedule its crawl.
    new_website = Website(top_url, "localhost", "mass_import.py")
    new_website_id = db.insert_website(new_website)
    taskManager.queue_task(Task(new_website_id, top_url, priority=2))

    return "The website has been added to the queue"
def check_url(url):
    """Process one input line: normalise it, try to enqueue it, print the result.

    This is the worker function mapped over the input lines. It is
    deliberately best-effort: a failure on one url must not abort the rest
    of the batch. Failures are reported on stderr together with the
    offending url instead of being silently discarded.

    :param url: raw input line; surrounding whitespace is stripped
    :return: always ``None``
    """
    # Local import so this function does not depend on the file's import block.
    import sys

    url = os.path.join(url.strip(), "")
    try:
        print(try_enqueue(url))
    except Exception as e:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate and the process can be stopped.
        print("Failed to process {}: {!r}".format(url, e), file=sys.stderr)
    return None
# Only start the worker pool when executed as a script. With the "spawn"
# start method each worker re-imports this module, and an unguarded Pool
# creation at import time would try to start new pools recursively. The
# guard also keeps a plain import of this module free of side effects.
if __name__ == "__main__":
    pool = Pool(processes=50)
    # map() blocks until every input line has been handled by check_url.
    pool.map(func=check_url, iterable=urls)
    # No more work will be submitted; wait for the workers to exit cleanly.
    pool.close()
    pool.join()