Add mass import utility

This commit is contained in:
simon987 2019-04-06 19:25:49 -04:00
parent 2c7e71cde1
commit 41ba6a35a4
5 changed files with 71 additions and 3 deletions

2
api.py
View File

@@ -5,11 +5,13 @@ from flask import request, abort, send_file, session
import captcha
import common as oddb
from common import taskManager
from database import Website
from search.search import InvalidQueryException
def setup_api(app):
taskManager.start_indexer_threads()
@app.route("/api/website/by_url", methods=["GET"])
def api_website_by_url():

View File

@@ -26,7 +26,6 @@ logger.addHandler(file_handler)
logger.addHandler(StreamHandler(sys.stdout))
taskManager = TaskManager()
taskManager.start_indexer_threads()
searchEngine = ElasticSearchEngine("od-database")
searchEngine.start_stats_scheduler()
db = Database(config.DB_CONN_STR)

1
jenkins/Jenkinsfile vendored
View File

@@ -39,6 +39,7 @@ pipeline {
sshPut remote: remote, from: 'uwsgi.py', into: 'od-database'
sshPut remote: remote, from: 'views.py', into: 'od-database'
sshPut remote: remote, from: 'config.py', into: 'od-database'
sshPut remote: remote, from: 'mass_import.py', into: 'od-database'
sshPut remote: remote, from: 'do_recrawl.py', into: 'od-database'
sshPut remote: remote, from: 'od-database.ini', into: 'od-database'
sshPut remote: remote, from: 'jenkins/deploy.sh', into: 'od-database'

View File

@@ -16,9 +16,17 @@ screen -S oddb_web -d -m bash -c "cd ${ODDBROOT} && source env/bin/activate && u
sleep 1
screen -list
echo "Installing crontab"
echo "Installing crontabs"
absolute_dir=$(cd ${ODDBROOT} && pwd)
# Re-crawl dirs
command="bash -c \"cd '${absolute_dir}' && source env/bin/activate && python do_recrawl.py >> recrawl_logs.txt\""
job="*/10 * * * * \"$command\""
job="*/10 * * * * $command"
echo "$job"
cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -
# Cleanup captchas
command="bash -c \"cd '${absolute_dir}' && rm captchas/*.png\""
job="*/60 * * * * $command"
echo "$job"
cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -

58
mass_import.py Normal file
View File

@@ -0,0 +1,58 @@
import fileinput
import os
from multiprocessing.pool import Pool
import od_util
from common import db, taskManager
from database import Website
from tasks import Task
# Lazy stream of candidate URLs: one per line from the files named on the
# command line, or from stdin when no arguments are given (fileinput behavior).
urls = (line for line in fileinput.input())
def try_enqueue(url):
    """Validate *url* and queue it for crawling if it is a new open directory.

    Returns a human-readable status message (may contain HTML markup, since
    the same messages are used by the web frontend).
    """
    # Normalize: force a trailing slash, then reduce to the top-level directory
    # so duplicates of sub-paths collapse to one canonical website URL.
    url = os.path.join(url, "")
    url = od_util.get_top_directory(url)

    if not od_util.is_valid_url(url):
        return "<strong>Error:</strong> Invalid url. Make sure to include the appropriate scheme."

    # Exact-URL duplicate check.
    website = db.get_website_by_url(url)
    if website:
        return "Website already exists"

    # Parent-directory duplicate check (e.g. the site root was already posted).
    website = db.website_exists(url)
    if website:
        return "A parent directory of this url has already been posted"

    if db.is_blacklisted(url):
        return "<strong>Error:</strong> " \
               "Sorry, this website has been blacklisted. If you think " \
               "this is an error, please <a href='/contribute'>contact me</a>."

    if not od_util.is_od(url):
        # Fixed: added the missing space after "</strong>" so the rendered
        # message does not read "…</strong>The anti-spam…" (matches the other
        # error messages above).
        return "<strong>Error:</strong> " \
               "The anti-spam algorithm determined that the submitted url is not " \
               "an open directory or the server is not responding. If you think " \
               "this is an error, please <a href='/contribute'>contact me</a>."

    # New, valid OD: persist it and hand a crawl task to the task manager.
    website_id = db.insert_website(Website(url, "localhost", "mass_import.py"))
    task = Task(website_id, url, priority=2)
    taskManager.queue_task(task)
    return "The website has been added to the queue"
def check_url(url):
    """Worker entry point: normalize one input line and try to enqueue it.

    Errors are reported but never propagated so that a single bad URL cannot
    abort the whole batch (this is a best-effort mass import).
    """
    url = os.path.join(url.strip(), "")
    try:
        print(try_enqueue(url))
    except Exception as e:
        # Was a bare `except: pass`: that also swallowed KeyboardInterrupt and
        # SystemExit and hid every failure. Keep the best-effort semantics but
        # let process-control exceptions propagate and surface the error.
        print("Error while enqueuing %s: %s" % (url, e))
    return None
# Fan the URL stream out to worker processes; map() blocks until every line
# has been processed. The __main__ guard is required for multiprocessing on
# platforms using the "spawn" start method (children re-import this module),
# and the context manager guarantees the pool is torn down even if a worker
# raises (the original close() without join() gave no such guarantee).
if __name__ == "__main__":
    with Pool(processes=50) as pool:
        pool.map(func=check_url, iterable=urls)