mirror of
https://github.com/simon987/od-database.git
synced 2025-04-04 06:52:59 +00:00
Add mass import utility
This commit is contained in:
parent
2c7e71cde1
commit
41ba6a35a4
2
api.py
2
api.py
@ -5,11 +5,13 @@ from flask import request, abort, send_file, session
|
||||
|
||||
import captcha
|
||||
import common as oddb
|
||||
from common import taskManager
|
||||
from database import Website
|
||||
from search.search import InvalidQueryException
|
||||
|
||||
|
||||
def setup_api(app):
|
||||
taskManager.start_indexer_threads()
|
||||
|
||||
@app.route("/api/website/by_url", methods=["GET"])
|
||||
def api_website_by_url():
|
||||
|
@ -26,7 +26,6 @@ logger.addHandler(file_handler)
|
||||
logger.addHandler(StreamHandler(sys.stdout))
|
||||
|
||||
taskManager = TaskManager()
|
||||
taskManager.start_indexer_threads()
|
||||
searchEngine = ElasticSearchEngine("od-database")
|
||||
searchEngine.start_stats_scheduler()
|
||||
db = Database(config.DB_CONN_STR)
|
||||
|
1
jenkins/Jenkinsfile
vendored
1
jenkins/Jenkinsfile
vendored
@ -39,6 +39,7 @@ pipeline {
|
||||
sshPut remote: remote, from: 'uwsgi.py', into: 'od-database'
|
||||
sshPut remote: remote, from: 'views.py', into: 'od-database'
|
||||
sshPut remote: remote, from: 'config.py', into: 'od-database'
|
||||
sshPut remote: remote, from: 'mass_import.py', into: 'od-database'
|
||||
sshPut remote: remote, from: 'do_recrawl.py', into: 'od-database'
|
||||
sshPut remote: remote, from: 'od-database.ini', into: 'od-database'
|
||||
sshPut remote: remote, from: 'jenkins/deploy.sh', into: 'od-database'
|
||||
|
@ -16,9 +16,17 @@ screen -S oddb_web -d -m bash -c "cd ${ODDBROOT} && source env/bin/activate && u
|
||||
sleep 1
|
||||
screen -list
|
||||
|
||||
echo "Installing crontab"
|
||||
echo "Installing crontabs"
|
||||
absolute_dir=$(cd ${ODDBROOT} && pwd)
|
||||
|
||||
# Re-crawl dirs
|
||||
command="bash -c \"cd '${absolute_dir}' && source env/bin/activate && python do_recrawl.py >> recrawl_logs.txt\""
|
||||
job="*/10 * * * * \"$command\""
|
||||
job="*/10 * * * * $command"
|
||||
echo "$job"
|
||||
cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -
|
||||
|
||||
# Cleanup captchas
|
||||
command="bash -c \"cd '${absolute_dir}' && rm captchas/*.png\""
|
||||
job="*/60 * * * * $command"
|
||||
echo "$job"
|
||||
cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -
|
||||
|
58
mass_import.py
Normal file
58
mass_import.py
Normal file
@ -0,0 +1,58 @@
|
||||
import fileinput
|
||||
import os
|
||||
from multiprocessing.pool import Pool
|
||||
|
||||
import od_util
|
||||
from common import db, taskManager
|
||||
from database import Website
|
||||
from tasks import Task
|
||||
|
||||
# Lazily yield candidate urls, one per line, from the files named on the
# command line (or stdin when no files are given) via fileinput.
urls = (line for line in fileinput.input())
||||
|
||||
|
||||
def try_enqueue(url):
    """Validate a submitted url and queue it for crawling if acceptable.

    Runs the same gauntlet of checks as the web submission form (valid
    scheme, not already known, no parent already posted, not blacklisted,
    looks like an open directory) and returns a human-readable status
    string describing the outcome.
    """
    # Normalise: force a trailing slash, then collapse to the topmost
    # directory so sub-paths of the same site dedupe to one entry.
    url = od_util.get_top_directory(os.path.join(url, ""))

    if not od_util.is_valid_url(url):
        return "<strong>Error:</strong> Invalid url. Make sure to include the appropriate scheme."

    existing = db.get_website_by_url(url)
    if existing:
        return "Website already exists"

    if db.website_exists(url):
        return "A parent directory of this url has already been posted"

    if db.is_blacklisted(url):
        return "<strong>Error:</strong> " \
               "Sorry, this website has been blacklisted. If you think " \
               "this is an error, please <a href='/contribute'>contact me</a>."

    if not od_util.is_od(url):
        return "<strong>Error:</strong>" \
               "The anti-spam algorithm determined that the submitted url is not " \
               "an open directory or the server is not responding. If you think " \
               "this is an error, please <a href='/contribute'>contact me</a>."

    # All checks passed: record the site and hand it to the task queue.
    website_id = db.insert_website(Website(url, "localhost", "mass_import.py"))
    taskManager.queue_task(Task(website_id, url, priority=2))

    return "The website has been added to the queue"
||||
def check_url(url):
    """Best-effort wrapper around try_enqueue(), suitable for Pool.map().

    Strips surrounding whitespace (the raw line includes its newline) and
    forces a trailing slash before enqueueing. Failures are reported but
    never propagated, so one bad url cannot abort the whole import batch.
    Always returns None.
    """
    url = os.path.join(url.strip(), "")
    try:
        print(try_enqueue(url))
    except Exception as e:
        # Was a bare `except: pass`: keep the best-effort contract, but
        # narrow the catch (don't swallow KeyboardInterrupt/SystemExit)
        # and report the failure instead of silently hiding it.
        print("Error while enqueueing %s: %s" % (url, e))
    return None
||||
# Fan the urls out across worker processes; the network-bound is_od()
# probe dominates, so a wide pool is appropriate. Pool.map() blocks
# until every url has been handled.
pool = Pool(processes=50)
try:
    pool.map(func=check_url, iterable=urls)
finally:
    # close() stops new work; join() reaps the workers. The original
    # never joined, leaving worker processes unreaped on exit.
    pool.close()
    pool.join()
|
Loading…
x
Reference in New Issue
Block a user