docker-compose setup (wip)

simon 2019-11-13 13:03:43 -05:00
parent 31877283b3
commit df8ab7727b
18 changed files with 212 additions and 151 deletions

2
.gitignore vendored

@@ -4,9 +4,9 @@
__pycache__/
captchas/
_stats.json
config.py
oddb.log
praw.ini
env/
worker.json
search_blacklist.txt
*.iml

11
Dockerfile Normal file

@@ -0,0 +1,11 @@
FROM python:3.7
WORKDIR /app
# Copy requirements first so the dependency layer is cached across code changes
COPY requirements.txt /app/requirements.txt
RUN pip install -r requirements.txt
ENTRYPOINT ["python", "app.py"]
COPY . /app

5
app.py

@@ -1,3 +1,6 @@
import time
time.sleep(60)  # crude wait so the backing services from docker-compose can come up first
from flask import Flask
import api
@@ -14,4 +17,4 @@ views.setup_views(app)
api.setup_api(app)
if __name__ == '__main__':
app.run("0.0.0.0", port=12345, threaded=True)
app.run("0.0.0.0", port=80, threaded=True)

View File

@@ -26,14 +26,13 @@ logger.addHandler(file_handler)
logger.addHandler(StreamHandler(sys.stdout))
taskManager = TaskManager()
searchEngine = ElasticSearchEngine("od-database")
searchEngine = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
searchEngine.start_stats_scheduler()
db = Database(config.DB_CONN_STR)
redis = r.Redis()
redis = r.Redis(host=config.REDIS_HOST, port=config.REDIS_PORT)
def require_role(role: str):
if db.get_user_role(session.get("username", None)) != role:
abort(403)

29
config.py Normal file

@@ -0,0 +1,29 @@
from os import environ

def _env_bool(name, default=False):
    # Environment values are strings, so bool() alone would treat "False" as truthy
    return environ.get(name, str(default)).lower() in ("1", "true", "yes")

CAPTCHA_LOGIN = _env_bool("CAPTCHA_LOGIN")
CAPTCHA_SUBMIT = _env_bool("CAPTCHA_SUBMIT")
CAPTCHA_SEARCH = _env_bool("CAPTCHA_SEARCH")
CAPTCHA_EVERY = int(environ.get("CAPTCHA_EVERY", 10))
FLASK_SECRET = environ.get("FLASK_SECRET", "A very secret secret")
RESULTS_PER_PAGE = (12, 25, 50, 100, 250, 500, 1000)
SUBMIT_FTP = _env_bool("SUBMIT_FTP")
SUBMIT_HTTP = _env_bool("SUBMIT_HTTP", True)
TT_API = environ.get("TT_API", "http://localhost:3010")
TT_CRAWL_PROJECT = int(environ.get("TT_CRAWL_PROJECT", 3))
TT_INDEX_PROJECT = int(environ.get("TT_INDEX_PROJECT", 9))
WSB_API = environ.get("WSB_API", "http://localhost:3020")
WSB_SECRET = environ.get("WSB_SECRET", "default_secret")
ES_URL = environ.get("ES_URL", "http://localhost:9200")
ES_INDEX = environ.get("ES_INDEX", "od-database")
REDIS_HOST = environ.get("REDIS_HOST", "localhost")
REDIS_PORT = int(environ.get("REDIS_PORT", 6379))
DB_CONN_STR = environ.get("DB_CONN_STR", "dbname=od_database user=od_database password=od_database")
RECRAWL_POOL_SIZE = int(environ.get("RECRAWL_POOL_SIZE", 10000))
INDEXER_THREADS = int(environ.get("INDEXER_THREADS", 3))
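
Every value coming from docker-compose arrives as a string, which is why the explicit parsing above matters. A quick standalone sketch of the pitfall that _env_bool avoids:

from os import environ

environ["SUBMIT_FTP"] = "False"  # as set in docker-compose.yml
print(bool(environ["SUBMIT_FTP"]))  # True -- any non-empty string is truthy
print(environ["SUBMIT_FTP"].lower() in ("1", "true", "yes"))  # False, as intended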

92
docker-compose.yml Normal file

@@ -0,0 +1,92 @@
version: "3"
services:
oddb:
image: simon987/od-database
ports:
- 5020:80
environment:
- "CAPTCHA_LOGIN=True"
- "CAPTCHA_SUBMIT=True"
- "CAPTCHA_SEARCH=True"
- "CAPTCHA_EVERY=10"
- "FLASK_SECRET=changeme"
- "SUBMIT_FTP=False"
- "SUBMIT_HTTP=True"
- "TT_API=http://tt:3010"
- "TT_CRAWL_PROJECT=1"
- "TT_INDEX_PROJECT=2"
- "WSB_API=http://wsb:3020"
- "WSB_SECRET=changeme"
- "REDIS_HOST=oddb_redis"
- "ES_URL=es:9200"
- "DB_CONN_STR=postgres://od_database:changeme@oddb_db/od_database?sslmode=disable"
- "RECRAWL_POOL_SIZE=10000"
- "INDEXER_THREADS=2"
depends_on:
- wsb
- tt
- es
restart: always
oddb_db:
image: postgres
volumes:
- ./oddb_pg_data:/var/lib/postgresql/data
environment:
- "POSTGRES_USER=od_database"
- "POSTGRES_PASSWORD=changeme"
ports:
- 5021:5432
healthcheck:
test: ["CMD-SHELL", "pg_isready -U od_database"]
interval: 5s
timeout: 5s
retries: 5
oddb_redis:
image: redis
wsb:
image: simon987/wsb_bucket
volumes:
- ./wsb_data:/data
environment:
- "WS_BUCKET_SECRET=changeme"
ports:
- 3020:3020
tt_db:
image: postgres
volumes:
- ./tt_pg_data:/var/lib/postgresql/data
environment:
POSTGRES_USER: task_tracker
POSTGRES_PASSWORD: changeme
healthcheck:
test: ["CMD-SHELL", "pg_isready -U task_tracker"]
interval: 5s
timeout: 5s
retries: 5
tt:
image: simon987/task_tracker
volumes:
- ./tt_config.yml:/root/config.yml
ports:
- 3010:80
depends_on:
- tt_db
es:
image: docker.elastic.co/elasticsearch/elasticsearch:7.4.2
environment:
# - bootstrap.memory_lock=true
- discovery.type=single-node
# - index.number_of_shards=50
# - index.number_of_replicas=0
# - "ES_JAVA_OPTS=-Xms1G -Xmx10G"
volumes:
- /usr/share/elasticsearch/data
healthcheck:
test: ["CMD-SHELL", "curl --silent --fail localhost:9200/_cluster/health || exit 1"]
interval: 30s
timeout: 30s
retries: 3
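
With compose file format 3, depends_on only orders startup and does not wait for healthchecks to pass, which is what the delay in app.py papers over. A minimal smoke test against the published ports, run from the host after docker-compose up -d (a sketch, assuming the defaults above):

import psycopg2
import requests

# Web UI: host port 5020 maps to container port 80
assert requests.get("http://localhost:5020/", timeout=10).ok

# od-database Postgres: host port 5021 maps to container port 5432
psycopg2.connect("dbname=od_database user=od_database password=changeme"
                 " host=localhost port=5021").close()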

View File

@@ -28,7 +28,7 @@ for file in os.listdir(dldir):
print("Export started, connecting to databases...")
db = Database(config.DB_CONN_STR)
es = ElasticSearchEngine("od-database")
es = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
docs_with_url = db.join_website_url(es.stream_all_docs())

50
jenkins/Jenkinsfile vendored

@@ -1,50 +0,0 @@
def remote = [:]
remote.name = 'remote'
remote.host = env.DEPLOY_HOST
remote.user = env.DEPLOY_USER
remote.identityFile = '/var/lib/jenkins/.ssh/id_rsa'
remote.knownHosts = '/var/lib/jenkins/.ssh/known_hosts'
pipeline {
agent any
stages {
stage('Build') {
steps {
sh './jenkins/build.sh'
}
}
stage('Deploy') {
steps {
sh 'echo $ODDB_CONFIG > config.py'
sshCommand remote: remote, command: "cd od-database && rm -rf env fold_to_ascii search static task_tracker_drone templates ws_bucket_client *.py deploy.sh"
sshPut remote: remote, from: 'requirements.txt', into: 'od-database'
sshPut remote: remote, from: 'fold_to_ascii', into: 'od-database'
sshPut remote: remote, from: 'search', into: 'od-database'
sshPut remote: remote, from: 'static', into: 'od-database'
sshPut remote: remote, from: 'task_tracker_drone', into: 'od-database'
sshPut remote: remote, from: 'templates', into: 'od-database'
sshPut remote: remote, from: 'ws_bucket_client', into: 'od-database'
sshPut remote: remote, from: '__init__.py', into: 'od-database'
sshPut remote: remote, from: 'api.py', into: 'od-database'
sshPut remote: remote, from: 'app.py', into: 'od-database'
sshPut remote: remote, from: 'captcha.py', into: 'od-database'
sshPut remote: remote, from: 'common.py', into: 'od-database'
sshPut remote: remote, from: 'database.py', into: 'od-database'
sshPut remote: remote, from: 'export.py', into: 'od-database'
sshPut remote: remote, from: 'init_script.sql', into: 'od-database'
sshPut remote: remote, from: 'od_util.py', into: 'od-database'
sshPut remote: remote, from: 'reddit_bot.py', into: 'od-database'
sshPut remote: remote, from: 'tasks.py', into: 'od-database'
sshPut remote: remote, from: 'template_filters.py', into: 'od-database'
sshPut remote: remote, from: 'uwsgi.py', into: 'od-database'
sshPut remote: remote, from: 'views.py', into: 'od-database'
sshPut remote: remote, from: 'config.py', into: 'od-database'
sshPut remote: remote, from: 'mass_import.py', into: 'od-database'
sshPut remote: remote, from: 'do_recrawl.py', into: 'od-database'
sshPut remote: remote, from: 'od-database.ini', into: 'od-database'
sshPut remote: remote, from: 'jenkins/deploy.sh', into: 'od-database'
sshCommand remote: remote, command: 'chmod +x od-database/deploy.sh && ./od-database/deploy.sh'
}
}
}
}

View File

@@ -1,4 +0,0 @@
#!/bin/bash
git submodule init
git submodule update --remote --recursive

View File

@@ -1,32 +0,0 @@
#!/bin/bash
export ODDBROOT="od-database"
virtualenv ${ODDBROOT}/env -p python3.7
source ${ODDBROOT}/env/bin/activate
pip install -r ${ODDBROOT}/requirements.txt
screen -S oddb_web -X quit
killall -9 uwsgi
sleep 5
echo "starting oddb_web"
screen -S oddb_web -d -m bash -c "cd ${ODDBROOT} && source env/bin/activate && uwsgi od-database.ini 2> stderr.txt"
sleep 1
screen -list
echo "Installing crontabs"
absolute_dir=$(cd ${ODDBROOT} && pwd)
# Re-crawl dirs
command="bash -c \"cd '${absolute_dir}' && source env/bin/activate && python do_recrawl.py >> recrawl_logs.txt\""
job="*/10 * * * * $command"
echo "$job"
cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -
# Cleanup captchas
command="bash -c \"cd '${absolute_dir}' && rm captchas/*.png\""
job="*/60 * * * * $command"
echo "$job"
cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -

View File

@@ -1,10 +0,0 @@
[uwsgi]
uwsgi-socket = 127.0.0.1:3031
wsgi-file = uwsgi.py
processes = 2
threads = 16
stats = 127.0.0.1:9191
callable=app
virtualenv=./env
disable-logging=True

View File

@@ -18,7 +18,6 @@ lxml
pillow
Wand
numpy
matplotlib
uwsgi
redis
psycopg2-binary

View File

@@ -20,32 +20,7 @@ class IndexingError(Exception):
pass
class SearchEngine:
def __init__(self):
pass
def import_json(self, in_str: str, website_id: int):
raise NotImplementedError
def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min,
date_max) -> {}:
raise NotImplementedError
def reset(self):
raise NotImplementedError
def ping(self):
raise NotImplementedError
def get_stats(self, website_id: int, subdir: str = None):
raise NotImplementedError
def refresh(self):
raise NotImplementedError
class ElasticSearchEngine(SearchEngine):
class ElasticSearchEngine:
SORT_ORDERS = {
"score": ["_score"],
"size_asc": [{"size": {"order": "asc"}}],
@@ -55,10 +30,11 @@ class ElasticSearchEngine(SearchEngine):
"none": []
}
def __init__(self, index_name):
def __init__(self, url, index_name):
super().__init__()
self.index_name = index_name
self.es = elasticsearch.Elasticsearch()
logger.info("Connecting to ES @ %s" % url)
self.es = elasticsearch.Elasticsearch(hosts=[url])
self.filter = SearchFilter()
if not self.es.indices.exists(self.index_name):
@@ -78,23 +54,25 @@ class ElasticSearchEngine(SearchEngine):
# Index settings
self.es.indices.put_settings(body={
"analysis": {
"tokenizer": {
"my_nGram_tokenizer": {
"type": "nGram", "min_gram": 3, "max_gram": 3
}
}
}}, index=self.index_name)
self.es.indices.put_settings(body={
"index": {
"refresh_interval": "30s",
"codec": "best_compression"
},
"analysis": {
"analyzer": {
"my_nGram": {
"tokenizer": "my_nGram_tokenizer",
"filter": ["lowercase", "asciifolding"]
}
},
"tokenizer": {
"my_nGram_tokenizer": {
"type": "nGram", "min_gram": 3, "max_gram": 3
}
}
}}, index=self.index_name)
# Index Mappings
self.es.indices.put_mapping(body={
"properties": {
"path": {"analyzer": "standard", "type": "text"},
@@ -110,12 +88,6 @@ class ElasticSearchEngine(SearchEngine):
self.es.indices.open(index=self.index_name)
def reset(self):
self.init()
def ping(self):
return self.es.ping()
def delete_docs(self, website_id):
while True:
@@ -332,7 +304,8 @@ class ElasticSearchEngine(SearchEngine):
yield urljoin(base_url, "/") + src["path"] + ("/" if src["path"] != "" else "") + src["name"] + \
("." if src["ext"] != "" else "") + src["ext"]
def get_global_stats(self):
@staticmethod
def get_global_stats():
if os.path.exists("_stats.json"):
with open("_stats.json", "r") as f:
@@ -489,7 +462,7 @@ class ElasticSearchEngine(SearchEngine):
"query": {
"match_all": {}
}
}, scroll="1m", client=self.es, index=self.index_name, request_timeout=60)
}, scroll="30s", client=self.es, index=self.index_name, request_timeout=30)
def refresh(self):
self.es.indices.refresh(self.index_name)
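
With the constructor now taking the cluster URL explicitly, wiring it up from config looks like the following sketch (the module path is an assumption, since the file name is not shown above):

import config
from search.search import ElasticSearchEngine  # assumed module path

search = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
if not search.es.ping():  # the ping() wrapper was removed; ask the client directly
    raise RuntimeError("Elasticsearch unreachable at " + config.ES_URL)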

View File

@@ -3,9 +3,11 @@ import logging
import os
import time
from multiprocessing.pool import ThreadPool
from tempfile import NamedTemporaryFile
from threading import Thread
from uuid import uuid4
import requests
import urllib3
import config
@@ -60,13 +62,13 @@ class IndexingTask:
class TaskManager:
def __init__(self):
self.search = ElasticSearchEngine("od-database")
self.search = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
self.db = database.Database(config.DB_CONN_STR)
self.tracker = TaskTrackerApi(config.TT_API)
self.worker = Worker.from_file(self.tracker)
if not self.worker:
self.worker = self.tracker.make_worker("oddb_master")
self.worker = self.tracker.make_worker("$oddb_master")
self.worker.dump_to_file()
self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
self.worker.request_access(config.TT_INDEX_PROJECT, True, False)
@@ -91,8 +93,9 @@ class TaskManager:
try:
recipe = task.json_recipe()
logger.debug("Got indexing task: " + str(recipe))
filename = os.path.join(config.WSB_PATH,
format_file_name(recipe["website_id"], recipe["upload_token"]))
filename = download_file(config.WSB_API + "/slot?token=" + recipe["upload_token"])
self._complete_task(filename, Task(recipe["website_id"], recipe["url"]))
except Exception as e:
self.worker.release_task(task_id=task.id, result=1, verification=0)
@@ -167,3 +170,18 @@ class TaskManager:
def format_file_name(website_id, token):
return "%d_%s.NDJSON" % (website_id, token,)
def download_file(url):
r = requests.get(url, stream=True)
if r.status_code != 200:
raise ValueError("HTTP error %d: %s" % (r.status_code, url))
tmp = NamedTemporaryFile(delete=False)
for chunk in r.iter_content(chunk_size=4096):
if chunk:
tmp.write(chunk)
tmp.close()
return tmp.name
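
Because the temporary file is created with delete=False, it outlives download_file and the caller is responsible for removing it. A usage sketch (process and token are hypothetical stand-ins):

import os

path = download_file(config.WSB_API + "/slot?token=" + token)
try:
    process(path)  # e.g. hand the NDJSON file to the indexer
finally:
    os.remove(path)  # the NamedTemporaryFile is not deleted automatically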

0
templates/search.html Executable file → Normal file

24
tt_config.yml Normal file

@@ -0,0 +1,24 @@
server:
address: "0.0.0.0:3010"
database:
conn_str: "postgres://task_tracker:changeme@tt_db/task_tracker?sslmode=disable"
log_levels: ["error", "info", "warn"]
git:
webhook_hash: "sha256"
webhook_sig_header: "X-Gogs-Signature"
log:
level: "trace"
session:
cookie_name: "tt"
expiration: "48h"
monitoring:
snapshot_interval: "120s"
history_length: "1800h"
maintenance:
reset_timed_out_tasks_interval: "10m"

9
uwsgi.ini Normal file

@@ -0,0 +1,9 @@
[uwsgi]
module = main
callable = app
enable-threads = true
processes = 4
threads = 16
disable-logging = True
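
Note that this uwsgi.ini points at module = main while the Dockerfile runs app.py directly, so the ini file appears unused for now. If uwsgi is wired in later, a minimal main.py along these lines would be needed (a sketch, not part of this commit):

# main.py -- re-export the Flask app for uwsgi's "module = main"
from app import app  # noqa: F401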