docker-compose setup (wip)

commit df8ab7727b (parent 31877283b3)
Author: simon
Date: 2019-11-13 13:03:43 -05:00
18 changed files with 212 additions and 151 deletions

.gitignore (vendored)
@@ -4,9 +4,9 @@
 __pycache__/
 captchas/
 _stats.json
-config.py
 oddb.log
 praw.ini
 env/
 worker.json
 search_blacklist.txt
+*.iml

Dockerfile (new file)
@@ -0,0 +1,11 @@
FROM python:3.7
WORKDIR /app
ADD requirements.txt /app/requirements.txt
RUN pip install -r requirements.txt
ENTRYPOINT ["python", "app.py"]
COPY . /app

app.py
@@ -1,3 +1,6 @@
+import time
+
+time.sleep(60)
 from flask import Flask
 import api
@@ -14,4 +17,4 @@ views.setup_views(app)
 api.setup_api(app)

 if __name__ == '__main__':
-    app.run("0.0.0.0", port=12345, threaded=True)
+    app.run("0.0.0.0", port=80, threaded=True)

@@ -26,14 +26,13 @@ logger.addHandler(file_handler)
 logger.addHandler(StreamHandler(sys.stdout))

 taskManager = TaskManager()
-searchEngine = ElasticSearchEngine("od-database")
+searchEngine = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
 searchEngine.start_stats_scheduler()
 db = Database(config.DB_CONN_STR)
-redis = r.Redis()
+redis = r.Redis(host=config.REDIS_HOST, port=config.REDIS_PORT)


 def require_role(role: str):
     if db.get_user_role(session.get("username", None)) != role:
         abort(403)

config.py (new file)
@@ -0,0 +1,29 @@
from os import environ
# Environment values are strings, so compare against "true" instead of calling
# bool(), which would treat "False" as truthy.
CAPTCHA_LOGIN = environ.get("CAPTCHA_LOGIN", "False").lower() == "true"
CAPTCHA_SUBMIT = environ.get("CAPTCHA_SUBMIT", "False").lower() == "true"
CAPTCHA_SEARCH = environ.get("CAPTCHA_SEARCH", "False").lower() == "true"
CAPTCHA_EVERY = int(environ.get("CAPTCHA_EVERY", 10))
FLASK_SECRET = environ.get("FLASK_SECRET", "A very secret secret")
RESULTS_PER_PAGE = (12, 25, 50, 100, 250, 500, 1000)
SUBMIT_FTP = environ.get("SUBMIT_FTP", "False").lower() == "true"
SUBMIT_HTTP = environ.get("SUBMIT_HTTP", "True").lower() == "true"
TT_API = environ.get("TT_API", "http://localhost:3010")
TT_CRAWL_PROJECT = int(environ.get("TT_CRAWL_PROJECT", 3))
TT_INDEX_PROJECT = int(environ.get("TT_INDEX_PROJECT", 9))
WSB_API = environ.get("WSB_API", "http://localhost:3020")
WSB_SECRET = environ.get("WSB_SECRET", "default_secret")
ES_URL = environ.get("ES_URL", "http://localhost:9200")
ES_INDEX = environ.get("ES_INDEX", "od-database")
REDIS_HOST = environ.get("REDIS_HOST", "localhost")
REDIS_PORT = int(environ.get("REDIS_PORT", 6379))
DB_CONN_STR = environ.get("DB_CONN_STR", "dbname=od_database user=od_database password=od_database")
RECRAWL_POOL_SIZE = int(environ.get("RECRAWL_POOL_SIZE", 10000))
INDEXER_THREADS = int(environ.get("INDEXER_THREADS", 3))
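
For reference, a hypothetical invocation (not part of this commit) showing how these defaults can be overridden through the environment when running the app outside of docker-compose:

ES_URL=http://localhost:9200 REDIS_HOST=localhost FLASK_SECRET=dev-secret python app.py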

docker-compose.yml (new file)
@@ -0,0 +1,92 @@
version: "3"

services:
  oddb:
    image: simon987/od-database
    ports:
      - 5020:80
    environment:
      - "CAPTCHA_LOGIN=True"
      - "CAPTCHA_SUBMIT=True"
      - "CAPTCHA_SEARCH=True"
      - "CAPTCHA_EVERY=10"
      - "FLASK_SECRET=changeme"
      - "SUBMIT_FTP=False"
      - "SUBMIT_HTTP=True"
      - "TT_API=http://tt:3010"
      - "TT_CRAWL_PROJECT=1"
      - "TT_INDEX_PROJECT=2"
      - "WSB_API=http://wsb:3020"
      - "WSB_SECRET=changeme"
      - "REDIS_HOST=oddb_redis"
      - "ES_URL=es:9200"
      - "DB_CONN_STR=postgres://od_database:changeme@oddb_db/od_database?sslmode=disable"
      - "RECRAWL_POOL_SIZE=10000"
      - "INDEXER_THREADS=2"
    depends_on:
      - wsb
      - tt
      - es
    restart: always
  oddb_db:
    image: postgres
    volumes:
      - ./oddb_pg_data:/var/lib/postgresql/data
    environment:
      - "POSTGRES_USER=od_database"
      - "POSTGRES_PASSWORD=changeme"
    ports:
      - 5021:5432
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U od_database"]
      interval: 5s
      timeout: 5s
      retries: 5
  oddb_redis:
    image: redis
  wsb:
    image: simon987/wsb_bucket
    volumes:
      - ./wsb_data:/data
    environment:
      - "WS_BUCKET_SECRET=changeme"
    ports:
      - 3020:3020
  tt_db:
    image: postgres
    volumes:
      - ./tt_pg_data:/var/lib/postgresql/data
    environment:
      POSTGRES_USER: task_tracker
      POSTGRES_PASSWORD: changeme
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U task_tracker"]
      interval: 5s
      timeout: 5s
      retries: 5
  tt:
    image: simon987/task_tracker
    volumes:
      - ./tt_pg_data:/var/lib/postgresql/data
      - ./tt_config.yml:/root/config.yml
    ports:
      - 3010:80
    depends_on:
      - tt_db
  es:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.4.2
    environment:
      # - bootstrap.memory_lock=true
      - discovery.type=single-node
      # - index.number_of_shards=50
      # - index.number_of_replicas=0
      # - "ES_JAVA_OPTS=-Xms1G -Xmx10G"
    volumes:
      - /usr/share/elasticsearch/data
    healthcheck:
      test: ["CMD-SHELL", "curl --silent --fail localhost:9200/_cluster/health || exit 1"]
      interval: 30s
      timeout: 30s
      retries: 3
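
As a rough usage sketch (not part of this commit, and assuming the simon987/od-database image is built locally from the Dockerfile above or pulled from a registry), the stack would be brought up with something like:

docker build -t simon987/od-database .
docker-compose up -d
docker-compose logs -f oddb    # the web UI is published on host port 5020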

@@ -28,7 +28,7 @@ for file in os.listdir(dldir):
 print("Export started, connecting to databases...")
 db = Database(config.DB_CONN_STR)
-es = ElasticSearchEngine("od-database")
+es = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)

 docs_with_url = db.join_website_url(es.stream_all_docs())

jenkins/Jenkinsfile (vendored, deleted)
@@ -1,50 +0,0 @@
def remote = [:]
remote.name = 'remote'
remote.host = env.DEPLOY_HOST
remote.user = env.DEPLOY_USER
remote.identityFile = '/var/lib/jenkins/.ssh/id_rsa'
remote.knownHosts = '/var/lib/jenkins/.ssh/known_hosts'

pipeline {
    agent any
    stages {
        stage('Build') {
            steps {
                sh './jenkins/build.sh'
            }
        }
        stage('Deploy') {
            steps {
                sh 'echo $ODDB_CONFIG > config.py'
                sshCommand remote: remote, command: "cd od-database && rm -rf env fold_to_ascii search static task_tracker_drone templates ws_bucket_client *.py deploy.sh"
                sshPut remote: remote, from: 'requirements.txt', into: 'od-database'
                sshPut remote: remote, from: 'fold_to_ascii', into: 'od-database'
                sshPut remote: remote, from: 'search', into: 'od-database'
                sshPut remote: remote, from: 'static', into: 'od-database'
                sshPut remote: remote, from: 'task_tracker_drone', into: 'od-database'
                sshPut remote: remote, from: 'templates', into: 'od-database'
                sshPut remote: remote, from: 'ws_bucket_client', into: 'od-database'
                sshPut remote: remote, from: '__init__.py', into: 'od-database'
                sshPut remote: remote, from: 'api.py', into: 'od-database'
                sshPut remote: remote, from: 'app.py', into: 'od-database'
                sshPut remote: remote, from: 'captcha.py', into: 'od-database'
                sshPut remote: remote, from: 'common.py', into: 'od-database'
                sshPut remote: remote, from: 'database.py', into: 'od-database'
                sshPut remote: remote, from: 'export.py', into: 'od-database'
                sshPut remote: remote, from: 'init_script.sql', into: 'od-database'
                sshPut remote: remote, from: 'od_util.py', into: 'od-database'
                sshPut remote: remote, from: 'reddit_bot.py', into: 'od-database'
                sshPut remote: remote, from: 'tasks.py', into: 'od-database'
                sshPut remote: remote, from: 'template_filters.py', into: 'od-database'
                sshPut remote: remote, from: 'uwsgi.py', into: 'od-database'
                sshPut remote: remote, from: 'views.py', into: 'od-database'
                sshPut remote: remote, from: 'config.py', into: 'od-database'
                sshPut remote: remote, from: 'mass_import.py', into: 'od-database'
                sshPut remote: remote, from: 'do_recrawl.py', into: 'od-database'
                sshPut remote: remote, from: 'od-database.ini', into: 'od-database'
                sshPut remote: remote, from: 'jenkins/deploy.sh', into: 'od-database'
                sshCommand remote: remote, command: 'chmod +x od-database/deploy.sh && ./od-database/deploy.sh'
            }
        }
    }
}

@@ -1,4 +0,0 @@
#!/bin/bash
git submodule init
git submodule update --remote --recursive

@@ -1,32 +0,0 @@
#!/bin/bash
export ODDBROOT="od-database"
virtualenv ${ODDBROOT}/env -p python3.7
source ${ODDBROOT}/env/bin/activate
pip install -r ${ODDBROOT}/requirements.txt
screen -S oddb_web -X quit
killall -9 uwsgi
sleep 5
echo "starting oddb_web"
screen -S oddb_web -d -m bash -c "cd ${ODDBROOT} && source env/bin/activate && uwsgi od-database.ini 2> stderr.txt"
sleep 1
screen -list
echo "Installing crontabs"
absolute_dir=$(cd ${ODDBROOT} && pwd)
# Re-crawl dirs
command="bash -c \"cd '${absolute_dir}' && source env/bin/activate && python do_recrawl.py >> recrawl_logs.txt\""
job="*/10 * * * * $command"
echo "$job"
cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -
# Cleanup captchas
command="bash -c \"cd '${absolute_dir}' && rm captchas/*.png\""
job="*/60 * * * * $command"
echo "$job"
cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -

@@ -1,10 +0,0 @@
[uwsgi]
uwsgi-socket = 127.0.0.1:3031
wsgi-file = uwsgi.py
processes = 2
threads = 16
stats = 127.0.0.1:9191
callable=app
virtualenv=./env
disable-logging=True

@@ -18,7 +18,6 @@ lxml
 pillow
 Wand
 numpy
-matplotlib
 uwsgi
 redis
 psycopg2-binary

@@ -20,32 +20,7 @@ class IndexingError(Exception):
     pass


-class SearchEngine:
-    def __init__(self):
-        pass
-
-    def import_json(self, in_str: str, website_id: int):
-        raise NotImplementedError
-
-    def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min,
-               date_max) -> {}:
-        raise NotImplementedError
-
-    def reset(self):
-        raise NotImplementedError
-
-    def ping(self):
-        raise NotImplementedError
-
-    def get_stats(self, website_id: int, subdir: str = None):
-        raise NotImplementedError
-
-    def refresh(self):
-        raise NotImplementedError
-
-
-class ElasticSearchEngine(SearchEngine):
+class ElasticSearchEngine:
     SORT_ORDERS = {
         "score": ["_score"],
         "size_asc": [{"size": {"order": "asc"}}],
@@ -55,10 +30,11 @@ class ElasticSearchEngine(SearchEngine):
         "none": []
     }

-    def __init__(self, index_name):
+    def __init__(self, url, index_name):
         super().__init__()
         self.index_name = index_name
-        self.es = elasticsearch.Elasticsearch()
+        logger.info("Connecting to ES @ %s" % url)
+        self.es = elasticsearch.Elasticsearch(hosts=[url])
         self.filter = SearchFilter()

         if not self.es.indices.exists(self.index_name):
@@ -78,23 +54,25 @@ class ElasticSearchEngine(SearchEngine):
         # Index settings
         self.es.indices.put_settings(body={
-            "analysis": {
-                "tokenizer": {
-                    "my_nGram_tokenizer": {
-                        "type": "nGram", "min_gram": 3, "max_gram": 3
-                    }
-                }
-            }}, index=self.index_name)
-        self.es.indices.put_settings(body={
-            "analysis": {
-                "analyzer": {
-                    "my_nGram": {
-                        "tokenizer": "my_nGram_tokenizer",
-                        "filter": ["lowercase", "asciifolding"]
-                    }
-                }
-            }}, index=self.index_name)
+            "index": {
+                "refresh_interval": "30s",
+                "codec": "best_compression"
+            },
+            "analysis": {
+                "analyzer": {
+                    "my_nGram": {
+                        "tokenizer": "my_nGram_tokenizer",
+                        "filter": ["lowercase", "asciifolding"]
+                    }
+                },
+                "tokenizer": {
+                    "my_nGram_tokenizer": {
+                        "type": "nGram", "min_gram": 3, "max_gram": 3
+                    }
+                }
+            }}, index=self.index_name)
+
+        # Index Mappings
         self.es.indices.put_mapping(body={
             "properties": {
                 "path": {"analyzer": "standard", "type": "text"},
@@ -110,12 +88,6 @@ class ElasticSearchEngine(SearchEngine):
         self.es.indices.open(index=self.index_name)

-    def reset(self):
-        self.init()
-
-    def ping(self):
-        return self.es.ping()
-
     def delete_docs(self, website_id):

         while True:
@@ -332,7 +304,8 @@ class ElasticSearchEngine(SearchEngine):
             yield urljoin(base_url, "/") + src["path"] + ("/" if src["path"] != "" else "") + src["name"] + \
                   ("." if src["ext"] != "" else "") + src["ext"]

-    def get_global_stats(self):
+    @staticmethod
+    def get_global_stats():

         if os.path.exists("_stats.json"):
             with open("_stats.json", "r") as f:
@@ -489,7 +462,7 @@ class ElasticSearchEngine(SearchEngine):
                 "query": {
                     "match_all": {}
                 }
-            }, scroll="1m", client=self.es, index=self.index_name, request_timeout=60)
+            }, scroll="30s", client=self.es, index=self.index_name, request_timeout=30)

     def refresh(self):
         self.es.indices.refresh(self.index_name)

@@ -3,9 +3,11 @@ import logging
 import os
 import time
 from multiprocessing.pool import ThreadPool
+from tempfile import NamedTemporaryFile
 from threading import Thread
 from uuid import uuid4

+import requests
 import urllib3

 import config
@@ -60,13 +62,13 @@ class IndexingTask:

 class TaskManager:
     def __init__(self):
-        self.search = ElasticSearchEngine("od-database")
+        self.search = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
         self.db = database.Database(config.DB_CONN_STR)

         self.tracker = TaskTrackerApi(config.TT_API)

         self.worker = Worker.from_file(self.tracker)
         if not self.worker:
-            self.worker = self.tracker.make_worker("oddb_master")
+            self.worker = self.tracker.make_worker("$oddb_master")
             self.worker.dump_to_file()
             self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
             self.worker.request_access(config.TT_INDEX_PROJECT, True, False)
@@ -91,8 +93,9 @@ class TaskManager:
             try:
                 recipe = task.json_recipe()
                 logger.debug("Got indexing task: " + str(recipe))
-                filename = os.path.join(config.WSB_PATH,
-                                        format_file_name(recipe["website_id"], recipe["upload_token"]))
+
+                filename = download_file(config.WSB_API + "/slot?token=" + recipe["upload_token"])
                 self._complete_task(filename, Task(recipe["website_id"], recipe["url"]))
             except Exception as e:
                 self.worker.release_task(task_id=task.id, result=1, verification=0)
@@ -167,3 +170,18 @@ class TaskManager:

 def format_file_name(website_id, token):
     return "%d_%s.NDJSON" % (website_id, token,)
+
+
+def download_file(url):
+    r = requests.get(url, stream=True)
+    if r.status_code != 200:
+        raise ValueError("HTTP error %d: %s" % (r.status_code, url))
+
+    tmp = NamedTemporaryFile(delete=False)
+    for chunk in r.iter_content(chunk_size=4096):
+        if chunk:
+            tmp.write(chunk)
+    tmp.close()
+
+    return tmp.name

templates/search.html (mode changed: Executable file → Normal file, no content changes)

tt_config.yml (new file)
@@ -0,0 +1,24 @@
server:
  address: "0.0.0.0:3010"

database:
  conn_str: "postgres://task_tracker:changeme@tt_db/task_tracker?sslmode=disable"
  log_levels: ["error", "info", "warn"]

git:
  webhook_hash: "sha256"
  webhook_sig_header: "X-Gogs-Signature"

log:
  level: "trace"

session:
  cookie_name: "tt"
  expiration: "48h"

monitoring:
  snapshot_interval: "120s"
  history_length: "1800h"

maintenance:
  reset_timed_out_tasks_interval: "10m"

uwsgi.ini (new file)
@@ -0,0 +1,9 @@
[uwsgi]
module = main
callable = app
enable-threads = true
processes = 4
threads = 16
disable-logging = True