From df8ab7727bd5452c08e1a7d238fc5466e8460362 Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 13 Nov 2019 13:03:43 -0500 Subject: [PATCH] docker-compose setup (wip) --- .gitignore | 2 +- Dockerfile | 11 ++++++ app.py | 5 ++- common.py | 5 +-- config.py | 29 ++++++++++++++ docker-compose.yml | 92 +++++++++++++++++++++++++++++++++++++++++++ export.py | 2 +- jenkins/Jenkinsfile | 50 ----------------------- jenkins/build.sh | 4 -- jenkins/deploy.sh | 32 --------------- uwsgi.py => main.py | 0 od-database.ini | 10 ----- requirements.txt | 1 - search/search.py | 61 ++++++++-------------------- tasks.py | 26 ++++++++++-- templates/search.html | 0 tt_config.yml | 24 +++++++++++ uwsgi.ini | 9 +++++ 18 files changed, 212 insertions(+), 151 deletions(-) create mode 100644 Dockerfile create mode 100644 config.py create mode 100644 docker-compose.yml delete mode 100644 jenkins/Jenkinsfile delete mode 100755 jenkins/build.sh delete mode 100755 jenkins/deploy.sh rename uwsgi.py => main.py (100%) delete mode 100644 od-database.ini mode change 100755 => 100644 templates/search.html create mode 100644 tt_config.yml create mode 100644 uwsgi.ini diff --git a/.gitignore b/.gitignore index 7db1aee..9bc380e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,9 +4,9 @@ __pycache__/ captchas/ _stats.json -config.py oddb.log praw.ini env/ worker.json search_blacklist.txt +*.iml diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..39bd781 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM python:3.7 + +WORKDIR /app + +ADD requirements.txt /app/requirements.txt +RUN pip install -r requirements.txt + +ENTRYPOINT ["python", "app.py"] + +COPY . 
/app + diff --git a/app.py b/app.py index 1e3112f..1f00e69 100644 --- a/app.py +++ b/app.py @@ -1,3 +1,6 @@ +import time +time.sleep(60) + from flask import Flask import api @@ -14,4 +17,4 @@ views.setup_views(app) api.setup_api(app) if __name__ == '__main__': - app.run("0.0.0.0", port=12345, threaded=True) + app.run("0.0.0.0", port=80, threaded=True) diff --git a/common.py b/common.py index 089f8d7..94f0f57 100644 --- a/common.py +++ b/common.py @@ -26,14 +26,13 @@ logger.addHandler(file_handler) logger.addHandler(StreamHandler(sys.stdout)) taskManager = TaskManager() -searchEngine = ElasticSearchEngine("od-database") +searchEngine = ElasticSearchEngine(config.ES_URL, config.ES_INDEX) searchEngine.start_stats_scheduler() db = Database(config.DB_CONN_STR) -redis = r.Redis() +redis = r.Redis(host=config.REDIS_HOST, port=config.REDIS_PORT) def require_role(role: str): - if db.get_user_role(session.get("username", None)) != role: abort(403) diff --git a/config.py b/config.py new file mode 100644 index 0000000..9993f3e --- /dev/null +++ b/config.py @@ -0,0 +1,29 @@ +from os import environ + +CAPTCHA_LOGIN = environ.get("CAPTCHA_LOGIN", "False").strip().lower() in ("1", "true", "yes", "on") +CAPTCHA_SUBMIT = environ.get("CAPTCHA_SUBMIT", "False").strip().lower() in ("1", "true", "yes", "on") +CAPTCHA_SEARCH = environ.get("CAPTCHA_SEARCH", "False").strip().lower() in ("1", "true", "yes", "on") +CAPTCHA_EVERY = int(environ.get("CAPTCHA_EVERY", 10)) + +FLASK_SECRET = environ.get("FLASK_SECRET", "A very secret secret") +RESULTS_PER_PAGE = (12, 25, 50, 100, 250, 500, 1000) + +SUBMIT_FTP = environ.get("SUBMIT_FTP", "False").strip().lower() in ("1", "true", "yes", "on") +SUBMIT_HTTP = environ.get("SUBMIT_HTTP", "True").strip().lower() in ("1", "true", "yes", "on") + +TT_API = environ.get("TT_API", "http://localhost:3010") +TT_CRAWL_PROJECT = int(environ.get("TT_CRAWL_PROJECT", 3)) +TT_INDEX_PROJECT = int(environ.get("TT_INDEX_PROJECT", 9)) + +WSB_API = environ.get("WSB_API", "http://localhost:3020") +WSB_SECRET = environ.get("WSB_SECRET", "default_secret") + +ES_URL = environ.get("ES_URL", "http://localhost:9200") +ES_INDEX = environ.get("ES_INDEX", "od-database") + +REDIS_HOST = 
environ.get("REDIS_HOST", "localhost") +REDIS_PORT = int(environ.get("REDIS_PORT", 6379)) + +DB_CONN_STR = environ.get("DB_CONN_STR", "dbname=od_database user=od_database password=od_database") +RECRAWL_POOL_SIZE = int(environ.get("RECRAWL_POOL_SIZE", 10000)) +INDEXER_THREADS = int(environ.get("INDEXER_THREADS", 3)) diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..c47f598 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,92 @@ +version: "3" +services: + oddb: + image: simon987/od-database + ports: + - 5020:80 + environment: + - "CAPTCHA_LOGIN=True" + - "CAPTCHA_SUBMIT=True" + - "CAPTCHA_SEARCH=True" + - "CAPTCHA_EVERY=10" + - "FLASK_SECRET=changeme" + - "SUBMIT_FTP=False" + - "SUBMIT_HTTP=True" + - "TT_API=http://tt:3010" + - "TT_CRAWL_PROJECT=1" + - "TT_INDEX_PROJECT=2" + - "WSB_API=http://wsb:3020" + - "WSB_SECRET=changeme" + - "REDIS_HOST=oddb_redis" + - "ES_URL=es:9200" + - "DB_CONN_STR=postgres://od_database:changeme@oddb_db/od_database?sslmode=disable" + - "RECRAWL_POOL_SIZE=10000" + - "INDEXER_THREADS=2" + depends_on: + - wsb + - tt + - es + restart: always + oddb_db: + image: postgres + volumes: + - ./oddb_pg_data:/var/lib/postgresql/data + environment: + - "POSTGRES_USER=od_database" + - "POSTGRES_PASSWORD=changeme" + ports: + - 5021:5432 + healthcheck: + test: ["CMD-SHELL", "pg_isready -U od_database"] + interval: 5s + timeout: 5s + retries: 5 + oddb_redis: + image: redis + wsb: + image: simon987/wsb_bucket + volumes: + - ./wsb_data:/data + environment: + - "WS_BUCKET_SECRET=changeme" + ports: + - 3020:3020 + tt_db: + image: postgres + volumes: + - ./tt_pg_data:/var/lib/postgresql/data + environment: + POSTGRES_USER: task_tracker + POSTGRES_PASSWORD: changeme + healthcheck: + test: ["CMD-SHELL", "pg_isready -U task_tracker"] + interval: 5s + timeout: 5s + retries: 5 + tt: + image: simon987/task_tracker + volumes: + - ./tt_pg_data:/var/lib/postgresql/data + - ./tt_config.yml:/root/config.yml + ports: + - 3010:80 + depends_on: + 
- tt_db + es: + image: docker.elastic.co/elasticsearch/elasticsearch:7.4.2 + environment: +# - bootstrap.memory_lock=true + - discovery.type=single-node +# - index.number_of_shards=50 +# - index.number_of_replicas=0 +# - "ES_JAVA_OPTS=-Xms1G -Xmx10G" + volumes: + - /usr/share/elasticsearch/data + healthcheck: + test: ["CMD-SHELL", "curl --silent --fail localhost:9200/_cluster/health || exit 1"] + interval: 30s + timeout: 30s + retries: 3 + + + diff --git a/export.py b/export.py index d2558ca..4973afa 100644 --- a/export.py +++ b/export.py @@ -28,7 +28,7 @@ for file in os.listdir(dldir): print("Export started, connecting to databases...") db = Database(config.DB_CONN_STR) -es = ElasticSearchEngine("od-database") +es = ElasticSearchEngine(config.ES_URL, config.ES_INDEX) docs_with_url = db.join_website_url(es.stream_all_docs()) diff --git a/jenkins/Jenkinsfile b/jenkins/Jenkinsfile deleted file mode 100644 index 5631447..0000000 --- a/jenkins/Jenkinsfile +++ /dev/null @@ -1,50 +0,0 @@ -def remote = [:] -remote.name = 'remote' -remote.host = env.DEPLOY_HOST -remote.user = env.DEPLOY_USER -remote.identityFile = '/var/lib/jenkins/.ssh/id_rsa' -remote.knownHosts = '/var/lib/jenkins/.ssh/known_hosts' - -pipeline { - agent any - stages { - stage('Build') { - steps { - sh './jenkins/build.sh' - } - } - stage('Deploy') { - steps { - sh 'echo $ODDB_CONFIG > config.py' - sshCommand remote: remote, command: "cd od-database && rm -rf env fold_to_ascii search static task_tracker_drone templates ws_bucket_client *.py deploy.sh" - sshPut remote: remote, from: 'requirements.txt', into: 'od-database' - sshPut remote: remote, from: 'fold_to_ascii', into: 'od-database' - sshPut remote: remote, from: 'search', into: 'od-database' - sshPut remote: remote, from: 'static', into: 'od-database' - sshPut remote: remote, from: 'task_tracker_drone', into: 'od-database' - sshPut remote: remote, from: 'templates', into: 'od-database' - sshPut remote: remote, from: 'ws_bucket_client', into: 
'od-database' - sshPut remote: remote, from: '__init__.py', into: 'od-database' - sshPut remote: remote, from: 'api.py', into: 'od-database' - sshPut remote: remote, from: 'app.py', into: 'od-database' - sshPut remote: remote, from: 'captcha.py', into: 'od-database' - sshPut remote: remote, from: 'common.py', into: 'od-database' - sshPut remote: remote, from: 'database.py', into: 'od-database' - sshPut remote: remote, from: 'export.py', into: 'od-database' - sshPut remote: remote, from: 'init_script.sql', into: 'od-database' - sshPut remote: remote, from: 'od_util.py', into: 'od-database' - sshPut remote: remote, from: 'reddit_bot.py', into: 'od-database' - sshPut remote: remote, from: 'tasks.py', into: 'od-database' - sshPut remote: remote, from: 'template_filters.py', into: 'od-database' - sshPut remote: remote, from: 'uwsgi.py', into: 'od-database' - sshPut remote: remote, from: 'views.py', into: 'od-database' - sshPut remote: remote, from: 'config.py', into: 'od-database' - sshPut remote: remote, from: 'mass_import.py', into: 'od-database' - sshPut remote: remote, from: 'do_recrawl.py', into: 'od-database' - sshPut remote: remote, from: 'od-database.ini', into: 'od-database' - sshPut remote: remote, from: 'jenkins/deploy.sh', into: 'od-database' - sshCommand remote: remote, command: 'chmod +x od-database/deploy.sh && ./od-database/deploy.sh' - } - } - } -} \ No newline at end of file diff --git a/jenkins/build.sh b/jenkins/build.sh deleted file mode 100755 index af57e90..0000000 --- a/jenkins/build.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -git submodule init -git submodule update --remote --recursive diff --git a/jenkins/deploy.sh b/jenkins/deploy.sh deleted file mode 100755 index b02801e..0000000 --- a/jenkins/deploy.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -export ODDBROOT="od-database" - -virtualenv ${ODDBROOT}/env -p python3.7 -source ${ODDBROOT}/env/bin/activate -pip install -r ${ODDBROOT}/requirements.txt - -screen -S oddb_web -X quit 
-killall -9 uwsgi - -sleep 5 - -echo "starting oddb_web" -screen -S oddb_web -d -m bash -c "cd ${ODDBROOT} && source env/bin/activate && uwsgi od-database.ini 2> stderr.txt" -sleep 1 -screen -list - -echo "Installing crontabs" -absolute_dir=$(cd ${ODDBROOT} && pwd) - -# Re-crawl dirs -command="bash -c \"cd '${absolute_dir}' && source env/bin/activate && python do_recrawl.py >> recrawl_logs.txt\"" -job="*/10 * * * * $command" -echo "$job" -cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab - - -# Cleanup captchas -command="bash -c \"cd '${absolute_dir}' && rm captchas/*.png\"" -job="*/60 * * * * $command" -echo "$job" -cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab - diff --git a/uwsgi.py b/main.py similarity index 100% rename from uwsgi.py rename to main.py diff --git a/od-database.ini b/od-database.ini deleted file mode 100644 index 088a3a7..0000000 --- a/od-database.ini +++ /dev/null @@ -1,10 +0,0 @@ -[uwsgi] -uwsgi-socket = 127.0.0.1:3031 -wsgi-file = uwsgi.py -processes = 2 -threads = 16 -stats = 127.0.0.1:9191 -callable=app -virtualenv=./env - -disable-logging=True \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 96cde60..c7b8d3f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,6 @@ lxml pillow Wand numpy -matplotlib uwsgi redis psycopg2-binary diff --git a/search/search.py b/search/search.py index e7f7b32..340134f 100644 --- a/search/search.py +++ b/search/search.py @@ -20,32 +20,7 @@ class IndexingError(Exception): pass -class SearchEngine: - - def __init__(self): - pass - - def import_json(self, in_str: str, website_id: int): - raise NotImplementedError - - def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min, - date_max) -> {}: - raise NotImplementedError - - def reset(self): - raise NotImplementedError - - def ping(self): - raise NotImplementedError - - def get_stats(self, website_id: int, subdir: str = 
None): - raise NotImplementedError - - def refresh(self): - raise NotImplementedError - - -class ElasticSearchEngine(SearchEngine): +class ElasticSearchEngine: SORT_ORDERS = { "score": ["_score"], "size_asc": [{"size": {"order": "asc"}}], @@ -55,10 +30,11 @@ class ElasticSearchEngine(SearchEngine): "none": [] } - def __init__(self, index_name): + def __init__(self, url, index_name): super().__init__() self.index_name = index_name - self.es = elasticsearch.Elasticsearch() + logger.info("Connecting to ES @ %s" % url) + self.es = elasticsearch.Elasticsearch(hosts=[url]) self.filter = SearchFilter() if not self.es.indices.exists(self.index_name): @@ -78,23 +54,25 @@ class ElasticSearchEngine(SearchEngine): # Index settings self.es.indices.put_settings(body={ - "analysis": { - "tokenizer": { - "my_nGram_tokenizer": { - "type": "nGram", "min_gram": 3, "max_gram": 3 - } - } - }}, index=self.index_name) - self.es.indices.put_settings(body={ + "index": { + "refresh_interval": "30s", + "codec": "best_compression" + }, "analysis": { "analyzer": { "my_nGram": { "tokenizer": "my_nGram_tokenizer", "filter": ["lowercase", "asciifolding"] } + }, + "tokenizer": { + "my_nGram_tokenizer": { + "type": "nGram", "min_gram": 3, "max_gram": 3 + } } }}, index=self.index_name) + # Index Mappings self.es.indices.put_mapping(body={ "properties": { "path": {"analyzer": "standard", "type": "text"}, @@ -110,12 +88,6 @@ class ElasticSearchEngine(SearchEngine): self.es.indices.open(index=self.index_name) - def reset(self): - self.init() - - def ping(self): - return self.es.ping() - def delete_docs(self, website_id): while True: @@ -332,7 +304,8 @@ class ElasticSearchEngine(SearchEngine): yield urljoin(base_url, "/") + src["path"] + ("/" if src["path"] != "" else "") + src["name"] + \ ("." 
if src["ext"] != "" else "") + src["ext"] - def get_global_stats(self): + @staticmethod + def get_global_stats(): if os.path.exists("_stats.json"): with open("_stats.json", "r") as f: @@ -489,7 +462,7 @@ class ElasticSearchEngine(SearchEngine): "query": { "match_all": {} } - }, scroll="1m", client=self.es, index=self.index_name, request_timeout=60) + }, scroll="30s", client=self.es, index=self.index_name, request_timeout=30) def refresh(self): self.es.indices.refresh(self.index_name) diff --git a/tasks.py b/tasks.py index 79d026d..db693fa 100644 --- a/tasks.py +++ b/tasks.py @@ -3,9 +3,11 @@ import logging import os import time from multiprocessing.pool import ThreadPool +from tempfile import NamedTemporaryFile from threading import Thread from uuid import uuid4 +import requests import urllib3 import config @@ -60,13 +62,13 @@ class IndexingTask: class TaskManager: def __init__(self): - self.search = ElasticSearchEngine("od-database") + self.search = ElasticSearchEngine(config.ES_URL, config.ES_INDEX) self.db = database.Database(config.DB_CONN_STR) self.tracker = TaskTrackerApi(config.TT_API) self.worker = Worker.from_file(self.tracker) if not self.worker: - self.worker = self.tracker.make_worker("oddb_master") + self.worker = self.tracker.make_worker("$oddb_master") self.worker.dump_to_file() self.worker.request_access(config.TT_CRAWL_PROJECT, False, True) self.worker.request_access(config.TT_INDEX_PROJECT, True, False) @@ -91,8 +93,9 @@ class TaskManager: try: recipe = task.json_recipe() logger.debug("Got indexing task: " + str(recipe)) - filename = os.path.join(config.WSB_PATH, - format_file_name(recipe["website_id"], recipe["upload_token"])) + + filename = download_file(config.WSB_API + "/slot?token=" + recipe["upload_token"]) + self._complete_task(filename, Task(recipe["website_id"], recipe["url"])) except Exception as e: self.worker.release_task(task_id=task.id, result=1, verification=0) @@ -167,3 +170,18 @@ class TaskManager: def format_file_name(website_id, 
token): return "%d_%s.NDJSON" % (website_id, token,) + + +def download_file(url): + r = requests.get(url, stream=True, timeout=60) + + if r.status_code != 200: + raise ValueError("HTTP error %d: %s" % (r.status_code, url)) + + tmp = NamedTemporaryFile(delete=False) + for chunk in r.iter_content(chunk_size=4096): + if chunk: + tmp.write(chunk) + tmp.close() + + return tmp.name diff --git a/templates/search.html b/templates/search.html old mode 100755 new mode 100644 diff --git a/tt_config.yml b/tt_config.yml new file mode 100644 index 0000000..21bdfab --- /dev/null +++ b/tt_config.yml @@ -0,0 +1,24 @@ +server: + address: "0.0.0.0:3010" + +database: + conn_str: "postgres://task_tracker:changeme@tt_db/task_tracker?sslmode=disable" + log_levels: ["error", "info", "warn"] + +git: + webhook_hash: "sha256" + webhook_sig_header: "X-Gogs-Signature" + +log: + level: "trace" + +session: + cookie_name: "tt" + expiration: "48h" + +monitoring: + snapshot_interval: "120s" + history_length: "1800h" + +maintenance: + reset_timed_out_tasks_interval: "10m" diff --git a/uwsgi.ini b/uwsgi.ini new file mode 100644 index 0000000..a678528 --- /dev/null +++ b/uwsgi.ini @@ -0,0 +1,9 @@ +[uwsgi] +module = main +callable = app + +enable-threads = true +processes = 4 +threads = 16 + +disable-logging = True \ No newline at end of file