mirror of https://github.com/simon987/od-database.git
synced 2025-04-04 06:52:59 +00:00

docker-compose setup (wip)

parent 31877283b3 · commit df8ab7727b
.gitignore (vendored, 2 lines changed)
@@ -4,9 +4,9 @@
 __pycache__/
 captchas/
 _stats.json
-config.py
 oddb.log
 praw.ini
 env/
 worker.json
 search_blacklist.txt
+*.iml
Dockerfile (new file, 11 lines)
@@ -0,0 +1,11 @@
+FROM python:3.7
+
+WORKDIR /app
+
+ADD requirements.txt /app/requirements.txt
+RUN pip install -r requirements.txt
+
+ENTRYPOINT ["python", "app.py"]
+
+COPY . /app
+
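The layering here follows the usual pip-cache pattern: requirements.txt is added and installed before COPY . /app, so editing application code does not invalidate the dependency layer. ENTRYPOINT appearing before COPY is harmless, since instruction order only matters for build-time steps. A typical build command, assuming the tag used by docker-compose.yml below:

    docker build -t simon987/od-database .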
app.py (5 lines changed)
@@ -1,3 +1,6 @@
+import time
+time.sleep(60)
+
 from flask import Flask

 import api
@@ -14,4 +17,4 @@ views.setup_views(app)
 api.setup_api(app)

 if __name__ == '__main__':
-    app.run("0.0.0.0", port=12345, threaded=True)
+    app.run("0.0.0.0", port=80, threaded=True)
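The import time / time.sleep(60) at startup is a crude wait for the compose dependencies (Elasticsearch, postgres) to come up before the app connects. A minimal alternative sketch, not part of the commit, that polls the cluster instead of sleeping a fixed interval (reusing ES_URL from the new config.py):

    import time
    import requests

    def wait_for_es(url, timeout=120):
        # Poll cluster health until ES answers, or give up after `timeout` seconds.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                if requests.get(url + "/_cluster/health", timeout=5).ok:
                    return
            except requests.exceptions.ConnectionError:
                pass
            time.sleep(2)
        raise RuntimeError("Elasticsearch not reachable at " + url)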
common.py
@@ -26,14 +26,13 @@ logger.addHandler(file_handler)
 logger.addHandler(StreamHandler(sys.stdout))

 taskManager = TaskManager()
-searchEngine = ElasticSearchEngine("od-database")
+searchEngine = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
 searchEngine.start_stats_scheduler()
 db = Database(config.DB_CONN_STR)

-redis = r.Redis()
+redis = r.Redis(host=config.REDIS_HOST, port=config.REDIS_PORT)


 def require_role(role: str):

     if db.get_user_role(session.get("username", None)) != role:
         abort(403)
config.py (new file, 29 lines)
@@ -0,0 +1,29 @@
+from os import environ
+
+CAPTCHA_LOGIN = bool(environ.get("CAPTCHA_LOGIN", False))
+CAPTCHA_SUBMIT = bool(environ.get("CAPTCHA_SUBMIT", False))
+CAPTCHA_SEARCH = bool(environ.get("CAPTCHA_SEARCH", False))
+CAPTCHA_EVERY = int(environ.get("CAPTCHA_EVERY", 10))
+
+FLASK_SECRET = environ.get("FLASK_SECRET", "A very secret secret")
+RESULTS_PER_PAGE = (12, 25, 50, 100, 250, 500, 1000)
+
+SUBMIT_FTP = bool(environ.get("SUBMIT_FTP", False))
+SUBMIT_HTTP = bool(environ.get("SUBMIT_HTTP", True))
+
+TT_API = environ.get("TT_API", "http://localhost:3010")
+TT_CRAWL_PROJECT = int(environ.get("TT_CRAWL_PROJECT", 3))
+TT_INDEX_PROJECT = int(environ.get("TT_INDEX_PROJECT", 9))
+
+WSB_API = environ.get("WSB_API", "http://localhost:3020")
+WSB_SECRET = environ.get("WSB_API", "default_secret")
+
+ES_URL = environ.get("ES_URL", "http://localhost:9200")
+ES_INDEX = environ.get("ES_INDEX", "od-database")
+
+REDIS_HOST = environ.get("REDIS_HOST", "localhost")
+REDIS_PORT = environ.get("REDIS_PORT", 6379)
+
+DB_CONN_STR = environ.get("DB_CONN_STR", "dbname=od_database user=od_database password=od_database")
+RECRAWL_POOL_SIZE = environ.get("RECRAWL_POOL_SIZE", 10000)
+INDEXER_THREADS = int(environ.get("INDEXER_THREAD", 3))
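Two details worth flagging in this new module: bool(environ.get(...)) treats any non-empty string as truthy, so SUBMIT_FTP=False in the environment still yields True; and two lookups appear to be typos (WSB_SECRET reads the WSB_API variable, INDEXER_THREADS reads INDEXER_THREAD). A stricter, hypothetical boolean parser would look like:

    from os import environ

    def env_bool(name, default=False):
        # Interpret common true/false spellings instead of bool() truthiness.
        val = environ.get(name)
        if val is None:
            return default
        return val.strip().lower() in ("1", "true", "yes", "on")

    SUBMIT_FTP = env_bool("SUBMIT_FTP")  # False for SUBMIT_FTP=False, unlike bool()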
docker-compose.yml (new file, 92 lines)
@@ -0,0 +1,92 @@
+version: "3"
+services:
+  oddb:
+    image: simon987/od-database
+    ports:
+      - 5020:80
+    environment:
+      - "CAPTCHA_LOGIN=True"
+      - "CAPTCHA_SUBMIT=True"
+      - "CAPTCHA_SEARCH=True"
+      - "CAPTCHA_EVERY=10"
+      - "FLASK_SECRET=changeme"
+      - "SUBMIT_FTP=False"
+      - "SUBMIT_HTTP=True"
+      - "TT_API=http://tt:3010"
+      - "TT_CRAWL_PROJECT=1"
+      - "TT_INDEX_PROJECT=2"
+      - "WSB_API=http://wsb:3020"
+      - "WSB_SECRET=changeme"
+      - "REDIS_HOST=oddb_redis"
+      - "ES_URL=es:9200"
+      - "DB_CONN_STR=postgres://od_database:changeme@oddb_db/od_database?sslmode=disable"
+      - "RECRAWL_POOL_SIZE=10000"
+      - "INDEXER_THREADS=2"
+    depends_on:
+      - wsb
+      - tt
+      - es
+    restart: always
+  oddb_db:
+    image: postgres
+    volumes:
+      - ./oddb_pg_data:/var/lib/postgresql/data
+    environment:
+      - "POSTGRES_USER=od_database"
+      - "POSTGRES_PASSWORD=changeme"
+    ports:
+      - 5021:5432
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U od_database"]
+      interval: 5s
+      timeout: 5s
+      retries: 5
+  oddb_redis:
+    image: redis
+  wsb:
+    image: simon987/wsb_bucket
+    volumes:
+      - ./wsb_data:/data
+    environment:
+      - "WS_BUCKET_SECRET=changeme"
+    ports:
+      - 3020:3020
+  tt_db:
+    image: postgres
+    volumes:
+      - ./tt_pg_data:/var/lib/postgresql/data
+    environment:
+      POSTGRES_USER: task_tracker
+      POSTGRES_PASSWORD: changeme
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U task_tracker"]
+      interval: 5s
+      timeout: 5s
+      retries: 5
+  tt:
+    image: simon987/task_tracker
+    volumes:
+      - ./tt_pg_data:/var/lib/postgresql/data
+      - ./tt_config.yml:/root/config.yml
+    ports:
+      - 3010:80
+    depends_on:
+      - tt_db
+  es:
+    image: docker.elastic.co/elasticsearch/elasticsearch:7.4.2
+    environment:
+      # - bootstrap.memory_lock=true
+      - discovery.type=single-node
+      # - index.number_of_shards=50
+      # - index.number_of_replicas=0
+      # - "ES_JAVA_OPTS=-Xms1G -Xmx10G"
+    volumes:
+      - /usr/share/elasticsearch/data
+    healthcheck:
+      test: ["CMD-SHELL", "curl --silent --fail localhost:9200/_cluster/health || exit 1"]
+      interval: 30s
+      timeout: 30s
+      retries: 3
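With these files in place the stack comes up with the usual compose workflow (assuming the simon987/od-database image was built as above or pulled):

    docker-compose up -d
    docker-compose logs -f oddb

Per the port mappings, the web UI lands on host port 5020, postgres on 5021, task_tracker on 3010 and wsb on 3020.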
export.py
@@ -28,7 +28,7 @@ for file in os.listdir(dldir):
 print("Export started, connecting to databases...")

 db = Database(config.DB_CONN_STR)
-es = ElasticSearchEngine("od-database")
+es = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)

 docs_with_url = db.join_website_url(es.stream_all_docs())
jenkins/Jenkinsfile (vendored, deleted, 50 lines)
@@ -1,50 +0,0 @@
-def remote = [:]
-remote.name = 'remote'
-remote.host = env.DEPLOY_HOST
-remote.user = env.DEPLOY_USER
-remote.identityFile = '/var/lib/jenkins/.ssh/id_rsa'
-remote.knownHosts = '/var/lib/jenkins/.ssh/known_hosts'
-
-pipeline {
-    agent any
-    stages {
-        stage('Build') {
-            steps {
-                sh './jenkins/build.sh'
-            }
-        }
-        stage('Deploy') {
-            steps {
-                sh 'echo $ODDB_CONFIG > config.py'
-                sshCommand remote: remote, command: "cd od-database && rm -rf env fold_to_ascii search static task_tracker_drone templates ws_bucket_client *.py deploy.sh"
-                sshPut remote: remote, from: 'requirements.txt', into: 'od-database'
-                sshPut remote: remote, from: 'fold_to_ascii', into: 'od-database'
-                sshPut remote: remote, from: 'search', into: 'od-database'
-                sshPut remote: remote, from: 'static', into: 'od-database'
-                sshPut remote: remote, from: 'task_tracker_drone', into: 'od-database'
-                sshPut remote: remote, from: 'templates', into: 'od-database'
-                sshPut remote: remote, from: 'ws_bucket_client', into: 'od-database'
-                sshPut remote: remote, from: '__init__.py', into: 'od-database'
-                sshPut remote: remote, from: 'api.py', into: 'od-database'
-                sshPut remote: remote, from: 'app.py', into: 'od-database'
-                sshPut remote: remote, from: 'captcha.py', into: 'od-database'
-                sshPut remote: remote, from: 'common.py', into: 'od-database'
-                sshPut remote: remote, from: 'database.py', into: 'od-database'
-                sshPut remote: remote, from: 'export.py', into: 'od-database'
-                sshPut remote: remote, from: 'init_script.sql', into: 'od-database'
-                sshPut remote: remote, from: 'od_util.py', into: 'od-database'
-                sshPut remote: remote, from: 'reddit_bot.py', into: 'od-database'
-                sshPut remote: remote, from: 'tasks.py', into: 'od-database'
-                sshPut remote: remote, from: 'template_filters.py', into: 'od-database'
-                sshPut remote: remote, from: 'uwsgi.py', into: 'od-database'
-                sshPut remote: remote, from: 'views.py', into: 'od-database'
-                sshPut remote: remote, from: 'config.py', into: 'od-database'
-                sshPut remote: remote, from: 'mass_import.py', into: 'od-database'
-                sshPut remote: remote, from: 'do_recrawl.py', into: 'od-database'
-                sshPut remote: remote, from: 'od-database.ini', into: 'od-database'
-                sshPut remote: remote, from: 'jenkins/deploy.sh', into: 'od-database'
-                sshCommand remote: remote, command: 'chmod +x od-database/deploy.sh && ./od-database/deploy.sh'
-            }
-        }
-    }
-}
jenkins/build.sh (deleted)
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-git submodule init
-git submodule update --remote --recursive
jenkins/deploy.sh (deleted)
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-export ODDBROOT="od-database"
-
-virtualenv ${ODDBROOT}/env -p python3.7
-source ${ODDBROOT}/env/bin/activate
-pip install -r ${ODDBROOT}/requirements.txt
-
-screen -S oddb_web -X quit
-killall -9 uwsgi
-
-sleep 5
-
-echo "starting oddb_web"
-screen -S oddb_web -d -m bash -c "cd ${ODDBROOT} && source env/bin/activate && uwsgi od-database.ini 2> stderr.txt"
-sleep 1
-screen -list
-
-echo "Installing crontabs"
-absolute_dir=$(cd ${ODDBROOT} && pwd)
-
-# Re-crawl dirs
-command="bash -c \"cd '${absolute_dir}' && source env/bin/activate && python do_recrawl.py >> recrawl_logs.txt\""
-job="*/10 * * * * $command"
-echo "$job"
-cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -
-
-# Cleanup captchas
-command="bash -c \"cd '${absolute_dir}' && rm captchas/*.png\""
-job="*/60 * * * * $command"
-echo "$job"
-cat <(fgrep -i -v "$command" <(crontab -l)) <(echo "$job") | crontab -
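For reference, the crontab installation above used a self-deduplicating pattern: fgrep -i -v "$command" filters any previous copy of the job out of the current crontab before the new line is appended, so repeated deploys never accumulated duplicate entries.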
od-database.ini (deleted)
@@ -1,10 +0,0 @@
-[uwsgi]
-uwsgi-socket = 127.0.0.1:3031
-wsgi-file = uwsgi.py
-processes = 2
-threads = 16
-stats = 127.0.0.1:9191
-callable=app
-virtualenv=./env
-
-disable-logging=True
requirements.txt
@@ -18,7 +18,6 @@ lxml
 pillow
 Wand
 numpy
 matplotlib
-uwsgi
 redis
 psycopg2-binary
search/search.py
@@ -20,32 +20,7 @@ class IndexingError(Exception):
     pass


-class SearchEngine:
-
-    def __init__(self):
-        pass
-
-    def import_json(self, in_str: str, website_id: int):
-        raise NotImplementedError
-
-    def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min,
-               date_max) -> {}:
-        raise NotImplementedError
-
-    def reset(self):
-        raise NotImplementedError
-
-    def ping(self):
-        raise NotImplementedError
-
-    def get_stats(self, website_id: int, subdir: str = None):
-        raise NotImplementedError
-
-    def refresh(self):
-        raise NotImplementedError
-
-
-class ElasticSearchEngine(SearchEngine):
+class ElasticSearchEngine:
     SORT_ORDERS = {
         "score": ["_score"],
         "size_asc": [{"size": {"order": "asc"}}],
@@ -55,10 +30,11 @@ class ElasticSearchEngine(SearchEngine):
         "none": []
     }

-    def __init__(self, index_name):
-        super().__init__()
+    def __init__(self, url, index_name):
         self.index_name = index_name
-        self.es = elasticsearch.Elasticsearch()
+        logger.info("Connecting to ES @ %s" % url)
+        self.es = elasticsearch.Elasticsearch(hosts=[url])
         self.filter = SearchFilter()

         if not self.es.indices.exists(self.index_name):
@@ -78,23 +54,25 @@ class ElasticSearchEngine(SearchEngine):

         # Index settings
-        self.es.indices.put_settings(body={
-            "analysis": {
-                "tokenizer": {
-                    "my_nGram_tokenizer": {
-                        "type": "nGram", "min_gram": 3, "max_gram": 3
-                    }
-                }
-            }}, index=self.index_name)
+        self.es.indices.put_settings(body={
+            "index": {
+                "refresh_interval": "30s",
+                "codec": "best_compression"
+            },
+            "analysis": {
+                "analyzer": {
+                    "my_nGram": {
+                        "tokenizer": "my_nGram_tokenizer",
+                        "filter": ["lowercase", "asciifolding"]
+                    }
+                },
+                "tokenizer": {
+                    "my_nGram_tokenizer": {
+                        "type": "nGram", "min_gram": 3, "max_gram": 3
+                    }
+                }
+            }}, index=self.index_name)

         # Index Mappings
         self.es.indices.put_mapping(body={
             "properties": {
                 "path": {"analyzer": "standard", "type": "text"},
@@ -110,12 +88,6 @@ class ElasticSearchEngine(SearchEngine):

         self.es.indices.open(index=self.index_name)

-    def reset(self):
-        self.init()
-
-    def ping(self):
-        return self.es.ping()
-
     def delete_docs(self, website_id):

         while True:
@@ -332,7 +304,8 @@ class ElasticSearchEngine(SearchEngine):
             yield urljoin(base_url, "/") + src["path"] + ("/" if src["path"] != "" else "") + src["name"] + \
                   ("." if src["ext"] != "" else "") + src["ext"]

-    def get_global_stats(self):
+    @staticmethod
+    def get_global_stats():

         if os.path.exists("_stats.json"):
             with open("_stats.json", "r") as f:
@@ -489,7 +462,7 @@ class ElasticSearchEngine(SearchEngine):
             "query": {
                 "match_all": {}
             }
-        }, scroll="1m", client=self.es, index=self.index_name, request_timeout=60)
+        }, scroll="30s", client=self.es, index=self.index_name, request_timeout=30)

     def refresh(self):
         self.es.indices.refresh(self.index_name)
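The constructor change is the core of the config work: both the cluster URL and the index name now come from config.py, so the same image can target the compose-internal es host. A quick connectivity sketch under those assumptions:

    import elasticsearch

    # Values as wired through docker-compose.yml -> config.py
    es = elasticsearch.Elasticsearch(hosts=["es:9200"])
    print(es.ping())  # True once the cluster is reachable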
tasks.py (26 lines changed)
@@ -3,9 +3,11 @@ import logging
 import os
 import time
 from multiprocessing.pool import ThreadPool
+from tempfile import NamedTemporaryFile
 from threading import Thread
 from uuid import uuid4

+import requests
 import urllib3

 import config
@@ -60,13 +62,13 @@ class IndexingTask:
 class TaskManager:

     def __init__(self):
-        self.search = ElasticSearchEngine("od-database")
+        self.search = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
         self.db = database.Database(config.DB_CONN_STR)
         self.tracker = TaskTrackerApi(config.TT_API)

         self.worker = Worker.from_file(self.tracker)
         if not self.worker:
-            self.worker = self.tracker.make_worker("oddb_master")
+            self.worker = self.tracker.make_worker("$oddb_master")
             self.worker.dump_to_file()
             self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
             self.worker.request_access(config.TT_INDEX_PROJECT, True, False)
@@ -91,8 +93,9 @@ class TaskManager:
         try:
             recipe = task.json_recipe()
             logger.debug("Got indexing task: " + str(recipe))
-            filename = os.path.join(config.WSB_PATH,
-                                    format_file_name(recipe["website_id"], recipe["upload_token"]))
+
+            filename = download_file(config.WSB_API + "/slot?token=" + recipe["upload_token"])
+
             self._complete_task(filename, Task(recipe["website_id"], recipe["url"]))
         except Exception as e:
             self.worker.release_task(task_id=task.id, result=1, verification=0)
@@ -167,3 +170,18 @@ class TaskManager:

 def format_file_name(website_id, token):
     return "%d_%s.NDJSON" % (website_id, token,)
+
+
+def download_file(url):
+    r = requests.get(url, stream=True,)
+
+    if r.status_code != 200:
+        raise ValueError("HTTP error %d: %s" % (r.status_code, url))
+
+    tmp = NamedTemporaryFile(delete=False)
+    for chunk in r.iter_content(chunk_size=4096):
+        if chunk:
+            tmp.write(chunk)
+    tmp.close()
+
+    return tmp.name
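Because download_file creates its NamedTemporaryFile with delete=False, the file survives close() and removal is the caller's responsibility. A hypothetical usage sketch (token value invented):

    import os

    path = download_file(config.WSB_API + "/slot?token=deadbeef")  # hypothetical token
    try:
        with open(path) as f:
            line = f.readline()  # consume the NDJSON dump
    finally:
        os.remove(path)  # delete=False leaves cleanup to the caller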
templates/search.html (mode changed: executable file → normal file, 0 lines changed)
tt_config.yml (new file, 24 lines)
@@ -0,0 +1,24 @@
+server:
+  address: "0.0.0.0:3010"
+
+database:
+  conn_str: "postgres://task_tracker:changeme@tt_db/task_tracker?sslmode=disable"
+  log_levels: ["error", "info", "warn"]
+
+git:
+  webhook_hash: "sha256"
+  webhook_sig_header: "X-Gogs-Signature"
+
+log:
+  level: "trace"
+
+session:
+  cookie_name: "tt"
+  expiration: "48h"
+
+monitoring:
+  snapshot_interval: "120s"
+  history_length: "1800h"
+
+maintenance:
+  reset_timed_out_tasks_interval: "10m"
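The conn_str here matches the POSTGRES_USER/POSTGRES_PASSWORD given to tt_db in docker-compose.yml. One wrinkle, consistent with the wip label: task_tracker listens on 3010 inside the container, which suits the container-to-container TT_API=http://tt:3010, but the host-side mapping 3010:80 points at container port 80 and would likely need to be 3010:3010.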