352 lines
11 KiB
Python

import datetime
import json
import logging
import os.path
from datetime import datetime
from enum import Enum
from io import TextIOWrapper
from logging import FileHandler
from subprocess import Popen, PIPE
from tempfile import NamedTemporaryFile
from threading import Thread
from typing import List
from pydantic import BaseModel
from config import logger, LOG_FOLDER, DATA_FOLDER
class Sist2Version:
def __init__(self, version: str):
self._version = version
self.major, self.minor, self.patch = [int(x) for x in version.split(".")]
def __str__(self):
return f"{self.major}.{self.minor}.{self.patch}"
class SearchBackendType(Enum):
SQLITE = "sqlite"
ELASTICSEARCH = "elasticsearch"
class Sist2SearchBackend(BaseModel):
backend_type: SearchBackendType = SearchBackendType("elasticsearch")
name: str
search_index: str = ""
es_url: str = "http://elasticsearch:9200"
es_insecure_ssl: bool = False
es_index: str = "sist2"
threads: int = 1
batch_size: int = 70
@staticmethod
def create_default(name: str, backend_type: SearchBackendType = SearchBackendType("elasticsearch")):
return Sist2SearchBackend(
name=name,
search_index=f"search-index-{name.replace('/', '_')}.sist2",
backend_type=backend_type
)
class IndexOptions(BaseModel):
path: str = None
incremental_index: bool = True
search_backend: str = None
def __init__(self, **kwargs):
super().__init__(**kwargs)
def args(self, search_backend):
absolute_path = os.path.join(DATA_FOLDER, self.path)
if search_backend.backend_type == SearchBackendType("sqlite"):
search_index_absolute = os.path.join(DATA_FOLDER, search_backend.search_index)
args = ["sqlite-index", absolute_path, "--search-index", search_index_absolute]
else:
args = ["index", absolute_path, f"--threads={search_backend.threads}",
f"--es-url={search_backend.es_url}",
f"--es-index={search_backend.es_index}",
f"--batch-size={search_backend.batch_size}"]
if search_backend.es_insecure_ssl:
args.append(f"--es-insecure-ssl")
if self.incremental_index:
args.append(f"--incremental-index")
return args
ARCHIVE_SKIP = "skip"
ARCHIVE_LIST = "list"
ARCHIVE_SHALLOW = "shallow"
ARCHIVE_RECURSE = "recurse"
class ScanOptions(BaseModel):
path: str
threads: int = 1
thumbnail_quality: int = 50
thumbnail_size: int = 552
thumbnail_count: int = 1
content_size: int = 32768
depth: int = -1
archive: str = ARCHIVE_RECURSE
archive_passphrase: str = None
ocr_lang: str = None
ocr_images: bool = False
ocr_ebooks: bool = False
exclude: str = None
fast: bool = False
treemap_threshold: float = 0.0005
mem_buffer: int = 2000
read_subtitles: bool = False
fast_epub: bool = False
checksums: bool = False
incremental: bool = True
optimize_index: bool = False
output: str = None
name: str = None
rewrite_url: str = None
list_file: str = None
def __init__(self, **kwargs):
super().__init__(**kwargs)
def args(self):
output_path = os.path.join(DATA_FOLDER, self.output)
args = ["scan", self.path, f"--threads={self.threads}", f"--thumbnail-quality={self.thumbnail_quality}",
f"--thumbnail-count={self.thumbnail_count}", f"--thumbnail-size={self.thumbnail_size}",
f"--content-size={self.content_size}", f"--output={output_path}", f"--depth={self.depth}",
f"--archive={self.archive}", f"--mem-buffer={self.mem_buffer}"]
if self.incremental:
args.append(f"--incremental")
if self.optimize_index:
args.append(f"--optimize-index")
if self.rewrite_url:
args.append(f"--rewrite-url={self.rewrite_url}")
if self.name:
args.append(f"--name={self.name}")
if self.archive_passphrase:
args.append(f"--archive-passphrase={self.archive_passphrase}")
if self.ocr_lang:
args.append(f"--ocr-lang={self.ocr_lang}")
if self.ocr_ebooks:
args.append(f"--ocr-ebooks")
if self.ocr_images:
args.append(f"--ocr-images")
if self.exclude:
args.append(f"--exclude={self.exclude}")
if self.fast:
args.append(f"--fast")
if self.treemap_threshold:
args.append(f"--treemap-threshold={self.treemap_threshold}")
if self.read_subtitles:
args.append(f"--read-subtitles")
if self.fast_epub:
args.append(f"--fast-epub")
if self.checksums:
args.append(f"--checksums")
if self.list_file:
args.append(f"--list_file={self.list_file}")
return args
class Sist2Index:
def __init__(self, path):
self.path = path
with open(os.path.join(path, "descriptor.json")) as f:
self._descriptor = json.load(f)
def to_json(self):
return {
"path": self.path,
"version": self.version(),
"timestamp": self.timestamp(),
"name": self.name()
}
def version(self) -> Sist2Version:
return Sist2Version(self._descriptor["version"])
def timestamp(self) -> datetime:
return datetime.fromtimestamp(self._descriptor["timestamp"])
def name(self) -> str:
return self._descriptor["name"]
class WebOptions(BaseModel):
indices: List[str] = []
search_backend: str = "elasticsearch"
bind: str = "0.0.0.0:4090"
auth: str = None
tag_auth: str = None
tagline: str = "Lightning-fast file system indexer and search tool"
dev: bool = False
lang: str = "en"
auth0_audience: str = None
auth0_domain: str = None
auth0_client_id: str = None
auth0_public_key: str = None
auth0_public_key_file: str = None
def __init__(self, **kwargs):
super().__init__(**kwargs)
def args(self, search_backend: Sist2SearchBackend):
args = ["web", f"--bind={self.bind}", f"--tagline={self.tagline}",
f"--lang={self.lang}"]
if search_backend.backend_type == SearchBackendType("sqlite"):
search_index_absolute = os.path.join(DATA_FOLDER, search_backend.search_index)
args.append(f"--search-index={search_index_absolute}")
else:
args.append(f"--es-url={search_backend.es_url}")
args.append(f"--es-index={search_backend.es_index}")
if search_backend.es_insecure_ssl:
args.append(f"--es-insecure-ssl")
if self.auth0_audience:
args.append(f"--auth0-audience={self.auth0_audience}")
if self.auth0_domain:
args.append(f"--auth0-domain={self.auth0_domain}")
if self.auth0_client_id:
args.append(f"--auth0-client-id={self.auth0_client_id}")
if self.auth0_public_key_file:
args.append(f"--auth0-public-key-file={self.auth0_public_key_file}")
if self.auth:
args.append(f"--auth={self.auth}")
if self.tag_auth:
args.append(f"--tag-auth={self.tag_auth}")
if self.dev:
args.append(f"--dev")
args.extend(self.indices)
return args
class Sist2:
def __init__(self, bin_path: str, data_directory: str):
self.bin_path = bin_path
self._data_dir = data_directory
def index(self, options: IndexOptions, search_backend: Sist2SearchBackend, logs_cb):
args = [
self.bin_path,
*options.args(search_backend),
"--json-logs",
"--very-verbose"
]
logs_cb({"sist2-admin": f"Starting sist2 command with args {args}"})
proc = Popen(args, stdout=PIPE, stderr=PIPE)
t_stderr = Thread(target=self._consume_logs_stderr, args=(logs_cb, proc))
t_stderr.start()
self._consume_logs_stdout(logs_cb, proc)
t_stderr.join()
return proc.returncode
def scan(self, options: ScanOptions, logs_cb, set_pid_cb):
if options.output is None:
options.output = f"scan-{options.name.replace('/', '_')}-{datetime.utcnow()}.sist2"
args = [
self.bin_path,
*options.args(),
"--json-logs",
"--very-verbose"
]
logs_cb({"sist2-admin": f"Starting sist2 command with args {args}"})
proc = Popen(args, stdout=PIPE, stderr=PIPE)
set_pid_cb(proc.pid)
t_stderr = Thread(target=self._consume_logs_stderr, args=(logs_cb, proc))
t_stderr.start()
self._consume_logs_stdout(logs_cb, proc)
t_stderr.join()
return proc.returncode
@staticmethod
def _consume_logs_stderr(logs_cb, proc):
pipe_wrapper = TextIOWrapper(proc.stderr, encoding="utf8", errors="ignore")
try:
for line in pipe_wrapper:
if line.strip() == "":
continue
logs_cb({"stderr": line})
finally:
proc.wait()
pipe_wrapper.close()
@staticmethod
def _consume_logs_stdout(logs_cb, proc):
pipe_wrapper = TextIOWrapper(proc.stdout, encoding="utf8", errors="ignore")
for line in pipe_wrapper:
try:
if line.strip() == "":
continue
log_object = json.loads(line)
logs_cb(log_object)
except Exception as e:
try:
logs_cb({"sist2-admin": f"Could not decode log line: {line}; {e}"})
except NameError:
pass
def web(self, options: WebOptions, search_backend: Sist2SearchBackend, name: str):
if options.auth0_public_key:
with NamedTemporaryFile("w", prefix="sist2-admin", suffix=".txt", delete=False) as f:
f.write(options.auth0_public_key)
options.auth0_public_key_file = f.name
else:
options.auth0_public_key_file = None
args = [
self.bin_path,
*options.args(search_backend)
]
web_logger = logging.Logger(name=f"sist2-frontend-{name}")
web_logger.addHandler(FileHandler(os.path.join(LOG_FOLDER, f"frontend-{name}.log")))
def logs_cb(message):
web_logger.info(json.dumps(message))
logger.info(f"Starting frontend {' '.join(args)}")
proc = Popen(args, stdout=PIPE, stderr=PIPE)
t_stderr = Thread(target=self._consume_logs_stderr, args=(logs_cb, proc))
t_stderr.start()
t_stdout = Thread(target=self._consume_logs_stdout, args=(logs_cb, proc))
t_stdout.start()
return proc.pid