mirror of
				https://github.com/simon987/sist2.git
				synced 2025-10-31 07:56:53 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			329 lines
		
	
	
		
			9.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			329 lines
		
	
	
		
			9.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import datetime
 | |
| import json
 | |
| import logging
 | |
| import os.path
 | |
| import traceback
 | |
| from datetime import datetime
 | |
| from io import TextIOWrapper
 | |
| from logging import FileHandler
 | |
| from subprocess import Popen, PIPE
 | |
| from tempfile import NamedTemporaryFile
 | |
| from threading import Thread
 | |
| from typing import List
 | |
| 
 | |
| from pydantic import BaseModel
 | |
| 
 | |
| from config import logger, LOG_FOLDER
 | |
| 
 | |
| 
 | |
| class Sist2Version:
 | |
|     def __init__(self, version: str):
 | |
|         self._version = version
 | |
| 
 | |
|         self.major, self.minor, self.patch = [int(x) for x in version.split(".")]
 | |
| 
 | |
|     def __str__(self):
 | |
|         return f"{self.major}.{self.minor}.{self.patch}"
 | |
| 
 | |
| 
 | |
| class WebOptions(BaseModel):
 | |
|     indices: List[str] = []
 | |
|     es_url: str = "http://elasticsearch:9200"
 | |
|     es_insecure_ssl: bool = False
 | |
|     es_index: str = "sist2"
 | |
|     bind: str = "0.0.0.0:4090"
 | |
|     auth: str = None
 | |
|     tag_auth: str = None
 | |
|     tagline: str = "Lightning-fast file system indexer and search tool"
 | |
|     dev: bool = False
 | |
|     lang: str = "en"
 | |
|     auth0_audience: str = None
 | |
|     auth0_domain: str = None
 | |
|     auth0_client_id: str = None
 | |
|     auth0_public_key: str = None
 | |
|     auth0_public_key_file: str = None
 | |
| 
 | |
|     def __init__(self, **kwargs):
 | |
|         super().__init__(**kwargs)
 | |
| 
 | |
|     def args(self):
 | |
|         args = ["web", f"--es-url={self.es_url}", f"--bind={self.bind}",
 | |
|                 f"--tagline={self.tagline}", f"--lang={self.lang}"]
 | |
| 
 | |
|         if self.auth0_audience:
 | |
|             args.append(f"--auth0-audience={self.auth0_audience}")
 | |
|         if self.auth0_domain:
 | |
|             args.append(f"--auth0-domain={self.auth0_domain}")
 | |
|         if self.auth0_client_id:
 | |
|             args.append(f"--auth0-client-id={self.auth0_client_id}")
 | |
|         if self.auth0_public_key_file:
 | |
|             args.append(f"--auth0-public-key-file={self.auth0_public_key_file}")
 | |
|         if self.es_insecure_ssl:
 | |
|             args.append(f"--es-insecure-ssl")
 | |
|         if self.auth:
 | |
|             args.append(f"--auth={self.auth}")
 | |
|         if self.tag_auth:
 | |
|             args.append(f"--tag_auth={self.tag_auth}")
 | |
|         if self.dev:
 | |
|             args.append(f"--dev")
 | |
| 
 | |
|         args.extend(self.indices)
 | |
| 
 | |
|         return args
 | |
| 
 | |
| 
 | |
| class IndexOptions(BaseModel):
 | |
|     path: str = None
 | |
|     threads: int = 1
 | |
|     es_url: str = "http://elasticsearch:9200"
 | |
|     es_insecure_ssl: bool = False
 | |
|     es_index: str = "sist2"
 | |
|     incremental_index: bool = False
 | |
|     script: str = ""
 | |
|     script_file: str = None
 | |
|     batch_size: int = 100
 | |
| 
 | |
|     def __init__(self, **kwargs):
 | |
|         super().__init__(**kwargs)
 | |
| 
 | |
|     def args(self):
 | |
| 
 | |
|         args = ["index", self.path, f"--threads={self.threads}", f"--es-url={self.es_url}",
 | |
|                 f"--es-index={self.es_index}", f"--batch-size={self.batch_size}"]
 | |
| 
 | |
|         if self.script_file:
 | |
|             args.append(f"--script-file={self.script_file}")
 | |
|         if self.es_insecure_ssl:
 | |
|             args.append(f"--es-insecure-ssl")
 | |
|         if self.incremental_index:
 | |
|             args.append(f"--incremental-index")
 | |
| 
 | |
|         return args
 | |
| 
 | |
| 
 | |
| ARCHIVE_SKIP = "skip"
 | |
| ARCHIVE_LIST = "list"
 | |
| ARCHIVE_SHALLOW = "shallow"
 | |
| ARCHIVE_RECURSE = "recurse"
 | |
| 
 | |
| 
 | |
| class ScanOptions(BaseModel):
 | |
|     path: str
 | |
|     threads: int = 1
 | |
|     mem_throttle: int = 0
 | |
|     thumbnail_quality: int = 2
 | |
|     thumbnail_size: int = 500
 | |
|     thumbnail_count: int = 1
 | |
|     content_size: int = 32768
 | |
|     depth: int = -1
 | |
|     archive: str = ARCHIVE_RECURSE
 | |
|     archive_passphrase: str = None
 | |
|     ocr_lang: str = None
 | |
|     ocr_images: bool = False
 | |
|     ocr_ebooks: bool = False
 | |
|     exclude: str = None
 | |
|     fast: bool = False
 | |
|     treemap_threshold: float = 0.0005
 | |
|     mem_buffer: int = 2000
 | |
|     read_subtitles: bool = False
 | |
|     fast_epub: bool = False
 | |
|     checksums: bool = False
 | |
|     incremental: str = None
 | |
|     output: str = None
 | |
|     name: str = None
 | |
|     rewrite_url: str = None
 | |
|     list_file: str = None
 | |
| 
 | |
|     def __init__(self, **kwargs):
 | |
|         super().__init__(**kwargs)
 | |
| 
 | |
|     def args(self):
 | |
|         args = ["scan", self.path, f"--threads={self.threads}", f"--mem-throttle={self.mem_throttle}",
 | |
|                 f"--thumbnail-quality={self.thumbnail_quality}", f"--thumbnail-count={self.thumbnail_count}",
 | |
|                 f"--content-size={self.content_size}", f"--output={self.output}", f"--depth={self.depth}",
 | |
|                 f"--archive={self.archive}", f"--mem-buffer={self.mem_buffer}"]
 | |
| 
 | |
|         if self.incremental:
 | |
|             args.append(f"--incremental={self.incremental}")
 | |
|         if self.rewrite_url:
 | |
|             args.append(f"--rewrite-url={self.rewrite_url}")
 | |
|         if self.name:
 | |
|             args.append(f"--name={self.name}")
 | |
|         if self.archive_passphrase:
 | |
|             args.append(f"--archive-passphrase={self.archive_passphrase}")
 | |
|         if self.ocr_lang:
 | |
|             args.append(f"--ocr-lang={self.ocr_lang}")
 | |
|         if self.ocr_ebooks:
 | |
|             args.append(f"--ocr-ebooks")
 | |
|         if self.ocr_images:
 | |
|             args.append(f"--ocr-images")
 | |
|         if self.exclude:
 | |
|             args.append(f"--exclude={self.exclude}")
 | |
|         if self.fast:
 | |
|             args.append(f"--fast")
 | |
|         if self.treemap_threshold:
 | |
|             args.append(f"--treemap-threshold={self.treemap_threshold}")
 | |
|         if self.read_subtitles:
 | |
|             args.append(f"--read-subtitles")
 | |
|         if self.fast_epub:
 | |
|             args.append(f"--fast-epub")
 | |
|         if self.checksums:
 | |
|             args.append(f"--checksums")
 | |
|         if self.list_file:
 | |
|             args.append(f"--list_file={self.list_file}")
 | |
| 
 | |
|         return args
 | |
| 
 | |
| 
 | |
| class Sist2Index:
 | |
|     def __init__(self, path):
 | |
|         self.path = path
 | |
| 
 | |
|         with open(os.path.join(path, "descriptor.json")) as f:
 | |
|             self._descriptor = json.load(f)
 | |
| 
 | |
|     def to_json(self):
 | |
|         return {
 | |
|             "path": self.path,
 | |
|             "version": self.version(),
 | |
|             "timestamp": self.timestamp(),
 | |
|             "name": self.name()
 | |
|         }
 | |
| 
 | |
|     def version(self) -> Sist2Version:
 | |
|         return Sist2Version(self._descriptor["version"])
 | |
| 
 | |
|     def timestamp(self) -> datetime:
 | |
|         return datetime.fromtimestamp(self._descriptor["timestamp"])
 | |
| 
 | |
|     def name(self) -> str:
 | |
|         return self._descriptor["name"]
 | |
| 
 | |
| 
 | |
| class Sist2:
 | |
| 
 | |
|     def __init__(self, bin_path: str, data_directory: str):
 | |
|         self._bin_path = bin_path
 | |
|         self._data_dir = data_directory
 | |
| 
 | |
|     def index(self, options: IndexOptions, logs_cb):
 | |
| 
 | |
|         if options.script:
 | |
|             with NamedTemporaryFile("w", prefix="sist2-admin", suffix=".painless", delete=False) as f:
 | |
|                 f.write(options.script)
 | |
|             options.script_file = f.name
 | |
|         else:
 | |
|             options.script_file = None
 | |
| 
 | |
|         args = [
 | |
|             self._bin_path,
 | |
|             *options.args(),
 | |
|             "--json-logs",
 | |
|             "--very-verbose"
 | |
|         ]
 | |
|         proc = Popen(args, stdout=PIPE, stderr=PIPE)
 | |
| 
 | |
|         t_stderr = Thread(target=self._consume_logs_stderr, args=(logs_cb, proc))
 | |
|         t_stderr.start()
 | |
| 
 | |
|         self._consume_logs_stdout(logs_cb, proc)
 | |
| 
 | |
|         t_stderr.join()
 | |
| 
 | |
|         return proc.returncode
 | |
| 
 | |
|     def scan(self, options: ScanOptions, logs_cb, set_pid_cb):
 | |
| 
 | |
|         output_dir = os.path.join(
 | |
|             self._data_dir,
 | |
|             f"scan-{datetime.now()}.sist2"
 | |
|         )
 | |
|         options.output = output_dir
 | |
| 
 | |
|         args = [
 | |
|             self._bin_path,
 | |
|             *options.args(),
 | |
|             "--json-logs",
 | |
|             "--very-verbose"
 | |
|         ]
 | |
| 
 | |
|         logs_cb({"sist2-admin": f"Starting sist2 command with args {args}"})
 | |
| 
 | |
|         proc = Popen(args, stdout=PIPE, stderr=PIPE)
 | |
| 
 | |
|         set_pid_cb(proc.pid)
 | |
| 
 | |
|         t_stderr = Thread(target=self._consume_logs_stderr, args=(logs_cb, proc))
 | |
|         t_stderr.start()
 | |
| 
 | |
|         self._consume_logs_stdout(logs_cb, proc)
 | |
| 
 | |
|         t_stderr.join()
 | |
| 
 | |
|         return proc.returncode
 | |
| 
 | |
|     @staticmethod
 | |
|     def _consume_logs_stderr(logs_cb, proc):
 | |
|         pipe_wrapper = TextIOWrapper(proc.stderr, encoding="utf8", errors="ignore")
 | |
|         try:
 | |
|             for line in pipe_wrapper:
 | |
|                 if line.strip() == "":
 | |
|                     continue
 | |
|                 logs_cb({"stderr": line})
 | |
|         finally:
 | |
|             proc.wait()
 | |
|             pipe_wrapper.close()
 | |
| 
 | |
|     @staticmethod
 | |
|     def _consume_logs_stdout(logs_cb, proc):
 | |
|         pipe_wrapper = TextIOWrapper(proc.stdout, encoding="utf8", errors="ignore")
 | |
|         try:
 | |
|             for line in pipe_wrapper:
 | |
|                 if line.strip() == "":
 | |
|                     continue
 | |
|                 log_object = json.loads(line)
 | |
|                 logs_cb(log_object)
 | |
|         except Exception as e:
 | |
|             proc.kill()
 | |
|             try:
 | |
|                 print(line)
 | |
|             except NameError:
 | |
|                 pass
 | |
|             print(traceback.format_exc())
 | |
|         finally:
 | |
|             pass
 | |
|             # proc.wait()
 | |
|             # pipe_wrapper.close()
 | |
| 
 | |
|     def web(self, options: WebOptions, name: str):
 | |
| 
 | |
|         if options.auth0_public_key:
 | |
|             with NamedTemporaryFile("w", prefix="sist2-admin", suffix=".txt", delete=False) as f:
 | |
|                 f.write(options.auth0_public_key)
 | |
|             options.auth0_public_key_file = f.name
 | |
|         else:
 | |
|             options.auth0_public_key_file = None
 | |
| 
 | |
|         args = [
 | |
|             self._bin_path,
 | |
|             *options.args()
 | |
|         ]
 | |
| 
 | |
|         web_logger = logging.Logger(name=f"sist2-frontend-{name}")
 | |
|         web_logger.addHandler(FileHandler(os.path.join(LOG_FOLDER, f"frontend-{name}.log")))
 | |
| 
 | |
|         def logs_cb(message):
 | |
|             web_logger.info(json.dumps(message))
 | |
| 
 | |
|         logger.info(f"Starting frontend {' '.join(args)}")
 | |
| 
 | |
|         proc = Popen(args, stdout=PIPE, stderr=PIPE)
 | |
| 
 | |
|         t_stderr = Thread(target=self._consume_logs_stderr, args=(logs_cb, proc))
 | |
|         t_stderr.start()
 | |
| 
 | |
|         t_stdout = Thread(target=self._consume_logs_stdout, args=(logs_cb, proc))
 | |
|         t_stdout.start()
 | |
| 
 | |
|         return proc.pid
 |