import json
import os
import shutil
from multiprocessing import Process, Value

from apscheduler.schedulers.background import BackgroundScheduler

from storage import Task, LocalStorage, Directory
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
    PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
    PdfFileParser, DocxParser, EbookParser
from indexer import Indexer
from search import Search
from thumbnail import ThumbnailGenerator
import config


class RunningTask:
    """Progress state for the task currently being executed.

    The counters are multiprocessing.Value objects, so the worker process can
    update them and the parent process can read the progress back.
    """

    def __init__(self, task: Task):
        self.total_files = Value("i", 0)
        self.parsed_files = Value("i", 0)
        self.task = task
        self.done = Value("i", 0)

    def to_json(self):
        return json.dumps({"parsed": self.parsed_files.value, "total": self.total_files.value, "id": self.task.id})

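# RunningTask.to_json() serializes the shared counters for progress reporting,
# producing something like (values illustrative):
#
#   {"parsed": 1280, "total": 4096, "id": 3}
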

class Crawler:
    """Walks a directory tree, parses files and hands the documents to the indexer in batches."""

    def __init__(self, enabled_parsers: list, mime_guesser: MimeGuesser = ExtensionMimeGuesser(), indexer=None,
                 dir_id=0, root_dir="/"):
        self.documents = []
        self.enabled_parsers = enabled_parsers
        self.indexer = indexer
        self.dir_id = dir_id
        self.root_dir = root_dir

        # The default parser handles any mime type that no other parser claims
        self.default_parser = None
        for parser in self.enabled_parsers:
            if parser.is_default:
                self.default_parser = parser

        # Map each mime type to the parser responsible for it
        self.ext_map = {}
        for parser in self.enabled_parsers:
            for ext in parser.mime_types:
                self.ext_map[ext] = parser

        self.mime_guesser = mime_guesser

    def crawl(self, root_dir: str, counter: Value = None):

        document_counter = 0

        for root, dirs, files in os.walk(root_dir):

            for filename in files:
                full_path = os.path.join(root, filename)

                mime = self.mime_guesser.guess_mime(full_path)
                parser = self.ext_map.get(mime, self.default_parser)

                # Send the accumulated documents to the indexer every
                # config.index_every files to keep memory usage bounded
                document_counter += 1
                if document_counter >= config.index_every:
                    document_counter = 0

                    if self.indexer is not None:
                        self.indexer.index(self.documents, self.dir_id)
                        self.documents.clear()

                try:
                    if counter:
                        counter.value += 1

                    doc = parser.parse(full_path)
                    doc["mime"] = mime

                    self.documents.append(doc)
                except FileNotFoundError:
                    continue  # File was deleted while crawling

        # Index whatever is left over once the walk completes
        if self.indexer is not None and len(self.documents) > 0:
            self.indexer.index(self.documents, self.dir_id)

    def countFiles(self, root_dir: str):
        count = 0

        for root, dirs, files in os.walk(root_dir):
            count += len(files)

        return count

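# A minimal standalone usage sketch for Crawler (not part of the original
# module). Assumption flagged here, not verified in this file: GenericFileParser
# has is_default set, so it becomes the fall-back parser. With indexer=None the
# parsed documents simply accumulate in crawler.documents:
#
#   calculators = [Md5CheckSumCalculator()]
#   crawler = Crawler([GenericFileParser(calculators, "/some/dir")], ExtensionMimeGuesser())
#   crawler.crawl("/some/dir")
#   print(len(crawler.documents))
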

class TaskManager:
    """Runs queued tasks one at a time, each in its own process."""

    def __init__(self, storage: LocalStorage):
        self.current_task = None
        self.storage = storage
        self.current_process = None
        self.indexer = Indexer("changeme")

        # Poll the task queue twice per second
        scheduler = BackgroundScheduler()
        scheduler.add_job(self.check_new_task, "interval", seconds=0.5)
        scheduler.start()

    def start_task(self, task: Task):
        self.current_task = RunningTask(task)

        directory = self.storage.dirs()[task.dir_id]

        if task.type == Task.INDEX:
            # Count the files first so indexing progress can be reported
            c = Crawler([])
            self.current_task.total_files.value = c.countFiles(directory.path)

            self.current_process = Process(target=self.execute_crawl, args=(directory,
                                                                            self.current_task.parsed_files,
                                                                            self.current_task.done))
        elif task.type == Task.GEN_THUMBNAIL:
            self.current_process = Process(target=self.execute_thumbnails, args=(directory,
                                                                                 self.current_task.total_files,
                                                                                 self.current_task.parsed_files,
                                                                                 self.current_task.done))
        self.current_process.start()

    def execute_crawl(self, directory: Directory, counter: Value, done: Value):
        """Re-index a directory from scratch; runs in a separate process."""

        # Remove the directory's existing documents before re-indexing it
        Search("changeme").delete_directory(directory.id)

        # Build the checksum calculators selected in the directory options
        chksum_calcs = []
        for arg in directory.get_option("CheckSumCalculators").split(","):
            if arg.strip() == "md5":
                chksum_calcs.append(Md5CheckSumCalculator())
            elif arg.strip() == "sha1":
                chksum_calcs.append(Sha1CheckSumCalculator())
            elif arg.strip() == "sha256":
                chksum_calcs.append(Sha256CheckSumCalculator())

        mime_guesser = ExtensionMimeGuesser() if directory.get_option("MimeGuesser") == "extension" \
            else ContentMimeGuesser()

        c = Crawler([GenericFileParser(chksum_calcs, directory.path),
                     MediaFileParser(chksum_calcs, directory.path),
                     TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")), directory.path),
                     PictureFileParser(chksum_calcs, directory.path),
                     FontParser(chksum_calcs, directory.path),
                     PdfFileParser(chksum_calcs, int(directory.get_option("PdfFileContentLength")), directory.path),
                     DocxParser(chksum_calcs, int(directory.get_option("SpreadsheetContentLength")), directory.path),
                     EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path)],
                    mime_guesser, self.indexer, directory.id)
        c.crawl(directory.path, counter)

        done.value = 1

    def execute_thumbnails(self, directory: Directory, total_files: Value, counter: Value, done: Value):
        """Regenerate all thumbnails for a directory; runs in a separate process."""

        # Wipe the existing thumbnails for this directory
        dest_path = os.path.join("static/thumbnails", str(directory.id))
        if os.path.exists(dest_path):
            shutil.rmtree(dest_path)

        docs = list(Search("changeme").get_all_documents(directory.id))
        total_files.value = len(docs)

        tn_generator = ThumbnailGenerator(int(directory.get_option("ThumbnailSize")),
                                          int(directory.get_option("ThumbnailQuality")),
                                          directory.get_option("ThumbnailColor"))
        tn_generator.generate_all(docs, dest_path, counter, directory)

        done.value = 1

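    # Directory options read by execute_crawl() and execute_thumbnails() above
    # (values are illustrative only, not defaults taken from config):
    #
    #   CheckSumCalculators      = "md5,sha1"
    #   MimeGuesser              = "extension"  # or "content"
    #   TextFileContentLength    = 2000
    #   PdfFileContentLength     = 2000
    #   SpreadsheetContentLength = 2000
    #   EbookContentLength       = 2000
    #   ThumbnailSize            = 272
    #   ThumbnailQuality         = 85
    #   ThumbnailColor           = "#FFFFFF"
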
    def cancel_task(self):
        # The scheduler's next check_new_task() call terminates the worker process
        self.current_task.done.value = 1

    def check_new_task(self):
        """Scheduled callback: start the next queued task, or clean up the one that just finished."""

        if self.current_task is None:
            tasks = self.storage.tasks()
            for i in sorted(tasks, reverse=True):
                # Start a single task; the next one is picked up once this one is done
                self.start_task(tasks[i])
                break
        elif self.current_task.done.value == 1:
            self.current_process.terminate()
            self.storage.del_task(self.current_task.task.id)
            self.current_task = None
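
# A rough wiring sketch (not part of the original module). The LocalStorage
# constructor argument is a placeholder and has not been checked against
# storage.py; "changeme" matches the index name hard-coded above:
#
#   storage = LocalStorage("local_storage.db")   # hypothetical database path
#   task_manager = TaskManager(storage)
#   # Tasks added to storage elsewhere in the application are picked up by
#   # check_new_task() every 0.5 s and executed one at a time.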