Simple-Incremental-Search-Tool/crawler.py

import os
from storage import Task, LocalStorage
import json
from multiprocessing import Process, Value
from apscheduler.schedulers.background import BackgroundScheduler
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
    PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
    PdfFileParser, DocxParser, EbookParser
from indexer import Indexer
from search import Search
from thumbnail import ThumbnailGenerator
from storage import Directory
import shutil
import config


class RunningTask:

    def __init__(self, task: Task):
        self.total_files = Value("i", 0)
        self.parsed_files = Value("i", 0)
        self.task = task
        self.done = Value("i", 0)

    def to_json(self):
        return json.dumps({"parsed": self.parsed_files.value, "total": self.total_files.value, "id": self.task.id})


class Crawler:

    def __init__(self, enabled_parsers: list, mime_guesser: MimeGuesser=ExtensionMimeGuesser(), indexer=None, dir_id=0,
                 root_dir="/"):
        self.documents = []
        self.enabled_parsers = enabled_parsers
        self.indexer = indexer
        self.dir_id = dir_id
        self.root_dir = root_dir

        for parser in self.enabled_parsers:
            if parser.is_default:
                self.default_parser = parser

        self.ext_map = {}

        for parser in self.enabled_parsers:
            for ext in parser.mime_types:
                self.ext_map[ext] = parser

        self.mime_guesser = mime_guesser

    def crawl(self, root_dir: str, counter: Value=None):

        document_counter = 0

        for root, dirs, files in os.walk(root_dir):

            for filename in files:
                full_path = os.path.join(root, filename)

                mime = self.mime_guesser.guess_mime(full_path)

                parser = self.ext_map.get(mime, self.default_parser)

                document_counter += 1
                if document_counter >= config.index_every:
                    document_counter = 0

                    self.indexer.index(self.documents, self.dir_id)
                    self.documents.clear()

                try:
                    if counter:
                        counter.value += 1

                    doc = parser.parse(full_path)
                    doc["mime"] = mime

                    self.documents.append(doc)
                except FileNotFoundError:
                    continue  # File was deleted

        if self.indexer is not None and len(self.documents) > 0:
            self.indexer.index(self.documents, self.dir_id)

    def countFiles(self, root_dir: str):
        count = 0

        for root, dirs, files in os.walk(root_dir):
            count += len(files)

        return count


class TaskManager:
    def __init__(self, storage: LocalStorage):
        self.current_task = None
        self.storage = storage
        self.current_process = None
        self.indexer = Indexer("changeme")

        scheduler = BackgroundScheduler()
        scheduler.add_job(self.check_new_task, "interval", seconds=0.5)
        scheduler.start()

    def start_task(self, task: Task):
        self.current_task = RunningTask(task)

        directory = self.storage.dirs()[task.dir_id]

        if task.type == Task.INDEX:
            c = Crawler([])
            self.current_task.total_files.value = c.countFiles(directory.path)

            self.current_process = Process(target=self.execute_crawl, args=(directory, self.current_task.parsed_files,
                                                                            self.current_task.done))

        elif task.type == Task.GEN_THUMBNAIL:
            self.current_process = Process(target=self.execute_thumbnails, args=(directory,
                                                                                 self.current_task.total_files,
                                                                                 self.current_task.parsed_files,
                                                                                 self.current_task.done))
        self.current_process.start()

    def execute_crawl(self, directory: Directory, counter: Value, done: Value):

        Search("changeme").delete_directory(directory.id)

        chksum_calcs = []

        for arg in directory.get_option("CheckSumCalculators").split(","):

            if arg.strip() == "md5":
                chksum_calcs.append(Md5CheckSumCalculator())
            elif arg.strip() == "sha1":
                chksum_calcs.append(Sha1CheckSumCalculator())
            elif arg.strip() == "sha256":
                chksum_calcs.append(Sha256CheckSumCalculator())

        mime_guesser = ExtensionMimeGuesser() if directory.get_option("MimeGuesser") == "extension" \
            else ContentMimeGuesser()

        c = Crawler([GenericFileParser(chksum_calcs, directory.path),
                     MediaFileParser(chksum_calcs, directory.path),
                     TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")), directory.path),
                     PictureFileParser(chksum_calcs, directory.path),
                     FontParser(chksum_calcs, directory.path),
                     PdfFileParser(chksum_calcs, int(directory.get_option("PdfFileContentLength")), directory.path),
                     DocxParser(chksum_calcs, int(directory.get_option("SpreadsheetContentLength")), directory.path),
                     EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path)],
                    mime_guesser, self.indexer, directory.id)
        c.crawl(directory.path, counter)

        done.value = 1

    def execute_thumbnails(self, directory: Directory, total_files: Value, counter: Value, done: Value):

        dest_path = os.path.join("static/thumbnails", str(directory.id))
        if os.path.exists(dest_path):
            shutil.rmtree(dest_path)

        docs = list(Search("changeme").get_all_documents(directory.id))

        total_files.value = len(docs)

        tn_generator = ThumbnailGenerator(int(directory.get_option("ThumbnailSize")),
                                          int(directory.get_option("ThumbnailQuality")),
                                          directory.get_option("ThumbnailColor"))
        tn_generator.generate_all(docs, dest_path, counter, directory)

        done.value = 1

    def cancel_task(self):
        self.current_task.done.value = 1

    def check_new_task(self):

        if self.current_task is None:
            for i in sorted(self.storage.tasks(), reverse=True):
                self.start_task(self.storage.tasks()[i])
        else:
            if self.current_task.done.value == 1:

                self.current_process.terminate()
                self.storage.del_task(self.current_task.task.id)
                self.current_task = None