diff --git a/crawler.py b/crawler.py index 187cf50..f4aaae8 100644 --- a/crawler.py +++ b/crawler.py @@ -1,17 +1,19 @@ -import os -from storage import Task, LocalStorage import json +import os +import shutil from multiprocessing import Process, Value + from apscheduler.schedulers.background import BackgroundScheduler + +import config +from indexer import Indexer from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \ PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \ PdfFileParser, DocxParser, EbookParser -from indexer import Indexer from search import Search -from thumbnail import ThumbnailGenerator from storage import Directory -import shutil -import config +from storage import Task, LocalStorage +from thumbnail import ThumbnailGenerator class RunningTask: @@ -28,7 +30,8 @@ class RunningTask: class Crawler: - def __init__(self, enabled_parsers: list, mime_guesser: MimeGuesser=ExtensionMimeGuesser(), indexer=None, dir_id=0, + def __init__(self, enabled_parsers: list, mime_guesser: MimeGuesser = ExtensionMimeGuesser(), indexer=None, + dir_id=0, root_dir="/"): self.documents = [] self.enabled_parsers = enabled_parsers @@ -48,7 +51,7 @@ class Crawler: self.mime_guesser = mime_guesser - def crawl(self, root_dir: str, counter: Value=None): + def crawl(self, root_dir: str, counter: Value = None): document_counter = 0 @@ -179,10 +182,6 @@ class TaskManager: self.start_task(self.storage.tasks()[i]) else: if self.current_task.done.value == 1: - self.current_process.terminate() self.storage.del_task(self.current_task.task.id) self.current_task = None - - - diff --git a/parsing.py b/parsing.py index 2602a25..26a17bd 100644 --- a/parsing.py +++ b/parsing.py @@ -127,6 +127,7 @@ class GenericFileParser(FileParser): def __init__(self, checksum_calculators: list, root_dir: str): self.checksum_calculators = checksum_calculators self.root_dir = root_dir + self.root_dir_len = len(root_dir)+1 def parse(self, full_path: str) -> dict: """ @@ -142,11 +143,12 @@ class GenericFileParser(FileParser): name, extension = os.path.splitext(name) info["size"] = file_stat.st_size - info["path"] = os.path.relpath(path, self.root_dir) + info["path"] = path[self.root_dir_len:] info["name"] = name info["extension"] = extension[1:] info["mtime"] = file_stat.st_mtime + # TODO: calculate all checksums at once for calculator in self.checksum_calculators: info[calculator.name] = calculator.checksum(full_path) @@ -317,7 +319,6 @@ class FontParser(GenericFileParser): warnings.simplefilter("ignore") try: - font = TTFont(f) if "name" in font: