diff --git a/crawler.py b/crawler.py index df44406..c947db1 100644 --- a/crawler.py +++ b/crawler.py @@ -29,11 +29,13 @@ class RunningTask: class Crawler: - def __init__(self, enabled_parsers: list, mime_guesser: MimeGuesser=ContentMimeGuesser(), indexer=None, dir_id=0): + def __init__(self, enabled_parsers: list, mime_guesser: MimeGuesser=ContentMimeGuesser(), indexer=None, dir_id=0, + root_dir="/"): self.documents = [] self.enabled_parsers = enabled_parsers self.indexer = indexer self.dir_id = dir_id + self.root_dir = root_dir for parser in self.enabled_parsers: if parser.is_default: @@ -136,14 +138,14 @@ class TaskManager: mime_guesser = ExtensionMimeGuesser() if directory.get_option("MimeGuesser") == "extension" \ else ContentMimeGuesser() - c = Crawler([GenericFileParser(chksum_calcs), - MediaFileParser(chksum_calcs), - TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), - PictureFileParser(chksum_calcs), - FontParser(chksum_calcs), - PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), # todo get content len from other opt - DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), # todo get content len from other opt - EbookParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt + c = Crawler([GenericFileParser(chksum_calcs, directory.path), + MediaFileParser(chksum_calcs, directory.path), + TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")), directory.path), + PictureFileParser(chksum_calcs, directory.path), + FontParser(chksum_calcs, directory.path), + PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")), directory.path), # todo get content len from other opt + DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength")), directory.path), # todo get content len from other opt + EbookParser(chksum_calcs, int(directory.get_option("TextFileContentLength")), directory.path)], # todo get content len from other opt mime_guesser, self.indexer, directory.id) c.crawl(directory.path, counter) @@ -162,7 +164,7 @@ class TaskManager: tn_generator = ThumbnailGenerator(int(directory.get_option("ThumbnailSize")), int(directory.get_option("ThumbnailQuality")), directory.get_option("ThumbnailColor")) - tn_generator.generate_all(docs, dest_path, counter) + tn_generator.generate_all(docs, dest_path, counter, directory) done.value = 1 diff --git a/indexer.py b/indexer.py index 2ac6c85..98132ef 100644 --- a/indexer.py +++ b/indexer.py @@ -15,7 +15,6 @@ class Indexer: try: requests.head("http://localhost:9200") - print("elasticsearch is already running") except requests.exceptions.ConnectionError: import time diff --git a/parsing.py b/parsing.py index c009f54..562b5b6 100644 --- a/parsing.py +++ b/parsing.py @@ -9,7 +9,7 @@ import html import warnings import docx2txt import xlrd -from pdfminer.pdfparser import PDFParser +from pdfminer.pdfparser import PDFParser, PDFSyntaxError from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfpage import PDFPage from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter @@ -124,8 +124,9 @@ class GenericFileParser(FileParser): mime_types = [] is_default = True - def __init__(self, checksum_calculators: list): + def __init__(self, checksum_calculators: list, root_dir: str): self.checksum_calculators = checksum_calculators + self.root_dir = root_dir def parse(self, full_path: str) -> dict: """ @@ -141,7 +142,7 @@ class GenericFileParser(FileParser): name, extension = os.path.splitext(name) info["size"] = file_stat.st_size - info["path"] = path # todo save relative path + info["path"] = os.path.relpath(path, self.root_dir) info["name"] = name info["extension"] = extension[1:] info["mtime"] = file_stat.st_mtime @@ -156,8 +157,8 @@ class MediaFileParser(GenericFileParser): is_default = False relevant_properties = ["bit_rate", "nb_streams", "duration", "format_name", "format_long_name"] - def __init__(self, checksum_calculators: list): - super().__init__(checksum_calculators) + def __init__(self, checksum_calculators: list, root_dir): + super().__init__(checksum_calculators, root_dir) self.mime_types = [ "video/3gpp", "video/mp4", "video/mpeg", "video/ogg", "video/quicktime", @@ -207,8 +208,8 @@ class MediaFileParser(GenericFileParser): class PictureFileParser(GenericFileParser): is_default = False - def __init__(self, checksum_calculators: list): - super().__init__(checksum_calculators) + def __init__(self, checksum_calculators: list, root_dir): + super().__init__(checksum_calculators, root_dir) self.mime_types = [ "image/bmp", "image/cgm", "image/cis-cod", "image/g3fax", "image/gif", @@ -246,8 +247,8 @@ class PictureFileParser(GenericFileParser): class TextFileParser(GenericFileParser): is_default = False - def __init__(self, checksum_calculators: list, content_length: int): - super().__init__(checksum_calculators) + def __init__(self, checksum_calculators: list, content_length: int, root_dir): + super().__init__(checksum_calculators, root_dir) self.content_length = content_length self.mime_types = [ @@ -271,7 +272,7 @@ class TextFileParser(GenericFileParser): "text/x-bibtex", "text/x-tcl", "text/x-c++", "text/x-shellscript", "text/x-msdos-batch", "text/x-makefile", "text/rtf", "text/x-objective-c", "text/troff", "text/x-m4", "text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po", - "text/x-makefile", "application/javascript" + "text/x-makefile", "application/javascript", "application/rtf" ] def parse(self, full_path: str): @@ -298,8 +299,8 @@ class TextFileParser(GenericFileParser): class FontParser(GenericFileParser): is_default = False - def __init__(self, checksum_calculators: list): - super().__init__(checksum_calculators) + def __init__(self, checksum_calculators: list, root_dir): + super().__init__(checksum_calculators, root_dir) self.mime_types = [ "application/font-sfnt", "application/font-woff", "application/vdn.ms-fontobject", @@ -336,8 +337,8 @@ class FontParser(GenericFileParser): class PdfFileParser(GenericFileParser): is_default = False - def __init__(self, checksum_calculators: list, content_length: int): - super().__init__(checksum_calculators) + def __init__(self, checksum_calculators: list, content_length: int, root_dir): + super().__init__(checksum_calculators, root_dir) self.content_length = content_length @@ -351,11 +352,14 @@ class PdfFileParser(GenericFileParser): if self.content_length > 0: with open(full_path, "rb") as f: + try: + parser = PDFParser(f) + document = PDFDocument(parser) + except PDFSyntaxError: + print("couldn't parse PDF " + full_path) + return info + info["content"] = "" - - parser = PDFParser(f) - document = PDFDocument(parser) - if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"": if isinstance(document.info[0]["Title"], bytes): info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n" @@ -399,8 +403,8 @@ class PdfFileParser(GenericFileParser): class EbookParser(GenericFileParser): is_default = False - def __init__(self, checksum_calculators: list, content_length: int): - super().__init__(checksum_calculators) + def __init__(self, checksum_calculators: list, content_length: int, root_dir): + super().__init__(checksum_calculators, root_dir) self.content_length = content_length @@ -435,8 +439,8 @@ class EbookParser(GenericFileParser): class DocxParser(GenericFileParser): is_default = False - def __init__(self, checksum_calculators: list, content_length: int): - super().__init__(checksum_calculators) + def __init__(self, checksum_calculators: list, content_length: int, root_dir): + super().__init__(checksum_calculators, root_dir) self.content_length = content_length @@ -447,12 +451,16 @@ class DocxParser(GenericFileParser): def parse(self, full_path: str): info = super().parse(full_path) - text = docx2txt.process(full_path) + if self.content_length > 0: + try: + text = docx2txt.process(full_path) - if len(text) < self.content_length: - info["content"] = text - else: - info["content"] = text[0:self.content_length] + if len(text) < self.content_length: + info["content"] = text + else: + info["content"] = text[0:self.content_length] + except: + print("Couldn't parse Ebook: " + full_path) return info @@ -460,8 +468,8 @@ class DocxParser(GenericFileParser): class SpreadSheetParser(GenericFileParser): is_default = False - def __init__(self, checksum_calculators: list, content_length: int): - super().__init__(checksum_calculators) + def __init__(self, checksum_calculators: list, content_length: int, root_dir): + super().__init__(checksum_calculators, root_dir) self.content_length = content_length diff --git a/run.py b/run.py index d5157b3..3210dc5 100644 --- a/run.py +++ b/run.py @@ -128,7 +128,23 @@ def search_route(): size_max = request.json["size_max"] mime_types = request.json["mime_types"] must_match = request.json["must_match"] - directories = request.json["directories"] # todo: make sure dir exists and is enabled + directories = request.json["directories"] + + # Remove disabled & non-existing directories + for search_directory in directories: + directory_exists = False + + for dir_id in storage.dirs(): + if search_directory == dir_id: + directory_exists = True + + if not storage.dirs()[dir_id].enabled: + directories.remove(search_directory) + break + + if not directory_exists: + directories.remove(search_directory) + path = request.json["path"] page = search.search(query, size_min, size_max, mime_types, must_match, directories, path) diff --git a/spec/Crawler_spec.py b/spec/Crawler_spec.py index 9cd3c2e..e5e332c 100644 --- a/spec/Crawler_spec.py +++ b/spec/Crawler_spec.py @@ -8,9 +8,9 @@ class CrawlerTest(TestCase): def test_dir_walk(self): - c = Crawler([GenericFileParser([Sha1CheckSumCalculator()])]) + c = Crawler([GenericFileParser([Sha1CheckSumCalculator()], "test_files/")]) - c.crawl("test_folder") + c.crawl("./test_folder") self.assertEqual(len(c.documents), 31) @@ -19,3 +19,16 @@ class CrawlerTest(TestCase): c = Crawler([]) self.assertEqual(c.countFiles("test_folder"), 31) + + def test_path(self): + + c = Crawler([GenericFileParser([], "./test_folder")]) + c.crawl("./test_folder") + + file_count_in_sub2 = 0 + + for doc in c.documents: + if doc["path"] == "sub2": + file_count_in_sub2 += 1 + + self.assertEqual(file_count_in_sub2, 2) diff --git a/spec/DocxParser_spec.py b/spec/DocxParser_spec.py index 4758605..9f3de0f 100644 --- a/spec/DocxParser_spec.py +++ b/spec/DocxParser_spec.py @@ -6,7 +6,7 @@ class DocxParserTest(TestCase): def test_parse_content(self): - parser = DocxParser([], 1000) + parser = DocxParser([], 1000, "test_files/") info = parser.parse("test_files/docx1.docx") diff --git a/spec/EbookParserTest.py b/spec/EbookParserTest.py index 9550bf3..3a87d3b 100644 --- a/spec/EbookParserTest.py +++ b/spec/EbookParserTest.py @@ -6,7 +6,7 @@ class EbookParserTest(TestCase): def test_parse_content(self): - parser = EbookParser([], 1000) + parser = EbookParser([], 1000, "test_files/") info = parser.parse("test_files/epub1.epub") diff --git a/spec/FileParser_spec.py b/spec/FileParser_spec.py index 1c94358..0f5e92e 100644 --- a/spec/FileParser_spec.py +++ b/spec/FileParser_spec.py @@ -16,34 +16,34 @@ class GenericFileParserTest(TestCase): test_file.close() os.utime("test_parse.txt", (1330123456, 1330654321)) - self.parser = GenericFileParser([Md5CheckSumCalculator()]) + self.parser = GenericFileParser([Md5CheckSumCalculator()], "./test_files/") def tearDown(self): os.remove("test_parse.txt") def test_parse_size(self): - result = self.parser.parse("test_parse.txt") + result = self.parser.parse("./test_parse.txt") self.assertEqual(result["size"], 8) def test_parse_name(self): - result = self.parser.parse("test_parse.txt") + result = self.parser.parse("./test_parse.txt") self.assertEqual(result["name"], "test_parse") def test_parse_ext(self): - result = self.parser.parse("test_parse.txt") + result = self.parser.parse("./test_parse.txt") self.assertEqual(result["extension"], "txt") def test_parse_md5(self): - result = self.parser.parse("test_parse.txt") + result = self.parser.parse("./test_parse.txt") self.assertEqual(result["md5"], "25D55AD283AA400AF464C76D713C07AD") def test_mtime(self): - result = self.parser.parse("test_parse.txt") + result = self.parser.parse("./test_parse.txt") self.assertEqual(result["mtime"], 1330654321) diff --git a/spec/FontParser_spec.py b/spec/FontParser_spec.py index 19a576c..52a3b14 100644 --- a/spec/FontParser_spec.py +++ b/spec/FontParser_spec.py @@ -6,7 +6,7 @@ class FontParserTest(TestCase): def test_parse_name_trueType(self): - parser = FontParser([]) + parser = FontParser([], "test_files/") info = parser.parse("test_files/truetype1.ttf") @@ -14,7 +14,7 @@ class FontParserTest(TestCase): def test_parse_name_openType(self): - parser = FontParser([]) + parser = FontParser([], "test_files/") info = parser.parse("test_files/opentype1.otf") @@ -22,7 +22,7 @@ class FontParserTest(TestCase): def test_parse_name_woff(self): - parser = FontParser([]) + parser = FontParser([], "test_files/") info = parser.parse("test_files/woff.woff") @@ -30,7 +30,7 @@ class FontParserTest(TestCase): def test_parse_name_woff2(self): - parser = FontParser([]) + parser = FontParser([], "test_files/") info = parser.parse("test_files/woff2.woff2") diff --git a/spec/MediaFileParser_spec.py b/spec/MediaFileParser_spec.py index 06ae47e..8df6436 100644 --- a/spec/MediaFileParser_spec.py +++ b/spec/MediaFileParser_spec.py @@ -6,37 +6,33 @@ class MediaFileParserTest(TestCase): def test_audio_wav(self): - parser = MediaFileParser([]) + parser = MediaFileParser([], "test_files/") - info = parser.parse("test_files/cat1.wav") + info = parser.parse("./test_files/cat1.wav") - self.assertEqual(info["format_name"], "wav") self.assertEqual(info["format_long_name"], "WAV / WAVE (Waveform Audio)") self.assertEqual(info["duration"], 20.173875) def test_video_mov(self): - parser = MediaFileParser([]) + parser = MediaFileParser([], "./test_files") - info = parser.parse("test_files/vid1.mp4") + info = parser.parse("./test_files/vid1.mp4") - self.assertEqual(info["format_name"], "mov,mp4,m4a,3gp,3g2,mj2") self.assertEqual(info["format_long_name"], "QuickTime / MOV") self.assertEqual(info["duration"], 5.334) def test_video_webm(self): - parser = MediaFileParser([]) + parser = MediaFileParser([], "test_files/") info = parser.parse("test_files/vid2.webm") - self.assertEqual(info["format_name"], "matroska,webm") self.assertEqual(info["format_long_name"], "Matroska / WebM") self.assertEqual(info["duration"], 10.619) def test_video_ogg(self): - parser = MediaFileParser([]) + parser = MediaFileParser([], "test_files/") info = parser.parse("test_files/vid3.ogv") - self.assertEqual(info["format_name"], "ogg") self.assertEqual(info["format_long_name"], "Ogg") self.assertEqual(info["duration"], 10.618867) diff --git a/spec/PdfFileParser.py b/spec/PdfFileParser.py index 2e1d52b..95834f7 100644 --- a/spec/PdfFileParser.py +++ b/spec/PdfFileParser.py @@ -6,7 +6,7 @@ class PdfParserTest(TestCase): def test_parse_content(self): - parser = PdfFileParser([], 12488) + parser = PdfFileParser([], 12488, "test_files/") info = parser.parse("test_files/pdf1.pdf") diff --git a/spec/PictureFileParser_spec.py b/spec/PictureFileParser_spec.py index f63edf8..e89d03a 100644 --- a/spec/PictureFileParser_spec.py +++ b/spec/PictureFileParser_spec.py @@ -6,7 +6,7 @@ class PictureFileParserTest(TestCase): def test_parse_jpg(self): - parser = PictureFileParser([]) + parser = PictureFileParser([], "test_files/") info = parser.parse("test_folder/sample_1.jpg") @@ -17,7 +17,7 @@ class PictureFileParserTest(TestCase): def test_parse_png(self): - parser = PictureFileParser([]) + parser = PictureFileParser([], "test_files/") info = parser.parse("test_folder/sample_5.png") @@ -28,7 +28,7 @@ class PictureFileParserTest(TestCase): def test_parse_gif(self): - parser = PictureFileParser([]) + parser = PictureFileParser([], "test_files/") info = parser.parse("test_folder/sample_6.gif") @@ -39,7 +39,7 @@ class PictureFileParserTest(TestCase): def test_parse_bmp(self): - parser = PictureFileParser([]) + parser = PictureFileParser([], "test_files/") info = parser.parse("test_folder/sample_7.bmp") diff --git a/spec/SpreadSheetParser_spec.py b/spec/SpreadSheetParser_spec.py index dc44dfb..a5e9f3f 100644 --- a/spec/SpreadSheetParser_spec.py +++ b/spec/SpreadSheetParser_spec.py @@ -6,7 +6,7 @@ class PdfParserTest(TestCase): def test_parse_content_xls(self): - parser = SpreadSheetParser([], 1500) + parser = SpreadSheetParser([], 1500, "test_files/") info = parser.parse("test_files/xls1.xls") @@ -14,7 +14,7 @@ class PdfParserTest(TestCase): def test_parse_content_xlsx(self): - parser = SpreadSheetParser([], 1500) + parser = SpreadSheetParser([], 1500, "test_files/") info = parser.parse("test_files/xlsx1.xlsx") diff --git a/spec/TextFileParser_spec.py b/spec/TextFileParser_spec.py index f72005c..fda3d1a 100644 --- a/spec/TextFileParser_spec.py +++ b/spec/TextFileParser_spec.py @@ -6,7 +6,7 @@ class TextFileParserTest(TestCase): def test_parse_csv(self): - parser = TextFileParser([], 1234) + parser = TextFileParser([], 1234, "test_files/") info = parser.parse("test_files/text.csv") diff --git a/static/js/search.js b/static/js/search.js index d805c11..359df57 100644 --- a/static/js/search.js +++ b/static/js/search.js @@ -592,4 +592,4 @@ document.getElementById("pathBar").addEventListener("keyup", function () { searchQueued = true; }); -window.setInterval(search, 75); \ No newline at end of file +window.setInterval(search, 150); \ No newline at end of file diff --git a/templates/directory.html b/templates/directory.html index db6d9d2..5cfa4c5 100644 --- a/templates/directory.html +++ b/templates/directory.html @@ -43,7 +43,7 @@ {% for dir in directories %}
{{ directories[dir].path }}
{{ directories[dir].path }}