diff --git a/crawler.py b/crawler.py index d0b0346..2074c73 100644 --- a/crawler.py +++ b/crawler.py @@ -6,6 +6,7 @@ from apscheduler.schedulers.background import BackgroundScheduler from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser from indexer import Indexer from search import Search +from thumbnail import ThumbnailGenerator class RunningTask: @@ -80,10 +81,10 @@ class TaskManager: if task.type == Task.INDEX: c = Crawler([]) - path = self.storage.dirs()[task.dir_id].path - self.current_task.total_files.value = c.countFiles(path) + directory = self.storage.dirs()[task.dir_id] + self.current_task.total_files.value = c.countFiles(directory.path) - self.current_process = Process(target=self.execute_crawl, args=(path, self.current_task.parsed_files, + self.current_process = Process(target=self.execute_crawl, args=(directory.path, self.current_task.parsed_files, self.current_task.done, self.current_task.task.dir_id)) self.current_process.start() @@ -99,17 +100,25 @@ class TaskManager: c = Crawler([GenericFileParser([Md5CheckSumCalculator()], ExtensionMimeGuesser())]) c.crawl(path, counter) + # todo: create indexer inside the crawler and index every X files Indexer("changeme").index(c.documents, directory) done.value = 1 def execute_thumbnails(self, dir_id: int, total_files: Value, counter: Value, done: Value): - docs = list(Search("changeme").getAllDocuments(dir_id)) + docs = list(Search("changeme").get_all_documents(dir_id)) + + print(docs) #todo remove total_files.value = len(docs) + tn_generator = ThumbnailGenerator(300) # todo get from config + + + done.value = 1 + def cancel_task(self): self.current_task = None self.current_process.terminate() diff --git a/indexer.py b/indexer.py index ca48acd..46692ce 100644 --- a/indexer.py +++ b/indexer.py @@ -53,9 +53,7 @@ class Indexer: def index(self, docs: list, directory: int): print("Indexing " + str(len(docs)) + " docs") index_string = Indexer.create_bulk_index_string(docs, directory) - 
print("bulk-start") self.es.bulk(body=index_string, index=self.index_name, doc_type="file", refresh="true") - print("bulk-done") def clear(self): diff --git a/search.py b/search.py index 9a59482..852f37c 100644 --- a/search.py +++ b/search.py @@ -15,7 +15,7 @@ class Search: except: print("elasticsearch is not running") - def getAllDocuments(self, dir_id: int): + def get_all_documents(self, dir_id: int): return helpers.scan(client=self.es, query={"_source": {"includes": ["path", "name"]}, diff --git a/spec/Crawler_spec.py b/spec/Crawler_spec.py index 2639b86..9fdc903 100644 --- a/spec/Crawler_spec.py +++ b/spec/Crawler_spec.py @@ -12,10 +12,10 @@ class CrawlerTest(TestCase): c.crawl("test_folder") - self.assertEqual(len(c.documents), 28) + self.assertEqual(len(c.documents), 31) def test_file_count(self): c = Crawler([]) - self.assertEqual(c.countFiles("test_folder"), 28) + self.assertEqual(c.countFiles("test_folder"), 31) diff --git a/spec/ThumbnailGenerator_spec.py b/spec/ThumbnailGenerator_spec.py index 2e4cacb..d1b0c9b 100644 --- a/spec/ThumbnailGenerator_spec.py +++ b/spec/ThumbnailGenerator_spec.py @@ -2,6 +2,7 @@ from unittest import TestCase from thumbnail import ThumbnailGenerator from PIL import Image import os +import shutil class ThumbnailGeneratorTest(TestCase): @@ -22,3 +23,35 @@ class ThumbnailGeneratorTest(TestCase): if os.path.isfile("test_thumb1.jpg"): os.remove("test_thumb1.jpg") + def test_generate_all(self): + shutil.rmtree("test_thumbnails") + + generator = ThumbnailGenerator(300) + + docs = [{'_source': {'path': 'test_folder', 'name': 'books.csv'}, '_id': 'books.csv-ID'}, + {'_source': {'path': 'test_folder', 'name': 'sample_3.jpg'}, '_id': 'sample_3.jpg-ID'}, + {'_source': {'path': 'test_folder', 'name': 'sample_5.png'}, '_id': 'sample_5.png-ID'}, + {'_source': {'path': 'test_folder', 'name': 'sample_6.gif'}, '_id': 'sample_6.gif-ID'}, + {'_source': {'path': 'test_folder', 'name': 'sample_7.bmp'}, '_id': 'sample_7.bmp-ID'}, + {'_source': 
{'path': 'test_folder', 'name': 'sample_2.jpeg'}, '_id': 'sample_2.jpeg-ID'}] + + generator.generate_all(docs, "test_thumbnails") + + self.assertFalse(os.path.isfile("test_thumbnails/books.csv-ID") and + os.path.getsize("test_thumbnails/books.csv-ID") > 0) + self.assertTrue(os.path.isfile("test_thumbnails/sample_3.jpg-ID") and + os.path.getsize("test_thumbnails/sample_3.jpg-ID") > 0) + self.assertTrue(os.path.isfile("test_thumbnails/sample_2.jpeg-ID") and + os.path.getsize("test_thumbnails/sample_2.jpeg-ID") > 0) + self.assertTrue(os.path.isfile("test_thumbnails/sample_5.png-ID") and + os.path.getsize("test_thumbnails/sample_5.png-ID") > 0) + self.assertTrue(os.path.isfile("test_thumbnails/sample_6.gif-ID") and + os.path.getsize("test_thumbnails/sample_6.gif-ID") > 0) + self.assertTrue(os.path.isfile("test_thumbnails/sample_7.bmp-ID") and + os.path.getsize("test_thumbnails/sample_7.bmp-ID") > 0) + + + + + + diff --git a/spec/test_folder/sample_5.png b/spec/test_folder/sample_5.png new file mode 100644 index 0000000..2d7586b Binary files /dev/null and b/spec/test_folder/sample_5.png differ diff --git a/spec/test_folder/sample_6.gif b/spec/test_folder/sample_6.gif new file mode 100644 index 0000000..30c08f4 Binary files /dev/null and b/spec/test_folder/sample_6.gif differ diff --git a/spec/test_folder/sample_7.bmp b/spec/test_folder/sample_7.bmp new file mode 100644 index 0000000..1f15383 Binary files /dev/null and b/spec/test_folder/sample_7.bmp differ diff --git a/storage.py b/storage.py index 41c0cfc..fcd2f87 100644 --- a/storage.py +++ b/storage.py @@ -54,6 +54,14 @@ class Directory: def __str__(self): return self.path + " | enabled: " + str(self.enabled) + " | opts: " + str(self.options) + + def get_option(self, key): + + for option in self.options: + if option.key == key: + return option.value + + return None class Task: diff --git a/thumbnail.py b/thumbnail.py index 6f2d6be..a1fec43 100644 --- a/thumbnail.py +++ b/thumbnail.py @@ -1,14 +1,43 @@ from PIL import 
import os


class ThumbnailGenerator:
    """Create JPEG thumbnails that fit inside a square bounding box.

    Thumbnails are flattened onto an opaque RGB canvas before saving, since
    they are always written in JPEG format.
    """

    def __init__(self, size):
        """Set the bounding box used for every thumbnail.

        :param size: maximum width and maximum height of a thumbnail
        """
        # Local import keeps this block self-contained.
        from parsing import ContentMimeGuesser

        self.size = (size, size)
        # NOTE(review): this stores the ContentMimeGuesser class itself, not
        # an instance, and nothing in this class reads it - confirm intent.
        self.mime_guesser = ContentMimeGuesser

    def generate(self, path, dest_path):
        """Write a JPEG thumbnail of the image at ``path`` to ``dest_path``.

        An ``OSError`` raised while opening, decoding or saving is printed
        and swallowed, so one bad file does not stop a batch.

        :param path: path of the source image
        :param dest_path: path of the thumbnail file to write
        """
        # Local import keeps this block self-contained.
        from PIL import Image

        try:
            with open(path, "rb") as image_file:
                with Image.open(image_file) as image:

                    image.thumbnail(self.size, Image.BICUBIC)

                    has_alpha = image.mode in ('RGBA', 'LA') or (
                        image.mode == 'P' and 'transparency' in image.info)

                    canvas = Image.new("RGB", image.size, (255, 0, 255))
                    try:
                        if has_alpha:
                            # Only RGBA has a fourth band: "LA" has two bands
                            # and "P" has one, so taking band 3 of the source
                            # image raised IndexError. Convert first.
                            rgba = image.convert("RGBA")
                            canvas.paste(rgba, mask=rgba.split()[3])  # 3 is the alpha channel
                        else:
                            canvas.paste(image)

                        canvas.save(dest_path, "JPEG", quality=50, optimize=True)
                    finally:
                        # Release the canvas even when saving fails.
                        canvas.close()

        except OSError as e:
            print(e)
            print("Not an image " + path)

    def generate_all(self, docs, dest_path):
        """Generate a thumbnail for every document whose file exists.

        :param docs: iterable of dicts; each one is read through
            ``doc["_source"]["path"]``, ``doc["_source"]["name"]`` and
            ``doc["_id"]``
        :param dest_path: directory receiving the thumbnails, created when
            missing; each thumbnail is named after the document's ``_id``
        """
        os.makedirs(dest_path, exist_ok=True)

        for doc in docs:
            source = doc["_source"]
            full_path = os.path.join(source["path"], source["name"])

            # Documents whose file is not on disk are skipped silently.
            if os.path.isfile(full_path):
                self.generate(full_path, os.path.join(dest_path, doc["_id"]))