diff --git a/crawler.py b/crawler.py index e3be54a..d0b0346 100644 --- a/crawler.py +++ b/crawler.py @@ -5,18 +5,19 @@ from multiprocessing import Process, Value from apscheduler.schedulers.background import BackgroundScheduler from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser from indexer import Indexer +from search import Search class RunningTask: def __init__(self, task: Task): - self.total_files = 0 + self.total_files = Value("i", 0) self.parsed_files = Value("i", 0) self.task = task self.done = Value("i", 0) def to_json(self): - return json.dumps({"parsed": self.parsed_files.value, "total": self.total_files, "id": self.task.id}) + return json.dumps({"parsed": self.parsed_files.value, "total": self.total_files.value, "id": self.task.id}) class Crawler: @@ -77,24 +78,36 @@ class TaskManager: def start_task(self, task: Task): self.current_task = RunningTask(task) - c = Crawler([]) - path = self.storage.dirs()[task.dir_id].path - self.current_task.total_files = c.countFiles(path) + if task.type == Task.INDEX: + c = Crawler([]) + path = self.storage.dirs()[task.dir_id].path + self.current_task.total_files.value = c.countFiles(path) - print("Started task - " + str(self.current_task.total_files) + " files") - print(path) + self.current_process = Process(target=self.execute_crawl, args=(path, self.current_task.parsed_files, + self.current_task.done, + self.current_task.task.dir_id)) + self.current_process.start() - self.current_process = Process(target=self.execute_crawl, args=(path, self.current_task.parsed_files, self.current_task.done)) - # self.current_process.daemon = True - self.current_process.start() + elif task.type == Task.GEN_THUMBNAIL: + self.current_process = Process(target=self.execute_thumbnails, args=(self.current_task.task.dir_id, + self.current_task.total_files, + self.current_task.parsed_files, + self.current_task.done)) + self.current_process.start() - def execute_crawl(self, path: str, counter: Value, done: Value): + def execute_crawl(self, path: str, counter: Value, done: Value, directory: int): c = Crawler([GenericFileParser([Md5CheckSumCalculator()], ExtensionMimeGuesser())]) c.crawl(path, counter) - Indexer("changeme").index(c.documents) + Indexer("changeme").index(c.documents, directory) + done.value = 1 + + def execute_thumbnails(self, dir_id: int, total_files: Value, counter: Value, done: Value): + + docs = list(Search("changeme").getAllDocuments(dir_id)) + + total_files.value = len(docs) - print("Done") done.value = 1 def cancel_task(self): diff --git a/indexer.py b/indexer.py index fdebed4..ca48acd 100644 --- a/indexer.py +++ b/indexer.py @@ -30,7 +30,7 @@ class Indexer: subprocess.Popen(["elasticsearch/bin/elasticsearch"]) @staticmethod - def create_bulk_index_string(docs: list): + def create_bulk_index_string(docs: list, directory: int): """ Creates a insert string for sending to elasticsearch """ @@ -42,6 +42,7 @@ class Indexer: action_string = '{"index":{}}\n' for doc in docs: + doc["directory"] = directory result += action_string result += json.dumps(doc) + "\n" @@ -49,11 +50,11 @@ class Indexer: return result - def index(self, docs: list): + def index(self, docs: list, directory: int): print("Indexing " + str(len(docs)) + " docs") - index_string = Indexer.create_bulk_index_string(docs) + index_string = Indexer.create_bulk_index_string(docs, directory) print("bulk-start") - self.es.bulk(body=index_string, index=self.index_name, doc_type="file") + self.es.bulk(body=index_string, index=self.index_name, doc_type="file", refresh="true") print("bulk-done") def clear(self): @@ -73,7 +74,8 @@ class Indexer: self.es.indices.put_mapping(body='{"properties": {' '"name": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"},' '"suggest-path": {"type": "completion", "analyzer": "keyword"},' - '"mime": {"type": "keyword"}' + '"mime": {"type": "keyword"},' + '"directory": {"type": "keyword"}' '}}', doc_type="file", index=self.index_name) self.es.indices.open(index=self.index_name) diff --git a/search.py b/search.py new file mode 100644 index 0000000..9a59482 --- /dev/null +++ b/search.py @@ -0,0 +1,24 @@ +import elasticsearch +from elasticsearch import helpers +import requests + + +class Search: + + def __init__(self, index: str): + self.index_name = index + self.es = elasticsearch.Elasticsearch() + + try: + requests.head("http://localhost:9200") + print("elasticsearch is already running") + except: + print("elasticsearch is not running") + + def getAllDocuments(self, dir_id: int): + + return helpers.scan(client=self.es, + query={"_source": {"includes": ["path", "name"]}, + "query": {"term": {"directory": dir_id}}}, + index=self.index_name) + diff --git a/spec/Indexer_spec.py b/spec/Indexer_spec.py index a43fa01..99690fb 100644 --- a/spec/Indexer_spec.py +++ b/spec/Indexer_spec.py @@ -8,9 +8,14 @@ class IndexerTest(TestCase): docs = [{"name": "doc1"}, {"name": "doc2"}] - result = Indexer.create_bulk_index_string(docs, "indexName") + result = Indexer.create_bulk_index_string(docs, 1) + + self.assertTrue(result == '{"index":{}}\n' + '{"directory": 1, "name": "doc1"}\n' + '{"index":{}}\n' + '{"directory": 1, "name": "doc2"}\n' + or result == '{"index":{}}\n' + '{"name": "doc1", "directory": 1}\n' + '{"index":{}}\n' + '{"name": "doc2", "directory": 1}\n') - self.assertEqual(result, '{"index":{"_index":"indexName","_type":"file"}}\n' - '{"name": "doc1"}\n' - '{"index":{"_index":"indexName","_type":"file"}}\n' - '{"name": "doc2"}\n') diff --git a/spec/ThumbnailGenerator_spec.py b/spec/ThumbnailGenerator_spec.py new file mode 100644 index 0000000..2e4cacb --- /dev/null +++ b/spec/ThumbnailGenerator_spec.py @@ -0,0 +1,24 @@ +from unittest import TestCase +from thumbnail import ThumbnailGenerator +from PIL import Image +import os + + +class ThumbnailGeneratorTest(TestCase): + + def test_generate(self): + + generator = ThumbnailGenerator(300) + # Original image is 420x315 + generator.generate("test_folder/sample_1.jpg", "test_thumb1.jpg") + + img = Image.open("test_thumb1.jpg") + width, height = img.size + img.close() + + self.assertEqual(300, width) + self.assertEqual(225, height) + + if os.path.isfile("test_thumb1.jpg"): + os.remove("test_thumb1.jpg") + diff --git a/storage.py b/storage.py index 9c42a32..41c0cfc 100644 --- a/storage.py +++ b/storage.py @@ -58,6 +58,9 @@ class Directory: class Task: + INDEX = 1 + GEN_THUMBNAIL = 2 + def __init__(self, task_type: int, dir_id: int, completed: bool = False, completed_time: time.time = None, task_id: int = None): self.id = task_id diff --git a/templates/directory_manage.html b/templates/directory_manage.html index 877ef7f..d25282d 100644 --- a/templates/directory_manage.html +++ b/templates/directory_manage.html @@ -126,7 +126,6 @@ Remove - {% endfor %} diff --git a/templates/task.html b/templates/task.html index 603a700..b010464 100644 --- a/templates/task.html +++ b/templates/task.html @@ -119,7 +119,6 @@ - {% endfor %} diff --git a/test_generate_big_dir.py b/test_generate_big_dir.py new file mode 100644 index 0000000..ffdc24e --- /dev/null +++ b/test_generate_big_dir.py @@ -0,0 +1,15 @@ +import os + + +if __name__ == "__main__": + + if not os.path.isdir("big_dir"): + os.mkdir("big_dir") + + for i in range(100): + + if not os.path.isdir("big_dir/" + str(i)): + os.mkdir("big_dir/" + str(i)) + + for j in range(10000): + open("big_dir/" + str(i) + "/file-" + str(j), 'a').close() diff --git a/thumbnail.py b/thumbnail.py new file mode 100644 index 0000000..6f2d6be --- /dev/null +++ b/thumbnail.py @@ -0,0 +1,14 @@ +from PIL import Image + + +class ThumbnailGenerator: + + def __init__(self, size): + self.size = (size, size) + + def generate(self, path, dest_path): + + image = Image.open(path) + image.thumbnail(self.size, Image.BICUBIC) + image.save(dest_path) + image.close() diff --git a/tmp_specs b/tmp_specs new file mode 100644 index 0000000..3c3947f --- /dev/null +++ b/tmp_specs @@ -0,0 +1,28 @@ +Ajouter un utilisateur +mettre admin +Enlever admin + ne marche pas si t'est le seul admin + +y'existe conn.executescript + +Utiliser des functions queries pour afficher genre le total size of query, etc + Utiliser opendirectories-bot pour afficher des info + + + +Plugins + MP3 tags + todo: other music + Font files + images + video tags + +use es filter to filter out folders that the user has no permission to search + +option to toggle auto complete +option to set password loop count +option to chose checksum thingy +option to chose mime guesser +option to toggle search history/stats + +thumbnails are stored in a folder for each folder: easy to delete \ No newline at end of file