From fe52ecceff16ae4e759f7c77750a0b7934cffcc4 Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 24 Apr 2018 08:49:12 -0400 Subject: [PATCH] Small improvements in indexing --- README.md | 25 +++++++++++++++++++------ config.py | 6 ++++++ indexer.py | 6 +++++- parsing.py | 4 +--- test/test_TextFileParser.py | 2 +- thumbnail.py | 12 +++++++----- 6 files changed, 39 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 30b76ea..16c5323 100644 --- a/README.md +++ b/README.md @@ -2,21 +2,34 @@ Work in progress: probably won't work without some tweaking -## Running on linux +## Setup on Windows +```bash +git clone https://github.com/simon987/Projet-Web-2018 +cd Projet-Web-2018 +``` +[Download latest elasticsearch version](https://www.elastic.co/downloads/elasticsearch) and extract to `Projet-Web-2018\elasticsearch` + +```bash +sudo pip3 install -r requirements.txt + +python3 run.py +``` + +## Setup on Mac/linux ```bash git clone https://github.com/simon987/Projet-Web-2018 cd Projet-Web-2018 wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.2.4.zip unzip elasticsearch-6.2.4.zip rm elasticsearch-6.2.4.zip -mv elasticsearch-6.2.4 elasticsearch +mv elasticsearch-6.2.4 elasticsearch -sudo pip3 install -r requirements.txt +sudo pip3 install -r requirements.txt python3 run.py ``` -## Running tests -``` -python3 -m unittest discover +## Running unit tests +```bash +python3 -m unittest ``` diff --git a/config.py b/config.py index 11ebfa6..d87a23a 100644 --- a/config.py +++ b/config.py @@ -25,4 +25,10 @@ bcrypt_rounds = 14 # sqlite3 database path db_path = "./local_storage.db" +try: + import cairosvg + cairosvg = True +except: + cairosvg = False + VERSION = "1.0a" diff --git a/indexer.py b/indexer.py index 9215a0c..f7676fb 100644 --- a/indexer.py +++ b/indexer.py @@ -37,7 +37,6 @@ class Indexer: if platform.system() == "Windows": subprocess.Popen(["elasticsearch\\bin\\elasticsearch.bat"]) else: - print(platform.system()) subprocess.Popen(["elasticsearch/bin/elasticsearch"]) @staticmethod @@ -86,6 +85,9 @@ class Indexer: "analysis": {"analyzer": {"my_nGram": {"tokenizer": "my_nGram_tokenizer", "filter": ["lowercase", "asciifolding"]}}}}, index=self.index_name) + self.es.indices.put_settings(body={ + "analysis": {"analyzer": {"content_analyser": {"tokenizer": "standard", "filter": ["lowercase"]}}}}, + index=self.index_name) self.es.indices.put_mapping(body={"properties": { "path": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"}, @@ -98,6 +100,7 @@ class Indexer: "width": {"type": "integer"}, "height": {"type": "integer"}, "mtime": {"type": "integer"}, + "size": {"type": "long"}, "directory": {"type": "short"}, "name": {"analyzer": "my_nGram", "type": "text"}, "album": {"analyzer": "my_nGram", "type": "text"}, @@ -105,6 +108,7 @@ class Indexer: "title": {"analyzer": "my_nGram", "type": "text"}, "genre": {"analyzer": "my_nGram", "type": "text"}, "album_artist": {"analyzer": "my_nGram", "type": "text"}, + "content": {"analyzer": "content_analyser", "type": "text"}, }}, doc_type="file", index=self.index_name) self.es.indices.open(index=self.index_name) diff --git a/parsing.py b/parsing.py index 96d6aa7..6563b41 100644 --- a/parsing.py +++ b/parsing.py @@ -4,7 +4,6 @@ import mimetypes import subprocess import json import chardet -import html import warnings import docx2txt import xlrd @@ -290,7 +289,7 @@ class TextFileParser(GenericFileParser): info["encoding"] = encoding try: content = raw_content.decode(encoding, "ignore") - info["content"] = html.escape(content) + info["content"] = content except Exception: print("Unknown encoding: " + encoding) @@ -497,7 +496,6 @@ class SpreadSheetParser(GenericFileParser): num_cells = worksheet.ncols for curr_row in range(num_rows): - row = worksheet.row(curr_row) new_output = [] for index_col in xrange(num_cells): value = worksheet.cell_value(curr_row, index_col) diff --git a/test/test_TextFileParser.py b/test/test_TextFileParser.py index 2b71c00..157d428 100644 --- a/test/test_TextFileParser.py +++ b/test/test_TextFileParser.py @@ -14,5 +14,5 @@ class TextFileParserTest(TestCase): info = parser.parse(dir_name + "/test_files/text.csv") self.assertTrue(info["content"].startswith("rosbagTimestamp,header,seq,stamp,secs,nsecs,")) - self.assertEqual(len(info["content"]), 1309) # Size is larger because of html escaping + self.assertEqual(len(info["content"]), 1234) self.assertEqual(info["encoding"], "ascii") diff --git a/thumbnail.py b/thumbnail.py index 26ecf57..78338ad 100644 --- a/thumbnail.py +++ b/thumbnail.py @@ -2,7 +2,10 @@ from PIL import Image import os from multiprocessing import Value, Process import ffmpeg -#import cairosvg +import config + +if config.cairosvg: + import cairosvg class ThumbnailGenerator: @@ -17,12 +20,12 @@ class ThumbnailGenerator: if mime is None: return - if mime == "image/svg+xml": + if mime == "image/svg+xml" and config.cairosvg: try: p = Process(target=cairosvg.svg2png, kwargs={"url": path, "write_to": "tmp"}) p.start() - p.join(1.5) + p.join(1) if p.is_alive(): p.terminate() @@ -50,8 +53,7 @@ class ThumbnailGenerator: .run() ) self.generate_image("tmp", dest_path) - except Exception as e: - print(e) + except Exception: print("Couldn't make thumbnail for " + path) if os.path.exists("tmp"):