From fe52ecceff16ae4e759f7c77750a0b7934cffcc4 Mon Sep 17 00:00:00 2001
From: simon <fortier.simon@hotmail.com>
Date: Tue, 24 Apr 2018 08:49:12 -0400
Subject: [PATCH] Small improvements in indexing

---
 README.md                   | 25 +++++++++++++++++++------
 config.py                   |  6 ++++++
 indexer.py                  |  6 +++++-
 parsing.py                  |  4 +---
 test/test_TextFileParser.py |  2 +-
 thumbnail.py                | 12 +++++++-----
 6 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 30b76ea..16c5323 100644
--- a/README.md
+++ b/README.md
@@ -2,21 +2,34 @@
 
 Work in progress: probably won't work without some tweaking
 
-## Running on linux
+## Setup on Windows
+```bash
+git clone https://github.com/simon987/Projet-Web-2018
+cd Projet-Web-2018
+```
+[Download the latest Elasticsearch release](https://www.elastic.co/downloads/elasticsearch) and extract it to `Projet-Web-2018\elasticsearch`.
+
+```bash
+pip3 install -r requirements.txt
+
+python3 run.py
+```
+
+## Setup on Mac/Linux
 ```bash
 git clone https://github.com/simon987/Projet-Web-2018
 cd Projet-Web-2018
 wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.2.4.zip
 unzip elasticsearch-6.2.4.zip
 rm elasticsearch-6.2.4.zip
-mv elasticsearch-6.2.4 elasticsearch
+mv elasticsearch-6.2.4 elasticsearch    
 
-sudo pip3 install -r requirements.txt
+sudo pip3 install -r requirements.txt    
 
 python3 run.py
 ```
 
-## Running tests
-```
-python3 -m unittest discover
+## Running unit tests
+```bash
+python3 -m unittest
 ```
diff --git a/config.py b/config.py
index 11ebfa6..d87a23a 100644
--- a/config.py
+++ b/config.py
@@ -25,4 +25,10 @@ bcrypt_rounds = 14
 # sqlite3 database path
 db_path = "./local_storage.db"
 
+try:
+    import cairosvg  # optional dependency, used by thumbnail.py for SVG thumbnails
+    cairosvg = True  # rebind the name to a plain flag, read elsewhere as config.cairosvg
+except Exception:  # import may fail with more than ImportError; don't trap KeyboardInterrupt/SystemExit
+    cairosvg = False
+
 VERSION = "1.0a"
diff --git a/indexer.py b/indexer.py
index 9215a0c..f7676fb 100644
--- a/indexer.py
+++ b/indexer.py
@@ -37,7 +37,6 @@ class Indexer:
         if platform.system() == "Windows":
             subprocess.Popen(["elasticsearch\\bin\\elasticsearch.bat"])
         else:
-            print(platform.system())
             subprocess.Popen(["elasticsearch/bin/elasticsearch"])
 
     @staticmethod
@@ -86,6 +85,9 @@ class Indexer:
             "analysis": {"analyzer": {"my_nGram": {"tokenizer": "my_nGram_tokenizer", "filter": ["lowercase",
                                                                                                  "asciifolding"]}}}},
             index=self.index_name)
+        self.es.indices.put_settings(body={
+            "analysis": {"analyzer": {"content_analyser": {"tokenizer": "standard", "filter": ["lowercase"]}}}},
+            index=self.index_name)
 
         self.es.indices.put_mapping(body={"properties": {
             "path": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"},
@@ -98,6 +100,7 @@ class Indexer:
             "width": {"type": "integer"},
             "height": {"type": "integer"},
             "mtime": {"type": "integer"},
+            "size": {"type": "long"},
             "directory": {"type": "short"},
             "name": {"analyzer": "my_nGram", "type": "text"},
             "album": {"analyzer": "my_nGram", "type": "text"},
@@ -105,6 +108,7 @@ class Indexer:
             "title": {"analyzer": "my_nGram", "type": "text"},
             "genre": {"analyzer": "my_nGram", "type": "text"},
             "album_artist": {"analyzer": "my_nGram", "type": "text"},
+            "content": {"analyzer": "content_analyser", "type": "text"},
         }}, doc_type="file", index=self.index_name)
 
         self.es.indices.open(index=self.index_name)
diff --git a/parsing.py b/parsing.py
index 96d6aa7..6563b41 100644
--- a/parsing.py
+++ b/parsing.py
@@ -4,7 +4,6 @@ import mimetypes
 import subprocess
 import json
 import chardet
-import html
 import warnings
 import docx2txt
 import xlrd
@@ -290,7 +289,7 @@ class TextFileParser(GenericFileParser):
                     info["encoding"] = encoding
                     try:
                         content = raw_content.decode(encoding, "ignore")
-                        info["content"] = html.escape(content)
+                        info["content"] = content
                     except Exception:
                         print("Unknown encoding: " + encoding)
 
@@ -497,7 +496,6 @@ class SpreadSheetParser(GenericFileParser):
                 num_cells = worksheet.ncols
 
                 for curr_row in range(num_rows):
-                    row = worksheet.row(curr_row)
                     new_output = []
                     for index_col in xrange(num_cells):
                         value = worksheet.cell_value(curr_row, index_col)
diff --git a/test/test_TextFileParser.py b/test/test_TextFileParser.py
index 2b71c00..157d428 100644
--- a/test/test_TextFileParser.py
+++ b/test/test_TextFileParser.py
@@ -14,5 +14,5 @@ class TextFileParserTest(TestCase):
         info = parser.parse(dir_name + "/test_files/text.csv")
 
         self.assertTrue(info["content"].startswith("rosbagTimestamp,header,seq,stamp,secs,nsecs,"))
-        self.assertEqual(len(info["content"]), 1309)  # Size is larger because of html escaping
+        self.assertEqual(len(info["content"]), 1234)
         self.assertEqual(info["encoding"], "ascii")
diff --git a/thumbnail.py b/thumbnail.py
index 26ecf57..78338ad 100644
--- a/thumbnail.py
+++ b/thumbnail.py
@@ -2,7 +2,10 @@ from PIL import Image
 import os
 from multiprocessing import Value, Process
 import ffmpeg
-#import cairosvg
+import config
+
+if config.cairosvg:
+    import cairosvg
 
 
 class ThumbnailGenerator:
@@ -17,12 +20,12 @@ class ThumbnailGenerator:
         if mime is None:
             return
 
-        if mime == "image/svg+xml":
+        if mime == "image/svg+xml" and config.cairosvg:
 
             try:
                 p = Process(target=cairosvg.svg2png, kwargs={"url": path, "write_to": "tmp"})
                 p.start()
-                p.join(1.5)
+                p.join(1)
 
                 if p.is_alive():
                     p.terminate()
@@ -50,8 +53,7 @@ class ThumbnailGenerator:
                  .run()
                  )
                 self.generate_image("tmp", dest_path)
-            except Exception as e:
-                print(e)
+            except Exception:
                 print("Couldn't make thumbnail for " + path)
 
             if os.path.exists("tmp"):