Small improvements in indexing

2025-09-08 15:46:58 +00:00 · 2018-04-24 08:49:12 -04:00 · 2018-04-24 08:49:12 -04:00 · fe52ecceff
commit fe52ecceff
parent 213cc61da9
6 changed files with 39 additions and 16 deletions
--- a/README.md
+++ b/README.md
@ -2,21 +2,34 @@
 Work in progress: probably won't work without some tweaking
-## Running on linux
+## Setup on Windows
 ```bash
 git clone https://github.com/simon987/Projet-Web-2018
 cd Projet-Web-2018
 ```
 [Download the latest Elasticsearch version](https://www.elastic.co/downloads/elasticsearch) and extract it to `Projet-Web-2018\elasticsearch`
 ```bash
 pip3 install -r requirements.txt
 python3 run.py
 ```
 ## Setup on Mac/Linux
 ```bash
 git clone https://github.com/simon987/Projet-Web-2018
 cd Projet-Web-2018
 wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.2.4.zip
 unzip elasticsearch-6.2.4.zip
 rm elasticsearch-6.2.4.zip
-mv elasticsearch-6.2.4 elasticsearch
+mv elasticsearch-6.2.4 elasticsearch    
-sudo pip3 install -r requirements.txt
+sudo pip3 install -r requirements.txt    
 python3 run.py
 ```
-## Running tests
+## Running unit tests
-```
+```bash
-python3 -m unittest discover
+python3 -m unittest
 ```
--- a/config.py
+++ b/config.py
@ -25,4 +25,10 @@ bcrypt_rounds = 14
 # sqlite3 database path
 db_path = "./local_storage.db"
 try:
    import cairosvg
    cairosvg = True
 except:
    cairosvg = False
 VERSION = "1.0a"
--- a/indexer.py
+++ b/indexer.py
@ -37,7 +37,6 @@ class Indexer:
        if platform.system() == "Windows":
            subprocess.Popen(["elasticsearch\\bin\\elasticsearch.bat"])
        else:
            print(platform.system())
            subprocess.Popen(["elasticsearch/bin/elasticsearch"])
    @staticmethod
@ -86,6 +85,9 @@ class Indexer:
            "analysis": {"analyzer": {"my_nGram": {"tokenizer": "my_nGram_tokenizer", "filter": ["lowercase",
                                                                                                 "asciifolding"]}}}},
            index=self.index_name)
        self.es.indices.put_settings(body={
            "analysis": {"analyzer": {"content_analyser": {"tokenizer": "standard", "filter": ["lowercase"]}}}},
            index=self.index_name)
        self.es.indices.put_mapping(body={"properties": {
            "path": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"},
@ -98,6 +100,7 @@ class Indexer:
            "width": {"type": "integer"},
            "height": {"type": "integer"},
            "mtime": {"type": "integer"},
            "size": {"type": "long"},
            "directory": {"type": "short"},
            "name": {"analyzer": "my_nGram", "type": "text"},
            "album": {"analyzer": "my_nGram", "type": "text"},
@ -105,6 +108,7 @@ class Indexer:
            "title": {"analyzer": "my_nGram", "type": "text"},
            "genre": {"analyzer": "my_nGram", "type": "text"},
            "album_artist": {"analyzer": "my_nGram", "type": "text"},
            "content": {"analyzer": "content_analyser", "type": "text"},
        }}, doc_type="file", index=self.index_name)
        self.es.indices.open(index=self.index_name)
--- a/parsing.py
+++ b/parsing.py
@ -4,7 +4,6 @@ import mimetypes
 import subprocess
 import json
 import chardet
 import html
 import warnings
 import docx2txt
 import xlrd
@ -290,7 +289,7 @@ class TextFileParser(GenericFileParser):
                    info["encoding"] = encoding
                    try:
                        content = raw_content.decode(encoding, "ignore")
-                        info["content"] = html.escape(content)
+                        info["content"] = content
                    except Exception:
                        print("Unknown encoding: " + encoding)
@ -497,7 +496,6 @@ class SpreadSheetParser(GenericFileParser):
                num_cells = worksheet.ncols
                for curr_row in range(num_rows):
                    row = worksheet.row(curr_row)
                    new_output = []
                    for index_col in xrange(num_cells):
                        value = worksheet.cell_value(curr_row, index_col)
--- a/test/test_TextFileParser.py
+++ b/test/test_TextFileParser.py
@ -14,5 +14,5 @@ class TextFileParserTest(TestCase):
        info = parser.parse(dir_name + "/test_files/text.csv")
        self.assertTrue(info["content"].startswith("rosbagTimestamp,header,seq,stamp,secs,nsecs,"))
-        self.assertEqual(len(info["content"]), 1309)  # Size is larger because of html escaping
+        self.assertEqual(len(info["content"]), 1234)
        self.assertEqual(info["encoding"], "ascii")
--- a/thumbnail.py
+++ b/thumbnail.py
@ -2,7 +2,10 @@ from PIL import Image
 import os
 from multiprocessing import Value, Process
 import ffmpeg
-#import cairosvg
+import config
 if config.cairosvg:
    import cairosvg
 class ThumbnailGenerator:
@ -17,12 +20,12 @@ class ThumbnailGenerator:
        if mime is None:
            return
-        if mime == "image/svg+xml":
+        if mime == "image/svg+xml" and config.cairosvg:
            try:
                p = Process(target=cairosvg.svg2png, kwargs={"url": path, "write_to": "tmp"})
                p.start()
-                p.join(1.5)
+                p.join(1)
                if p.is_alive():
                    p.terminate()
@ -50,8 +53,7 @@ class ThumbnailGenerator:
                 .run()
                 )
                self.generate_image("tmp", dest_path)
-            except Exception as e:
+            except Exception:
                print(e)
                print("Couldn't make thumbnail for " + path)
            if os.path.exists("tmp"):