Small improvements in indexing

This commit is contained in:
simon 2018-04-24 08:49:12 -04:00
parent 213cc61da9
commit fe52ecceff
6 changed files with 39 additions and 16 deletions

View File

@ -2,21 +2,34 @@
Work in progress: probably won't work without some tweaking Work in progress: probably won't work without some tweaking
## Running on linux ## Setup on Windows
```bash
git clone https://github.com/simon987/Projet-Web-2018
cd Projet-Web-2018
```
[Download the latest Elasticsearch version](https://www.elastic.co/downloads/elasticsearch) and extract it to `Projet-Web-2018\elasticsearch`
```bash
pip3 install -r requirements.txt
python3 run.py
```
## Setup on Mac/Linux
```bash ```bash
git clone https://github.com/simon987/Projet-Web-2018 git clone https://github.com/simon987/Projet-Web-2018
cd Projet-Web-2018 cd Projet-Web-2018
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.2.4.zip wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.2.4.zip
unzip elasticsearch-6.2.4.zip unzip elasticsearch-6.2.4.zip
rm elasticsearch-6.2.4.zip rm elasticsearch-6.2.4.zip
mv elasticsearch-6.2.4 elasticsearch mv elasticsearch-6.2.4 elasticsearch
sudo pip3 install -r requirements.txt sudo pip3 install -r requirements.txt
python3 run.py python3 run.py
``` ```
## Running tests ## Running unit tests
``` ```bash
python3 -m unittest discover python3 -m unittest
``` ```

View File

@ -25,4 +25,10 @@ bcrypt_rounds = 14
# sqlite3 database path # sqlite3 database path
db_path = "./local_storage.db" db_path = "./local_storage.db"
# Probe for the optional cairosvg dependency.
# After this block the module-level name `cairosvg` is a bool flag (True when
# the import succeeded), not the module itself.
try:
    import cairosvg  # noqa: F401 -- imported only to test availability
    cairosvg = True
except (ImportError, OSError):
    # ImportError: the package is not installed.
    # OSError: the package is installed but the native cairo library could not
    # be loaded (presumably the common failure on Windows -- verify).
    # Deliberately not a bare `except:` so KeyboardInterrupt/SystemExit propagate.
    cairosvg = False
VERSION = "1.0a" VERSION = "1.0a"

View File

@ -37,7 +37,6 @@ class Indexer:
if platform.system() == "Windows": if platform.system() == "Windows":
subprocess.Popen(["elasticsearch\\bin\\elasticsearch.bat"]) subprocess.Popen(["elasticsearch\\bin\\elasticsearch.bat"])
else: else:
print(platform.system())
subprocess.Popen(["elasticsearch/bin/elasticsearch"]) subprocess.Popen(["elasticsearch/bin/elasticsearch"])
@staticmethod @staticmethod
@ -86,6 +85,9 @@ class Indexer:
"analysis": {"analyzer": {"my_nGram": {"tokenizer": "my_nGram_tokenizer", "filter": ["lowercase", "analysis": {"analyzer": {"my_nGram": {"tokenizer": "my_nGram_tokenizer", "filter": ["lowercase",
"asciifolding"]}}}}, "asciifolding"]}}}},
index=self.index_name) index=self.index_name)
self.es.indices.put_settings(body={
"analysis": {"analyzer": {"content_analyser": {"tokenizer": "standard", "filter": ["lowercase"]}}}},
index=self.index_name)
self.es.indices.put_mapping(body={"properties": { self.es.indices.put_mapping(body={"properties": {
"path": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"}, "path": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"},
@ -98,6 +100,7 @@ class Indexer:
"width": {"type": "integer"}, "width": {"type": "integer"},
"height": {"type": "integer"}, "height": {"type": "integer"},
"mtime": {"type": "integer"}, "mtime": {"type": "integer"},
"size": {"type": "long"},
"directory": {"type": "short"}, "directory": {"type": "short"},
"name": {"analyzer": "my_nGram", "type": "text"}, "name": {"analyzer": "my_nGram", "type": "text"},
"album": {"analyzer": "my_nGram", "type": "text"}, "album": {"analyzer": "my_nGram", "type": "text"},
@ -105,6 +108,7 @@ class Indexer:
"title": {"analyzer": "my_nGram", "type": "text"}, "title": {"analyzer": "my_nGram", "type": "text"},
"genre": {"analyzer": "my_nGram", "type": "text"}, "genre": {"analyzer": "my_nGram", "type": "text"},
"album_artist": {"analyzer": "my_nGram", "type": "text"}, "album_artist": {"analyzer": "my_nGram", "type": "text"},
"content": {"analyzer": "content_analyser", "type": "text"},
}}, doc_type="file", index=self.index_name) }}, doc_type="file", index=self.index_name)
self.es.indices.open(index=self.index_name) self.es.indices.open(index=self.index_name)

View File

@ -4,7 +4,6 @@ import mimetypes
import subprocess import subprocess
import json import json
import chardet import chardet
import html
import warnings import warnings
import docx2txt import docx2txt
import xlrd import xlrd
@ -290,7 +289,7 @@ class TextFileParser(GenericFileParser):
info["encoding"] = encoding info["encoding"] = encoding
try: try:
content = raw_content.decode(encoding, "ignore") content = raw_content.decode(encoding, "ignore")
info["content"] = html.escape(content) info["content"] = content
except Exception: except Exception:
print("Unknown encoding: " + encoding) print("Unknown encoding: " + encoding)
@ -497,7 +496,6 @@ class SpreadSheetParser(GenericFileParser):
num_cells = worksheet.ncols num_cells = worksheet.ncols
for curr_row in range(num_rows): for curr_row in range(num_rows):
row = worksheet.row(curr_row)
new_output = [] new_output = []
for index_col in xrange(num_cells): for index_col in xrange(num_cells):
value = worksheet.cell_value(curr_row, index_col) value = worksheet.cell_value(curr_row, index_col)

View File

@ -14,5 +14,5 @@ class TextFileParserTest(TestCase):
info = parser.parse(dir_name + "/test_files/text.csv") info = parser.parse(dir_name + "/test_files/text.csv")
self.assertTrue(info["content"].startswith("rosbagTimestamp,header,seq,stamp,secs,nsecs,")) self.assertTrue(info["content"].startswith("rosbagTimestamp,header,seq,stamp,secs,nsecs,"))
self.assertEqual(len(info["content"]), 1309) # Size is larger because of html escaping self.assertEqual(len(info["content"]), 1234)
self.assertEqual(info["encoding"], "ascii") self.assertEqual(info["encoding"], "ascii")

View File

@ -2,7 +2,10 @@ from PIL import Image
import os import os
from multiprocessing import Value, Process from multiprocessing import Value, Process
import ffmpeg import ffmpeg
#import cairosvg import config
if config.cairosvg:
import cairosvg
class ThumbnailGenerator: class ThumbnailGenerator:
@ -17,12 +20,12 @@ class ThumbnailGenerator:
if mime is None: if mime is None:
return return
if mime == "image/svg+xml": if mime == "image/svg+xml" and config.cairosvg:
try: try:
p = Process(target=cairosvg.svg2png, kwargs={"url": path, "write_to": "tmp"}) p = Process(target=cairosvg.svg2png, kwargs={"url": path, "write_to": "tmp"})
p.start() p.start()
p.join(1.5) p.join(1)
if p.is_alive(): if p.is_alive():
p.terminate() p.terminate()
@ -50,8 +53,7 @@ class ThumbnailGenerator:
.run() .run()
) )
self.generate_image("tmp", dest_path) self.generate_image("tmp", dest_path)
except Exception as e: except Exception:
print(e)
print("Couldn't make thumbnail for " + path) print("Couldn't make thumbnail for " + path)
if os.path.exists("tmp"): if os.path.exists("tmp"):