Mirror of https://github.com/simon987/Simple-Incremental-Search-Tool.git, synced 2025-04-15 16:26:45 +00:00

Commit fe52ecceff (parent 213cc61da9): Small improvements in indexing

README.md (25 lines changed)
@@ -2,21 +2,34 @@
 
 Work in progress: probably won't work without some tweaking
 
-## Running on linux
+## Setup on Windows
+```bash
+git clone https://github.com/simon987/Projet-Web-2018
+cd Projet-Web-2018
+```
+[Download latest elasticsearch version](https://www.elastic.co/downloads/elasticsearch) and extract to `Projet-Web-2018\elasticsearch`
+
+```bash
+sudo pip3 install -r requirements.txt
+
+python3 run.py
+```
+
+## Setup on Mac/linux
 ```bash
 git clone https://github.com/simon987/Projet-Web-2018
 cd Projet-Web-2018
 wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.2.4.zip
 unzip elasticsearch-6.2.4.zip
 rm elasticsearch-6.2.4.zip
 mv elasticsearch-6.2.4 elasticsearch
 
 sudo pip3 install -r requirements.txt
 
 python3 run.py
 ```
 
-## Running tests
-```
-python3 -m unittest discover
+## Running unit tests
+```bash
+python3 -m unittest
 ```
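Note: after either setup, it can help to confirm Elasticsearch is actually reachable before starting the indexer. A minimal sketch, assuming the default HTTP port 9200 and the `elasticsearch` Python client from requirements.txt:

```python
# Quick connectivity check before running the indexer (a sketch;
# assumes Elasticsearch listens on the default http://localhost:9200).
from elasticsearch import Elasticsearch

es = Elasticsearch()

if es.ping():
    print("Elasticsearch is up, version " + es.info()["version"]["number"])
else:
    print("Elasticsearch is not reachable yet")
```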
@@ -25,4 +25,10 @@ bcrypt_rounds = 14
 # sqlite3 database path
 db_path = "./local_storage.db"
 
+try:
+    import cairosvg
+    cairosvg = True
+except:
+    cairosvg = False
+
 VERSION = "1.0a"
@@ -37,7 +37,6 @@ class Indexer:
         if platform.system() == "Windows":
             subprocess.Popen(["elasticsearch\\bin\\elasticsearch.bat"])
         else:
-            print(platform.system())
             subprocess.Popen(["elasticsearch/bin/elasticsearch"])
 
     @staticmethod
@@ -86,6 +85,9 @@ class Indexer:
             "analysis": {"analyzer": {"my_nGram": {"tokenizer": "my_nGram_tokenizer", "filter": ["lowercase",
                                                                                                  "asciifolding"]}}}},
             index=self.index_name)
+        self.es.indices.put_settings(body={
+            "analysis": {"analyzer": {"content_analyser": {"tokenizer": "standard", "filter": ["lowercase"]}}}},
+            index=self.index_name)
 
         self.es.indices.put_mapping(body={"properties": {
             "path": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"},
@@ -98,6 +100,7 @@ class Indexer:
             "width": {"type": "integer"},
             "height": {"type": "integer"},
             "mtime": {"type": "integer"},
+            "size": {"type": "long"},
             "directory": {"type": "short"},
             "name": {"analyzer": "my_nGram", "type": "text"},
             "album": {"analyzer": "my_nGram", "type": "text"},
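Mapping `size` as `long` rather than `integer` matters for large files: Elasticsearch's `integer` is a signed 32-bit type capped at 2,147,483,647, so any file over ~2 GiB could not be indexed. A one-line sketch of a document that needs the wider type:

```python
# A 5 GiB file overflows a 32-bit "integer" mapping; "long" (64-bit) holds it.
doc = {"name": "backup.tar", "size": 5 * 1024 ** 3}
es.index(index="files", doc_type="file", body=doc)  # sketch; "files" is illustrative
```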
@@ -105,6 +108,7 @@ class Indexer:
             "title": {"analyzer": "my_nGram", "type": "text"},
             "genre": {"analyzer": "my_nGram", "type": "text"},
             "album_artist": {"analyzer": "my_nGram", "type": "text"},
+            "content": {"analyzer": "content_analyser", "type": "text"},
         }}, doc_type="file", index=self.index_name)
 
         self.es.indices.open(index=self.index_name)
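With `content` mapped to the new `content_analyser`, extracted file text becomes full-text searchable. A minimal query sketch against the field, assuming the same client and an illustrative index name:

```python
# Full-text search on the newly mapped "content" field (a sketch).
res = es.search(index="files", body={
    "query": {"match": {"content": "incremental search"}}})
for hit in res["hits"]["hits"]:
    print(hit["_source"]["path"])
```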
@@ -4,7 +4,6 @@ import mimetypes
 import subprocess
 import json
 import chardet
-import html
 import warnings
 import docx2txt
 import xlrd
@@ -290,7 +289,7 @@ class TextFileParser(GenericFileParser):
             info["encoding"] = encoding
             try:
                 content = raw_content.decode(encoding, "ignore")
-                info["content"] = html.escape(content)
+                info["content"] = content
             except Exception:
                 print("Unknown encoding: " + encoding)
 
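Dropping `html.escape` means the index now stores the raw decoded text instead of an HTML-escaped copy, so queries no longer have to match entity sequences like `&amp;`; escaping is presumably deferred to display time. A small sketch of the difference (the unit-test hunk below shrinks the expected length accordingly):

```python
import html

content = "Fish & Chips <recipe>"
stored_old = html.escape(content)   # old: 'Fish &amp; Chips &lt;recipe&gt;' in the index
stored_new = content                # new: raw text in the index
rendered = html.escape(stored_new)  # escape once, at render time instead
```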
@@ -497,7 +496,6 @@ class SpreadSheetParser(GenericFileParser):
         num_cells = worksheet.ncols
 
         for curr_row in range(num_rows):
-            row = worksheet.row(curr_row)
             new_output = []
             for index_col in xrange(num_cells):
                 value = worksheet.cell_value(curr_row, index_col)
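One flag on the unchanged context here: `xrange` is Python 2 only, and the project runs under `python3`, so this loop would raise `NameError` when a spreadsheet is parsed; it presumably should be `range`. A corrected sketch of the cell loop:

```python
# xrange does not exist in Python 3; range is the drop-in replacement.
for index_col in range(num_cells):
    value = worksheet.cell_value(curr_row, index_col)
    new_output.append(str(value))  # hypothetical use; the diff truncates here
```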
@@ -14,5 +14,5 @@ class TextFileParserTest(TestCase):
         info = parser.parse(dir_name + "/test_files/text.csv")
 
         self.assertTrue(info["content"].startswith("rosbagTimestamp,header,seq,stamp,secs,nsecs,"))
-        self.assertEqual(len(info["content"]), 1309)  # Size is larger because of html escaping
+        self.assertEqual(len(info["content"]), 1234)
         self.assertEqual(info["encoding"], "ascii")
thumbnail.py (12 lines changed)
@@ -2,7 +2,10 @@ from PIL import Image
 import os
 from multiprocessing import Value, Process
 import ffmpeg
-#import cairosvg
+import config
 
+if config.cairosvg:
+    import cairosvg
+
 
 class ThumbnailGenerator:
@@ -17,12 +20,12 @@ class ThumbnailGenerator:
         if mime is None:
             return
 
-        if mime == "image/svg+xml":
+        if mime == "image/svg+xml" and config.cairosvg:
 
             try:
                 p = Process(target=cairosvg.svg2png, kwargs={"url": path, "write_to": "tmp"})
                 p.start()
-                p.join(1.5)
+                p.join(1)
 
                 if p.is_alive():
                     p.terminate()
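The `Process` / `join(timeout)` / `terminate` sequence bounds how long `cairosvg.svg2png` may run on a pathological SVG (this commit tightens the budget from 1.5 s to 1 s). The pattern in isolation, as a generic sketch (function name hypothetical):

```python
from multiprocessing import Process

def run_with_timeout(target, timeout=1, **kwargs):
    """Run target(**kwargs) in a child process, killing it after `timeout` seconds."""
    p = Process(target=target, kwargs=kwargs)
    p.start()
    p.join(timeout)       # wait at most `timeout` seconds
    if p.is_alive():      # still running: give up and kill the child
        p.terminate()
        p.join()          # reap the killed child
        return False
    return p.exitcode == 0

# e.g. run_with_timeout(cairosvg.svg2png, url=path, write_to="tmp")
```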
@@ -50,8 +53,7 @@ class ThumbnailGenerator:
                 .run()
             )
             self.generate_image("tmp", dest_path)
-        except Exception as e:
-            print(e)
+        except Exception:
             print("Couldn't make thumbnail for " + path)
 
         if os.path.exists("tmp"):