mirror of
				https://github.com/simon987/Simple-Incremental-Search-Tool.git
				synced 2025-11-04 01:06:52 +00:00 
			
		
		
		
	Small improvements in indexing
This commit is contained in:
		
							parent
							
								
									213cc61da9
								
							
						
					
					
						commit
						fe52ecceff
					
				
							
								
								
									
										25
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										25
									
								
								README.md
									
									
									
									
									
								
							@ -2,21 +2,34 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
Work in progress: probably won't work without some tweaking
 | 
					Work in progress: probably won't work without some tweaking
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Running on linux
 | 
					## Setup on Windows
 | 
				
			||||||
 | 
					```bash
 | 
				
			||||||
 | 
					git clone https://github.com/simon987/Projet-Web-2018
 | 
				
			||||||
 | 
					cd Projet-Web-2018
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					[Download latest elasticsearch version](https://www.elastic.co/downloads/elasticsearch) and extract to `Projet-Web-2018\elasticsearch`
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```bash
 | 
				
			||||||
 | 
					sudo pip3 install -r requirements.txt
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					python3 run.py
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## Setup on Mac/linux
 | 
				
			||||||
```bash
 | 
					```bash
 | 
				
			||||||
git clone https://github.com/simon987/Projet-Web-2018
 | 
					git clone https://github.com/simon987/Projet-Web-2018
 | 
				
			||||||
cd Projet-Web-2018
 | 
					cd Projet-Web-2018
 | 
				
			||||||
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.2.4.zip
 | 
					wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.2.4.zip
 | 
				
			||||||
unzip elasticsearch-6.2.4.zip
 | 
					unzip elasticsearch-6.2.4.zip
 | 
				
			||||||
rm elasticsearch-6.2.4.zip
 | 
					rm elasticsearch-6.2.4.zip
 | 
				
			||||||
mv elasticsearch-6.2.4 elasticsearch
 | 
					mv elasticsearch-6.2.4 elasticsearch    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
sudo pip3 install -r requirements.txt
 | 
					sudo pip3 install -r requirements.txt    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
python3 run.py
 | 
					python3 run.py
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Running tests
 | 
					## Running unit tests
 | 
				
			||||||
```
 | 
					```bash
 | 
				
			||||||
python3 -m unittest discover
 | 
					python3 -m unittest
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
				
			|||||||
@ -25,4 +25,10 @@ bcrypt_rounds = 14
 | 
				
			|||||||
# sqlite3 database path
 | 
					# sqlite3 database path
 | 
				
			||||||
db_path = "./local_storage.db"
 | 
					db_path = "./local_storage.db"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					try:
 | 
				
			||||||
 | 
					    import cairosvg
 | 
				
			||||||
 | 
					    cairosvg = True
 | 
				
			||||||
 | 
					except:
 | 
				
			||||||
 | 
					    cairosvg = False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
VERSION = "1.0a"
 | 
					VERSION = "1.0a"
 | 
				
			||||||
 | 
				
			|||||||
@ -37,7 +37,6 @@ class Indexer:
 | 
				
			|||||||
        if platform.system() == "Windows":
 | 
					        if platform.system() == "Windows":
 | 
				
			||||||
            subprocess.Popen(["elasticsearch\\bin\\elasticsearch.bat"])
 | 
					            subprocess.Popen(["elasticsearch\\bin\\elasticsearch.bat"])
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            print(platform.system())
 | 
					 | 
				
			||||||
            subprocess.Popen(["elasticsearch/bin/elasticsearch"])
 | 
					            subprocess.Popen(["elasticsearch/bin/elasticsearch"])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
@ -86,6 +85,9 @@ class Indexer:
 | 
				
			|||||||
            "analysis": {"analyzer": {"my_nGram": {"tokenizer": "my_nGram_tokenizer", "filter": ["lowercase",
 | 
					            "analysis": {"analyzer": {"my_nGram": {"tokenizer": "my_nGram_tokenizer", "filter": ["lowercase",
 | 
				
			||||||
                                                                                                 "asciifolding"]}}}},
 | 
					                                                                                                 "asciifolding"]}}}},
 | 
				
			||||||
            index=self.index_name)
 | 
					            index=self.index_name)
 | 
				
			||||||
 | 
					        self.es.indices.put_settings(body={
 | 
				
			||||||
 | 
					            "analysis": {"analyzer": {"content_analyser": {"tokenizer": "standard", "filter": ["lowercase"]}}}},
 | 
				
			||||||
 | 
					            index=self.index_name)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.es.indices.put_mapping(body={"properties": {
 | 
					        self.es.indices.put_mapping(body={"properties": {
 | 
				
			||||||
            "path": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"},
 | 
					            "path": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"},
 | 
				
			||||||
@ -98,6 +100,7 @@ class Indexer:
 | 
				
			|||||||
            "width": {"type": "integer"},
 | 
					            "width": {"type": "integer"},
 | 
				
			||||||
            "height": {"type": "integer"},
 | 
					            "height": {"type": "integer"},
 | 
				
			||||||
            "mtime": {"type": "integer"},
 | 
					            "mtime": {"type": "integer"},
 | 
				
			||||||
 | 
					            "size": {"type": "long"},
 | 
				
			||||||
            "directory": {"type": "short"},
 | 
					            "directory": {"type": "short"},
 | 
				
			||||||
            "name": {"analyzer": "my_nGram", "type": "text"},
 | 
					            "name": {"analyzer": "my_nGram", "type": "text"},
 | 
				
			||||||
            "album": {"analyzer": "my_nGram", "type": "text"},
 | 
					            "album": {"analyzer": "my_nGram", "type": "text"},
 | 
				
			||||||
@ -105,6 +108,7 @@ class Indexer:
 | 
				
			|||||||
            "title": {"analyzer": "my_nGram", "type": "text"},
 | 
					            "title": {"analyzer": "my_nGram", "type": "text"},
 | 
				
			||||||
            "genre": {"analyzer": "my_nGram", "type": "text"},
 | 
					            "genre": {"analyzer": "my_nGram", "type": "text"},
 | 
				
			||||||
            "album_artist": {"analyzer": "my_nGram", "type": "text"},
 | 
					            "album_artist": {"analyzer": "my_nGram", "type": "text"},
 | 
				
			||||||
 | 
					            "content": {"analyzer": "content_analyser", "type": "text"},
 | 
				
			||||||
        }}, doc_type="file", index=self.index_name)
 | 
					        }}, doc_type="file", index=self.index_name)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.es.indices.open(index=self.index_name)
 | 
					        self.es.indices.open(index=self.index_name)
 | 
				
			||||||
 | 
				
			|||||||
@ -4,7 +4,6 @@ import mimetypes
 | 
				
			|||||||
import subprocess
 | 
					import subprocess
 | 
				
			||||||
import json
 | 
					import json
 | 
				
			||||||
import chardet
 | 
					import chardet
 | 
				
			||||||
import html
 | 
					 | 
				
			||||||
import warnings
 | 
					import warnings
 | 
				
			||||||
import docx2txt
 | 
					import docx2txt
 | 
				
			||||||
import xlrd
 | 
					import xlrd
 | 
				
			||||||
@ -290,7 +289,7 @@ class TextFileParser(GenericFileParser):
 | 
				
			|||||||
                    info["encoding"] = encoding
 | 
					                    info["encoding"] = encoding
 | 
				
			||||||
                    try:
 | 
					                    try:
 | 
				
			||||||
                        content = raw_content.decode(encoding, "ignore")
 | 
					                        content = raw_content.decode(encoding, "ignore")
 | 
				
			||||||
                        info["content"] = html.escape(content)
 | 
					                        info["content"] = content
 | 
				
			||||||
                    except Exception:
 | 
					                    except Exception:
 | 
				
			||||||
                        print("Unknown encoding: " + encoding)
 | 
					                        print("Unknown encoding: " + encoding)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -497,7 +496,6 @@ class SpreadSheetParser(GenericFileParser):
 | 
				
			|||||||
                num_cells = worksheet.ncols
 | 
					                num_cells = worksheet.ncols
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                for curr_row in range(num_rows):
 | 
					                for curr_row in range(num_rows):
 | 
				
			||||||
                    row = worksheet.row(curr_row)
 | 
					 | 
				
			||||||
                    new_output = []
 | 
					                    new_output = []
 | 
				
			||||||
                    for index_col in xrange(num_cells):
 | 
					                    for index_col in xrange(num_cells):
 | 
				
			||||||
                        value = worksheet.cell_value(curr_row, index_col)
 | 
					                        value = worksheet.cell_value(curr_row, index_col)
 | 
				
			||||||
 | 
				
			|||||||
@ -14,5 +14,5 @@ class TextFileParserTest(TestCase):
 | 
				
			|||||||
        info = parser.parse(dir_name + "/test_files/text.csv")
 | 
					        info = parser.parse(dir_name + "/test_files/text.csv")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.assertTrue(info["content"].startswith("rosbagTimestamp,header,seq,stamp,secs,nsecs,"))
 | 
					        self.assertTrue(info["content"].startswith("rosbagTimestamp,header,seq,stamp,secs,nsecs,"))
 | 
				
			||||||
        self.assertEqual(len(info["content"]), 1309)  # Size is larger because of html escaping
 | 
					        self.assertEqual(len(info["content"]), 1234)
 | 
				
			||||||
        self.assertEqual(info["encoding"], "ascii")
 | 
					        self.assertEqual(info["encoding"], "ascii")
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										12
									
								
								thumbnail.py
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								thumbnail.py
									
									
									
									
									
								
							@ -2,7 +2,10 @@ from PIL import Image
 | 
				
			|||||||
import os
 | 
					import os
 | 
				
			||||||
from multiprocessing import Value, Process
 | 
					from multiprocessing import Value, Process
 | 
				
			||||||
import ffmpeg
 | 
					import ffmpeg
 | 
				
			||||||
#import cairosvg
 | 
					import config
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if config.cairosvg:
 | 
				
			||||||
 | 
					    import cairosvg
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class ThumbnailGenerator:
 | 
					class ThumbnailGenerator:
 | 
				
			||||||
@ -17,12 +20,12 @@ class ThumbnailGenerator:
 | 
				
			|||||||
        if mime is None:
 | 
					        if mime is None:
 | 
				
			||||||
            return
 | 
					            return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if mime == "image/svg+xml":
 | 
					        if mime == "image/svg+xml" and config.cairosvg:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            try:
 | 
					            try:
 | 
				
			||||||
                p = Process(target=cairosvg.svg2png, kwargs={"url": path, "write_to": "tmp"})
 | 
					                p = Process(target=cairosvg.svg2png, kwargs={"url": path, "write_to": "tmp"})
 | 
				
			||||||
                p.start()
 | 
					                p.start()
 | 
				
			||||||
                p.join(1.5)
 | 
					                p.join(1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                if p.is_alive():
 | 
					                if p.is_alive():
 | 
				
			||||||
                    p.terminate()
 | 
					                    p.terminate()
 | 
				
			||||||
@ -50,8 +53,7 @@ class ThumbnailGenerator:
 | 
				
			|||||||
                 .run()
 | 
					                 .run()
 | 
				
			||||||
                 )
 | 
					                 )
 | 
				
			||||||
                self.generate_image("tmp", dest_path)
 | 
					                self.generate_image("tmp", dest_path)
 | 
				
			||||||
            except Exception as e:
 | 
					            except Exception:
 | 
				
			||||||
                print(e)
 | 
					 | 
				
			||||||
                print("Couldn't make thumbnail for " + path)
 | 
					                print("Couldn't make thumbnail for " + path)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if os.path.exists("tmp"):
 | 
					            if os.path.exists("tmp"):
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user