# Mirror of https://github.com/simon987/Simple-Incremental-Search-Tool.git
# Synced 2025-04-04 07:52:58 +00:00
import json

import elasticsearch
import requests

import config
class Indexer:
    """Thin wrapper around an Elasticsearch index that stores file documents.

    On construction it connects to the local Elasticsearch instance and, if
    the index does not exist yet, runs the first-time setup (analyzers and
    the ``file`` mapping) via :meth:`init`.
    """

    def __init__(self, index: str):
        """Connect to Elasticsearch and set up the index on first use.

        :param index: name of the Elasticsearch index to operate on
        :raises requests.exceptions.ConnectionError: if the server at
            ``config.elasticsearch_url`` is unreachable
        """
        self.index_name = index
        self.es = elasticsearch.Elasticsearch()

        # Fail fast with a clear connection error if the server is down;
        # the response itself is intentionally ignored.
        requests.head(config.elasticsearch_url)

        if self.es.indices.exists(self.index_name):
            print("Index is already setup")
        else:
            print("First time setup...")
            self.init()

    @staticmethod
    def create_bulk_index_string(docs: list, directory: int) -> str:
        """Build the newline-delimited request body for the ``_bulk`` API.

        Each document is tagged in place with its directory id (callers may
        rely on this side effect), then emitted as an action line followed
        by the document source, as the bulk format requires.

        :param docs: documents to index; mutated (a "directory" key is added)
        :param directory: numeric id of the directory the documents belong to
        :return: bulk request body ("" when *docs* is empty)
        """
        action_string = '{"index":{}}\n'

        parts = []
        for doc in docs:
            doc["directory"] = directory
            parts.append(action_string)
            parts.append(json.dumps(doc) + "\n")

        # "".join is linear; the previous repeated "+=" was quadratic.
        return "".join(parts)

    def index(self, docs: list, directory: int):
        """Index *docs* under *directory* using a single bulk request.

        :param docs: documents to index (mutated — see create_bulk_index_string)
        :param directory: numeric id of the directory the documents belong to
        """
        print("Indexing " + str(len(docs)) + " docs")
        index_string = Indexer.create_bulk_index_string(docs, directory)
        self.es.bulk(body=index_string, index=self.index_name, doc_type="file", refresh="true")

    def clear(self):
        """Delete and recreate the index, dropping all documents.

        NOTE(review): the recreated index has none of the analyzers or the
        ``file`` mapping that init() installs — confirm callers re-run
        init() before indexing again after clear().
        """
        self.es.indices.delete(self.index_name)
        self.es.indices.create(self.index_name)

    def init(self):
        """(Re)create the index and install analyzers and the file mapping."""
        if self.es.indices.exists(self.index_name):
            self.es.indices.delete(index=self.index_name)
        self.es.indices.create(index=self.index_name)
        # Analysis settings can only be changed while the index is closed.
        self.es.indices.close(index=self.index_name)

        # Tokenizer that splits file paths on "/" (used for path search).
        self.es.indices.put_settings(body={
            "analysis": {"tokenizer": {"path_tokenizer": {"type": "path_hierarchy"}}}},
            index=self.index_name)
        # nGram tokenizer for partial-word matching; bounds come from config.
        self.es.indices.put_settings(body={
            "analysis": {"tokenizer": {
                "my_nGram_tokenizer": {"type": "nGram", "min_gram": config.nGramMin, "max_gram": config.nGramMax}}}},
            index=self.index_name)
        self.es.indices.put_settings(body={
            "analysis": {"analyzer": {"path_analyser": {"tokenizer": "path_tokenizer", "filter": ["lowercase"]}}}},
            index=self.index_name)
        self.es.indices.put_settings(body={
            "analysis": {"analyzer": {"my_nGram": {"tokenizer": "my_nGram_tokenizer", "filter": ["lowercase",
                                                                                                 "asciifolding"]}}}},
            index=self.index_name)
        self.es.indices.put_settings(body={
            "analysis": {"analyzer": {"content_analyser": {"tokenizer": "standard", "filter": ["lowercase"]}}}},
            index=self.index_name)

        # Mapping for the single "file" document type; "path" is copied to a
        # completion field so the UI can offer path suggestions.
        self.es.indices.put_mapping(body={"properties": {
            "path": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"},
            "suggest-path": {"type": "completion", "analyzer": "keyword"},
            "mime": {"type": "keyword"},
            "encoding": {"type": "keyword"},
            "format_name": {"type": "keyword"},
            "format_long_name": {"type": "keyword"},
            "duration": {"type": "float"},
            "width": {"type": "integer"},
            "height": {"type": "integer"},
            "mtime": {"type": "integer"},
            "size": {"type": "long"},
            "directory": {"type": "short"},
            "name": {"analyzer": "content_analyser", "type": "text",
                     "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}
                     },
            "album": {"analyzer": "my_nGram", "type": "text"},
            "artist": {"analyzer": "my_nGram", "type": "text"},
            "title": {"analyzer": "my_nGram", "type": "text"},
            "genre": {"analyzer": "my_nGram", "type": "text"},
            "album_artist": {"analyzer": "my_nGram", "type": "text"},
            "content": {"analyzer": "content_analyser", "type": "text"},
        }}, doc_type="file", index=self.index_name, include_type_name=True)

        self.es.indices.open(index=self.index_name)

        # Fixed typo in the status message ("elesticsearch").
        print("Initialised elasticsearch")
|