Skip to content

ElasticStore#

Info

To see an example of using ElasticStore, visit: How-To: Use Elastic Search to Store Similarity Index

dbally.similarity.ElasticsearchStore #

ElasticsearchStore(index_name: str, embedding_client: EmbeddingClient, host: str, http_user: str, http_password: str, ca_cert_path: str)

Bases: SimilarityStore

The ElasticsearchStore class stores text embeddings and implements a method to find the most similar values using the kNN algorithm.

Initializes the ElasticsearchStore.

PARAMETER DESCRIPTION
index_name

The name of the index.

TYPE: str

embedding_client

The client to use for creating text embeddings.

TYPE: EmbeddingClient

host

The host address of the Elasticsearch instance.

TYPE: str

http_user

The username used for HTTP authentication.

TYPE: str

http_password

The password used for HTTP authentication.

TYPE: str

ca_cert_path

The path to the CA certificate for SSL/TLS verification.

TYPE: str

Source code in src/dbally/similarity/elasticsearch_store.py
def __init__(
    self,
    index_name: str,
    embedding_client: EmbeddingClient,
    host: str,
    http_user: str,
    http_password: str,
    ca_cert_path: str,
) -> None:
    """
    Initializes the ElasticsearchStore.

    Creates an asynchronous Elasticsearch client for the given host, authenticated with HTTP basic
    credentials and verified against the given CA certificate.

    Args:
        index_name: The name of the index.
        embedding_client: The client to use for creating text embeddings.
        host: The host address of the Elasticsearch instance.
        http_user: The username used for HTTP authentication.
        http_password: The password used for HTTP authentication.
        ca_cert_path: The path to the CA certificate for SSL/TLS verification.
    """
    super().__init__()
    self.client = AsyncElasticsearch(
        hosts=host,
        # `basic_auth` is the supported spelling in the 8.x client; `http_auth` is deprecated there.
        basic_auth=(http_user, http_password),
        ca_certs=ca_cert_path,
    )
    self.index_name = index_name
    self.embedding_client = embedding_client

client instance-attribute #

client = AsyncElasticsearch(hosts=host, http_auth=(http_user, http_password), ca_certs=ca_cert_path)

index_name instance-attribute #

index_name = index_name

embedding_client instance-attribute #

embedding_client = embedding_client

store async #

store(data: List[str]) -> None

Stores the data in an Elastic store.

PARAMETER DESCRIPTION
data

The data to store.

TYPE: List[str]

Source code in src/dbally/similarity/elasticsearch_store.py
async def store(self, data: List[str]) -> None:
    """
    Embeds every text in `data` and writes the results to the Elasticsearch index.

    The index is created on first use with a `search_vector` dense-vector field that uses cosine
    similarity. Each document id is the SHA-256 hex digest of the text, so identical texts map to
    the same document id.

    Args:
        data: The data to store.
    """
    index_exists = await self.client.indices.exists(index=self.index_name)
    if not index_exists:
        vector_field = {
            "type": "dense_vector",
            "index": "true",
            "similarity": "cosine",
        }
        await self.client.indices.create(
            index=self.index_name,
            mappings={"properties": {"search_vector": vector_field}},
        )

    actions = []
    for value in data:
        document_id = sha256(value.encode("utf-8")).hexdigest()
        # One embedding request per text; the client returns a list, of which only the first item is used.
        embeddings = await self.embedding_client.get_embeddings([value])
        actions.append(
            {
                "_index": self.index_name,
                "_id": document_id,
                "column": value,
                "search_vector": embeddings[0],
            }
        )

    await async_bulk(self.client, actions)

find_similar async #

find_similar(text: str, k_closest: int = 5, num_candidates: int = 50) -> Optional[str]

Finds the most similar text in the store or returns None if no similar text is found.

PARAMETER DESCRIPTION
text

The text to find similar to.

TYPE: str

k_closest

The k nearest neighbours used by knn-search.

TYPE: int DEFAULT: 5

num_candidates

The number of approximate nearest neighbor candidates on each shard.

TYPE: int DEFAULT: 50

RETURNS DESCRIPTION
Optional[str]

The most similar text or None if no similar text is found.

Source code in src/dbally/similarity/elasticsearch_store.py
async def find_similar(
    self,
    text: str,
    k_closest: int = 5,
    num_candidates: int = 50,
) -> Optional[str]:
    """
    Finds the most similar text in the store or returns None if no similar text is found.

    The text is embedded with the configured embedding client and the resulting vector is used for
    a kNN search on the `search_vector` field, restricted to this store's index.

    Args:
        text: The text to find similar to.
        k_closest: The k nearest neighbours used by knn-search.
        num_candidates: The number of approximate nearest neighbor candidates on each shard.

    Returns:
        The most similar text or None if no similar text is found.
    """
    query_embedding = (await self.embedding_client.get_embeddings([text]))[0]

    search_results = await self.client.search(
        # Without an explicit index the search would run against every index on the cluster
        # and could return text that belongs to a different store.
        index=self.index_name,
        # A store that was never populated has no index yet; treat that as "nothing found".
        ignore_unavailable=True,
        knn={
            "field": "search_vector",
            "k": k_closest,
            "num_candidates": num_candidates,
            "query_vector": query_embedding,
        },
    )

    # Only the first returned hit is used.
    hits = search_results["hits"]["hits"]
    return hits[0]["_source"]["column"] if hits else None

dbally.similarity.ElasticVectorStore #

ElasticVectorStore(index_name: str, host: str, http_user: str, http_password: str, ca_cert_path: str)

Bases: SimilarityStore

The Elastic Vector Store class uses the ELSER (Elastic Learned Sparse EncodeR) model on Elasticsearch to store and search data.

Initializes the Elastic Vector Store.

PARAMETER DESCRIPTION
index_name

The name of the index.

TYPE: str

host

The host address of the Elasticsearch instance.

TYPE: str

http_user

The username used for HTTP authentication.

TYPE: str

http_password

The password used for HTTP authentication.

TYPE: str

ca_cert_path

The path to the CA certificate for SSL/TLS verification.

TYPE: str

Source code in src/dbally/similarity/elastic_vector_search.py
def __init__(
    self,
    index_name: str,
    host: str,
    http_user: str,
    http_password: str,
    ca_cert_path: str,
) -> None:
    """
    Initializes the Elastic Vector Store.

    Creates an asynchronous Elasticsearch client for the given host, authenticated with HTTP basic
    credentials and verified against the given CA certificate.

    Args:
        index_name: The name of the index.
        host: The host address of the Elasticsearch instance.
        http_user: The username used for HTTP authentication.
        http_password: The password used for HTTP authentication.
        ca_cert_path: The path to the CA certificate for SSL/TLS verification.
    """
    super().__init__()
    self.client = AsyncElasticsearch(
        hosts=host,
        # `basic_auth` is the supported spelling in the 8.x client; `http_auth` is deprecated there.
        basic_auth=(http_user, http_password),
        ca_certs=ca_cert_path,
    )
    self.index_name = index_name

client instance-attribute #

client = AsyncElasticsearch(hosts=host, http_auth=(http_user, http_password), ca_certs=ca_cert_path)

index_name instance-attribute #

index_name = index_name

store async #

store(data: List[str]) -> None

Stores the given data in an Elasticsearch store.

PARAMETER DESCRIPTION
data

The data to store in the Elasticsearch index.

TYPE: List[str]

Source code in src/dbally/similarity/elastic_vector_search.py
async def store(self, data: List[str]) -> None:
    """
    Writes the given texts to the Elasticsearch index, creating the index when it is missing.

    A newly created index maps `column` as text and `column_embedding` as a sparse vector, and sets
    `elser-ingest-pipeline` as its default ingest pipeline. The pipeline itself is not created here.

    Args:
        data: The data to store in the Elasticsearch index.
    """

    def as_action(value: str) -> dict:
        # The document id is the SHA-256 hex digest of the text, so identical texts share one id.
        return {
            "_index": self.index_name,
            "_id": sha256(value.encode("utf-8")).hexdigest(),
            "column": value,
        }

    index_exists = await self.client.indices.exists(index=self.index_name)
    if not index_exists:
        field_mappings = {
            "column": {
                "type": "text",
            },
            "column_embedding": {"type": "sparse_vector"},
        }
        await self.client.indices.create(
            index=self.index_name,
            mappings={"properties": field_mappings},
            settings={"index": {"default_pipeline": "elser-ingest-pipeline"}},
        )

    actions = [as_action(value) for value in data]
    await async_bulk(self.client, actions)

find_similar async #

find_similar(text: str) -> Optional[str]

Finds the most similar stored text to the given input text.

This function performs a search in the Elasticsearch index using text expansion to find the stored text that is most similar to the provided input text.

PARAMETER DESCRIPTION
text

The input text for which to find a similar stored text.

TYPE: str

RETURNS DESCRIPTION
Optional[str]

The most similar stored text if found, otherwise None.

Source code in src/dbally/similarity/elastic_vector_search.py
async def find_similar(
    self,
    text: str,
) -> Optional[str]:
    """
    Returns the stored text that best matches the given input text.

    Runs a `text_expansion` query with the `.elser_model_2` model against the `column_embedding`
    field of this store's index and asks for a single hit.

    Args:
        text: The input text for which to find a similar stored text.

    Returns:
        The most similar stored text if found, otherwise None.
    """
    expansion_query = {
        "text_expansion": {
            "column_embedding": {
                "model_id": ".elser_model_2",
                "model_text": text,
            }
        }
    }

    # size=1: only the best-scoring document is requested.
    result = await self.client.search(index=self.index_name, size=1, query=expansion_query)

    hits = result["hits"]["hits"]
    if not hits:
        return None
    return hits[0]["_source"]["column"]