# Source code for langchain_milvus.utils.sparse
from abc import ABC, abstractmethod
from typing import Dict, List
from scipy.sparse import csr_array  # type: ignore
class BaseSparseEmbedding(ABC):
    """Interface for sparse embedding models.

    Inherit from this class and implement both abstract methods to provide
    a custom sparse embedding model. A sparse vector is represented as a
    dictionary mapping a dimension index to the weight stored at that index.
    """

    @abstractmethod
    def embed_query(self, query: str) -> Dict[int, float]:
        """Embed query text.

        Args:
            query: The query text to embed.

        Returns:
            The sparse vector of the query as an ``{index: weight}`` mapping.
        """

    @abstractmethod
    def embed_documents(self, texts: List[str]) -> List[Dict[int, float]]:
        """Embed search docs.

        Args:
            texts: The document texts to embed.

        Returns:
            One ``{index: weight}`` mapping per input text.
        """
class BM25SparseEmbedding(BaseSparseEmbedding):
    """Sparse embedding model based on BM25.

    This class uses the BM25 model in Milvus model to implement sparse vector
    embedding.
    This model requires pymilvus[model] to be installed.
    `pip install pymilvus[model]`
    For more information please refer to:
    https://milvus.io/docs/embed-with-bm25.md
    """

    def __init__(self, corpus: List[str], language: str = "en"):
        """Build the analyzer and fit the BM25 embedding function.

        Args:
            corpus: Texts the BM25 embedding function is fitted on.
            language: Language passed to the default analyzer builder.
        """
        # Imported here rather than at module level; pymilvus[model] is an
        # optional extra (see the class docstring).
        from pymilvus.model.sparse import BM25EmbeddingFunction  # type: ignore
        from pymilvus.model.sparse.bm25.tokenizers import (  # type: ignore
            build_default_analyzer,
        )

        self.analyzer = build_default_analyzer(language=language)
        self.bm25_ef = BM25EmbeddingFunction(self.analyzer, num_workers=1)
        self.bm25_ef.fit(corpus)

    def embed_query(self, text: str) -> Dict[int, float]:
        """Embed a single query text.

        Args:
            text: The query text to embed.

        Returns:
            The query's sparse vector as an ``{index: weight}`` mapping.
        """
        return self._sparse_to_dict(self.bm25_ef.encode_queries([text]))

    def embed_documents(self, texts: List[str]) -> List[Dict[int, float]]:
        """Embed a list of document texts.

        Args:
            texts: The document texts to embed.

        Returns:
            One ``{index: weight}`` mapping per element obtained by iterating
            the encoder's output.
        """
        sparse_arrays = self.bm25_ef.encode_documents(texts)
        return [self._sparse_to_dict(sparse_array) for sparse_array in sparse_arrays]

    def _sparse_to_dict(self, sparse_array: csr_array) -> Dict[int, float]:
        """Convert a sparse row into an ``{column index: value}`` dictionary.

        Accepts both a 2-D single-row array and a 1-D array, so it does not
        depend on how many index arrays ``nonzero()`` would return.

        Column indices and values are read from the CSR ``indices`` and
        ``data`` buffers, which are aligned element by element. Pairing
        ``nonzero()`` with ``data`` is not safe: ``nonzero()`` drops
        explicitly stored zeros while ``data`` keeps them, which shifts every
        later pair.

        Args:
            sparse_array: The sparse array to convert. If it holds several
                rows, entries of all rows are merged and a later entry for the
                same column overrides an earlier one.

        Returns:
            Mapping from column index to its non-zero value, using native
            ``int`` keys and ``float`` values.
        """
        # tocsr() guarantees that `.indices` holds column indices even if a
        # different sparse format is passed in.
        csr = sparse_array.tocsr()
        return {
            int(col_index): float(value)
            for col_index, value in zip(csr.indices, csr.data)
            if value != 0  # skip explicitly stored zeros
        }