Checked other resources
I added a very descriptive title to this question.
I searched the LangChain documentation with the integrated search.
I used the GitHub search to find a similar question and didn't find it.
Commit to Help
I commit to help with one of those options 👆
Example Code
"""Transform documents"""fromtypingimportAny, Callable, List, Sequenceimportnumpyasnpfromlangchain_core.documentsimportBaseDocumentTransformer, Documentfromlangchain_core.embeddingsimportEmbeddingsfromlangchain_core.pydantic_v1importBaseModel, Fieldfromlangchain_community.utils.mathimportcosine_similarityclass_DocumentWithState(Document):
"""Wrapper for a document that includes arbitrary state."""state: dict=Field(default_factory=dict)
"""State associated with the document."""@classmethoddefis_lc_serializable(cls) ->bool:
returnFalsedefto_document(self) ->Document:
"""Convert the DocumentWithState to a Document."""returnDocument(page_content=self.page_content, metadata=self.metadata)
@classmethoddeffrom_document(cls, doc: Document) ->"_DocumentWithState":
"""Create a DocumentWithState from a Document."""ifisinstance(doc, cls):
returndocreturncls(page_content=doc.page_content, metadata=doc.metadata)
defget_stateful_documents(
documents: Sequence[Document],
) ->Sequence[_DocumentWithState]:
"""Convert a list of documents to a list of documents with state. Args: documents: The documents to convert. Returns: A list of documents with state. """return [_DocumentWithState.from_document(doc) fordocindocuments]
def_filter_similar_embeddings(
embedded_documents: List[List[float]], similarity_fn: Callable, threshold: float
) ->List[int]:
"""Filter redundant documents based on the similarity of their embeddings."""similarity=np.tril(similarity_fn(embedded_documents, embedded_documents), k=-1)
redundant=np.where(similarity>threshold)
redundant_stacked=np.column_stack(redundant)
redundant_sorted=np.argsort(similarity[redundant])[::-1]
included_idxs=set(range(len(embedded_documents)))
forfirst_idx, second_idxinredundant_stacked[redundant_sorted]:
iffirst_idxinincluded_idxsandsecond_idxinincluded_idxs:
# Default to dropping the second document of any highly similar pair.included_idxs.remove(second_idx)
returnlist(sorted(included_idxs))
def_get_embeddings_from_stateful_docs(
embeddings: Embeddings, documents: Sequence[_DocumentWithState]
) ->List[List[float]]:
iflen(documents) and"embedded_doc"indocuments[0].state:
embedded_documents= [doc.state["embedded_doc"] fordocindocuments]
else:
embedded_documents=embeddings.embed_documents(
[d.page_contentfordindocuments]
)
fordoc, embeddinginzip(documents, embedded_documents):
doc.state["embedded_doc"] =embeddingreturnembedded_documentsdef_filter_cluster_embeddings(
embedded_documents: List[List[float]],
num_clusters: int,
num_closest: int,
random_state: int,
remove_duplicates: bool,
) ->List[int]:
"""Filter documents based on proximity of their embeddings to clusters."""try:
fromsklearn.clusterimportKMeansexceptImportError:
raiseImportError(
"sklearn package not found, please install it with ""`pip install scikit-learn`"
)
kmeans=KMeans(n_clusters=num_clusters, random_state=random_state).fit(
embedded_documents
)
closest_indices= []
# Loop through the number of clusters you haveforiinrange(num_clusters):
# Get the list of distances from that particular cluster centerdistances=np.linalg.norm(
embedded_documents-kmeans.cluster_centers_[i], axis=1
)
# Find the indices of the two unique closest ones# (using argsort to find the smallest 2 distances)ifremove_duplicates:
# Only add not duplicated vectors.closest_indices_sorted= [
xforxinnp.argsort(distances)[:num_closest]
ifxnotinclosest_indices
]
else:
# Skip duplicates and add the next closest vector.closest_indices_sorted= [
xforxinnp.argsort(distances) ifxnotinclosest_indices
][:num_closest]
# Append that position closest indices listclosest_indices.extend(closest_indices_sorted)
returnclosest_indicesclassEmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
"""Filter that drops redundant documents by comparing their embeddings."""embeddings: Embeddings"""Embeddings to use for embedding document contents."""similarity_fn: Callable=cosine_similarity"""Similarity function for comparing documents. Function expected to take as input two matrices (List[List[float]]) and return a matrix of scores where higher values indicate greater similarity."""similarity_threshold: float=0.9"""Threshold for determining when two documents are similar enough to be considered redundant."""classConfig:
"""Configuration for this pydantic object."""arbitrary_types_allowed=Truedeftransform_documents(
self, documents: Sequence[Document], **kwargs: Any
) ->Sequence[Document]:
"""Filter down documents."""stateful_documents=get_stateful_documents(documents)
embedded_documents=_get_embeddings_from_stateful_docs(
self.embeddings, stateful_documents
)
included_idxs=_filter_similar_embeddings(
embedded_documents, self.similarity_fn, self.similarity_threshold
)
return [stateful_documents[i] foriinsorted(included_idxs)]
classEmbeddingsClusteringFilter(BaseDocumentTransformer, BaseModel):
"""Perform K-means clustering on document vectors. Returns an arbitrary number of documents closest to center."""embeddings: Embeddings"""Embeddings to use for embedding document contents."""num_clusters: int=5"""Number of clusters. Groups of documents with similar meaning."""num_closest: int=1"""The number of closest vectors to return for each cluster center."""random_state: int=42"""Controls the random number generator used to initialize the cluster centroids. If you set the random_state parameter to None, the KMeans algorithm will use a random number generator that is seeded with the current time. This means that the results of the KMeans algorithm will be different each time you run it."""sorted: bool=False"""By default results are re-ordered "grouping" them by cluster, if sorted is true result will be ordered by the original position from the retriever"""remove_duplicates: bool=False""" By default duplicated results are skipped and replaced by the next closest vector in the cluster. If remove_duplicates is true no replacement will be done: This could dramatically reduce results when there is a lot of overlap between clusters. """classConfig:
"""Configuration for this pydantic object."""arbitrary_types_allowed=Truedeftransform_documents(
self, documents: Sequence[Document], **kwargs: Any
) ->Sequence[Document]:
"""Filter down documents."""stateful_documents=get_stateful_documents(documents)
embedded_documents=_get_embeddings_from_stateful_docs(
self.embeddings, stateful_documents
)
included_idxs=_filter_cluster_embeddings(
embedded_documents,
self.num_clusters,
self.num_closest,
self.random_state,
self.remove_duplicates,
)
results=sorted(included_idxs) ifself.sortedelseincluded_idxsreturn [stateful_documents[i] foriinresults]
Description
This is the source code for EmbeddingsRedundantFilter. I have a few questions:
The state is obtained via 'cls'. What is cls? Does the state of the doc mean the embedding of the doc?
How does the filter work? If two docs are similar, which one will be filtered?
The EmbeddingsRedundantFilter class filters out redundant documents based on the similarity of their embeddings. Here's a detailed explanation addressing your questions:
1. What is cls in the context of _DocumentWithState?
In the _DocumentWithState class, cls refers to the class itself. It is a common convention in Python to use cls as the first parameter in class methods to refer to the class. This is similar to how self refers to the instance in instance methods.
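For illustration only, here is a minimal sketch of how cls behaves in a classmethod. The Note class below is hypothetical and not part of LangChain; it simply mirrors the role cls plays in _DocumentWithState.from_document:

class Note:
    """A hypothetical class, used only to illustrate `cls`."""

    def __init__(self, text: str) -> None:
        self.text = text

    @classmethod
    def from_text(cls, text: str) -> "Note":
        # Inside a classmethod, `cls` is the class itself (Note here, or a
        # subclass if called on one), so `cls(text)` is equivalent to `Note(text)`.
        return cls(text)


note = Note.from_text("hello")
print(type(note).__name__)  # -> Note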
2. Is the state of the document the embedding of the document?
Yes, in the context of _DocumentWithState, the state dictionary can store various pieces of information, including the embeddings of the document. When embeddings are computed, they are stored in the state dictionary under the key "embedded_doc".
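As a rough illustration (the vector values below are made up, and this assumes the _DocumentWithState class from the example code above is in scope), the cached embedding lives alongside the document content:

# Hypothetical values: real embeddings come from the Embeddings object used by the filter.
doc = _DocumentWithState(page_content="LangChain filters redundant documents")
doc.state["embedded_doc"] = [0.12, -0.08, 0.33]

print(doc.page_content)            # the original text
print(doc.state["embedded_doc"])   # the embedding cached for that text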
3. How does the filter work?
The EmbeddingsRedundantFilter works by comparing the embeddings of documents and filtering out those that are too similar. Here's a step-by-step breakdown:
Convert Documents to Stateful Documents:
The transform_documents method converts the input documents to _DocumentWithState objects using the get_stateful_documents function.
Get Embeddings:
It then obtains the embeddings for these documents using the _get_embeddings_from_stateful_docs function. If the embeddings are already present in the state dictionary, they are reused; otherwise, they are computed and stored.
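Before moving on to the filtering step, here is a hedged sketch of that caching behaviour. The CountingEmbeddings class is a toy written for this example, not a LangChain class, and the private helpers are imported from the module shown in the example code above (the module path assumes langchain_community is installed):

from typing import List

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_community.document_transformers.embeddings_redundant_filter import (
    _get_embeddings_from_stateful_docs,
    get_stateful_documents,
)


class CountingEmbeddings(Embeddings):
    """Toy embeddings that count how often embed_documents is called."""

    def __init__(self) -> None:
        self.calls = 0

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        self.calls += 1
        # Trivial "embedding": text length and vowel count.
        return [[float(len(t)), float(sum(c in "aeiou" for c in t))] for t in texts]

    def embed_query(self, text: str) -> List[float]:
        return self.embed_documents([text])[0]


embeddings = CountingEmbeddings()
stateful = get_stateful_documents(
    [Document(page_content="alpha"), Document(page_content="beta")]
)

_get_embeddings_from_stateful_docs(embeddings, stateful)  # computes and stores in doc.state
_get_embeddings_from_stateful_docs(embeddings, stateful)  # reuses doc.state["embedded_doc"]
print(embeddings.calls)  # -> 1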
Filter Similar Embeddings:
The _filter_similar_embeddings function is called to determine which documents are redundant. It computes a pairwise similarity matrix, keeps only its strictly lower triangle (np.tril with k=-1), and collects every pair of documents whose similarity exceeds the specified threshold.
For each such pair (first_idx, second_idx), the function removes the document referred to by second_idx and keeps the one referred to by first_idx. Because the pairs come from the lower triangle of the matrix, first_idx is always the larger index and second_idx the smaller one, so of two highly similar documents it is the one appearing earlier in the input sequence that is dropped, while its later near-duplicate is retained.
Here is the relevant part of the _filter_similar_embeddings function:
for first_idx, second_idx in redundant_stacked[redundant_sorted]:
    if first_idx in included_idxs and second_idx in included_idxs:
        # Default to dropping the second document of any highly similar pair.
        included_idxs.remove(second_idx)
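To make the pair-dropping concrete, here is a small self-contained sketch with three toy two-dimensional "embeddings" (the vectors and the 0.9 threshold are invented for illustration). Documents 0 and 2 point in nearly the same direction, so the lower triangle yields the pair (first_idx=2, second_idx=0), and index 0 is removed:

import numpy as np

from langchain_community.utils.math import cosine_similarity

# Toy vectors: doc 0 and doc 2 are nearly parallel, doc 1 is orthogonal to both.
embedded_documents = [
    [1.00, 0.00],  # doc 0
    [0.00, 1.00],  # doc 1
    [0.98, 0.20],  # doc 2, cosine similarity with doc 0 is roughly 0.98
]

threshold = 0.9
similarity = np.tril(cosine_similarity(embedded_documents, embedded_documents), k=-1)
redundant = np.where(similarity > threshold)
print(np.column_stack(redundant))  # [[2 0]] -> first_idx=2, second_idx=0

included_idxs = set(range(len(embedded_documents)))
included_idxs.remove(0)            # second_idx is dropped, exactly as in the loop above
print(sorted(included_idxs))       # [1, 2] -> docs 1 and 2 survive, doc 0 is filtered out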
In summary, the EmbeddingsRedundantFilter class filters out redundant documents by comparing their embeddings and, for each pair of highly similar documents, removing the one indexed by second_idx (the earlier of the two) while keeping the other [1].
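As a usage sketch (the document texts are made up, and OpenAIEmbeddings is just one possible Embeddings implementation; it requires an OPENAI_API_KEY), the filter is applied like any other document transformer:

from langchain_core.documents import Document
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_openai import OpenAIEmbeddings  # any Embeddings implementation works here

docs = [
    Document(page_content="LangChain helps you build LLM applications."),
    Document(page_content="LangChain helps you build applications with LLMs."),  # near-duplicate
    Document(page_content="The Eiffel Tower is in Paris."),
]

redundant_filter = EmbeddingsRedundantFilter(
    embeddings=OpenAIEmbeddings(),
    similarity_threshold=0.95,
)

filtered = redundant_filter.transform_documents(docs)
for doc in filtered:
    print(doc.page_content)  # one of the two near-duplicate documents should be gone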
System Info
langchain = 0.1.8