Skip to content

Commit

Permalink
Add test
Browse files Browse the repository at this point in the history
  • Loading branch information
tibor-reiss committed Sep 18, 2024
1 parent fc9b50c commit 50d2e59
Showing 1 changed file with 55 additions and 0 deletions.
55 changes: 55 additions & 0 deletions libs/experimental/tests/unit_tests/test_text_splitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from typing import List, Optional

import pytest
from langchain_core.embeddings import Embeddings

from langchain_experimental.text_splitter import SemanticChunker

# Canned embedding vectors (seven rows, four components each).
# MockEmbeddings.embed_documents serves these in order, one row per input
# text, so the values seen by the chunker are the same on every run.
FAKE_EMBEDDINGS: List[List[float]] = [
    [0.02905, 0.42969, 0.65394, 0.62200],
    [0.00515, 0.47214, 0.45327, 0.75605],
    [0.57401, 0.30344, 0.41702, 0.63603],
    [0.60308, 0.18708, 0.68871, 0.35634],
    [0.52510, 0.56163, 0.34100, 0.54089],
    [0.73275, 0.22089, 0.42652, 0.48204],
    [0.47466, 0.26161, 0.79687, 0.26694],
]
# Input text handed to SemanticChunker.split_text in test_min_chunk_size.
# NOTE(review): the number of sentences the chunker extracts presumably has
# to stay within the seven rows above -- confirm before editing this text.
SAMPLE_TEXT: str = """
We need to harvest synergy effects viral engagement, but digitalize, nor
overcome key issues to meet key milestones. So digital literacy where the
metal hits the meat. So this vendor is incompetent. Can you champion this?
Let me diarize this. And we can synchronise ourselves at a later timepoint
t-shaped individual tread it daily. That is a good problem
"""


class MockEmbeddings(Embeddings):
    """Embedding stub that returns pre-computed vectors instead of a model."""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one canned vector per entry in ``texts``.

        The content of ``texts`` is ignored; only its length is used to
        decide how many leading rows of ``FAKE_EMBEDDINGS`` are returned.
        Because this is a plain slice, asking for more rows than exist
        yields every available row rather than raising.
        """
        wanted = len(texts)
        return FAKE_EMBEDDINGS[0:wanted]

    def embed_query(self, text: str) -> List[float]:
        """Return a fixed two-element vector regardless of ``text``."""
        fixed_vector = [1.0, 2.0]
        return fixed_vector


@pytest.mark.parametrize(
    "min_chunk_size, expected_chunks",
    [(None, 4), (30, 4), (60, 3), (120, 3), (240, 2)],
)
def test_min_chunk_size(min_chunk_size: Optional[int], expected_chunks: int) -> None:
    """Check the number of chunks produced for each ``min_chunk_size``.

    ``SAMPLE_TEXT`` is split with a percentile breakpoint threshold of 50,
    using ``MockEmbeddings`` so the embeddings are fixed. Each parametrized
    case pairs a ``min_chunk_size`` (or ``None``) with the chunk count the
    splitter is expected to return.
    """
    splitter = SemanticChunker(
        MockEmbeddings(),
        breakpoint_threshold_type="percentile",
        breakpoint_threshold_amount=50,
        min_chunk_size=min_chunk_size,
    )

    chunk_count = len(splitter.split_text(SAMPLE_TEXT))

    assert chunk_count == expected_chunks

0 comments on commit 50d2e59

Please sign in to comment.