Skip to content

Commit

Permalink
Add test
Browse files Browse the repository at this point in the history
  • Loading branch information
tibor-reiss committed Sep 18, 2024
1 parent fc9b50c commit 4ebc335
Showing 1 changed file with 53 additions and 0 deletions.
53 changes: 53 additions & 0 deletions libs/experimental/tests/unit_tests/test_text_splitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from typing import List, Optional

import pytest
from langchain_core.embeddings import Embeddings

from langchain_experimental.text_splitter import SemanticChunker

# Canned 4-dimensional embedding vectors served by MockEmbeddings.embed_documents.
# Seven entries — presumably one per sentence that SemanticChunker splits out of
# SAMPLE_TEXT below; TODO confirm against the chunker's sentence-splitting rule.
FAKE_EMBEDDINGS = [
[0.02905, 0.42969, 0.65394, 0.62200],
[0.00515, 0.47214, 0.45327, 0.75605],
[0.57401, 0.30344, 0.41702, 0.63603],
[0.60308, 0.18708, 0.68871, 0.35634],
[0.52510, 0.56163, 0.34100, 0.54089],
[0.73275, 0.22089, 0.42652, 0.48204],
[0.47466, 0.26161, 0.79687, 0.26694],
]
# Nonsense buzzword text fed to SemanticChunker in the tests below.
# The literal must not be edited: the expected chunk counts in
# test_min_chunk_size depend on its exact sentence boundaries and lengths.
SAMPLE_TEXT = """
We need to harvest synergy effects viral engagement, but digitalize, nor
overcome key issues to meet key milestones. So digital literacy where the
metal hits the meat. So this vendor is incompetent. Can you champion this?
Let me diarize this. And we can synchronise ourselves at a later timepoint
t-shaped individual tread it daily. That is a good problem
"""


class MockEmbeddings(Embeddings):
    """Deterministic Embeddings stub backed by the FAKE_EMBEDDINGS table."""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        # Pair each input text with the next canned vector; like the original
        # slice, this yields at most len(FAKE_EMBEDDINGS) vectors.
        return [vector for vector, _ in zip(FAKE_EMBEDDINGS, texts)]

    def embed_query(self, text: str) -> List[float]:
        # Queries always map to the same fixed 2-d vector.
        return [1.0, 2.0]


@pytest.mark.parametrize(
    "min_chunk_size, expected_chunks",
    [
        (None, 4),
        (30, 4),
        (60, 3),
        (120, 3),
        (240, 2),
    ],
)
def test_min_chunk_size(min_chunk_size: Optional[int], expected_chunks: int) -> None:
    """Raising min_chunk_size makes SemanticChunker merge into fewer chunks."""
    splitter = SemanticChunker(
        MockEmbeddings(),
        breakpoint_threshold_type="percentile",
        breakpoint_threshold_amount=50,
        min_chunk_size=min_chunk_size,
    )
    assert len(splitter.split_text(SAMPLE_TEXT)) == expected_chunks

0 comments on commit 4ebc335

Please sign in to comment.