Commit

Add min_chunk_size
tibor-reiss committed Sep 15, 2024
1 parent 6896c4c commit 1c5b9f8
Showing 2 changed files with 8 additions and 1 deletion.
2 changes: 2 additions & 0 deletions docs/docs/how_to/semantic-chunker.ipynb
@@ -125,6 +125,8 @@
     "\n",
     "There are a few ways to determine what that threshold is, which are controlled by the `breakpoint_threshold_type` kwarg.\n",
     "\n",
+    "Note: if the resulting chunk sizes are too small/big, adjust `breakpoint_threshold_amount` and/or `min_chunk_size`.\n",
+    "\n",
     "### Percentile\n",
     "\n",
     "The default way to split is based on percentile. In this method, all differences between sentences are calculated, and the text is then split at any difference greater than the Xth percentile. The default value for X is 95.0 and can be adjusted via the keyword argument `breakpoint_threshold_amount`, which expects a number between 0.0 and 100.0."
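As a rough illustration of the percentile method described above, here is a minimal sketch of the thresholding step. The distance values are invented for the example; the splitter derives them from embedding distances between consecutive sentences.

import numpy as np

# Invented example distances between consecutive sentence embeddings;
# the real splitter computes these from an embedding model.
distances = [0.12, 0.08, 0.45, 0.10, 0.60, 0.09]

breakpoint_threshold_amount = 95.0  # the default X
threshold = np.percentile(distances, breakpoint_threshold_amount)

# Split after every sentence whose distance to the next one exceeds
# the threshold; each such position starts a new chunk.
breakpoint_indices = [i for i, d in enumerate(distances) if d > threshold]
print(breakpoint_indices)  # [4] for these values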
7 changes: 6 additions & 1 deletion libs/experimental/langchain_experimental/text_splitter.py
@@ -117,6 +117,7 @@ def __init__(
         breakpoint_threshold_amount: Optional[float] = None,
         number_of_chunks: Optional[int] = None,
         sentence_split_regex: str = r"(?<=[.?!])\s+",
+        min_chunk_size: Optional[int] = None,
     ):
         self._add_start_index = add_start_index
         self.embeddings = embeddings
@@ -130,6 +131,7 @@
             ]
         else:
             self.breakpoint_threshold_amount = breakpoint_threshold_amount
+        self.min_chunk_size = min_chunk_size

     def _calculate_breakpoint_threshold(
         self, distances: List[float]
@@ -242,8 +244,11 @@ def split_text(
             end_index = index

             # Slice the sentence_dicts from the current start index to the end index
-            group = sentences[start_index : end_index + 1]
+            group = sentences[start_index: end_index + 1]
             combined_text = " ".join([d["sentence"] for d in group])
+            # If specified, merge together small chunks.
+            if self.min_chunk_size is not None and len(combined_text) < self.min_chunk_size:

Check failure (GitHub Actions / cd libs/experimental / make lint, #3.9 and #3.12): Ruff (E501) langchain_experimental/text_splitter.py:250:89: Line too long (92 > 88)
+                continue
             chunks.append(combined_text)

             # Update the start index for the next group
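For context, a minimal usage sketch of the new kwarg. This assumes `langchain-openai` is installed and an OpenAI API key is configured; any `Embeddings` implementation works, and the sample text is invented for the sketch.

from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings  # assumed; any Embeddings implementation works

splitter = SemanticChunker(
    OpenAIEmbeddings(),
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=95.0,
    # Groups whose combined text is shorter than 100 characters are not
    # emitted as chunks; they are merged into the next group instead.
    min_chunk_size=100,
)

text = (
    "Semantic chunking groups sentences by embedding similarity. "
    "Short groups can now be merged via min_chunk_size. "
    "This is placeholder text for the sketch."
)
chunks = splitter.split_text(text)

Note that because the new check uses `continue` before `start_index` is advanced, an undersized group is carried forward and combined with the following group rather than dropped.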
