Skip to content

Commit

Permalink
Add min_chunk_size
Browse files Browse the repository at this point in the history
  • Loading branch information
tibor-reiss committed Sep 18, 2024
1 parent 9257faf commit fc9b50c
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/docs/how_to/semantic-chunker.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@
"\n",
"There are a few ways to determine what that threshold is, which are controlled by the `breakpoint_threshold_type` kwarg.\n",
"\n",
"Note: if the resulting chunk sizes are too small/big, the additional kwargs `breakpoint_threshold_amount` and `min_chunk_size` can be used for adjustments.\n",
"\n",
"### Percentile\n",
"\n",
"The default way to split is based on percentile. In this method, all differences between sentences are calculated, and then any difference greater than the X percentile is split. The default value for X is 95.0 and can be adjusted by the keyword argument `breakpoint_threshold_amount` which expects a number between 0.0 and 100.0."
Expand Down
8 changes: 8 additions & 0 deletions libs/experimental/langchain_experimental/text_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def __init__(
breakpoint_threshold_amount: Optional[float] = None,
number_of_chunks: Optional[int] = None,
sentence_split_regex: str = r"(?<=[.?!])\s+",
min_chunk_size: Optional[int] = None,
):
self._add_start_index = add_start_index
self.embeddings = embeddings
Expand All @@ -130,6 +131,7 @@ def __init__(
]
else:
self.breakpoint_threshold_amount = breakpoint_threshold_amount
self.min_chunk_size = min_chunk_size

def _calculate_breakpoint_threshold(
self, distances: List[float]
Expand Down Expand Up @@ -244,6 +246,12 @@ def split_text(
# Slice the sentence_dicts from the current start index to the end index
group = sentences[start_index : end_index + 1]
combined_text = " ".join([d["sentence"] for d in group])
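# If specified, merge small chunks: when the combined text is shorter than
# min_chunk_size, skip emitting it and leave the start index unchanged so
# these sentences are carried into the next group.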
if (
self.min_chunk_size is not None
and len(combined_text) < self.min_chunk_size
):
continue
chunks.append(combined_text)

# Update the start index for the next group
Expand Down

0 comments on commit fc9b50c

Please sign in to comment.