Commit

Add min_chunk_size
tibor-reiss committed Sep 15, 2024
1 parent 6896c4c commit 1c5b9f8
Showing 2 changed files with 8 additions and 1 deletion.
2 changes: 2 additions & 0 deletions docs/docs/how_to/semantic-chunker.ipynb
@@ -125,6 +125,8 @@
     "\n",
     "There are a few ways to determine what that threshold is, which are controlled by the `breakpoint_threshold_type` kwarg.\n",
     "\n",
+    "Note: if the resulting chunk sizes are too small/big, adjust `breakpoint_threshold_amount` and/or `min_chunk_size`.\n",
+    "\n",
     "### Percentile\n",
     "\n",
     "The default way to split is based on percentile. In this method, all differences between sentences are calculated, and the text is then split at any difference greater than the Xth percentile. The default value for X is 95.0 and can be adjusted via the keyword argument `breakpoint_threshold_amount`, which expects a number between 0.0 and 100.0."
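As a rough illustration of the percentile method described above, here is a minimal sketch of the thresholding step. The distance values are invented for the example; the splitter derives them from embedding distances between consecutive sentences.

import numpy as np

# Invented example distances between consecutive sentence embeddings;
# the real splitter computes these from an embedding model.
distances = [0.12, 0.08, 0.45, 0.10, 0.60, 0.09]

breakpoint_threshold_amount = 95.0  # the default X
threshold = np.percentile(distances, breakpoint_threshold_amount)

# Split after every sentence whose distance to the next one exceeds
# the threshold; each such position starts a new chunk.
breakpoint_indices = [i for i, d in enumerate(distances) if d > threshold]
print(breakpoint_indices)  # [4] for these values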
7 changes: 6 additions & 1 deletion libs/experimental/langchain_experimental/text_splitter.py
@@ -117,6 +117,7 @@ def __init__(
         breakpoint_threshold_amount: Optional[float] = None,
         number_of_chunks: Optional[int] = None,
         sentence_split_regex: str = r"(?<=[.?!])\s+",
+        min_chunk_size: Optional[int] = None,
     ):
         self._add_start_index = add_start_index
         self.embeddings = embeddings
@@ -130,6 +131,7 @@
             ]
         else:
             self.breakpoint_threshold_amount = breakpoint_threshold_amount
+        self.min_chunk_size = min_chunk_size

     def _calculate_breakpoint_threshold(
         self, distances: List[float]
@@ -242,8 +244,11 @@ def split_text(
             end_index = index

             # Slice the sentence_dicts from the current start index to the end index
-            group = sentences[start_index : end_index + 1]
+            group = sentences[start_index: end_index + 1]
             combined_text = " ".join([d["sentence"] for d in group])
+            # If specified, merge together small chunks.
+            if self.min_chunk_size is not None and len(combined_text) < self.min_chunk_size:

Check failure (GitHub Actions / cd libs/experimental / make lint, #3.9 and #3.12): Ruff (E501) langchain_experimental/text_splitter.py:250:89: Line too long (92 > 88)
+                continue
             chunks.append(combined_text)

             # Update the start index for the next group
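For context, a minimal usage sketch of the new kwarg. This assumes `langchain-openai` is installed and an OpenAI API key is configured; any `Embeddings` implementation works, and the sample text is invented for the sketch.

from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings  # assumed; any Embeddings implementation works

splitter = SemanticChunker(
    OpenAIEmbeddings(),
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=95.0,
    # Groups whose combined text is shorter than 100 characters are not
    # emitted as chunks; they are merged into the next group instead.
    min_chunk_size=100,
)

text = (
    "Semantic chunking groups sentences by embedding similarity. "
    "Short groups can now be merged via min_chunk_size. "
    "This is placeholder text for the sketch."
)
chunks = splitter.split_text(text)

Note that because the new check uses `continue` before `start_index` is advanced, an undersized group is carried forward and combined with the following group rather than dropped.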
