From b3172d8da40907dc9627d394d94784064c332f85 Mon Sep 17 00:00:00 2001
From: Levara
Date: Mon, 23 Dec 2024 15:58:26 +0100
Subject: [PATCH] Extract sentence splitting in SemanticChunker into a private method (#30)

This change allows users to easily override splitting the text into
sentences in the SemanticChunker, which allows them to use their own
sentence splitting algorithm.
Since the splitting logic wasn't changed, the unit tests still pass.

Implements issue
https://github.com/langchain-ai/langchain-experimental/issues/29

Co-authored-by: levara
---
 libs/experimental/langchain_experimental/text_splitter.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/libs/experimental/langchain_experimental/text_splitter.py b/libs/experimental/langchain_experimental/text_splitter.py
index 78c8437..38b7180 100644
--- a/libs/experimental/langchain_experimental/text_splitter.py
+++ b/libs/experimental/langchain_experimental/text_splitter.py
@@ -208,12 +208,15 @@ def _calculate_sentence_distances(
 
         return calculate_cosine_distances(sentences)
 
+    def _get_single_sentences_list(self, text: str) -> List[str]:
+        return re.split(self.sentence_split_regex, text)
+
     def split_text(
         self,
         text: str,
     ) -> List[str]:
         # Splitting the essay (by default on '.', '?', and '!')
-        single_sentences_list = re.split(self.sentence_split_regex, text)
+        single_sentences_list = self._get_single_sentences_list(text)
 
         # having len(single_sentences_list) == 1 would cause the following
         # np.percentile to fail.