From 752d54f867492eb628bc54b940e525ee138de518 Mon Sep 17 00:00:00 2001 From: levara Date: Sun, 22 Dec 2024 13:02:27 +0100 Subject: [PATCH] Extract sentence splitting in SemanticChunker into a private method This change lets users override how the SemanticChunker splits text into sentences, so they can plug in their own sentence-splitting algorithm. --- libs/experimental/langchain_experimental/text_splitter.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libs/experimental/langchain_experimental/text_splitter.py b/libs/experimental/langchain_experimental/text_splitter.py index 78c8437..38b7180 100644 --- a/libs/experimental/langchain_experimental/text_splitter.py +++ b/libs/experimental/langchain_experimental/text_splitter.py @@ -208,12 +208,15 @@ def _calculate_sentence_distances( return calculate_cosine_distances(sentences) + def _get_single_sentences_list(self, text: str) -> List[str]: + return re.split(self.sentence_split_regex, text) + def split_text( self, text: str, ) -> List[str]: # Splitting the essay (by default on '.', '?', and '!') - single_sentences_list = re.split(self.sentence_split_regex, text) + single_sentences_list = self._get_single_sentences_list(text) # having len(single_sentences_list) == 1 would cause the following # np.percentile to fail.