From 8a09157d6fafd79455eb4893caa91232cf90271c Mon Sep 17 00:00:00 2001
From: "David S. Batista" <dsbatista@gmail.com>
Date: Thu, 19 Dec 2024 18:46:25 +0100
Subject: [PATCH] \f was not being counted properly

---
 .../preprocessors/recursive_splitter.py       | 13 +++++----
 .../preprocessors/test_recursive_splitter.py  | 27 +++++++------------
 2 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py
index efd9d92cf1..6e8c3f0ea9 100644
--- a/haystack/components/preprocessors/recursive_splitter.py
+++ b/haystack/components/preprocessors/recursive_splitter.py
@@ -154,10 +154,11 @@ def _chunk_length(self, text: str) -> int:
             The length of the chunk in words or characters.
         """
         if self.split_units == "word":
-            print(text)
-            print(text.split())
-            print(len(text.split()))
-            print("-----------------")
+            # str.split() treats "\f" as whitespace, so a page break followed only by whitespace would count as 0
+            # words; count it as 1 instead. NOTE(review): re.match anchors only at the start - fullmatch intended?
+            if re.match(r"\f\s*", text):
+                return 1
+
             return len(text.split())
         else:
             return len(text)
@@ -226,7 +227,9 @@ def _chunk_text(self, text: str) -> List[str]:
                             break
                         chunks.extend(self._chunk_text(split_text))
                     else:
-                        chunks.append(split_text)
+                        # add the split to the chunk being built (and track its length) instead of emitting it alone
+                        current_chunk.append(split_text)
+                        current_length += self._chunk_length(split_text)
                 else:
                     current_chunk.append(split_text)
                     current_length += self._chunk_length(split_text)
diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py
index 69c1b014b5..addfb8bb77 100644
--- a/test/components/preprocessors/test_recursive_splitter.py
+++ b/test/components/preprocessors/test_recursive_splitter.py
@@ -83,7 +83,7 @@ def test_chunk_text_by_period():
     assert chunks[2] == " And one more."
 
 
-def test_run_multiple_new_lines():
+def test_run_multiple_new_lines_unit_char():
     splitter = RecursiveDocumentSplitter(split_length=20, separators=["\n\n", "\n"], split_unit="char")
     text = "This is a test.\n\n\nAnother test.\n\n\n\nFinal test."
     doc = Document(content=text)
@@ -502,37 +502,28 @@ def test_run_split_by_word_count_page_breaks_word_unit():
     doc_chunks = splitter.run([doc])
     doc_chunks = doc_chunks["documents"]
 
-    print("\n\n")
-    print("-------------")
-    for doc in doc_chunks:
-        print(doc.content)
-        print(doc.meta)
-        print("-------------")
-
-    exit(-1)
-
-    assert len(doc_chunks) == 4
-    assert doc_chunks[0].content == "This is some text."
+    assert len(doc_chunks) == 5
+    assert doc_chunks[0].content == "This is some text. "
     assert doc_chunks[0].meta["page_number"] == 1
     assert doc_chunks[0].meta["split_id"] == 0
     assert doc_chunks[0].meta["split_idx_start"] == text.index(doc_chunks[0].content)
 
-    assert doc_chunks[1].content == " \f This text is on"
+    assert doc_chunks[1].content == "\f This text is "
     assert doc_chunks[1].meta["page_number"] == 2
     assert doc_chunks[1].meta["split_id"] == 1
     assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content)
 
-    assert doc_chunks[2].content == " another page. \f T"
+    assert doc_chunks[2].content == "on another page. \f "
     assert doc_chunks[2].meta["page_number"] == 3
     assert doc_chunks[2].meta["split_id"] == 2
     assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content)
 
-    assert doc_chunks[3].content == "his is the last pa"
+    assert doc_chunks[3].content == "This is the last "
     assert doc_chunks[3].meta["page_number"] == 3
     assert doc_chunks[3].meta["split_id"] == 3
     assert doc_chunks[3].meta["split_idx_start"] == text.index(doc_chunks[3].content)
 
-    assert doc_chunks[4].content == "g3."
+    assert doc_chunks[4].content == "pag3."
     assert doc_chunks[4].meta["page_number"] == 3
     assert doc_chunks[4].meta["split_id"] == 4
     assert doc_chunks[4].meta["split_idx_start"] == text.index(doc_chunks[4].content)
@@ -560,12 +551,12 @@ def test_run_split_by_page_break_count_page_breaks_word_unit() -> None:
     assert chunks_docs[1].meta["split_id"] == 1
     assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content)
 
-    assert chunks_docs[2].content == "Sentence on page 3. Another on page 3.\f\f"
+    assert chunks_docs[2].content == "Sentence on page 3. Another on page 3.\f"
     assert chunks_docs[2].meta["page_number"] == 3
     assert chunks_docs[2].meta["split_id"] == 2
     assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content)
 
-    assert chunks_docs[3].content == " Sentence on page 5."
+    assert chunks_docs[3].content == "\f Sentence on page 5."
     assert chunks_docs[3].meta["page_number"] == 5
     assert chunks_docs[3].meta["split_id"] == 3
     assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content)