Skip to content

Commit

Permalink
\f was not being counted properly
Browse files Browse the repository at this point in the history
  • Loading branch information
davidsbatista committed Dec 19, 2024
1 parent 2af6b03 commit 8a09157
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 23 deletions.
13 changes: 8 additions & 5 deletions haystack/components/preprocessors/recursive_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,10 +154,11 @@ def _chunk_length(self, text: str) -> int:
The length of the chunk in words or characters.
"""
if self.split_units == "word":
print(text)
print(text.split())
print(len(text.split()))
print("-----------------")
# a page break, optionally followed by whitespace only, is meant to be counted as a single word
# NOTE: `\s*` matches zero or more whitespace characters and re.match anchors only at the start, so any text beginning with a page break matches
if re.match(r"\f\s*", text):
return 1

return len(text.split())
else:
return len(text)
Expand Down Expand Up @@ -226,7 +227,9 @@ def _chunk_text(self, text: str) -> List[str]:
break
chunks.extend(self._chunk_text(split_text))
else:
chunks.append(split_text)
# chunks.append(split_text)
current_chunk.append(split_text)
current_length += self._chunk_length(split_text)
else:
current_chunk.append(split_text)
current_length += self._chunk_length(split_text)
Expand Down
27 changes: 9 additions & 18 deletions test/components/preprocessors/test_recursive_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def test_chunk_text_by_period():
assert chunks[2] == " And one more."


def test_run_multiple_new_lines():
def test_run_multiple_new_lines_unit_char():
splitter = RecursiveDocumentSplitter(split_length=20, separators=["\n\n", "\n"], split_unit="char")
text = "This is a test.\n\n\nAnother test.\n\n\n\nFinal test."
doc = Document(content=text)
Expand Down Expand Up @@ -502,37 +502,28 @@ def test_run_split_by_word_count_page_breaks_word_unit():
doc_chunks = splitter.run([doc])
doc_chunks = doc_chunks["documents"]

print("\n\n")
print("-------------")
for doc in doc_chunks:
print(doc.content)
print(doc.meta)
print("-------------")

exit(-1)

assert len(doc_chunks) == 4
assert doc_chunks[0].content == "This is some text."
assert len(doc_chunks) == 5
assert doc_chunks[0].content == "This is some text. "
assert doc_chunks[0].meta["page_number"] == 1
assert doc_chunks[0].meta["split_id"] == 0
assert doc_chunks[0].meta["split_idx_start"] == text.index(doc_chunks[0].content)

assert doc_chunks[1].content == " \f This text is on"
assert doc_chunks[1].content == "\f This text is "
assert doc_chunks[1].meta["page_number"] == 2
assert doc_chunks[1].meta["split_id"] == 1
assert doc_chunks[1].meta["split_idx_start"] == text.index(doc_chunks[1].content)

assert doc_chunks[2].content == " another page. \f T"
assert doc_chunks[2].content == "on another page. \f "
assert doc_chunks[2].meta["page_number"] == 3
assert doc_chunks[2].meta["split_id"] == 2
assert doc_chunks[2].meta["split_idx_start"] == text.index(doc_chunks[2].content)

assert doc_chunks[3].content == "his is the last pa"
assert doc_chunks[3].content == "This is the last "
assert doc_chunks[3].meta["page_number"] == 3
assert doc_chunks[3].meta["split_id"] == 3
assert doc_chunks[3].meta["split_idx_start"] == text.index(doc_chunks[3].content)

assert doc_chunks[4].content == "g3."
assert doc_chunks[4].content == "pag3."
assert doc_chunks[4].meta["page_number"] == 3
assert doc_chunks[4].meta["split_id"] == 4
assert doc_chunks[4].meta["split_idx_start"] == text.index(doc_chunks[4].content)
Expand Down Expand Up @@ -560,12 +551,12 @@ def test_run_split_by_page_break_count_page_breaks_word_unit() -> None:
assert chunks_docs[1].meta["split_id"] == 1
assert chunks_docs[1].meta["split_idx_start"] == text.index(chunks_docs[1].content)

assert chunks_docs[2].content == "Sentence on page 3. Another on page 3.\f\f"
assert chunks_docs[2].content == "Sentence on page 3. Another on page 3.\f"
assert chunks_docs[2].meta["page_number"] == 3
assert chunks_docs[2].meta["split_id"] == 2
assert chunks_docs[2].meta["split_idx_start"] == text.index(chunks_docs[2].content)

assert chunks_docs[3].content == " Sentence on page 5."
assert chunks_docs[3].content == "\f Sentence on page 5."
assert chunks_docs[3].meta["page_number"] == 5
assert chunks_docs[3].meta["split_id"] == 3
assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content)
Expand Down

0 comments on commit 8a09157

Please sign in to comment.