Skip to content

Commit

Permalink
test: adding test for PyPDF to extract passages so that they are dete…
Browse files Browse the repository at this point in the history
…cted by DocumentSplitter (#8739)
  • Loading branch information
davidsbatista authored Jan 17, 2025
1 parent 21dd03d commit 2c84266
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 10 deletions.
19 changes: 9 additions & 10 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,17 +158,16 @@ def from_dict(cls, data):
def _default_convert(self, reader: "PdfReader") -> str:
texts = []
for page in reader.pages:
texts.append(
page.extract_text(
orientations=self.plain_mode_orientations,
extraction_mode=self.extraction_mode.value,
space_width=self.plain_mode_space_width,
layout_mode_space_vertically=self.layout_mode_space_vertically,
layout_mode_scale_weight=self.layout_mode_scale_weight,
layout_mode_strip_rotated=self.layout_mode_strip_rotated,
layout_mode_font_height_weight=self.layout_mode_font_height_weight,
)
extracted_text = page.extract_text(
orientations=self.plain_mode_orientations,
extraction_mode=self.extraction_mode.value,
space_width=self.plain_mode_space_width,
layout_mode_space_vertically=self.layout_mode_space_vertically,
layout_mode_scale_weight=self.layout_mode_scale_weight,
layout_mode_strip_rotated=self.layout_mode_strip_rotated,
layout_mode_font_height_weight=self.layout_mode_font_height_weight,
)
texts.append(extracted_text)
text = "\f".join(texts)
return text

Expand Down
22 changes: 22 additions & 0 deletions test/components/converters/test_pypdf_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from haystack import Document, default_from_dict, default_to_dict
from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
from haystack.components.preprocessors import DocumentSplitter
from haystack.dataclasses import ByteStream


Expand Down Expand Up @@ -213,3 +214,24 @@ def test_run_empty_document(self, caplog, test_files_path):
# Check that meta is used when the returned document is initialized and thus when doc id is generated
assert output["documents"][0].meta["file_path"] == "non_text_searchable.pdf"
assert output["documents"][0].id != Document(content="").id

def test_run_detect_paragraphs_to_be_used_in_split_passage(self, test_files_path):
    """
    Convert `sample_pdf_2.pdf` in layout mode and split the result by passage.

    Checks the number of passages produced and the exact content of the
    third one.
    """
    # Text the third passage must match exactly, including the trailing blank line.
    expected = (
        "A wiki (/ˈwɪki/ (About this soundlisten) WIK-ee) is a hypertext publication collaboratively\n"
        "edited and managed by its own audience directly using a web browser. A typical wiki\ncontains "
        "multiple pages for the subjects or scope of the project and may be either open\nto the public or "
        "limited to use within an organization for maintaining its internal knowledge\nbase. Wikis are "
        "enabled by wiki software, otherwise known as wiki engines. A wiki engine,\nbeing a form of a "
        "content management system, differs from other web-based systems\nsuch as blog software, in that "
        "the content is created without any defined owner or leader,\nand wikis have little inherent "
        "structure, allowing structure to emerge according to the\nneeds of the users.[1]\n\n"
    )

    # Step 1: PDF -> documents, using the layout extraction mode.
    pdf_path = test_files_path / "pdf" / "sample_pdf_2.pdf"
    layout_converter = PyPDFToDocument(extraction_mode=PyPDFExtractionMode.LAYOUT)
    converted = layout_converter.run(sources=[pdf_path])

    # Step 2: documents -> one document per passage.
    passage_splitter = DocumentSplitter(split_length=1, split_by="passage")
    passages = passage_splitter.run(converted["documents"])["documents"]

    assert len(passages) == 51
    assert passages[2].content == expected

0 comments on commit 2c84266

Please sign in to comment.