From 9d748e5bc5ff778f70eb0a738dad658c85dffca0 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Mon, 10 Feb 2025 16:41:16 +0100 Subject: [PATCH 01/11] Add params --- unstructured/partition/pdf_image/pdfminer_utils.py | 10 +++++++--- unstructured/partition/utils/config.py | 12 ++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index 929affeaae..e886030765 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -8,13 +8,14 @@ from pdfminer.pdfpage import PDFPage from pdfminer.psparser import PSSyntaxError +from unstructured import env_config from unstructured.logger import logger from unstructured.utils import requires_dependencies -def init_pdfminer(): +def init_pdfminer(word_margin: float = 0.1, char_margin: float = 2): rsrcmgr = PDFResourceManager() - laparams = LAParams() + laparams = LAParams(word_margin=word_margin, char_margin=char_margin) device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) @@ -80,7 +81,10 @@ def open_pdfminer_pages_generator( from unstructured.partition.pdf_image.pypdf_utils import get_page_data - device, interpreter = init_pdfminer() + device, interpreter = init_pdfminer( + word_margin=env_config.PDFMINER_WORD_MARGIN, + char_margin=env_config.PDFMINER_CHAR_MARGIN, + ) with tempfile.TemporaryDirectory() as tmp_dir_path: tmp_file_path = os.path.join(tmp_dir_path, "tmp_file") try: diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index 291ae1b6a3..bf94accd50 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -224,5 +224,17 @@ def ANALYSIS_BBOX_FORMAT(self) -> str: """The format for analysed pages with bboxes drawn on them. 
Default is 'png'.""" return self._get_string("ANALYSIS_BBOX_FORMAT", "png") + @property + def PDFMINER_WORD_MARGIN(self) -> float: + """Distance (calculated as percentage of character width) between characters to consider + them seperated words and inject space between them""" + return self._get_float("PDFMINER_WORD_MARGIN", 0.1) + + @property + def PDFMINER_CHAR_MARGIN(self) -> float: + """Distance (calculated as percentage of character width) between characters to consider + them to be in the same line""" + return self._get_float("PDFMINER_CHAR_MARGIN", 2) + env_config = ENVConfig() From 931f6f1158afc6c753c6c16f32285749de469767 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Mon, 10 Feb 2025 18:41:20 +0100 Subject: [PATCH 02/11] fix import --- unstructured/partition/pdf_image/pdfminer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index e886030765..c9077adeff 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -8,8 +8,8 @@ from pdfminer.pdfpage import PDFPage from pdfminer.psparser import PSSyntaxError -from unstructured import env_config from unstructured.logger import logger +from unstructured.partition.utils.config import env_config from unstructured.utils import requires_dependencies From 4964c3ed438350e4586cbc46142f671a4482c2f1 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Tue, 11 Feb 2025 17:39:59 +0100 Subject: [PATCH 03/11] Add pdfminer config --- unstructured/partition/pdf.py | 39 ++++++++++++++++--- .../pdf_image/pdfminer_processing.py | 12 +++--- .../partition/pdf_image/pdfminer_utils.py | 27 +++++++------ 3 files changed, 57 insertions(+), 21 deletions(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 55d3f3c03c..772e4fc99d 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -80,6 +80,7 @@ merge_inferred_with_extracted_layout, ) from unstructured.partition.pdf_image.pdfminer_utils import ( + PDFMinerConfig, open_pdfminer_pages_generator, rect_to_bbox, ) @@ -144,6 +145,10 @@ def partition_pdf( starting_page_number: int = 1, extract_forms: bool = False, form_extraction_skip_tables: bool = True, + pdfminer_line_margin: Optional[float] = None, + pdfminer_char_margin: Optional[float] = None, + pdfminer_line_overlap: Optional[float] = None, + pdfminer_word_margin: Optional[float] = None, **kwargs: Any, ) -> list[Element]: """Parses a pdf document into a list of interpreted elements. 
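A minimal usage sketch of the keyword arguments added in the hunk above, assuming a local test PDF; the path, strategy, and margin values here are illustrative and not part of the patch:

from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example.pdf",        # placeholder path to any text-based PDF
    strategy="fast",               # use the pdfminer-based extraction path
    pdfminer_char_margin=2.0,      # larger value: characters further apart still join the same line
    pdfminer_word_margin=0.1,      # smaller value: spaces are injected between words more eagerly
)
print(elements[0].text)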
@@ -207,7 +212,6 @@ def partition_pdf( exactly_one(filename=filename, file=file) languages = check_language_args(languages or [], ocr_languages) - return partition_pdf_or_image( filename=filename, file=file, @@ -224,6 +228,10 @@ def partition_pdf( starting_page_number=starting_page_number, extract_forms=extract_forms, form_extraction_skip_tables=form_extraction_skip_tables, + pdfminer_line_margin=pdfminer_line_margin, + pdfminer_char_margin=pdfminer_char_margin, + pdfminer_line_overlap=pdfminer_line_overlap, + pdfminer_word_margin=pdfminer_word_margin, **kwargs, ) @@ -245,6 +253,10 @@ def partition_pdf_or_image( starting_page_number: int = 1, extract_forms: bool = False, form_extraction_skip_tables: bool = True, + pdfminer_line_margin: Optional[float] = None, + pdfminer_char_margin: Optional[float] = None, + pdfminer_line_overlap: Optional[float] = None, + pdfminer_word_margin: Optional[float] = None, **kwargs: Any, ) -> list[Element]: """Parses a pdf or image document into a list of interpreted elements.""" @@ -262,7 +274,12 @@ def partition_pdf_or_image( validate_strategy(strategy, is_image) last_modified = get_last_modified_date(filename) if filename else None - + pdfminer_config = PDFMinerConfig( + line_margin=pdfminer_line_margin, + char_margin=pdfminer_char_margin, + line_overlap=pdfminer_line_overlap, + word_margin=pdfminer_word_margin, + ) extracted_elements = [] pdf_text_extractable = False if not is_image: @@ -322,6 +339,7 @@ def partition_pdf_or_image( starting_page_number=starting_page_number, extract_forms=extract_forms, form_extraction_skip_tables=form_extraction_skip_tables, + pdfminer_config=pdfminer_config, **kwargs, ) out_elements = _process_uncategorized_text_elements(elements) @@ -360,6 +378,7 @@ def extractable_elements( languages: Optional[list[str]] = None, metadata_last_modified: Optional[str] = None, starting_page_number: int = 1, + pdfminer_config: Optional[PDFMinerConfig] = None, **kwargs: Any, ) -> list[list[Element]]: if isinstance(file, bytes): @@ -370,6 +389,7 @@ def extractable_elements( languages=languages, metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, + pdfminer_config=pdfminer_config, **kwargs, ) @@ -380,6 +400,7 @@ def _partition_pdf_with_pdfminer( languages: list[str], metadata_last_modified: Optional[str], starting_page_number: int = 1, + pdfminer_config: Optional[PDFMinerConfig] = None, **kwargs: Any, ) -> list[list[Element]]: """Partitions a PDF using PDFMiner instead of using a layoutmodel. 
Used for faster @@ -403,6 +424,7 @@ def _partition_pdf_with_pdfminer( languages=languages, metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, + pdfminer_config=pdfminer_config, **kwargs, ) @@ -427,6 +449,7 @@ def _process_pdfminer_pages( metadata_last_modified: Optional[str], annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD, starting_page_number: int = 1, + pdfminer_config: Optional[PDFMinerConfig] = None, **kwargs, ) -> list[list[Element]]: """Uses PDFMiner to split a document into pages and process them.""" @@ -434,7 +457,8 @@ def _process_pdfminer_pages( elements = [] for page_number, (page, page_layout) in enumerate( - open_pdfminer_pages_generator(fp), start=starting_page_number + open_pdfminer_pages_generator(fp, pdfminer_config=pdfminer_config), + start=starting_page_number, ): width, height = page_layout.width, page_layout.height @@ -556,6 +580,7 @@ def _partition_pdf_or_image_local( extract_forms: bool = False, form_extraction_skip_tables: bool = True, pdf_hi_res_max_pages: Optional[int] = None, + pdfminer_config: Optional[PDFMinerConfig] = None, **kwargs: Any, ) -> list[Element]: """Partition using package installed locally""" @@ -595,7 +620,9 @@ def _partition_pdf_or_image_local( ) extracted_layout, layouts_links = ( - process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi) + process_file_with_pdfminer( + filename=filename, dpi=pdf_image_dpi, pdfminer_config=pdfminer_config + ) if pdf_text_extractable else ([], []) ) @@ -648,7 +675,9 @@ def _partition_pdf_or_image_local( file.seek(0) extracted_layout, layouts_links = ( - process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) + process_data_with_pdfminer( + file=file, dpi=pdf_image_dpi, pdfminer_config=pdfminer_config + ) if pdf_text_extractable else ([], []) ) diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 14836f1815..f933124f4e 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -12,6 +12,7 @@ from unstructured.documents.elements import CoordinatesMetadata from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters from unstructured.partition.pdf_image.pdfminer_utils import ( + PDFMinerConfig, extract_image_objects, extract_text_objects, open_pdfminer_pages_generator, @@ -34,14 +35,12 @@ def process_file_with_pdfminer( - filename: str = "", - dpi: int = 200, + filename: str = "", dpi: int = 200, pdfminer_config: Optional[PDFMinerConfig] = None ) -> tuple[List[List["TextRegion"]], List[List]]: with open_filename(filename, "rb") as fp: fp = cast(BinaryIO, fp) extracted_layout, layouts_links = process_data_with_pdfminer( - file=fp, - dpi=dpi, + file=fp, dpi=dpi, pdfminer_config=pdfminer_config ) return extracted_layout, layouts_links @@ -114,6 +113,7 @@ def process_page_layout_from_pdfminer( def process_data_with_pdfminer( file: Optional[Union[bytes, BinaryIO]] = None, dpi: int = 200, + pdfminer_config: Optional[PDFMinerConfig] = None, ) -> tuple[List[LayoutElements], List[List]]: """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the pdf pages using pdf2image""" @@ -124,7 +124,9 @@ def process_data_with_pdfminer( layouts_links = [] # Coefficient to rescale bounding box to be compatible with images coef = dpi / 72 - for page_number, (page, page_layout) in enumerate(open_pdfminer_pages_generator(file)): + for page_number, 
(page, page_layout) in enumerate( + open_pdfminer_pages_generator(file, pdfminer_config=pdfminer_config) + ): width, height = page_layout.width, page_layout.height annotation_list = [] diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index c9077adeff..bb3b0d812e 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -1,21 +1,31 @@ import os import tempfile -from typing import BinaryIO, List, Tuple +from typing import BinaryIO, List, Optional, Tuple from pdfminer.converter import PDFPageAggregator from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage from pdfminer.psparser import PSSyntaxError +from pydantic import BaseModel from unstructured.logger import logger -from unstructured.partition.utils.config import env_config from unstructured.utils import requires_dependencies -def init_pdfminer(word_margin: float = 0.1, char_margin: float = 2): +class PDFMinerConfig(BaseModel): + line_overlap: float = 0.5 + word_margin: float = 0.1 + line_margin: float = 0.5 + char_margin: float = 2 + + +def init_pdfminer(pdfminer_config: Optional[PDFMinerConfig] = None): rsrcmgr = PDFResourceManager() - laparams = LAParams(word_margin=word_margin, char_margin=char_margin) + + laparams_kwargs = pdfminer_config.model_dump() if pdfminer_config else {} + laparams = LAParams(**laparams_kwargs) + device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) @@ -72,19 +82,14 @@ def rect_to_bbox( @requires_dependencies(["pikepdf", "pypdf"]) -def open_pdfminer_pages_generator( - fp: BinaryIO, -): +def open_pdfminer_pages_generator(fp: BinaryIO, pdfminer_config: Optional[PDFMinerConfig] = None): """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.""" import pikepdf from unstructured.partition.pdf_image.pypdf_utils import get_page_data - device, interpreter = init_pdfminer( - word_margin=env_config.PDFMINER_WORD_MARGIN, - char_margin=env_config.PDFMINER_CHAR_MARGIN, - ) + device, interpreter = init_pdfminer(pdfminer_config=pdfminer_config) with tempfile.TemporaryDirectory() as tmp_dir_path: tmp_file_path = os.path.join(tmp_dir_path, "tmp_file") try: From 4f12a6bd199bd4ea185e37428f8fd65acd93a563 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Tue, 11 Feb 2025 17:49:14 +0100 Subject: [PATCH 04/11] Handle Nones gracefully --- unstructured/partition/pdf_image/pdfminer_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index bb3b0d812e..7c29fa2dc0 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -14,16 +14,16 @@ class PDFMinerConfig(BaseModel): - line_overlap: float = 0.5 - word_margin: float = 0.1 - line_margin: float = 0.5 - char_margin: float = 2 + line_overlap: Optional[float] = None + word_margin: Optional[float] = None + line_margin: Optional[float] = None + char_margin: Optional[float] = None def init_pdfminer(pdfminer_config: Optional[PDFMinerConfig] = None): rsrcmgr = PDFResourceManager() - laparams_kwargs = pdfminer_config.model_dump() if pdfminer_config else {} + laparams_kwargs = pdfminer_config.model_dump(exclude_none=True) if pdfminer_config else {} 
laparams = LAParams(**laparams_kwargs) device = PDFPageAggregator(rsrcmgr, laparams=laparams) From 9f9f6c136ddb1a2abd3ff444d4f38a06470e54ea Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Tue, 11 Feb 2025 17:49:38 +0100 Subject: [PATCH 05/11] Remove unwanted params --- unstructured/partition/utils/config.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index bf94accd50..291ae1b6a3 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -224,17 +224,5 @@ def ANALYSIS_BBOX_FORMAT(self) -> str: """The format for analysed pages with bboxes drawn on them. Default is 'png'.""" return self._get_string("ANALYSIS_BBOX_FORMAT", "png") - @property - def PDFMINER_WORD_MARGIN(self) -> float: - """Distance (calculated as percentage of character width) between characters to consider - them seperated words and inject space between them""" - return self._get_float("PDFMINER_WORD_MARGIN", 0.1) - - @property - def PDFMINER_CHAR_MARGIN(self) -> float: - """Distance (calculated as percentage of character width) between characters to consider - them to be in the same line""" - return self._get_float("PDFMINER_CHAR_MARGIN", 2) - env_config = ENVConfig() From 560a55cf9c65c78356b9e4914af0a5a7175c2189 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Tue, 11 Feb 2025 17:53:06 +0100 Subject: [PATCH 06/11] Bump changelog --- CHANGELOG.md | 10 ++++++++++ unstructured/__version__.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f642fc45a..d8b1939ae4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.16.21-dev0 + +### Enhancements +- **Add PDF Miner configuration** Now PDF Miner can be configured via `pdfminer_line_overlap`, `pdfminer_word_margin`, `pdfminer_line_margin` and `pdfminer_char_margin` parameters added to partition method. +- +### Features + +### Fixes + + ## 0.16.20 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a15f0c27fb..77d5eaef6a 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.20" # pragma: no cover +__version__ = "0.16.21-dev0" # pragma: no cover From 7b637fcc00a91b00e761940b13bbcc4a92cb70b2 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Wed, 12 Feb 2025 12:37:25 +0100 Subject: [PATCH 07/11] Bump changelog --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d8b1939ae4..c7d5b882d0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,6 @@ ### Fixes - ## 0.16.20 ### Enhancements From f518279c0937b1854a7181941d3a4928f8f31d72 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Wed, 12 Feb 2025 13:32:59 +0100 Subject: [PATCH 08/11] Add docs --- unstructured/partition/pdf.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index eb7d265bdb..6cea2fcbf9 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -149,7 +149,7 @@ def partition_pdf( pdfminer_line_margin: Optional[float] = None, pdfminer_char_margin: Optional[float] = None, pdfminer_line_overlap: Optional[float] = None, - pdfminer_word_margin: Optional[float] = None, + pdfminer_word_margin: Optional[float] = 1.85, **kwargs: Any, ) -> list[Element]: """Parses a pdf document into a list of interpreted elements. 
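For context on how these values reach pdfminer.six, a minimal sketch of the PDFMinerConfig-to-LAParams handoff added earlier in this series; the numbers are illustrative, and fields left as None fall back to pdfminer.six defaults because of exclude_none=True:

from pdfminer.layout import LAParams
from unstructured.partition.pdf_image.pdfminer_utils import PDFMinerConfig

config = PDFMinerConfig(char_margin=2.0, word_margin=0.185)    # line_margin / line_overlap stay None
laparams = LAParams(**config.model_dump(exclude_none=True))    # only explicitly set fields override defaults
print(laparams.char_margin, laparams.word_margin)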
@@ -208,6 +208,19 @@ def partition_pdf( (results in adding FormKeysValues elements to output). form_extraction_skip_tables Whether the form extraction logic should ignore regions designated as Tables. + pdfminer_line_margin + If two lines are close together they are considered to be part of the same paragraph. + The margin is specified relative to the height of a line. + pdfminer_char_margin + If two characters are closer together than this margin they are considered part of + the same line. The margin is specified relative to the width of the character. + pdfminer_line_overlap + If two characters have more overlap than this they are considered to be on the same line. + The overlap is specified relative to the minimum height of both characters. + pdfminer_word_margin + If two characters on the same line are further apart than this margin then they are + considered to be two separate words, and an intermediate space will be added for + readability. The margin is specified relative to the width of the character. """ exactly_one(filename=filename, file=file) @@ -259,7 +272,7 @@ def partition_pdf_or_image( pdfminer_line_margin: Optional[float] = None, pdfminer_char_margin: Optional[float] = None, pdfminer_line_overlap: Optional[float] = None, - pdfminer_word_margin: Optional[float] = None, + pdfminer_word_margin: Optional[float] = 1.85, **kwargs: Any, ) -> list[Element]: """Parses a pdf or image document into a list of interpreted elements.""" From 151105cd595c98ca9f966918bd5af25f2221a01e Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Wed, 12 Feb 2025 13:49:36 +0100 Subject: [PATCH 09/11] Add unit test --- .../pdf_image/test_pdfminer_processing.py | 20 +++++++++++++++++++ unstructured/partition/pdf.py | 3 ++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py index 501e6ced9d..5e4114fce8 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py @@ -1,5 +1,8 @@ +from unittest.mock import patch + import numpy as np import pytest +from pdfminer.layout import LAParams from PIL import Image from unstructured_inference.constants import Source as InferenceSource from unstructured_inference.inference.elements import ( @@ -11,6 +14,7 @@ from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout from test_unstructured.unit_utils import example_doc_path +from unstructured.partition.auto import partition from unstructured.partition.pdf_image.pdfminer_processing import ( _validate_bbox, aggregate_embedded_text_by_block, @@ -242,3 +246,19 @@ def test_process_file_with_pdfminer(): assert len(layout) assert "LayoutParser: A Unified Toolkit for Deep\n" in layout[0].texts assert links[0][0]["url"] == "https://layout-parser.github.io" + + +@patch("unstructured.partition.pdf_image.pdfminer_utils.LAParams", return_value=LAParams()) +def test_laprams_are_passed_from_partition_to_pdfminer(pdfminer_mock): + partition( + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), + pdfminer_line_margin=1.123, + pdfminer_char_margin=None, + pdfminer_line_overlap=0.0123, + pdfminer_word_margin=3.21, + ) + assert pdfminer_mock.call_args.kwargs == { + "line_margin": 1.123, + "line_overlap": 0.0123, + "word_margin": 3.21, + } diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 6cea2fcbf9..8c493f5123 100644 --- 
a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -307,6 +307,7 @@ def partition_pdf_or_image( metadata_last_modified=metadata_last_modified or last_modified, starting_page_number=starting_page_number, password=password, + pdfminer_config=pdfminer_config, **kwargs, ) pdf_text_extractable = any( @@ -428,7 +429,7 @@ def _partition_pdf_with_pdfminer( """Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster processing or detectron2 is not available. - Implementation is based on the `extract_text` implemenation in pdfminer.six, but + Implementation is based on the `extract_text` implementation in pdfminer.six, but modified to support tracking page numbers and working with file-like objects. ref: https://github.com/pdfminer/pdfminer.six/blob/master/pdfminer/high_level.py From cc005aafa36a2953b7902b44e62976967c37e61c Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Wed, 12 Feb 2025 17:37:50 +0100 Subject: [PATCH 10/11] Fix default param value --- unstructured/partition/pdf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 8c493f5123..0899c57dd6 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -149,7 +149,7 @@ def partition_pdf( pdfminer_line_margin: Optional[float] = None, pdfminer_char_margin: Optional[float] = None, pdfminer_line_overlap: Optional[float] = None, - pdfminer_word_margin: Optional[float] = 1.85, + pdfminer_word_margin: Optional[float] = 0.185, **kwargs: Any, ) -> list[Element]: """Parses a pdf document into a list of interpreted elements. @@ -272,7 +272,7 @@ def partition_pdf_or_image( pdfminer_line_margin: Optional[float] = None, pdfminer_char_margin: Optional[float] = None, pdfminer_line_overlap: Optional[float] = None, - pdfminer_word_margin: Optional[float] = 1.85, + pdfminer_word_margin: Optional[float] = 0.185, **kwargs: Any, ) -> list[Element]: """Parses a pdf or image document into a list of interpreted elements.""" From 6391d2aaec17e685f1d14c8fb30c3b76d89b8142 Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Wed, 12 Feb 2025 09:11:28 -0800 Subject: [PATCH 11/11] Feat: Add pdfminer parameters configuration <- Ingest test fixtures update (#3920) This pull request includes updated ingest test fixtures. Please review and merge if appropriate. 
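The fixture text updates below are consistent with the new default pdfminer_word_margin of 0.185 (pdfminer.six itself defaults to 0.1), which injects spaces between characters less eagerly. A rough, illustrative way to observe the effect locally; the file path is a placeholder:

from unstructured.partition.pdf import partition_pdf

path = "example-docs/pdf/layout-parser-paper-fast.pdf"  # any text-based PDF works
loose = partition_pdf(filename=path, strategy="fast", pdfminer_word_margin=0.1)
tight = partition_pdf(filename=path, strategy="fast", pdfminer_word_margin=0.185)
# With the wider margin, strings such as "Fig. 1" tend to come back as "Fig.1".
print(loose[0].text)
print(tight[0].text)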
Co-authored-by: plutasnyy --- .../biomed-api/65/11/main.PMC6312790.pdf.json | 4 +- .../biomed-api/75/29/main.PMC6312793.pdf.json | 32 ++++---- .../recalibrating-risk-report.pdf.json | 4 +- .../multi-column-2p.pdf.json | 8 +- .../layout-parser-paper.pdf.json | 40 +++++----- .../biomed-api/65/11/main.PMC6312790.pdf.json | 20 ++--- .../biomed-api/75/29/main.PMC6312793.pdf.json | 76 +++++++++---------- 7 files changed, 92 insertions(+), 92 deletions(-) diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json index 0cd04bffdc..6f6c30b2a8 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json @@ -513,7 +513,7 @@ "type": "Title" }, { - "element_id": "be270e13c935334fa3b17b13066d639b", + "element_id": "9764a7d0d48e56e28ae267d6fe521036", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -522,7 +522,7 @@ ], "page_number": 2 }, - "text": "The results of the experiment are presented in this session. The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs. 1–3 respectively. It can be seen clearly from these Figures that the efficiency of egg shell powder increase with the inhibitor con- centration, The increase in its efficiency could be as a result of increase in the constituent molecule", + "text": "The results of the experiment are presented in this session. The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs.1–3 respectively. It can be seen clearly from these Figures that the efficiency of egg shell powder increase with the inhibitor con- centration, The increase in its efficiency could be as a result of increase in the constituent molecule", "type": "NarrativeText" }, { diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json index 63b2ca0fb5..1fab6122c1 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json @@ -465,7 +465,7 @@ "type": "Title" }, { - "element_id": "0cc9334df550d1730f2d468941a38225", + "element_id": "02c4df0e110486afd2bd74245e7d93d9", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -474,14 +474,14 @@ ], "links": [ { - "start_index": 386, + "start_index": 383, "text": "https :// orlib . uqcloud . net /", "url": "https://orlib.uqcloud.net/" } ], "page_number": 2 }, - "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data Tables, text files How data were acquired Artificially generated by a C þ þ program on Intels Xeons CPU E5– 2670 v2 with Linux operating system. 
Data format Raw Experimental factors Sixty randomly generated instances of the MDVSP with the number of depots in (8, 12, 16) and the number of trips in (1500, 2000, 2500, 3000) Experimental features Randomly generated instances Data source location IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. Data accessibility Data can be downloaded from https://orlib.uqcloud.net/ Related research article Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457–487 [3].", + "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data Tables, text files How data were acquired Artificially generated by a Cþ þ program on Intels Xeons CPU E5– 2670 v2 with Linux operating system. Data format Raw Experimental factors Sixty randomly generated instances of the MDVSP with the number of depots in (8,12,16) and the number of trips in (1500, 2000, 2500, 3000) Experimental features Randomly generated instances Data source location IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. Data accessibility Data can be downloaded from https://orlib.uqcloud.net/ Related research article Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457–487 [3].", "type": "Table" }, { @@ -576,7 +576,7 @@ "type": "Title" }, { - "element_id": "683993fc4592941bf8b06173870aa63c", + "element_id": "1f3d79f338b86fbfcfa7054f11de28f0", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -585,14 +585,14 @@ ], "links": [ { - "start_index": 611, + "start_index": 609, "text": "https :// orlib . uqcloud . net", "url": "https://orlib.uqcloud.net" } ], "page_number": 2 }, - "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number for the size, ‘ðm; nÞ’, respectively. For example, the problem instance, ‘RN-8–1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, ðm; nÞ, five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net. For each problem instance, the following information is provided:", + "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number for the size, ‘ðm;nÞ’, respectively. For example, the problem instance, ‘RN-8–1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8,12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, ðm;nÞ, five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net. 
For each problem instance, the following information is provided:", "type": "NarrativeText" }, { @@ -661,7 +661,7 @@ "type": "UncategorizedText" }, { - "element_id": "96ca028aef61c1fd98c9f0232a833498", + "element_id": "39943e8e76f7ddd879284cf782cac2f4", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -670,7 +670,7 @@ ], "page_number": 2 }, - "text": "For each trip i A 1; 2; …; n, a start time, ts i , an end time, te i , a start location, ls i , and an end location, le i , and", + "text": "For each trip iA1;2;…;n, a start time, ts i, an end time, te i , a start location, ls i, and an end location, le i , and", "type": "NarrativeText" }, { @@ -726,7 +726,7 @@ "type": "NarrativeText" }, { - "element_id": "2bd550b209c7c06c42966aad21822ea5", + "element_id": "9698643b7f3d779d8a5fdb13dffef106", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -735,7 +735,7 @@ ], "page_number": 3 }, - "text": "and end location of the trip. A long trip is about 3–5 h in duration and has the same start and end location. For all instances, m r l and the locations 1; …; m correspond to depots, while the remaining locations only appear as trip start and end locations.", + "text": "and end location of the trip. A long trip is about 3–5 h in duration and has the same start and end location. For all instances, mrl and the locations 1;…;m correspond to depots, while the remaining locations only appear as trip start and end locations.", "type": "NarrativeText" }, { @@ -804,7 +804,7 @@ "type": "NarrativeText" }, { - "element_id": "9d3f44c51fe13ebdf6b9511859e4f1b7", + "element_id": "02146cfa4d68e86d868e99acab4f7c42", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -813,7 +813,7 @@ ], "page_number": 3 }, - "text": "For each instance size ðm; nÞ, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule.", + "text": "For each instance size ðm;nÞ, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. 
The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule.", "type": "NarrativeText" }, { @@ -830,7 +830,7 @@ "type": "NarrativeText" }, { - "element_id": "d9904b5393369c5204af83b64035802a", + "element_id": "fc4b1e0c5bb8b330e2160f6615975401", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -839,7 +839,7 @@ ], "page_number": 3 }, - "text": "The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots ðmÞ, the number of trips ðnÞ, and the number of instances for each size ðm; nÞ.", + "text": "The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots ðmÞ, the number of trips ðnÞ, and the number of instances for each size ðm;nÞ.", "type": "NarrativeText" }, { @@ -934,7 +934,7 @@ "type": "NarrativeText" }, { - "element_id": "17e17590003c0f514220c453f88da6b7", + "element_id": "86e18db80eab89d0556c22321732e4e7", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -943,7 +943,7 @@ ], "page_number": 4 }, - "text": "Number of Number of columns in Description lines each line 1 3 The number of depots, the number of trips, and the number of locations. 1 m The number of vehicles rd at each depot d. n 4 One line for each trip, i ¼ 1; 2; …; n. Each line provides the start location ls i , the start i , the end location le time ts i and the end time te i for the corresponding trip. l l Each element, δij; where i; j A 1; 2; …; l, refers to the travel time between location i and location j.", + "text": "Number of Number of columns in Description lines each line 1 3 The number of depots, the number of trips, and the number of locations. 1 m The number of vehicles rd at each depot d. n 4 One line for each trip, i ¼ 1;2;…;n. Each line provides the start location ls i, the start i, the end location le time ts i and the end time te i for the corresponding trip. 
l l Each element, δij; where i;jA1;2;…;l, refers to the travel time between location i and location j.", "type": "Table" }, { diff --git a/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json index ce75238d45..8c37cc5825 100644 --- a/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json @@ -1799,8 +1799,8 @@ }, { "type": "Image", - "element_id": "1b93c33208a85ba6d2a69d23babd6def", - "text": "25 24.6 20 18.4 e 15 10 5 4.6 2.8 0 C oal Oil Bio m ass N atural gas 0.07 Wind 0.04 H ydropo w er 0.02 S olar 0.01 N uclear ", + "element_id": "c0a86e51afb417a3b057d7cf101bbed6", + "text": "25 24.6 20 18.4 e 15 10 5 4.6 2.8 0 Coal Oil Bio m ass Natural gas 0.07 Wind 0.04 Hydropower 0.02 Solar 0.01 Nuclear ", "metadata": { "filetype": "application/pdf", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json index b07103abf1..b6516f791c 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json @@ -111,8 +111,8 @@ }, { "type": "CompositeElement", - "element_id": "43198ac980a699b3b17c5f229aee8656", - "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as \u201cWho \ufb01rst voiced Meg on Family Guy?\u201d or \u201cWhere was the 8th Dalai Lama born?\u201d, a system is required to answer it using a large corpus of diversi\ufb01ed topics. More speci\ufb01cally, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1, d2, \u00b7 \u00b7 \u00b7 , dD. We \ufb01rst split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1, p2, . . . , pM }, where each passage pi can be viewed as a sequence 2 , \u00b7 \u00b7 \u00b7 , w(i) 1 , w(i) of tokens w(i) |pi|. Given a question q, the task is to \ufb01nd a span w(i) s+1, \u00b7 \u00b7 \u00b7 , w(i) s , w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an ef\ufb01cient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q, C) \u2192 CF is a function that takes as input a question q and a corpus C and returns a much smaller \ufb01lter set of texts CF \u2282 C, where |CF | = k (cid:28) |C|. 
For a \ufb01xed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne", + "element_id": "e6dee1abec28f8ff365ab6275b3e5f0e", + "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as \u201cWho \ufb01rst voiced Meg on Family Guy?\u201d or \u201cWhere was the 8th Dalai Lama born?\u201d, a system is required to answer it using a large corpus of diversi\ufb01ed topics. More speci\ufb01cally, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,\u00b7\u00b7\u00b7 ,dD. We \ufb01rst split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,\u00b7\u00b7\u00b7 ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to \ufb01nd a span w(i) s+1,\u00b7\u00b7\u00b7 ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an ef\ufb01cient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) \u2192 CF is a function that takes as input a question q and a corpus C and returns a much smaller \ufb01lter set of texts CF \u2282 C, where |CF| = k (cid:28) |C|. For a \ufb01xed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne", "metadata": { "data_source": { "record_locator": { @@ -133,8 +133,8 @@ }, { "type": "CompositeElement", - "element_id": "82cfad702e5779169139f705fd0af5ee", - "text": "3 Dense Passage Retriever (DPR)\n\nWe focus our research in this work on improv- ing the retrieval component in open-domain QA. Given a collection of M text passages, the goal of our dense passage retriever (DPR) is to index all the passages in a low-dimensional and continuous space, such that it can retrieve ef\ufb01ciently the top k passages relevant to the input question for the reader at run-time. Note that M can be very large (e.g., 21 million passages in our experiments, de- scribed in Section 4.1) and k is usually small, such as 20\u2013100.\n\n3.1 Overview\n\nOur dense passage retriever (DPR) uses a dense encoder EP (\u00b7) which maps any text passage to a d- dimensional real-valued vectors and builds an index for all the M passages that we will use for retrieval.\n\n3The ideal size and boundary of a text passage are func- tions of both the retriever and reader. We also experimented with natural paragraphs in our preliminary trials and found that using \ufb01xed-length passages performs better in both retrieval and \ufb01nal QA accuracy, as observed by Wang et al. 
(2019).\n\n4Exceptions include (Seo et al., 2019) and (Roberts et al., 2020), which retrieves and generates the answers, respectively.", + "element_id": "ac6733a570cbdd5c8d48f8252b345b17", + "text": "3 Dense Passage Retriever (DPR)\n\nWe focus our research in this work on improv- ing the retrieval component in open-domain QA. Given a collection of M text passages, the goal of our dense passage retriever (DPR) is to index all the passages in a low-dimensional and continuous space, such that it can retrieve ef\ufb01ciently the top k passages relevant to the input question for the reader at run-time. Note that M can be very large (e.g., 21 million passages in our experiments, de- scribed in Section 4.1) and k is usually small, such as 20\u2013100.\n\n3.1 Overview\n\nOur dense passage retriever (DPR) uses a dense encoder EP(\u00b7) which maps any text passage to a d- dimensional real-valued vectors and builds an index for all the M passages that we will use for retrieval.\n\n3The ideal size and boundary of a text passage are func- tions of both the retriever and reader. We also experimented with natural paragraphs in our preliminary trials and found that using \ufb01xed-length passages performs better in both retrieval and \ufb01nal QA accuracy, as observed by Wang et al. (2019).\n\n4Exceptions include (Seo et al., 2019) and (Roberts et al., 2020), which retrieves and generates the answers, respectively.", "metadata": { "data_source": { "record_locator": { diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index 3e22f163fa..b9d9f35d17 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -1221,8 +1221,8 @@ }, { "type": "NarrativeText", - "element_id": "2172d9b276cd7a485dea4978805815d8", - "text": "Fig. 1: The overall architecture of LayoutParser. For an input document image, the core LayoutParser library provides a set of o\ufb00-the-shelf tools for layout detection, OCR, visualization, and storage, backed by a carefully designed layout data structure. LayoutParser also supports high level customization via e\ufb03cient layout annotation and model training functions. These improve model accuracy on the target samples. The community platform enables the easy sharing of DIA models and whole digitization pipelines to promote reusability and reproducibility. A collection of detailed documentation, tutorials and exemplar projects make LayoutParser easy to learn and use.", + "element_id": "466f0bc21599ccf0fa27c021cb023f90", + "text": "Fig.1: The overall architecture of LayoutParser. For an input document image, the core LayoutParser library provides a set of o\ufb00-the-shelf tools for layout detection, OCR, visualization, and storage, backed by a carefully designed layout data structure. LayoutParser also supports high level customization via e\ufb03cient layout annotation and model training functions. These improve model accuracy on the target samples. The community platform enables the easy sharing of DIA models and whole digitization pipelines to promote reusability and reproducibility. 
A collection of detailed documentation, tutorials and exemplar projects make LayoutParser easy to learn and use.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1623,8 +1623,8 @@ }, { "type": "NarrativeText", - "element_id": "c2af717e76ad68bd6da87a15a69f126a", - "text": "2 image = cv2 . imread ( \" image_file \" ) # load images 3 model = lp . De t e c tro n2 Lay outM odel ( 4 \" lp :// PubLayNet / f as t er _ r c nn _ R _ 50 _ F P N_ 3 x / config \" )", + "element_id": "7d55b80ca5a0c2888ff44b931430b0d8", + "text": "2 image = cv2.imread(\"image_file\") # load images 3 model = lp.Detectron2LayoutModel( 4 \"lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config\")", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1645,8 +1645,8 @@ }, { "type": "ListItem", - "element_id": "a002e13c7ea2613b2eabb9ea3501856d", - "text": "3 model = lp . De t e c tro n2 Lay outM odel (", + "element_id": "f30541418a7dca51e3e4cd880486ab9c", + "text": "3 model = lp.Detectron2LayoutModel(", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1667,8 +1667,8 @@ }, { "type": "ListItem", - "element_id": "366c05fd7babc86bf01d690b9df755da", - "text": "5 layout = model . detect ( image )", + "element_id": "ecaf88c55d275f8fdc8c25e2d919077f", + "text": "5 layout = model.detect(image)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1784,8 +1784,8 @@ }, { "type": "FigureCaption", - "element_id": "c2a2a4a054151d16820f38e115ce7a72", - "text": "Fig. 2: The relationship between the three types of layout data structures. Coordinate supports three kinds of variation; TextBlock consists of the co- ordinate information and extra features like block text, types, and reading orders; a Layout object is a list of all possible layout elements, including other Layout objects. They all support the same set of transformation and operation APIs for maximum \ufb02exibility.", + "element_id": "9f11aa6b22dea1bba7eb0d122c0c5562", + "text": "Fig.2: The relationship between the three types of layout data structures. Coordinate supports three kinds of variation; TextBlock consists of the co- ordinate information and extra features like block text, types, and reading orders; a Layout object is a list of all possible layout elements, including other Layout objects. They all support the same set of transformation and operation APIs for maximum \ufb02exibility.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2006,8 +2006,8 @@ }, { "type": "NarrativeText", - "element_id": "373a9a67f855ba5b79bdc1393d2f1ce9", - "text": "1 ocr_agent = lp . TesseractAgent () 2 # Can be easily switched to other OCR software 3 tokens = ocr_agent . detect ( image )", + "element_id": "2e605dfb574532cf2ab54ded080a2ab9", + "text": "1 ocr_agent = lp.TesseractAgent() 2 # Can be easily switched to other OCR software 3 tokens = ocr_agent.detect(image)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2378,8 +2378,8 @@ }, { "type": "NarrativeText", - "element_id": "fadd4ad54cd14e3e4711d41a1c99f813", - "text": "Fig. 3: Layout detection and OCR results visualization generated by the LayoutParser APIs. Mode I directly overlays the layout region bounding boxes and categories over the original image. Mode II recreates the original document via drawing the OCR\u2019d texts at their corresponding positions on the image canvas. 
In this \ufb01gure, tokens in textual regions are \ufb01ltered using the API and then displayed.", + "element_id": "4d1b9566e792683b9559b778be4f4046", + "text": "Fig.3: Layout detection and OCR results visualization generated by the LayoutParser APIs. Mode I directly overlays the layout region bounding boxes and categories over the original image. Mode II recreates the original document via drawing the OCR\u2019d texts at their corresponding positions on the image canvas. In this \ufb01gure, tokens in textual regions are \ufb01ltered using the API and then displayed.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2529,8 +2529,8 @@ }, { "type": "NarrativeText", - "element_id": "ebbb8c84b2a69f817c8ae7df20d72dd9", - "text": "Fig. 4: Illustration of (a) the original historical Japanese document with layout detection results and (b) a recreated version of the document image that achieves much better character recognition recall. The reorganization algorithm rearranges the tokens based on the their detected bounding boxes given a maximum allowed height.", + "element_id": "9667b0e42f9d28607c7c13bffb760906", + "text": "Fig.4: Illustration of (a) the original historical Japanese document with layout detection results and (b) a recreated version of the document image that achieves much better character recognition recall. The reorganization algorithm rearranges the tokens based on the their detected bounding boxes given a maximum allowed height.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2841,8 +2841,8 @@ }, { "type": "NarrativeText", - "element_id": "3c1fd89a3436d3cedb4d22d297c76437", - "text": "Fig. 5: Illustration of how LayoutParser helps with the historical document digi- tization pipeline.", + "element_id": "80291b42f1785935496188bb52788288", + "text": "Fig.5: Illustration of how LayoutParser helps with the historical document digi- tization pipeline.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3141,8 +3141,8 @@ }, { "type": "FigureCaption", - "element_id": "7e685908875164adafa447ec3d97455e", - "text": "Fig. 6: This lightweight table detector can identify tables (outlined in red) and cells (shaded in blue) in di\ufb00erent locations on a page. In very few cases (d), it might generate minor error predictions, e.g, failing to capture the top text line of a table.", + "element_id": "d35d253341e8b8d837f384ecd6ac410a", + "text": "Fig.6: This lightweight table detector can identify tables (outlined in red) and cells (shaded in blue) in di\ufb00erent locations on a page. In very few cases (d), it might generate minor error predictions, e.g, failing to capture the top text line of a table.", "metadata": { "filetype": "application/pdf", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json index 6264f96a86..fdb1b1ff86 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json @@ -794,8 +794,8 @@ }, { "type": "NarrativeText", - "element_id": "732dc7fa0795c651041c10c2d318a8ae", - "text": "The results of the experiment are presented in this session. 
The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs. 1\u20133 respectively. It can be seen clearly from these Figures that the ef\ufb01ciency of egg shell powder increase with the inhibitor con- centration, The increase in its ef\ufb01ciency could be as a result of increase in the constituent molecule", + "element_id": "28d5b195997810a34c2aa96c9f357de2", + "text": "The results of the experiment are presented in this session. The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs.1\u20133 respectively. It can be seen clearly from these Figures that the ef\ufb01ciency of egg shell powder increase with the inhibitor con- centration, The increase in its ef\ufb01ciency could be as a result of increase in the constituent molecule", "metadata": { "languages": [ "eng" @@ -2598,8 +2598,8 @@ }, { "type": "UncategorizedText", - "element_id": "6fcf2a276d4b2d81f991b4eb6f04009a", - "text": "(cid:3) 0.9393 (cid:3) 0.8276 (cid:3) 0.8825 (cid:3) 0.8027 (cid:3) 0.5896 (cid:3) 0.5356", + "element_id": "a0aa9bf2a48ed1dff882a16cb320c616", + "text": "(cid:3)0.9393 (cid:3)0.8276 (cid:3)0.8825 (cid:3)0.8027 (cid:3)0.5896 (cid:3)0.5356", "metadata": { "languages": [ "eng" @@ -3434,8 +3434,8 @@ }, { "type": "Title", - "element_id": "d269706e81c2b5978ae0b5c820ce176a", - "text": "\u03b8 \u00bc CRo (cid:3) CR", + "element_id": "543caecd15c161082076a174ea946782", + "text": "\u03b8 \u00bc CRo(cid:3)CR", "metadata": { "languages": [ "eng" @@ -3478,8 +3478,8 @@ }, { "type": "Title", - "element_id": "d48a9ee64508de2e63b2f4579ef78432", - "text": "IE \u00f0%\u00de \u00bc CRo (cid:3) CR", + "element_id": "59a609931ac8f9c55855113bfae6655e", + "text": "IE \u00f0%\u00de \u00bc CRo(cid:3)CR", "metadata": { "languages": [ "eng" @@ -3720,8 +3720,8 @@ }, { "type": "NarrativeText", - "element_id": "a8d445f830ed31990875a519f4be0eb5", - "text": "steps of the linear polarization plot are substituted to get corrosion current. Nova software was used with linear polarization resistance (LPR) and the current was set to 10 mA (maximum) and 10 nA (minimum). LSV staircase parameter start potential (cid:3) 1.5 v, step potential 0.001 m/s and stop potential of \u00fe1.5 v set was used in this study.", + "element_id": "ac11629522e563b6a0a8f261ab4b94e0", + "text": "steps of the linear polarization plot are substituted to get corrosion current. Nova software was used with linear polarization resistance (LPR) and the current was set to 10 mA (maximum) and 10 nA (minimum). 
LSV staircase parameter start potential (cid:3)1.5 v, step potential 0.001 m/s and stop potential of \u00fe1.5 v set was used in this study.", "metadata": { "languages": [ "eng" diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json index 26955e33e1..908e9e125a 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json @@ -503,8 +503,8 @@ }, { "type": "NarrativeText", - "element_id": "d21722fd648aed04c8119948bf24b400", - "text": "Tables, text \ufb01les Arti\ufb01cially generated by a C \u00fe \u00fe program on Intels Xeons CPU E5\u2013 2670 v2 with Linux operating system. Raw Sixty randomly generated instances of the MDVSP with the number of depots in (8, 12, 16) and the number of trips in (1500, 2000, 2500, 3000) Randomly generated instances IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. Data can be downloaded from https://orlib.uqcloud.net/ Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457\u2013487 [3].", + "element_id": "d2073c6354217f9b2d4d5c654d77f232", + "text": "Tables, text \ufb01les Arti\ufb01cially generated by a C\u00fe \u00fe program on Intels Xeons CPU E5\u2013 2670 v2 with Linux operating system. Raw Sixty randomly generated instances of the MDVSP with the number of depots in (8,12,16) and the number of trips in (1500, 2000, 2500, 3000) Randomly generated instances IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. Data can be downloaded from https://orlib.uqcloud.net/ Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457\u2013487 [3].", "metadata": { "languages": [ "eng" @@ -513,7 +513,7 @@ { "text": "https :// orlib . uqcloud . net /", "url": "https://orlib.uqcloud.net/", - "start_index": 386 + "start_index": 383 } ], "page_number": 2, @@ -774,8 +774,8 @@ }, { "type": "NarrativeText", - "element_id": "96589dd8025c674caf26c856ea689d4e", - "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate \ufb01le. Each \ufb01le is named as \u2018RN-m-n-k.dat\u2019, where \u2018m\u2019, \u2018n\u2019, and \u2018k\u2019 denote the number of depots, the number of trips, and the instance number \u2018RN-8\u20131500-01.dat\u2019, for is the \ufb01rst problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, \u00f0m; n\u00de, \ufb01ve instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net.", + "element_id": "52c2b4b09c228b90a487fa4fd42a1590", + "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate \ufb01le. 
Each \ufb01le is named as \u2018RN-m-n-k.dat\u2019, where \u2018m\u2019, \u2018n\u2019, and \u2018k\u2019 denote the number of depots, the number of trips, and the instance number \u2018RN-8\u20131500-01.dat\u2019, for is the \ufb01rst problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8,12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, \u00f0m;n\u00de, \ufb01ve instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net.", "metadata": { "languages": [ "eng" @@ -784,7 +784,7 @@ { "text": "https :// orlib . uqcloud . net", "url": "https://orlib.uqcloud.net", - "start_index": 611 + "start_index": 609 } ], "page_number": 2, @@ -803,8 +803,8 @@ }, { "type": "UncategorizedText", - "element_id": "97686fd4b810190336f3a3f4debb4c5d", - "text": "\u2018\u00f0m; n\u00de\u2019,", + "element_id": "a442f6b8548f2b2be7eb0b0c488eaf3f", + "text": "\u2018\u00f0m;n\u00de\u2019,", "metadata": { "languages": [ "eng" @@ -891,8 +891,8 @@ }, { "type": "NarrativeText", - "element_id": "275e61db64667898a0ec65d6cbbff69b", - "text": "For each problem instance, the following information is provided: The number of depots m\u00f0 The number of trips \u00f0n\u00de, The number of locations \u00f0l\u00de, The number of vehicles at each depot, For each trip i A 1; 2; \u2026; n, a start time, ts", + "element_id": "20a5ace34ab61e08b1ab35c222c6554f", + "text": "For each problem instance, the following information is provided: The number of depots m\u00f0 The number of trips \u00f0n\u00de, The number of locations \u00f0l\u00de, The number of vehicles at each depot, For each trip iA1;2;\u2026;n, a start time, ts", "metadata": { "languages": [ "eng" @@ -957,8 +957,8 @@ }, { "type": "Title", - "element_id": "3c0009859c6faa133b3e59b1b5c42c5b", - "text": "i , an end time, te", + "element_id": "812eeb4f274baf14170f2447204a4a55", + "text": "i, an end time, te", "metadata": { "languages": [ "eng" @@ -979,8 +979,8 @@ }, { "type": "UncategorizedText", - "element_id": "4f3baeb46b82b7cb0acec9e6b9ac9787", - "text": "i , and an end location, le i ,", + "element_id": "4b917219b5939da4a52a907db733f551", + "text": "i, and an end location, le i ,", "metadata": { "languages": [ "eng" @@ -1023,8 +1023,8 @@ }, { "type": "NarrativeText", - "element_id": "9e7301ebb3fd5cbe1410901ea78c02db", - "text": "(cid:2) The travel time, \u03b4ij, between any two locations i; j A 1; \u2026; l.", + "element_id": "b1bb94d45fba27ddeefd146fbde1dcc4", + "text": "(cid:2) The travel time, \u03b4ij, between any two locations i;jA1;\u2026;l.", "metadata": { "languages": [ "eng" @@ -1111,8 +1111,8 @@ }, { "type": "NarrativeText", - "element_id": "a3a97226d270316d06712c89f7ff489d", - "text": "and end location of the trip. A long trip is about 3\u20135 h in duration and has the same start and end location. For all instances, m r l and the locations 1; \u2026; m correspond to depots, while the remaining locations only appear as trip start and end locations.", + "element_id": "eeba8dd874b520a36aa718db99dbfd38", + "text": "and end location of the trip. A long trip is about 3\u20135 h in duration and has the same start and end location. For all instances, mrl and the locations 1;\u2026;m correspond to depots, while the remaining locations only appear as trip start and end locations.", "metadata": { "languages": [ "eng" @@ -1155,8 +1155,8 @@ }, { "type": "NarrativeText", - "element_id": "51071653fbb405a5c84831cbacc6c618", - "text": ". 
If le i ls le i j , otherwise, the vehicle may require waiting at le i for the duration of \u00f0ts", + "element_id": "c4a028a7e5a91a69b88a778ed1d4c4c1", + "text": ". If le i ls le i j, otherwise, the vehicle may require waiting at le i for the duration of \u00f0ts", "metadata": { "languages": [ "eng" @@ -1177,8 +1177,8 @@ }, { "type": "Title", - "element_id": "edff69ec864e554eb9aee86908ecac9c", - "text": "Z te", + "element_id": "3351f34f87afe9cffe4fd31320b9ccc8", + "text": "Zte", "metadata": { "languages": [ "eng" @@ -1199,8 +1199,8 @@ }, { "type": "Title", - "element_id": "f038d089ae51f445f96217852ae9c670", - "text": "a ls", + "element_id": "7a378649c353830c59db2e86df7f7368", + "text": "als", "metadata": { "languages": [ "eng" @@ -1243,8 +1243,8 @@ }, { "type": "NarrativeText", - "element_id": "d3b130ec44c8f5b0865012570fe82fd0", - "text": "j , the vehicle must travel empty from le j (cid:3)te i \u00de. A schedule is given by the sequence in which a vehicle can cover the trips. The MDVSP is to determine the minimum number of schedules to cover all trips that minimizes total time in waiting and empty travel. The following requirements must be satis\ufb01ed:", + "element_id": "f7296ef349382c5db6f8a271d8f3fe03", + "text": "j, the vehicle must travel empty from le j (cid:3)te i \u00de. A schedule is given by the sequence in which a vehicle can cover the trips. The MDVSP is to determine the minimum number of schedules to cover all trips that minimizes total time in waiting and empty travel. The following requirements must be satis\ufb01ed:", "metadata": { "languages": [ "eng" @@ -1331,8 +1331,8 @@ }, { "type": "NarrativeText", - "element_id": "80d7ee3f1337fffbcb42c78e218d8aad", - "text": "A suf\ufb01cient number of vehicles are provided to maintain the feasibility of an instance. For each instance size \u00f0m; n\u00de, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over \ufb01ve instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule.", + "element_id": "3dbb489d8594d6744d2fce9cdcde691c", + "text": "A suf\ufb01cient number of vehicles are provided to maintain the feasibility of an instance. For each instance size \u00f0m;n\u00de, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over \ufb01ve instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. 
The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule.", "metadata": { "languages": [ "eng" @@ -1353,8 +1353,8 @@ }, { "type": "NarrativeText", - "element_id": "2c71b28268ae79e366c8190e28761e31", - "text": "The description of the \ufb01le for each problem instance is presented in Table 2. The \ufb01rst line in the \ufb01le provides the number of depots \u00f0m\u00de, the number of trips, \u00f0n\u00de, and the number of locations \u00f0l\u00de, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, i A 1; \u2026; n g, and provides the start location, the start time, the end location, and the end time of trip i. The next l lines present the travel times between any two locations, i; jA 1; \u2026; l", + "element_id": "7490a379155c95007ad9649ec7689e35", + "text": "The description of the \ufb01le for each problem instance is presented in Table 2. The \ufb01rst line in the \ufb01le provides the number of depots \u00f0m\u00de, the number of trips, \u00f0n\u00de, and the number of locations \u00f0l\u00de, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, iA 1;\u2026;n g, and provides the start location, the start time, the end location, and the end time of trip i. The next l lines present the travel times between any two locations, i;jA 1;\u2026;l", "metadata": { "languages": [ "eng" @@ -1463,8 +1463,8 @@ }, { "type": "NarrativeText", - "element_id": "7c74ad0f1b0edb685ba951da2a788af8", - "text": "The dataset also includes a program \u2018GenerateInstance.cpp\u2019 that can be used to generate new instances. The program takes three inputs, the number of depots \u00f0m\u00de, the number of trips \u00f0n\u00de, and the number of instances for each size \u00f0m; n\u00de.", + "element_id": "0b37e732b73efa9dbd994f164dac8d5c", + "text": "The dataset also includes a program \u2018GenerateInstance.cpp\u2019 that can be used to generate new instances. The program takes three inputs, the number of depots \u00f0m\u00de, the number of trips \u00f0n\u00de, and the number of instances for each size \u00f0m;n\u00de.", "metadata": { "languages": [ "eng" @@ -1947,8 +1947,8 @@ }, { "type": "NarrativeText", - "element_id": "ebd5a6aeac91e0f42fecb980ef4a648a", - "text": "The number of depots, the number of trips, and the number of locations. The number of vehicles rd at each depot d. One line for each trip, i \u00bc 1; 2; \u2026; n. Each line provides the start location ls time ts i and the end time te i for the corresponding trip. Each element, \u03b4ij; where i; j A 1; 2; \u2026; l, refers to the travel time between location i and location j.", + "element_id": "c981c256386d57e68a2c947147f30229", + "text": "The number of depots, the number of trips, and the number of locations. The number of vehicles rd at each depot d. One line for each trip, i \u00bc 1;2;\u2026;n. Each line provides the start location ls time ts i and the end time te i for the corresponding trip. 
Each element, \u03b4ij; where i;jA1;2;\u2026;l, refers to the travel time between location i and location j.", "metadata": { "languages": [ "eng" @@ -1969,8 +1969,8 @@ }, { "type": "Title", - "element_id": "50fb8c466c52d5ae755055ffc24a418d", - "text": "i , the start", + "element_id": "e6e8997790263be5ca103754ee56e234", + "text": "i, the start", "metadata": { "languages": [ "eng" @@ -1991,8 +1991,8 @@ }, { "type": "Title", - "element_id": "44a4c21af61b74e9f30be3112d9eb1e7", - "text": "i , the end location le", + "element_id": "49f536ed0f91f7e6d8ad1d70d71991b0", + "text": "i, the end location le", "metadata": { "languages": [ "eng"