diff --git a/CHANGELOG.md b/CHANGELOG.md index 554ef679da..cb78293e58 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,12 @@ -## 0.16.21-dev4 +## 0.16.21-dev5 ### Enhancements - **Use password** to load PDF with all modes - **use vectorized logic to merge inferred and extracted layouts**. Using the new `LayoutElements` data structure and numpy library to refactor the layout merging logic to improve compute performance as well as making logic more clear +- **Add PDF Miner configuration** Now PDF Miner can be configured via `pdfminer_line_overlap`, `pdfminer_word_margin`, `pdfminer_line_margin` and `pdfminer_char_margin` parameters added to partition method. + ### Features ### Fixes diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py index 501e6ced9d..5e4114fce8 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py @@ -1,5 +1,8 @@ +from unittest.mock import patch + import numpy as np import pytest +from pdfminer.layout import LAParams from PIL import Image from unstructured_inference.constants import Source as InferenceSource from unstructured_inference.inference.elements import ( @@ -11,6 +14,7 @@ from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout from test_unstructured.unit_utils import example_doc_path +from unstructured.partition.auto import partition from unstructured.partition.pdf_image.pdfminer_processing import ( _validate_bbox, aggregate_embedded_text_by_block, @@ -242,3 +246,19 @@ def test_process_file_with_pdfminer(): assert len(layout) assert "LayoutParser: A Unified Toolkit for Deep\n" in layout[0].texts assert links[0][0]["url"] == "https://layout-parser.github.io" + + +@patch("unstructured.partition.pdf_image.pdfminer_utils.LAParams", return_value=LAParams()) +def 
test_laparams_are_passed_from_partition_to_pdfminer(pdfminer_mock): + partition( + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), + pdfminer_line_margin=1.123, + pdfminer_char_margin=None, + pdfminer_line_overlap=0.0123, + pdfminer_word_margin=3.21, + ) + assert pdfminer_mock.call_args.kwargs == { + "line_margin": 1.123, + "line_overlap": 0.0123, + "word_margin": 3.21, + } diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json index 0cd04bffdc..6f6c30b2a8 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json @@ -513,7 +513,7 @@ "type": "Title" }, { - "element_id": "be270e13c935334fa3b17b13066d639b", + "element_id": "9764a7d0d48e56e28ae267d6fe521036", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -522,7 +522,7 @@ ], "page_number": 2 }, - "text": "The results of the experiment are presented in this session. The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs. 1–3 respectively. It can be seen clearly from these Figures that the efficiency of egg shell powder increase with the inhibitor con- centration, The increase in its efficiency could be as a result of increase in the constituent molecule", + "text": "The results of the experiment are presented in this session. The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs.1–3 respectively. 
It can be seen clearly from these Figures that the efficiency of egg shell powder increase with the inhibitor con- centration, The increase in its efficiency could be as a result of increase in the constituent molecule", "type": "NarrativeText" }, { diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json index 63b2ca0fb5..1fab6122c1 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json @@ -465,7 +465,7 @@ "type": "Title" }, { - "element_id": "0cc9334df550d1730f2d468941a38225", + "element_id": "02c4df0e110486afd2bd74245e7d93d9", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -474,14 +474,14 @@ ], "links": [ { - "start_index": 386, + "start_index": 383, "text": "https :// orlib . uqcloud . net /", "url": "https://orlib.uqcloud.net/" } ], "page_number": 2 }, - "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data Tables, text files How data were acquired Artificially generated by a C þ þ program on Intels Xeons CPU E5– 2670 v2 with Linux operating system. Data format Raw Experimental factors Sixty randomly generated instances of the MDVSP with the number of depots in (8, 12, 16) and the number of trips in (1500, 2000, 2500, 3000) Experimental features Randomly generated instances Data source location IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. Data accessibility Data can be downloaded from https://orlib.uqcloud.net/ Related research article Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 
457–487 [3].", + "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data Tables, text files How data were acquired Artificially generated by a Cþ þ program on Intels Xeons CPU E5– 2670 v2 with Linux operating system. Data format Raw Experimental factors Sixty randomly generated instances of the MDVSP with the number of depots in (8,12,16) and the number of trips in (1500, 2000, 2500, 3000) Experimental features Randomly generated instances Data source location IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. Data accessibility Data can be downloaded from https://orlib.uqcloud.net/ Related research article Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457–487 [3].", "type": "Table" }, { @@ -576,7 +576,7 @@ "type": "Title" }, { - "element_id": "683993fc4592941bf8b06173870aa63c", + "element_id": "1f3d79f338b86fbfcfa7054f11de28f0", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -585,14 +585,14 @@ ], "links": [ { - "start_index": 611, + "start_index": 609, "text": "https :// orlib . uqcloud . net", "url": "https://orlib.uqcloud.net" } ], "page_number": 2 }, - "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number for the size, ‘ðm; nÞ’, respectively. For example, the problem instance, ‘RN-8–1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. 
For each size, ðm; nÞ, five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net. For each problem instance, the following information is provided:", + "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number for the size, ‘ðm;nÞ’, respectively. For example, the problem instance, ‘RN-8–1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8,12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, ðm;nÞ, five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net. For each problem instance, the following information is provided:", "type": "NarrativeText" }, { @@ -661,7 +661,7 @@ "type": "UncategorizedText" }, { - "element_id": "96ca028aef61c1fd98c9f0232a833498", + "element_id": "39943e8e76f7ddd879284cf782cac2f4", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -670,7 +670,7 @@ ], "page_number": 2 }, - "text": "For each trip i A 1; 2; …; n, a start time, ts i , an end time, te i , a start location, ls i , and an end location, le i , and", + "text": "For each trip iA1;2;…;n, a start time, ts i, an end time, te i , a start location, ls i, and an end location, le i , and", "type": "NarrativeText" }, { @@ -726,7 +726,7 @@ "type": "NarrativeText" }, { - "element_id": "2bd550b209c7c06c42966aad21822ea5", + "element_id": "9698643b7f3d779d8a5fdb13dffef106", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -735,7 +735,7 @@ ], "page_number": 3 }, - "text": "and end location of the trip. A long trip is about 3–5 h in duration and has the same start and end location. 
For all instances, m r l and the locations 1; …; m correspond to depots, while the remaining locations only appear as trip start and end locations.", + "text": "and end location of the trip. A long trip is about 3–5 h in duration and has the same start and end location. For all instances, mrl and the locations 1;…;m correspond to depots, while the remaining locations only appear as trip start and end locations.", "type": "NarrativeText" }, { @@ -804,7 +804,7 @@ "type": "NarrativeText" }, { - "element_id": "9d3f44c51fe13ebdf6b9511859e4f1b7", + "element_id": "02146cfa4d68e86d868e99acab4f7c42", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -813,7 +813,7 @@ ], "page_number": 3 }, - "text": "For each instance size ðm; nÞ, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule.", + "text": "For each instance size ðm;nÞ, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. 
The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule.", "type": "NarrativeText" }, { @@ -830,7 +830,7 @@ "type": "NarrativeText" }, { - "element_id": "d9904b5393369c5204af83b64035802a", + "element_id": "fc4b1e0c5bb8b330e2160f6615975401", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -839,7 +839,7 @@ ], "page_number": 3 }, - "text": "The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots ðmÞ, the number of trips ðnÞ, and the number of instances for each size ðm; nÞ.", + "text": "The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots ðmÞ, the number of trips ðnÞ, and the number of instances for each size ðm;nÞ.", "type": "NarrativeText" }, { @@ -934,7 +934,7 @@ "type": "NarrativeText" }, { - "element_id": "17e17590003c0f514220c453f88da6b7", + "element_id": "86e18db80eab89d0556c22321732e4e7", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -943,7 +943,7 @@ ], "page_number": 4 }, - "text": "Number of Number of columns in Description lines each line 1 3 The number of depots, the number of trips, and the number of locations. 1 m The number of vehicles rd at each depot d. n 4 One line for each trip, i ¼ 1; 2; …; n. Each line provides the start location ls i , the start i , the end location le time ts i and the end time te i for the corresponding trip. l l Each element, δij; where i; j A 1; 2; …; l, refers to the travel time between location i and location j.", + "text": "Number of Number of columns in Description lines each line 1 3 The number of depots, the number of trips, and the number of locations. 1 m The number of vehicles rd at each depot d. n 4 One line for each trip, i ¼ 1;2;…;n. 
Each line provides the start location ls i, the start i, the end location le time ts i and the end time te i for the corresponding trip. l l Each element, δij; where i;jA1;2;…;l, refers to the travel time between location i and location j.", "type": "Table" }, { diff --git a/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json index ce75238d45..8c37cc5825 100644 --- a/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json @@ -1799,8 +1799,8 @@ }, { "type": "Image", - "element_id": "1b93c33208a85ba6d2a69d23babd6def", - "text": "25 24.6 20 18.4 e 15 10 5 4.6 2.8 0 C oal Oil Bio m ass N atural gas 0.07 Wind 0.04 H ydropo w er 0.02 S olar 0.01 N uclear ", + "element_id": "c0a86e51afb417a3b057d7cf101bbed6", + "text": "25 24.6 20 18.4 e 15 10 5 4.6 2.8 0 Coal Oil Bio m ass Natural gas 0.07 Wind 0.04 Hydropower 0.02 Solar 0.01 Nuclear ", "metadata": { "filetype": "application/pdf", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json index b07103abf1..b6516f791c 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json @@ -111,8 +111,8 @@ }, { "type": "CompositeElement", - "element_id": "43198ac980a699b3b17c5f229aee8656", - "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. 
Given a factoid question, such as \u201cWho \ufb01rst voiced Meg on Family Guy?\u201d or \u201cWhere was the 8th Dalai Lama born?\u201d, a system is required to answer it using a large corpus of diversi\ufb01ed topics. More speci\ufb01cally, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1, d2, \u00b7 \u00b7 \u00b7 , dD. We \ufb01rst split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1, p2, . . . , pM }, where each passage pi can be viewed as a sequence 2 , \u00b7 \u00b7 \u00b7 , w(i) 1 , w(i) of tokens w(i) |pi|. Given a question q, the task is to \ufb01nd a span w(i) s+1, \u00b7 \u00b7 \u00b7 , w(i) s , w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an ef\ufb01cient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q, C) \u2192 CF is a function that takes as input a question q and a corpus C and returns a much smaller \ufb01lter set of texts CF \u2282 C, where |CF | = k (cid:28) |C|. For a \ufb01xed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne", + "element_id": "e6dee1abec28f8ff365ab6275b3e5f0e", + "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. 
Given a factoid question, such as \u201cWho \ufb01rst voiced Meg on Family Guy?\u201d or \u201cWhere was the 8th Dalai Lama born?\u201d, a system is required to answer it using a large corpus of diversi\ufb01ed topics. More speci\ufb01cally, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,\u00b7\u00b7\u00b7 ,dD. We \ufb01rst split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,\u00b7\u00b7\u00b7 ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to \ufb01nd a span w(i) s+1,\u00b7\u00b7\u00b7 ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an ef\ufb01cient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) \u2192 CF is a function that takes as input a question q and a corpus C and returns a much smaller \ufb01lter set of texts CF \u2282 C, where |CF| = k (cid:28) |C|. For a \ufb01xed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne", "metadata": { "data_source": { "record_locator": { @@ -133,8 +133,8 @@ }, { "type": "CompositeElement", - "element_id": "82cfad702e5779169139f705fd0af5ee", - "text": "3 Dense Passage Retriever (DPR)\n\nWe focus our research in this work on improv- ing the retrieval component in open-domain QA. 
Given a collection of M text passages, the goal of our dense passage retriever (DPR) is to index all the passages in a low-dimensional and continuous space, such that it can retrieve ef\ufb01ciently the top k passages relevant to the input question for the reader at run-time. Note that M can be very large (e.g., 21 million passages in our experiments, de- scribed in Section 4.1) and k is usually small, such as 20\u2013100.\n\n3.1 Overview\n\nOur dense passage retriever (DPR) uses a dense encoder EP (\u00b7) which maps any text passage to a d- dimensional real-valued vectors and builds an index for all the M passages that we will use for retrieval.\n\n3The ideal size and boundary of a text passage are func- tions of both the retriever and reader. We also experimented with natural paragraphs in our preliminary trials and found that using \ufb01xed-length passages performs better in both retrieval and \ufb01nal QA accuracy, as observed by Wang et al. (2019).\n\n4Exceptions include (Seo et al., 2019) and (Roberts et al., 2020), which retrieves and generates the answers, respectively.", + "element_id": "ac6733a570cbdd5c8d48f8252b345b17", + "text": "3 Dense Passage Retriever (DPR)\n\nWe focus our research in this work on improv- ing the retrieval component in open-domain QA. Given a collection of M text passages, the goal of our dense passage retriever (DPR) is to index all the passages in a low-dimensional and continuous space, such that it can retrieve ef\ufb01ciently the top k passages relevant to the input question for the reader at run-time. 
Note that M can be very large (e.g., 21 million passages in our experiments, de- scribed in Section 4.1) and k is usually small, such as 20\u2013100.\n\n3.1 Overview\n\nOur dense passage retriever (DPR) uses a dense encoder EP(\u00b7) which maps any text passage to a d- dimensional real-valued vectors and builds an index for all the M passages that we will use for retrieval.\n\n3The ideal size and boundary of a text passage are func- tions of both the retriever and reader. We also experimented with natural paragraphs in our preliminary trials and found that using \ufb01xed-length passages performs better in both retrieval and \ufb01nal QA accuracy, as observed by Wang et al. (2019).\n\n4Exceptions include (Seo et al., 2019) and (Roberts et al., 2020), which retrieves and generates the answers, respectively.", "metadata": { "data_source": { "record_locator": { diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index 3e22f163fa..b9d9f35d17 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -1221,8 +1221,8 @@ }, { "type": "NarrativeText", - "element_id": "2172d9b276cd7a485dea4978805815d8", - "text": "Fig. 1: The overall architecture of LayoutParser. For an input document image, the core LayoutParser library provides a set of o\ufb00-the-shelf tools for layout detection, OCR, visualization, and storage, backed by a carefully designed layout data structure. LayoutParser also supports high level customization via e\ufb03cient layout annotation and model training functions. These improve model accuracy on the target samples. 
The community platform enables the easy sharing of DIA models and whole digitization pipelines to promote reusability and reproducibility. A collection of detailed documentation, tutorials and exemplar projects make LayoutParser easy to learn and use.", + "element_id": "466f0bc21599ccf0fa27c021cb023f90", + "text": "Fig.1: The overall architecture of LayoutParser. For an input document image, the core LayoutParser library provides a set of o\ufb00-the-shelf tools for layout detection, OCR, visualization, and storage, backed by a carefully designed layout data structure. LayoutParser also supports high level customization via e\ufb03cient layout annotation and model training functions. These improve model accuracy on the target samples. The community platform enables the easy sharing of DIA models and whole digitization pipelines to promote reusability and reproducibility. A collection of detailed documentation, tutorials and exemplar projects make LayoutParser easy to learn and use.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1623,8 +1623,8 @@ }, { "type": "NarrativeText", - "element_id": "c2af717e76ad68bd6da87a15a69f126a", - "text": "2 image = cv2 . imread ( \" image_file \" ) # load images 3 model = lp . De t e c tro n2 Lay outM odel ( 4 \" lp :// PubLayNet / f as t er _ r c nn _ R _ 50 _ F P N_ 3 x / config \" )", + "element_id": "7d55b80ca5a0c2888ff44b931430b0d8", + "text": "2 image = cv2.imread(\"image_file\") # load images 3 model = lp.Detectron2LayoutModel( 4 \"lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config\")", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1645,8 +1645,8 @@ }, { "type": "ListItem", - "element_id": "a002e13c7ea2613b2eabb9ea3501856d", - "text": "3 model = lp . 
De t e c tro n2 Lay outM odel (", + "element_id": "f30541418a7dca51e3e4cd880486ab9c", + "text": "3 model = lp.Detectron2LayoutModel(", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1667,8 +1667,8 @@ }, { "type": "ListItem", - "element_id": "366c05fd7babc86bf01d690b9df755da", - "text": "5 layout = model . detect ( image )", + "element_id": "ecaf88c55d275f8fdc8c25e2d919077f", + "text": "5 layout = model.detect(image)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1784,8 +1784,8 @@ }, { "type": "FigureCaption", - "element_id": "c2a2a4a054151d16820f38e115ce7a72", - "text": "Fig. 2: The relationship between the three types of layout data structures. Coordinate supports three kinds of variation; TextBlock consists of the co- ordinate information and extra features like block text, types, and reading orders; a Layout object is a list of all possible layout elements, including other Layout objects. They all support the same set of transformation and operation APIs for maximum \ufb02exibility.", + "element_id": "9f11aa6b22dea1bba7eb0d122c0c5562", + "text": "Fig.2: The relationship between the three types of layout data structures. Coordinate supports three kinds of variation; TextBlock consists of the co- ordinate information and extra features like block text, types, and reading orders; a Layout object is a list of all possible layout elements, including other Layout objects. They all support the same set of transformation and operation APIs for maximum \ufb02exibility.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2006,8 +2006,8 @@ }, { "type": "NarrativeText", - "element_id": "373a9a67f855ba5b79bdc1393d2f1ce9", - "text": "1 ocr_agent = lp . TesseractAgent () 2 # Can be easily switched to other OCR software 3 tokens = ocr_agent . 
detect ( image )", + "element_id": "2e605dfb574532cf2ab54ded080a2ab9", + "text": "1 ocr_agent = lp.TesseractAgent() 2 # Can be easily switched to other OCR software 3 tokens = ocr_agent.detect(image)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2378,8 +2378,8 @@ }, { "type": "NarrativeText", - "element_id": "fadd4ad54cd14e3e4711d41a1c99f813", - "text": "Fig. 3: Layout detection and OCR results visualization generated by the LayoutParser APIs. Mode I directly overlays the layout region bounding boxes and categories over the original image. Mode II recreates the original document via drawing the OCR\u2019d texts at their corresponding positions on the image canvas. In this \ufb01gure, tokens in textual regions are \ufb01ltered using the API and then displayed.", + "element_id": "4d1b9566e792683b9559b778be4f4046", + "text": "Fig.3: Layout detection and OCR results visualization generated by the LayoutParser APIs. Mode I directly overlays the layout region bounding boxes and categories over the original image. Mode II recreates the original document via drawing the OCR\u2019d texts at their corresponding positions on the image canvas. In this \ufb01gure, tokens in textual regions are \ufb01ltered using the API and then displayed.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2529,8 +2529,8 @@ }, { "type": "NarrativeText", - "element_id": "ebbb8c84b2a69f817c8ae7df20d72dd9", - "text": "Fig. 4: Illustration of (a) the original historical Japanese document with layout detection results and (b) a recreated version of the document image that achieves much better character recognition recall. 
The reorganization algorithm rearranges the tokens based on the their detected bounding boxes given a maximum allowed height.", + "element_id": "9667b0e42f9d28607c7c13bffb760906", + "text": "Fig.4: Illustration of (a) the original historical Japanese document with layout detection results and (b) a recreated version of the document image that achieves much better character recognition recall. The reorganization algorithm rearranges the tokens based on the their detected bounding boxes given a maximum allowed height.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2841,8 +2841,8 @@ }, { "type": "NarrativeText", - "element_id": "3c1fd89a3436d3cedb4d22d297c76437", - "text": "Fig. 5: Illustration of how LayoutParser helps with the historical document digi- tization pipeline.", + "element_id": "80291b42f1785935496188bb52788288", + "text": "Fig.5: Illustration of how LayoutParser helps with the historical document digi- tization pipeline.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3141,8 +3141,8 @@ }, { "type": "FigureCaption", - "element_id": "7e685908875164adafa447ec3d97455e", - "text": "Fig. 6: This lightweight table detector can identify tables (outlined in red) and cells (shaded in blue) in di\ufb00erent locations on a page. In very few cases (d), it might generate minor error predictions, e.g, failing to capture the top text line of a table.", + "element_id": "d35d253341e8b8d837f384ecd6ac410a", + "text": "Fig.6: This lightweight table detector can identify tables (outlined in red) and cells (shaded in blue) in di\ufb00erent locations on a page. 
In very few cases (d), it might generate minor error predictions, e.g, failing to capture the top text line of a table.", "metadata": { "filetype": "application/pdf", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json index 6264f96a86..fdb1b1ff86 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json @@ -794,8 +794,8 @@ }, { "type": "NarrativeText", - "element_id": "732dc7fa0795c651041c10c2d318a8ae", - "text": "The results of the experiment are presented in this session. The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs. 1\u20133 respectively. It can be seen clearly from these Figures that the ef\ufb01ciency of egg shell powder increase with the inhibitor con- centration, The increase in its ef\ufb01ciency could be as a result of increase in the constituent molecule", + "element_id": "28d5b195997810a34c2aa96c9f357de2", + "text": "The results of the experiment are presented in this session. The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs.1\u20133 respectively. 
It can be seen clearly from these Figures that the ef\ufb01ciency of egg shell powder increase with the inhibitor con- centration, The increase in its ef\ufb01ciency could be as a result of increase in the constituent molecule", "metadata": { "languages": [ "eng" @@ -2598,8 +2598,8 @@ }, { "type": "UncategorizedText", - "element_id": "6fcf2a276d4b2d81f991b4eb6f04009a", - "text": "(cid:3) 0.9393 (cid:3) 0.8276 (cid:3) 0.8825 (cid:3) 0.8027 (cid:3) 0.5896 (cid:3) 0.5356", + "element_id": "a0aa9bf2a48ed1dff882a16cb320c616", + "text": "(cid:3)0.9393 (cid:3)0.8276 (cid:3)0.8825 (cid:3)0.8027 (cid:3)0.5896 (cid:3)0.5356", "metadata": { "languages": [ "eng" @@ -3434,8 +3434,8 @@ }, { "type": "Title", - "element_id": "d269706e81c2b5978ae0b5c820ce176a", - "text": "\u03b8 \u00bc CRo (cid:3) CR", + "element_id": "543caecd15c161082076a174ea946782", + "text": "\u03b8 \u00bc CRo(cid:3)CR", "metadata": { "languages": [ "eng" @@ -3478,8 +3478,8 @@ }, { "type": "Title", - "element_id": "d48a9ee64508de2e63b2f4579ef78432", - "text": "IE \u00f0%\u00de \u00bc CRo (cid:3) CR", + "element_id": "59a609931ac8f9c55855113bfae6655e", + "text": "IE \u00f0%\u00de \u00bc CRo(cid:3)CR", "metadata": { "languages": [ "eng" @@ -3720,8 +3720,8 @@ }, { "type": "NarrativeText", - "element_id": "a8d445f830ed31990875a519f4be0eb5", - "text": "steps of the linear polarization plot are substituted to get corrosion current. Nova software was used with linear polarization resistance (LPR) and the current was set to 10 mA (maximum) and 10 nA (minimum). LSV staircase parameter start potential (cid:3) 1.5 v, step potential 0.001 m/s and stop potential of \u00fe1.5 v set was used in this study.", + "element_id": "ac11629522e563b6a0a8f261ab4b94e0", + "text": "steps of the linear polarization plot are substituted to get corrosion current. Nova software was used with linear polarization resistance (LPR) and the current was set to 10 mA (maximum) and 10 nA (minimum). 
LSV staircase parameter start potential (cid:3)1.5 v, step potential 0.001 m/s and stop potential of \u00fe1.5 v set was used in this study.", "metadata": { "languages": [ "eng" diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json index 26955e33e1..908e9e125a 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json @@ -503,8 +503,8 @@ }, { "type": "NarrativeText", - "element_id": "d21722fd648aed04c8119948bf24b400", - "text": "Tables, text \ufb01les Arti\ufb01cially generated by a C \u00fe \u00fe program on Intels Xeons CPU E5\u2013 2670 v2 with Linux operating system. Raw Sixty randomly generated instances of the MDVSP with the number of depots in (8, 12, 16) and the number of trips in (1500, 2000, 2500, 3000) Randomly generated instances IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. Data can be downloaded from https://orlib.uqcloud.net/ Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457\u2013487 [3].", + "element_id": "d2073c6354217f9b2d4d5c654d77f232", + "text": "Tables, text \ufb01les Arti\ufb01cially generated by a C\u00fe \u00fe program on Intels Xeons CPU E5\u2013 2670 v2 with Linux operating system. Raw Sixty randomly generated instances of the MDVSP with the number of depots in (8,12,16) and the number of trips in (1500, 2000, 2500, 3000) Randomly generated instances IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. 
Data can be downloaded from https://orlib.uqcloud.net/ Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457\u2013487 [3].", "metadata": { "languages": [ "eng" @@ -513,7 +513,7 @@ { "text": "https :// orlib . uqcloud . net /", "url": "https://orlib.uqcloud.net/", - "start_index": 386 + "start_index": 383 } ], "page_number": 2, @@ -774,8 +774,8 @@ }, { "type": "NarrativeText", - "element_id": "96589dd8025c674caf26c856ea689d4e", - "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate \ufb01le. Each \ufb01le is named as \u2018RN-m-n-k.dat\u2019, where \u2018m\u2019, \u2018n\u2019, and \u2018k\u2019 denote the number of depots, the number of trips, and the instance number \u2018RN-8\u20131500-01.dat\u2019, for is the \ufb01rst problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, \u00f0m; n\u00de, \ufb01ve instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net.", + "element_id": "52c2b4b09c228b90a487fa4fd42a1590", + "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate \ufb01le. Each \ufb01le is named as \u2018RN-m-n-k.dat\u2019, where \u2018m\u2019, \u2018n\u2019, and \u2018k\u2019 denote the number of depots, the number of trips, and the instance number \u2018RN-8\u20131500-01.dat\u2019, for is the \ufb01rst problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8,12, and 16. 
The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, \u00f0m;n\u00de, \ufb01ve instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net.", "metadata": { "languages": [ "eng" @@ -784,7 +784,7 @@ { "text": "https :// orlib . uqcloud . net", "url": "https://orlib.uqcloud.net", - "start_index": 611 + "start_index": 609 } ], "page_number": 2, @@ -803,8 +803,8 @@ }, { "type": "UncategorizedText", - "element_id": "97686fd4b810190336f3a3f4debb4c5d", - "text": "\u2018\u00f0m; n\u00de\u2019,", + "element_id": "a442f6b8548f2b2be7eb0b0c488eaf3f", + "text": "\u2018\u00f0m;n\u00de\u2019,", "metadata": { "languages": [ "eng" @@ -891,8 +891,8 @@ }, { "type": "NarrativeText", - "element_id": "275e61db64667898a0ec65d6cbbff69b", - "text": "For each problem instance, the following information is provided: The number of depots m\u00f0 The number of trips \u00f0n\u00de, The number of locations \u00f0l\u00de, The number of vehicles at each depot, For each trip i A 1; 2; \u2026; n, a start time, ts", + "element_id": "20a5ace34ab61e08b1ab35c222c6554f", + "text": "For each problem instance, the following information is provided: The number of depots m\u00f0 The number of trips \u00f0n\u00de, The number of locations \u00f0l\u00de, The number of vehicles at each depot, For each trip iA1;2;\u2026;n, a start time, ts", "metadata": { "languages": [ "eng" @@ -957,8 +957,8 @@ }, { "type": "Title", - "element_id": "3c0009859c6faa133b3e59b1b5c42c5b", - "text": "i , an end time, te", + "element_id": "812eeb4f274baf14170f2447204a4a55", + "text": "i, an end time, te", "metadata": { "languages": [ "eng" @@ -979,8 +979,8 @@ }, { "type": "UncategorizedText", - "element_id": "4f3baeb46b82b7cb0acec9e6b9ac9787", - "text": "i , and an end location, le i ,", + "element_id": "4b917219b5939da4a52a907db733f551", + "text": "i, and an end location, le i ,", "metadata": { "languages": [ "eng" @@ -1023,8 +1023,8 @@ }, { "type": "NarrativeText", - 
"element_id": "9e7301ebb3fd5cbe1410901ea78c02db", - "text": "(cid:2) The travel time, \u03b4ij, between any two locations i; j A 1; \u2026; l.", + "element_id": "b1bb94d45fba27ddeefd146fbde1dcc4", + "text": "(cid:2) The travel time, \u03b4ij, between any two locations i;jA1;\u2026;l.", "metadata": { "languages": [ "eng" @@ -1111,8 +1111,8 @@ }, { "type": "NarrativeText", - "element_id": "a3a97226d270316d06712c89f7ff489d", - "text": "and end location of the trip. A long trip is about 3\u20135 h in duration and has the same start and end location. For all instances, m r l and the locations 1; \u2026; m correspond to depots, while the remaining locations only appear as trip start and end locations.", + "element_id": "eeba8dd874b520a36aa718db99dbfd38", + "text": "and end location of the trip. A long trip is about 3\u20135 h in duration and has the same start and end location. For all instances, mrl and the locations 1;\u2026;m correspond to depots, while the remaining locations only appear as trip start and end locations.", "metadata": { "languages": [ "eng" @@ -1155,8 +1155,8 @@ }, { "type": "NarrativeText", - "element_id": "51071653fbb405a5c84831cbacc6c618", - "text": ". If le i ls le i j , otherwise, the vehicle may require waiting at le i for the duration of \u00f0ts", + "element_id": "c4a028a7e5a91a69b88a778ed1d4c4c1", + "text": ". 
If le i ls le i j, otherwise, the vehicle may require waiting at le i for the duration of \u00f0ts", "metadata": { "languages": [ "eng" @@ -1177,8 +1177,8 @@ }, { "type": "Title", - "element_id": "edff69ec864e554eb9aee86908ecac9c", - "text": "Z te", + "element_id": "3351f34f87afe9cffe4fd31320b9ccc8", + "text": "Zte", "metadata": { "languages": [ "eng" @@ -1199,8 +1199,8 @@ }, { "type": "Title", - "element_id": "f038d089ae51f445f96217852ae9c670", - "text": "a ls", + "element_id": "7a378649c353830c59db2e86df7f7368", + "text": "als", "metadata": { "languages": [ "eng" @@ -1243,8 +1243,8 @@ }, { "type": "NarrativeText", - "element_id": "d3b130ec44c8f5b0865012570fe82fd0", - "text": "j , the vehicle must travel empty from le j (cid:3)te i \u00de. A schedule is given by the sequence in which a vehicle can cover the trips. The MDVSP is to determine the minimum number of schedules to cover all trips that minimizes total time in waiting and empty travel. The following requirements must be satis\ufb01ed:", + "element_id": "f7296ef349382c5db6f8a271d8f3fe03", + "text": "j, the vehicle must travel empty from le j (cid:3)te i \u00de. A schedule is given by the sequence in which a vehicle can cover the trips. The MDVSP is to determine the minimum number of schedules to cover all trips that minimizes total time in waiting and empty travel. The following requirements must be satis\ufb01ed:", "metadata": { "languages": [ "eng" @@ -1331,8 +1331,8 @@ }, { "type": "NarrativeText", - "element_id": "80d7ee3f1337fffbcb42c78e218d8aad", - "text": "A suf\ufb01cient number of vehicles are provided to maintain the feasibility of an instance. For each instance size \u00f0m; n\u00de, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over \ufb01ve instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. 
The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule.", + "element_id": "3dbb489d8594d6744d2fce9cdcde691c", + "text": "A suf\ufb01cient number of vehicles are provided to maintain the feasibility of an instance. For each instance size \u00f0m;n\u00de, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over \ufb01ve instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule.", "metadata": { "languages": [ "eng" @@ -1353,8 +1353,8 @@ }, { "type": "NarrativeText", - "element_id": "2c71b28268ae79e366c8190e28761e31", - "text": "The description of the \ufb01le for each problem instance is presented in Table 2. The \ufb01rst line in the \ufb01le provides the number of depots \u00f0m\u00de, the number of trips, \u00f0n\u00de, and the number of locations \u00f0l\u00de, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, i A 1; \u2026; n g, and provides the start location, the start time, the end location, and the end time of trip i. 
The next l lines present the travel times between any two locations, i; jA 1; \u2026; l", + "element_id": "7490a379155c95007ad9649ec7689e35", + "text": "The description of the \ufb01le for each problem instance is presented in Table 2. The \ufb01rst line in the \ufb01le provides the number of depots \u00f0m\u00de, the number of trips, \u00f0n\u00de, and the number of locations \u00f0l\u00de, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, iA 1;\u2026;n g, and provides the start location, the start time, the end location, and the end time of trip i. The next l lines present the travel times between any two locations, i;jA 1;\u2026;l", "metadata": { "languages": [ "eng" @@ -1463,8 +1463,8 @@ }, { "type": "NarrativeText", - "element_id": "7c74ad0f1b0edb685ba951da2a788af8", - "text": "The dataset also includes a program \u2018GenerateInstance.cpp\u2019 that can be used to generate new instances. The program takes three inputs, the number of depots \u00f0m\u00de, the number of trips \u00f0n\u00de, and the number of instances for each size \u00f0m; n\u00de.", + "element_id": "0b37e732b73efa9dbd994f164dac8d5c", + "text": "The dataset also includes a program \u2018GenerateInstance.cpp\u2019 that can be used to generate new instances. The program takes three inputs, the number of depots \u00f0m\u00de, the number of trips \u00f0n\u00de, and the number of instances for each size \u00f0m;n\u00de.", "metadata": { "languages": [ "eng" @@ -1947,8 +1947,8 @@ }, { "type": "NarrativeText", - "element_id": "ebd5a6aeac91e0f42fecb980ef4a648a", - "text": "The number of depots, the number of trips, and the number of locations. The number of vehicles rd at each depot d. One line for each trip, i \u00bc 1; 2; \u2026; n. Each line provides the start location ls time ts i and the end time te i for the corresponding trip. 
Each element, \u03b4ij; where i; j A 1; 2; \u2026; l, refers to the travel time between location i and location j.", + "element_id": "c981c256386d57e68a2c947147f30229", + "text": "The number of depots, the number of trips, and the number of locations. The number of vehicles rd at each depot d. One line for each trip, i \u00bc 1;2;\u2026;n. Each line provides the start location ls time ts i and the end time te i for the corresponding trip. Each element, \u03b4ij; where i;jA1;2;\u2026;l, refers to the travel time between location i and location j.", "metadata": { "languages": [ "eng" @@ -1969,8 +1969,8 @@ }, { "type": "Title", - "element_id": "50fb8c466c52d5ae755055ffc24a418d", - "text": "i , the start", + "element_id": "e6e8997790263be5ca103754ee56e234", + "text": "i, the start", "metadata": { "languages": [ "eng" @@ -1991,8 +1991,8 @@ }, { "type": "Title", - "element_id": "44a4c21af61b74e9f30be3112d9eb1e7", - "text": "i , the end location le", + "element_id": "49f536ed0f91f7e6d8ad1d70d71991b0", + "text": "i, the end location le", "metadata": { "languages": [ "eng" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 464c141982..909af6c8ec 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.21-dev4" # pragma: no cover +__version__ = "0.16.21-dev5" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 9a2efcd650..0899c57dd6 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -80,6 +80,7 @@ merge_inferred_with_extracted_layout, ) from unstructured.partition.pdf_image.pdfminer_utils import ( + PDFMinerConfig, open_pdfminer_pages_generator, rect_to_bbox, ) @@ -145,6 +146,10 @@ def partition_pdf( extract_forms: bool = False, form_extraction_skip_tables: bool = True, password: Optional[str] = None, + pdfminer_line_margin: Optional[float] = None, + pdfminer_char_margin: Optional[float] = None, + pdfminer_line_overlap: 
Optional[float] = None, + pdfminer_word_margin: Optional[float] = 0.185, **kwargs: Any, ) -> list[Element]: """Parses a pdf document into a list of interpreted elements. @@ -203,12 +208,24 @@ def partition_pdf( (results in adding FormKeysValues elements to output). form_extraction_skip_tables Whether the form extraction logic should ignore regions designated as Tables. + pdfminer_line_margin + If two lines are close together they are considered to be part of the same paragraph. + The margin is specified relative to the height of a line. + pdfminer_char_margin + If two characters are closer together than this margin they are considered part of + the same line. The margin is specified relative to the width of the character. + pdfminer_line_overlap + If two characters have more overlap than this they are considered to be on the same line. + The overlap is specified relative to the minimum height of both characters. + pdfminer_word_margin + If two characters on the same line are further apart than this margin then they are + considered to be two separate words, and an intermediate space will be added for + readability. The margin is specified relative to the width of the character. 
""" exactly_one(filename=filename, file=file) languages = check_language_args(languages or [], ocr_languages) - return partition_pdf_or_image( filename=filename, file=file, @@ -226,6 +243,10 @@ def partition_pdf( extract_forms=extract_forms, form_extraction_skip_tables=form_extraction_skip_tables, password=password, + pdfminer_line_margin=pdfminer_line_margin, + pdfminer_char_margin=pdfminer_char_margin, + pdfminer_line_overlap=pdfminer_line_overlap, + pdfminer_word_margin=pdfminer_word_margin, **kwargs, ) @@ -248,6 +269,10 @@ def partition_pdf_or_image( extract_forms: bool = False, form_extraction_skip_tables: bool = True, password: Optional[str] = None, + pdfminer_line_margin: Optional[float] = None, + pdfminer_char_margin: Optional[float] = None, + pdfminer_line_overlap: Optional[float] = None, + pdfminer_word_margin: Optional[float] = 0.185, **kwargs: Any, ) -> list[Element]: """Parses a pdf or image document into a list of interpreted elements.""" @@ -265,7 +290,12 @@ def partition_pdf_or_image( validate_strategy(strategy, is_image) last_modified = get_last_modified_date(filename) if filename else None - + pdfminer_config = PDFMinerConfig( + line_margin=pdfminer_line_margin, + char_margin=pdfminer_char_margin, + line_overlap=pdfminer_line_overlap, + word_margin=pdfminer_word_margin, + ) extracted_elements = [] pdf_text_extractable = False if not is_image: @@ -277,6 +307,7 @@ def partition_pdf_or_image( metadata_last_modified=metadata_last_modified or last_modified, starting_page_number=starting_page_number, password=password, + pdfminer_config=pdfminer_config, **kwargs, ) pdf_text_extractable = any( @@ -327,6 +358,7 @@ def partition_pdf_or_image( extract_forms=extract_forms, form_extraction_skip_tables=form_extraction_skip_tables, password=password, + pdfminer_config=pdfminer_config, **kwargs, ) out_elements = _process_uncategorized_text_elements(elements) @@ -367,6 +399,7 @@ def extractable_elements( metadata_last_modified: Optional[str] = None, 
starting_page_number: int = 1, password: Optional[str] = None, + pdfminer_config: Optional[PDFMinerConfig] = None, **kwargs: Any, ) -> list[list[Element]]: if isinstance(file, bytes): @@ -378,6 +411,7 @@ def extractable_elements( metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, password=password, + pdfminer_config=pdfminer_config, **kwargs, ) @@ -389,12 +423,13 @@ def _partition_pdf_with_pdfminer( metadata_last_modified: Optional[str], starting_page_number: int = 1, password: Optional[str] = None, + pdfminer_config: Optional[PDFMinerConfig] = None, **kwargs: Any, ) -> list[list[Element]]: """Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster processing or detectron2 is not available. - Implementation is based on the `extract_text` implemenation in pdfminer.six, but + Implementation is based on the `extract_text` implementation in pdfminer.six, but modified to support tracking page numbers and working with file-like objects. 
ref: https://github.com/pdfminer/pdfminer.six/blob/master/pdfminer/high_level.py @@ -413,6 +448,7 @@ def _partition_pdf_with_pdfminer( metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, password=password, + pdfminer_config=pdfminer_config, **kwargs, ) @@ -424,6 +460,7 @@ def _partition_pdf_with_pdfminer( metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, password=password, + pdfminer_config=pdfminer_config, **kwargs, ) @@ -439,6 +476,7 @@ def _process_pdfminer_pages( annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD, starting_page_number: int = 1, password: Optional[str] = None, + pdfminer_config: Optional[PDFMinerConfig] = None, **kwargs, ) -> list[list[Element]]: """Uses PDFMiner to split a document into pages and process them.""" @@ -446,7 +484,7 @@ def _process_pdfminer_pages( elements = [] for page_number, (page, page_layout) in enumerate( - open_pdfminer_pages_generator(fp, password=password), + open_pdfminer_pages_generator(fp, password=password, pdfminer_config=pdfminer_config), start=starting_page_number, ): width, height = page_layout.width, page_layout.height @@ -570,6 +608,7 @@ def _partition_pdf_or_image_local( form_extraction_skip_tables: bool = True, pdf_hi_res_max_pages: Optional[int] = None, password: Optional[str] = None, + pdfminer_config: Optional[PDFMinerConfig] = None, **kwargs: Any, ) -> list[Element]: """Partition using package installed locally""" @@ -610,7 +649,12 @@ def _partition_pdf_or_image_local( ) extracted_layout, layouts_links = ( - process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi, password=password) + process_file_with_pdfminer( + filename=filename, + dpi=pdf_image_dpi, + password=password, + pdfminer_config=pdfminer_config, + ) if pdf_text_extractable else ([], []) ) @@ -665,7 +709,9 @@ def _partition_pdf_or_image_local( file.seek(0) extracted_layout, layouts_links = ( - process_data_with_pdfminer(file=file, 
dpi=pdf_image_dpi, password=password) + process_data_with_pdfminer( + file=file, dpi=pdf_image_dpi, password=password, pdfminer_config=pdfminer_config + ) if pdf_text_extractable else ([], []) ) diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 724e34a814..4fa7e159f0 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -14,6 +14,7 @@ from unstructured.documents.elements import CoordinatesMetadata, ElementType from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters from unstructured.partition.pdf_image.pdfminer_utils import ( + PDFMinerConfig, extract_image_objects, extract_text_objects, open_pdfminer_pages_generator, @@ -39,13 +40,12 @@ def process_file_with_pdfminer( filename: str = "", dpi: int = 200, password: Optional[str] = None, + pdfminer_config: Optional[PDFMinerConfig] = None, ) -> tuple[List[List["TextRegion"]], List[List]]: with open_filename(filename, "rb") as fp: fp = cast(BinaryIO, fp) extracted_layout, layouts_links = process_data_with_pdfminer( - file=fp, - dpi=dpi, - password=password, + file=fp, dpi=dpi, password=password, pdfminer_config=pdfminer_config ) return extracted_layout, layouts_links @@ -435,6 +435,7 @@ def process_data_with_pdfminer( file: Optional[Union[bytes, BinaryIO]] = None, dpi: int = 200, password: Optional[str] = None, + pdfminer_config: Optional[PDFMinerConfig] = None, ) -> tuple[List[LayoutElements], List[List]]: """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the pdf pages using pdf2image""" @@ -446,7 +447,7 @@ def process_data_with_pdfminer( # Coefficient to rescale bounding box to be compatible with images coef = dpi / 72 for page_number, (page, page_layout) in enumerate( - open_pdfminer_pages_generator(file, password=password) + open_pdfminer_pages_generator(file, password=password, 
pdfminer_config=pdfminer_config) ): width, height = page_layout.width, page_layout.height diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index 3544e26762..ad6f981914 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -7,14 +7,25 @@ from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage from pdfminer.psparser import PSSyntaxError +from pydantic import BaseModel from unstructured.logger import logger from unstructured.utils import requires_dependencies -def init_pdfminer(): +class PDFMinerConfig(BaseModel): + line_overlap: Optional[float] = None + word_margin: Optional[float] = None + line_margin: Optional[float] = None + char_margin: Optional[float] = None + + +def init_pdfminer(pdfminer_config: Optional[PDFMinerConfig] = None): rsrcmgr = PDFResourceManager() - laparams = LAParams() + + laparams_kwargs = pdfminer_config.model_dump(exclude_none=True) if pdfminer_config else {} + laparams = LAParams(**laparams_kwargs) + device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) @@ -72,8 +83,7 @@ def rect_to_bbox( @requires_dependencies(["pikepdf", "pypdf"]) def open_pdfminer_pages_generator( - fp: BinaryIO, - password: Optional[str] = None, + fp: BinaryIO, password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None ): """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.""" @@ -81,7 +91,7 @@ def open_pdfminer_pages_generator( from unstructured.partition.pdf_image.pypdf_utils import get_page_data - device, interpreter = init_pdfminer() + device, interpreter = init_pdfminer(pdfminer_config=pdfminer_config) with tempfile.TemporaryDirectory() as tmp_dir_path: tmp_file_path = os.path.join(tmp_dir_path, "tmp_file") try: