diff --git a/libs/langchain/colored-1.4.4-py3-none-any.whl b/libs/langchain/colored-1.4.4-py3-none-any.whl new file mode 100644 index 0000000000000..86141567069de Binary files /dev/null and b/libs/langchain/colored-1.4.4-py3-none-any.whl differ diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py index 2ec7a684be61d..45a9f2f370d8c 100644 --- a/libs/langchain/langchain/document_loaders/parsers/pdf.py +++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py @@ -263,24 +263,94 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: class DocumentIntelligenceParser(BaseBlobParser): """Loads a PDF with Azure Document Intelligence - (formerly Forms Recognizer) and chunks at character level.""" + (formerly Forms Recognizer). Returns Document with + pages or paragraphs, table headers, and rows.""" - def __init__(self, client: Any, model: str): + def __init__(self, client: Any, model: str, split_mode: str): self.client = client self.model = model + self.split_mode = split_mode def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: - for p in result.pages: - content = " ".join([line.content for line in p.lines]) - - d = Document( - page_content=content, - metadata={ - "source": blob.source, - "page": p.page_number, - }, - ) - yield d + page_content_dict = dict() + + for paragraph in result.paragraphs: + page_number = paragraph.bounding_regions[0].page_number + + if self.split_mode == "page": + if page_number not in page_content_dict: + page_content_dict[page_number] = str() + + page_content_dict[page_number] += paragraph.content + "\n\n" + elif self.split_mode == "paragraph": + d = Document( + page_content=paragraph.content, + metadata={ + "source": blob.source, + "page": page_number, + "type": "PARAGRAPH", + }, + ) + yield d + + if self.split_mode == "page": + for page, content in page_content_dict.items(): + d = Document( + page_content=content.strip(), + metadata={ + 
"source": blob.source, + "page": page, + "type": "PAGE", + }, + ) + yield d + + if self.model in ["prebuilt-document", "prebuilt-layout", "prebuilt-invoice"]: + import csv # noqa: F401 + from io import StringIO # noqa: F401 + + for table_idx, table in enumerate(result.tables): + page_num = table.bounding_regions[0].page_number + headers: list[str] = list() + rows: dict[int, list[str]] = dict() + + for cell in table.cells: + if cell.kind == "columnHeader": + headers.append(cell.content) + elif cell.kind == "content": + if cell.row_index not in rows: + rows[cell.row_index] = list() + rows[cell.row_index].append(cell.content) + + if headers: + h_op = StringIO() + csv.writer(h_op, quoting=csv.QUOTE_MINIMAL).writerow(headers) + header_string = h_op.getvalue().strip() + hd = Document( + page_content=header_string, + metadata={ + "source": blob.source, + "page": page_num, + "type": "TABLE_HEADER", + "table_index": table_idx, + }, + ) + yield hd + + for _, row_cells in sorted(rows.items()): + r_op = StringIO() + csv.writer(r_op, quoting=csv.QUOTE_MINIMAL).writerow(row_cells) + row_string = r_op.getvalue().strip() + rd = Document( + page_content=row_string, + metadata={ + "source": blob.source, + "page": page_num, + "type": "TABLE_ROW", + "table_index": table_idx, + }, + ) + yield rd def lazy_parse(self, blob: Blob) -> Iterator[Document]: """Lazily parse the blob.""" diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py index a64cdb07bc414..435738cd6e942 100644 --- a/libs/langchain/langchain/document_loaders/pdf.py +++ b/libs/langchain/langchain/document_loaders/pdf.py @@ -620,7 +620,7 @@ def __init__( file_path: str, client: Any, model: str = "prebuilt-document", - headers: Optional[Dict] = None, + split_mode: str = "page", ) -> None: """ Initialize the object for file processing with Azure Document Intelligence @@ -639,6 +639,8 @@ def __init__( A DocumentAnalysisClient to perform the analysis of the blob model : 
str The model name or ID to be used for form recognition in Azure. + split_mode : str + Whether to split by `paragraph` or `page`. Defaults to `page`. Examples: --------- @@ -646,11 +648,19 @@ def __init__( ... file_path="path/to/file", ... client=client, - ... model="prebuilt-document" + ... model="prebuilt-document", + ... split_mode="page" ... ) """ - - self.parser = DocumentIntelligenceParser(client=client, model=model) - super().__init__(file_path, headers=headers) + + super().__init__(file_path) + if split_mode not in ["page", "paragraph"]: + raise ValueError( + f"Invalid split option {split_mode}, " + "valid values are `page` or `paragraph`." + ) + self.parser = DocumentIntelligenceParser( + client=client, model=model, split_mode=split_mode + ) def load(self) -> List[Document]: """Load given path as pages."""