langchain-ai · annjawn · Sep 10, 2023 · Sep 10, 2023 · Sep 11, 2023 · Sep 11, 2023
diff --git a/libs/langchain/colored-1.4.4-py3-none-any.whl b/libs/langchain/colored-1.4.4-py3-none-any.whl
diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py
@@ -263,24 +263,94 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
 
 class DocumentIntelligenceParser(BaseBlobParser):
     """Loads a PDF with Azure Document Intelligence
-    (formerly Forms Recognizer) and chunks at character level."""
+    (formerly Form Recognizer). Produces one Document per page or
+    paragraph, plus one per table header and per table row."""
 
-    def __init__(self, client: Any, model: str):
+    def __init__(self, client: Any, model: str, split_mode: str = "page"):
         self.client = client
         self.model = model
+        self.split_mode = split_mode  # "page" or "paragraph"; read by _generate_docs
 
     def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:
-        for p in result.pages:
-            content = " ".join([line.content for line in p.lines])
-
-            d = Document(
-                page_content=content,
-                metadata={
-                    "source": blob.source,
-                    "page": p.page_number,
-                },
-            )
-            yield d
+        """Yield page or paragraph Documents, then table header/row Documents.
+
+        ``result.paragraphs`` and ``result.tables`` may be ``None``; treated as empty.
+        """
+        page_content_dict = {}
+
+        for paragraph in result.paragraphs or []:
+            page_number = paragraph.bounding_regions[0].page_number
+
+            if self.split_mode == "page":
+                page_content_dict.setdefault(page_number, "")
+                page_content_dict[page_number] += paragraph.content + "\n\n"
+            elif self.split_mode == "paragraph":
+                d = Document(
+                    page_content=paragraph.content,
+                    metadata={
+                        "source": blob.source,
+                        "page": page_number,
+                        "type": "PARAGRAPH",
+                    },
+                )
+                yield d
+
+        if self.split_mode == "page":
+            for page, content in page_content_dict.items():
+                d = Document(
+                    page_content=content.strip(),
+                    metadata={
+                        "source": blob.source,
+                        "page": page,
+                        "type": "PAGE",
+                    },
+                )
+                yield d
+
+        if self.model in ["prebuilt-document", "prebuilt-layout", "prebuilt-invoice"]:
+            import csv
+            from io import StringIO
+
+            for table_idx, table in enumerate(result.tables or []):
+                page_num = table.bounding_regions[0].page_number
+                headers: list[str] = []
+                rows: dict[int, list[str]] = {}
+
+                for cell in table.cells:
+                    if cell.kind == "columnHeader":
+                        headers.append(cell.content)
+                    elif cell.kind == "content":
+                        rows.setdefault(cell.row_index, []).append(cell.content)
+
+                if headers:
+                    h_op = StringIO()
+                    csv.writer(h_op, quoting=csv.QUOTE_MINIMAL).writerow(headers)
+                    header_string = h_op.getvalue().strip()
+                    hd = Document(
+                        page_content=header_string,
+                        metadata={
+                            "source": blob.source,
+                            "page": page_num,
+                            "type": "TABLE_HEADER",
+                            "table_index": table_idx,
+                        },
+                    )
+                    yield hd
+
+                for _, row_cells in sorted(rows.items()):
+                    r_op = StringIO()
+                    csv.writer(r_op, quoting=csv.QUOTE_MINIMAL).writerow(row_cells)
+                    row_string = r_op.getvalue().strip()
+                    rd = Document(
+                        page_content=row_string,
+                        metadata={
+                            "source": blob.source,
+                            "page": page_num,
+                            "type": "TABLE_ROW",
+                            "table_index": table_idx,
+                        },
+                    )
+                    yield rd
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
         """Lazily parse the blob."""

diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py
@@ -620,7 +620,7 @@ def __init__(
         file_path: str,
         client: Any,
         model: str = "prebuilt-document",
-        headers: Optional[Dict] = None,
+        split_mode: str = "page",
     ) -> None:
         """
         Initialize the object for file processing with Azure Document Intelligence
@@ -639,18 +639,28 @@ def __init__(
             A DocumentAnalysisClient to perform the analysis of the blob
         model : str
             The model name or ID to be used for form recognition in Azure.
+        split_mode : str
+            Whether to split by `paragraph` or `page`. Defaults to `page`.
 
         Examples:
         ---------
         >>> obj = DocumentIntelligenceLoader(
         ...     file_path="path/to/file",
         ...     client=client,
-        ...     model="prebuilt-document"
+        ...     model="prebuilt-document",
+        ...     split_mode="page",
         ... )
         """
-
-        self.parser = DocumentIntelligenceParser(client=client, model=model)
-        super().__init__(file_path, headers=headers)
+
+        super().__init__(file_path)
+        if split_mode not in ["page", "paragraph"]:
+            raise ValueError(
+                f"Invalid split option {split_mode}, "
+                "valid values are `page` or `paragraph`."
+            )
+        self.parser = DocumentIntelligenceParser(
+            client=client, model=model, split_mode=split_mode
+        )
 
     def load(self) -> List[Document]:
         """Load given path as pages."""