-
Notifications
You must be signed in to change notification settings - Fork 15.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Azure Doc Intelligence 0.2 - support paragraphs and tables for multiple models #10431
Changes from all commits
f2dda1e
91e8edc
e788e81
c971413
ad1a834
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -263,24 +263,94 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: | |
|
||
class DocumentIntelligenceParser(BaseBlobParser): | ||
"""Loads a PDF with Azure Document Intelligence | ||
(formerly Forms Recognizer) and chunks at character level.""" | ||
(formerly Forms Recognizer). Returns Document with | ||
pages or paragraphs, table headers, and rows.""" | ||
|
||
def __init__(self, client: Any, model: str): | ||
def __init__(self, client: Any, model: str, split_mode: str): | ||
self.client = client | ||
self.model = model | ||
self.split_mode = split_mode | ||
|
||
def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: | ||
for p in result.pages: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If split mode is page, should we just keep the existing logic? Is there value in parsing by paragraph and re-assembling pages? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @baskaryan the idea of providing paragraphs as an option is to do chunking (splitting) as supported by the Azure AI cognitive layout capabilities rather than having to do chunking again using, let’s say, a Text Splitter. This would be helpful for generating embeddings of chunks (paragraphs) that will retain the semantic consistency of the text. We won’t reassemble the paragraphs back into pages if There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What I mean is, why not do something like if self.split_mode == "page":
for p in result.pages:
...
elif self.split_mode == "paragraph":
for p in result.paragraphs:
... to save us having to write logic for reassembling paragraphs into pages in the case that split mode is page There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @baskaryan Right, I am actually doing this here. the There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Attaching a sample JSON output from a 2-page document extracted via |
||
content = " ".join([line.content for line in p.lines]) | ||
|
||
d = Document( | ||
page_content=content, | ||
metadata={ | ||
"source": blob.source, | ||
"page": p.page_number, | ||
}, | ||
) | ||
yield d | ||
page_content_dict = dict() | ||
|
||
for paragraph in result.paragraphs: | ||
page_number = paragraph.bounding_regions[0].page_number | ||
|
||
if self.split_mode == "page": | ||
if page_number not in page_content_dict: | ||
page_content_dict[page_number] = str() | ||
|
||
page_content_dict[page_number] += paragraph.content + "\n\n" | ||
elif self.split_mode == "paragraph": | ||
d = Document( | ||
page_content=paragraph.content, | ||
metadata={ | ||
"source": blob.source, | ||
"page": page_number, | ||
"type": "PARAGRAPH", | ||
}, | ||
) | ||
yield d | ||
|
||
if self.split_mode == "page": | ||
for page, content in page_content_dict.items(): | ||
d = Document( | ||
page_content=content.strip(), | ||
metadata={ | ||
"source": blob.source, | ||
"page": page, | ||
"type": "PAGE", | ||
}, | ||
) | ||
yield d | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @baskaryan here's |
||
|
||
if self.model in ["prebuilt-document", "prebuilt-layout", "prebuilt-invoice"]: | ||
import csv # noqa: F401 | ||
from io import StringIO # noqa: F401 | ||
|
||
for table_idx, table in enumerate(result.tables): | ||
page_num = table.bounding_regions[0].page_number | ||
headers: list[str] = list() | ||
rows: dict[int, list[str]] = dict() | ||
|
||
for cell in table.cells: | ||
if cell.kind == "columnHeader": | ||
headers.append(cell.content) | ||
elif cell.kind == "content": | ||
if cell.row_index not in rows: | ||
rows[cell.row_index] = list() | ||
rows[cell.row_index].append(cell.content) | ||
|
||
if headers: | ||
h_op = StringIO() | ||
csv.writer(h_op, quoting=csv.QUOTE_MINIMAL).writerow(headers) | ||
header_string = h_op.getvalue().strip() | ||
hd = Document( | ||
page_content=header_string, | ||
metadata={ | ||
"source": blob.source, | ||
"page": page_num, | ||
"type": "TABLE_HEADER", | ||
"table_index": table_idx, | ||
}, | ||
) | ||
yield hd | ||
|
||
for _, row_cells in sorted(rows.items()): | ||
r_op = StringIO() | ||
csv.writer(r_op, quoting=csv.QUOTE_MINIMAL).writerow(row_cells) | ||
row_string = r_op.getvalue().strip() | ||
rd = Document( | ||
page_content=row_string, | ||
metadata={ | ||
"source": blob.source, | ||
"page": page_num, | ||
"type": "TABLE_ROW", | ||
"table_index": table_idx, | ||
}, | ||
) | ||
yield rd | ||
|
||
def lazy_parse(self, blob: Blob) -> Iterator[Document]: | ||
"""Lazily parse the blob.""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -620,7 +620,7 @@ def __init__( | |
file_path: str, | ||
client: Any, | ||
model: str = "prebuilt-document", | ||
headers: Optional[Dict] = None, | ||
split_mode: str = "page", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @baskaryan here's where it's defaulted to |
||
) -> None: | ||
""" | ||
Initialize the object for file processing with Azure Document Intelligence | ||
|
@@ -639,18 +639,28 @@ def __init__( | |
A DocumentAnalysisClient to perform the analysis of the blob | ||
model : str | ||
The model name or ID to be used for form recognition in Azure. | ||
split_mode : str | ||
Whether to split by `paragraph` or `page`. Defaults to `page`. | ||
|
||
Examples: | ||
--------- | ||
>>> obj = DocumentIntelligenceLoader( | ||
... file_path="path/to/file", | ||
... client=client, | ||
... model="prebuilt-document", | ||
... split_mode="page",  # or "paragraph" | ||
... ) | ||
""" | ||
|
||
self.parser = DocumentIntelligenceParser(client=client, model=model) | ||
super().__init__(file_path, headers=headers) | ||
|
||
super().__init__(file_path) | ||
if split_mode not in ["page", "paragraph"]: | ||
raise ValueError( | ||
f"Invalid split option {split_mode}, " | ||
"valid values are `page` or `paragraph`." | ||
) | ||
self.parser = DocumentIntelligenceParser( | ||
client=client, model=model, split_mode=split_mode | ||
) | ||
|
||
def load(self) -> List[Document]: | ||
"""Load given path as pages.""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we give this a default value, probably "page", so this isn't a breaking change and the default behavior doesn't change too much?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I do default it to “page” in
DocumentIntelligenceLoader
https://github.com/annjawn/langchain/blob/c97141309583fc70d6447b9fd216e4b48f09722e/libs/langchain/langchain/document_loaders/pdf.py#L615
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But can we have a default here as well, in case this object is instantiated directly by a user?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, we can default to "page" here as well, @baskaryan.