Skip to content

Commit

Permalink
Enhance: Added CSV-safe quoting for table header and row content
Browse files Browse the repository at this point in the history
  • Loading branch information
Anjan Biswas authored and Anjan Biswas committed Sep 11, 2023
1 parent e788e81 commit c971413
Showing 1 changed file with 13 additions and 4 deletions.
17 changes: 13 additions & 4 deletions libs/langchain/langchain/document_loaders/parsers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,10 +306,13 @@ def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:
yield d

if self.model in ["prebuilt-document", "prebuilt-layout", "prebuilt-invoice"]:
import csv # noqa: F401
from io import StringIO # noqa: F401

for table_idx, table in enumerate(result.tables):
page_num = table.bounding_regions[0].page_number
headers = list()
rows = dict()
headers: list[str] = list()
rows: dict[int, list[str]] = dict()

for cell in table.cells:
if cell.kind == "columnHeader":
Expand All @@ -320,8 +323,11 @@ def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:
rows[cell.row_index].append(cell.content)

if headers:
h_op = StringIO()
csv.writer(h_op, quoting=csv.QUOTE_MINIMAL).writerow(headers)
header_string = h_op.getvalue().strip()
hd = Document(
page_content=",".join(headers),
page_content=header_string,
metadata={
"source": blob.source,
"page": page_num,
Expand All @@ -332,8 +338,11 @@ def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:
yield hd

for _, row_cells in sorted(rows.items()):
r_op = StringIO()
csv.writer(r_op, quoting=csv.QUOTE_MINIMAL).writerow(row_cells)
row_string = r_op.getvalue().strip()
rd = Document(
page_content=",".join(row_cells),
page_content=row_string,
metadata={
"source": blob.source,
"page": page_num,
Expand Down

0 comments on commit c971413

Please sign in to comment.