Skip to content

Commit

Permalink
handle empty pages in doc-extract
Browse files (browse the repository at this point in the history)
  • Loading branch information
devxpy committed Feb 12, 2024
1 parent 620f873 commit 7f5adb9
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 1 deletion.
2 changes: 2 additions & 0 deletions daras_ai_v2/azure_doc_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ def azure_form_recognizer(url: str, model_id: str, params: dict = None):


def extract_records(result: dict, page_num: int) -> list[dict]:
+ if not result:
+ return []
table_polys = extract_tables(result, page_num)
records = []
for para in result["paragraphs"]:
Expand Down
6 changes: 5 additions & 1 deletion recipes/DocExtract.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,11 @@ def process_source(
params = dict(pages=str(page_num))
else:
params = None
- transcript = str(azure_doc_extract_pages(content_url, params=params)[0])
+ pages = azure_doc_extract_pages(content_url, params=params)
+ if pages and pages[0]:
+ transcript = str(pages[0])
+ else:
+ transcript = ""
else:
raise NotImplementedError(
f"Unsupported type {doc_meta and doc_meta.mime_type} for {webpage_url}"
Expand Down

0 comments on commit 7f5adb9

Please sign in to comment.