From 0730ff9f75d06cfa4a959959c0403fc3e4931cf4 Mon Sep 17 00:00:00 2001 From: chloedia Date: Sun, 1 Dec 2024 22:29:46 +0100 Subject: [PATCH 01/17] feat: create first format modules --- .gitignore | 3 + .../src/megaparse/checker/__init__.py | 0 .../src/megaparse/checker/format_checker.py | 26 --- .../megaparse/checker/markdown_processor.py | 211 ------------------ .../megaparse/src/megaparse/formatter/base.py | 40 ++++ .../formatter/table_formatter/__init__.py | 12 + .../table_formatter/llm_table_formatter.py | 97 ++++++++ .../table_formatter/vision_table_formatter.py | 155 +++++++++++++ .../unstructured_formatter/__init__.py | 12 + .../unstructured_formatter/md_formatter.py | 54 +++++ libs/megaparse/src/megaparse/megaparse.py | 49 ++-- libs/megaparse/src/megaparse/parser/base.py | 8 +- .../megaparse/parser/unstructured_parser.py | 88 +------- 13 files changed, 404 insertions(+), 351 deletions(-) delete mode 100644 libs/megaparse/src/megaparse/checker/__init__.py delete mode 100644 libs/megaparse/src/megaparse/checker/format_checker.py delete mode 100644 libs/megaparse/src/megaparse/checker/markdown_processor.py create mode 100644 libs/megaparse/src/megaparse/formatter/base.py create mode 100644 libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py create mode 100644 libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py create mode 100644 libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py create mode 100644 libs/megaparse/src/megaparse/formatter/unstructured_formatter/__init__.py create mode 100644 libs/megaparse/src/megaparse/formatter/unstructured_formatter/md_formatter.py diff --git a/.gitignore b/.gitignore index b745f75..7b696b4 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,6 @@ venv *.DS_Store .tool-versions megaparse/sdk/examples/only_pdfs/* +benchmark/auto/* +benchmark/hi_res/* + diff --git a/libs/megaparse/src/megaparse/checker/__init__.py b/libs/megaparse/src/megaparse/checker/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/libs/megaparse/src/megaparse/checker/format_checker.py b/libs/megaparse/src/megaparse/checker/format_checker.py deleted file mode 100644 index aa7ae3a..0000000 --- a/libs/megaparse/src/megaparse/checker/format_checker.py +++ /dev/null @@ -1,26 +0,0 @@ -from typing import List - -from langchain_core.language_models.chat_models import BaseChatModel -from unstructured.documents.elements import Element - - -# TODO: Implement the FormatChecker class @Chloe -class FormatChecker: - """ - A class used to improve the layout of elements, particularly focusing on converting HTML tables to markdown tables. - Attributes - ---------- - model : BaseChatModel - An instance of a chat model used to process and improve the layout of elements. - Methods - ------- - improve_layout(elements: List[Element]) -> List[Element] - Processes a list of elements, converting HTML tables to markdown tables and improving the overall layout. - - """ - - def __init__(self, model: BaseChatModel): - self.model = model - - def check(self, elements: List[Element]): - raise NotImplementedError("Method not implemented yet") diff --git a/libs/megaparse/src/megaparse/checker/markdown_processor.py b/libs/megaparse/src/megaparse/checker/markdown_processor.py deleted file mode 100644 index 541a282..0000000 --- a/libs/megaparse/src/megaparse/checker/markdown_processor.py +++ /dev/null @@ -1,211 +0,0 @@ -# Code to clean markdown files - not used but to be refactored -# import os -# from collections import Counter -# from typing import List, Tuple, Dict -# from langchain_openai import ChatOpenAI -# from dotenv import load_dotenv - - -# class MarkdownProcessor: -# """ -# Class for MarkdownProcessor. -# """ - -# load_dotenv() - -# def __init__(self, md_result: str, strict: bool, remove_pagination: bool): -# self.md_result = md_result -# self.strict = strict -# self.remove_pagination = remove_pagination - -# @staticmethod -# def clean(text: str) -> str: -# """ -# Clean the input text by removing newlines, double asterisks, and trimming whitespace. - -# Args: -# text (str): Input text - -# Returns: -# str: Cleaned text -# """ -# text = text.replace("\n", "") -# text = text.replace("**", "") -# text = text.strip() -# return text - -# def split_into_pages(self) -> List[str]: -# """ -# Split the markdown result into pages using triple newlines as the delimiter. - -# Returns: -# List[str]: Splitted markdown -# """ -# return self.md_result.split("\n\n\n") - -# @staticmethod -# def split_into_paragraphs(pages: list) -> List[str]: -# """ -# Split pages into paragraphs using double newlines as the delimiter. - -# Args: -# pages (list): Pages - -# Returns: -# List[str]: Splitted pages -# """ -# return "\n\n".join(pages).split("\n\n") - -# def remove_duplicates(self, paragraphs: list) -> Tuple[str, List[str]]: -# """ -# Remove duplicate paragraphs and identify unique and duplicate paragraphs. - -# Args: -# paragraphs (list): Paragraphs - -# Returns: -# Tuple[str, List[str]]: Cleaned paragraphs and duplicate paragraphs -# """ -# unique_paragraphs = list( -# set([self.clean(paragraph) for paragraph in paragraphs]) -# ) -# duplicate_paragraphs = [] -# cleaned_paragraphs = [] - -# for paragraph in paragraphs: -# cleaned_paragraph = self.clean(paragraph) -# if cleaned_paragraph in unique_paragraphs: -# cleaned_paragraphs.append(paragraph) -# unique_paragraphs.remove(cleaned_paragraph) -# else: -# duplicate_paragraphs.append(paragraph) -# return cleaned_paragraphs, duplicate_paragraphs - -# def identify_header_components(self, duplicate_paragraphs: list) -> Dict: -# """ -# Identify words in duplicate paragraphs that are likely header components. - -# Args: -# duplicate_paragraphs (list): Duplicate paragraphs - -# Returns: -# Dict: Header components -# """ -# header_components = list( -# set([self.clean(paragraph) for paragraph in duplicate_paragraphs]) -# ) -# header_components = " ".join(header_components).strip().split(" ") -# header_components_count = Counter(header_components) -# header_components_count = { -# k.replace(":", ""): v -# for k, v in header_components_count.items() -# if v > 1 and len(k) > 3 -# } -# return header_components_count - -# def remove_header_lines( -# self, paragraphs: List[str], header_components_count: Dict -# ) -> List[str]: -# """ -# Remove paragraphs that contain any of the header words or the word 'Page' if remove_pagination is true. - -# Args: -# paragraphs (List[str]): Paragraphs -# header_components_count (Dict): Header components - -# Returns: -# List[str]: New paragraphs -# """ - -# def should_remove(paragraph): -# if self.remove_pagination and "Page" in paragraph: -# return True -# return any(word in paragraph for word in header_components_count.keys()) - -# return [paragraph for paragraph in paragraphs if not should_remove(paragraph)] - -# def merge_tables(self, md_content: str) -> str: -# """ -# Merge tables inside Markdown content. - -# Args: -# md_content (str): Markdown content - -# Returns: -# str: Merged tables -# """ -# md_content = md_content.replace("|\n\n|", "|\n|") -# return md_content - -# def save_cleaned_result(self, cleaned_result: str, output_path: str) -> None: -# """ -# Save the cleaned paragraphs to a markdown file. - -# Args: -# cleaned_result (str): Cleaned result -# output_path (str): Output path -# """ -# with open(output_path, "w") as f: -# f.write(cleaned_result) - -# def remove_header_llm(self): -# llm = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")) -# # Define the prompt -# messages = [ -# ( -# "system", -# "You are a document cleaner and you are used to remove repetitive headers / footer from parsed files in markdown.", -# ), -# ] - -# prompt = f"""You are a document cleaner and you are used to remove repetitive headers / footer from parsed files in markdown. -# Here is a md file : "{self.md_result}" -# I want you to identify repetitive texts that could be associate to a document header and footer. Please identify the headers, the footer and remove them from the document. -# Answer with only the cleaned document in markdown format. -# Result : """ - -# messages.append(("human", self.md_result)) # type: ignore - -# result = llm.invoke(messages) - -# return result.content - -# def process(self, gpt4o_cleaner=False) -> str: -# """ -# Process the markdown result by removing duplicate paragraphs and headers. - -# Args: -# gpt4o_cleaner (bool, optional): GPT-4o cleaner. Defaults to False. - -# Returns: -# str: Cleaned result -# """ -# if gpt4o_cleaner: -# cleaned_result = self.remove_header_llm() - -# else: -# pages = self.split_into_pages() -# paragraphs = self.split_into_paragraphs(pages) -# # other_pages_paragraphs = self.split_into_paragraphs(pages[1:]) - -# cleaned_paragraphs, duplicate_paragraphs = self.remove_duplicates( -# paragraphs -# ) -# header_components_count = self.identify_header_components( -# duplicate_paragraphs -# ) - -# if self.strict: -# final_paragraphs = self.remove_header_lines( -# cleaned_paragraphs[5:], header_components_count -# ) -# final_paragraphs = cleaned_paragraphs[:5] + final_paragraphs -# else: -# final_paragraphs = cleaned_paragraphs - -# # Combine first page paragraphs with cleaned paragraphs from other pages -# all_paragraphs = final_paragraphs -# cleaned_result = "\n\n".join(all_paragraphs) - -# cleaned_result = self.merge_tables(str(cleaned_result)) -# return cleaned_result diff --git a/libs/megaparse/src/megaparse/formatter/base.py b/libs/megaparse/src/megaparse/formatter/base.py new file mode 100644 index 0000000..26fb759 --- /dev/null +++ b/libs/megaparse/src/megaparse/formatter/base.py @@ -0,0 +1,40 @@ +from abc import ABC +from typing import List, Union + +from langchain_core.language_models.chat_models import BaseChatModel +from unstructured.documents.elements import Element + + +# TODO: Implement the Formatter class @Chloe +class BaseFormatter(ABC): + """ + A class used to improve the layout of elements, particularly focusing on converting HTML tables to markdown tables. + Attributes + ---------- + model : BaseChatModel + An instance of a chat model used to process and improve the layout of elements. + Methods + ------- + improve_layout(elements: List[Element]) -> List[Element] + Processes a list of elements, converting HTML tables to markdown tables and improving the overall layout. + """ + + def __init__(self, model: BaseChatModel | None = None): + self.model = model + + async def format( + self, elements: Union[List[Element], str], file_path: str | None = None + ) -> Union[List[Element], str]: + if isinstance(elements, list): + return await self.format_elements(elements, file_path) + return await self.format_string(elements, file_path) + + async def format_elements( + self, elements: List[Element], file_path: str | None = None + ) -> Union[List[Element], str]: + raise NotImplementedError("Subclasses should implement this method") + + async def format_string( + self, text: str, file_path: str | None = None + ) -> Union[List[Element], str]: + raise NotImplementedError("Subclasses should implement this method") diff --git a/libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py b/libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py new file mode 100644 index 0000000..cfa905c --- /dev/null +++ b/libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py @@ -0,0 +1,12 @@ +from typing import List + +from unstructured.documents.elements import Element + +from megaparse.formatter.base import BaseFormatter + + +class TableFormatter(BaseFormatter): + async def format_elements( + self, elements: List[Element], file_path: str | None = None + ) -> List[Element]: + raise NotImplementedError() diff --git a/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py b/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py new file mode 100644 index 0000000..415de9d --- /dev/null +++ b/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py @@ -0,0 +1,97 @@ +import re +from typing import List, Optional + +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.prompts import ChatPromptTemplate +from unstructured.documents.elements import Element + +from megaparse.formatter.table_formatter import TableFormatter + + +class SimpleMDTableFormatter(TableFormatter): + """ + A formatter that converts table elements into Markdown format using llms. + """ + + TABLE_MARKER_START = "[TABLE]" + TABLE_MARKER_END = "[/TABLE]" + CODE_BLOCK_PATTERN = r"^```.*$\n?" + + def __init__(self, model: Optional[BaseChatModel] = None): + super().__init__(model) + + async def format_elements( + self, elements: List[Element], file_path: str | None = None + ) -> List[Element]: + """ + Formats table elements within a list of elements. + Args: + elements: A list of Element objects. + Returns: + A list of Element objects with formatted tables. + """ + if not self.model: + raise ValueError("A Model is needed to use the SimpleMDTableFormatter.") + print("Formatting tables using SimpleMDTableFormatter...") + table_stack = [] + formatted_elements = [] + + for element in elements: + if element.category == "Table": + previous_table = table_stack[-1] if table_stack else "" + formatted_table = self.format_table(element, previous_table) + table_stack.append(formatted_table.text) + formatted_elements.append(formatted_table) + else: + formatted_elements.append(element) + + return formatted_elements + + def format_table(self, table_element: Element, previous_table: str) -> Element: + """ + Formats a single table element into Markdown using an AI language model. + Args: + table_element: The table element to format. + previous_table: The previously formatted table text. + Returns: + The formatted table element. + """ + assert self.model is not None, "Model is not set." + + prompt = ChatPromptTemplate.from_messages( + [ + ( + "human", + ( + "You are an expert in markdown tables. Match the following text and HTML table " + "to create a markdown table. Provide just the table in pure markdown, nothing else.\n" + "\n{text}\n\n" + "\n{html}\n\n" + "\n{previous_table}\n" + ), + ), + ] + ) + + chain = prompt | self.model + result = chain.invoke( + { + "text": table_element.text, + "html": table_element.metadata.text_as_html, + "previous_table": previous_table, + } + ) + + content_str = str(result.content) + cleaned_content = re.sub( + self.CODE_BLOCK_PATTERN, "", content_str, flags=re.MULTILINE + ) + markdown_table = ( + f"{self.TABLE_MARKER_START}\n" + f"{cleaned_content}\n" + f"{self.TABLE_MARKER_END}\n\n" + ) + + table_element.text = markdown_table + + return table_element diff --git a/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py b/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py new file mode 100644 index 0000000..91ec8df --- /dev/null +++ b/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py @@ -0,0 +1,155 @@ +import base64 +from io import BytesIO +from typing import List, Optional + +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.messages import HumanMessage +from pdf2image import convert_from_path +from PIL import Image +from unstructured.documents.elements import Element + +from megaparse.formatter.table_formatter import TableFormatter + +TABLE_OCR_PROMPT = """ +You are tasked with transcribing the content of a table into markdown format. Your goal is to create a well-structured, readable markdown table that accurately represents the original content while adding appropriate formatting. +Answer uniquely with the parsed table. Do not include the fenced code blocks backticks. + """ + + +class VisionMDTableFormatter(TableFormatter): + """ + A formatter that converts table elements into Markdown format using an AI language model. + """ + + TABLE_MARKER_START = "[TABLE]" + TABLE_MARKER_END = "[/TABLE]" + CODE_BLOCK_PATTERN = r"^```.*$\n?" + + def __init__(self, model: Optional[BaseChatModel] = None): + super().__init__(model) + + async def format_elements( + self, elements: List[Element], file_path: str | None = None + ) -> List[Element]: + """ + Formats table elements within a list of elements. + Args: + elements: A list of Element objects. + Returns: + A list of Element objects with formatted tables. + """ + if not self.model: + raise ValueError("A Model is needed to use the VisionMDTableFormatter.") + print("Formatting tables using VisionMDTableFormatter...") + assert ( + file_path + ), "A file path is needed to format tables using VisionMDTableFormatter." + + formatted_elements = [] + + for element in elements: + if element.category == "Table": + formatted_table = await self.format_table(element, file_path) + formatted_elements.append(formatted_table) + else: + formatted_elements.append(element) + + return formatted_elements + + def process_file(self, images: List[Image.Image], image_format="PNG") -> List[str]: + """ + Process a PDF file and convert its pages to base64 encoded images. + :param file_path: Path to the PDF file + :param image_format: Format to save the images (default: PNG) + :return: List of base64 encoded images + """ + try: + images_base64 = [] + for image in images: + buffered = BytesIO() + image.save(buffered, format=image_format) + image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") + images_base64.append(image_base64) + return images_base64 + except Exception as e: + raise ValueError(f"Error processing PDF file: {str(e)}") + + async def format_table(self, table_element: Element, file_path: str) -> Element: + """ + Formats a table element into Markdown format usinf a Vision Model + Args: + table_element: An Element object representing a table. + previous_table: A string representing the previous table. + Returns: + An Element object with the formatted table. + """ + assert ( + table_element.metadata.coordinates + ), "Table element must have coordinates." + coordinates = table_element.metadata.coordinates.points + page_number = table_element.metadata.page_number + assert page_number, "Table element must have a page number." + assert coordinates, "Table element must have coordinates." + pages = convert_from_path(file_path) + + # Crop the file image to the table coordinates + # Convert coordinates to a tuple of four float values + box = ( + min( + coordinates[0][0], + coordinates[1][0], + coordinates[2][0], + coordinates[3][0], + ), + min( + coordinates[0][1], + coordinates[1][1], + coordinates[2][1], + coordinates[3][1], + ), + max( + coordinates[0][0], + coordinates[1][0], + coordinates[2][0], + coordinates[3][0], + ), + max( + coordinates[0][1], + coordinates[1][1], + coordinates[2][1], + coordinates[3][1], + ), + ) + table_image = pages[page_number - 1].crop(box) + table_image64 = self.process_file([table_image])[0] + formatted_table = await self.vision_extract(table_image64) + + markdown_table = ( + f"{self.TABLE_MARKER_START}\n" + f"{formatted_table}\n" + f"{self.TABLE_MARKER_END}\n\n" + ) + # Convert the table image to text + table_element.text = markdown_table + return table_element + + async def vision_extract(self, table_image) -> str: + """ + Send images to the language model for processing. + :param images_data: List of base64 encoded images + :return: Processed content as a string + """ + assert self.model, "A model is needed to use the SimpleMDTableFormatter." + image_prompt = { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{table_image}"}, + } + + message = HumanMessage( + content=[ + {"type": "text", "text": TABLE_OCR_PROMPT}, + image_prompt, + ], + ) + response = await self.model.ainvoke([message]) + return str(response.content) diff --git a/libs/megaparse/src/megaparse/formatter/unstructured_formatter/__init__.py b/libs/megaparse/src/megaparse/formatter/unstructured_formatter/__init__.py new file mode 100644 index 0000000..b303273 --- /dev/null +++ b/libs/megaparse/src/megaparse/formatter/unstructured_formatter/__init__.py @@ -0,0 +1,12 @@ +from typing import List + +from unstructured.documents.elements import Element + +from megaparse.formatter.base import BaseFormatter + + +class UnstructuredFormatter(BaseFormatter): + async def format_elements( + self, elements: List[Element], file_path: str | None = None + ) -> str: + raise NotImplementedError() diff --git a/libs/megaparse/src/megaparse/formatter/unstructured_formatter/md_formatter.py b/libs/megaparse/src/megaparse/formatter/unstructured_formatter/md_formatter.py new file mode 100644 index 0000000..893673b --- /dev/null +++ b/libs/megaparse/src/megaparse/formatter/unstructured_formatter/md_formatter.py @@ -0,0 +1,54 @@ +from typing import List + +from unstructured.documents.elements import Element + +from megaparse.formatter.unstructured_formatter import UnstructuredFormatter + + +class MarkDownFormatter(UnstructuredFormatter): + async def format_elements( + self, elements: List[Element], file_path: str | None = None + ) -> str: + print("Formatting elements using MarkDownFormatter...") + markdown_content = "" + + for el in elements: + markdown_content += self.get_markdown_line(el.to_dict()) + + return markdown_content + + def get_markdown_line(self, el: dict): + element_type = el["type"] + text = el["text"] + metadata = el["metadata"] + parent_id = metadata.get("parent_id", None) + category_depth = metadata.get("category_depth", 0) + # table_stack = [] + + if "emphasized_text_contents" in metadata: + print(metadata["emphasized_text_contents"]) + + # Markdown line defaults to empty + markdown_line = "" + + # Element type-specific markdown content + markdown_types = { + "Title": f"## {text}\n\n" if parent_id else f"# {text}\n\n", + "Subtitle": f"## {text}\n\n", + "Header": f"{'#' * (category_depth + 1)} {text}\n\n", + "Footer": f"#### {text}\n\n", + "NarrativeText": f"{text}\n\n", + "ListItem": f"- {text}\n", + "Table": f"{text}\n\n", + "PageBreak": "---\n\n", + "Image": f"![Image]({el['metadata'].get('image_path', '')})\n\n", + "Formula": f"$$ {text} $$\n\n", + "FigureCaption": f"**Figure:** {text}\n\n", + "Address": f"**Address:** {text}\n\n", + "EmailAddress": f"**Email:** {text}\n\n", + "CodeSnippet": f"```{el['metadata'].get('language', '')}\n{text}\n```\n\n", + "PageNumber": "", # Page number is not included in markdown + } + + markdown_line = markdown_types.get(element_type, f"{text}\n\n") + return markdown_line diff --git a/libs/megaparse/src/megaparse/megaparse.py b/libs/megaparse/src/megaparse/megaparse.py index deebd9c..3e8cd4b 100644 --- a/libs/megaparse/src/megaparse/megaparse.py +++ b/libs/megaparse/src/megaparse/megaparse.py @@ -1,12 +1,13 @@ import asyncio import os from pathlib import Path -from typing import IO +from typing import IO, List from megaparse_sdk.schema.extensions import FileExtension +from unstructured.documents.elements import Element -from megaparse.checker.format_checker import FormatChecker from megaparse.exceptions.base import ParsingException +from megaparse.formatter.base import BaseFormatter from megaparse.parser.base import BaseParser from megaparse.parser.unstructured_parser import UnstructuredParser @@ -15,11 +16,10 @@ class MegaParse: def __init__( self, parser: BaseParser = UnstructuredParser(), - format_checker: FormatChecker | None = None, + formatters: List[BaseFormatter] | None = None, ) -> None: self.parser = parser - self.format_checker = format_checker - self.last_parsed_document: str = "" + self.formatters = formatters async def aload( self, @@ -48,8 +48,9 @@ async def aload( except ValueError: raise ValueError(f"Unsupported file extension: {file_extension}") + # FIXME: Parsers and formatters should have their own supported file extensions if file_extension != ".pdf": - if self.format_checker: + if self.formatters: raise ValueError( f"Format Checker : Unsupported file extension: {file_extension}" ) @@ -59,17 +60,16 @@ async def aload( ) try: - parsed_document: str = await self.parser.convert( - file_path=file_path, file=file - ) + parsed_document = await self.parser.convert(file_path=file_path, file=file) # @chloe FIXME: format_checker needs unstructured Elements as input which is to change - # if self.format_checker: - # parsed_document: str = await self.format_checker.check(parsed_document) + if self.formatters: + for formatter in self.formatters: + parsed_document = await formatter.format(parsed_document) except Exception as e: raise ParsingException(f"Error while parsing {file_path}: {e}") - - self.last_parsed_document = parsed_document + if not isinstance(parsed_document, str): + raise ValueError("The parser or the last formatter should return a string") return parsed_document def load(self, file_path: Path | str) -> str: @@ -78,7 +78,7 @@ def load(self, file_path: Path | str) -> str: file_extension: str = file_path.suffix if file_extension != ".pdf": - if self.format_checker: + if self.formatters: raise ValueError( f"Format Checker : Unsupported file extension: {file_extension}" ) @@ -89,22 +89,17 @@ def load(self, file_path: Path | str) -> str: try: loop = asyncio.get_event_loop() - parsed_document: str = loop.run_until_complete( - self.parser.convert(file_path) - ) + parsed_document = loop.run_until_complete(self.parser.convert(file_path)) # @chloe FIXME: format_checker needs unstructured Elements as input which is to change - # if self.format_checker: - # parsed_document: str = loop.run_until_complete( - # self.format_checker.check(parsed_document) - # ) + if self.formatters: + for formatter in self.formatters: + parsed_document = loop.run_until_complete( + formatter.format(parsed_document) + ) except Exception as e: raise ValueError(f"Error while parsing {file_path}: {e}") - self.last_parsed_document = parsed_document + if not isinstance(parsed_document, str): + raise ValueError("The parser or the last formatter should return a string") return parsed_document - - def save(self, file_path: Path | str) -> None: - os.makedirs(os.path.dirname(file_path), exist_ok=True) - with open(file_path, "w+") as f: - f.write(self.last_parsed_document) diff --git a/libs/megaparse/src/megaparse/parser/base.py b/libs/megaparse/src/megaparse/parser/base.py index 4c85244..0da4a98 100644 --- a/libs/megaparse/src/megaparse/parser/base.py +++ b/libs/megaparse/src/megaparse/parser/base.py @@ -1,6 +1,8 @@ from abc import ABC, abstractmethod from pathlib import Path -from typing import IO +from typing import IO, List + +from unstructured.documents.elements import Element class BaseParser(ABC): @@ -12,9 +14,9 @@ async def convert( file_path: str | Path | None = None, file: IO[bytes] | None = None, **kwargs, - ) -> str: + ) -> List[Element] | str: """ - Convert the given file to a specific format. + Convert the given file to the unstructured format. Args: file_path (str | Path): The path to the file to be converted. diff --git a/libs/megaparse/src/megaparse/parser/unstructured_parser.py b/libs/megaparse/src/megaparse/parser/unstructured_parser.py index 38e04f3..dd263b3 100644 --- a/libs/megaparse/src/megaparse/parser/unstructured_parser.py +++ b/libs/megaparse/src/megaparse/parser/unstructured_parser.py @@ -1,11 +1,11 @@ import re from pathlib import Path -from typing import IO +from typing import IO, List from dotenv import load_dotenv from langchain_core.language_models.chat_models import BaseChatModel -from langchain_core.prompts import ChatPromptTemplate from megaparse_sdk.schema.parser_config import StrategyEnum +from unstructured.documents.elements import Element from unstructured.partition.auto import partition from megaparse.parser import BaseParser @@ -20,90 +20,12 @@ def __init__( self.strategy = strategy self.model = model - # Function to convert element category to markdown format - def convert_to_markdown(self, elements): - markdown_content = "" - - for el in elements: - markdown_content += self.get_markdown_line(el) - - return markdown_content - - def get_markdown_line(self, el: dict): - element_type = el["type"] - text = el["text"] - metadata = el["metadata"] - parent_id = metadata.get("parent_id", None) - category_depth = metadata.get("category_depth", 0) - table_stack = [] # type: ignore - - # Markdown line defaults to empty - markdown_line = "" - - # Element type-specific markdown content - markdown_types = { - "Title": f"## {text}\n\n" if parent_id else f"# {text}\n\n", - "Subtitle": f"## {text}\n\n", - "Header": f"{'#' * (category_depth + 1)} {text}\n\n", - "Footer": f"#### {text}\n\n", - "NarrativeText": f"{text}\n\n", - "ListItem": f"- {text}\n", - "Table": f"{text}\n\n", - "PageBreak": "---\n\n", - "Image": f"![Image]({el['metadata'].get('image_path', '')})\n\n", - "Formula": f"$$ {text} $$\n\n", - "FigureCaption": f"**Figure:** {text}\n\n", - "Address": f"**Address:** {text}\n\n", - "EmailAddress": f"**Email:** {text}\n\n", - "CodeSnippet": f"```{el['metadata'].get('language', '')}\n{text}\n```\n\n", - "PageNumber": "", # Page number is not included in markdown - } - - markdown_line = markdown_types.get(element_type, f"{text}\n\n") - - if element_type == "Table" and self.model: - # FIXME: @ChloĆ© - Add a modular table enhancement here - LVM - prompt = ChatPromptTemplate.from_messages( - [ - ( - "human", - """You are an expert in markdown tables, match this text and this html table to fill a md table. You answer with just the table in pure markdown, nothing else. - - {text} - - - {html} - - - {previous_table} - """, - ), - ] - ) - chain = prompt | self.model - result = chain.invoke( - { - "text": el["text"], - "html": metadata["text_as_html"], - "previous_table": table_stack[-1] if table_stack else "", - } - ) - content_str = ( - str(result.content) - if not isinstance(result.content, str) - else result.content - ) - cleaned_content = re.sub(r"^```.*$\n?", "", content_str, flags=re.MULTILINE) - markdown_line = f"[TABLE]\n{cleaned_content}\n[/TABLE]\n\n" - - return markdown_line - async def convert( self, file_path: str | Path | None = None, file: IO[bytes] | None = None, **kwargs, - ) -> str: + ) -> List[Element]: # Partition the PDF elements = partition( filename=str(file_path) if file_path else None, @@ -111,6 +33,4 @@ async def convert( strategy=self.strategy, skip_infer_table_types=[], ) - elements_dict = [el.to_dict() for el in elements] - markdown_content = self.convert_to_markdown(elements_dict) - return markdown_content + return elements From 5b63dc6e13cb2ae3e85eed12c9081a4d1550ef5b Mon Sep 17 00:00:00 2001 From: chloedia Date: Tue, 10 Dec 2024 00:00:22 +0100 Subject: [PATCH 02/17] add: example file --- .../megaparse/src/megaparse/examples/parse_file.py | 14 ++++++++++++++ .../formatter/structured_output/__init__.py | 11 +++++++++++ .../table_formatter/llm_table_formatter.py | 3 +-- .../table_formatter/vision_table_formatter.py | 3 +-- .../formatter/unstructured_formatter/__init__.py | 3 +-- libs/megaparse/src/megaparse/megaparse.py | 2 +- 6 files changed, 29 insertions(+), 7 deletions(-) create mode 100644 libs/megaparse/src/megaparse/examples/parse_file.py create mode 100644 libs/megaparse/src/megaparse/formatter/structured_output/__init__.py diff --git a/libs/megaparse/src/megaparse/examples/parse_file.py b/libs/megaparse/src/megaparse/examples/parse_file.py new file mode 100644 index 0000000..b728824 --- /dev/null +++ b/libs/megaparse/src/megaparse/examples/parse_file.py @@ -0,0 +1,14 @@ +from megaparse.formatter.unstructured_formatter.md_formatter import MarkDownFormatter +from megaparse.megaparse import MegaParse +from megaparse.parser.unstructured_parser import UnstructuredParser + +if __name__ == "__main__": + # Parse a file + parser = UnstructuredParser() + formatter = MarkDownFormatter() + + megaparse = MegaParse(parser=parser, formatters=[formatter]) + + file_path = "libs/megaparse/tests/pdf/sample_pdf.pdf" + result = megaparse.load(file_path=file_path) + print(result) diff --git a/libs/megaparse/src/megaparse/formatter/structured_output/__init__.py b/libs/megaparse/src/megaparse/formatter/structured_output/__init__.py new file mode 100644 index 0000000..9152b58 --- /dev/null +++ b/libs/megaparse/src/megaparse/formatter/structured_output/__init__.py @@ -0,0 +1,11 @@ +# from typing import List + +# from megaparse.formatter.base import BaseFormatter +# from pydantic import BaseModel + + +# class StructuredFormatter(BaseFormatter): +# async def format_string( +# self, text: str, file_path: str | None = None, model: BaseModel | None = None +# ) -> BaseModel: +# raise NotImplementedError() diff --git a/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py b/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py index 415de9d..b9a8740 100644 --- a/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py +++ b/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py @@ -3,9 +3,8 @@ from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.prompts import ChatPromptTemplate -from unstructured.documents.elements import Element - from megaparse.formatter.table_formatter import TableFormatter +from unstructured.documents.elements import Element class SimpleMDTableFormatter(TableFormatter): diff --git a/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py b/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py index 91ec8df..62370f4 100644 --- a/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py +++ b/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py @@ -4,12 +4,11 @@ from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.messages import HumanMessage +from megaparse.formatter.table_formatter import TableFormatter from pdf2image import convert_from_path from PIL import Image from unstructured.documents.elements import Element -from megaparse.formatter.table_formatter import TableFormatter - TABLE_OCR_PROMPT = """ You are tasked with transcribing the content of a table into markdown format. Your goal is to create a well-structured, readable markdown table that accurately represents the original content while adding appropriate formatting. Answer uniquely with the parsed table. Do not include the fenced code blocks backticks. diff --git a/libs/megaparse/src/megaparse/formatter/unstructured_formatter/__init__.py b/libs/megaparse/src/megaparse/formatter/unstructured_formatter/__init__.py index b303273..c542476 100644 --- a/libs/megaparse/src/megaparse/formatter/unstructured_formatter/__init__.py +++ b/libs/megaparse/src/megaparse/formatter/unstructured_formatter/__init__.py @@ -1,8 +1,7 @@ from typing import List -from unstructured.documents.elements import Element - from megaparse.formatter.base import BaseFormatter +from unstructured.documents.elements import Element class UnstructuredFormatter(BaseFormatter): diff --git a/libs/megaparse/src/megaparse/megaparse.py b/libs/megaparse/src/megaparse/megaparse.py index 3e8cd4b..d8656ea 100644 --- a/libs/megaparse/src/megaparse/megaparse.py +++ b/libs/megaparse/src/megaparse/megaparse.py @@ -61,7 +61,7 @@ async def aload( try: parsed_document = await self.parser.convert(file_path=file_path, file=file) - # @chloe FIXME: format_checker needs unstructured Elements as input which is to change + # @chloe FIXME: format_checker needs unstructured Elements as input which is to change to a megaparse element if self.formatters: for formatter in self.formatters: parsed_document = await formatter.format(parsed_document) From eea6cfd1dc4c1fb863b8040b1e279871c1f0a8fa Mon Sep 17 00:00:00 2001 From: chloedia Date: Tue, 10 Dec 2024 00:48:38 +0100 Subject: [PATCH 03/17] add: structured output formatter --- .../src/megaparse/examples/parse_file.py | 19 ++++++++- .../structured_formatter/__init__.py | 16 +++++++ .../custom_structured_formatter.py | 42 +++++++++++++++++++ .../formatter/structured_output/__init__.py | 11 ----- 4 files changed, 75 insertions(+), 13 deletions(-) create mode 100644 libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py create mode 100644 libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py delete mode 100644 libs/megaparse/src/megaparse/formatter/structured_output/__init__.py diff --git a/libs/megaparse/src/megaparse/examples/parse_file.py b/libs/megaparse/src/megaparse/examples/parse_file.py index b728824..b10d811 100644 --- a/libs/megaparse/src/megaparse/examples/parse_file.py +++ b/libs/megaparse/src/megaparse/examples/parse_file.py @@ -1,13 +1,28 @@ from megaparse.formatter.unstructured_formatter.md_formatter import MarkDownFormatter from megaparse.megaparse import MegaParse +from megaparse.formatter.structured_formatter.custom_structured_formatter import ( + CustomStructuredFormatter, +) from megaparse.parser.unstructured_parser import UnstructuredParser +from langchain_openai import ChatOpenAI +from pydantic import BaseModel, Field + + +class MyCustomFormat(BaseModel): + title: str = Field(description="The title of the document.") + problem: str = Field(description="The problem statement.") + solution: str = Field(description="The solution statement.") + + if __name__ == "__main__": # Parse a file parser = UnstructuredParser() - formatter = MarkDownFormatter() + model = ChatOpenAI() + formatter_1 = MarkDownFormatter() + formatter_2 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat) - megaparse = MegaParse(parser=parser, formatters=[formatter]) + megaparse = MegaParse(parser=parser, formatters=[formatter_1, formatter_2]) file_path = "libs/megaparse/tests/pdf/sample_pdf.pdf" result = megaparse.load(file_path=file_path) diff --git a/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py b/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py new file mode 100644 index 0000000..c369a15 --- /dev/null +++ b/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py @@ -0,0 +1,16 @@ +from langchain_core.language_models.chat_models import BaseChatModel +from megaparse.formatter.base import BaseFormatter +from pydantic import BaseModel + + +class StructuredFormatter(BaseFormatter): + def __init__(self, model: BaseChatModel, output_model: type[BaseModel]): + super().__init__(model) + self.output_model = output_model + + async def format_string( + self, + text: str, + file_path: str | None = None, + ) -> str: # FIXME: Return a structured output of type BaseModel ? + raise NotImplementedError() diff --git a/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py b/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py new file mode 100644 index 0000000..c5a5a50 --- /dev/null +++ b/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py @@ -0,0 +1,42 @@ +from typing import Optional + +from langchain_core.language_models.chat_models import BaseChatModel +from megaparse.formatter.structured_formatter import StructuredFormatter +from pydantic import BaseModel + + +class CustomStructuredFormatter(StructuredFormatter): + async def format_string( + self, + text: str, + file_path: str | None = None, + ) -> str: + """ + Structure the file using an AI language model. + Args: + text: The text to format. + file_path: The file path of the text. + model: The AI language model to use for formatting. + Returns: + The structured text. + """ + if not self.model: + raise ValueError("A Model is needed to use the CustomStructuredFormatter.") + print("Formatting text using CustomStructuredFormatter...") + if len(text) < 0: + raise ValueError( + "A non empty text is needed to format text using CustomStructuredFormatter." + ) + if not self.output_model: + raise ValueError( + "An output model is needed to structure text using CustomStructuredFormatter." + ) + + structured_model = self.model.with_structured_output(self.output_model) # type: ignore + + formatted_text = structured_model.invoke( + f"Parse the text in a structured format: {text}" + ) + assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel." + + return formatted_text.model_dump_json() diff --git a/libs/megaparse/src/megaparse/formatter/structured_output/__init__.py b/libs/megaparse/src/megaparse/formatter/structured_output/__init__.py deleted file mode 100644 index 9152b58..0000000 --- a/libs/megaparse/src/megaparse/formatter/structured_output/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -# from typing import List - -# from megaparse.formatter.base import BaseFormatter -# from pydantic import BaseModel - - -# class StructuredFormatter(BaseFormatter): -# async def format_string( -# self, text: str, file_path: str | None = None, model: BaseModel | None = None -# ) -> BaseModel: -# raise NotImplementedError() From 7917ae9b43bf318af2a3473830a8e5dcb4d9645f Mon Sep 17 00:00:00 2001 From: chloedia Date: Mon, 6 Jan 2025 17:41:52 +0100 Subject: [PATCH 04/17] fix: all parsers outputs list of elements & compatibility formatters --- .../src/megaparse/examples/parse_file.py | 27 ++- .../megaparse/src/megaparse/formatter/base.py | 27 ++- .../structured_formatter/__init__.py | 9 +- .../custom_structured_formatter.py | 40 +++- .../formatter/table_formatter/__init__.py | 10 +- .../table_formatter/llm_table_formatter.py | 13 +- .../table_formatter/vision_table_formatter.py | 177 ++++++++++-------- .../unstructured_formatter/__init__.py | 7 +- .../unstructured_formatter/md_formatter.py | 16 +- libs/megaparse/src/megaparse/megaparse.py | 52 +---- .../src/megaparse/models/document.py | 15 ++ libs/megaparse/src/megaparse/parser/base.py | 7 +- .../src/megaparse/parser/doctr_parser.py | 75 +++++++- libs/megaparse/src/megaparse/parser/llama.py | 18 +- .../src/megaparse/parser/megaparse_vision.py | 14 +- .../megaparse/parser/unstructured_parser.py | 17 +- requirements-dev.lock | 3 +- requirements.lock | 3 +- 18 files changed, 363 insertions(+), 167 deletions(-) create mode 100644 libs/megaparse/src/megaparse/models/document.py diff --git a/libs/megaparse/src/megaparse/examples/parse_file.py b/libs/megaparse/src/megaparse/examples/parse_file.py index b10d811..ee66b65 100644 --- a/libs/megaparse/src/megaparse/examples/parse_file.py +++ b/libs/megaparse/src/megaparse/examples/parse_file.py @@ -1,13 +1,20 @@ -from megaparse.formatter.unstructured_formatter.md_formatter import MarkDownFormatter -from megaparse.megaparse import MegaParse +import asyncio + +from langchain_openai import ChatOpenAI from megaparse.formatter.structured_formatter.custom_structured_formatter import ( CustomStructuredFormatter, ) +from megaparse.formatter.unstructured_formatter.md_formatter import MarkDownFormatter +from megaparse.megaparse import MegaParse +from megaparse.parser.doctr_parser import DoctrParser from megaparse.parser.unstructured_parser import UnstructuredParser - -from langchain_openai import ChatOpenAI from pydantic import BaseModel, Field +from llama_parse import LlamaParse +from llama_parse.utils import Language, ResultType +from typing import List +from llama_index.core.schema import Document as LlamaDocument + class MyCustomFormat(BaseModel): title: str = Field(description="The title of the document.") @@ -15,15 +22,19 @@ class MyCustomFormat(BaseModel): solution: str = Field(description="The solution statement.") -if __name__ == "__main__": +def main(): # Parse a file - parser = UnstructuredParser() - model = ChatOpenAI() + parser = DoctrParser() + model = ChatOpenAI(name="gpt-4o") formatter_1 = MarkDownFormatter() formatter_2 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat) megaparse = MegaParse(parser=parser, formatters=[formatter_1, formatter_2]) - file_path = "libs/megaparse/tests/pdf/sample_pdf.pdf" + file_path = "./tests/pdf/sample_pdf.pdf" result = megaparse.load(file_path=file_path) print(result) + + +if __name__ == "__main__": + main() diff --git a/libs/megaparse/src/megaparse/formatter/base.py b/libs/megaparse/src/megaparse/formatter/base.py index 26fb759..8c26217 100644 --- a/libs/megaparse/src/megaparse/formatter/base.py +++ b/libs/megaparse/src/megaparse/formatter/base.py @@ -22,19 +22,36 @@ class BaseFormatter(ABC): def __init__(self, model: BaseChatModel | None = None): self.model = model - async def format( + def format( self, elements: Union[List[Element], str], file_path: str | None = None ) -> Union[List[Element], str]: if isinstance(elements, list): - return await self.format_elements(elements, file_path) - return await self.format_string(elements, file_path) + return self.format_elements(elements, file_path) + return self.format_string(elements, file_path) - async def format_elements( + async def aformat( + self, elements: Union[List[Element], str], file_path: str | None = None + ) -> Union[List[Element], str]: + if isinstance(elements, list): + return await self.aformat_elements(elements, file_path) + return await self.aformat_string(elements, file_path) + + def format_elements( + self, elements: List[Element], file_path: str | None = None + ) -> Union[List[Element], str]: + raise NotImplementedError("Subclasses should implement this method") + + async def aformat_elements( self, elements: List[Element], file_path: str | None = None ) -> Union[List[Element], str]: raise NotImplementedError("Subclasses should implement this method") - async def format_string( + def format_string( + self, text: str, file_path: str | None = None + ) -> Union[List[Element], str]: + raise NotImplementedError("Subclasses should implement this method") + + async def aformat_string( self, text: str, file_path: str | None = None ) -> Union[List[Element], str]: raise NotImplementedError("Subclasses should implement this method") diff --git a/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py b/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py index c369a15..2a95c6c 100644 --- a/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py +++ b/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py @@ -8,7 +8,14 @@ def __init__(self, model: BaseChatModel, output_model: type[BaseModel]): super().__init__(model) self.output_model = output_model - async def format_string( + async def aformat_string( + self, + text: str, + file_path: str | None = None, + ) -> str: # FIXME: Return a structured output of type BaseModel ? + raise NotImplementedError() + + def format_string( self, text: str, file_path: str | None = None, diff --git a/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py b/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py index c5a5a50..6041625 100644 --- a/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py +++ b/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py @@ -1,12 +1,9 @@ -from typing import Optional - -from langchain_core.language_models.chat_models import BaseChatModel from megaparse.formatter.structured_formatter import StructuredFormatter from pydantic import BaseModel class CustomStructuredFormatter(StructuredFormatter): - async def format_string( + def format_string( self, text: str, file_path: str | None = None, @@ -40,3 +37,38 @@ async def format_string( assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel." return formatted_text.model_dump_json() + + async def aformat_string( + self, + text: str, + file_path: str | None = None, + ) -> str: + """ + Asynchronously structure the file using an AI language model. + Args: + text: The text to format. + file_path: The file path of the text. + model: The AI language model to use for formatting. + Returns: + The structured text. + """ + if not self.model: + raise ValueError("A Model is needed to use the CustomStructuredFormatter.") + print("Formatting text using CustomStructuredFormatter...") + if len(text) < 0: + raise ValueError( + "A non empty text is needed to format text using CustomStructuredFormatter." + ) + if not self.output_model: + raise ValueError( + "An output model is needed to structure text using CustomStructuredFormatter." + ) + + structured_model = self.model.with_structured_output(self.output_model) # type: ignore + + formatted_text = await structured_model.ainvoke( + f"Parse the text in a structured format: {text}" + ) + assert isinstance(formatted_text, BaseModel), "Model output is not a BaseModel." + + return formatted_text.model_dump_json() diff --git a/libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py b/libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py index cfa905c..caaebf6 100644 --- a/libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py +++ b/libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py @@ -1,12 +1,16 @@ from typing import List -from unstructured.documents.elements import Element - from megaparse.formatter.base import BaseFormatter +from unstructured.documents.elements import Element class TableFormatter(BaseFormatter): - async def format_elements( + async def aformat_elements( + self, elements: List[Element], file_path: str | None = None + ) -> List[Element]: + raise NotImplementedError() + + def format_elements( self, elements: List[Element], file_path: str | None = None ) -> List[Element]: raise NotImplementedError() diff --git a/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py b/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py index b9a8740..b90a83c 100644 --- a/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py +++ b/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py @@ -1,5 +1,6 @@ import re from typing import List, Optional +import warnings from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.prompts import ChatPromptTemplate @@ -19,7 +20,17 @@ class SimpleMDTableFormatter(TableFormatter): def __init__(self, model: Optional[BaseChatModel] = None): super().__init__(model) - async def format_elements( + async def aformat_elements( + self, elements: List[Element], file_path: str | None = None + ) -> List[Element]: + warnings.warn( + "The SimpleMDTableFormatter is a sync formatter, please use the sync format method", + UserWarning, + stacklevel=2, + ) + return self.format_elements(elements, file_path) + + def format_elements( self, elements: List[Element], file_path: str | None = None ) -> List[Element]: """ diff --git a/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py b/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py index 62370f4..4762d76 100644 --- a/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py +++ b/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py @@ -12,7 +12,7 @@ TABLE_OCR_PROMPT = """ You are tasked with transcribing the content of a table into markdown format. Your goal is to create a well-structured, readable markdown table that accurately represents the original content while adding appropriate formatting. Answer uniquely with the parsed table. Do not include the fenced code blocks backticks. - """ +""" class VisionMDTableFormatter(TableFormatter): @@ -27,40 +27,111 @@ class VisionMDTableFormatter(TableFormatter): def __init__(self, model: Optional[BaseChatModel] = None): super().__init__(model) - async def format_elements( + def _crop_table_image(self, table_element: Element, file_path: str) -> str: + """ + Helper method to crop the table portion of the PDF page and convert it to a base64 string. + """ + assert ( + table_element.metadata.coordinates + ), "Table element must have coordinates." + coordinates = table_element.metadata.coordinates.points + page_number = table_element.metadata.page_number + assert page_number, "Table element must have a page number." + assert coordinates, "Table element must have coordinates." + + pages = convert_from_path(file_path) + + # Calculate the box for cropping + box = ( + min(coord[0] for coord in coordinates), + min(coord[1] for coord in coordinates), + max(coord[0] for coord in coordinates), + max(coord[1] for coord in coordinates), + ) + table_image = pages[page_number - 1].crop(box) + # Convert the cropped image to base64 + table_image64 = self.process_file([table_image])[0] + return table_image64 + + async def aformat_elements( self, elements: List[Element], file_path: str | None = None ) -> List[Element]: """ - Formats table elements within a list of elements. - Args: - elements: A list of Element objects. - Returns: - A list of Element objects with formatted tables. + Asynchronously formats table elements within a list of elements. """ if not self.model: raise ValueError("A Model is needed to use the VisionMDTableFormatter.") - print("Formatting tables using VisionMDTableFormatter...") + print("Formatting tables using VisionMDTableFormatter (async)...") assert ( file_path ), "A file path is needed to format tables using VisionMDTableFormatter." formatted_elements = [] - for element in elements: if element.category == "Table": - formatted_table = await self.format_table(element, file_path) + formatted_table = await self.aformat_table(element, file_path) formatted_elements.append(formatted_table) else: formatted_elements.append(element) + return formatted_elements + + def format_elements( + self, elements: List[Element], file_path: str | None = None + ) -> List[Element]: + """ + Synchronously formats table elements within a list of elements. + """ + if not self.model: + raise ValueError("A Model is needed to use the VisionMDTableFormatter.") + print("Formatting tables using VisionMDTableFormatter (sync)...") + assert ( + file_path + ), "A file path is needed to format tables using VisionMDTableFormatter." + formatted_elements = [] + for element in elements: + if element.category == "Table": + formatted_table = self.format_table(element, file_path) + formatted_elements.append(formatted_table) + else: + formatted_elements.append(element) return formatted_elements + async def aformat_table(self, table_element: Element, file_path: str) -> Element: + """ + Asynchronously formats a table element into Markdown format using a Vision Model. + """ + table_image64 = self._crop_table_image(table_element, file_path) + formatted_table = await self.avision_extract(table_image64) + + markdown_table = ( + f"{self.TABLE_MARKER_START}\n" + f"{formatted_table}\n" + f"{self.TABLE_MARKER_END}\n\n" + ) + # Replace the element's text with the formatted table text + table_element.text = markdown_table + return table_element + + def format_table(self, table_element: Element, file_path: str) -> Element: + """ + Synchronously formats a table element into Markdown format using a Vision Model. + """ + table_image64 = self._crop_table_image(table_element, file_path) + formatted_table = self.vision_extract(table_image64) + + markdown_table = ( + f"{self.TABLE_MARKER_START}\n" + f"{formatted_table}\n" + f"{self.TABLE_MARKER_END}\n\n" + ) + # Replace the element's text with the formatted table text + table_element.text = markdown_table + return table_element + def process_file(self, images: List[Image.Image], image_format="PNG") -> List[str]: """ - Process a PDF file and convert its pages to base64 encoded images. - :param file_path: Path to the PDF file - :param image_format: Format to save the images (default: PNG) - :return: List of base64 encoded images + Convert a list of PIL images to base64 encoded images. """ try: images_base64 = [] @@ -73,72 +144,32 @@ def process_file(self, images: List[Image.Image], image_format="PNG") -> List[st except Exception as e: raise ValueError(f"Error processing PDF file: {str(e)}") - async def format_table(self, table_element: Element, file_path: str) -> Element: + async def avision_extract(self, table_image: str) -> str: """ - Formats a table element into Markdown format usinf a Vision Model - Args: - table_element: An Element object representing a table. - previous_table: A string representing the previous table. - Returns: - An Element object with the formatted table. + Asynchronously send image data to the language model for processing. """ assert ( - table_element.metadata.coordinates - ), "Table element must have coordinates." - coordinates = table_element.metadata.coordinates.points - page_number = table_element.metadata.page_number - assert page_number, "Table element must have a page number." - assert coordinates, "Table element must have coordinates." - pages = convert_from_path(file_path) - - # Crop the file image to the table coordinates - # Convert coordinates to a tuple of four float values - box = ( - min( - coordinates[0][0], - coordinates[1][0], - coordinates[2][0], - coordinates[3][0], - ), - min( - coordinates[0][1], - coordinates[1][1], - coordinates[2][1], - coordinates[3][1], - ), - max( - coordinates[0][0], - coordinates[1][0], - coordinates[2][0], - coordinates[3][0], - ), - max( - coordinates[0][1], - coordinates[1][1], - coordinates[2][1], - coordinates[3][1], - ), - ) - table_image = pages[page_number - 1].crop(box) - table_image64 = self.process_file([table_image])[0] - formatted_table = await self.vision_extract(table_image64) + self.model + ), "A model is needed to use the VisionMDTableFormatter (async)." + image_prompt = { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{table_image}"}, + } - markdown_table = ( - f"{self.TABLE_MARKER_START}\n" - f"{formatted_table}\n" - f"{self.TABLE_MARKER_END}\n\n" + message = HumanMessage( + content=[ + {"type": "text", "text": TABLE_OCR_PROMPT}, + image_prompt, + ], ) - # Convert the table image to text - table_element.text = markdown_table - return table_element + response = await self.model.ainvoke([message]) + return str(response.content) - async def vision_extract(self, table_image) -> str: + def vision_extract(self, table_image: str) -> str: """ - Send images to the language model for processing. - :param images_data: List of base64 encoded images - :return: Processed content as a string + Synchronously send image data to the language model for processing. """ - assert self.model, "A model is needed to use the SimpleMDTableFormatter." + assert self.model, "A model is needed to use the VisionMDTableFormatter (sync)." image_prompt = { "type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{table_image}"}, @@ -150,5 +181,5 @@ async def vision_extract(self, table_image) -> str: image_prompt, ], ) - response = await self.model.ainvoke([message]) + response = self.model.invoke([message]) return str(response.content) diff --git a/libs/megaparse/src/megaparse/formatter/unstructured_formatter/__init__.py b/libs/megaparse/src/megaparse/formatter/unstructured_formatter/__init__.py index c542476..4b7396e 100644 --- a/libs/megaparse/src/megaparse/formatter/unstructured_formatter/__init__.py +++ b/libs/megaparse/src/megaparse/formatter/unstructured_formatter/__init__.py @@ -5,7 +5,12 @@ class UnstructuredFormatter(BaseFormatter): - async def format_elements( + async def aformat_elements( + self, elements: List[Element], file_path: str | None = None + ) -> str: + raise NotImplementedError() + + def format_elements( self, elements: List[Element], file_path: str | None = None ) -> str: raise NotImplementedError() diff --git a/libs/megaparse/src/megaparse/formatter/unstructured_formatter/md_formatter.py b/libs/megaparse/src/megaparse/formatter/unstructured_formatter/md_formatter.py index 893673b..4149187 100644 --- a/libs/megaparse/src/megaparse/formatter/unstructured_formatter/md_formatter.py +++ b/libs/megaparse/src/megaparse/formatter/unstructured_formatter/md_formatter.py @@ -1,12 +1,12 @@ +import warnings from typing import List -from unstructured.documents.elements import Element - from megaparse.formatter.unstructured_formatter import UnstructuredFormatter +from unstructured.documents.elements import Element class MarkDownFormatter(UnstructuredFormatter): - async def format_elements( + def format_elements( self, elements: List[Element], file_path: str | None = None ) -> str: print("Formatting elements using MarkDownFormatter...") @@ -17,6 +17,16 @@ async def format_elements( return markdown_content + async def aformat_elements( + self, elements: List[Element], file_path: str | None = None + ) -> str: + warnings.warn( + "The MarkDownFormatter is a sync formatter, please use the sync format method", + UserWarning, + stacklevel=2, + ) + return self.format_elements(elements, file_path) + def get_markdown_line(self, el: dict): element_type = el["type"] text = el["text"] diff --git a/libs/megaparse/src/megaparse/megaparse.py b/libs/megaparse/src/megaparse/megaparse.py index 2dadc82..c1979fe 100644 --- a/libs/megaparse/src/megaparse/megaparse.py +++ b/libs/megaparse/src/megaparse/megaparse.py @@ -2,15 +2,12 @@ import logging import os from pathlib import Path -from typing import IO, List - -from megaparse_sdk.schema.extensions import FileExtension -from unstructured.documents.elements import Element -from typing import IO, BinaryIO +from typing import IO, BinaryIO, List from megaparse_sdk.config import MegaParseConfig from megaparse_sdk.schema.extensions import FileExtension from megaparse_sdk.schema.parser_config import StrategyEnum +from unstructured.documents.elements import Element from megaparse.exceptions.base import ParsingException from megaparse.formatter.base import BaseFormatter @@ -31,14 +28,11 @@ def __init__( formatters: List[BaseFormatter] | None = None, ocr_parser: BaseParser = DoctrParser(), strategy: StrategyEnum = StrategyEnum.AUTO, - format_checker: FormatChecker | None = None, ) -> None: self.strategy = strategy self.parser = parser self.formatters = formatters self.ocr_parser = ocr_parser - self.format_checker = format_checker - self.last_parsed_document: str = "" def validate_input( self, @@ -70,12 +64,6 @@ def validate_input( file_extension = FileExtension(file_extension) except ValueError: raise ValueError(f"Unsupported file extension: {file_extension}") - - if file_extension != FileExtension.PDF: - if self.format_checker: - raise ValueError( - f"Format Checker : Unsupported file extension: {file_extension}" - ) return file_extension async def aload( @@ -92,7 +80,7 @@ async def aload( # @chloe FIXME: format_checker needs unstructured Elements as input which is to change to a megaparse element if self.formatters: for formatter in self.formatters: - parsed_document = await formatter.format(parsed_document) + parsed_document = await formatter.aformat(parsed_document) except Exception as e: raise ParsingException(f"Error while parsing {file_path}: {e}") @@ -100,21 +88,6 @@ async def aload( raise ValueError("The parser or the last formatter should return a string") return parsed_document - def load(self, file_path: Path | str) -> str: - if isinstance(file_path, str): - file_path = Path(file_path) - file_extension: str = file_path.suffix - - if file_extension != ".pdf": - if self.formatters: - raise ValueError( - f"Format Checker : Unsupported file extension: {file_extension}" - ) - if not isinstance(self.parser, UnstructuredParser): - raise ValueError( - f"Parser {self.parser}: Unsupported file extension: {file_extension}" - ) - def load( self, file_path: Path | str | None = None, @@ -125,21 +98,19 @@ def load( file=file, file_path=file_path, file_extension=file_extension ) try: - parsed_document = self.parser.convert(file_path) - # @chloe FIXME: format_checker needs unstructured Elements as input which is to change - if self.formatters: - for formatter in self.formatters: - parsed_document = formatter.format(parsed_document) - parser = self._select_parser(file_path, file, file_extension) logger.info(f"Parsing using {parser.__class__.__name__} parser.") parsed_document = parser.convert( file_path=file_path, file=file, file_extension=file_extension ) + # @chloe FIXME: format_checker needs unstructured Elements as input which is to change + if self.formatters: + for formatter in self.formatters: + parsed_document = formatter.format(parsed_document) + # @chloe FIXME: format_checker needs unstructured Elements as input which is to change # if self.format_checker: - # parsed_document: str = await self.format_checker.check(parsed_document - self.last_parsed_document = parsed_document + # parsed_document: str = self.format_checker.check(parsed_document) if not isinstance(parsed_document, str): raise ValueError( "The parser or the last formatter should return a string" @@ -175,8 +146,3 @@ def _select_parser( if local_strategy == StrategyEnum.HI_RES: return self.ocr_parser return self.parser - - def save(self, file_path: Path | str) -> None: - os.makedirs(os.path.dirname(file_path), exist_ok=True) - with open(file_path, "w+") as f: - f.write(self.last_parsed_document) diff --git a/libs/megaparse/src/megaparse/models/document.py b/libs/megaparse/src/megaparse/models/document.py new file mode 100644 index 0000000..42d7552 --- /dev/null +++ b/libs/megaparse/src/megaparse/models/document.py @@ -0,0 +1,15 @@ +from typing import Dict, List + +from pydantic import BaseModel +from unstructured.documents.elements import Element + + +class Document(BaseModel): + """ + A class to represent a document. + Really Simplified. + """ + + name: str + metadata: Dict + content: List[Element] diff --git a/libs/megaparse/src/megaparse/parser/base.py b/libs/megaparse/src/megaparse/parser/base.py index 3e7c2f0..0f6a283 100644 --- a/libs/megaparse/src/megaparse/parser/base.py +++ b/libs/megaparse/src/megaparse/parser/base.py @@ -2,9 +2,8 @@ from pathlib import Path from typing import IO, List -from unstructured.documents.elements import Element - from megaparse_sdk.schema.extensions import FileExtension +from unstructured.documents.elements import Element class BaseParser(ABC): @@ -34,7 +33,7 @@ async def aconvert( file: IO[bytes] | None = None, file_extension: FileExtension | None = None, **kwargs, - ) -> str: + ) -> List[Element]: """ Convert the given file to a specific format. @@ -57,7 +56,7 @@ def convert( file: IO[bytes] | None = None, file_extension: FileExtension | None = None, **kwargs, - ) -> List[Element] | str: + ) -> List[Element]: """ Convert the given file to the unstructured format. diff --git a/libs/megaparse/src/megaparse/parser/doctr_parser.py b/libs/megaparse/src/megaparse/parser/doctr_parser.py index c009732..1dbd5d9 100644 --- a/libs/megaparse/src/megaparse/parser/doctr_parser.py +++ b/libs/megaparse/src/megaparse/parser/doctr_parser.py @@ -5,9 +5,17 @@ import onnxruntime as rt from megaparse_sdk.schema.extensions import FileExtension -from onnxtr.io import DocumentFile +from onnxtr.io import Document, DocumentFile from onnxtr.models import ocr_predictor from onnxtr.models.engine import EngineConfig +from unstructured.documents.coordinates import RelativeCoordinateSystem +from unstructured.documents.elements import ( + Element, + ElementMetadata, + Image, + PageBreak, + Text, +) from megaparse.parser.base import BaseParser @@ -27,7 +35,7 @@ def __init__( straighten_pages: bool = False, use_gpu: bool = False, **kwargs, - ): + ) -> None: self.use_gpu = use_gpu general_options = rt.SessionOptions() providers = self._get_providers() @@ -69,7 +77,7 @@ def convert( file: IO[bytes] | BinaryIO | None = None, file_extension: None | FileExtension = None, **kwargs, - ) -> str: + ) -> List[Element]: if file: file.seek(0) pdf = file.read() @@ -82,8 +90,9 @@ def convert( doc = DocumentFile.from_pdf(pdf) # Analyze - result = self.predictor(doc) - return result.render() + doctr_result = self.predictor(doc) + + return self.__to_elements_list__(doctr_result) async def aconvert( self, @@ -91,10 +100,62 @@ async def aconvert( file: IO[bytes] | BinaryIO | None = None, file_extension: None | FileExtension = None, **kwargs, - ) -> str: + ) -> List[Element]: warnings.warn( - "The UnstructuredParser is a sync parser, please use the sync convert method", + "The DocTRParser is a sync parser, please use the sync convert method", UserWarning, stacklevel=2, ) return self.convert(file_path, file, file_extension, **kwargs) + + def __to_elements_list__(self, doctr_document: Document) -> List[Element]: + result = [] + + for page in doctr_document.pages: + for block in page.blocks: + if len(block.lines) and len(block.artefacts) > 0: + raise ValueError( + "Block should not contain both lines and artefacts" + ) + word_coordinates = [ + word.geometry for line in block.lines for word in line.words + ] + x0 = min(word[0][0] for word in word_coordinates) + y0 = min(word[0][1] for word in word_coordinates) + x1 = max(word[1][0] for word in word_coordinates) + y1 = max(word[1][1] for word in word_coordinates) + + result.append( + Text( + text=block.render(), + coordinates=( + (x0, y0), + (x1, y0), + (x1, y1), + (x0, y1), + ), + coordinate_system=RelativeCoordinateSystem(), + metadata=ElementMetadata(), + detection_origin="doctr", + ) + ) + + for artefact in block.artefacts: + result.append( + Image( + text="", + coordinates=( + (artefact.geometry[0][0], artefact.geometry[0][1]), + (artefact.geometry[1][0], artefact.geometry[0][1]), + (artefact.geometry[1][0], artefact.geometry[1][1]), + (artefact.geometry[0][0], artefact.geometry[1][1]), + ), + coordinate_system=RelativeCoordinateSystem(), + metadata=ElementMetadata(), + detection_origin="doctr", + ) + ) + + result.append(PageBreak(text="")) + + return result diff --git a/libs/megaparse/src/megaparse/parser/llama.py b/libs/megaparse/src/megaparse/parser/llama.py index 9cb0d8c..695ed6a 100644 --- a/libs/megaparse/src/megaparse/parser/llama.py +++ b/libs/megaparse/src/megaparse/parser/llama.py @@ -1,4 +1,3 @@ -import asyncio from pathlib import Path from typing import IO, List @@ -6,6 +5,10 @@ from llama_parse import LlamaParse as _LlamaParse from llama_parse.utils import Language, ResultType from megaparse_sdk.schema.extensions import FileExtension +from unstructured.documents.elements import ( + Element, + Text, +) from megaparse.parser import BaseParser @@ -36,7 +39,7 @@ async def aconvert( file: IO[bytes] | None = None, file_extension: None | FileExtension = None, **kwargs, - ) -> str: + ) -> List[Element]: if not file_path: raise ValueError("File_path should be provided to run LlamaParser") self.check_supported_extension(file_extension, file_path) @@ -56,7 +59,7 @@ async def aconvert( text_content = document.text parsed_md = parsed_md + text_content - return parsed_md + return self.__to_elements_list__(parsed_md) def convert( self, @@ -64,14 +67,14 @@ def convert( file: IO[bytes] | None = None, file_extension: None | FileExtension = None, **kwargs, - ) -> str: + ) -> List[Element]: if not file_path: raise ValueError("File_path should be provided to run LlamaParser") self.check_supported_extension(file_extension, file_path) llama_parser = _LlamaParse( api_key=self.api_key, - result_type=ResultType.MD, + result_type=ResultType.JSON, gpt4o_mode=True, verbose=self.verbose, language=self.language, @@ -84,4 +87,7 @@ def convert( text_content = document.text parsed_md = parsed_md + text_content - return parsed_md + return self.__to_elements_list__(parsed_md) + + def __to_elements_list__(self, llama_doc: str) -> List[Element]: + return [Text(text=llama_doc)] diff --git a/libs/megaparse/src/megaparse/parser/megaparse_vision.py b/libs/megaparse/src/megaparse/parser/megaparse_vision.py index 0b05e73..3516870 100644 --- a/libs/megaparse/src/megaparse/parser/megaparse_vision.py +++ b/libs/megaparse/src/megaparse/parser/megaparse_vision.py @@ -3,12 +3,13 @@ import re from io import BytesIO from pathlib import Path -from typing import IO, List, Union +from typing import IO, List from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.messages import HumanMessage from megaparse_sdk.schema.extensions import FileExtension from pdf2image import convert_from_path +from unstructured.documents.elements import Element, Text from megaparse.parser import BaseParser from megaparse.parser.entity import SupportedModel, TagEnum @@ -147,7 +148,7 @@ async def aconvert( file_extension: FileExtension | None = None, batch_size: int = 3, **kwargs, - ) -> str: + ) -> List[Element]: """ Parse a PDF file and process its content using the language model. @@ -170,7 +171,7 @@ async def aconvert( ] self.parsed_chunks = await asyncio.gather(*tasks) responses = self.get_cleaned_content("\n".join(self.parsed_chunks)) - return responses + return self.__to_elements_list__(responses) def convert( self, @@ -179,7 +180,7 @@ def convert( file_extension: FileExtension | None = None, batch_size: int = 3, **kwargs, - ) -> str: + ) -> List[Element]: """ Parse a PDF file and process its content using the language model. @@ -205,7 +206,7 @@ def convert( response = self.send_to_mlm(chunk) self.parsed_chunks.append(response) responses = self.get_cleaned_content("\n".join(self.parsed_chunks)) - return responses + return self.__to_elements_list__(responses) def get_cleaned_content(self, parsed_file: str) -> str: """ @@ -245,3 +246,6 @@ def remove_tag(match): cleaned_content = cleaned_content.strip() return cleaned_content + + def __to_elements_list__(self, mpv_doc: str) -> List[Element]: + return [Text(text=mpv_doc)] diff --git a/libs/megaparse/src/megaparse/parser/unstructured_parser.py b/libs/megaparse/src/megaparse/parser/unstructured_parser.py index b47a93a..cc3f815 100644 --- a/libs/megaparse/src/megaparse/parser/unstructured_parser.py +++ b/libs/megaparse/src/megaparse/parser/unstructured_parser.py @@ -38,7 +38,7 @@ def __init__( self.strategy = strategy self.model = model - async def convert( + def convert( self, file_path: str | Path | None = None, file: IO[bytes] | None = None, @@ -53,3 +53,18 @@ async def convert( content_type=file_extension.mimetype if file_extension else None, ) return elements + + async def aconvert( + self, + file_path: str | Path | None = None, + file: IO[bytes] | None = None, + file_extension: FileExtension | None = None, + **kwargs, + ) -> List[Element]: + self.check_supported_extension(file_extension, file_path) + warnings.warn( + "The UnstructuredParser is a sync parser, please use the sync convert method", + UserWarning, + stacklevel=2, + ) + return self.convert(file_path, file, file_extension, **kwargs) diff --git a/requirements-dev.lock b/requirements-dev.lock index 05ce254..f1246a0 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -255,7 +255,7 @@ layoutparser==0.3.4 # via unstructured-inference llama-index-core==0.12.0 # via llama-parse -llama-parse==0.5.14 +llama-parse==0.5.19 # via megaparse loguru==0.7.2 # via megaparse-sdk @@ -495,6 +495,7 @@ pydantic==2.9.2 # via langchain-core # via langsmith # via llama-index-core + # via llama-parse # via openai # via pydantic-settings # via unstructured-client diff --git a/requirements.lock b/requirements.lock index e0720ab..f747b77 100644 --- a/requirements.lock +++ b/requirements.lock @@ -209,7 +209,7 @@ layoutparser==0.3.4 # via unstructured-inference llama-index-core==0.12.0 # via llama-parse -llama-parse==0.5.14 +llama-parse==0.5.19 # via megaparse loguru==0.7.2 # via megaparse-sdk @@ -413,6 +413,7 @@ pydantic==2.9.2 # via langchain-core # via langsmith # via llama-index-core + # via llama-parse # via openai # via pydantic-settings # via unstructured-client From 351b63af9269a743462034a3406d4b7f83cb7185 Mon Sep 17 00:00:00 2001 From: chloedia Date: Mon, 6 Jan 2025 18:01:06 +0100 Subject: [PATCH 05/17] feat: new basemodel for document --- .../src/megaparse/examples/parse_file.py | 9 +- .../src/megaparse/models/document.py | 87 ++++++++++++++++++- 2 files changed, 88 insertions(+), 8 deletions(-) diff --git a/libs/megaparse/src/megaparse/examples/parse_file.py b/libs/megaparse/src/megaparse/examples/parse_file.py index ee66b65..59596e0 100644 --- a/libs/megaparse/src/megaparse/examples/parse_file.py +++ b/libs/megaparse/src/megaparse/examples/parse_file.py @@ -1,6 +1,10 @@ import asyncio +from typing import List from langchain_openai import ChatOpenAI +from llama_index.core.schema import Document as LlamaDocument +from llama_parse import LlamaParse +from llama_parse.utils import Language, ResultType from megaparse.formatter.structured_formatter.custom_structured_formatter import ( CustomStructuredFormatter, ) @@ -10,11 +14,6 @@ from megaparse.parser.unstructured_parser import UnstructuredParser from pydantic import BaseModel, Field -from llama_parse import LlamaParse -from llama_parse.utils import Language, ResultType -from typing import List -from llama_index.core.schema import Document as LlamaDocument - class MyCustomFormat(BaseModel): title: str = Field(description="The title of the document.") diff --git a/libs/megaparse/src/megaparse/models/document.py b/libs/megaparse/src/megaparse/models/document.py index 42d7552..f3b9830 100644 --- a/libs/megaparse/src/megaparse/models/document.py +++ b/libs/megaparse/src/megaparse/models/document.py @@ -1,7 +1,88 @@ from typing import Dict, List from pydantic import BaseModel -from unstructured.documents.elements import Element + + +class Block(BaseModel): + """ + A class to represent a block. + Really Simplified. + """ + + metadata: Dict # FIXME: To be defined as a pydantic model later @Amine + content: str + + +class TextBlock(Block): + """ + A class to represent a text block. + Really Simplified. + """ + + pass + + +class ImageBlock(Block): + """ + A class to represent an image block. + Really Simplified. + """ + + pass + + +class TitleBlock(Block): + """ + A class to represent a title block. + Really Simplified. + """ + + pass + + +class SubTitle(Block): + """ + A class to represent a subtitle block. + Really Simplified. + """ + + depth: int + + +class TableBlock(Block): + """ + A class to represent a table block. + Really Simplified. + """ + + pass + + +class ListBlock(Block): + """ + A class to represent a list block. + Really Simplified. + """ + + pass + + +class HeaderBlock(Block): + """ + A class to represent a header block. + Really Simplified. + """ + + pass + + +class FooterBlock(Block): + """ + A class to represent a footer block. + Really Simplified. + """ + + pass class Document(BaseModel): @@ -11,5 +92,5 @@ class Document(BaseModel): """ name: str - metadata: Dict - content: List[Element] + metadata: Dict # TBD @Amine + content: List[Block] From 52e2c028a01a2dcbd64f14e9d913085b9dafeb23 Mon Sep 17 00:00:00 2001 From: chloedia Date: Tue, 7 Jan 2025 19:35:05 +0100 Subject: [PATCH 06/17] add: structured output --- .../src/megaparse/examples/parse_file.py | 6 +- .../megaparse/src/megaparse/formatter/base.py | 38 +-- .../structured_formatter/__init__.py | 14 +- .../custom_structured_formatter.py | 17 +- .../formatter/table_formatter/__init__.py | 21 +- .../table_formatter/llm_table_formatter.py | 40 +-- .../table_formatter/vision_table_formatter.py | 76 +++-- .../unstructured_formatter/__init__.py | 16 - .../unstructured_formatter/md_formatter.py | 64 ---- libs/megaparse/src/megaparse/megaparse.py | 31 +- .../src/megaparse/models/document.py | 192 +++++++++--- libs/megaparse/src/megaparse/parser/base.py | 13 +- .../src/megaparse/parser/doctr_parser.py | 65 ++-- libs/megaparse/src/megaparse/parser/llama.py | 43 +-- .../src/megaparse/parser/megaparse_vision.py | 30 +- .../megaparse/parser/unstructured_parser.py | 293 +++++++++++++++++- libs/megaparse/tests/test_parsers.py | 2 +- 17 files changed, 646 insertions(+), 315 deletions(-) delete mode 100644 libs/megaparse/src/megaparse/formatter/unstructured_formatter/__init__.py delete mode 100644 libs/megaparse/src/megaparse/formatter/unstructured_formatter/md_formatter.py diff --git a/libs/megaparse/src/megaparse/examples/parse_file.py b/libs/megaparse/src/megaparse/examples/parse_file.py index 59596e0..5a37d29 100644 --- a/libs/megaparse/src/megaparse/examples/parse_file.py +++ b/libs/megaparse/src/megaparse/examples/parse_file.py @@ -8,7 +8,6 @@ from megaparse.formatter.structured_formatter.custom_structured_formatter import ( CustomStructuredFormatter, ) -from megaparse.formatter.unstructured_formatter.md_formatter import MarkDownFormatter from megaparse.megaparse import MegaParse from megaparse.parser.doctr_parser import DoctrParser from megaparse.parser.unstructured_parser import UnstructuredParser @@ -25,10 +24,9 @@ def main(): # Parse a file parser = DoctrParser() model = ChatOpenAI(name="gpt-4o") - formatter_1 = MarkDownFormatter() - formatter_2 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat) + formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat) - megaparse = MegaParse(parser=parser, formatters=[formatter_1, formatter_2]) + megaparse = MegaParse(parser=parser) file_path = "./tests/pdf/sample_pdf.pdf" result = megaparse.load(file_path=file_path) diff --git a/libs/megaparse/src/megaparse/formatter/base.py b/libs/megaparse/src/megaparse/formatter/base.py index 8c26217..7243e80 100644 --- a/libs/megaparse/src/megaparse/formatter/base.py +++ b/libs/megaparse/src/megaparse/formatter/base.py @@ -1,11 +1,11 @@ from abc import ABC +from pathlib import Path from typing import List, Union from langchain_core.language_models.chat_models import BaseChatModel -from unstructured.documents.elements import Element +from megaparse.models.document import Document -# TODO: Implement the Formatter class @Chloe class BaseFormatter(ABC): """ A class used to improve the layout of elements, particularly focusing on converting HTML tables to markdown tables. @@ -23,35 +23,11 @@ def __init__(self, model: BaseChatModel | None = None): self.model = model def format( - self, elements: Union[List[Element], str], file_path: str | None = None - ) -> Union[List[Element], str]: - if isinstance(elements, list): - return self.format_elements(elements, file_path) - return self.format_string(elements, file_path) - - async def aformat( - self, elements: Union[List[Element], str], file_path: str | None = None - ) -> Union[List[Element], str]: - if isinstance(elements, list): - return await self.aformat_elements(elements, file_path) - return await self.aformat_string(elements, file_path) - - def format_elements( - self, elements: List[Element], file_path: str | None = None - ) -> Union[List[Element], str]: - raise NotImplementedError("Subclasses should implement this method") - - async def aformat_elements( - self, elements: List[Element], file_path: str | None = None - ) -> Union[List[Element], str]: - raise NotImplementedError("Subclasses should implement this method") - - def format_string( - self, text: str, file_path: str | None = None - ) -> Union[List[Element], str]: + self, document: Document, file_path: Path | str | None = None + ) -> Union[Document, str]: raise NotImplementedError("Subclasses should implement this method") - async def aformat_string( - self, text: str, file_path: str | None = None - ) -> Union[List[Element], str]: + async def aformat( + self, document: Document, file_path: Path | str | None = None + ) -> Union[Document, str]: raise NotImplementedError("Subclasses should implement this method") diff --git a/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py b/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py index 2a95c6c..dba1089 100644 --- a/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py +++ b/libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py @@ -1,5 +1,7 @@ +from pathlib import Path from langchain_core.language_models.chat_models import BaseChatModel from megaparse.formatter.base import BaseFormatter +from megaparse.models.document import Document from pydantic import BaseModel @@ -8,16 +10,16 @@ def __init__(self, model: BaseChatModel, output_model: type[BaseModel]): super().__init__(model) self.output_model = output_model - async def aformat_string( + async def aformat( self, - text: str, - file_path: str | None = None, + document: Document, + file_path: Path | str | None = None, ) -> str: # FIXME: Return a structured output of type BaseModel ? raise NotImplementedError() - def format_string( + def format( self, - text: str, - file_path: str | None = None, + document: Document, + file_path: Path | str | None = None, ) -> str: # FIXME: Return a structured output of type BaseModel ? raise NotImplementedError() diff --git a/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py b/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py index 6041625..858253d 100644 --- a/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py +++ b/libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py @@ -1,12 +1,14 @@ +from pathlib import Path from megaparse.formatter.structured_formatter import StructuredFormatter +from megaparse.models.document import Document from pydantic import BaseModel class CustomStructuredFormatter(StructuredFormatter): - def format_string( + def format( self, - text: str, - file_path: str | None = None, + document: Document, + file_path: Path | str | None = None, ) -> str: """ Structure the file using an AI language model. @@ -20,6 +22,7 @@ def format_string( if not self.model: raise ValueError("A Model is needed to use the CustomStructuredFormatter.") print("Formatting text using CustomStructuredFormatter...") + text = str(document) if len(text) < 0: raise ValueError( "A non empty text is needed to format text using CustomStructuredFormatter." @@ -38,10 +41,10 @@ def format_string( return formatted_text.model_dump_json() - async def aformat_string( + async def aformat( self, - text: str, - file_path: str | None = None, + document: Document, + file_path: Path | str | None = None, ) -> str: """ Asynchronously structure the file using an AI language model. @@ -55,6 +58,8 @@ async def aformat_string( if not self.model: raise ValueError("A Model is needed to use the CustomStructuredFormatter.") print("Formatting text using CustomStructuredFormatter...") + text = str(document) + if len(text) < 0: raise ValueError( "A non empty text is needed to format text using CustomStructuredFormatter." diff --git a/libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py b/libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py index caaebf6..9b28987 100644 --- a/libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py +++ b/libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py @@ -1,16 +1,17 @@ -from typing import List +from pathlib import Path +from typing import Union from megaparse.formatter.base import BaseFormatter -from unstructured.documents.elements import Element +from megaparse.models.document import Document class TableFormatter(BaseFormatter): - async def aformat_elements( - self, elements: List[Element], file_path: str | None = None - ) -> List[Element]: - raise NotImplementedError() + def format( + self, document: Document, file_path: Path | str | None = None + ) -> Document: + raise NotImplementedError("Subclasses should implement this method") - def format_elements( - self, elements: List[Element], file_path: str | None = None - ) -> List[Element]: - raise NotImplementedError() + async def aformat( + self, document: Document, file_path: Path | str | None = None + ) -> Document: + raise NotImplementedError("Subclasses should implement this method") diff --git a/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py b/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py index b90a83c..1c3eaea 100644 --- a/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py +++ b/libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py @@ -1,11 +1,12 @@ import re -from typing import List, Optional import warnings +from pathlib import Path +from typing import Optional from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.prompts import ChatPromptTemplate from megaparse.formatter.table_formatter import TableFormatter -from unstructured.documents.elements import Element +from megaparse.models.document import Document, TableBlock class SimpleMDTableFormatter(TableFormatter): @@ -20,19 +21,19 @@ class SimpleMDTableFormatter(TableFormatter): def __init__(self, model: Optional[BaseChatModel] = None): super().__init__(model) - async def aformat_elements( - self, elements: List[Element], file_path: str | None = None - ) -> List[Element]: + async def aformat( + self, document: Document, file_path: Path | str | None = None + ) -> Document: warnings.warn( "The SimpleMDTableFormatter is a sync formatter, please use the sync format method", UserWarning, stacklevel=2, ) - return self.format_elements(elements, file_path) + return self.format(document=document, file_path=file_path) - def format_elements( - self, elements: List[Element], file_path: str | None = None - ) -> List[Element]: + def format( + self, document: Document, file_path: Path | str | None = None + ) -> Document: """ Formats table elements within a list of elements. Args: @@ -46,18 +47,21 @@ def format_elements( table_stack = [] formatted_elements = [] - for element in elements: - if element.category == "Table": + for block in document.content: + if isinstance(block, TableBlock): previous_table = table_stack[-1] if table_stack else "" - formatted_table = self.format_table(element, previous_table) + formatted_table = self.format_table(block, previous_table) table_stack.append(formatted_table.text) formatted_elements.append(formatted_table) else: - formatted_elements.append(element) + formatted_elements.append(block) - return formatted_elements + document.content = formatted_elements + return document - def format_table(self, table_element: Element, previous_table: str) -> Element: + def format_table( + self, table_element: TableBlock, previous_table: str + ) -> TableBlock: """ Formats a single table element into Markdown using an AI language model. Args: @@ -73,10 +77,9 @@ def format_table(self, table_element: Element, previous_table: str) -> Element: ( "human", ( - "You are an expert in markdown tables. Match the following text and HTML table " - "to create a markdown table. Provide just the table in pure markdown, nothing else.\n" + "You are an expert in markdown tables. Transform the following parsed table into a " + "markdown table. Provide just the table in pure markdown, nothing else.\n" "\n{text}\n\n" - "\n{html}\n\n" "\n{previous_table}\n" ), ), @@ -87,7 +90,6 @@ def format_table(self, table_element: Element, previous_table: str) -> Element: result = chain.invoke( { "text": table_element.text, - "html": table_element.metadata.text_as_html, "previous_table": previous_table, } ) diff --git a/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py b/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py index 4762d76..e94d85b 100644 --- a/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py +++ b/libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py @@ -1,10 +1,12 @@ import base64 from io import BytesIO +from pathlib import Path from typing import List, Optional from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.messages import HumanMessage from megaparse.formatter.table_formatter import TableFormatter +from megaparse.models.document import Document, TableBlock from pdf2image import convert_from_path from PIL import Image from unstructured.documents.elements import Element @@ -27,35 +29,33 @@ class VisionMDTableFormatter(TableFormatter): def __init__(self, model: Optional[BaseChatModel] = None): super().__init__(model) - def _crop_table_image(self, table_element: Element, file_path: str) -> str: + def _crop_table_image(self, table_element: TableBlock, file_path: str) -> str: """ Helper method to crop the table portion of the PDF page and convert it to a base64 string. """ - assert ( - table_element.metadata.coordinates - ), "Table element must have coordinates." - coordinates = table_element.metadata.coordinates.points - page_number = table_element.metadata.page_number + assert table_element.bbox, "Table element must have coordinates." + bbox = table_element.bbox + page_number = table_element.page_range[0] assert page_number, "Table element must have a page number." - assert coordinates, "Table element must have coordinates." + assert bbox, "Table element must have coordinates." pages = convert_from_path(file_path) # Calculate the box for cropping box = ( - min(coord[0] for coord in coordinates), - min(coord[1] for coord in coordinates), - max(coord[0] for coord in coordinates), - max(coord[1] for coord in coordinates), + bbox.top_left.x, + bbox.top_left.y, + bbox.bottom_right.x, + bbox.bottom_right.y, ) table_image = pages[page_number - 1].crop(box) # Convert the cropped image to base64 table_image64 = self.process_file([table_image])[0] return table_image64 - async def aformat_elements( - self, elements: List[Element], file_path: str | None = None - ) -> List[Element]: + async def aformat( + self, document: Document, file_path: Path | str | None = None + ) -> Document: """ Asynchronously formats table elements within a list of elements. """ @@ -65,39 +65,47 @@ async def aformat_elements( assert ( file_path ), "A file path is needed to format tables using VisionMDTableFormatter." - + if not isinstance(file_path, str): + file_path = str(file_path) formatted_elements = [] - for element in elements: - if element.category == "Table": - formatted_table = await self.aformat_table(element, file_path) + for block in document.content: + if isinstance(block, TableBlock): + formatted_table = await self.aformat_table(block, file_path) formatted_elements.append(formatted_table) else: - formatted_elements.append(element) - return formatted_elements + formatted_elements.append(block) - def format_elements( - self, elements: List[Element], file_path: str | None = None - ) -> List[Element]: + document.content = formatted_elements + return document + + def format( + self, document: Document, file_path: Path | str | None = None + ) -> Document: """ - Synchronously formats table elements within a list of elements. + Asynchronously formats table elements within a list of elements. """ if not self.model: raise ValueError("A Model is needed to use the VisionMDTableFormatter.") - print("Formatting tables using VisionMDTableFormatter (sync)...") + print("Formatting tables using VisionMDTableFormatter (async)...") assert ( file_path ), "A file path is needed to format tables using VisionMDTableFormatter." - + if not isinstance(file_path, str): + file_path = str(file_path) formatted_elements = [] - for element in elements: - if element.category == "Table": - formatted_table = self.format_table(element, file_path) + for block in document.content: + if isinstance(block, TableBlock): + formatted_table = self.format_table(block, file_path) formatted_elements.append(formatted_table) else: - formatted_elements.append(element) - return formatted_elements + formatted_elements.append(block) - async def aformat_table(self, table_element: Element, file_path: str) -> Element: + document.content = formatted_elements + return document + + async def aformat_table( + self, table_element: TableBlock, file_path: str + ) -> TableBlock: """ Asynchronously formats a table element into Markdown format using a Vision Model. """ @@ -113,9 +121,9 @@ async def aformat_table(self, table_element: Element, file_path: str) -> Element table_element.text = markdown_table return table_element - def format_table(self, table_element: Element, file_path: str) -> Element: + def format_table(self, table_element: TableBlock, file_path: str) -> TableBlock: """ - Synchronously formats a table element into Markdown format using a Vision Model. + Asynchronously formats a table element into Markdown format using a Vision Model. """ table_image64 = self._crop_table_image(table_element, file_path) formatted_table = self.vision_extract(table_image64) diff --git a/libs/megaparse/src/megaparse/formatter/unstructured_formatter/__init__.py b/libs/megaparse/src/megaparse/formatter/unstructured_formatter/__init__.py deleted file mode 100644 index 4b7396e..0000000 --- a/libs/megaparse/src/megaparse/formatter/unstructured_formatter/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from typing import List - -from megaparse.formatter.base import BaseFormatter -from unstructured.documents.elements import Element - - -class UnstructuredFormatter(BaseFormatter): - async def aformat_elements( - self, elements: List[Element], file_path: str | None = None - ) -> str: - raise NotImplementedError() - - def format_elements( - self, elements: List[Element], file_path: str | None = None - ) -> str: - raise NotImplementedError() diff --git a/libs/megaparse/src/megaparse/formatter/unstructured_formatter/md_formatter.py b/libs/megaparse/src/megaparse/formatter/unstructured_formatter/md_formatter.py deleted file mode 100644 index 4149187..0000000 --- a/libs/megaparse/src/megaparse/formatter/unstructured_formatter/md_formatter.py +++ /dev/null @@ -1,64 +0,0 @@ -import warnings -from typing import List - -from megaparse.formatter.unstructured_formatter import UnstructuredFormatter -from unstructured.documents.elements import Element - - -class MarkDownFormatter(UnstructuredFormatter): - def format_elements( - self, elements: List[Element], file_path: str | None = None - ) -> str: - print("Formatting elements using MarkDownFormatter...") - markdown_content = "" - - for el in elements: - markdown_content += self.get_markdown_line(el.to_dict()) - - return markdown_content - - async def aformat_elements( - self, elements: List[Element], file_path: str | None = None - ) -> str: - warnings.warn( - "The MarkDownFormatter is a sync formatter, please use the sync format method", - UserWarning, - stacklevel=2, - ) - return self.format_elements(elements, file_path) - - def get_markdown_line(self, el: dict): - element_type = el["type"] - text = el["text"] - metadata = el["metadata"] - parent_id = metadata.get("parent_id", None) - category_depth = metadata.get("category_depth", 0) - # table_stack = [] - - if "emphasized_text_contents" in metadata: - print(metadata["emphasized_text_contents"]) - - # Markdown line defaults to empty - markdown_line = "" - - # Element type-specific markdown content - markdown_types = { - "Title": f"## {text}\n\n" if parent_id else f"# {text}\n\n", - "Subtitle": f"## {text}\n\n", - "Header": f"{'#' * (category_depth + 1)} {text}\n\n", - "Footer": f"#### {text}\n\n", - "NarrativeText": f"{text}\n\n", - "ListItem": f"- {text}\n", - "Table": f"{text}\n\n", - "PageBreak": "---\n\n", - "Image": f"![Image]({el['metadata'].get('image_path', '')})\n\n", - "Formula": f"$$ {text} $$\n\n", - "FigureCaption": f"**Figure:** {text}\n\n", - "Address": f"**Address:** {text}\n\n", - "EmailAddress": f"**Email:** {text}\n\n", - "CodeSnippet": f"```{el['metadata'].get('language', '')}\n{text}\n```\n\n", - "PageNumber": "", # Page number is not included in markdown - } - - markdown_line = markdown_types.get(element_type, f"{text}\n\n") - return markdown_line diff --git a/libs/megaparse/src/megaparse/megaparse.py b/libs/megaparse/src/megaparse/megaparse.py index c1979fe..7bb2fad 100644 --- a/libs/megaparse/src/megaparse/megaparse.py +++ b/libs/megaparse/src/megaparse/megaparse.py @@ -1,13 +1,11 @@ -import asyncio import logging -import os from pathlib import Path from typing import IO, BinaryIO, List +import warnings from megaparse_sdk.config import MegaParseConfig from megaparse_sdk.schema.extensions import FileExtension from megaparse_sdk.schema.parser_config import StrategyEnum -from unstructured.documents.elements import Element from megaparse.exceptions.base import ParsingException from megaparse.formatter.base import BaseFormatter @@ -77,15 +75,23 @@ async def aload( ) try: parsed_document = await self.parser.aconvert(file_path=file_path, file=file) - # @chloe FIXME: format_checker needs unstructured Elements as input which is to change to a megaparse element + parsed_document.file_name = str(file_path) if file_path else None if self.formatters: for formatter in self.formatters: - parsed_document = await formatter.aformat(parsed_document) + if isinstance(parsed_document, str): + warnings.warn( + f"The last step returned a string, the {formatter.__class__} and following will not be applied", + stacklevel=2, + ) + break + parsed_document = await formatter.aformat( + document=parsed_document, file_path=file_path + ) except Exception as e: raise ParsingException(f"Error while parsing {file_path}: {e}") if not isinstance(parsed_document, str): - raise ValueError("The parser or the last formatter should return a string") + return str(parsed_document) return parsed_document def load( @@ -103,18 +109,23 @@ def load( parsed_document = parser.convert( file_path=file_path, file=file, file_extension=file_extension ) - # @chloe FIXME: format_checker needs unstructured Elements as input which is to change + parsed_document.file_name = str(file_path) if file_path else None + if self.formatters: for formatter in self.formatters: + if isinstance(parsed_document, str): + warnings.warn( + f"The last step returned a string, the {formatter.__class__} and following will not be applied", + stacklevel=2, + ) + break parsed_document = formatter.format(parsed_document) # @chloe FIXME: format_checker needs unstructured Elements as input which is to change # if self.format_checker: # parsed_document: str = self.format_checker.check(parsed_document) if not isinstance(parsed_document, str): - raise ValueError( - "The parser or the last formatter should return a string" - ) + return str(parsed_document) return parsed_document except Exception as e: raise ParsingException( diff --git a/libs/megaparse/src/megaparse/models/document.py b/libs/megaparse/src/megaparse/models/document.py index f3b9830..1b45a87 100644 --- a/libs/megaparse/src/megaparse/models/document.py +++ b/libs/megaparse/src/megaparse/models/document.py @@ -1,96 +1,206 @@ -from typing import Dict, List +import uuid +from typing import Any, Dict, List, Optional, Tuple -from pydantic import BaseModel +from megaparse.predictor.models.base import BBOX +from pydantic import BaseModel, Field, field_validator + + +class Point2D(BaseModel): + """ + A class to represent a 2D point + + """ + + x: float + y: float class Block(BaseModel): """ - A class to represent a block. - Really Simplified. + A class to represent a block + """ - metadata: Dict # FIXME: To be defined as a pydantic model later @Amine - content: str + block_id: Optional[uuid.UUID] = Field(default_factory=uuid.uuid4) + metadata: Dict[str, Any] # FIXME: TBD @Amine + bbox: Optional[BBOX] = ( + None # (x0,y0),(x1, y1) Coordinates are given as Relative positions to the page they are in + ) + page_range: Optional[Tuple[int, int]] = Field(...) # (start_page, end_page) + + @field_validator("page_range") + def validate_range(cls, value): + if value is None: + return None + start, end = value + if start > end: + raise ValueError( + "The first value of the page range must be less than the second value" + ) + return value class TextBlock(Block): """ - A class to represent a text block. - Really Simplified. + A class to represent a text block + """ - pass + text: str + def __str__(self): + return self.text -class ImageBlock(Block): + +class TitleBlock(TextBlock): """ - A class to represent an image block. - Really Simplified. + A class to represent a title block + """ - pass + def __str__(self): + return f"# {self.text}" -class TitleBlock(Block): +class SubTitleBlock(TextBlock): """ - A class to represent a title block. - Really Simplified. + A class to represent a subtitle block """ - pass + depth: int + + def __str__(self): + heading_level = min(self.depth + 1, 6) + return f"{'#' * heading_level} {self.text}" -class SubTitle(Block): +class ImageBlock(Block): """ - A class to represent a subtitle block. - Really Simplified. + A class to represent an image block """ - depth: int + text: Optional[str] = None + caption: Optional[str] = "unknown" + + def __str__(self) -> str: + return f"[Image: {self.caption}]" -class TableBlock(Block): +class TableBlock(ImageBlock): """ - A class to represent a table block. - Really Simplified. + A class to represent a table block + """ - pass + def __str__(self): + return self.text if self.text else f"[Table : {self.caption}]" -class ListBlock(Block): +class ListElement(BaseModel): """ - A class to represent a list block. - Really Simplified. + A class to represent a list element + """ - pass + text: str + depth: int -class HeaderBlock(Block): +class ListBlock(TextBlock): """ - A class to represent a header block. - Really Simplified. + A class to represent a list block + """ - pass + list_elements: List[ListElement] + + # rajouter fonction pydantic pour compute l attribut + + def __str__(self): + return "\n".join( + f"{' ' * (2 * element.depth)}* {element.text}" + for element in self.list_elements + ) -class FooterBlock(Block): +class HeaderBlock(TextBlock): """ - A class to represent a footer block. - Really Simplified. + A class to represent a header block + + """ + + def __str__(self): + return f"{'='*len(self.text)}\n\n{self.text}\n\n{'='*len(self.text)}" + + +class FooterBlock(TextBlock): """ + A class to represent a footer block + + """ + + def __str__(self): + return f"{'='*len(self.text)}\n\n{self.text}\n\n{'='*len(self.text)}" + + +class TOCItem(BaseModel): + title: str + depth: int + page_range: Tuple[int, int] = Field(...) # (start_page, end_page) - pass + @field_validator("page_range") + def validate_range(cls, value): + start, end = value + if start >= end: + raise ValueError( + "The first value of the page range must be less than the second value" + ) + return value + + def __str__(self): + start_page, end_page = self.page_range + page_info = ( + f"page {start_page}" + if start_page == end_page + else f"pages {start_page}-{end_page}" + ) + return f"{' ' * (2 * self.depth)}* {self.title} ({page_info})" + + +class TOC(BaseModel): + content: List[TOCItem] + + @property + def text(self) -> str: + return "\n".join(str(item) for item in self.content) + + def __str__(self): + return self.text class Document(BaseModel): """ - A class to represent a document. - Really Simplified. + + A class to represent a document + """ - name: str - metadata: Dict # TBD @Amine + file_name: Optional[str] = None + table_of_contents: Optional[TOC] = None + metadata: Dict[str, Any] # TBD @Amine content: List[Block] + detection_origin: str + + def __str__(self) -> str: + lines = [] + + # If there's a table of contents, include it + if self.table_of_contents: + lines.append("Table of Contents:") + # Use TOCā€™s own string-building property or method + lines.append(self.table_of_contents.text) + + # Print each blockā€™s text representation + lines.extend(str(block) for block in self.content) + + return "\n".join(lines) diff --git a/libs/megaparse/src/megaparse/parser/base.py b/libs/megaparse/src/megaparse/parser/base.py index 0f6a283..8c3964d 100644 --- a/libs/megaparse/src/megaparse/parser/base.py +++ b/libs/megaparse/src/megaparse/parser/base.py @@ -1,9 +1,10 @@ from abc import ABC, abstractmethod from pathlib import Path -from typing import IO, List +from typing import IO from megaparse_sdk.schema.extensions import FileExtension -from unstructured.documents.elements import Element + +from megaparse.models.document import Document class BaseParser(ABC): @@ -16,12 +17,12 @@ def check_supported_extension( ): if not file_extension and not file_path: raise ValueError( - "Either file_path or file_extension must be provided for {self.__class__.__name__}" + f"Either file_path or file_extension must be provided for {self.__class__.__name__}" ) if file_path and not file_extension: file_path = Path(file_path) if isinstance(file_path, str) else file_path file_extension = FileExtension(file_path.suffix) - if file_extension not in self.supported_extensions: + if file_extension and file_extension not in self.supported_extensions: raise ValueError( f"Unsupported file extension {file_extension.value} for {self.__class__.__name__}" ) @@ -33,7 +34,7 @@ async def aconvert( file: IO[bytes] | None = None, file_extension: FileExtension | None = None, **kwargs, - ) -> List[Element]: + ) -> Document: """ Convert the given file to a specific format. @@ -56,7 +57,7 @@ def convert( file: IO[bytes] | None = None, file_extension: FileExtension | None = None, **kwargs, - ) -> List[Element]: + ) -> Document: """ Convert the given file to the unstructured format. diff --git a/libs/megaparse/src/megaparse/parser/doctr_parser.py b/libs/megaparse/src/megaparse/parser/doctr_parser.py index 1dbd5d9..c4bef50 100644 --- a/libs/megaparse/src/megaparse/parser/doctr_parser.py +++ b/libs/megaparse/src/megaparse/parser/doctr_parser.py @@ -8,16 +8,11 @@ from onnxtr.io import Document, DocumentFile from onnxtr.models import ocr_predictor from onnxtr.models.engine import EngineConfig -from unstructured.documents.coordinates import RelativeCoordinateSystem -from unstructured.documents.elements import ( - Element, - ElementMetadata, - Image, - PageBreak, - Text, -) +from megaparse.models.document import Document as MPDocument +from megaparse.models.document import ImageBlock, TextBlock from megaparse.parser.base import BaseParser +from megaparse.predictor.models.base import BBOX, Point2D logger = logging.getLogger("megaparse") @@ -77,7 +72,7 @@ def convert( file: IO[bytes] | BinaryIO | None = None, file_extension: None | FileExtension = None, **kwargs, - ) -> List[Element]: + ) -> MPDocument: if file: file.seek(0) pdf = file.read() @@ -92,7 +87,7 @@ def convert( # Analyze doctr_result = self.predictor(doc) - return self.__to_elements_list__(doctr_result) + return self.__to_elements_list(doctr_result) async def aconvert( self, @@ -100,7 +95,7 @@ async def aconvert( file: IO[bytes] | BinaryIO | None = None, file_extension: None | FileExtension = None, **kwargs, - ) -> List[Element]: + ) -> MPDocument: warnings.warn( "The DocTRParser is a sync parser, please use the sync convert method", UserWarning, @@ -108,10 +103,10 @@ async def aconvert( ) return self.convert(file_path, file, file_extension, **kwargs) - def __to_elements_list__(self, doctr_document: Document) -> List[Element]: + def __to_elements_list(self, doctr_document: Document) -> MPDocument: result = [] - for page in doctr_document.pages: + for page_number, page in enumerate(doctr_document.pages): for block in page.blocks: if len(block.lines) and len(block.artefacts) > 0: raise ValueError( @@ -126,36 +121,34 @@ def __to_elements_list__(self, doctr_document: Document) -> List[Element]: y1 = max(word[1][1] for word in word_coordinates) result.append( - Text( + TextBlock( text=block.render(), - coordinates=( - (x0, y0), - (x1, y0), - (x1, y1), - (x0, y1), + bbox=BBOX( + top_left=Point2D(x=x0, y=y0), + bottom_right=Point2D(x=x1, y=y1), ), - coordinate_system=RelativeCoordinateSystem(), - metadata=ElementMetadata(), - detection_origin="doctr", + metadata={}, + page_range=(page_number, page_number), ) ) for artefact in block.artefacts: result.append( - Image( - text="", - coordinates=( - (artefact.geometry[0][0], artefact.geometry[0][1]), - (artefact.geometry[1][0], artefact.geometry[0][1]), - (artefact.geometry[1][0], artefact.geometry[1][1]), - (artefact.geometry[0][0], artefact.geometry[1][1]), + ImageBlock( + bbox=BBOX( + top_left=Point2D( + x=artefact.geometry[0][0], y=artefact.geometry[0][1] + ), + bottom_right=Point2D( + x=artefact.geometry[1][0], y=artefact.geometry[1][1] + ), ), - coordinate_system=RelativeCoordinateSystem(), - metadata=ElementMetadata(), - detection_origin="doctr", + metadata={}, + page_range=(page_number, page_number), ) ) - - result.append(PageBreak(text="")) - - return result + return MPDocument( + metadata={}, + content=result, + detection_origin="doctr", + ) diff --git a/libs/megaparse/src/megaparse/parser/llama.py b/libs/megaparse/src/megaparse/parser/llama.py index 695ed6a..40321ea 100644 --- a/libs/megaparse/src/megaparse/parser/llama.py +++ b/libs/megaparse/src/megaparse/parser/llama.py @@ -5,12 +5,11 @@ from llama_parse import LlamaParse as _LlamaParse from llama_parse.utils import Language, ResultType from megaparse_sdk.schema.extensions import FileExtension -from unstructured.documents.elements import ( - Element, - Text, -) +from megaparse.models.document import Document as MPDocument +from megaparse.models.document import TextBlock from megaparse.parser import BaseParser +from megaparse.predictor.models.base import BBOX, Point2D class LlamaParser(BaseParser): @@ -39,7 +38,7 @@ async def aconvert( file: IO[bytes] | None = None, file_extension: None | FileExtension = None, **kwargs, - ) -> List[Element]: + ) -> MPDocument: if not file_path: raise ValueError("File_path should be provided to run LlamaParser") self.check_supported_extension(file_extension, file_path) @@ -54,12 +53,8 @@ async def aconvert( ) documents: List[LlamaDocument] = await llama_parser.aload_data(str(file_path)) - parsed_md = "" - for document in documents: - text_content = document.text - parsed_md = parsed_md + text_content - return self.__to_elements_list__(parsed_md) + return self.__to_elements_list__(documents) def convert( self, @@ -67,7 +62,7 @@ def convert( file: IO[bytes] | None = None, file_extension: None | FileExtension = None, **kwargs, - ) -> List[Element]: + ) -> MPDocument: if not file_path: raise ValueError("File_path should be provided to run LlamaParser") self.check_supported_extension(file_extension, file_path) @@ -82,12 +77,24 @@ def convert( ) documents: List[LlamaDocument] = llama_parser.load_data(str(file_path)) - parsed_md = "" - for document in documents: - text_content = document.text - parsed_md = parsed_md + text_content - return self.__to_elements_list__(parsed_md) + return self.__to_elements_list__(documents) - def __to_elements_list__(self, llama_doc: str) -> List[Element]: - return [Text(text=llama_doc)] + def __to_elements_list__(self, llama_doc: List[LlamaDocument]) -> MPDocument: + list_blocks = [] + for i, page in enumerate(llama_doc): + list_blocks.append( + TextBlock( + text=page.text, + metadata={}, + page_range=(i, i + 1), + bbox=BBOX( + top_left=Point2D(x=0, y=0), bottom_right=Point2D(x=1, y=1) + ), + ) + ) + return MPDocument( + metadata={}, + detection_origin="llamaparse", + content=list_blocks, + ) diff --git a/libs/megaparse/src/megaparse/parser/megaparse_vision.py b/libs/megaparse/src/megaparse/parser/megaparse_vision.py index 3516870..39490ff 100644 --- a/libs/megaparse/src/megaparse/parser/megaparse_vision.py +++ b/libs/megaparse/src/megaparse/parser/megaparse_vision.py @@ -9,10 +9,12 @@ from langchain_core.messages import HumanMessage from megaparse_sdk.schema.extensions import FileExtension from pdf2image import convert_from_path -from unstructured.documents.elements import Element, Text +from megaparse.models.document import Block, TextBlock +from megaparse.models.document import Document as MPDocument from megaparse.parser import BaseParser from megaparse.parser.entity import SupportedModel, TagEnum +from megaparse.predictor.models.base import BBOX, Point2D # BASE_OCR_PROMPT = """ # Transcribe the content of this file into markdown. Be mindful of the formatting. @@ -148,7 +150,7 @@ async def aconvert( file_extension: FileExtension | None = None, batch_size: int = 3, **kwargs, - ) -> List[Element]: + ) -> MPDocument: """ Parse a PDF file and process its content using the language model. @@ -165,13 +167,14 @@ async def aconvert( self.check_supported_extension(file_extension, file_path) pdf_base64 = self.process_file(file_path) + n_pages = len(pdf_base64) tasks = [ self.asend_to_mlm(pdf_base64[i : i + batch_size]) for i in range(0, len(pdf_base64), batch_size) ] self.parsed_chunks = await asyncio.gather(*tasks) responses = self.get_cleaned_content("\n".join(self.parsed_chunks)) - return self.__to_elements_list__(responses) + return self.__to_elements_list__(responses, n_pages=n_pages) def convert( self, @@ -180,7 +183,7 @@ def convert( file_extension: FileExtension | None = None, batch_size: int = 3, **kwargs, - ) -> List[Element]: + ) -> MPDocument: """ Parse a PDF file and process its content using the language model. @@ -197,6 +200,7 @@ def convert( self.check_supported_extension(file_extension, file_path) pdf_base64 = self.process_file(file_path) + n_pages = len(pdf_base64) chunks = [ pdf_base64[i : i + batch_size] for i in range(0, len(pdf_base64), batch_size) @@ -206,7 +210,7 @@ def convert( response = self.send_to_mlm(chunk) self.parsed_chunks.append(response) responses = self.get_cleaned_content("\n".join(self.parsed_chunks)) - return self.__to_elements_list__(responses) + return self.__to_elements_list__(responses, n_pages) def get_cleaned_content(self, parsed_file: str) -> str: """ @@ -247,5 +251,17 @@ def remove_tag(match): return cleaned_content - def __to_elements_list__(self, mpv_doc: str) -> List[Element]: - return [Text(text=mpv_doc)] + def __to_elements_list__(self, mpv_doc: str, n_pages: int) -> MPDocument: + list_blocks: List[Block] = [ + TextBlock( + text=mpv_doc, + metadata={}, + page_range=(0, n_pages - 1), + bbox=BBOX(top_left=Point2D(x=0, y=0), bottom_right=Point2D(x=1, y=1)), + ) + ] + return MPDocument( + metadata={}, + detection_origin="megaparse_vision", + content=list_blocks, + ) diff --git a/libs/megaparse/src/megaparse/parser/unstructured_parser.py b/libs/megaparse/src/megaparse/parser/unstructured_parser.py index cc3f815..d6b8317 100644 --- a/libs/megaparse/src/megaparse/parser/unstructured_parser.py +++ b/libs/megaparse/src/megaparse/parser/unstructured_parser.py @@ -1,17 +1,29 @@ -import re import warnings from pathlib import Path -from typing import IO, List +from typing import IO, Dict, List from dotenv import load_dotenv from langchain_core.language_models.chat_models import BaseChatModel -from langchain_core.prompts import ChatPromptTemplate from megaparse_sdk.schema.extensions import FileExtension from megaparse_sdk.schema.parser_config import StrategyEnum from unstructured.documents.elements import Element from unstructured.partition.auto import partition +from megaparse.models.document import ( + Block, + FooterBlock, + HeaderBlock, + ImageBlock, + SubTitleBlock, + TableBlock, + TextBlock, + TitleBlock, +) +from megaparse.models.document import ( + Document as MPDocument, +) from megaparse.parser import BaseParser +from megaparse.predictor.models.base import BBOX, Point2D class UnstructuredParser(BaseParser): @@ -44,7 +56,8 @@ def convert( file: IO[bytes] | None = None, file_extension: FileExtension | None = None, **kwargs, - ) -> List[Element]: + ) -> MPDocument: + self.check_supported_extension(file_extension, file_path) # Partition the PDF elements = partition( filename=str(file_path) if file_path else None, @@ -52,7 +65,7 @@ def convert( strategy=self.strategy, content_type=file_extension.mimetype if file_extension else None, ) - return elements + return self.__to_mp_document(elements) async def aconvert( self, @@ -60,7 +73,7 @@ async def aconvert( file: IO[bytes] | None = None, file_extension: FileExtension | None = None, **kwargs, - ) -> List[Element]: + ) -> MPDocument: self.check_supported_extension(file_extension, file_path) warnings.warn( "The UnstructuredParser is a sync parser, please use the sync convert method", @@ -68,3 +81,271 @@ async def aconvert( stacklevel=2, ) return self.convert(file_path, file, file_extension, **kwargs) + + def __to_mp_document(self, elements: List[Element]) -> MPDocument: + text_blocks = [] + for element in elements: + text_blocks.append(self.__convert_element_to_block(element)) + return MPDocument( + content=text_blocks, metadata={}, detection_origin="unstructured" + ) + + def __convert_element_to_block(self, element: Element) -> Block | None: + element_type = element.category + text = element.text + metadata = element.metadata + category_depth = metadata.category_depth + + # Element type-specific markdown content + markdown_types: Dict[str, Block] = { + "Title": TitleBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "Subtitle": SubTitleBlock( + text=text, + depth=category_depth if category_depth else 0, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "Header": HeaderBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "Footer": FooterBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "NarrativeText": TextBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "ListItem": TextBlock( # FIXME: @chloedia, list item need to be handled differently in ListBlock + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "Table": TableBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "Image": ImageBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "Formula": TextBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "FigureCaption": TextBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "Address": TextBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "EmailAddress": TextBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + "CodeSnippet": TextBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), + } + + return markdown_types.get(element_type, None) diff --git a/libs/megaparse/tests/test_parsers.py b/libs/megaparse/tests/test_parsers.py index ae081dd..40e772a 100644 --- a/libs/megaparse/tests/test_parsers.py +++ b/libs/megaparse/tests/test_parsers.py @@ -34,7 +34,7 @@ def test_sync_parser(parser, extension): response = myparser.convert(file_path) assert response - assert len(response) > 0 + assert len(str(response)) > 0 else: with pytest.raises(ValueError): myparser.convert(file_path) From 50f4bb67d4d59ab49d7c2944ec45a6e27b41c3aa Mon Sep 17 00:00:00 2001 From: chloedia Date: Tue, 7 Jan 2025 20:07:29 +0100 Subject: [PATCH 07/17] fix: test --- .../src/megaparse/examples/parse_file.py | 13 ++++++++++++- libs/megaparse/src/megaparse/megaparse.py | 5 ++++- .../src/megaparse/models/document.py | 4 +++- .../megaparse/parser/unstructured_parser.py | 4 +++- libs/megaparse/tests/conftest.py | 19 +++++++++++++++---- 5 files changed, 37 insertions(+), 8 deletions(-) diff --git a/libs/megaparse/src/megaparse/examples/parse_file.py b/libs/megaparse/src/megaparse/examples/parse_file.py index 5a37d29..46cd105 100644 --- a/libs/megaparse/src/megaparse/examples/parse_file.py +++ b/libs/megaparse/src/megaparse/examples/parse_file.py @@ -11,6 +11,7 @@ from megaparse.megaparse import MegaParse from megaparse.parser.doctr_parser import DoctrParser from megaparse.parser.unstructured_parser import UnstructuredParser +from megaparse_sdk.schema.extensions import FileExtension from pydantic import BaseModel, Field @@ -33,5 +34,15 @@ def main(): print(result) +async def test(): + processor = MegaParse() + pdf = "./tests/pdf/sample_pdf.pdf" + + with open(pdf, "rb") as f: + result = await processor.aload(file=f, file_extension=FileExtension.PDF) + assert len(str(result)) > 0 + + if __name__ == "__main__": - main() + # main() + asyncio.run(test()) diff --git a/libs/megaparse/src/megaparse/megaparse.py b/libs/megaparse/src/megaparse/megaparse.py index 7bb2fad..29e3142 100644 --- a/libs/megaparse/src/megaparse/megaparse.py +++ b/libs/megaparse/src/megaparse/megaparse.py @@ -73,8 +73,11 @@ async def aload( file_extension = self.validate_input( file=file, file_path=file_path, file_extension=file_extension ) + try: - parsed_document = await self.parser.aconvert(file_path=file_path, file=file) + parsed_document = await self.parser.aconvert( + file_path=file_path, file=file, file_extension=file_extension + ) parsed_document.file_name = str(file_path) if file_path else None if self.formatters: for formatter in self.formatters: diff --git a/libs/megaparse/src/megaparse/models/document.py b/libs/megaparse/src/megaparse/models/document.py index 1b45a87..85b395e 100644 --- a/libs/megaparse/src/megaparse/models/document.py +++ b/libs/megaparse/src/megaparse/models/document.py @@ -26,7 +26,9 @@ class Block(BaseModel): bbox: Optional[BBOX] = ( None # (x0,y0),(x1, y1) Coordinates are given as Relative positions to the page they are in ) - page_range: Optional[Tuple[int, int]] = Field(...) # (start_page, end_page) + page_range: Optional[Tuple[int, int]] = Field( + default=None + ) # (start_page, end_page) @field_validator("page_range") def validate_range(cls, value): diff --git a/libs/megaparse/src/megaparse/parser/unstructured_parser.py b/libs/megaparse/src/megaparse/parser/unstructured_parser.py index d6b8317..0730a9c 100644 --- a/libs/megaparse/src/megaparse/parser/unstructured_parser.py +++ b/libs/megaparse/src/megaparse/parser/unstructured_parser.py @@ -85,7 +85,9 @@ async def aconvert( def __to_mp_document(self, elements: List[Element]) -> MPDocument: text_blocks = [] for element in elements: - text_blocks.append(self.__convert_element_to_block(element)) + block = self.__convert_element_to_block(element) + if block: + text_blocks.append(block) return MPDocument( content=text_blocks, metadata={}, detection_origin="unstructured" ) diff --git a/libs/megaparse/tests/conftest.py b/libs/megaparse/tests/conftest.py index e898f81..41eceda 100644 --- a/libs/megaparse/tests/conftest.py +++ b/libs/megaparse/tests/conftest.py @@ -8,6 +8,7 @@ from megaparse.api.app import app, get_playwright_loader, parser_builder_dep from megaparse.parser.base import BaseParser from megaparse_sdk.schema.extensions import FileExtension +from megaparse.models.document import Document as MPDocument, TextBlock class FakeParserBuilder: @@ -29,9 +30,14 @@ def convert( file: IO[bytes] | None = None, file_extension: None | FileExtension = None, **kwargs, - ) -> str: + ) -> MPDocument: print("Fake parser is converting the file") - return "Fake conversion result" + return MPDocument( + file_name="Fake file", + content=[TextBlock(text="Fake conversion result", metadata={})], + metadata={}, + detection_origin="fakeparser", + ) async def aconvert( self, @@ -39,9 +45,14 @@ async def aconvert( file: IO[bytes] | None = None, file_extension: None | FileExtension = None, **kwargs, - ) -> str: + ) -> MPDocument: print("Fake parser is converting the file") - return "Fake conversion result" + return MPDocument( + file_name="Fake file", + content=[TextBlock(text="Fake conversion result", metadata={})], + metadata={}, + detection_origin="fakeparser", + ) return FakeParser() From 01cab33edbdb32cfe28effcb46c94986044d9086 Mon Sep 17 00:00:00 2001 From: chloedia Date: Wed, 8 Jan 2025 10:42:27 +0100 Subject: [PATCH 08/17] fix: add uncategorized text handling --- .../megaparse/parser/unstructured_parser.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/libs/megaparse/src/megaparse/parser/unstructured_parser.py b/libs/megaparse/src/megaparse/parser/unstructured_parser.py index 0730a9c..294f386 100644 --- a/libs/megaparse/src/megaparse/parser/unstructured_parser.py +++ b/libs/megaparse/src/megaparse/parser/unstructured_parser.py @@ -348,6 +348,24 @@ def __convert_element_to_block(self, element: Element) -> Block | None: if metadata.coordinates and metadata.coordinates.points else None, ), + "UncategorizedText": TextBlock( + text=text, + metadata={}, + page_range=(metadata.page_number, metadata.page_number) + if metadata.page_number + else None, + bbox=BBOX( + top_left=Point2D( + x=metadata.coordinates.points[0][0], + y=metadata.coordinates.points[0][1], + ), + bottom_right=Point2D( + x=metadata.coordinates.points[3][0], + y=metadata.coordinates.points[3][1], + ), + ) + if metadata.coordinates and metadata.coordinates.points + else None, + ), } - return markdown_types.get(element_type, None) From 04a858f3616cfe2d584311650a5d69b17c8e988f Mon Sep 17 00:00:00 2001 From: chloedia Date: Wed, 8 Jan 2025 12:17:52 +0100 Subject: [PATCH 09/17] add: skip on flaky pdf --- libs/megaparse/tests/pdf/test_detect_ocr.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libs/megaparse/tests/pdf/test_detect_ocr.py b/libs/megaparse/tests/pdf/test_detect_ocr.py index 6b6c57d..4373a7e 100644 --- a/libs/megaparse/tests/pdf/test_detect_ocr.py +++ b/libs/megaparse/tests/pdf/test_detect_ocr.py @@ -12,6 +12,9 @@ @pytest.mark.parametrize("hi_res_pdf", ocr_pdfs) def test_hi_res_strategy(hi_res_pdf): + if hi_res_pdf == "0168004.pdf": + pytest.skip("Skip 0168004.pdf as it is flaky currently") + strategy = strategy_handler.determine_strategy( f"./tests/pdf/ocr/{hi_res_pdf}", ) From 2dcd952f01886ccebf3137dd8cbe976a2497a268 Mon Sep 17 00:00:00 2001 From: chloedia Date: Wed, 8 Jan 2025 15:44:12 +0100 Subject: [PATCH 10/17] add: section block --- libs/megaparse/src/megaparse/models/document.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/libs/megaparse/src/megaparse/models/document.py b/libs/megaparse/src/megaparse/models/document.py index 85b395e..6d382be 100644 --- a/libs/megaparse/src/megaparse/models/document.py +++ b/libs/megaparse/src/megaparse/models/document.py @@ -145,6 +145,22 @@ def __str__(self): return f"{'='*len(self.text)}\n\n{self.text}\n\n{'='*len(self.text)}" +class SectionBlock(Block): + """ + A class to represent a section block + + """ + + title: str + depth: int + content: List[Block] + + def __str__(self): + lines = [] + lines.extend(str(block) for block in self.content) + return "\n".join(lines) + + class TOCItem(BaseModel): title: str depth: int From 790bba303ec9818ce9ecaacca23bcab3fb187dc4 Mon Sep 17 00:00:00 2001 From: chloedia Date: Wed, 8 Jan 2025 17:32:59 +0100 Subject: [PATCH 11/17] fix: change load logic & reate page element --- libs/megaparse/src/megaparse/megaparse.py | 60 ++++++++++++------- libs/megaparse/src/megaparse/models/page.py | 29 +++++++++ .../src/megaparse/parser/strategy.py | 44 ++++++++------ 3 files changed, 94 insertions(+), 39 deletions(-) create mode 100644 libs/megaparse/src/megaparse/models/page.py diff --git a/libs/megaparse/src/megaparse/megaparse.py b/libs/megaparse/src/megaparse/megaparse.py index 9dfa1fb..9c2e19f 100644 --- a/libs/megaparse/src/megaparse/megaparse.py +++ b/libs/megaparse/src/megaparse/megaparse.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import IO, BinaryIO, List +from megaparse.models.page import Page from megaparse_sdk.schema.extensions import FileExtension from megaparse_sdk.schema.parser_config import StrategyEnum @@ -88,11 +89,15 @@ async def aload( file_extension = self.validate_input( file=file, file_path=file_path, file_extension=file_extension ) + opened_file = None # FIXM: Not sure of this method try: - parser = self._select_parser(file_path, file, file_extension) + if file_path: + opended_file = open(file_path, "rb") + file = opended_file + parser = self._select_parser(file, file_extension) logger.info(f"Parsing using {parser.__class__.__name__} parser.") parsed_document = await parser.aconvert( - file_path=file_path, file=file, file_extension=file_extension + file=file, file_extension=file_extension ) parsed_document.file_name = str(file_path) if file_path else None @@ -116,6 +121,9 @@ async def aload( raise ParsingException( f"Error while parsing file {file_path or file}, file_extension: {file_extension}: {e}" ) + finally: + if opened_file: + opened_file.close() def load( self, @@ -126,12 +134,21 @@ def load( file_extension = self.validate_input( file=file, file_path=file_path, file_extension=file_extension ) + opened_file = None # FIXM: Not sure of this method try: - parser = self._select_parser(file_path, file, file_extension) - logger.info(f"Parsing using {parser.__class__.__name__} parser.") - parsed_document = parser.convert( - file_path=file_path, file=file, file_extension=file_extension + if file_path: + opended_file = open(file_path, "rb") + file = opended_file + + assert file is not None, "No File provided" + # First parse the file in with fast and get text detections + pages = self.strategy_handler.determine_strategy( + file=file, ) + parser = self._select_parser(pages=pages, file_extension=file_extension) + + logger.info(f"Parsing using {parser.__class__.__name__} parser.") + parsed_document = parser.convert(file=file, file_extension=file_extension) parsed_document.file_name = str(file_path) if file_path else None if self.formatters: @@ -144,9 +161,6 @@ def load( break parsed_document = formatter.format(parsed_document) - # @chloe FIXME: format_checker needs unstructured Elements as input which is to change - # if self.format_checker: - # parsed_document: str = self.format_checker.check(parsed_document) if not isinstance(parsed_document, str): return str(parsed_document) return parsed_document @@ -154,23 +168,29 @@ def load( raise ParsingException( f"Error while parsing file {file_path or file}, file_extension: {file_extension}: {e}" ) + finally: + if opened_file: + opened_file.close() def _select_parser( self, - file_path: Path | str | None = None, - file: BinaryIO | None = None, + pages: List[Page], file_extension: str | FileExtension = "", ) -> BaseParser: - local_strategy = None - if self.strategy != StrategyEnum.AUTO or file_extension != FileExtension.PDF: + if file_extension != FileExtension.PDF or self.strategy == StrategyEnum.FAST: return self.parser - if file: - local_strategy = self.strategy_handler.determine_strategy( - file=file, # type: ignore #FIXME: Careful here on removing BinaryIO (not handled by onnxtr) - ) - if file_path: - local_strategy = self.strategy_handler.determine_strategy(file=file_path) + if self.strategy == StrategyEnum.HI_RES: + return self.ocr_parser + + need_ocr = 0 + for page in pages: + if page.strategy == StrategyEnum.HI_RES: + need_ocr += 1 + + doc_need_ocr = ( + need_ocr / len(pages) + ) > self.config.auto_parse_config.auto_document_threshold - if local_strategy == StrategyEnum.HI_RES: + if doc_need_ocr: return self.ocr_parser return self.parser diff --git a/libs/megaparse/src/megaparse/models/page.py b/libs/megaparse/src/megaparse/models/page.py new file mode 100644 index 0000000..337fa98 --- /dev/null +++ b/libs/megaparse/src/megaparse/models/page.py @@ -0,0 +1,29 @@ +from typing import List + +from megaparse.predictor.models.base import PageLayout +from megaparse_sdk.schema.parser_config import StrategyEnum +from numpy.typing import NDArray +from pydantic import BaseModel +from pypdfium2._helpers.page import PdfPage + + +class PageDimension(BaseModel): + """ + A class to represent a page dimension + """ + + width: int + height: int + + +class Page(BaseModel): + """ + A class to represent a page + """ + + strategy: StrategyEnum + text_detections: PageLayout + rasterized: NDArray + page_size: PageDimension + page_index: int + pdfium_elements: PdfPage diff --git a/libs/megaparse/src/megaparse/parser/strategy.py b/libs/megaparse/src/megaparse/parser/strategy.py index 780c634..d9ebea3 100644 --- a/libs/megaparse/src/megaparse/parser/strategy.py +++ b/libs/megaparse/src/megaparse/parser/strategy.py @@ -2,7 +2,7 @@ import random import warnings from pathlib import Path -from typing import Any, List +from typing import BinaryIO, List, Tuple import numpy as np import onnxruntime as rt @@ -14,6 +14,7 @@ from pypdfium2._helpers.page import PdfPage from megaparse.configs.auto import AutoStrategyConfig, DeviceEnum, TextDetConfig +from megaparse.models.page import Page, PageDimension from megaparse.predictor.doctr_layout_detector import LayoutPredictor from megaparse.predictor.models.base import PageLayout @@ -129,39 +130,44 @@ def get_strategy_page( def determine_strategy( self, - file: str - | Path - | bytes, # FIXME : Careful here on removing BinaryIO (not handled by onnxtr) + file: BinaryIO | Path | bytes, max_samples: int = 5, - ) -> StrategyEnum: + ) -> List[Page]: + if isinstance(file, BinaryIO): + file = file.read() # onnxtr expects a file as AbstractPath or bytes logger.info("Determining strategy...") - need_ocr = 0 onnxtr_document = DocumentFile.from_pdf(file) layout_predictor = LayoutPredictor(self.det_predictor) pdfium_document = pdfium.PdfDocument(file) - if len(pdfium_document) > max_samples: - sample_pages_index = random.sample(range(len(onnxtr_document)), max_samples) - onnxtr_document = [onnxtr_document[i] for i in sample_pages_index] - pdfium_document = [pdfium_document[i] for i in sample_pages_index] + # if len(pdfium_document) > max_samples: + # sample_pages_index = random.sample(range(len(onnxtr_document)), max_samples) + # onnxtr_document = [onnxtr_document[i] for i in sample_pages_index] + # pdfium_document = [pdfium_document[i] for i in sample_pages_index] onnxtr_document_layout = layout_predictor(onnxtr_document) + mp_pages: List[Page] = [] + for pdfium_page, onnxtr_page in zip( pdfium_document, onnxtr_document_layout, strict=True ): strategy = self.get_strategy_page(pdfium_page, onnxtr_page) - need_ocr += strategy == StrategyEnum.HI_RES + mp_pages.append( + Page( + strategy=strategy, + text_detections=onnxtr_page, + rasterized=pdfium_page.render(), # FIXME check + page_size=PageDimension( + width=pdfium_page.get_width(), height=pdfium_page.get_height() + ), + page_index=onnxtr_page.page_index, + pdfium_elements=pdfium_page, + ) + ) - doc_need_ocr = ( - need_ocr / len(pdfium_document) - ) > self.config.auto_document_threshold if isinstance(pdfium_document, pdfium.PdfDocument): pdfium_document.close() - if doc_need_ocr: - logger.info("Using HI_RES strategy") - return StrategyEnum.HI_RES - logger.info("Using FAST strategy") - return StrategyEnum.FAST + return mp_pages From 79354e4f1d1820b233d648e641690bdb5c9b80e0 Mon Sep 17 00:00:00 2001 From: chloedia Date: Thu, 9 Jan 2025 18:59:52 +0100 Subject: [PATCH 12/17] fix: add pages --- .../src/megaparse/examples/parse_file.py | 4 +-- libs/megaparse/src/megaparse/megaparse.py | 26 ++++++++----------- libs/megaparse/src/megaparse/models/page.py | 13 ++++++---- .../src/megaparse/parser/strategy.py | 23 ++++++++++++++-- .../src/megaparse/utils/strategy_utils.py | 16 ++++++++++++ libs/megaparse/tests/pdf/test_detect_ocr.py | 19 ++++++++------ .../tests/pdf/test_pdf_processing.py | 23 +++++++++------- 7 files changed, 83 insertions(+), 41 deletions(-) create mode 100644 libs/megaparse/src/megaparse/utils/strategy_utils.py diff --git a/libs/megaparse/src/megaparse/examples/parse_file.py b/libs/megaparse/src/megaparse/examples/parse_file.py index 1dea322..248ad7c 100644 --- a/libs/megaparse/src/megaparse/examples/parse_file.py +++ b/libs/megaparse/src/megaparse/examples/parse_file.py @@ -26,9 +26,9 @@ async def main(): # Parse a file parser = DoctrParser() model = ChatOpenAI(name="gpt-4o") - formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat) + # formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat) - megaparse = MegaParse(ocr_parser=parser, formatters=[formatter_1]) + megaparse = MegaParse(ocr_parser=parser) file_path = Path("./tests/pdf/sample_pdf.pdf") result = await megaparse.aload(file_path=file_path) diff --git a/libs/megaparse/src/megaparse/megaparse.py b/libs/megaparse/src/megaparse/megaparse.py index 9c2e19f..03f1b06 100644 --- a/libs/megaparse/src/megaparse/megaparse.py +++ b/libs/megaparse/src/megaparse/megaparse.py @@ -3,17 +3,18 @@ from pathlib import Path from typing import IO, BinaryIO, List -from megaparse.models.page import Page from megaparse_sdk.schema.extensions import FileExtension from megaparse_sdk.schema.parser_config import StrategyEnum from megaparse.configs.auto import DeviceEnum, MegaParseConfig from megaparse.exceptions.base import ParsingException from megaparse.formatter.base import BaseFormatter +from megaparse.models.page import Page from megaparse.parser.base import BaseParser from megaparse.parser.doctr_parser import DoctrParser from megaparse.parser.strategy import StrategyHandler from megaparse.parser.unstructured_parser import UnstructuredParser +from megaparse.utils.strategy_utils import need_hi_res logger = logging.getLogger("megaparse") @@ -94,7 +95,14 @@ async def aload( if file_path: opended_file = open(file_path, "rb") file = opended_file - parser = self._select_parser(file, file_extension) + + assert file is not None, "No File provided" + # First parse the file in with fast and get text detections + pages = self.strategy_handler.determine_strategy( + file=file, strategy=self.strategy + ) + parser = self._select_parser(pages=pages, file_extension=file_extension) + logger.info(f"Parsing using {parser.__class__.__name__} parser.") parsed_document = await parser.aconvert( file=file, file_extension=file_extension @@ -111,9 +119,6 @@ async def aload( break parsed_document = await formatter.aformat(parsed_document) - # @chloe FIXME: format_checker needs unstructured Elements as input which is to change - # if self.format_checker: - # parsed_document: str = self.format_checker.check(parsed_document) if not isinstance(parsed_document, str): return str(parsed_document) return parsed_document @@ -182,15 +187,6 @@ def _select_parser( if self.strategy == StrategyEnum.HI_RES: return self.ocr_parser - need_ocr = 0 - for page in pages: - if page.strategy == StrategyEnum.HI_RES: - need_ocr += 1 - - doc_need_ocr = ( - need_ocr / len(pages) - ) > self.config.auto_parse_config.auto_document_threshold - - if doc_need_ocr: + if need_hi_res(pages, self.config.auto_parse_config): return self.ocr_parser return self.parser diff --git a/libs/megaparse/src/megaparse/models/page.py b/libs/megaparse/src/megaparse/models/page.py index 337fa98..eb9c1f1 100644 --- a/libs/megaparse/src/megaparse/models/page.py +++ b/libs/megaparse/src/megaparse/models/page.py @@ -3,8 +3,9 @@ from megaparse.predictor.models.base import PageLayout from megaparse_sdk.schema.parser_config import StrategyEnum from numpy.typing import NDArray -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from pypdfium2._helpers.page import PdfPage +from PIL.Image import Image as PILImage class PageDimension(BaseModel): @@ -12,8 +13,8 @@ class PageDimension(BaseModel): A class to represent a page dimension """ - width: int - height: int + width: float + height: float class Page(BaseModel): @@ -22,8 +23,10 @@ class Page(BaseModel): """ strategy: StrategyEnum - text_detections: PageLayout - rasterized: NDArray + text_detections: PageLayout | None = None + rasterized: PILImage page_size: PageDimension page_index: int pdfium_elements: PdfPage + + model_config = ConfigDict(arbitrary_types_allowed=True) diff --git a/libs/megaparse/src/megaparse/parser/strategy.py b/libs/megaparse/src/megaparse/parser/strategy.py index d9ebea3..1f9f20e 100644 --- a/libs/megaparse/src/megaparse/parser/strategy.py +++ b/libs/megaparse/src/megaparse/parser/strategy.py @@ -132,14 +132,33 @@ def determine_strategy( self, file: BinaryIO | Path | bytes, max_samples: int = 5, + strategy: StrategyEnum = StrategyEnum.AUTO, ) -> List[Page]: if isinstance(file, BinaryIO): file = file.read() # onnxtr expects a file as AbstractPath or bytes logger.info("Determining strategy...") + pdfium_document = pdfium.PdfDocument(file) + + if strategy == StrategyEnum.FAST: + mp_pages = [] + for i, pdfium_page in enumerate(pdfium_document): + mp_pages.append( + Page( + strategy=strategy, + text_detections=None, + rasterized=pdfium_page.render().to_pil(), + page_size=PageDimension( + width=pdfium_page.get_width(), + height=pdfium_page.get_height(), + ), + page_index=i, + pdfium_elements=pdfium_page, + ) + ) + return mp_pages onnxtr_document = DocumentFile.from_pdf(file) layout_predictor = LayoutPredictor(self.det_predictor) - pdfium_document = pdfium.PdfDocument(file) # if len(pdfium_document) > max_samples: # sample_pages_index = random.sample(range(len(onnxtr_document)), max_samples) @@ -158,7 +177,7 @@ def determine_strategy( Page( strategy=strategy, text_detections=onnxtr_page, - rasterized=pdfium_page.render(), # FIXME check + rasterized=pdfium_page.render().to_pil(), # FIXME check page_size=PageDimension( width=pdfium_page.get_width(), height=pdfium_page.get_height() ), diff --git a/libs/megaparse/src/megaparse/utils/strategy_utils.py b/libs/megaparse/src/megaparse/utils/strategy_utils.py new file mode 100644 index 0000000..0204562 --- /dev/null +++ b/libs/megaparse/src/megaparse/utils/strategy_utils.py @@ -0,0 +1,16 @@ +from typing import List + +from megaparse.configs.auto import AutoStrategyConfig +from megaparse.models.page import Page +from megaparse_sdk.schema.parser_config import StrategyEnum + + +def need_hi_res( + pages: List[Page], auto_config: AutoStrategyConfig = AutoStrategyConfig() +) -> bool: + need_ocr = 0 + for page in pages: + if page.strategy == StrategyEnum.HI_RES: + need_ocr += 1 + + return (need_ocr / len(pages)) > auto_config.auto_document_threshold diff --git a/libs/megaparse/tests/pdf/test_detect_ocr.py b/libs/megaparse/tests/pdf/test_detect_ocr.py index 4373a7e..e0d1f77 100644 --- a/libs/megaparse/tests/pdf/test_detect_ocr.py +++ b/libs/megaparse/tests/pdf/test_detect_ocr.py @@ -1,5 +1,6 @@ import os +from megaparse.utils.strategy_utils import need_hi_res import pytest from megaparse.parser.strategy import StrategyHandler from megaparse_sdk.schema.parser_config import StrategyEnum @@ -15,15 +16,17 @@ def test_hi_res_strategy(hi_res_pdf): if hi_res_pdf == "0168004.pdf": pytest.skip("Skip 0168004.pdf as it is flaky currently") - strategy = strategy_handler.determine_strategy( - f"./tests/pdf/ocr/{hi_res_pdf}", - ) - assert strategy == StrategyEnum.HI_RES + with open(f"./tests/pdf/ocr/{hi_res_pdf}", "rb") as f: + pages = strategy_handler.determine_strategy( + f, + ) + assert need_hi_res(pages) @pytest.mark.parametrize("native_pdf", native_pdfs) def test_fast_strategy(native_pdf): - strategy = strategy_handler.determine_strategy( - f"./tests/pdf/native/{native_pdf}", - ) - assert strategy == StrategyEnum.FAST + with open(f"./tests/pdf/native/{native_pdf}", "rb") as f: + pages = strategy_handler.determine_strategy( + f, + ) + assert not need_hi_res(pages) diff --git a/libs/megaparse/tests/pdf/test_pdf_processing.py b/libs/megaparse/tests/pdf/test_pdf_processing.py index 2b85d2c..3fd5e96 100644 --- a/libs/megaparse/tests/pdf/test_pdf_processing.py +++ b/libs/megaparse/tests/pdf/test_pdf_processing.py @@ -4,6 +4,7 @@ from megaparse.megaparse import MegaParse from megaparse.parser.strategy import StrategyHandler from megaparse.parser.unstructured_parser import UnstructuredParser +from megaparse.utils.strategy_utils import need_hi_res from megaparse_sdk.schema.extensions import FileExtension from megaparse_sdk.schema.parser_config import StrategyEnum @@ -55,12 +56,16 @@ async def test_megaparse_pdf_processor_file(pdf_name, request): def test_strategy(scanned_pdf, native_pdf): - strategy = strategy_handler.determine_strategy( - scanned_pdf, - ) - assert strategy == StrategyEnum.HI_RES - - strategy = strategy_handler.determine_strategy( - native_pdf, - ) - assert strategy == StrategyEnum.FAST + with open(native_pdf, "rb") as f: + native_pages = strategy_handler.determine_strategy( + f, + ) + result = need_hi_res(native_pages) + assert not result + + with open(scanned_pdf, "rb") as f: + scanned_pages = strategy_handler.determine_strategy( + f, + ) + result = need_hi_res(scanned_pages) + assert result From adc69b1bc349ac5934675e552973786d5aaa76f9 Mon Sep 17 00:00:00 2001 From: chloedia Date: Fri, 10 Jan 2025 16:59:46 +0100 Subject: [PATCH 13/17] add: split onnxtr det and reco --- libs/megaparse/src/megaparse/models/page.py | 4 +- libs/megaparse/src/megaparse/parser/base.py | 5 +- .../src/megaparse/parser/doctr_parser.py | 239 ++++++++++++++---- .../predictor/doctr_layout_detector.py | 141 ----------- .../src/megaparse/predictor/models/base.py | 62 ++++- 5 files changed, 259 insertions(+), 192 deletions(-) delete mode 100644 libs/megaparse/src/megaparse/predictor/doctr_layout_detector.py diff --git a/libs/megaparse/src/megaparse/models/page.py b/libs/megaparse/src/megaparse/models/page.py index eb9c1f1..970c226 100644 --- a/libs/megaparse/src/megaparse/models/page.py +++ b/libs/megaparse/src/megaparse/models/page.py @@ -2,10 +2,10 @@ from megaparse.predictor.models.base import PageLayout from megaparse_sdk.schema.parser_config import StrategyEnum -from numpy.typing import NDArray from pydantic import BaseModel, ConfigDict from pypdfium2._helpers.page import PdfPage from PIL.Image import Image as PILImage +import numpy as np class PageDimension(BaseModel): @@ -24,7 +24,7 @@ class Page(BaseModel): strategy: StrategyEnum text_detections: PageLayout | None = None - rasterized: PILImage + rasterized: PILImage | None = None page_size: PageDimension page_index: int pdfium_elements: PdfPage diff --git a/libs/megaparse/src/megaparse/parser/base.py b/libs/megaparse/src/megaparse/parser/base.py index 8c3964d..e079fbb 100644 --- a/libs/megaparse/src/megaparse/parser/base.py +++ b/libs/megaparse/src/megaparse/parser/base.py @@ -1,7 +1,8 @@ from abc import ABC, abstractmethod from pathlib import Path -from typing import IO +from typing import IO, List +from megaparse.models.page import Page from megaparse_sdk.schema.extensions import FileExtension from megaparse.models.document import Document @@ -32,6 +33,7 @@ async def aconvert( self, file_path: str | Path | None = None, file: IO[bytes] | None = None, + pages: List[Page] | None = None, file_extension: FileExtension | None = None, **kwargs, ) -> Document: @@ -55,6 +57,7 @@ def convert( self, file_path: str | Path | None = None, file: IO[bytes] | None = None, + pages: List[Page] | None = None, file_extension: FileExtension | None = None, **kwargs, ) -> Document: diff --git a/libs/megaparse/src/megaparse/parser/doctr_parser.py b/libs/megaparse/src/megaparse/parser/doctr_parser.py index 29a3a7e..b8ac07e 100644 --- a/libs/megaparse/src/megaparse/parser/doctr_parser.py +++ b/libs/megaparse/src/megaparse/parser/doctr_parser.py @@ -1,24 +1,43 @@ import logging import warnings from pathlib import Path -from typing import IO, BinaryIO, List +from typing import IO, Any, BinaryIO, List -from megaparse.configs.auto import DeviceEnum, TextRecoConfig, TextDetConfig +import PIL import onnxruntime as rt from megaparse_sdk.schema.extensions import FileExtension from onnxtr.io import Document, DocumentFile -from onnxtr.models import ocr_predictor +from onnxtr.models import detection_predictor, ocr_predictor, recognition_predictor +from onnxtr.models.detection.predictor import DetectionPredictor from onnxtr.models.engine import EngineConfig +from onnxtr.models.predictor.base import _OCRPredictor +from onnxtr.utils.geometry import detach_scores +from onnxtr.utils.repr import NestedObject +from megaparse.configs.auto import DeviceEnum, TextDetConfig, TextRecoConfig from megaparse.models.document import Document as MPDocument from megaparse.models.document import ImageBlock, TextBlock +from megaparse.models.page import Page from megaparse.parser.base import BaseParser -from megaparse.predictor.models.base import BBOX, Point2D +from megaparse.predictor.models.base import ( + BBOX, + BlockLayout, + BlockType, + PageLayout, + Point2D, +) +from onnxtr.models._utils import get_language + + +import numpy as np +from typing import List + +from PIL import Image as PILImage logger = logging.getLogger("megaparse") -class DoctrParser(BaseParser): +class DoctrParser(NestedObject, _OCRPredictor): supported_extensions = [FileExtension.PDF] def __init__( @@ -27,6 +46,8 @@ def __init__( text_reco_config: TextRecoConfig = TextRecoConfig(), device: DeviceEnum = DeviceEnum.CPU, straighten_pages: bool = False, + detect_orientation: bool = False, + detect_language: bool = False, **kwargs, ): self.device = device @@ -36,21 +57,38 @@ def __init__( session_options=general_options, providers=providers, ) - # TODO: set in config or pass as kwargs - self.predictor = ocr_predictor( - det_arch=text_det_config.det_arch, - reco_arch=text_reco_config.reco_arch, - det_bs=text_det_config.batch_size, - reco_bs=text_reco_config.batch_size, - assume_straight_pages=text_det_config.assume_straight_pages, - straighten_pages=straighten_pages, - # Preprocessing related parameters - det_engine_cfg=engine_config, - reco_engine_cfg=engine_config, + + _OCRPredictor.__init__( + self, + text_det_config.assume_straight_pages, + straighten_pages, + text_det_config.preserve_aspect_ratio, + text_det_config.symmetric_pad, + detect_orientation, clf_engine_cfg=engine_config, **kwargs, ) + self.det_predictor = detection_predictor( + arch=text_det_config.det_arch, + assume_straight_pages=text_det_config.assume_straight_pages, + preserve_aspect_ratio=text_det_config.preserve_aspect_ratio, + symmetric_pad=text_det_config.symmetric_pad, + batch_size=text_det_config.batch_size, + load_in_8_bit=text_det_config.load_in_8_bit, + engine_cfg=engine_config, + ) + + self.reco_predictor = recognition_predictor( + arch=text_reco_config.reco_arch, + batch_size=text_reco_config.batch_size, + load_in_8_bit=text_det_config.load_in_8_bit, + engine_cfg=engine_config, + ) + + self.detect_orientation = detect_orientation + self.detect_language = detect_language + def _get_providers(self) -> List[str]: prov = rt.get_available_providers() logger.info("Available providers:", prov) @@ -77,42 +115,151 @@ def _get_providers(self) -> List[str]: ) return ["CPUExecutionProvider"] - def convert( - self, - file_path: str | Path | None = None, - file: IO[bytes] | BinaryIO | None = None, - file_extension: None | FileExtension = None, - **kwargs, - ) -> MPDocument: - if file: - file.seek(0) - pdf = file.read() - elif file_path: - pdf = file_path # type: ignore + def get_text_detections(self, pages: list[Page], **kwargs) -> List[Page]: + rasterized_pages = [np.array(page.rasterized) for page in pages] + # Dimension check + if any(page.ndim != 3 for page in rasterized_pages): + raise ValueError( + "incorrect input shape: all pages are expected to be multi-channel 2D images." + ) + + origin_page_shapes = [page.shape[:2] for page in rasterized_pages] + + # Localize text elements + loc_preds, out_maps = self.det_predictor( + rasterized_pages, return_maps=True, **kwargs + ) + + # Detect document rotation and rotate pages + seg_maps = [ + np.where( + out_map > self.det_predictor.model.postprocessor, + 255, + 0, + ).astype(np.uint8) + for out_map in out_maps + ] + if self.detect_orientation: + general_pages_orientations, origin_pages_orientations = ( + self._get_orientations(rasterized_pages, seg_maps) + ) + orientations = [ + {"value": orientation_page, "confidence": None} + for orientation_page in origin_pages_orientations + ] else: - raise ValueError("Can't convert if file and file_path are None") + orientations = None + general_pages_orientations = None + origin_pages_orientations = None + if self.straighten_pages: + rasterized_pages = self._straighten_pages( + rasterized_pages, + seg_maps, + general_pages_orientations, + origin_pages_orientations, + ) + # update page shapes after straightening + origin_page_shapes = [page.shape[:2] for page in rasterized_pages] - self.check_supported_extension(file_extension, file_path) + # forward again to get predictions on straight pagess + loc_preds = self.det_predictor(pages, **kwargs) # type: ignore[assignment] - doc = DocumentFile.from_pdf(pdf) - # Analyze - doctr_result = self.predictor(doc) + # Detach objectness scores from loc_preds + loc_preds, objectness_scores = detach_scores(loc_preds) # type: ignore[arg-type] - return self.__to_elements_list(doctr_result) + # Apply hooks to loc_preds if any + for hook in self.hooks: + loc_preds = hook(loc_preds) - async def aconvert( - self, - file_path: str | Path | None = None, - file: IO[bytes] | BinaryIO | None = None, - file_extension: None | FileExtension = None, - **kwargs, - ) -> MPDocument: - warnings.warn( - "The DocTRParser is a sync parser, please use the sync convert method", - UserWarning, - stacklevel=2, + for page_index, (rast_page, loc_pred, objectness_score, page) in enumerate( + zip(rasterized_pages, loc_preds, objectness_scores, pages, strict=True) + ): + block_layouts = [] + for bbox, score in zip(loc_pred, objectness_score, strict=True): + block_layouts.append( + BlockLayout( + bbox=BBOX(bbox[:2].tolist(), bbox[2:].tolist()), + objectness_score=score, + block_type=BlockType.TEXT, + ) + ) + page.text_detections = PageLayout( + bboxes=block_layouts, + page_index=page_index, + dimensions=rast_page.shape[:2], + orientation=orientations[page_index] if orientations is not None else 0, + origin_page_shape=origin_page_shapes[page_index], + ) + + return pages + + def get_text_recognition(self, pages: List[Page], **kwargs) -> MPDocument: + assert any( + page.text_detections is not None for page in pages + ), "Text detections should be computed before running text recognition" + + rasterized_pages = [] + loc_preds = [] + objectness_scores = [] + orientations = [] + origin_page_shapes = [] + for page in pages: + rasterized_pages.append(np.array(page.rasterized)) + loc_preds.append(page.text_detections.get_loc_preds()) # type: ignore + objectness_scores.append(page.text_detections.get_objectness_scores()) # type: ignore + orientations.append(page.text_detections.get_orientations()) # type: ignore + origin_page_shapes.append(page.text_detections.get_origin_page_shapes()) # type: ignore + # Crop images + crops, loc_preds = self._prepare_crops( + rasterized_pages, + loc_preds, # type: ignore[arg-type] + channels_last=True, + assume_straight_pages=self.assume_straight_pages, + assume_horizontal=self._page_orientation_disabled, + ) + # Rectify crop orientation and get crop orientation predictions + crop_orientations: Any = [] + if not self.assume_straight_pages: + crops, loc_preds, _crop_orientations = self._rectify_crops(crops, loc_preds) + crop_orientations = [ + {"value": orientation[0], "confidence": orientation[1]} + for orientation in _crop_orientations + ] + + # Identify character sequences + word_preds = self.reco_predictor( + [crop for page_crops in crops for crop in page_crops], **kwargs + ) + if not crop_orientations: + crop_orientations = [{"value": 0, "confidence": None} for _ in word_preds] + + boxes, text_preds, crop_orientations = self._process_predictions( + loc_preds, word_preds, crop_orientations + ) + + if self.detect_language: + languages = [ + get_language(" ".join([item[0] for item in text_pred])) + for text_pred in text_preds + ] + languages_dict = [ + {"value": lang[0], "confidence": lang[1]} for lang in languages + ] + else: + languages_dict = None + + # FIXME : Not good return type we want :( + out = self.doc_builder( + rasterized_pages, + boxes, + objectness_scores, + text_preds, + origin_page_shapes, + crop_orientations, + orientations, + languages_dict, ) - return self.convert(file_path, file, file_extension, **kwargs) + return self.__to_elements_list(out) def __to_elements_list(self, doctr_document: Document) -> MPDocument: result = [] diff --git a/libs/megaparse/src/megaparse/predictor/doctr_layout_detector.py b/libs/megaparse/src/megaparse/predictor/doctr_layout_detector.py deleted file mode 100644 index fc50e5a..0000000 --- a/libs/megaparse/src/megaparse/predictor/doctr_layout_detector.py +++ /dev/null @@ -1,141 +0,0 @@ -import logging -from typing import Any, List - -import numpy as np -from megaparse.predictor.models.base import ( - BBOX, - BlockLayout, - BlockType, - PageLayout, -) -from onnxtr.models.detection.predictor import DetectionPredictor -from onnxtr.models.engine import EngineConfig -from onnxtr.models.predictor.base import _OCRPredictor -from onnxtr.utils.geometry import detach_scores -from onnxtr.utils.repr import NestedObject - -logger = logging.getLogger("megaparse") - - -class LayoutPredictor(NestedObject, _OCRPredictor): - """Implements an object able to localize and identify text elements in a set of documents - - Args: - det_predictor: detection module - reco_predictor: recognition module - assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages - without rotated textual elements. - straighten_pages: if True, estimates the page general orientation based on the median line orientation. - Then, rotates page before passing it to the deep learning modules. The final predictions will be remapped - accordingly. Doing so will improve performances for documents with page-uniform rotations. - detect_orientation: if True, the estimated general page orientation will be added to the predictions for each - page. Doing so will slightly deteriorate the overall latency. - detect_language: if True, the language prediction will be added to the predictions for each - page. Doing so will slightly deteriorate the overall latency. - clf_engine_cfg: configuration of the orientation classification engine - **kwargs: keyword args of `DocumentBuilder` - """ - - def __init__( - self, - det_predictor: DetectionPredictor, - assume_straight_pages: bool = True, - straighten_pages: bool = False, - preserve_aspect_ratio: bool = True, - symmetric_pad: bool = True, - detect_orientation: bool = False, - use_gpu: bool = False, - clf_engine_cfg: EngineConfig | None = None, - **kwargs: Any, - ): - self.det_predictor = det_predictor - _OCRPredictor.__init__( - self, - assume_straight_pages, - straighten_pages, - preserve_aspect_ratio, - symmetric_pad, - detect_orientation, - clf_engine_cfg=clf_engine_cfg, - **kwargs, - ) - self.detect_orientation = detect_orientation - - def __call__( - self, - pages: list[np.ndarray], - **kwargs: Any, - ) -> List[PageLayout]: # FIXME : Create new LayoutDocument class - """Localize and identify text elements in a set of documents - - Args: - pages: list of pages to be processed - - Returns: - Document: the document object containing the text elements - """ - # Dimension check - if any(page.ndim != 3 for page in pages): - raise ValueError( - "incorrect input shape: all pages are expected to be multi-channel 2D images." - ) - - # Localize text elements - loc_preds, out_maps = self.det_predictor(pages, return_maps=True, **kwargs) - - # Detect document rotation and rotate pages - seg_maps = [ - np.where( - out_map > self.det_predictor.model.postprocessor.bin_thresh, - 255, - 0, - ).astype(np.uint8) - for out_map in out_maps - ] - if self.detect_orientation: - general_pages_orientations, origin_pages_orientations = ( - self._get_orientations(pages, seg_maps) - ) - else: - general_pages_orientations = None - origin_pages_orientations = None - if self.straighten_pages: - pages = self._straighten_pages( - pages, seg_maps, general_pages_orientations, origin_pages_orientations - ) - - # forward again to get predictions on straight pages - loc_preds = self.det_predictor(pages, **kwargs) # type: ignore[assignment] - - # Detach objectness scores from loc_preds - loc_preds, objectness_scores = detach_scores(loc_preds) # type: ignore[arg-type] - - # Apply hooks to loc_preds if any - for hook in self.hooks: - loc_preds = hook(loc_preds) - - all_pages_layouts = [] - for page_index, (page, loc_pred, objectness_score) in enumerate( - zip(pages, loc_preds, objectness_scores, strict=True) - ): - block_layouts = [] - for bbox, score in zip(loc_pred, objectness_score, strict=True): - block_layouts.append( - BlockLayout( - bbox=BBOX(bbox[:2].tolist(), bbox[2:].tolist()), - objectness_score=score, - block_type=BlockType.TEXT, - ) - ) - all_pages_layouts.append( - PageLayout( - bboxes=block_layouts, - page_index=page_index, - dimensions=page.shape[:2], - orientation=general_pages_orientations[page_index] - if general_pages_orientations is not None - else 0, - ) - ) - - return all_pages_layouts diff --git a/libs/megaparse/src/megaparse/predictor/models/base.py b/libs/megaparse/src/megaparse/predictor/models/base.py index 6b92121..27b63b3 100644 --- a/libs/megaparse/src/megaparse/predictor/models/base.py +++ b/libs/megaparse/src/megaparse/predictor/models/base.py @@ -27,17 +27,25 @@ class BlockLayout(BaseModel): class PageLayout: - __slots__ = ["bboxes", "page_index", "dimensions", "orientation"] + __slots__ = [ + "bboxes", + "page_index", + "dimensions", + "orientation", + "origin_page_shape", + ] bboxes: List[BlockLayout] page_index: int dimensions: Tuple[int, ...] orientation: Tuple[int, float] | Literal[0] + origin_page_shape: Tuple[int, ...] - def __init__(self, bboxes, page_index, dimensions, orientation): + def __init__(self, bboxes, page_index, dimensions, orientation, origin_page_shape): self.bboxes = bboxes self.page_index = page_index self.dimensions = dimensions self.orientation = orientation + self.origin_page_shape = origin_page_shape def __repr__(self) -> str: return f"PageLayout(bboxes={self.bboxes}, page_index={self.page_index}, dimensions={self.dimensions}, orientation={self.orientation})" @@ -69,3 +77,53 @@ def render( image.save(output_path) print(f"Page layout saved to {output_path}") return image + + def get_loc_preds(self) -> np.ndarray: + """ + Get the location predictions of the bounding boxes. + + Returns: + np.ndarray: The location predictions as a NumPy array. + """ + loc_preds = np.array( + [ + [ + block.bbox.top_left.x, + block.bbox.top_left.y, + block.bbox.bottom_right.x, + block.bbox.bottom_right.y, + ] + for block in self.bboxes + ] + ) + return loc_preds + + def get_objectness_scores(self) -> np.ndarray: + """ + Get the objectness scores of the bounding boxes. + + Returns: + np.ndarray: The objectness scores as a NumPy array. + """ + objectness_scores = np.array([block.objectness_score for block in self.bboxes]) + return objectness_scores + + def get_origin_page_shapes(self) -> np.ndarray: + """ + Get the original page shapes. + + Returns: + np.ndarray: The original page shapes as a NumPy array. + """ + origin_page_shapes = np.array([self.origin_page_shape for _ in self.bboxes]) + return origin_page_shapes + + def get_orientations(self) -> np.ndarray: + """ + Get the orientations of the bounding boxes. + + Returns: + np.ndarray: The orientations as a NumPy array. + """ + orientations = np.array([self.orientation for _ in self.bboxes]) + return orientations From e0c0db0d30d197fa5cd07ca92340bb96d467eade Mon Sep 17 00:00:00 2001 From: chloedia Date: Mon, 13 Jan 2025 15:49:18 +0100 Subject: [PATCH 14/17] feat: Doctr in MegaParse --- benchmark/process_single_doc.py | 4 +- evaluations/script.py | 3 +- libs/megaparse/src/megaparse/api/app.py | 22 +- libs/megaparse/src/megaparse/configs/auto.py | 17 +- .../src/megaparse/examples/parse_file.py | 15 +- .../src/megaparse/examples/parsing_process.py | 391 ++++++++++++++++++ libs/megaparse/src/megaparse/megaparse.py | 273 +++++++----- libs/megaparse/src/megaparse/models/page.py | 13 +- libs/megaparse/src/megaparse/parser/base.py | 2 - .../src/megaparse/parser/doctr_parser.py | 26 +- .../src/megaparse/parser/strategy.py | 192 --------- .../megaparse/src/megaparse/utils/strategy.py | 71 ++++ .../src/megaparse/utils/strategy_utils.py | 16 - libs/megaparse/tests/pdf/test_all_parsers.py | 31 -- libs/megaparse/tests/pdf/test_detect_ocr.py | 29 +- .../tests/pdf/test_pdf_processing.py | 35 +- libs/megaparse/tests/test_import.py | 4 +- libs/megaparse/tests/test_parsers.py | 2 +- 18 files changed, 718 insertions(+), 428 deletions(-) create mode 100644 libs/megaparse/src/megaparse/examples/parsing_process.py delete mode 100644 libs/megaparse/src/megaparse/parser/strategy.py create mode 100644 libs/megaparse/src/megaparse/utils/strategy.py delete mode 100644 libs/megaparse/src/megaparse/utils/strategy_utils.py delete mode 100644 libs/megaparse/tests/pdf/test_all_parsers.py diff --git a/benchmark/process_single_doc.py b/benchmark/process_single_doc.py index 746ec24..bc2eac3 100644 --- a/benchmark/process_single_doc.py +++ b/benchmark/process_single_doc.py @@ -24,8 +24,8 @@ async def process_file(megaparse: MegaParse, file_path: str | Path): async def test_process_file(file: str | Path): - parser = UnstructuredParser(strategy=StrategyEnum.HI_RES) - megaparse = MegaParse(parser=parser) + # parser = UnstructuredParser(strategy=StrategyEnum.HI_RES) + megaparse = MegaParse() task = [] for _ in range(N_TRY): task.append(process_file(megaparse, file)) diff --git a/evaluations/script.py b/evaluations/script.py index 811f640..203ab25 100644 --- a/evaluations/script.py +++ b/evaluations/script.py @@ -6,6 +6,7 @@ from megaparse.parser.llama import LlamaParser from megaparse.parser.megaparse_vision import MegaParseVision from megaparse.parser.unstructured_parser import UnstructuredParser +from megaparse_sdk.schema.parser_config import StrategyEnum if __name__ == "__main__": print("---Launching evaluations script---") @@ -29,7 +30,7 @@ for method, parser in parser_dict.items(): print(f"Method: {method}") - megaparse = MegaParse(parser=parser) + megaparse = MegaParse() result = megaparse.load(file_path=base_pdf_path) score_dict[method] = difflib.SequenceMatcher(None, base_md, result).ratio() print(f"Score for method {method}: {score_dict[method]}") diff --git a/libs/megaparse/src/megaparse/api/app.py b/libs/megaparse/src/megaparse/api/app.py index d6f0ae8..b95a1a7 100644 --- a/libs/megaparse/src/megaparse/api/app.py +++ b/libs/megaparse/src/megaparse/api/app.py @@ -89,16 +89,16 @@ async def parse_file( else: raise HTTPModelNotSupported() - parser_config = ParseFileConfig( - method=method, - strategy=strategy, - model=model if model and check_table else None, - language=language, - parsing_instruction=parsing_instruction, - ) + # parser_config = ParseFileConfig( #FIXME + # method=method, + # strategy=strategy, + # llm_model_name=SupportedModel(model_name) if model_name and check_table else None, + # language=language, + # parsing_instruction=parsing_instruction, + # ) try: - parser = parser_builder.build(parser_config) - megaparse = MegaParse(parser=parser) + # parser = parser_builder.build(parser_config) + megaparse = MegaParse() if not file.filename: raise HTTPFileNotFound("No filename provided") _, extension = os.path.splitext(file.filename) @@ -136,9 +136,7 @@ async def upload_url( with tempfile.NamedTemporaryFile(delete=False, suffix="pdf") as temp_file: temp_file.write(response.content) try: - megaparse = MegaParse( - parser=UnstructuredParser(strategy=StrategyEnum.AUTO) - ) + megaparse = MegaParse() result = await megaparse.aload(temp_file.name) return {"message": "File parsed successfully", "result": result} except ParsingException: diff --git a/libs/megaparse/src/megaparse/configs/auto.py b/libs/megaparse/src/megaparse/configs/auto.py index c0034c1..688ed92 100644 --- a/libs/megaparse/src/megaparse/configs/auto.py +++ b/libs/megaparse/src/megaparse/configs/auto.py @@ -14,8 +14,8 @@ class TextDetConfig(BaseModel): class AutoStrategyConfig(BaseModel): - auto_page_threshold: float = 0.6 - auto_document_threshold: float = 0.2 + page_threshold: float = 0.6 + document_threshold: float = 0.2 class TextRecoConfig(BaseModel): @@ -29,6 +29,14 @@ class DeviceEnum(str, Enum): COREML = "coreml" +class DoctrConfig(BaseModel): + straighten_pages: bool = False + detect_orientation: bool = False + detect_language: bool = False + text_det_config: TextDetConfig = TextDetConfig() + text_reco_config: TextRecoConfig = TextRecoConfig() + + class MegaParseConfig(BaseSettings): """ Configuration for Megaparse. @@ -41,7 +49,6 @@ class MegaParseConfig(BaseSettings): extra="ignore", use_enum_values=True, ) - text_det_config: TextDetConfig = TextDetConfig() - text_reco_config: TextRecoConfig = TextRecoConfig() - auto_parse_config: AutoStrategyConfig = AutoStrategyConfig() + doctr_config: DoctrConfig = DoctrConfig() + auto_config: AutoStrategyConfig = AutoStrategyConfig() device: DeviceEnum = DeviceEnum.CPU diff --git a/libs/megaparse/src/megaparse/examples/parse_file.py b/libs/megaparse/src/megaparse/examples/parse_file.py index 248ad7c..4d95799 100644 --- a/libs/megaparse/src/megaparse/examples/parse_file.py +++ b/libs/megaparse/src/megaparse/examples/parse_file.py @@ -13,6 +13,7 @@ from megaparse.parser.doctr_parser import DoctrParser from megaparse.parser.unstructured_parser import UnstructuredParser from megaparse_sdk.schema.extensions import FileExtension +from megaparse_sdk.schema.parser_config import StrategyEnum from pydantic import BaseModel, Field @@ -22,18 +23,16 @@ class MyCustomFormat(BaseModel): solution: str = Field(description="The solution statement.") -async def main(): - # Parse a file - parser = DoctrParser() - model = ChatOpenAI(name="gpt-4o") +def main(): + # model = ChatOpenAI(name="gpt-4o") # formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat) - megaparse = MegaParse(ocr_parser=parser) + megaparse = MegaParse() - file_path = Path("./tests/pdf/sample_pdf.pdf") - result = await megaparse.aload(file_path=file_path) + file_path = Path("./tests/pdf/native/0168011.pdf") + result = megaparse.load(file_path=file_path) print(result) if __name__ == "__main__": - asyncio.run(main()) + main() diff --git a/libs/megaparse/src/megaparse/examples/parsing_process.py b/libs/megaparse/src/megaparse/examples/parsing_process.py new file mode 100644 index 0000000..b855ace --- /dev/null +++ b/libs/megaparse/src/megaparse/examples/parsing_process.py @@ -0,0 +1,391 @@ +import warnings +from pathlib import Path +from typing import IO, Any, List, Tuple + +import numpy as np +import onnxruntime as rt +import pypdfium2 as pdfium +from megaparse.configs.auto import ( + AutoStrategyConfig, + DeviceEnum, + TextDetConfig, + TextRecoConfig, +) +from megaparse.models.page import Page, PageDimension +from megaparse.parser.doctr_parser import DoctrParser +from megaparse.parser.unstructured_parser import UnstructuredParser +from megaparse.predictor.models.base import BBOX, BlockLayout, BlockType, PageLayout +from megaparse.utils.strategy_utils import need_hi_res +from megaparse_sdk.schema.extensions import FileExtension +from megaparse_sdk.schema.parser_config import StrategyEnum +from numpy.typing import NDArray +from onnxtr.io import DocumentFile +from onnxtr.models import detection_predictor, recognition_predictor +from onnxtr.models.detection.predictor import DetectionPredictor +from onnxtr.models.engine import EngineConfig +from onnxtr.utils.geometry import ( + detach_scores, + extract_crops, + extract_rcrops, +) +from pypdfium2._helpers.page import PdfPage +from onnxtr.models.builder import DocumentBuilder + + +def get_strategy_page( + pdfium_page: PdfPage, onnxtr_page: PageLayout, page_threshold: float = 0.6 +) -> StrategyEnum: + # assert ( + # p_width == onnxtr_page.dimensions[1] + # and p_height == onnxtr_page.dimensions[0] + # ), "Page dimensions do not match" + text_coords = [] + # Get all the images in the page + for obj in pdfium_page.get_objects(): + if obj.type == 1: + text_coords.append(obj.get_pos()) + + p_width, p_height = int(pdfium_page.get_width()), int(pdfium_page.get_height()) + + pdfium_canva = np.zeros((int(p_height), int(p_width))) + + for coords in text_coords: + # (left,bottom,right, top) + # 0---l--------------R-> y + # | + # B (x0,y0) + # | + # T (x1,y1) + # ^ + # x + x0, y0, x1, y1 = ( + p_height - coords[3], + coords[0], + p_height - coords[1], + coords[2], + ) + x0 = max(0, min(p_height, int(x0))) + y0 = max(0, min(p_width, int(y0))) + x1 = max(0, min(p_height, int(x1))) + y1 = max(0, min(p_width, int(y1))) + pdfium_canva[x0:x1, y0:y1] = 1 + + onnxtr_canva = np.zeros((int(p_height), int(p_width))) + for block in onnxtr_page.bboxes: + x0, y0 = block.bbox[0] + x1, y1 = block.bbox[1] + x0 = max(0, min(int(x0 * p_width), int(p_width))) + y0 = max(0, min(int(y0 * p_height), int(p_height))) + x1 = max(0, min(int(x1 * p_width), int(p_width))) + y1 = max(0, min(int(y1 * p_height), int(p_height))) + onnxtr_canva[y0:y1, x0:x1] = 1 + + intersection = np.logical_and(pdfium_canva, onnxtr_canva) + union = np.logical_or(pdfium_canva, onnxtr_canva) + iou = np.sum(intersection) / np.sum(union) + if iou < page_threshold: + return StrategyEnum.HI_RES + return StrategyEnum.FAST + + +def _get_providers(device=DeviceEnum.CPU) -> List[str]: + prov = rt.get_available_providers() + print("Available providers:", prov) + if device == DeviceEnum.CUDA: + # TODO: support openvino, directml etc + if "CUDAExecutionProvider" not in prov: + raise ValueError( + "onnxruntime can't find CUDAExecutionProvider in list of available providers" + ) + return ["TensorrtExecutionProvider", "CUDAExecutionProvider"] + elif device == DeviceEnum.COREML: + if "CoreMLExecutionProvider" not in prov: + raise ValueError( + "onnxruntime can't find CoreMLExecutionProvider in list of available providers" + ) + return ["CoreMLExecutionProvider"] + elif device == DeviceEnum.CPU: + return ["CPUExecutionProvider"] + else: + warnings.warn( + "Device not supported, using CPU", + UserWarning, + stacklevel=2, + ) + return ["CPUExecutionProvider"] + + +def validate_input( + file_path: Path | str | None = None, + file: IO[bytes] | None = None, + file_extension: str | FileExtension | None = None, +) -> FileExtension: + if not (file_path or file): + raise ValueError("Either file_path or file should be provided") + + if file_path and file: + raise ValueError("Only one of file_path or file should be provided") + + if file_path and file is None: + if isinstance(file_path, str): + file_path = Path(file_path) + file_extension = file_path.suffix + elif file and file_path is None: + if not file_extension: + raise ValueError( + "file_extension should be provided when given file argument" + ) + file.seek(0) + else: + raise ValueError("Either provider a file_path or file") + + if isinstance(file_extension, str): + try: + file_extension = FileExtension(file_extension) + except ValueError: + raise ValueError(f"Unsupported file extension: {file_extension}") + return file_extension + + +def _generate_crops( + pages: list[np.ndarray], + loc_preds: list[np.ndarray], + channels_last: bool, + assume_straight_pages: bool = False, + assume_horizontal: bool = False, +) -> list[list[np.ndarray]]: + if assume_straight_pages: + crops = [ + extract_crops(page, _boxes[:, :4], channels_last=channels_last) + for page, _boxes in zip(pages, loc_preds) + ] + else: + crops = [ + extract_rcrops( + page, + _boxes[:, :4], + channels_last=channels_last, + assume_horizontal=assume_horizontal, + ) + for page, _boxes in zip(pages, loc_preds) + ] + return crops + + +def _prepare_crops( + pages: list[np.ndarray], + loc_preds: list[np.ndarray], + channels_last: bool, + assume_straight_pages: bool = False, + assume_horizontal: bool = False, +) -> tuple[list[list[np.ndarray]], list[np.ndarray]]: + crops = _generate_crops( + pages, loc_preds, channels_last, assume_straight_pages, assume_horizontal + ) + + # Avoid sending zero-sized crops + is_kept = [ + [all(s > 0 for s in crop.shape) for crop in page_crops] for page_crops in crops + ] + crops = [ + [crop for crop, _kept in zip(page_crops, page_kept) if _kept] + for page_crops, page_kept in zip(crops, is_kept) + ] + loc_preds = [_boxes[_kept] for _boxes, _kept in zip(loc_preds, is_kept)] + + return crops, loc_preds + + +def _process_predictions( + loc_preds: list[np.ndarray], + word_preds: list[tuple[str, float]], + crop_orientations: list[dict[str, Any]], +) -> tuple[list[np.ndarray], list[list[tuple[str, float]]], list[list[dict[str, Any]]]]: + text_preds = [] + crop_orientation_preds = [] + if len(loc_preds) > 0: + # Text & crop orientation predictions at page level + _idx = 0 + for page_boxes in loc_preds: + text_preds.append(word_preds[_idx : _idx + page_boxes.shape[0]]) + crop_orientation_preds.append( + crop_orientations[_idx : _idx + page_boxes.shape[0]] + ) + _idx += page_boxes.shape[0] + + return loc_preds, text_preds, crop_orientation_preds + + +def main(): + file_path = Path("./tests/pdf/sample_pdf.pdf") + strategy = StrategyEnum.AUTO + device = DeviceEnum.COREML + ocr_parser = DoctrParser() + default_parser = UnstructuredParser(strategy=StrategyEnum.FAST) + file_extension = validate_input(file_path=file_path) + with open(file_path, "rb") as file: + pdfium_document = pdfium.PdfDocument(file) + rasterized_pages: list[np.ndarray] = [ + np.array(page.render().to_pil(scale=2)) for page in pdfium_document + ] + ##----------------------------------- + ## GET PAGES + ##----------------------------------- + mp_pages = [] + if strategy == StrategyEnum.FAST: + parsed_document = default_parser.convert( + file=file, + file_extension=file_extension, + ) + else: + text_det_config = TextDetConfig() + general_options = rt.SessionOptions() + providers = _get_providers(device=device) + engine_config = EngineConfig( + session_options=general_options, + providers=providers, + ) + det_predictor = detection_predictor( + arch=text_det_config.det_arch, + assume_straight_pages=text_det_config.assume_straight_pages, + preserve_aspect_ratio=text_det_config.preserve_aspect_ratio, + symmetric_pad=text_det_config.symmetric_pad, + batch_size=text_det_config.batch_size, + load_in_8_bit=text_det_config.load_in_8_bit, + engine_cfg=engine_config, + ) + if any(page.ndim != 3 for page in rasterized_pages): + raise ValueError( + "incorrect input shape: all pages are expected to be multi-channel 2D images." + ) + + orientations = None + general_pages_orientations = None + # Localize text elements + loc_preds, out_maps = det_predictor(rasterized_pages, return_maps=True) + # FIXME: For simplicity we do not care about page orientation rn + # FIXME: similaly we don't care about straighten page + + # Detach objectness scores from loc_preds + loc_preds, objectness_scores = detach_scores(loc_preds) # type: ignore[arg-type] + + # FIXME: Do not care about hooks here + # # Apply hooks to loc_preds if any + # for hook in hooks: + # loc_preds = hook(loc_preds) + all_pages_layouts = [] + for page_index, (page, loc_pred, objectness_score) in enumerate( + zip(rasterized_pages, loc_preds, objectness_scores, strict=True) + ): + block_layouts = [] + for bbox, score in zip(loc_pred, objectness_score, strict=True): + block_layouts.append( + BlockLayout( + bbox=BBOX(bbox[:2].tolist(), bbox[2:].tolist()), + objectness_score=score, + block_type=BlockType.TEXT, + ) + ) + all_pages_layouts.append( + PageLayout( + bboxes=block_layouts, + page_index=page_index, + dimensions=page.shape[:2], + orientation=general_pages_orientations[page_index] + if general_pages_orientations is not None + else 0, + ) + ) + for pdfium_page, onnxtr_page, rasterized_page in zip( + pdfium_document, all_pages_layouts, rasterized_pages, strict=True + ): + strategy = get_strategy_page(pdfium_page, onnxtr_page) + mp_pages.append( + Page( + strategy=strategy, + text_detections=onnxtr_page, + rasterized=rasterized_page, + page_size=PageDimension( + width=pdfium_page.get_width(), + height=pdfium_page.get_height(), + ), + page_index=onnxtr_page.page_index, + pdfium_elements=pdfium_page, + ) + ) + + ##----------------------------------- + ## GET PARSER BASE ON CHOSE STRATEGY + ##----------------------------------- + if file_extension != FileExtension.PDF or strategy == StrategyEnum.FAST: + parser = default_parser + elif strategy == StrategyEnum.HI_RES: + parser = ocr_parser + else: + if need_hi_res(mp_pages, AutoStrategyConfig()): + parser = ocr_parser + else: + parser = default_parser + + ##----------------------------------- + ## PARSE FILE + ##----------------------------------- + if isinstance(parser, UnstructuredParser): + parsed_document = parser.convert( + file=file, + pages=mp_pages, + file_extension=file_extension, + ) + else: + origin_page_shapes: List[Tuple[int, int]] = [ + (page.shape[0], page.shape[1]) for page in rasterized_pages + ] + + reco_config = TextRecoConfig() + reco_predictor = recognition_predictor( + arch=reco_config.reco_arch, + batch_size=reco_config.batch_size, + load_in_8_bit=text_det_config.load_in_8_bit, + engine_cfg=engine_config, + ) + + # Crop images + crops, loc_preds = _prepare_crops( + rasterized_pages, + loc_preds, # type: ignore[arg-type] + channels_last=True, + assume_straight_pages=True, # FIXME: To change + assume_horizontal=True, # FIXME: To change + ) + # Rectify crop orientation and get crop orientation predictions + crop_orientations: Any = [] + + # Identify character sequences + word_preds = reco_predictor( + [crop for page_crops in crops for crop in page_crops] + ) + if not crop_orientations: + crop_orientations = [ + {"value": 0, "confidence": None} for _ in word_preds + ] + + boxes, text_preds, crop_orientations = _process_predictions( + loc_preds, word_preds, crop_orientations + ) + doc_builder = DocumentBuilder() + parsed_document = doc_builder( + rasterized_pages, + boxes, + objectness_scores, + text_preds, + origin_page_shapes, + crop_orientations, + orientations, + None, + ) + + print(parsed_document) + + +if __name__ == "__main__": + main() diff --git a/libs/megaparse/src/megaparse/megaparse.py b/libs/megaparse/src/megaparse/megaparse.py index 03f1b06..c6fc0db 100644 --- a/libs/megaparse/src/megaparse/megaparse.py +++ b/libs/megaparse/src/megaparse/megaparse.py @@ -3,18 +3,20 @@ from pathlib import Path from typing import IO, BinaryIO, List +import pypdfium2 as pdfium from megaparse_sdk.schema.extensions import FileExtension from megaparse_sdk.schema.parser_config import StrategyEnum from megaparse.configs.auto import DeviceEnum, MegaParseConfig from megaparse.exceptions.base import ParsingException from megaparse.formatter.base import BaseFormatter -from megaparse.models.page import Page -from megaparse.parser.base import BaseParser +from megaparse.models.page import GatewayDocument, Page, PageDimension from megaparse.parser.doctr_parser import DoctrParser -from megaparse.parser.strategy import StrategyHandler from megaparse.parser.unstructured_parser import UnstructuredParser -from megaparse.utils.strategy_utils import need_hi_res +from megaparse.utils.strategy import ( + determine_global_strategy, + get_page_strategy, +) logger = logging.getLogger("megaparse") @@ -24,30 +26,18 @@ class MegaParse: def __init__( self, - parser: BaseParser | None = None, - ocr_parser: BaseParser | None = None, formatters: List[BaseFormatter] | None = None, - strategy: StrategyEnum = StrategyEnum.AUTO, ) -> None: - if not parser: - parser = UnstructuredParser(strategy=StrategyEnum.FAST) - if not ocr_parser: - ocr_parser = DoctrParser( - text_det_config=self.config.text_det_config, - text_reco_config=self.config.text_reco_config, - device=self.config.device, - ) - - self.strategy = strategy - self.parser = parser self.formatters = formatters - self.ocr_parser = ocr_parser - - self.strategy_handler = StrategyHandler( - text_det_config=self.config.text_det_config, - auto_config=self.config.auto_parse_config, + self.doctr_parser = DoctrParser( + text_det_config=self.config.doctr_config.text_det_config, + text_reco_config=self.config.doctr_config.text_reco_config, device=self.config.device, + straighten_pages=self.config.doctr_config.straighten_pages, + detect_orientation=self.config.doctr_config.detect_orientation, + detect_language=self.config.doctr_config.detect_language, ) + self.unstructured_parser = UnstructuredParser() def validate_input( self, @@ -81,112 +71,173 @@ def validate_input( raise ValueError(f"Unsupported file extension: {file_extension}") return file_extension - async def aload( + def extract_page_strategies( + self, file: BinaryIO, rast_scale: int = 2 + ) -> List[Page]: + pdfium_document = pdfium.PdfDocument(file) + + pages = [] + for i, pdfium_page in enumerate(pdfium_document): + rasterized_page = pdfium_page.render(scale=rast_scale) + assert ( + abs(pdfium_page.get_width() * rast_scale - rasterized_page.width) <= 1 + ), ( + f"Widths do not match within a margin of 1: " + f"{pdfium_page.get_width() * rast_scale} != {rasterized_page.width}" + ) + pages.append( + Page( + strategy=StrategyEnum.AUTO, + text_detections=None, + rasterized=rasterized_page.to_pil(), + page_size=PageDimension( + width=pdfium_page.get_width() * rast_scale, + height=pdfium_page.get_height() * rast_scale, + ), + page_index=i, + pdfium_elements=pdfium_page, + ) + ) + + # ---- + # Get text detection for each page -> PAGE + + pages = self.doctr_parser.get_text_detections(pages) + + # --- + + # Get strategy per page -> PAGE + for page in pages: + page.strategy = get_page_strategy( + page.pdfium_elements, + page.text_detections, + threshold=self.config.auto_config.page_threshold, + ) + return pages + + def load( self, file_path: Path | str | None = None, file: BinaryIO | None = None, file_extension: str | FileExtension = "", + strategy: StrategyEnum = StrategyEnum.AUTO, ) -> str: file_extension = self.validate_input( file=file, file_path=file_path, file_extension=file_extension ) - opened_file = None # FIXM: Not sure of this method - try: - if file_path: - opended_file = open(file_path, "rb") - file = opended_file - - assert file is not None, "No File provided" - # First parse the file in with fast and get text detections - pages = self.strategy_handler.determine_strategy( - file=file, strategy=self.strategy + if file_extension != FileExtension.PDF or strategy == StrategyEnum.FAST: + self.unstructured_parser.strategy = strategy + return str( + self.unstructured_parser.convert( + file_path=file_path, file=file, file_extension=file_extension + ) ) - parser = self._select_parser(pages=pages, file_extension=file_extension) + else: + opened_file = None + try: + if file_path: + opened_file = open(file_path, "rb") + file = opened_file + + assert file is not None, "No File provided" + pages = self.extract_page_strategies(file) + strategy = determine_global_strategy( + pages, self.config.auto_config.document_threshold + ) - logger.info(f"Parsing using {parser.__class__.__name__} parser.") - parsed_document = await parser.aconvert( - file=file, file_extension=file_extension - ) - parsed_document.file_name = str(file_path) if file_path else None - - if self.formatters: - for formatter in self.formatters: - if isinstance(parsed_document, str): - warnings.warn( - f"The last step returned a string, the {formatter.__class__} and following will not be applied", - stacklevel=2, - ) - break - parsed_document = await formatter.aformat(parsed_document) - - if not isinstance(parsed_document, str): - return str(parsed_document) - return parsed_document - except Exception as e: - raise ParsingException( - f"Error while parsing file {file_path or file}, file_extension: {file_extension}: {e}" - ) - finally: - if opened_file: - opened_file.close() + if strategy == StrategyEnum.HI_RES: + print("Using Doctr for text recognition") + parsed_document = self.doctr_parser.get_text_recognition(pages) + + else: + print("Switching to Unstructured Parser") + self.unstructured_parser.strategy = StrategyEnum.FAST + parsed_document = self.unstructured_parser.convert( + file=file, file_extension=file_extension + ) + + parsed_document.file_name = str(file_path) if file_path else None + + if self.formatters: + for formatter in self.formatters: + if isinstance(parsed_document, str): + warnings.warn( + f"The last step returned a string, the {formatter.__class__} and following will not be applied", + stacklevel=2, + ) + break + parsed_document = formatter.format(parsed_document) + + if not isinstance(parsed_document, str): + return str(parsed_document) + return parsed_document + except Exception as e: + raise ParsingException( + f"Error while parsing file {file_path or file}, file_extension: {file_extension}: {e}" + ) + finally: + if opened_file: + opened_file.close() - def load( + async def aload( self, file_path: Path | str | None = None, file: BinaryIO | None = None, file_extension: str | FileExtension = "", + strategy: StrategyEnum = StrategyEnum.AUTO, ) -> str: file_extension = self.validate_input( file=file, file_path=file_path, file_extension=file_extension ) - opened_file = None # FIXM: Not sure of this method - try: - if file_path: - opended_file = open(file_path, "rb") - file = opended_file - - assert file is not None, "No File provided" - # First parse the file in with fast and get text detections - pages = self.strategy_handler.determine_strategy( - file=file, - ) - parser = self._select_parser(pages=pages, file_extension=file_extension) - - logger.info(f"Parsing using {parser.__class__.__name__} parser.") - parsed_document = parser.convert(file=file, file_extension=file_extension) - parsed_document.file_name = str(file_path) if file_path else None - - if self.formatters: - for formatter in self.formatters: - if isinstance(parsed_document, str): - warnings.warn( - f"The last step returned a string, the {formatter.__class__} and following will not be applied", - stacklevel=2, - ) - break - parsed_document = formatter.format(parsed_document) - - if not isinstance(parsed_document, str): - return str(parsed_document) - return parsed_document - except Exception as e: - raise ParsingException( - f"Error while parsing file {file_path or file}, file_extension: {file_extension}: {e}" + if file_extension != FileExtension.PDF or strategy == StrategyEnum.FAST: + self.unstructured_parser.strategy = strategy + parsed_document = await self.unstructured_parser.aconvert( + file_path=file_path, file=file, file_extension=file_extension ) - finally: - if opened_file: - opened_file.close() + return str(parsed_document) + else: + opened_file = None + try: + if file_path: + opened_file = open(file_path, "rb") + file = opened_file + + assert file is not None, "No File provided" + pages = self.extract_page_strategies(file) + strategy = determine_global_strategy( + pages, self.config.auto_config.document_threshold + ) - def _select_parser( - self, - pages: List[Page], - file_extension: str | FileExtension = "", - ) -> BaseParser: - if file_extension != FileExtension.PDF or self.strategy == StrategyEnum.FAST: - return self.parser - if self.strategy == StrategyEnum.HI_RES: - return self.ocr_parser - - if need_hi_res(pages, self.config.auto_parse_config): - return self.ocr_parser - return self.parser + if strategy == StrategyEnum.HI_RES: + print("Using Doctr for text recognition") + parsed_document = self.doctr_parser.get_text_recognition(pages) + + else: + print("Switching to Unstructured Parser") + self.unstructured_parser.strategy = StrategyEnum.FAST + parsed_document = await self.unstructured_parser.aconvert( + file=file, file_extension=file_extension + ) + + parsed_document.file_name = str(file_path) if file_path else None + + if self.formatters: + for formatter in self.formatters: + if isinstance(parsed_document, str): + warnings.warn( + f"The last step returned a string, the {formatter.__class__} and following will not be applied", + stacklevel=2, + ) + break + parsed_document = await formatter.aformat(parsed_document) + + if not isinstance(parsed_document, str): + return str(parsed_document) + return parsed_document + except Exception as e: + raise ParsingException( + f"Error while parsing file {file_path or file}, file_extension: {file_extension}: {e}" + ) + finally: + if opened_file: + opened_file.close() diff --git a/libs/megaparse/src/megaparse/models/page.py b/libs/megaparse/src/megaparse/models/page.py index 970c226..0a1011b 100644 --- a/libs/megaparse/src/megaparse/models/page.py +++ b/libs/megaparse/src/megaparse/models/page.py @@ -1,11 +1,9 @@ from typing import List - from megaparse.predictor.models.base import PageLayout from megaparse_sdk.schema.parser_config import StrategyEnum +from PIL.Image import Image as PILImage from pydantic import BaseModel, ConfigDict from pypdfium2._helpers.page import PdfPage -from PIL.Image import Image as PILImage -import numpy as np class PageDimension(BaseModel): @@ -30,3 +28,12 @@ class Page(BaseModel): pdfium_elements: PdfPage model_config = ConfigDict(arbitrary_types_allowed=True) + + +class GatewayDocument(BaseModel): + """ + A class to represent a Gateway MegaParse Document, which is a container of pages. + """ + + file_name: str + pages: List[Page] diff --git a/libs/megaparse/src/megaparse/parser/base.py b/libs/megaparse/src/megaparse/parser/base.py index e079fbb..0937f84 100644 --- a/libs/megaparse/src/megaparse/parser/base.py +++ b/libs/megaparse/src/megaparse/parser/base.py @@ -33,7 +33,6 @@ async def aconvert( self, file_path: str | Path | None = None, file: IO[bytes] | None = None, - pages: List[Page] | None = None, file_extension: FileExtension | None = None, **kwargs, ) -> Document: @@ -57,7 +56,6 @@ def convert( self, file_path: str | Path | None = None, file: IO[bytes] | None = None, - pages: List[Page] | None = None, file_extension: FileExtension | None = None, **kwargs, ) -> Document: diff --git a/libs/megaparse/src/megaparse/parser/doctr_parser.py b/libs/megaparse/src/megaparse/parser/doctr_parser.py index b8ac07e..04e649d 100644 --- a/libs/megaparse/src/megaparse/parser/doctr_parser.py +++ b/libs/megaparse/src/megaparse/parser/doctr_parser.py @@ -1,14 +1,13 @@ import logging import warnings -from pathlib import Path -from typing import IO, Any, BinaryIO, List +from typing import Any, List -import PIL +import numpy as np import onnxruntime as rt from megaparse_sdk.schema.extensions import FileExtension -from onnxtr.io import Document, DocumentFile -from onnxtr.models import detection_predictor, ocr_predictor, recognition_predictor -from onnxtr.models.detection.predictor import DetectionPredictor +from onnxtr.io import Document +from onnxtr.models import detection_predictor, recognition_predictor +from onnxtr.models._utils import get_language from onnxtr.models.engine import EngineConfig from onnxtr.models.predictor.base import _OCRPredictor from onnxtr.utils.geometry import detach_scores @@ -18,7 +17,6 @@ from megaparse.models.document import Document as MPDocument from megaparse.models.document import ImageBlock, TextBlock from megaparse.models.page import Page -from megaparse.parser.base import BaseParser from megaparse.predictor.models.base import ( BBOX, BlockLayout, @@ -26,13 +24,6 @@ PageLayout, Point2D, ) -from onnxtr.models._utils import get_language - - -import numpy as np -from typing import List - -from PIL import Image as PILImage logger = logging.getLogger("megaparse") @@ -133,7 +124,7 @@ def get_text_detections(self, pages: list[Page], **kwargs) -> List[Page]: # Detect document rotation and rotate pages seg_maps = [ np.where( - out_map > self.det_predictor.model.postprocessor, + out_map > self.det_predictor.model.postprocessor.bin_thresh, 255, 0, ).astype(np.uint8) @@ -204,8 +195,11 @@ def get_text_recognition(self, pages: List[Page], **kwargs) -> MPDocument: orientations = [] origin_page_shapes = [] for page in pages: + page_loc_pred = page.text_detections.get_loc_preds() # type: ignore + if page_loc_pred.shape[0] == 0: + page_loc_pred = np.zeros((0, 4)) rasterized_pages.append(np.array(page.rasterized)) - loc_preds.append(page.text_detections.get_loc_preds()) # type: ignore + loc_preds.append(page_loc_pred) # type: ignore objectness_scores.append(page.text_detections.get_objectness_scores()) # type: ignore orientations.append(page.text_detections.get_orientations()) # type: ignore origin_page_shapes.append(page.text_detections.get_origin_page_shapes()) # type: ignore diff --git a/libs/megaparse/src/megaparse/parser/strategy.py b/libs/megaparse/src/megaparse/parser/strategy.py deleted file mode 100644 index 1f9f20e..0000000 --- a/libs/megaparse/src/megaparse/parser/strategy.py +++ /dev/null @@ -1,192 +0,0 @@ -import logging -import random -import warnings -from pathlib import Path -from typing import BinaryIO, List, Tuple - -import numpy as np -import onnxruntime as rt -import pypdfium2 as pdfium -from megaparse_sdk.schema.parser_config import StrategyEnum -from onnxtr.io import DocumentFile -from onnxtr.models import detection_predictor -from onnxtr.models.engine import EngineConfig -from pypdfium2._helpers.page import PdfPage - -from megaparse.configs.auto import AutoStrategyConfig, DeviceEnum, TextDetConfig -from megaparse.models.page import Page, PageDimension -from megaparse.predictor.doctr_layout_detector import LayoutPredictor -from megaparse.predictor.models.base import PageLayout - -logger = logging.getLogger("megaparse") - - -class StrategyHandler: - def __init__( - self, - auto_config: AutoStrategyConfig = AutoStrategyConfig(), - text_det_config: TextDetConfig = TextDetConfig(), - device: DeviceEnum = DeviceEnum.CPU, - ) -> None: - self.config = auto_config - self.device = device - general_options = rt.SessionOptions() - providers = self._get_providers() - engine_config = EngineConfig( - session_options=general_options, - providers=providers, - ) - - self.det_predictor = detection_predictor( - arch=text_det_config.det_arch, - assume_straight_pages=text_det_config.assume_straight_pages, - preserve_aspect_ratio=text_det_config.preserve_aspect_ratio, - symmetric_pad=text_det_config.symmetric_pad, - batch_size=text_det_config.batch_size, - load_in_8_bit=text_det_config.load_in_8_bit, - engine_cfg=engine_config, - ) - - def _get_providers(self) -> List[str]: - prov = rt.get_available_providers() - logger.info("Available providers:", prov) - if self.device == DeviceEnum.CUDA: - # TODO: support openvino, directml etc - if "CUDAExecutionProvider" not in prov: - raise ValueError( - "onnxruntime can't find CUDAExecutionProvider in list of available providers" - ) - return ["TensorrtExecutionProvider", "CUDAExecutionProvider"] - elif self.device == DeviceEnum.COREML: - if "CoreMLExecutionProvider" not in prov: - raise ValueError( - "onnxruntime can't find CoreMLExecutionProvider in list of available providers" - ) - return ["CoreMLExecutionProvider"] - elif self.device == DeviceEnum.CPU: - return ["CPUExecutionProvider"] - else: - warnings.warn( - "Device not supported, using CPU", - UserWarning, - stacklevel=2, - ) - return ["CPUExecutionProvider"] - - def get_strategy_page( - self, pdfium_page: PdfPage, onnxtr_page: PageLayout - ) -> StrategyEnum: - # assert ( - # p_width == onnxtr_page.dimensions[1] - # and p_height == onnxtr_page.dimensions[0] - # ), "Page dimensions do not match" - text_coords = [] - # Get all the images in the page - for obj in pdfium_page.get_objects(): - if obj.type == 1: - text_coords.append(obj.get_pos()) - - p_width, p_height = int(pdfium_page.get_width()), int(pdfium_page.get_height()) - - pdfium_canva = np.zeros((int(p_height), int(p_width))) - - for coords in text_coords: - # (left,bottom,right, top) - # 0---l--------------R-> y - # | - # B (x0,y0) - # | - # T (x1,y1) - # ^ - # x - x0, y0, x1, y1 = ( - p_height - coords[3], - coords[0], - p_height - coords[1], - coords[2], - ) - x0 = max(0, min(p_height, int(x0))) - y0 = max(0, min(p_width, int(y0))) - x1 = max(0, min(p_height, int(x1))) - y1 = max(0, min(p_width, int(y1))) - pdfium_canva[x0:x1, y0:y1] = 1 - - onnxtr_canva = np.zeros((int(p_height), int(p_width))) - for block in onnxtr_page.bboxes: - x0, y0 = block.bbox[0] - x1, y1 = block.bbox[1] - x0 = max(0, min(int(x0 * p_width), int(p_width))) - y0 = max(0, min(int(y0 * p_height), int(p_height))) - x1 = max(0, min(int(x1 * p_width), int(p_width))) - y1 = max(0, min(int(y1 * p_height), int(p_height))) - onnxtr_canva[y0:y1, x0:x1] = 1 - - intersection = np.logical_and(pdfium_canva, onnxtr_canva) - union = np.logical_or(pdfium_canva, onnxtr_canva) - iou = np.sum(intersection) / np.sum(union) - if iou < self.config.auto_page_threshold: - return StrategyEnum.HI_RES - return StrategyEnum.FAST - - def determine_strategy( - self, - file: BinaryIO | Path | bytes, - max_samples: int = 5, - strategy: StrategyEnum = StrategyEnum.AUTO, - ) -> List[Page]: - if isinstance(file, BinaryIO): - file = file.read() # onnxtr expects a file as AbstractPath or bytes - logger.info("Determining strategy...") - pdfium_document = pdfium.PdfDocument(file) - - if strategy == StrategyEnum.FAST: - mp_pages = [] - for i, pdfium_page in enumerate(pdfium_document): - mp_pages.append( - Page( - strategy=strategy, - text_detections=None, - rasterized=pdfium_page.render().to_pil(), - page_size=PageDimension( - width=pdfium_page.get_width(), - height=pdfium_page.get_height(), - ), - page_index=i, - pdfium_elements=pdfium_page, - ) - ) - return mp_pages - - onnxtr_document = DocumentFile.from_pdf(file) - layout_predictor = LayoutPredictor(self.det_predictor) - - # if len(pdfium_document) > max_samples: - # sample_pages_index = random.sample(range(len(onnxtr_document)), max_samples) - # onnxtr_document = [onnxtr_document[i] for i in sample_pages_index] - # pdfium_document = [pdfium_document[i] for i in sample_pages_index] - - onnxtr_document_layout = layout_predictor(onnxtr_document) - - mp_pages: List[Page] = [] - - for pdfium_page, onnxtr_page in zip( - pdfium_document, onnxtr_document_layout, strict=True - ): - strategy = self.get_strategy_page(pdfium_page, onnxtr_page) - mp_pages.append( - Page( - strategy=strategy, - text_detections=onnxtr_page, - rasterized=pdfium_page.render().to_pil(), # FIXME check - page_size=PageDimension( - width=pdfium_page.get_width(), height=pdfium_page.get_height() - ), - page_index=onnxtr_page.page_index, - pdfium_elements=pdfium_page, - ) - ) - - if isinstance(pdfium_document, pdfium.PdfDocument): - pdfium_document.close() - - return mp_pages diff --git a/libs/megaparse/src/megaparse/utils/strategy.py b/libs/megaparse/src/megaparse/utils/strategy.py new file mode 100644 index 0000000..774281d --- /dev/null +++ b/libs/megaparse/src/megaparse/utils/strategy.py @@ -0,0 +1,71 @@ +from typing import List + +import numpy as np +from megaparse_sdk.schema.parser_config import StrategyEnum +from pypdfium2._helpers.page import PdfPage + +from megaparse.models.page import Page +from megaparse.predictor.models.base import PageLayout + + +def get_page_strategy( + pdfium_page: PdfPage, onnxtr_page: PageLayout | None, threshold: float +) -> StrategyEnum: + if onnxtr_page is None: + return StrategyEnum.FAST + text_coords = [] + # Get all the images in the page + for obj in pdfium_page.get_objects(): + if obj.type == 1: # type: ignore + text_coords.append(obj.get_pos()) + + p_width, p_height = int(pdfium_page.get_width()), int(pdfium_page.get_height()) + + pdfium_canva = np.zeros((int(p_height), int(p_width))) + + for coords in text_coords: + # (left,bottom,right, top) + # 0---l--------------R-> y + # | + # B (x0,y0) + # | + # T (x1,y1) + # ^ + # x + x0, y0, x1, y1 = ( + p_height - coords[3], + coords[0], + p_height - coords[1], + coords[2], + ) + x0 = max(0, min(p_height, int(x0))) + y0 = max(0, min(p_width, int(y0))) + x1 = max(0, min(p_height, int(x1))) + y1 = max(0, min(p_width, int(y1))) + pdfium_canva[x0:x1, y0:y1] = 1 + + onnxtr_canva = np.zeros((int(p_height), int(p_width))) + for block in onnxtr_page.bboxes: + x0, y0 = block.bbox[0] + x1, y1 = block.bbox[1] + x0 = max(0, min(int(x0 * p_width), int(p_width))) + y0 = max(0, min(int(y0 * p_height), int(p_height))) + x1 = max(0, min(int(x1 * p_width), int(p_width))) + y1 = max(0, min(int(y1 * p_height), int(p_height))) + onnxtr_canva[y0:y1, x0:x1] = 1 + + intersection = np.logical_and(pdfium_canva, onnxtr_canva) + union = np.logical_or(pdfium_canva, onnxtr_canva) + sum_intersection = np.sum(intersection) + sum_union = np.sum(union) + iou = sum_intersection / sum_union if sum_union != 0 else 0 + if iou < threshold: + return StrategyEnum.HI_RES + return StrategyEnum.FAST + + +def determine_global_strategy(pages: List[Page], threshold: float) -> StrategyEnum: + count = sum(1 for page in pages if page.strategy == StrategyEnum.HI_RES) + if count / len(pages) > threshold: + return StrategyEnum.HI_RES + return StrategyEnum.FAST diff --git a/libs/megaparse/src/megaparse/utils/strategy_utils.py b/libs/megaparse/src/megaparse/utils/strategy_utils.py deleted file mode 100644 index 0204562..0000000 --- a/libs/megaparse/src/megaparse/utils/strategy_utils.py +++ /dev/null @@ -1,16 +0,0 @@ -from typing import List - -from megaparse.configs.auto import AutoStrategyConfig -from megaparse.models.page import Page -from megaparse_sdk.schema.parser_config import StrategyEnum - - -def need_hi_res( - pages: List[Page], auto_config: AutoStrategyConfig = AutoStrategyConfig() -) -> bool: - need_ocr = 0 - for page in pages: - if page.strategy == StrategyEnum.HI_RES: - need_ocr += 1 - - return (need_ocr / len(pages)) > auto_config.auto_document_threshold diff --git a/libs/megaparse/tests/pdf/test_all_parsers.py b/libs/megaparse/tests/pdf/test_all_parsers.py deleted file mode 100644 index 9ac8204..0000000 --- a/libs/megaparse/tests/pdf/test_all_parsers.py +++ /dev/null @@ -1,31 +0,0 @@ -import pytest -from megaparse import MegaParse -from megaparse.parser.doctr_parser import DoctrParser -from megaparse.parser.llama import LlamaParser -from megaparse.parser.megaparse_vision import MegaParseVision -from megaparse.parser.unstructured_parser import UnstructuredParser - -PARSER_LIST = [ - UnstructuredParser, - DoctrParser, -] # LlamaParser, MegaParseVision are long and costly to test - - -@pytest.mark.parametrize("parser", PARSER_LIST) -def test_sync_parsers(parser): - parser = parser() - megaparse = MegaParse(parser) - response = megaparse.load("./tests/data/dummy.pdf") - print(response) - assert response - assert len(response) > 0 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("parser", PARSER_LIST) -async def test_async_parsers(parser): - parser = parser() - megaparse = MegaParse(parser) - response = await megaparse.aload("./tests/data/dummy.pdf") - print(response) - assert len(response) > 0 diff --git a/libs/megaparse/tests/pdf/test_detect_ocr.py b/libs/megaparse/tests/pdf/test_detect_ocr.py index e0d1f77..5923f1a 100644 --- a/libs/megaparse/tests/pdf/test_detect_ocr.py +++ b/libs/megaparse/tests/pdf/test_detect_ocr.py @@ -1,14 +1,14 @@ import os -from megaparse.utils.strategy_utils import need_hi_res import pytest -from megaparse.parser.strategy import StrategyHandler +from megaparse.megaparse import MegaParse +from megaparse.utils.strategy import determine_global_strategy from megaparse_sdk.schema.parser_config import StrategyEnum ocr_pdfs = os.listdir("./tests/pdf/ocr") native_pdfs = os.listdir("./tests/pdf/native") -strategy_handler = StrategyHandler() +megaparse = MegaParse() @pytest.mark.parametrize("hi_res_pdf", ocr_pdfs) @@ -17,16 +17,27 @@ def test_hi_res_strategy(hi_res_pdf): pytest.skip("Skip 0168004.pdf as it is flaky currently") with open(f"./tests/pdf/ocr/{hi_res_pdf}", "rb") as f: - pages = strategy_handler.determine_strategy( - f, + pages = megaparse.extract_page_strategies(f) + + assert ( + determine_global_strategy( + pages, megaparse.config.auto_config.document_threshold ) - assert need_hi_res(pages) + == StrategyEnum.HI_RES + ) @pytest.mark.parametrize("native_pdf", native_pdfs) def test_fast_strategy(native_pdf): + if native_pdf == "0168029.pdf": + pytest.skip("Skip 0168029.pdf as it is too long to process") + with open(f"./tests/pdf/native/{native_pdf}", "rb") as f: - pages = strategy_handler.determine_strategy( - f, + pages = megaparse.extract_page_strategies(f) + + assert ( + determine_global_strategy( + pages, megaparse.config.auto_config.document_threshold ) - assert not need_hi_res(pages) + == StrategyEnum.FAST + ) diff --git a/libs/megaparse/tests/pdf/test_pdf_processing.py b/libs/megaparse/tests/pdf/test_pdf_processing.py index 3fd5e96..fe84f0b 100644 --- a/libs/megaparse/tests/pdf/test_pdf_processing.py +++ b/libs/megaparse/tests/pdf/test_pdf_processing.py @@ -2,14 +2,10 @@ import pytest from megaparse.megaparse import MegaParse -from megaparse.parser.strategy import StrategyHandler -from megaparse.parser.unstructured_parser import UnstructuredParser -from megaparse.utils.strategy_utils import need_hi_res +from megaparse.utils.strategy import determine_global_strategy from megaparse_sdk.schema.extensions import FileExtension from megaparse_sdk.schema.parser_config import StrategyEnum -strategy_handler = StrategyHandler() - @pytest.fixture def native_pdf() -> Path: @@ -23,9 +19,9 @@ def scanned_pdf() -> Path: return p -def test_get_default_processors_megaparse(): - megaparse = MegaParse() - assert type(megaparse.parser) is UnstructuredParser +# def test_get_default_processors_megaparse(): +# megaparse = MegaParse() +# assert type(megaparse.parser) is UnstructuredParser @pytest.mark.asyncio @@ -56,16 +52,23 @@ async def test_megaparse_pdf_processor_file(pdf_name, request): def test_strategy(scanned_pdf, native_pdf): + processor = MegaParse() with open(native_pdf, "rb") as f: - native_pages = strategy_handler.determine_strategy( - f, + pages = processor.extract_page_strategies(f) + + assert ( + determine_global_strategy( + pages, processor.config.auto_config.document_threshold ) - result = need_hi_res(native_pages) - assert not result + == StrategyEnum.FAST + ) with open(scanned_pdf, "rb") as f: - scanned_pages = strategy_handler.determine_strategy( - f, + pages = processor.extract_page_strategies(f) + + assert ( + determine_global_strategy( + pages, processor.config.auto_config.document_threshold ) - result = need_hi_res(scanned_pages) - assert result + == StrategyEnum.HI_RES + ) diff --git a/libs/megaparse/tests/test_import.py b/libs/megaparse/tests/test_import.py index 430778c..acbac71 100644 --- a/libs/megaparse/tests/test_import.py +++ b/libs/megaparse/tests/test_import.py @@ -1,12 +1,10 @@ import pytest from megaparse import MegaParse -from megaparse.parser.unstructured_parser import UnstructuredParser @pytest.mark.skip("slow test") def test_load(): - parser = UnstructuredParser(model=None) - megaparse = MegaParse(parser) + megaparse = MegaParse() response = megaparse.load("./tests/data/dummy.pdf") print(response) assert response.strip("\n") == "Dummy PDF download" diff --git a/libs/megaparse/tests/test_parsers.py b/libs/megaparse/tests/test_parsers.py index 40e772a..9970d36 100644 --- a/libs/megaparse/tests/test_parsers.py +++ b/libs/megaparse/tests/test_parsers.py @@ -9,7 +9,7 @@ PARSER_LIST = [ UnstructuredParser, - DoctrParser, + # DoctrParser, ] From 1960167e6a41cfa6cde74a7df8f9061e724dcad4 Mon Sep 17 00:00:00 2001 From: chloedia Date: Mon, 13 Jan 2025 15:57:08 +0100 Subject: [PATCH 15/17] fix : Update ReadMe --- README.md | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index afab219..159fa21 100644 --- a/README.md +++ b/README.md @@ -41,34 +41,25 @@ pip install megaparse 4. If you have a mac, you also need to install libmagic ```brew install libmagic``` - +Use MegaParse as it is : ```python from megaparse import MegaParse from langchain_openai import ChatOpenAI -from megaparse.parser.unstructured_parser import UnstructuredParser -parser = UnstructuredParser() -megaparse = MegaParse(parser) +megaparse = MegaParse() response = megaparse.load("./test.pdf") print(response) -megaparse.save("./test.md") ``` ### Use MegaParse Vision -* Change the parser to MegaParseVision - ```python -from megaparse import MegaParse -from langchain_openai import ChatOpenAI from megaparse.parser.megaparse_vision import MegaParseVision model = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")) # type: ignore parser = MegaParseVision(model=model) -megaparse = MegaParse(parser) -response = megaparse.load("./test.pdf") +response = parser.convert("./test.pdf") print(response) -megaparse.save("./test.md") ``` **Note**: The model supported by MegaParse Vision are the multimodal ones such as claude 3.5, claude 4, gpt-4o and gpt-4. From be62a689314dfda81a2fa3ccd2170492a0052d8f Mon Sep 17 00:00:00 2001 From: chloedia Date: Mon, 13 Jan 2025 17:39:57 +0100 Subject: [PATCH 16/17] fix: add config as constructor parameters --- libs/megaparse/src/megaparse/megaparse.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/libs/megaparse/src/megaparse/megaparse.py b/libs/megaparse/src/megaparse/megaparse.py index c6fc0db..e899e85 100644 --- a/libs/megaparse/src/megaparse/megaparse.py +++ b/libs/megaparse/src/megaparse/megaparse.py @@ -22,12 +22,10 @@ class MegaParse: - config = MegaParseConfig() - def __init__( - self, - formatters: List[BaseFormatter] | None = None, + self, formatters: List[BaseFormatter] | None = None, config=MegaParseConfig() ) -> None: + self.config = config self.formatters = formatters self.doctr_parser = DoctrParser( text_det_config=self.config.doctr_config.text_det_config, From 1e11031256d45cc83bbb9675d3654e961b8b3cd3 Mon Sep 17 00:00:00 2001 From: chloedia Date: Tue, 14 Jan 2025 11:04:28 +0100 Subject: [PATCH 17/17] add: to_numpy to bbox --- .../src/megaparse/predictor/models/base.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/libs/megaparse/src/megaparse/predictor/models/base.py b/libs/megaparse/src/megaparse/predictor/models/base.py index 27b63b3..c79a859 100644 --- a/libs/megaparse/src/megaparse/predictor/models/base.py +++ b/libs/megaparse/src/megaparse/predictor/models/base.py @@ -19,6 +19,11 @@ class BBOX(NamedTuple): top_left: Point2D bottom_right: Point2D + def to_numpy(self): + return np.array( + [self.top_left.x, self.top_left.y, self.bottom_right.x, self.bottom_right.y] + ) + class BlockLayout(BaseModel): bbox: BBOX @@ -85,17 +90,7 @@ def get_loc_preds(self) -> np.ndarray: Returns: np.ndarray: The location predictions as a NumPy array. """ - loc_preds = np.array( - [ - [ - block.bbox.top_left.x, - block.bbox.top_left.y, - block.bbox.bottom_right.x, - block.bbox.bottom_right.y, - ] - for block in self.bboxes - ] - ) + loc_preds = np.array([block.bbox.to_numpy() for block in self.bboxes]) return loc_preds def get_objectness_scores(self) -> np.ndarray: