Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: type strategy output #216

Merged
merged 24 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
0730ff9
feat: create first format modules
chloedia Dec 1, 2024
a052e15
Merge remote-tracking branch 'origin' into feat/make_modular
chloedia Dec 9, 2024
5b63dc6
add: example file
chloedia Dec 9, 2024
eea6cfd
add: structured output formatter
chloedia Dec 9, 2024
099780d
Merge remote-tracking branch 'origin' into feat/make_modular
chloedia Dec 26, 2024
8a98694
Merge remote-tracking branch 'origin' into feat/make_modular
chloedia Jan 5, 2025
7917ae9
fix: all parsers outputs list of elements & compatibility formatters
chloedia Jan 6, 2025
351b63a
feat: new basemodel for document
chloedia Jan 6, 2025
52e2c02
add: structured output
chloedia Jan 7, 2025
50f4bb6
fix: test
chloedia Jan 7, 2025
01cab33
fix: add uncategorized text handling
chloedia Jan 8, 2025
05dc7b0
Merge remote-tracking branch 'origin/main' into feat/make_modular
chloedia Jan 8, 2025
04a858f
add: skip on flaky pdf
chloedia Jan 8, 2025
4b5b0d4
Merge branch 'feat/make_modular' into feat/type_strategy_output
chloedia Jan 8, 2025
2dcd952
add: section block
chloedia Jan 8, 2025
ab5bae4
Merge branch 'feat/make_modular' into feat/type_strategy_output
chloedia Jan 8, 2025
790bba3
fix: change load logic & create page element
chloedia Jan 8, 2025
79354e4
fix: add pages
chloedia Jan 9, 2025
adc69b1
add: split onnxtr det and reco
chloedia Jan 10, 2025
e0c0db0
feat: Doctr in MegaParse
chloedia Jan 13, 2025
eb65e2b
merge main
chloedia Jan 13, 2025
1960167
fix : Update ReadMe
chloedia Jan 13, 2025
be62a68
fix: add config as constructor parameters
chloedia Jan 13, 2025
1e11031
add: to_numpy to bbox
chloedia Jan 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 3 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,34 +41,25 @@ pip install megaparse

4. If you are on a Mac, you also need to install libmagic: ```brew install libmagic```


Use MegaParse as is:
```python
from megaparse import MegaParse
from langchain_openai import ChatOpenAI
from megaparse.parser.unstructured_parser import UnstructuredParser

parser = UnstructuredParser()
megaparse = MegaParse(parser)
megaparse = MegaParse()
response = megaparse.load("./test.pdf")
print(response)
megaparse.save("./test.md")
```

### Use MegaParse Vision

* Change the parser to MegaParseVision

```python
from megaparse import MegaParse
from langchain_openai import ChatOpenAI
from megaparse.parser.megaparse_vision import MegaParseVision

model = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")) # type: ignore
parser = MegaParseVision(model=model)
megaparse = MegaParse(parser)
response = megaparse.load("./test.pdf")
response = parser.convert("./test.pdf")
print(response)
megaparse.save("./test.md")

```
**Note**: The models supported by MegaParse Vision are multimodal ones, such as Claude 3.5, Claude 4, GPT-4o and GPT-4.
Expand Down
4 changes: 2 additions & 2 deletions benchmark/process_single_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ async def process_file(megaparse: MegaParse, file_path: str | Path):


async def test_process_file(file: str | Path):
parser = UnstructuredParser(strategy=StrategyEnum.HI_RES)
megaparse = MegaParse(parser=parser)
# parser = UnstructuredParser(strategy=StrategyEnum.HI_RES)
megaparse = MegaParse()
task = []
for _ in range(N_TRY):
task.append(process_file(megaparse, file))
Expand Down
3 changes: 2 additions & 1 deletion evaluations/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from megaparse.parser.llama import LlamaParser
from megaparse.parser.megaparse_vision import MegaParseVision
from megaparse.parser.unstructured_parser import UnstructuredParser
from megaparse_sdk.schema.parser_config import StrategyEnum

if __name__ == "__main__":
print("---Launching evaluations script---")
Expand All @@ -29,7 +30,7 @@

for method, parser in parser_dict.items():
print(f"Method: {method}")
megaparse = MegaParse(parser=parser)
megaparse = MegaParse()
result = megaparse.load(file_path=base_pdf_path)
score_dict[method] = difflib.SequenceMatcher(None, base_md, result).ratio()
print(f"Score for method {method}: {score_dict[method]}")
Expand Down
22 changes: 10 additions & 12 deletions libs/megaparse/src/megaparse/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,16 +89,16 @@ async def parse_file(
else:
raise HTTPModelNotSupported()

parser_config = ParseFileConfig(
method=method,
strategy=strategy,
model=model if model and check_table else None,
language=language,
parsing_instruction=parsing_instruction,
)
# parser_config = ParseFileConfig( #FIXME
# method=method,
# strategy=strategy,
# llm_model_name=SupportedModel(model_name) if model_name and check_table else None,
# language=language,
# parsing_instruction=parsing_instruction,
# )
try:
parser = parser_builder.build(parser_config)
megaparse = MegaParse(parser=parser)
# parser = parser_builder.build(parser_config)
megaparse = MegaParse()
if not file.filename:
raise HTTPFileNotFound("No filename provided")
_, extension = os.path.splitext(file.filename)
Expand Down Expand Up @@ -136,9 +136,7 @@ async def upload_url(
with tempfile.NamedTemporaryFile(delete=False, suffix="pdf") as temp_file:
temp_file.write(response.content)
try:
megaparse = MegaParse(
parser=UnstructuredParser(strategy=StrategyEnum.AUTO)
)
megaparse = MegaParse()
result = await megaparse.aload(temp_file.name)
return {"message": "File parsed successfully", "result": result}
except ParsingException:
Expand Down
17 changes: 12 additions & 5 deletions libs/megaparse/src/megaparse/configs/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ class TextDetConfig(BaseModel):


class AutoStrategyConfig(BaseModel):
auto_page_threshold: float = 0.6
auto_document_threshold: float = 0.2
page_threshold: float = 0.6
document_threshold: float = 0.2


class TextRecoConfig(BaseModel):
Expand All @@ -29,6 +29,14 @@ class DeviceEnum(str, Enum):
COREML = "coreml"


class DoctrConfig(BaseModel):
straighten_pages: bool = False
detect_orientation: bool = False
detect_language: bool = False
text_det_config: TextDetConfig = TextDetConfig()
text_reco_config: TextRecoConfig = TextRecoConfig()


class MegaParseConfig(BaseSettings):
"""
Configuration for Megaparse.
Expand All @@ -41,7 +49,6 @@ class MegaParseConfig(BaseSettings):
extra="ignore",
use_enum_values=True,
)
text_det_config: TextDetConfig = TextDetConfig()
text_reco_config: TextRecoConfig = TextRecoConfig()
auto_parse_config: AutoStrategyConfig = AutoStrategyConfig()
doctr_config: DoctrConfig = DoctrConfig()
auto_config: AutoStrategyConfig = AutoStrategyConfig()
device: DeviceEnum = DeviceEnum.CPU
17 changes: 8 additions & 9 deletions libs/megaparse/src/megaparse/examples/parse_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from megaparse.parser.doctr_parser import DoctrParser
from megaparse.parser.unstructured_parser import UnstructuredParser
from megaparse_sdk.schema.extensions import FileExtension
from megaparse_sdk.schema.parser_config import StrategyEnum
from pydantic import BaseModel, Field


Expand All @@ -22,18 +23,16 @@ class MyCustomFormat(BaseModel):
solution: str = Field(description="The solution statement.")


async def main():
# Parse a file
parser = DoctrParser()
model = ChatOpenAI(name="gpt-4o")
formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat)
def main():
# model = ChatOpenAI(name="gpt-4o")
# formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat)

megaparse = MegaParse(ocr_parser=parser, formatters=[formatter_1])
megaparse = MegaParse()

file_path = Path("./tests/pdf/sample_pdf.pdf")
result = await megaparse.aload(file_path=file_path)
file_path = Path("./tests/pdf/native/0168011.pdf")
result = megaparse.load(file_path=file_path)
print(result)


if __name__ == "__main__":
asyncio.run(main())
main()
Loading
Loading