From 680c3272a338b79b5d63600dcca861b4a77db67e Mon Sep 17 00:00:00 2001 From: emrgnt-cmplxty <68796651+emrgnt-cmplxty@users.noreply.github.com> Date: Tue, 29 Oct 2024 17:34:08 -0700 Subject: [PATCH] Patch/fix import bleed (#1527) * Feature/tweak actions (#1507) * up * tweak actions * Sync JS SDK, Harmonize Python SDK KG Methods (#1511) * Feature/move logging (#1492) * move logging provider out * move logging provider to own directory, remove singleton * cleanup * fix refactoring tweak (#1496) * Fix JSON serialization and Prompt ID Bugs for Prompts (#1491) * Bug in get prompts * Add tests * Prevent verbose logging on standup * Remove kg as required key in config, await get_all_prompts * Remove reference to fragment id * comment out ingestion * complete logging port (#1499) * Feature/dev rebased (#1500) * Feature/move logging (#1493) * move logging provider out * move logging provider to own directory, remove singleton * cleanup * Update js package (#1498) * fix refactoring tweak (#1496) * Fix JSON serialization and Prompt ID Bugs for Prompts (#1491) * Bug in get prompts * Add tests * Prevent verbose logging on standup * Remove kg as required key in config, await get_all_prompts * Remove reference to fragment id * comment out ingestion * complete logging port (#1499) --------- Co-authored-by: Nolan Tremelling <34580718+NolanTrem@users.noreply.github.com> * Fix handling for R2R exceptions (#1501) * fix doc test (#1502) * Harmonize python SDK KG methods for optional params, add missing JS methods --------- Co-authored-by: emrgnt-cmplxty <68796651+emrgnt-cmplxty@users.noreply.github.com> Co-authored-by: emrgnt-cmplxty * Clean up pagination and offset around KG (#1519) * Move to R2R light for integration testing (#1521) * fix ollama pdf parser --------- Co-authored-by: Nolan Tremelling <34580718+NolanTrem@users.noreply.github.com> --- py/core/configs/local_llm.toml | 7 +++++++ py/core/providers/ingestion/r2r/base.py | 24 +++++++++++------------- py/r2r.toml | 1 - 3 files 
changed, 18 insertions(+), 14 deletions(-) diff --git a/py/core/configs/local_llm.toml b/py/core/configs/local_llm.toml index d16bbe951..3c12fbd8d 100644 --- a/py/core/configs/local_llm.toml +++ b/py/core/configs/local_llm.toml @@ -27,3 +27,10 @@ concurrent_request_limit = 2 [orchestration] provider = "simple" + + +[ingestion] +vision_img_model = "ollama/llama3.2-vision" +vision_pdf_model = "ollama/llama3.2-vision" +[ingestion.extra_parsers] + pdf = "basic" diff --git a/py/core/providers/ingestion/r2r/base.py b/py/core/providers/ingestion/r2r/base.py index 5334ad3b0..98c064301 100644 --- a/py/core/providers/ingestion/r2r/base.py +++ b/py/core/providers/ingestion/r2r/base.py @@ -202,23 +202,21 @@ async def parse( # type: ignore else: t0 = time.time() contents = "" - parser_overrides = ingestion_config_override.get( - "parser_overrides", {} + + def check_vlm(model_name: str) -> bool: + return "gpt-4o" in model_name + + is_not_vlm = not check_vlm( + ingestion_config_override.get("vision_pdf_model") + or self.config.vision_pdf_model ) - if document.document_type.value in parser_overrides: + + if document.document_type == DocumentType.PDF and is_not_vlm: logger.info( - f"Using parser_override for {document.document_type} with input value {parser_overrides[document.document_type.value]}" + f"Reverting to basic PDF parser as the provided model is not a proper VLM." ) - # TODO - Cleanup this approach to be less hardcoded - if ( - document.document_type != DocumentType.PDF - or parser_overrides[DocumentType.PDF.value] != "zerox" - ) - raise ValueError( - "Only Zerox PDF parser override is available.
- ) async for text in self.parsers[ - f"zerox_{DocumentType.PDF.value}" + f"basic_{DocumentType.PDF.value}" ].ingest(file_content, **ingestion_config_override): contents += text + "\n" else: diff --git a/py/r2r.toml b/py/r2r.toml index c0d1ffad6..62a6dc355 100644 --- a/py/r2r.toml +++ b/py/r2r.toml @@ -15,7 +15,6 @@ require_email_verification = true default_admin_email = "admin@example.com" default_admin_password = "change_me_immediately" - [completion] provider = "litellm" concurrent_request_limit = 256