From 680c3272a338b79b5d63600dcca861b4a77db67e Mon Sep 17 00:00:00 2001 From: emrgnt-cmplxty <68796651+emrgnt-cmplxty@users.noreply.github.com> Date: Tue, 29 Oct 2024 17:34:08 -0700 Subject: [PATCH] Patch/fix import bleed (#1527) * Feature/tweak actions (#1507) * up * tweak actions * Sync JS SDK, Harmonize Python SDK KG Methods (#1511) * Feature/move logging (#1492) * move logging provider out * move logging provider to own directory, remove singleton * cleanup * fix refactoring tweak (#1496) * Fix JSON serialization and Prompt ID Bugs for Prompts (#1491) * Bug in get prompts * Add tests * Prevent verbose logging on standup * Remove kg as required key in config, await get_all_prompts * Remove reference to fragment id * comment out ingestion * complete logging port (#1499) * Feature/dev rebased (#1500) * Feature/move logging (#1493) * move logging provider out * move logging provider to own directory, remove singleton * cleanup * Update js package (#1498) * fix refactoring tweak (#1496) * Fix JSON serialization and Prompt ID Bugs for Prompts (#1491) * Bug in get prompts * Add tests * Prevent verbose logging on standup * Remove kg as required key in config, await get_all_prompts * Remove reference to fragment id * comment out ingestion * complete logging port (#1499) --------- Co-authored-by: Nolan Tremelling <34580718+NolanTrem@users.noreply.github.com> * Fix handling for R2R exceptions (#1501) * fix doc test (#1502) * Harmonize python SDK KG methods for optional params, add missing JS methods --------- Co-authored-by: emrgnt-cmplxty <68796651+emrgnt-cmplxty@users.noreply.github.com> Co-authored-by: emrgnt-cmplxty * Clean up pagination and offset around KG (#1519) * Move to R2R light for integration testing (#1521) * fix ollama pdf parser --------- Co-authored-by: Nolan Tremelling <34580718+NolanTrem@users.noreply.github.com> --- py/core/configs/local_llm.toml | 7 +++++++ py/core/providers/ingestion/r2r/base.py | 24 +++++++++++------------- py/r2r.toml | 1 - 3 files 
changed, 18 insertions(+), 14 deletions(-) diff --git a/py/core/configs/local_llm.toml b/py/core/configs/local_llm.toml index d16bbe951..3c12fbd8d 100644 --- a/py/core/configs/local_llm.toml +++ b/py/core/configs/local_llm.toml @@ -27,3 +27,10 @@ concurrent_request_limit = 2 [orchestration] provider = "simple" + + +[ingestion] +vision_img_model = "ollama/llama3.2-vision" +vision_pdf_model = "ollama/llama3.2-vision" +[ingestion.extra_parsers] + pdf = "basic" diff --git a/py/core/providers/ingestion/r2r/base.py b/py/core/providers/ingestion/r2r/base.py index 5334ad3b0..98c064301 100644 --- a/py/core/providers/ingestion/r2r/base.py +++ b/py/core/providers/ingestion/r2r/base.py @@ -202,23 +202,21 @@ async def parse( # type: ignore else: t0 = time.time() contents = "" - parser_overrides = ingestion_config_override.get( - "parser_overrides", {} + + def check_vlm(model_name: str) -> bool: + return "gpt-4o" in model_name + + is_not_vlm = not check_vlm( + ingestion_config_override.get("vision_pdf_model") + or self.config.vision_pdf_model ) - if document.document_type.value in parser_overrides: + + if document.document_type == DocumentType.PDF and is_not_vlm: logger.info( - f"Using parser_override for {document.document_type} with input value {parser_overrides[document.document_type.value]}" + f"Reverting to basic PDF parser as the provided model is not a proper VLM." ) - # TODO - Cleanup this approach to be less hardcoded - if ( - document.document_type != DocumentType.PDF - or parser_overrides[DocumentType.PDF.value] != "zerox" - ) - raise ValueError( - "Only Zerox PDF parser override is available.
- ) async for text in self.parsers[ - f"zerox_{DocumentType.PDF.value}" + f"basic_{DocumentType.PDF.value}" ].ingest(file_content, **ingestion_config_override): contents += text + "\n" else: diff --git a/py/r2r.toml b/py/r2r.toml index c0d1ffad6..62a6dc355 100644 --- a/py/r2r.toml +++ b/py/r2r.toml @@ -15,7 +15,6 @@ require_email_verification = true default_admin_email = "admin@example.com" default_admin_password = "change_me_immediately" - [completion] provider = "litellm" concurrent_request_limit = 256