Skip to content

Commit

Permalink
cleanup (#1512)
Browse files Browse the repository at this point in the history
* cleanup

* cleanup prompt mgmt

* up

* cleanup printout

* cleanup new parser logic, set vlm as default for all providers

* allow user to re-override
  • Loading branch information
emrgnt-cmplxty authored Oct 28, 2024
1 parent 2f674dd commit 080d8cb
Show file tree
Hide file tree
Showing 55 changed files with 737 additions and 1,255 deletions.
14 changes: 0 additions & 14 deletions .github/actions/run-script-zerox-tests/action.yml

This file was deleted.

3 changes: 0 additions & 3 deletions .github/workflows/r2r-full-integration-deep-dive-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,3 @@ jobs:

- name: Start R2R Full server
uses: ./.github/actions/start-r2r-full

- name: Run Test Zerox
uses: ./.github/actions/run-script-zerox-tests
6 changes: 0 additions & 6 deletions docs/cookbooks/graphrag.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,6 @@ excluded_parsers = ["mp4"]
semantic_similarity_threshold = 0.7
generation_config = { model = "openai/gpt-4o-mini" }

[ingestion.extra_parsers]
pdf = "zerox"

[database]
provider = "postgres"
batch_size = 256
Expand Down Expand Up @@ -204,9 +201,6 @@ max_characters = 1_024
combine_under_n_chars = 128
overlap = 256

[ingestion.extra_parsers]
pdf = "zerox"

[orchestration]
provider = "hatchet"
kg_creation_concurrency_lipmit = 32
Expand Down
4 changes: 2 additions & 2 deletions py/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,9 +195,9 @@
"AudioParser",
"DOCXParser",
"ImageParser",
"PDFParser",
"VLMPDFParser",
"BasicPDFParser",
"PDFParserUnstructured",
"PDFParserMarker",
"PPTParser",
# Structured parsers
"CSVParser",
Expand Down
7 changes: 2 additions & 5 deletions py/core/base/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,11 @@
from abc import ABC, abstractmethod
from typing import AsyncGenerator, Generic, TypeVar

from ..abstractions import DataType

T = TypeVar("T")


class AsyncParser(ABC, Generic[T]):

@abstractmethod
async def ingest(
self, data: T, **kwargs
) -> AsyncGenerator[DataType, None]:
async def ingest(self, data: T, **kwargs) -> AsyncGenerator[str, None]:
pass
28 changes: 26 additions & 2 deletions py/core/base/providers/ingestion.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import logging
from abc import ABC
from enum import Enum
from typing import Optional

from core.base.abstractions import ChunkEnrichmentSettings

from .base import Provider, ProviderConfig
from .database import DatabaseProvider
from .llm import CompletionProvider

logger = logging.getLogger()

Expand All @@ -15,7 +18,14 @@ class IngestionConfig(ProviderConfig):
chunk_enrichment_settings: ChunkEnrichmentSettings = (
ChunkEnrichmentSettings()
)
extra_parsers: dict[str, str] = {}

audio_transcription_model: str

vision_img_prompt_name: Optional[str] = None
vision_img_model: str

vision_pdf_prompt_name: Optional[str] = None
vision_pdf_model: str

@property
def supported_providers(self) -> list[str]:
Expand All @@ -27,7 +37,21 @@ def validate_config(self) -> None:


class IngestionProvider(Provider, ABC):
pass

config: IngestionConfig
database_provider: DatabaseProvider
llm_provider: CompletionProvider

def __init__(
self,
config: IngestionConfig,
database_provider: DatabaseProvider,
llm_provider: CompletionProvider,
):
super().__init__(config)
self.config: IngestionConfig = config
self.llm_provider = llm_provider
self.database_provider = database_provider


class ChunkingStrategy(str, Enum):
Expand Down
2 changes: 2 additions & 0 deletions py/core/base/providers/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ async def aget_completion(
"generation_config": generation_config,
"kwargs": kwargs,
}
if modalities := kwargs.get("modalities"):
task["modalities"] = modalities
response = await self._execute_with_backoff_async(task)
return LLMChatCompletion(**response.dict())

Expand Down
3 changes: 0 additions & 3 deletions py/core/configs/full.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,6 @@ max_characters = 1_024
combine_under_n_chars = 128
overlap = 256

[ingestion.extra_parsers]
pdf = "zerox"

[orchestration]
provider = "hatchet"
kg_creation_concurrency_lipmit = 32
Expand Down
15 changes: 0 additions & 15 deletions py/core/examples/scripts/run_ingest_with_zerox.py

This file was deleted.

40 changes: 27 additions & 13 deletions py/core/main/assembly/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,13 @@ def create_crypto_provider(

@staticmethod
def create_ingestion_provider(
ingestion_config: IngestionConfig, *args, **kwargs
ingestion_config: IngestionConfig,
database_provider: PostgresDBProvider,
llm_provider: Union[
LiteLLMCompletionProvider, OpenAICompletionProvider
],
*args,
**kwargs,
) -> Union[R2RIngestionProvider, UnstructuredIngestionProvider]:

config_dict = (
Expand All @@ -104,7 +110,9 @@ def create_ingestion_provider(
r2r_ingestion_config = R2RIngestionConfig(
**config_dict, **extra_fields
)
return R2RIngestionProvider(r2r_ingestion_config)
return R2RIngestionProvider(
r2r_ingestion_config, database_provider, llm_provider
)
elif config_dict["provider"] in [
"unstructured_local",
"unstructured_api",
Expand All @@ -114,7 +122,7 @@ def create_ingestion_provider(
)

return UnstructuredIngestionProvider(
unstructured_ingestion_config,
unstructured_ingestion_config, database_provider, llm_provider
)
else:
raise ValueError(
Expand Down Expand Up @@ -217,10 +225,12 @@ def create_llm_provider(
@staticmethod
async def create_email_provider(
email_config: Optional[EmailConfig] = None, *args, **kwargs
) -> Optional[Union[AsyncSMTPEmailProvider, ConsoleMockEmailProvider]]:
) -> Union[AsyncSMTPEmailProvider, ConsoleMockEmailProvider]:
"""Creates an email provider based on configuration."""
if not email_config:
return None
raise ValueError(
f"No email configuration provided for email provider, please add `[email]` to your `r2r.toml`."
)

if email_config.provider == "smtp":
return AsyncSMTPEmailProvider(email_config)
Expand Down Expand Up @@ -263,28 +273,32 @@ async def create_providers(
self.config.embedding, *args, **kwargs
)
)
ingestion_provider = (
ingestion_provider_override
or self.create_ingestion_provider(
self.config.ingestion, *args, **kwargs
)
)

llm_provider = llm_provider_override or self.create_llm_provider(
self.config.completion, *args, **kwargs
)

crypto_provider = (
crypto_provider_override
or self.create_crypto_provider(self.config.crypto, *args, **kwargs)
)

database_provider = (
database_provider_override
or await self.create_database_provider(
self.config.database, crypto_provider, *args, **kwargs
)
)

ingestion_provider = (
ingestion_provider_override
or self.create_ingestion_provider(
self.config.ingestion,
database_provider,
llm_provider,
*args,
**kwargs,
)
)

email_provider = (
email_provider_override
or await self.create_email_provider(
Expand Down
5 changes: 3 additions & 2 deletions py/core/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
"AudioParser",
"DOCXParser",
"ImageParser",
"PDFParser",
"VLMPDFParser",
"BasicPDFParser",
"PDFParserUnstructured",
"PDFParserMarker",
"VLMPDFParser",
"PPTParser",
# Structured parsers
"CSVParser",
Expand Down
10 changes: 4 additions & 6 deletions py/core/parsers/media/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,18 @@
from .docx_parser import DOCXParser
from .img_parser import ImageParser
from .pdf_parser import ( # type: ignore
PDFParser,
PDFParserMarker,
BasicPDFParser,
PDFParserUnstructured,
ZeroxPDFParser,
VLMPDFParser,
)
from .ppt_parser import PPTParser

__all__ = [
"AudioParser",
"DOCXParser",
"ImageParser",
"PDFParser",
"VLMPDFParser",
"BasicPDFParser",
"PDFParserUnstructured",
"ZeroxPDFParser",
"PDFParserMarker",
"PPTParser",
]
82 changes: 64 additions & 18 deletions py/core/parsers/media/audio_parser.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,81 @@
import base64
import logging
import os
import tempfile
from typing import AsyncGenerator

from core.base.parsers.base_parser import AsyncParser
from core.parsers.media.openai_helpers import process_audio_with_openai
from core.base.providers import (
CompletionProvider,
DatabaseProvider,
IngestionConfig,
)

logger = logging.getLogger()


class AudioParser(AsyncParser[bytes]):
"""A parser for audio data."""
"""A parser for audio data using Whisper transcription."""

def __init__(
self, api_base: str = "https://api.openai.com/v1/audio/transcriptions"
self,
config: IngestionConfig,
database_provider: DatabaseProvider,
llm_provider: CompletionProvider,
):
self.api_base = api_base
self.openai_api_key = os.environ.get("OPENAI_API_KEY")
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config
try:
from litellm import atranscription

self.atranscription = atranscription
except ImportError:
logger.error("Failed to import LiteLLM transcription")
raise ImportError(
"Please install the `litellm` package to use the AudioParser."
)

async def ingest( # type: ignore
self, data: bytes, chunk_size: int = 1024, *args, **kwargs
self, data: bytes, **kwargs
) -> AsyncGenerator[str, None]:
"""Ingest audio data and yield a transcription."""
temp_audio_path = "temp_audio.wav"
with open(temp_audio_path, "wb") as f:
f.write(data)
"""
Ingest audio data and yield a transcription using Whisper via LiteLLM.
Args:
data: Raw audio bytes
chunk_size: Size of text chunks to yield
model: The model to use for transcription (default is whisper-1)
*args, **kwargs: Additional arguments passed to the transcription call
Yields:
Chunks of transcribed text
"""
try:
transcription_text = process_audio_with_openai(
open(temp_audio_path, "rb"), self.openai_api_key # type: ignore
# Create a temporary file to store the audio data
with tempfile.NamedTemporaryFile(
suffix=".wav", delete=False
) as temp_file:
temp_file.write(data)
temp_file_path = temp_file.name

# Call Whisper transcription
response = await self.atranscription(
model=self.config.audio_transcription_model,
file=open(temp_file_path, "rb"),
**kwargs,
)

# split text into small chunks and yield them
for i in range(0, len(transcription_text), chunk_size):
text = transcription_text[i : i + chunk_size]
if text and text != "":
yield text
# The response should contain the transcribed text directly
yield response.text

except Exception as e:
logger.error(f"Error processing audio with Whisper: {str(e)}")
raise

finally:
os.remove(temp_audio_path)
# Clean up the temporary file
try:
os.unlink(temp_file_path)
except Exception as e:
logger.warning(f"Failed to delete temporary file: {str(e)}")
Loading

0 comments on commit 080d8cb

Please sign in to comment.