Merge pull request #150 from pipecat-ai/khk-gemini
Initial commit of Google Gemini LLM service.
aconchillo authored May 20, 2024
2 parents 6366ee0 + 7ffb10d commit bf036be
Showing 9 changed files with 499 additions and 35 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Added `google.generativeai` model support, including vision. This new `google` service defaults to using
`gemini-1.5-flash-latest`. Example in `examples/foundational/12a-describe-video-gemini-flash.py`.

- Added vision support to the `openai` service. Example in
  `examples/foundational/12b-describe-video-gpt-4o.py`.

## [Unreleased]

### Added

- Added initial interruptions support. The assistant contexts (or aggregators)
should now be placed after the output transport. This way, only the completed
spoken context is added to the assistant context.
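Not part of the commit itself, but a minimal sketch of how the `google` service described in the entries above slots into a vision pipeline. The instantiation mirrors the new `12a-describe-video-gemini-flash.py` example further down; transports, response aggregation, and TTS are omitted, and credentials are assumed to come from the environment, as the example does not pass an API key explicitly:

```python
# Minimal sketch, assuming GOOGLE_API_KEY is available in the environment
# (the example added in this commit does not pass a key explicitly).
from pipecat.pipeline.pipeline import Pipeline
from pipecat.processors.aggregators.vision_image_frame import VisionImageFrameAggregator
from pipecat.services.google import GoogleLLMService

# Same instantiation as in the new example; the model shown is also the default.
google = GoogleLLMService(model="gemini-1.5-flash-latest")

# Pairs an incoming text prompt with the next image frame before the LLM.
vision_aggregator = VisionImageFrameAggregator()

# Input/output transports, aggregators, and TTS (see the full example below) are omitted.
pipeline = Pipeline([vision_aggregator, google])
```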
2 changes: 1 addition & 1 deletion README.md
@@ -39,7 +39,7 @@ pip install "pipecat-ai[option,...]"

Your project may or may not need these, so they're made available as optional requirements. Here is a list:

- **AI services**: `anthropic`, `azure`, `fal`, `moondream`, `openai`, `playht`, `silero`, `whisper`
- **AI services**: `anthropic`, `azure`, `deepgram`, `google`, `fal`, `moondream`, `openai`, `playht`, `silero`, `whisper`
- **Transports**: `local`, `websocket`, `daily`

## Code examples
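For example, the new Gemini support is pulled in with `pip install "pipecat-ai[google]"`, following the same optional-extras pattern shown above.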
110 changes: 110 additions & 0 deletions examples/foundational/12a-describe-video-gemini-flash.py
@@ -0,0 +1,110 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio
import aiohttp
import os
import sys

from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.user_response import UserResponseAggregator
from pipecat.processors.aggregators.vision_image_frame import VisionImageFrameAggregator
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.google import GoogleLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer

from runner import configure

from loguru import logger

from dotenv import load_dotenv
load_dotenv(override=True)

logger.remove(0)
logger.add(sys.stderr, level="DEBUG")


class UserImageRequester(FrameProcessor):

def __init__(self, participant_id: str | None = None):
super().__init__()
self._participant_id = participant_id

def set_participant_id(self, participant_id: str):
self._participant_id = participant_id

async def process_frame(self, frame: Frame, direction: FrameDirection):
if self._participant_id and isinstance(frame, TextFrame):
await self.push_frame(UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM)
await self.push_frame(frame, direction)


async def main(room_url: str, token):
async with aiohttp.ClientSession() as session:
transport = DailyTransport(
room_url,
token,
"Describe participant video",
DailyParams(
audio_in_enabled=True, # This is so Silero VAD can get audio data
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
)

tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
)

user_response = UserResponseAggregator()

image_requester = UserImageRequester()

vision_aggregator = VisionImageFrameAggregator()

google = GoogleLLMService(model="gemini-1.5-flash-latest")

@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await tts.say("Hi there! Feel free to ask me what I see.")
transport.capture_participant_video(participant["id"], framerate=0)
transport.capture_participant_transcription(participant["id"])
image_requester.set_participant_id(participant["id"])

pipeline = Pipeline([
transport.input(),
user_response,
image_requester,
vision_aggregator,
google,
tts,
transport.output()
])

task = PipelineTask(pipeline)

runner = PipelineRunner()

await runner.run(task)

if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
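How the example flows: Daily transcription for the participant is aggregated into a `TextFrame` by `UserResponseAggregator`; `UserImageRequester` then pushes a `UserImageRequestFrame` upstream so the transport supplies a frame of that participant's video, and `VisionImageFrameAggregator` pairs the image with the text before passing both to `GoogleLLMService`, whose response is spoken by the ElevenLabs TTS service. The next example is the same pipeline with `OpenAILLMService` (`gpt-4o`) in place of the Google service.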
112 changes: 112 additions & 0 deletions examples/foundational/12b-describe-video-gpt-4o.py
@@ -0,0 +1,112 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio
import aiohttp
import os
import sys

from pipecat.frames.frames import Frame, TextFrame, UserImageRequestFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.user_response import UserResponseAggregator
from pipecat.processors.aggregators.vision_image_frame import VisionImageFrameAggregator
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVADAnalyzer

from runner import configure

from loguru import logger

from dotenv import load_dotenv
load_dotenv(override=True)

logger.remove(0)
logger.add(sys.stderr, level="DEBUG")


class UserImageRequester(FrameProcessor):

def __init__(self, participant_id: str | None = None):
super().__init__()
self._participant_id = participant_id

def set_participant_id(self, participant_id: str):
self._participant_id = participant_id

async def process_frame(self, frame: Frame, direction: FrameDirection):
if self._participant_id and isinstance(frame, TextFrame):
await self.push_frame(UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM)
await self.push_frame(frame, direction)


async def main(room_url: str, token):
async with aiohttp.ClientSession() as session:
transport = DailyTransport(
room_url,
token,
"Describe participant video",
DailyParams(
audio_in_enabled=True,  # This is so Silero VAD can get audio data (as in 12a)
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
)
)

tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
)

user_response = UserResponseAggregator()

image_requester = UserImageRequester()

vision_aggregator = VisionImageFrameAggregator()

openai = OpenAILLMService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o"
)

@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await tts.say("Hi there! Feel free to ask me what I see.")
transport.capture_participant_video(participant["id"], framerate=0)
transport.capture_participant_transcription(participant["id"])
image_requester.set_participant_id(participant["id"])

pipeline = Pipeline([
transport.input(),
user_response,
image_requester,
vision_aggregator,
openai,
tts,
transport.output()
])

task = PipelineTask(pipeline)

runner = PipelineRunner()

await runner.run(task)

if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))
