Merge pull request #832 from pipecat-ai/khk/gemini-live-function-calling

Gemini Multimodal Live function calling example
pipecat-ai · Dec 11, 2024 · 6c9f5a8 · 6c9f5a8
2 parents 7b040be + 027e360
commit 6c9f5a8
Show file tree

Hide file tree

Showing 2 changed files with 142 additions and 0 deletions.
diff --git a/examples/foundational/26b-gemini-multimodal-live-function-calling.py b/examples/foundational/26b-gemini-multimodal-live-function-calling.py
@@ -0,0 +1,142 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import asyncio
+import os
+import sys
+from datetime import datetime
+
+import aiohttp
+from dotenv import load_dotenv
+from loguru import logger
+from runner import configure
+
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.audio.vad.vad_analyzer import VADParams
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
+from pipecat.transports.services.daily import DailyParams, DailyTransport
+
+load_dotenv(override=True)
+
+logger.remove(0)
+logger.add(sys.stderr, level="DEBUG")
+
+
+async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback):
+    temperature = 75 if args["format"] == "fahrenheit" else 24
+    await result_callback(
+        {
+            "conditions": "nice",
+            "temperature": temperature,
+            "format": args["format"],
+            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
+        }
+    )
+
+
+tools = [
+    {
+        "function_declarations": [
+            {
+                "name": "get_current_weather",
+                "description": "Get the current weather",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "The city and state, e.g. San Francisco, CA",
+                        },
+                        "format": {
+                            "type": "string",
+                            "enum": ["celsius", "fahrenheit"],
+                            "description": "The temperature unit to use. Infer this from the users location.",
+                        },
+                    },
+                    "required": ["location", "format"],
+                },
+            },
+        ]
+    }
+]
+
+system_instruction = """
+You are a helpful assistant who can answer questions and use tools.
+
+You have a tool called "get_current_weather" that can be used to get the current weather. If the user asks
+for the weather, call this function.
+"""
+
+
+async def main():
+    async with aiohttp.ClientSession() as session:
+        (room_url, token) = await configure(session)
+
+        transport = DailyTransport(
+            room_url,
+            token,
+            "Respond bot",
+            DailyParams(
+                audio_in_sample_rate=16000,
+                audio_out_sample_rate=24000,
+                audio_out_enabled=True,
+                vad_enabled=True,
+                vad_audio_passthrough=True,
+                # set stop_secs to something roughly similar to the internal setting
+                # of the Multimodal Live api, just to align events. This doesn't really
+                # matter because we can only use the Multimodal Live API's phrase
+                # endpointing, for now.
+                vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
+            ),
+        )
+
+        llm = GeminiMultimodalLiveLLMService(
+            api_key=os.getenv("GOOGLE_API_KEY"),
+            system_instruction=system_instruction,
+            tools=tools,
+        )
+
+        llm.register_function("get_current_weather", fetch_weather_from_api)
+
+        context = OpenAILLMContext(
+            [{"role": "user", "content": "Say hello."}],
+        )
+        context_aggregator = llm.create_context_aggregator(context)
+
+        pipeline = Pipeline(
+            [
+                transport.input(),
+                context_aggregator.user(),
+                llm,
+                context_aggregator.assistant(),
+                transport.output(),
+            ]
+        )
+
+        task = PipelineTask(
+            pipeline,
+            PipelineParams(
+                allow_interruptions=True,
+                enable_metrics=True,
+                enable_usage_metrics=True,
+            ),
+        )
+
+        @transport.event_handler("on_first_participant_joined")
+        async def on_first_participant_joined(transport, participant):
+            await task.queue_frames([context_aggregator.user().get_context_frame()])
+
+        runner = PipelineRunner()
+
+        await runner.run(task)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/...ional/26b-gemini-multimodal-live-video.py → ...ional/26c-gemini-multimodal-live-video.py b/...ional/26b-gemini-multimodal-live-video.py → ...ional/26c-gemini-multimodal-live-video.py