Working vision example

pipecat-ai · Mar 19, 2024 · 0b4b63d · 0b4b63d
1 parent 6c9425d
commit 0b4b63d
Show file tree

Hide file tree

Showing 7 changed files with 34 additions and 21 deletions.
diff --git a/src/dailyai/pipeline/frames.py b/src/dailyai/pipeline/frames.py
@@ -187,14 +187,14 @@ class VideoImageFrame(Frame):
     participantId: str
     image: bytes
 
-    def __str__(self):
-        return f"{self.__class__.__name__}, participantId: {self.participantId}, image size: {len(self.image)} B"
+    # def __str__(self):
+    #     return f"{self.__class__.__name__}, participantId: {self.participantId}, image size: {len(self.image)} B"
 
 
 @dataclass()
 class VisionFrame(Frame):
     prompt: str
     image: bytes
 
-    def __str__(self):
-        return f"{self.__class__.__name__}, prompt: {self.prompt}, image size: {len(self.image)} B"
+    # def __str__(self):
+    #     return f"{self.__class__.__name__}, prompt: {self.prompt}, image size: {len(self.image)} B"
diff --git a/src/dailyai/services/ai_services.py b/src/dailyai/services/ai_services.py
@@ -148,6 +148,8 @@ async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
         if isinstance(frame, VisionFrame):
             async for frame in self.run_vision(frame.prompt, frame.image):
                 yield frame
+        else:
+            yield frame
 
 
 class FrameLogger(AIService):

diff --git a/src/dailyai/services/daily_transport_service.py b/src/dailyai/services/daily_transport_service.py
@@ -230,14 +230,12 @@ def _post_run(self):
         self.client.release()
 
     def _handle_video_frame(self, participant_id, video_frame):
-        # TODO-CB: What about multiple participants?
         if (not participant_id in self._participant_frame_times) or (time.time() > self._participant_frame_times[participant_id] + 1.0/self._receive_video_fps):
-            print(f"### sending frame now")
             self._participant_frame_times[participant_id] = time.time()
-            asyncio.run_coroutine_threadsafe(
+
+            future = asyncio.run_coroutine_threadsafe(
                 self.receive_queue.put(
-                    VideoImageFrame(participant_id, video_frame)), self._loop
-            )
+                    VideoImageFrame(participant_id, video_frame)), self._loop)
 
     def on_first_other_participant_joined(self):
         pass

diff --git a/src/dailyai/services/elevenlabs_ai_service.py b/src/dailyai/services/elevenlabs_ai_service.py
@@ -35,6 +35,7 @@ async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
             "xi-api-key": self._api_key,
             "Content-Type": "application/json",
         }
+
         async with self._aiohttp_session.post(
             url, json=payload, headers=headers, params=querystring
         ) as r:

diff --git a/src/dailyai/services/open_ai_services.py b/src/dailyai/services/open_ai_services.py
@@ -71,7 +71,24 @@ def __init__(
         self._client = AsyncOpenAI(api_key=api_key)
 
     async def run_vision(self, prompt: str, image: bytes):
-        base64_image = base64.b64encode(image).decode('utf-8')
+        IMAGE_WIDTH = image.width
+        IMAGE_HEIGHT = image.height
+        COLOR_FORMAT = image.color_format
+        a_image = Image.frombytes(
+            'RGBA', (IMAGE_WIDTH, IMAGE_HEIGHT), image.buffer)
+        new_image = a_image.convert('RGB')
+
+        # Uncomment these lines to write the converted RGB frame to image.jpg in the current working directory.
+        # current_path = os.getcwd()
+        # image_path = os.path.join(current_path, "image.jpg")
+        # new_image.save(image_path, format="JPEG")
+
+        jpeg_buffer = io.BytesIO()
+
+        new_image.save(jpeg_buffer, format='JPEG')
+
+        jpeg_bytes = jpeg_buffer.getvalue()
+        base64_image = base64.b64encode(jpeg_bytes).decode('utf-8')
         messages = [
             {
                 "role": "user",
@@ -94,5 +111,7 @@ async def run_vision(self, prompt: str, image: bytes):
             )
         )
         async for chunk in chunks:
-            print(f"!!! chunk: {chunk}")
-            yield TextFrame(chunk)
+            if len(chunk.choices) == 0:
+                continue
+            if chunk.choices[0].delta.content:
+                yield TextFrame(chunk.choices[0].delta.content)
diff --git a/src/examples/foundational/12-describe-video.py b/src/examples/foundational/12-describe-video.py
@@ -10,6 +10,7 @@
 from dailyai.services.daily_transport_service import DailyTransportService
 from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
 from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
+from dailyai.services.deepgram_ai_services import DeepgramTTSService
 from dailyai.services.ai_services import FrameLogger
 from dailyai.pipeline.aggregators import (
     LLMAssistantContextAggregator,
@@ -59,10 +60,7 @@ async def main(room_url: str, token):
         llm = OpenAILLMService(
             api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
             model="gpt-4-turbo-preview")
-        fl = FrameLogger("!!! before VIFP")
-        fl2 = FrameLogger("Outer")
-        fl3 = FrameLogger("### Before VS")
-        fl4 = FrameLogger("$$$ After VS")
+
         messages = [
             {
                 "role": "system",
@@ -80,13 +78,9 @@ async def main(room_url: str, token):
         vifp = VideoImageFrameProcessor()
         pipeline = Pipeline(
             processors=[
-                fl,
                 vifp,
-                fl3,
                 vs,
-                fl4,
                 llm,
-                fl2,
                 tts,
                 tma_out,
             ],

diff --git a/src/examples/starter-apps/chatbot.py b/src/examples/starter-apps/chatbot.py
@@ -124,7 +124,6 @@ async def main(room_url: str, token):
 
         @transport.event_handler("on_first_other_participant_joined")
         async def on_first_other_participant_joined(transport):
-            print(f"!!! in here, pipeline.source is {pipeline.source}")
             await pipeline.queue_frames([LLMMessagesQueueFrame(messages)])
 
         async def run_conversation():