Merge pull request #112 from daily-co/user-image-frame
user image frames and other updates
aconchillo authored Apr 9, 2024
2 parents 53930b4 + 57aabea commit 4f04b10
Showing 24 changed files with 170 additions and 49 deletions.
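
At a glance, the changes in this commit:

- `ImageFrame` now carries raw image bytes plus an explicit size instead of a URL and bytes.
- New `URLImageFrame` (image plus source URL) and `UserImageFrame` (image tagged with a `user_id`) subclasses in `src/dailyai/pipeline/frames.py`.
- `ImageGenService.run_image_gen` now returns `(url, image_bytes, image_size)`, and its `process_frame` yields a `URLImageFrame`; the Azure and Fal services are updated to match.
- `on_first_other_participant_joined` handlers now receive the joining `participant` as a second argument.
- A new example, `14-render-remote-participant.py`, renders a remote participant's video.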
2 changes: 1 addition & 1 deletion examples/foundational/02-llm-say-one-thing.py
@@ -48,7 +48,7 @@ async def main(room_url):
pipeline = Pipeline([llm, tts])

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
await pipeline.queue_frames([LLMMessagesFrame(messages), EndFrame()])

await transport.run(pipeline)
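This same signature change repeats across the examples below: the handler now receives the joining participant alongside the transport. A minimal sketch of the new form (assumes `transport`, `pipeline`, and `messages` are set up as in the surrounding example; the body is illustrative):

```python
from dailyai.pipeline.frames import EndFrame, LLMMessagesFrame

@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport, participant):
    # The handler now also gets the participant dict; video examples read
    # participant["id"] (see 14-render-remote-participant.py below).
    await pipeline.queue_frames([LLMMessagesFrame(messages), EndFrame()])
```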
2 changes: 1 addition & 1 deletion examples/foundational/03-still-frame.py
@@ -40,7 +40,7 @@ async def main(room_url):
pipeline = Pipeline([imagegen])

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
# Note that we do not put an EndFrame() item in the pipeline for this demo.
# This means that the bot will stay in the channel until it times out.
# An EndFrame() in the pipeline would cause the transport to shut
8 changes: 4 additions & 4 deletions examples/foundational/05a-local-sync-speech-and-text.py
@@ -5,7 +5,7 @@
import os
from dailyai.pipeline.aggregators import LLMFullResponseAggregator

-from dailyai.pipeline.frames import AudioFrame, ImageFrame, LLMMessagesFrame, TextFrame
+from dailyai.pipeline.frames import AudioFrame, URLImageFrame, LLMMessagesFrame, TextFrame
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.fal_ai_services import FalImageGenService
@@ -67,8 +67,7 @@ async def get_month_description(aggregator, frame):
return frame.text

async def get_month_data(month):
messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {
month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }]
messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }]

messages_frame = LLMMessagesFrame(messages)

@@ -95,6 +94,7 @@ async def get_month_data(month)
"text": image_description,
"image_url": image_data[0],
"image": image_data[1],
"image_size": image_data[2],
"audio": audio,
}

@@ -118,7 +118,7 @@ async def show_images():
if data:
await transport.send_queue.put(
[
-ImageFrame(data["image_url"], data["image"]),
+URLImageFrame(data["image_url"], data["image"], data["image_size"]),
AudioFrame(data["audio"]),
]
)
2 changes: 1 addition & 1 deletion examples/foundational/06-listen-and-respond.py
@@ -72,7 +72,7 @@ async def main(room_url: str, token):
)

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
# Kick off the conversation.
messages.append(
{"role": "system", "content": "Please introduce yourself to the user."})
6 changes: 3 additions & 3 deletions examples/foundational/06a-image-sync.py
@@ -35,9 +35,9 @@ def __init__(self, speaking_path: str, waiting_path: str):
self._waiting_image_bytes = self._waiting_image.tobytes()

async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
-yield ImageFrame(None, self._speaking_image_bytes)
+yield ImageFrame(self._speaking_image_bytes, (1024, 1024))
yield frame
-yield ImageFrame(None, self._waiting_image_bytes)
+yield ImageFrame(self._waiting_image_bytes, (1024, 1024))


async def main(room_url: str, token):
@@ -85,7 +85,7 @@ async def main(room_url: str, token):
pipeline = Pipeline([image_sync_aggregator, tma_in, llm, tma_out, tts])

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
await pipeline.queue_frames([TextFrame("Hi, I'm listening!")])

await transport.run(pipeline)
2 changes: 1 addition & 1 deletion examples/foundational/07-interruptible.py
@@ -50,7 +50,7 @@ async def main(room_url: str, token):
pipeline = Pipeline([FrameLogger(), llm, FrameLogger(), tts])

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
await transport.say("Hi, I'm listening!", tts)

async def run_conversation():
4 changes: 2 additions & 2 deletions examples/foundational/08-bots-arguing.py
@@ -122,7 +122,7 @@ async def argue():
)
await transport.send_queue.put(
[
-ImageFrame(None, image_data1[1]),
+ImageFrame(image_data1[1], image_data1[2]),
AudioFrame(audio1),
]
)
@@ -134,7 +134,7 @@ async def argue():
)
await transport.send_queue.put(
[
-ImageFrame(None, image_data2[1]),
+ImageFrame(image_data2[1], image_data2[2]),
AudioFrame(audio2),
]
)
4 changes: 2 additions & 2 deletions examples/foundational/10-wake-word.py
@@ -55,7 +55,7 @@
sprites[file] = img.tobytes()

# When the bot isn't talking, show a static image of the cat listening
quiet_frame = ImageFrame("", sprites["sc-listen-1.png"])
quiet_frame = ImageFrame(sprites["sc-listen-1.png"], (720, 1280))
# When the bot is talking, build an animation from two sprites
talking_list = [sprites["sc-default.png"], sprites["sc-talk.png"]]
talking = [random.choice(talking_list) for x in range(30)]
@@ -165,7 +165,7 @@ async def main(room_url: str, token):
pipeline = Pipeline([isa, tf, ncf, tma_in, llm, tma_out, tts])

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
await transport.say(
"Hi! If you want to talk to me, just say 'hey Santa Cat'.",
tts,
2 changes: 1 addition & 1 deletion examples/foundational/11-sound-effects.py
@@ -114,7 +114,7 @@ async def main(room_url: str, token):
pipeline = Pipeline([tma_in, in_sound, fl2, llm, tma_out, fl, tts, out_sound])

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
await transport.say("Hi, I'm listening!", tts)
await transport.send_queue.put(AudioFrame(sounds["ding1.wav"]))

56 changes: 56 additions & 0 deletions examples/foundational/14-render-remote-participant.py
@@ -0,0 +1,56 @@
import asyncio
import io
import logging

from typing import AsyncGenerator

from PIL import Image

from dailyai.pipeline.aggregators import FrameProcessor

from dailyai.pipeline.frames import ImageFrame, Frame, UserImageFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport

from runner import configure

from dotenv import load_dotenv
load_dotenv(override=True)

logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)


class UserImageProcessor(FrameProcessor):

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        print(frame)
        if isinstance(frame, UserImageFrame):
            yield ImageFrame(frame.image, frame.size)
        else:
            yield frame


async def main(room_url):
    transport = DailyTransport(
        room_url,
        token,
        "Render participant video",
        camera_width=1280,
        camera_height=720,
        camera_enabled=True,
        video_rendering_enabled=True
    )

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport, participant):
        transport.render_participant_video(participant["id"])

    pipeline = Pipeline([UserImageProcessor()])

    await asyncio.gather(transport.run(pipeline))

if __name__ == "__main__":
    (url, token) = configure()
    asyncio.run(main(url))
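For orientation, the flow in this new example: constructing the transport with `video_rendering_enabled=True` and calling `transport.render_participant_video(participant["id"])` when the first participant joins asks the transport to start producing `UserImageFrame`s (the new frame type this commit adds) from that participant's video; `UserImageProcessor` re-wraps each one as a plain `ImageFrame`, which the enabled camera output then displays. Note that `token` is read from module scope, where `configure()` sets it in the `__main__` block.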
2 changes: 1 addition & 1 deletion examples/internal/11a-dial-out.py
@@ -80,7 +80,7 @@ async def main(room_url: str, token, phone):
tts = AzureTTSService()

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
await tts.say("Hi, I'm listening!", transport.send_queue)
await transport.send_queue.put(AudioFrame(sounds["ding1.wav"]))

4 changes: 2 additions & 2 deletions examples/starter-apps/chatbot.py
@@ -48,7 +48,7 @@
flipped = sprites[::-1]
sprites.extend(flipped)
# When the bot isn't talking, show a static image of the cat listening
quiet_frame = ImageFrame("", sprites[0])
quiet_frame = ImageFrame(sprites[0], (1024, 576))
talking_frame = SpriteFrame(images=sprites)


@@ -127,7 +127,7 @@ async def main(room_url: str, token):
]

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
print(f"!!! in here, pipeline.source is {pipeline.source}")
await pipeline.queue_frames([LLMMessagesFrame(messages)])

2 changes: 1 addition & 1 deletion examples/starter-apps/patient-intake.py
@@ -330,7 +330,7 @@ async def main(room_url: str, token):
pipeline = Pipeline(processors=[fl, llm, fl2, checklist, tts])

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
await pipeline.queue_frames([OpenAILLMContextFrame(context)])

async def handle_intake():
12 changes: 6 additions & 6 deletions examples/starter-apps/storybot.py
@@ -99,7 +99,7 @@ async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
1. Catch the frames that are generated by the LLM service
"""
if isinstance(frame, UserStoppedSpeakingFrame):
-yield ImageFrame(None, images["grandma-writing.png"])
+yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
yield AudioFrame(sounds["talking.wav"])

elif isinstance(frame, TextFrame):
@@ -112,7 +112,7 @@ async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:

self._text = self._text.replace("\n", " ")
if len(self._text) > 2:
-yield ImageFrame(None, images["grandma-writing.png"])
+yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
yield StoryStartFrame(self._text)
yield AudioFrame(sounds["ding3.wav"])
self._text = ""
@@ -146,11 +146,11 @@ async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
# last bit
pass
elif isinstance(frame, LLMResponseEndFrame):
-yield ImageFrame(None, images["grandma-writing.png"])
+yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
yield StoryPromptFrame(self._text)
self._text = ""
yield frame
-yield ImageFrame(None, images["grandma-listening.png"])
+yield ImageFrame(images["grandma-listening.png"], (1024, 1024))
yield AudioFrame(sounds["listening.wav"])

else:
@@ -232,7 +232,7 @@ async def main(room_url: str, token):
start_story_event = asyncio.Event()

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
start_story_event.set()

async def storytime():
@@ -252,7 +252,7 @@ async def storytime():
[llm, lca, tts], sink=transport.send_queue)
await local_pipeline.queue_frames(
[
-ImageFrame(None, images["grandma-listening.png"]),
+ImageFrame(images["grandma-listening.png"], (1024, 1024)),
LLMMessagesFrame(intro_messages),
AudioFrame(sounds["listening.wav"]),
EndPipeFrame(),
2 changes: 1 addition & 1 deletion src/dailyai/pipeline/aggregators.py
@@ -360,7 +360,7 @@ class GatedAggregator(FrameProcessor):
... start_open=False)
>>> asyncio.run(print_frames(aggregator, TextFrame("Hello")))
>>> asyncio.run(print_frames(aggregator, TextFrame("Hello again.")))
>>> asyncio.run(print_frames(aggregator, ImageFrame(url='', image=bytes([]))))
>>> asyncio.run(print_frames(aggregator, ImageFrame(image=bytes([]), size=(0, 0))))
ImageFrame
Hello
Hello again.
32 changes: 30 additions & 2 deletions src/dailyai/pipeline/frames.py
@@ -70,11 +70,39 @@ def __str__(self):
class ImageFrame(Frame):
    """An image. Will be shown by the transport if the transport's camera is
    enabled."""
-    url: str | None
     image: bytes
+    size: tuple[int, int]

     def __str__(self):
-        return f"{self.__class__.__name__}, url: {self.url}, image size: {len(self.image)} B"
+        return f"{self.__class__.__name__}, image size: {self.size[0]}x{self.size[1]} buffer size: {len(self.image)} B"
+
+
+@dataclass()
+class URLImageFrame(ImageFrame):
+    """An image fetched from a URL. Will be shown by the transport if the
+    transport's camera is enabled."""
+    url: str | None
+
+    def __init__(self, url, image, size):
+        super().__init__(image, size)
+        self.url = url
+
+    def __str__(self):
+        return f"{self.__class__.__name__}, url: {self.url}, image size: {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B"
+
+
+@dataclass()
+class UserImageFrame(ImageFrame):
+    """An image associated with a user. Will be shown by the transport if the
+    transport's camera is enabled."""
+    user_id: str
+
+    def __init__(self, user_id, image, size):
+        super().__init__(image, size)
+        self.user_id = user_id
+
+    def __str__(self):
+        return f"{self.__class__.__name__}, user: {self.user_id}, image size: {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B"


@dataclass()
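Taken together, a quick sketch of the reworked constructors (the pixel buffer below is a placeholder; sizes follow PIL's `(width, height)` convention, which the services below get from `Image.size`):

```python
from dailyai.pipeline.frames import ImageFrame, URLImageFrame, UserImageFrame

pixels = bytes(1024 * 576 * 3)  # placeholder buffer standing in for raw RGB data

frame = ImageFrame(pixels, (1024, 576))
url_frame = URLImageFrame("https://example.com/img.png", pixels, (1024, 576))
user_frame = UserImageFrame("user-123", pixels, (1024, 576))  # hypothetical participant id

print(frame)  # ImageFrame, image size: 1024x576 buffer size: 1769472 B
```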
7 changes: 4 additions & 3 deletions src/dailyai/services/ai_services.py
@@ -14,6 +14,7 @@
TTSStartFrame,
TextFrame,
TranscriptionFrame,
+URLImageFrame,
)

from abc import abstractmethod
@@ -87,16 +88,16 @@ def __init__(self, image_size, **kwargs):

# Renders the image. Returns an Image object.
@abstractmethod
-async def run_image_gen(self, sentence: str) -> tuple[str, bytes]:
+async def run_image_gen(self, sentence: str) -> tuple[str, bytes, tuple[int, int]]:
pass

async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if not isinstance(frame, TextFrame):
yield frame
return

-(url, image_data) = await self.run_image_gen(frame.text)
-yield ImageFrame(url, image_data)
+(url, image_data, image_size) = await self.run_image_gen(frame.text)
+yield URLImageFrame(url, image_data, image_size)


class STTService(AIService):
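Under the new contract, a subclass supplies `run_image_gen` returning `(url, image_bytes, image_size)` and inherits the `URLImageFrame` plumbing from `process_frame` above. A minimal sketch, assuming a toy generator in place of a real backend (the URL below is a placeholder):

```python
from PIL import Image

from dailyai.services.ai_services import ImageGenService


class SolidColorImageGenService(ImageGenService):
    """Toy generator: ignores the prompt and returns a solid-color image."""

    async def run_image_gen(self, sentence: str) -> tuple[str, bytes, tuple[int, int]]:
        image = Image.new("RGB", (1024, 1024), "slateblue")
        # A real service downloads the generated image from its URL; this
        # placeholder URL just satisfies the (url, bytes, size) contract.
        return ("https://example.com/placeholder.png", image.tobytes(), image.size)
```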
4 changes: 2 additions & 2 deletions src/dailyai/services/azure_ai_services.py
@@ -105,7 +105,7 @@ def __init__(
self._model = model
self._aiohttp_session = aiohttp_session

-async def run_image_gen(self, sentence) -> tuple[str, bytes]:
+async def run_image_gen(self, sentence) -> tuple[str, bytes, tuple[int, int]]:
url = f"{self._azure_endpoint}openai/images/generations:submit?api-version={self._api_version}"
headers = {
"api-key": self._api_key,
@@ -146,4 +146,4 @@ async def run_image_gen(self, sentence) -> tuple[str, bytes]:
async with self._aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
-return (image_url, image.tobytes())
+return (image_url, image.tobytes(), image.size)
4 changes: 2 additions & 2 deletions src/dailyai/services/fal_ai_services.py
@@ -31,7 +31,7 @@ def __init__(
if key_secret:
os.environ["FAL_KEY_SECRET"] = key_secret

-async def run_image_gen(self, sentence) -> tuple[str, bytes]:
+async def run_image_gen(self, sentence) -> tuple[str, bytes, tuple[int, int]]:
def get_image_url(sentence, size):
handler = fal.apps.submit(
"110602490-fast-sdxl",
@@ -55,4 +55,4 @@ def get_image_url(sentence, size):
async with self._aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
-return (image_url, image.tobytes())
+return (image_url, image.tobytes(), image.size)
[5 more changed files not shown]
