Merge pull request #112 from daily-co/user-image-frame
user image frames and other updates
aconchillo authored Apr 9, 2024
2 parents 53930b4 + 57aabea commit 4f04b10
Showing 24 changed files with 170 additions and 49 deletions.
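
At a glance, the changes in this commit:

- `ImageFrame` now carries raw image bytes plus an explicit size instead of a URL and bytes.
- New `URLImageFrame` (image plus source URL) and `UserImageFrame` (image tagged with a `user_id`) subclasses in `src/dailyai/pipeline/frames.py`.
- `ImageGenService.run_image_gen` now returns `(url, image_bytes, image_size)`, and its `process_frame` yields a `URLImageFrame`; the Azure and Fal services are updated to match.
- `on_first_other_participant_joined` handlers now receive the joining `participant` as a second argument.
- A new example, `14-render-remote-participant.py`, renders a remote participant's video.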
2 changes: 1 addition & 1 deletion examples/foundational/02-llm-say-one-thing.py
@@ -48,7 +48,7 @@ async def main(room_url):
pipeline = Pipeline([llm, tts])

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
await pipeline.queue_frames([LLMMessagesFrame(messages), EndFrame()])

await transport.run(pipeline)
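This same signature change repeats across the examples below: the handler now receives the joining participant alongside the transport. A minimal sketch of the new form (assumes `transport`, `pipeline`, and `messages` are set up as in the surrounding example; the body is illustrative):

```python
from dailyai.pipeline.frames import EndFrame, LLMMessagesFrame

@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport, participant):
    # The handler now also gets the participant dict; video examples read
    # participant["id"] (see 14-render-remote-participant.py below).
    await pipeline.queue_frames([LLMMessagesFrame(messages), EndFrame()])
```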
2 changes: 1 addition & 1 deletion examples/foundational/03-still-frame.py
@@ -40,7 +40,7 @@ async def main(room_url):
pipeline = Pipeline([imagegen])

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
# Note that we do not put an EndFrame() item in the pipeline for this demo.
# This means that the bot will stay in the channel until it times out.
# An EndFrame() in the pipeline would cause the transport to shut
8 changes: 4 additions & 4 deletions examples/foundational/05a-local-sync-speech-and-text.py
@@ -5,7 +5,7 @@
import os
from dailyai.pipeline.aggregators import LLMFullResponseAggregator

-from dailyai.pipeline.frames import AudioFrame, ImageFrame, LLMMessagesFrame, TextFrame
+from dailyai.pipeline.frames import AudioFrame, URLImageFrame, LLMMessagesFrame, TextFrame
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.fal_ai_services import FalImageGenService
@@ -67,8 +67,7 @@ async def get_month_description(aggregator, frame):
return frame.text

async def get_month_data(month):
messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {
month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }]
messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }]

messages_frame = LLMMessagesFrame(messages)

@@ -95,6 +94,7 @@ async def get_month_data(month)
"text": image_description,
"image_url": image_data[0],
"image": image_data[1],
"image_size": image_data[2],
"audio": audio,
}

@@ -118,7 +118,7 @@ async def show_images():
if data:
await transport.send_queue.put(
[
-ImageFrame(data["image_url"], data["image"]),
+URLImageFrame(data["image_url"], data["image"], data["image_size"]),
AudioFrame(data["audio"]),
]
)
2 changes: 1 addition & 1 deletion examples/foundational/06-listen-and-respond.py
@@ -72,7 +72,7 @@ async def main(room_url: str, token):
)

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
# Kick off the conversation.
messages.append(
{"role": "system", "content": "Please introduce yourself to the user."})
6 changes: 3 additions & 3 deletions examples/foundational/06a-image-sync.py
@@ -35,9 +35,9 @@ def __init__(self, speaking_path: str, waiting_path: str):
self._waiting_image_bytes = self._waiting_image.tobytes()

async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
-yield ImageFrame(None, self._speaking_image_bytes)
+yield ImageFrame(self._speaking_image_bytes, (1024, 1024))
yield frame
-yield ImageFrame(None, self._waiting_image_bytes)
+yield ImageFrame(self._waiting_image_bytes, (1024, 1024))


async def main(room_url: str, token):
@@ -85,7 +85,7 @@ async def main(room_url: str, token):
pipeline = Pipeline([image_sync_aggregator, tma_in, llm, tma_out, tts])

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
await pipeline.queue_frames([TextFrame("Hi, I'm listening!")])

await transport.run(pipeline)
2 changes: 1 addition & 1 deletion examples/foundational/07-interruptible.py
@@ -50,7 +50,7 @@ async def main(room_url: str, token):
pipeline = Pipeline([FrameLogger(), llm, FrameLogger(), tts])

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
await transport.say("Hi, I'm listening!", tts)

async def run_conversation():
4 changes: 2 additions & 2 deletions examples/foundational/08-bots-arguing.py
@@ -122,7 +122,7 @@ async def argue():
)
await transport.send_queue.put(
[
-ImageFrame(None, image_data1[1]),
+ImageFrame(image_data1[1], image_data1[2]),
AudioFrame(audio1),
]
)
@@ -134,7 +134,7 @@ async def argue():
)
await transport.send_queue.put(
[
-ImageFrame(None, image_data2[1]),
+ImageFrame(image_data2[1], image_data2[2]),
AudioFrame(audio2),
]
)
4 changes: 2 additions & 2 deletions examples/foundational/10-wake-word.py
@@ -55,7 +55,7 @@
sprites[file] = img.tobytes()

# When the bot isn't talking, show a static image of the cat listening
quiet_frame = ImageFrame("", sprites["sc-listen-1.png"])
quiet_frame = ImageFrame(sprites["sc-listen-1.png"], (720, 1280))
# When the bot is talking, build an animation from two sprites
talking_list = [sprites["sc-default.png"], sprites["sc-talk.png"]]
talking = [random.choice(talking_list) for x in range(30)]
@@ -165,7 +165,7 @@ async def main(room_url: str, token):
pipeline = Pipeline([isa, tf, ncf, tma_in, llm, tma_out, tts])

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
await transport.say(
"Hi! If you want to talk to me, just say 'hey Santa Cat'.",
tts,
2 changes: 1 addition & 1 deletion examples/foundational/11-sound-effects.py
@@ -114,7 +114,7 @@ async def main(room_url: str, token):
pipeline = Pipeline([tma_in, in_sound, fl2, llm, tma_out, fl, tts, out_sound])

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
await transport.say("Hi, I'm listening!", tts)
await transport.send_queue.put(AudioFrame(sounds["ding1.wav"]))

56 changes: 56 additions & 0 deletions examples/foundational/14-render-remote-participant.py
@@ -0,0 +1,56 @@
import asyncio
import io
import logging

from typing import AsyncGenerator

from PIL import Image

from dailyai.pipeline.aggregators import FrameProcessor

from dailyai.pipeline.frames import ImageFrame, Frame, UserImageFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport

from runner import configure

from dotenv import load_dotenv
load_dotenv(override=True)

logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)


class UserImageProcessor(FrameProcessor):

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        print(frame)
        if isinstance(frame, UserImageFrame):
            yield ImageFrame(frame.image, frame.size)
        else:
            yield frame


async def main(room_url):
    transport = DailyTransport(
        room_url,
        token,
        "Render participant video",
        camera_width=1280,
        camera_height=720,
        camera_enabled=True,
        video_rendering_enabled=True
    )

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport, participant):
        transport.render_participant_video(participant["id"])

    pipeline = Pipeline([UserImageProcessor()])

    await asyncio.gather(transport.run(pipeline))

if __name__ == "__main__":
    (url, token) = configure()
    asyncio.run(main(url))
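For orientation, the flow in this new example: constructing the transport with `video_rendering_enabled=True` and calling `transport.render_participant_video(participant["id"])` when the first participant joins asks the transport to start producing `UserImageFrame`s (the new frame type this commit adds) from that participant's video; `UserImageProcessor` re-wraps each one as a plain `ImageFrame`, which the enabled camera output then displays. Note that `token` is read from module scope, where `configure()` sets it in the `__main__` block.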
2 changes: 1 addition & 1 deletion examples/internal/11a-dial-out.py
@@ -80,7 +80,7 @@ async def main(room_url: str, token, phone):
tts = AzureTTSService()

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
await tts.say("Hi, I'm listening!", transport.send_queue)
await transport.send_queue.put(AudioFrame(sounds["ding1.wav"]))

4 changes: 2 additions & 2 deletions examples/starter-apps/chatbot.py
@@ -48,7 +48,7 @@
flipped = sprites[::-1]
sprites.extend(flipped)
# When the bot isn't talking, show a static image of the cat listening
quiet_frame = ImageFrame("", sprites[0])
quiet_frame = ImageFrame(sprites[0], (1024, 576))
talking_frame = SpriteFrame(images=sprites)


@@ -127,7 +127,7 @@ async def main(room_url: str, token):
]

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
print(f"!!! in here, pipeline.source is {pipeline.source}")
await pipeline.queue_frames([LLMMessagesFrame(messages)])

2 changes: 1 addition & 1 deletion examples/starter-apps/patient-intake.py
@@ -330,7 +330,7 @@ async def main(room_url: str, token):
pipeline = Pipeline(processors=[fl, llm, fl2, checklist, tts])

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
await pipeline.queue_frames([OpenAILLMContextFrame(context)])

async def handle_intake():
12 changes: 6 additions & 6 deletions examples/starter-apps/storybot.py
@@ -99,7 +99,7 @@ async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
1. Catch the frames that are generated by the LLM service
"""
if isinstance(frame, UserStoppedSpeakingFrame):
-yield ImageFrame(None, images["grandma-writing.png"])
+yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
yield AudioFrame(sounds["talking.wav"])

elif isinstance(frame, TextFrame):
@@ -112,7 +112,7 @@ async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:

self._text = self._text.replace("\n", " ")
if len(self._text) > 2:
-yield ImageFrame(None, images["grandma-writing.png"])
+yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
yield StoryStartFrame(self._text)
yield AudioFrame(sounds["ding3.wav"])
self._text = ""
@@ -146,11 +146,11 @@ async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
# last bit
pass
elif isinstance(frame, LLMResponseEndFrame):
-yield ImageFrame(None, images["grandma-writing.png"])
+yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
yield StoryPromptFrame(self._text)
self._text = ""
yield frame
-yield ImageFrame(None, images["grandma-listening.png"])
+yield ImageFrame(images["grandma-listening.png"], (1024, 1024))
yield AudioFrame(sounds["listening.wav"])

else:
@@ -232,7 +232,7 @@ async def main(room_url: str, token):
start_story_event = asyncio.Event()

@transport.event_handler("on_first_other_participant_joined")
-async def on_first_other_participant_joined(transport):
+async def on_first_other_participant_joined(transport, participant):
start_story_event.set()

async def storytime():
@@ -252,7 +252,7 @@ async def storytime():
[llm, lca, tts], sink=transport.send_queue)
await local_pipeline.queue_frames(
[
-ImageFrame(None, images["grandma-listening.png"]),
+ImageFrame(images["grandma-listening.png"], (1024, 1024)),
LLMMessagesFrame(intro_messages),
AudioFrame(sounds["listening.wav"]),
EndPipeFrame(),
2 changes: 1 addition & 1 deletion src/dailyai/pipeline/aggregators.py
@@ -360,7 +360,7 @@ class GatedAggregator(FrameProcessor):
... start_open=False)
>>> asyncio.run(print_frames(aggregator, TextFrame("Hello")))
>>> asyncio.run(print_frames(aggregator, TextFrame("Hello again.")))
>>> asyncio.run(print_frames(aggregator, ImageFrame(url='', image=bytes([]))))
>>> asyncio.run(print_frames(aggregator, ImageFrame(image=bytes([]), size=(0, 0))))
ImageFrame
Hello
Hello again.
32 changes: 30 additions & 2 deletions src/dailyai/pipeline/frames.py
@@ -70,11 +70,39 @@ def __str__(self):
class ImageFrame(Frame):
    """An image. Will be shown by the transport if the transport's camera is
    enabled."""
-    url: str | None
     image: bytes
+    size: tuple[int, int]

     def __str__(self):
-        return f"{self.__class__.__name__}, url: {self.url}, image size: {len(self.image)} B"
+        return f"{self.__class__.__name__}, image size: {self.size[0]}x{self.size[1]} buffer size: {len(self.image)} B"
+
+
+@dataclass()
+class URLImageFrame(ImageFrame):
+    """An image fetched from a URL. Will be shown by the transport if the
+    transport's camera is enabled."""
+    url: str | None
+
+    def __init__(self, url, image, size):
+        super().__init__(image, size)
+        self.url = url
+
+    def __str__(self):
+        return f"{self.__class__.__name__}, url: {self.url}, image size: {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B"
+
+
+@dataclass()
+class UserImageFrame(ImageFrame):
+    """An image associated with a user. Will be shown by the transport if the
+    transport's camera is enabled."""
+    user_id: str
+
+    def __init__(self, user_id, image, size):
+        super().__init__(image, size)
+        self.user_id = user_id
+
+    def __str__(self):
+        return f"{self.__class__.__name__}, user: {self.user_id}, image size: {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B"


@dataclass()
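Taken together, a quick sketch of the reworked constructors (the pixel buffer below is a placeholder; sizes follow PIL's `(width, height)` convention, which the services below get from `Image.size`):

```python
from dailyai.pipeline.frames import ImageFrame, URLImageFrame, UserImageFrame

pixels = bytes(1024 * 576 * 3)  # placeholder buffer standing in for raw RGB data

frame = ImageFrame(pixels, (1024, 576))
url_frame = URLImageFrame("https://example.com/img.png", pixels, (1024, 576))
user_frame = UserImageFrame("user-123", pixels, (1024, 576))  # hypothetical participant id

print(frame)  # ImageFrame, image size: 1024x576 buffer size: 1769472 B
```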
7 changes: 4 additions & 3 deletions src/dailyai/services/ai_services.py
@@ -14,6 +14,7 @@
TTSStartFrame,
TextFrame,
TranscriptionFrame,
+URLImageFrame,
)

from abc import abstractmethod
@@ -87,16 +88,16 @@ def __init__(self, image_size, **kwargs):

# Renders the image. Returns an Image object.
@abstractmethod
-async def run_image_gen(self, sentence: str) -> tuple[str, bytes]:
+async def run_image_gen(self, sentence: str) -> tuple[str, bytes, tuple[int, int]]:
pass

async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if not isinstance(frame, TextFrame):
yield frame
return

-(url, image_data) = await self.run_image_gen(frame.text)
-yield ImageFrame(url, image_data)
+(url, image_data, image_size) = await self.run_image_gen(frame.text)
+yield URLImageFrame(url, image_data, image_size)


class STTService(AIService):
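Under the new contract, a subclass supplies `run_image_gen` returning `(url, image_bytes, image_size)` and inherits the `URLImageFrame` plumbing from `process_frame` above. A minimal sketch, assuming a toy generator in place of a real backend (the URL below is a placeholder):

```python
from PIL import Image

from dailyai.services.ai_services import ImageGenService


class SolidColorImageGenService(ImageGenService):
    """Toy generator: ignores the prompt and returns a solid-color image."""

    async def run_image_gen(self, sentence: str) -> tuple[str, bytes, tuple[int, int]]:
        image = Image.new("RGB", (1024, 1024), "slateblue")
        # A real service downloads the generated image from its URL; this
        # placeholder URL just satisfies the (url, bytes, size) contract.
        return ("https://example.com/placeholder.png", image.tobytes(), image.size)
```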
4 changes: 2 additions & 2 deletions src/dailyai/services/azure_ai_services.py
@@ -105,7 +105,7 @@ def __init__(
self._model = model
self._aiohttp_session = aiohttp_session

-async def run_image_gen(self, sentence) -> tuple[str, bytes]:
+async def run_image_gen(self, sentence) -> tuple[str, bytes, tuple[int, int]]:
url = f"{self._azure_endpoint}openai/images/generations:submit?api-version={self._api_version}"
headers = {
"api-key": self._api_key,
@@ -146,4 +146,4 @@ async def run_image_gen(self, sentence) -> tuple[str, bytes]:
async with self._aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
-return (image_url, image.tobytes())
+return (image_url, image.tobytes(), image.size)
4 changes: 2 additions & 2 deletions src/dailyai/services/fal_ai_services.py
@@ -31,7 +31,7 @@ def __init__(
if key_secret:
os.environ["FAL_KEY_SECRET"] = key_secret

-async def run_image_gen(self, sentence) -> tuple[str, bytes]:
+async def run_image_gen(self, sentence) -> tuple[str, bytes, tuple[int, int]]:
def get_image_url(sentence, size):
handler = fal.apps.submit(
"110602490-fast-sdxl",
@@ -55,4 +55,4 @@ def get_image_url(sentence, size):
async with self._aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
-return (image_url, image.tobytes())
+return (image_url, image.tobytes(), image.size)
[5 more changed files not shown]
