Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

user image frames and other updates #112

Merged
merged 3 commits into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/foundational/02-llm-say-one-thing.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ async def main(room_url):
pipeline = Pipeline([llm, tts])

@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
await pipeline.queue_frames([LLMMessagesFrame(messages), EndFrame()])

await transport.run(pipeline)
Expand Down
2 changes: 1 addition & 1 deletion examples/foundational/03-still-frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ async def main(room_url):
pipeline = Pipeline([imagegen])

@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
# Note that we do not put an EndFrame() item in the pipeline for this demo.
# This means that the bot will stay in the channel until it times out.
# An EndFrame() in the pipeline would cause the transport to shut
Expand Down
8 changes: 4 additions & 4 deletions examples/foundational/05a-local-sync-speech-and-text.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import os
from dailyai.pipeline.aggregators import LLMFullResponseAggregator

from dailyai.pipeline.frames import AudioFrame, ImageFrame, LLMMessagesFrame, TextFrame
from dailyai.pipeline.frames import AudioFrame, URLImageFrame, LLMMessagesFrame, TextFrame
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.fal_ai_services import FalImageGenService
Expand Down Expand Up @@ -67,8 +67,7 @@ async def get_month_description(aggregator, frame):
return frame.text

async def get_month_data(month):
messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {
month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }]
messages = [{"role": "system", "content": f"Describe a nature photograph suitable for use in a calendar, for the month of {month}. Include only the image description with no preamble. Limit the description to one sentence, please.", }]

messages_frame = LLMMessagesFrame(messages)

Expand All @@ -95,6 +94,7 @@ async def get_month_data(month):
"text": image_description,
"image_url": image_data[0],
"image": image_data[1],
"image_size": image_data[2],
"audio": audio,
}

Expand All @@ -118,7 +118,7 @@ async def show_images():
if data:
await transport.send_queue.put(
[
ImageFrame(data["image_url"], data["image"]),
URLImageFrame(data["image_url"], data["image"], data["image_size"]),
AudioFrame(data["audio"]),
]
)
Expand Down
2 changes: 1 addition & 1 deletion examples/foundational/06-listen-and-respond.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ async def main(room_url: str, token):
)

@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
# Kick off the conversation.
messages.append(
{"role": "system", "content": "Please introduce yourself to the user."})
Expand Down
6 changes: 3 additions & 3 deletions examples/foundational/06a-image-sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ def __init__(self, speaking_path: str, waiting_path: str):
self._waiting_image_bytes = self._waiting_image.tobytes()

async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
yield ImageFrame(None, self._speaking_image_bytes)
yield ImageFrame(self._speaking_image_bytes, (1024, 1024))
yield frame
yield ImageFrame(None, self._waiting_image_bytes)
yield ImageFrame(self._waiting_image_bytes, (1024, 1024))


async def main(room_url: str, token):
Expand Down Expand Up @@ -85,7 +85,7 @@ async def main(room_url: str, token):
pipeline = Pipeline([image_sync_aggregator, tma_in, llm, tma_out, tts])

@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
await pipeline.queue_frames([TextFrame("Hi, I'm listening!")])

await transport.run(pipeline)
Expand Down
2 changes: 1 addition & 1 deletion examples/foundational/07-interruptible.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ async def main(room_url: str, token):
pipeline = Pipeline([FrameLogger(), llm, FrameLogger(), tts])

@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
await transport.say("Hi, I'm listening!", tts)

async def run_conversation():
Expand Down
4 changes: 2 additions & 2 deletions examples/foundational/08-bots-arguing.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ async def argue():
)
await transport.send_queue.put(
[
ImageFrame(None, image_data1[1]),
ImageFrame(image_data1[1], image_data1[2]),
AudioFrame(audio1),
]
)
Expand All @@ -134,7 +134,7 @@ async def argue():
)
await transport.send_queue.put(
[
ImageFrame(None, image_data2[1]),
ImageFrame(image_data2[1], image_data2[2]),
AudioFrame(audio2),
]
)
Expand Down
4 changes: 2 additions & 2 deletions examples/foundational/10-wake-word.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
sprites[file] = img.tobytes()

# When the bot isn't talking, show a static image of the cat listening
quiet_frame = ImageFrame("", sprites["sc-listen-1.png"])
quiet_frame = ImageFrame(sprites["sc-listen-1.png"], (720, 1280))
# When the bot is talking, build an animation from two sprites
talking_list = [sprites["sc-default.png"], sprites["sc-talk.png"]]
talking = [random.choice(talking_list) for x in range(30)]
Expand Down Expand Up @@ -165,7 +165,7 @@ async def main(room_url: str, token):
pipeline = Pipeline([isa, tf, ncf, tma_in, llm, tma_out, tts])

@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
await transport.say(
"Hi! If you want to talk to me, just say 'hey Santa Cat'.",
tts,
Expand Down
2 changes: 1 addition & 1 deletion examples/foundational/11-sound-effects.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ async def main(room_url: str, token):
pipeline = Pipeline([tma_in, in_sound, fl2, llm, tma_out, fl, tts, out_sound])

@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
await transport.say("Hi, I'm listening!", tts)
await transport.send_queue.put(AudioFrame(sounds["ding1.wav"]))

Expand Down
56 changes: 56 additions & 0 deletions examples/foundational/14-render-remote-participant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import asyncio
import io
import logging

from typing import AsyncGenerator

from PIL import Image

from dailyai.pipeline.aggregators import FrameProcessor

from dailyai.pipeline.frames import ImageFrame, Frame, UserImageFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.transports.daily_transport import DailyTransport

from runner import configure

from dotenv import load_dotenv
load_dotenv(override=True)

# Configure root logging for this example. The format string uses logging's
# %-style placeholders, so it must NOT be an f-string (the original `f` prefix
# was a no-op and a lint trap).
logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")
# Turn the framework logger up to DEBUG so frame traffic is visible.
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)


class UserImageProcessor(FrameProcessor):
    """Re-wraps a remote participant's video frames as camera frames.

    UserImageFrames (the remote participant's video) are converted into plain
    ImageFrames so the transport will render them on the bot's camera; every
    other frame passes through unchanged.
    """

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        # Debug-log instead of the original unconditional print(); this runs
        # once per frame, so stdout printing was pure debug leftover.
        logger.debug("%s", frame)
        if isinstance(frame, UserImageFrame):
            # Preserve the raw image bytes and (width, height) size from the
            # participant frame.
            yield ImageFrame(frame.image, frame.size)
        else:
            yield frame


async def main(room_url, token=None):
    """Join a Daily room and mirror the first other participant's video.

    Args:
        room_url: URL of the Daily room to join.
        token: optional Daily meeting token; defaults to None (no token).
            Previously `main` read a module-level global `token` that only
            existed because the __main__ guard assigned it first — it is now
            an explicit parameter.
    """
    transport = DailyTransport(
        room_url,
        token,
        "Render participant video",
        camera_width=1280,
        camera_height=720,
        camera_enabled=True,
        video_rendering_enabled=True,
    )

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport, participant):
        # Start receiving this participant's video so UserImageFrames flow
        # into the pipeline.
        transport.render_participant_video(participant["id"])

    pipeline = Pipeline([UserImageProcessor()])

    # Only one task to wait on, so await it directly instead of wrapping a
    # single awaitable in asyncio.gather().
    await transport.run(pipeline)


if __name__ == "__main__":
    (url, token) = configure()
    # Pass the token explicitly rather than relying on the accidental global.
    asyncio.run(main(url, token))
2 changes: 1 addition & 1 deletion examples/internal/11a-dial-out.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ async def main(room_url: str, token, phone):
tts = AzureTTSService()

@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
await tts.say("Hi, I'm listening!", transport.send_queue)
await transport.send_queue.put(AudioFrame(sounds["ding1.wav"]))

Expand Down
4 changes: 2 additions & 2 deletions examples/starter-apps/chatbot.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
flipped = sprites[::-1]
sprites.extend(flipped)
# When the bot isn't talking, show a static image of the cat listening
quiet_frame = ImageFrame("", sprites[0])
quiet_frame = ImageFrame(sprites[0], (1024, 576))
talking_frame = SpriteFrame(images=sprites)


Expand Down Expand Up @@ -127,7 +127,7 @@ async def main(room_url: str, token):
]

@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
print(f"!!! in here, pipeline.source is {pipeline.source}")
await pipeline.queue_frames([LLMMessagesFrame(messages)])

Expand Down
2 changes: 1 addition & 1 deletion examples/starter-apps/patient-intake.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ async def main(room_url: str, token):
pipeline = Pipeline(processors=[fl, llm, fl2, checklist, tts])

@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
await pipeline.queue_frames([OpenAILLMContextFrame(context)])

async def handle_intake():
Expand Down
12 changes: 6 additions & 6 deletions examples/starter-apps/storybot.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
1. Catch the frames that are generated by the LLM service
"""
if isinstance(frame, UserStoppedSpeakingFrame):
yield ImageFrame(None, images["grandma-writing.png"])
yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
yield AudioFrame(sounds["talking.wav"])

elif isinstance(frame, TextFrame):
Expand All @@ -112,7 +112,7 @@ async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:

self._text = self._text.replace("\n", " ")
if len(self._text) > 2:
yield ImageFrame(None, images["grandma-writing.png"])
yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
yield StoryStartFrame(self._text)
yield AudioFrame(sounds["ding3.wav"])
self._text = ""
Expand Down Expand Up @@ -146,11 +146,11 @@ async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
# last bit
pass
elif isinstance(frame, LLMResponseEndFrame):
yield ImageFrame(None, images["grandma-writing.png"])
yield ImageFrame(images["grandma-writing.png"], (1024, 1024))
yield StoryPromptFrame(self._text)
self._text = ""
yield frame
yield ImageFrame(None, images["grandma-listening.png"])
yield ImageFrame(images["grandma-listening.png"], (1024, 1024))
yield AudioFrame(sounds["listening.wav"])

else:
Expand Down Expand Up @@ -232,7 +232,7 @@ async def main(room_url: str, token):
start_story_event = asyncio.Event()

@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
async def on_first_other_participant_joined(transport, participant):
start_story_event.set()

async def storytime():
Expand All @@ -252,7 +252,7 @@ async def storytime():
[llm, lca, tts], sink=transport.send_queue)
await local_pipeline.queue_frames(
[
ImageFrame(None, images["grandma-listening.png"]),
ImageFrame(images["grandma-listening.png"], (1024, 1024)),
LLMMessagesFrame(intro_messages),
AudioFrame(sounds["listening.wav"]),
EndPipeFrame(),
Expand Down
2 changes: 1 addition & 1 deletion src/dailyai/pipeline/aggregators.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,7 @@ class GatedAggregator(FrameProcessor):
... start_open=False)
>>> asyncio.run(print_frames(aggregator, TextFrame("Hello")))
>>> asyncio.run(print_frames(aggregator, TextFrame("Hello again.")))
>>> asyncio.run(print_frames(aggregator, ImageFrame(url='', image=bytes([]))))
>>> asyncio.run(print_frames(aggregator, ImageFrame(image=bytes([]), size=(0, 0))))
ImageFrame
Hello
Hello again.
Expand Down
32 changes: 30 additions & 2 deletions src/dailyai/pipeline/frames.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,39 @@ def __str__(self):
class ImageFrame(Frame):
"""An image. Will be shown by the transport if the transport's camera is
enabled."""
url: str | None
image: bytes
size: tuple[int, int]

def __str__(self):
return f"{self.__class__.__name__}, image size: {self.size[0]}x{self.size[1]} buffer size: {len(self.image)} B"


@dataclass()
class URLImageFrame(ImageFrame):
    """An image together with the URL it was fetched from (or is addressable
    by). Will be shown by the transport if the transport's camera is
    enabled."""
    # Source URL of the image; None when no URL is associated.
    url: str | None

    def __init__(self, url, image, size):
        # Hand-written __init__ is kept by @dataclass (it does not overwrite a
        # user-defined __init__): the parent stores (image, size), url here.
        super().__init__(image, size)
        self.url = url

    def __str__(self):
        return f"{self.__class__.__name__}, url: {self.url}, image size: {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B"


@dataclass()
class UserImageFrame(ImageFrame):
    """An image associated with a specific user/participant. Will be shown by
    the transport if the transport's camera is enabled."""
    # Identifier of the participant this image belongs to.
    user_id: str

    def __init__(self, user_id, image, size):
        # Hand-written __init__ is kept by @dataclass (it does not overwrite a
        # user-defined __init__): the parent stores (image, size), user_id here.
        super().__init__(image, size)
        self.user_id = user_id

    def __str__(self):
        # Single return only — the stale duplicate return (which referenced a
        # nonexistent self.url and would raise AttributeError) was removed.
        return f"{self.__class__.__name__}, user: {self.user_id}, image size: {self.size[0]}x{self.size[1]}, buffer size: {len(self.image)} B"


@dataclass()
Expand Down
7 changes: 4 additions & 3 deletions src/dailyai/services/ai_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
TTSStartFrame,
TextFrame,
TranscriptionFrame,
URLImageFrame,
)

from abc import abstractmethod
Expand Down Expand Up @@ -87,16 +88,16 @@ def __init__(self, image_size, **kwargs):

# Renders the image. Returns an Image object.
@abstractmethod
async def run_image_gen(self, sentence: str) -> tuple[str, bytes]:
async def run_image_gen(self, sentence: str) -> tuple[str, bytes, tuple[int, int]]:
pass

async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if not isinstance(frame, TextFrame):
yield frame
return

(url, image_data) = await self.run_image_gen(frame.text)
yield ImageFrame(url, image_data)
(url, image_data, image_size) = await self.run_image_gen(frame.text)
yield URLImageFrame(url, image_data, image_size)


class STTService(AIService):
Expand Down
4 changes: 2 additions & 2 deletions src/dailyai/services/azure_ai_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def __init__(
self._model = model
self._aiohttp_session = aiohttp_session

async def run_image_gen(self, sentence) -> tuple[str, bytes]:
async def run_image_gen(self, sentence) -> tuple[str, bytes, tuple[int, int]]:
url = f"{self._azure_endpoint}openai/images/generations:submit?api-version={self._api_version}"
headers = {
"api-key": self._api_key,
Expand Down Expand Up @@ -146,4 +146,4 @@ async def run_image_gen(self, sentence) -> tuple[str, bytes]:
async with self._aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())
return (image_url, image.tobytes(), image.size)
4 changes: 2 additions & 2 deletions src/dailyai/services/fal_ai_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def __init__(
if key_secret:
os.environ["FAL_KEY_SECRET"] = key_secret

async def run_image_gen(self, sentence) -> tuple[str, bytes]:
async def run_image_gen(self, sentence) -> tuple[str, bytes, tuple[int, int]]:
def get_image_url(sentence, size):
handler = fal.apps.submit(
"110602490-fast-sdxl",
Expand All @@ -55,4 +55,4 @@ def get_image_url(sentence, size):
async with self._aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())
return (image_url, image.tobytes(), image.size)
Loading
Loading