Skip to content

Commit

Permalink
Merge pull request #595 from pipecat-ai/aleix/bot-speaking-system-frames
Browse files Browse the repository at this point in the history
bot speaking system frames
  • Loading branch information
aconchillo authored Oct 15, 2024
2 parents 79b52d4 + 4a71eac commit 0c4a513
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 82 deletions.
152 changes: 76 additions & 76 deletions src/pipecat/frames/frames.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,17 @@ def __str__(self):
return f"{self.name}(message: {self.message})"


@dataclass
class FunctionCallResultFrame(DataFrame):
    """A frame containing the result of an LLM function (tool) call.

    Carries the outcome of an application-side function handler back into
    the pipeline so the LLM context can be updated with it.
    """

    function_name: str  # name of the function the LLM asked to call
    tool_call_id: str  # id correlating this result with the originating call
    arguments: str  # the call's arguments (serialized; presumably JSON — TODO confirm)
    result: Any  # whatever the application's function handler returned
    run_llm: bool = True  # NOTE(review): presumably triggers another LLM inference after the result lands — confirm with consumers


#
# App frames. Application user-defined frames.
#
Expand Down Expand Up @@ -393,6 +404,25 @@ class StopInterruptionFrame(SystemFrame):
pass


@dataclass
class UserStartedSpeakingFrame(SystemFrame):
    """Signals that the VAD has detected the start of user speech.

    Useful for interruption handling, or whenever knowing *that* someone is
    speaking matters more than knowing *what* they are saying (the latter
    arrives later as a TranscriptionFrame).
    """


@dataclass
class UserStoppedSpeakingFrame(SystemFrame):
    """Signals that the VAD has detected the end of user speech."""


@dataclass
class BotInterruptionFrame(SystemFrame):
"""Emitted by when the bot should be interrupted. This will mainly cause the
Expand All @@ -404,6 +434,52 @@ class BotInterruptionFrame(SystemFrame):
pass


@dataclass
class BotStartedSpeakingFrame(SystemFrame):
    """Sent upstream by transport outputs when the bot begins speaking."""


@dataclass
class BotStoppedSpeakingFrame(SystemFrame):
    """Sent upstream by transport outputs when the bot finishes speaking."""


@dataclass
class BotSpeakingFrame(SystemFrame):
    """Sent upstream by transport outputs for as long as the bot keeps
    speaking.

    One use is idle detection: while the bot holds the floor we should not
    fire a user-idle timeout, since the user may simply be listening.
    """


@dataclass
class UserImageRequestFrame(SystemFrame):
    """A frame used to request an image from the given user.

    `context` may carry arbitrary requester-defined data that travels with
    the request.
    """

    user_id: str
    context: Optional[Any] = None

    def __str__(self):
        return "{}, user: {}".format(self.name, self.user_id)


@dataclass
class FunctionCallInProgressFrame(SystemFrame):
    """A frame signaling that a function call is in progress.

    Emitted when the LLM has requested a tool call and the application-side
    handler has started but not yet produced a result.
    """

    function_name: str  # name of the function being executed
    tool_call_id: str  # id correlating this in-progress marker with the call
    arguments: str  # the call's arguments (serialized; presumably JSON — TODO confirm)


@dataclass
class TransportMessageUrgentFrame(SystemFrame):
message: Any
Expand Down Expand Up @@ -457,51 +533,6 @@ class LLMFullResponseEndFrame(ControlFrame):
pass


@dataclass
class UserStartedSpeakingFrame(ControlFrame):
    """Signals that the VAD has detected the start of user speech.

    Useful for interruption handling, or whenever knowing *that* someone is
    speaking matters more than knowing *what* they are saying (the latter
    arrives later as a TranscriptionFrame).
    """


@dataclass
class UserStoppedSpeakingFrame(ControlFrame):
    """Signals that the VAD has detected the end of user speech."""


@dataclass
class BotStartedSpeakingFrame(ControlFrame):
    """Sent upstream by transport outputs when the bot begins speaking."""


@dataclass
class BotStoppedSpeakingFrame(ControlFrame):
    """Sent upstream by transport outputs when the bot finishes speaking."""


@dataclass
class BotSpeakingFrame(ControlFrame):
    """Sent upstream by transport outputs for as long as the bot keeps
    speaking.

    One use is idle detection: while the bot holds the floor we should not
    fire a user-idle timeout, since the user may simply be listening.
    """


@dataclass
class TTSStartedFrame(ControlFrame):
"""Used to indicate the beginning of a TTS response. Following
Expand All @@ -522,17 +553,6 @@ class TTSStoppedFrame(ControlFrame):
pass


@dataclass
class UserImageRequestFrame(ControlFrame):
    """A frame used to request an image from the given user.

    `context` may carry arbitrary requester-defined data that travels with
    the request.
    """

    user_id: str
    context: Optional[Any] = None

    def __str__(self):
        return "{}, user: {}".format(self.name, self.user_id)


@dataclass
class ServiceUpdateSettingsFrame(ControlFrame):
"""A control frame containing a request to update service settings."""
Expand All @@ -555,26 +575,6 @@ class STTUpdateSettingsFrame(ServiceUpdateSettingsFrame):
pass


@dataclass
class FunctionCallInProgressFrame(SystemFrame):
    """A frame signaling that a function call is in progress.

    Emitted when the LLM has requested a tool call and the application-side
    handler has started but not yet produced a result.
    """

    function_name: str  # name of the function being executed
    tool_call_id: str  # id correlating this in-progress marker with the call
    arguments: str  # the call's arguments (serialized; presumably JSON — TODO confirm)


@dataclass
class FunctionCallResultFrame(DataFrame):
    """A frame containing the result of an LLM function (tool) call.

    Carries the outcome of an application-side function handler back into
    the pipeline so the LLM context can be updated with it.
    """

    function_name: str  # name of the function the LLM asked to call
    tool_call_id: str  # id correlating this result with the originating call
    arguments: str  # the call's arguments (serialized; presumably JSON — TODO confirm)
    result: Any  # whatever the application's function handler returned
    run_llm: bool = True  # NOTE(review): presumably triggers another LLM inference after the result lands — confirm with consumers


@dataclass
class VADParamsUpdateFrame(ControlFrame):
"""A control frame containing a request to update VAD params. Intended
Expand Down
16 changes: 10 additions & 6 deletions src/pipecat/processors/frameworks/rtvi.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,14 +459,18 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):

await self.push_frame(frame, direction)

if isinstance(frame, TextFrame):
if isinstance(frame, UserStartedSpeakingFrame):
await self._push_aggregation()
elif isinstance(frame, TextFrame):
self._aggregation += frame.text
if match_endofsentence(self._aggregation):
message = RTVIBotTranscriptionMessage(
data=RTVITextMessageData(text=self._aggregation)
)
await self._push_transport_message_urgent(message)
self._aggregation = ""
await self._push_aggregation()

async def _push_aggregation(self):
    """Flush any aggregated bot-transcription text to the transport.

    Sends the buffered text as an urgent RTVI bot-transcription message and
    clears the buffer. No-op when nothing has been aggregated.
    """
    if not self._aggregation:
        return
    message = RTVIBotTranscriptionMessage(
        data=RTVITextMessageData(text=self._aggregation)
    )
    await self._push_transport_message_urgent(message)
    # Clear only after a successful send, matching the original ordering.
    self._aggregation = ""


class RTVIBotLLMProcessor(RTVIFrameProcessor):
Expand Down

0 comments on commit 0c4a513

Please sign in to comment.