Skip to content

Commit

Permalink
Merge pull request #853 from pipecat-ai/revert-849-aleix/no-need-for-…
Browse files Browse the repository at this point in the history
…super-process-frame

Revert "no longer necessary to call super().process_frame(frame, direction)"
  • Loading branch information
aconchillo authored Dec 13, 2024
2 parents 10f854a + 6d11911 commit f8e69cf
Show file tree
Hide file tree
Showing 57 changed files with 212 additions and 56 deletions.
6 changes: 0 additions & 6 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
Tamil) and PlayHT (Afrikaans, Albanian, Amharic, Arabic, Bengali, Croatian,
Galician, Hebrew, Mandarin, Serbian, Tagalog, Urdu, Xhosa).

### Changed

- It's no longer necessary to call `super().process_frame(frame, direction)` if
you subclass and implement `FrameProcessor.process_frame()`. This is all now
done internally and will avoid possible issues if you forget to add it.

### Deprecated

- `AWSTTSService` is now deprecated, use `PollyTTSService` instead.
Expand Down
2 changes: 2 additions & 0 deletions examples/foundational/05-sync-speech-and-image.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ def __init__(self):
self.prepend_to_next_text_frame = False

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, MonthFrame):
self.most_recent_month = frame.month
elif self.prepend_to_next_text_frame and isinstance(frame, TextFrame):
Expand Down
6 changes: 6 additions & 0 deletions examples/foundational/05a-local-sync-speech-and-image.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ def __init__(self):
self.text = ""

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, TextFrame):
self.text = frame.text
await self.push_frame(frame, direction)
Expand All @@ -73,6 +75,8 @@ def __init__(self):
self.frame = None

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, TTSAudioRawFrame):
self.audio.extend(frame.audio)
self.frame = OutputAudioRawFrame(
Expand All @@ -86,6 +90,8 @@ def __init__(self):
self.frame = None

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, URLImageRawFrame):
self.frame = frame
await self.push_frame(frame, direction)
Expand Down
2 changes: 2 additions & 0 deletions examples/foundational/06a-image-sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ def __init__(self, speaking_path: str, waiting_path: str):
self._waiting_image_bytes = self._waiting_image.tobytes()

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if not isinstance(frame, SystemFrame) and direction == FrameDirection.DOWNSTREAM:
await self.push_frame(
OutputImageRawFrame(
Expand Down
5 changes: 5 additions & 0 deletions examples/foundational/07s-interruptible-google-audio-in.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ def __init__(self, context, user_context_aggregator):
self._user_speaking = False

async def process_frame(self, frame, direction):
await super().process_frame(frame, direction)

if isinstance(frame, TranscriptionFrame):
# We could gracefully handle both audio input and text/transcription input ...
# but let's leave that as an exercise to the reader. :-)
Expand Down Expand Up @@ -124,6 +126,7 @@ def reset(self):
self._accumulating_transcript = False

async def process_frame(self, frame, direction):
await super().process_frame(frame, direction)
if isinstance(frame, LLMFullResponseStartFrame):
self._processing_llm_response = True
self._accumulating_transcript = True
Expand Down Expand Up @@ -177,6 +180,8 @@ def add_transcript_back_to_inference_output(self):
self._context.messages[-1].parts[-1].text += f"\n\n{marker}\n{self._transcript}\n"

async def process_frame(self, frame, direction):
await super().process_frame(frame, direction)

if isinstance(frame, MagicDemoTranscriptionFrame):
self._transcript = frame.text
elif isinstance(frame, LLMFullResponseEndFrame) or isinstance(
Expand Down
2 changes: 2 additions & 0 deletions examples/foundational/09-mirror.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@

class MirrorProcessor(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, InputAudioRawFrame):
await self.push_frame(
OutputAudioRawFrame(
Expand Down
2 changes: 2 additions & 0 deletions examples/foundational/09a-local-mirror.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@

class MirrorProcessor(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, InputAudioRawFrame):
await self.push_frame(
OutputAudioRawFrame(
Expand Down
4 changes: 4 additions & 0 deletions examples/foundational/11-sound-effects.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@

class OutboundSoundEffectWrapper(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, LLMFullResponseEndFrame):
await self.push_frame(sounds["ding1.wav"])
# In case anything else downstream needs it
Expand All @@ -70,6 +72,8 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):

class InboundSoundEffectWrapper(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, OpenAILLMContextFrame):
await self.push_frame(sounds["ding2.wav"])
# In case anything else downstream needs it
Expand Down
2 changes: 2 additions & 0 deletions examples/foundational/12-describe-video.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def set_participant_id(self, participant_id: str):
self._participant_id = participant_id

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if self._participant_id and isinstance(frame, TextFrame):
await self.push_frame(
UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM
Expand Down
2 changes: 2 additions & 0 deletions examples/foundational/12a-describe-video-gemini-flash.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def set_participant_id(self, participant_id: str):
self._participant_id = participant_id

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if self._participant_id and isinstance(frame, TextFrame):
await self.push_frame(
UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM
Expand Down
2 changes: 2 additions & 0 deletions examples/foundational/12b-describe-video-gpt-4o.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def set_participant_id(self, participant_id: str):
self._participant_id = participant_id

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if self._participant_id and isinstance(frame, TextFrame):
await self.push_frame(
UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM
Expand Down
2 changes: 2 additions & 0 deletions examples/foundational/12c-describe-video-anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def set_participant_id(self, participant_id: str):
self._participant_id = participant_id

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if self._participant_id and isinstance(frame, TextFrame):
await self.push_frame(
UserImageRequestFrame(self._participant_id), FrameDirection.UPSTREAM
Expand Down
2 changes: 2 additions & 0 deletions examples/foundational/13-whisper-transcription.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@

class TranscriptionLogger(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, TranscriptionFrame):
print(f"Transcription: {frame.text}")

Expand Down
2 changes: 2 additions & 0 deletions examples/foundational/13a-whisper-local.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@

class TranscriptionLogger(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, TranscriptionFrame):
print(f"Transcription: {frame.text}")

Expand Down
2 changes: 2 additions & 0 deletions examples/foundational/13b-deepgram-transcription.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@

class TranscriptionLogger(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, TranscriptionFrame):
print(f"Transcription: {frame.text}")

Expand Down
2 changes: 2 additions & 0 deletions examples/foundational/13c-gladia-transcription.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@

class TranscriptionLogger(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, TranscriptionFrame):
print(f"Transcription: {frame.text}")

Expand Down
2 changes: 2 additions & 0 deletions examples/foundational/13d-assemblyai-transcription.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@

class TranscriptionLogger(FrameProcessor):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, TranscriptionFrame):
print(f"Transcription: {frame.text}")

Expand Down
4 changes: 4 additions & 0 deletions examples/foundational/22b-natural-conversation-proposal.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def __init__(self, notifier: BaseNotifier, **kwargs):
self._notifier = notifier

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
# We must not block system frames.
if isinstance(frame, SystemFrame):
await self.push_frame(frame, direction)
Expand Down Expand Up @@ -117,6 +118,7 @@ def __init__(self, notifier: BaseNotifier):
self._notifier = notifier

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, TextFrame) and frame.text == "YES":
logger.debug("Completeness check YES")
await self.push_frame(UserStoppedSpeakingFrame())
Expand All @@ -139,6 +141,8 @@ def open_gate(self):
self._gate_open = True

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

# We must not block system frames.
if isinstance(frame, SystemFrame):
if isinstance(frame, StartFrame):
Expand Down
41 changes: 23 additions & 18 deletions examples/foundational/22c-natural-conversation-mixed-llms.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,12 @@
Examples:
# Complete Wh-question
[{"role": "assistant", "content": "I can help you learn."},
[{"role": "assistant", "content": "I can help you learn."},
{"role": "user", "content": "What's the fastest way to learn Spanish"}]
Output: YES
# Complete Yes/No question despite STT error
[{"role": "assistant", "content": "I know about planets."},
[{"role": "assistant", "content": "I know about planets."},
{"role": "user", "content": "Is is Jupiter the biggest planet"}]
Output: YES
Expand All @@ -118,12 +118,12 @@
Examples:
# Direct instruction
[{"role": "assistant", "content": "I can explain many topics."},
[{"role": "assistant", "content": "I can explain many topics."},
{"role": "user", "content": "Tell me about black holes"}]
Output: YES
# Action demand
[{"role": "assistant", "content": "I can help with math."},
[{"role": "assistant", "content": "I can help with math."},
{"role": "user", "content": "Solve this equation x plus 5 equals 12"}]
Output: YES
Expand All @@ -134,12 +134,12 @@
Examples:
# Specific answer
[{"role": "assistant", "content": "What's your favorite color?"},
[{"role": "assistant", "content": "What's your favorite color?"},
{"role": "user", "content": "I really like blue"}]
Output: YES
# Option selection
[{"role": "assistant", "content": "Would you prefer morning or evening?"},
[{"role": "assistant", "content": "Would you prefer morning or evening?"},
{"role": "user", "content": "Morning"}]
Output: YES
Expand All @@ -153,17 +153,17 @@
Examples:
# Self-correction reaching completion
[{"role": "assistant", "content": "What would you like to know?"},
[{"role": "assistant", "content": "What would you like to know?"},
{"role": "user", "content": "Tell me about... no wait, explain how rainbows form"}]
Output: YES
# Topic change with complete thought
[{"role": "assistant", "content": "The weather is nice today."},
[{"role": "assistant", "content": "The weather is nice today."},
{"role": "user", "content": "Actually can you tell me who invented the telephone"}]
Output: YES
# Mid-sentence completion
[{"role": "assistant", "content": "Hello I'm ready."},
[{"role": "assistant", "content": "Hello I'm ready."},
{"role": "user", "content": "What's the capital of? France"}]
Output: YES
Expand All @@ -175,12 +175,12 @@
Examples:
# Acknowledgment
[{"role": "assistant", "content": "Should we talk about history?"},
[{"role": "assistant", "content": "Should we talk about history?"},
{"role": "user", "content": "Sure"}]
Output: YES
# Disagreement with completion
[{"role": "assistant", "content": "Is that what you meant?"},
[{"role": "assistant", "content": "Is that what you meant?"},
{"role": "user", "content": "No not really"}]
Output: YES
Expand All @@ -194,12 +194,12 @@
Examples:
# Word repetition but complete
[{"role": "assistant", "content": "I can help with that."},
[{"role": "assistant", "content": "I can help with that."},
{"role": "user", "content": "What what is the time right now"}]
Output: YES
# Missing punctuation but complete
[{"role": "assistant", "content": "I can explain that."},
[{"role": "assistant", "content": "I can explain that."},
{"role": "user", "content": "Please tell me how computers work"}]
Output: YES
Expand All @@ -211,12 +211,12 @@
Examples:
# Filler words but complete
[{"role": "assistant", "content": "What would you like to know?"},
[{"role": "assistant", "content": "What would you like to know?"},
{"role": "user", "content": "Um uh how do airplanes fly"}]
Output: YES
# Thinking pause but incomplete
[{"role": "assistant", "content": "I can explain anything."},
[{"role": "assistant", "content": "I can explain anything."},
{"role": "user", "content": "Well um I want to know about the"}]
Output: NO
Expand All @@ -241,17 +241,17 @@
Examples:
# Incomplete despite corrections
[{"role": "assistant", "content": "What would you like to know about?"},
[{"role": "assistant", "content": "What would you like to know about?"},
{"role": "user", "content": "Can you tell me about"}]
Output: NO
# Complete despite multiple artifacts
[{"role": "assistant", "content": "I can help you learn."},
[{"role": "assistant", "content": "I can help you learn."},
{"role": "user", "content": "How do you I mean what's the best way to learn programming"}]
Output: YES
# Trailing off incomplete
[{"role": "assistant", "content": "I can explain anything."},
[{"role": "assistant", "content": "I can explain anything."},
{"role": "user", "content": "I was wondering if you could tell me why"}]
Output: NO
"""
Expand All @@ -268,6 +268,7 @@ def __init__(self, notifier: BaseNotifier, **kwargs):
self._notifier = notifier

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
# We must not block system frames.
if isinstance(frame, SystemFrame):
await self.push_frame(frame, direction)
Expand Down Expand Up @@ -319,6 +320,8 @@ def __init__(self, notifier: BaseNotifier):
self._notifier = notifier

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, TextFrame) and frame.text == "YES":
logger.debug("!!! Completeness check YES")
await self.push_frame(UserStoppedSpeakingFrame())
Expand All @@ -341,6 +344,8 @@ def open_gate(self):
self._gate_open = True

async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

# We must not block system frames.
if isinstance(frame, SystemFrame):
if isinstance(frame, StartFrame):
Expand Down
Loading

0 comments on commit f8e69cf

Please sign in to comment.