diff --git a/README.md b/README.md
index 3c4ba64..c1990e5 100644
--- a/README.md
+++ b/README.md
@@ -136,7 +136,7 @@ You can play a WAV file when the wake word is detected (locally or remotely), an
 * `--awake-wav <WAV>` - played when the wake word is detected
 * `--done-wav <WAV>` - played when the voice command is finished
 
-If you want to play audio files other than WAV, use [event commands](#event-commands). Specifically, the `--detection-command` to replace `--awake-wav` and `--transcript-command` to replace `--done-wav`.
+If you want to play audio files other than WAV, use [event commands](#event-commands). Specifically, the `--transcribe-command` to replace `--awake-wav` and `--transcript-command` to replace `--done-wav`.
 
 ## Audio Enhancements
 
@@ -169,6 +169,7 @@ Satellites can respond to events from the server by running commands:
 * `--streaming-start-command` - audio has started streaming to server (no stdin)
 * `--streaming-stop-command` - audio has stopped streaming to server (no stdin)
 * `--detection-command` - wake word is detected (wake word name on stdin)
+* `--transcribe-command` - speech-to-text transcribe is started
 * `--transcript-command` - speech-to-text transcript is returned (text on stdin)
 * `--stt-start-command` - user started speaking (no stdin)
 * `--stt-stop-command` - user stopped speaking (no stdin)
diff --git a/wyoming_satellite/__main__.py b/wyoming_satellite/__main__.py
index ed29a6f..6ef43d6 100644
--- a/wyoming_satellite/__main__.py
+++ b/wyoming_satellite/__main__.py
@@ -177,6 +177,10 @@ async def main() -> None:
     parser.add_argument(
         "--detection-command", help="Command to run when wake word is detected"
     )
+    parser.add_argument(
+        "--transcribe-command",
+        help="Command to run when speech to text transcribe is started",
+    )
     parser.add_argument(
         "--transcript-command",
         help="Command to run when speech to text transcript is returned",
@@ -369,6 +373,7 @@ async def main() -> None:
             detect=split_command(args.detect_command),
             detection=split_command(args.detection_command),
             played=split_command(args.tts_played_command),
+            transcribe=split_command(args.transcribe_command),
             transcript=split_command(args.transcript_command),
             stt_start=split_command(args.stt_start_command),
             stt_stop=split_command(args.stt_stop_command),
diff --git a/wyoming_satellite/satellite.py b/wyoming_satellite/satellite.py
index 6102254..3bf6256 100644
--- a/wyoming_satellite/satellite.py
+++ b/wyoming_satellite/satellite.py
@@ -10,7 +10,7 @@
 from typing import Callable, Dict, Final, List, Optional, Set, Union
 
 from pyring_buffer import RingBuffer
-from wyoming.asr import Transcript
+from wyoming.asr import Transcribe, Transcript
 from wyoming.audio import AudioChunk, AudioFormat, AudioStart, AudioStop
 from wyoming.client import AsyncClient
 from wyoming.error import Error
@@ -240,6 +240,27 @@ async def _stop(self) -> None:
     async def stopped(self) -> None:
         """Called when satellite has stopped."""
 
+    def _debug_recording_start(self, writer: Optional[DebugAudioWriter]):
+        if writer is not None:
+            writer.start()
+
+    def _debug_recording_stop(self, writer: Optional[DebugAudioWriter]):
+        if writer is not None:
+            writer.stop()
+
+    def _debug_recording_write(
+        self,
+        writer: Optional[DebugAudioWriter],
+        event: Event,
+        audio_bytes: Optional[bytes],
+    ):
+        if writer is not None:
+            if audio_bytes is None:
+                chunk = AudioChunk.from_event(event)
+                audio_bytes = chunk.audio
+
+            writer.write(audio_bytes)
+
     async def event_from_server(self, event: Event) -> None:
         """Called when an event is received from the server."""
         if Ping.is_type(event.type):
@@ -278,16 +299,24 @@ async def event_from_server(self, event: Event) -> None:
         elif VoiceStopped.is_type(event.type):
             # STT stop
             await self.trigger_stt_stop()
+        elif Transcribe.is_type(event.type):
+            # STT start
+            self._debug_recording_start(self.stt_audio_writer)
+            await self.trigger_transcribe()
         elif Transcript.is_type(event.type):
             # STT text
+            self._debug_recording_stop(self.stt_audio_writer)
             _LOGGER.debug(event)
             await self.trigger_transcript(Transcript.from_event(event))
         elif Synthesize.is_type(event.type):
             # TTS request
             _LOGGER.debug(event)
             await self.trigger_synthesize(Synthesize.from_event(event))
+        elif PauseSatellite.is_type(event.type):
+            self._debug_recording_stop(self.stt_audio_writer)
         elif Error.is_type(event.type):
             _LOGGER.warning(event)
+            self._debug_recording_stop(self.stt_audio_writer)
             await self.trigger_error(Error.from_event(event))
 
         # Forward everything except audio to event service
@@ -813,16 +842,20 @@ async def trigger_detect(self) -> None:
     async def trigger_detection(self, detection: Detection) -> None:
         """Called when wake word is detected."""
         await run_event_command(self.settings.event.detection, detection.name)
-        await self._play_wav(
-            self.settings.snd.awake_wav,
-            mute_microphone=self.settings.mic.mute_during_awake_wav,
-        )
 
     async def trigger_played(self) -> None:
         """Called when audio stopped playing"""
         await run_event_command(self.settings.event.played)
         await self.forward_event(Played().event())
 
+    async def trigger_transcribe(self) -> None:
+        """Called when speech-to-text is started."""
+        await run_event_command(self.settings.event.transcribe)
+        await self._play_wav(
+            self.settings.snd.awake_wav,
+            mute_microphone=self.settings.mic.mute_during_awake_wav,
+        )
+
     async def trigger_transcript(self, transcript: Transcript) -> None:
         """Called when speech-to-text text is received."""
         await run_event_command(self.settings.event.transcript, transcript.text)
@@ -935,22 +968,13 @@ async def event_from_server(self, event: Event) -> None:
         elif PauseSatellite.is_type(event.type):
             self.is_streaming = False
             _LOGGER.info("Satellite paused")
-        elif Detection.is_type(event.type):
-            # Start debug recording
-            if self.stt_audio_writer is not None:
-                self.stt_audio_writer.start()
-        elif Transcript.is_type(event.type) or Error.is_type(event.type):
-            # Stop debug recording
-            if self.stt_audio_writer is not None:
-                self.stt_audio_writer.stop()
-
-            if Transcript.is_type(event.type):
-                # We're always streaming
-                _LOGGER.info("Streaming audio")
-
-                # Re-trigger streaming start even though we technically don't stop
-                # so the event service can reset LEDs, etc.
-                await self.trigger_streaming_start()
+        elif Transcript.is_type(event.type):
+            # We're always streaming
+            _LOGGER.info("Streaming audio")
+
+            # Re-trigger streaming start even though we technically don't stop
+            # so the event service can reset LEDs, etc.
+            await self.trigger_streaming_start()
 
     async def event_from_mic(
         self, event: Event, audio_bytes: Optional[bytes] = None
@@ -963,12 +987,7 @@ async def event_from_mic(
             await self.event_to_server(event)
 
         # Debug audio recording
-        if self.stt_audio_writer is not None:
-            if audio_bytes is None:
-                chunk = AudioChunk.from_event(event)
-                audio_bytes = chunk.audio
-
-            self.stt_audio_writer.write(audio_bytes)
+        self._debug_recording_write(self.stt_audio_writer, event, audio_bytes)
 
 
 # -----------------------------------------------------------------------------
@@ -1010,10 +1029,6 @@ async def event_from_server(self, event: Event) -> None:
         if RunSatellite.is_type(event.type):
             self._is_paused = False
             _LOGGER.info("Waiting for speech")
-        elif Detection.is_type(event.type):
-            # Start debug recording
-            if self.stt_audio_writer is not None:
-                self.stt_audio_writer.start()
         elif (
             Transcript.is_type(event.type)
             or Error.is_type(event.type)
@@ -1025,10 +1040,6 @@ async def event_from_server(self, event: Event) -> None:
 
             self.is_streaming = False
 
-            # Stop debug recording
-            if self.stt_audio_writer is not None:
-                self.stt_audio_writer.stop()
-
     async def event_from_mic(
         self, event: Event, audio_bytes: Optional[bytes] = None
     ) -> None:
@@ -1043,13 +1054,7 @@ async def event_from_mic(
         chunk: Optional[AudioChunk] = None
 
         # Debug audio recording
-        if self.stt_audio_writer is not None:
-            if audio_bytes is None:
-                # Need to unpack
-                chunk = AudioChunk.from_event(event)
-                audio_bytes = chunk.audio
-
-            self.stt_audio_writer.write(audio_bytes)
+        self._debug_recording_write(self.stt_audio_writer, event, audio_bytes)
 
         if (
             self.is_streaming
@@ -1060,9 +1065,7 @@ async def event_from_mic(
             self.is_streaming = False
             self.timeout_seconds = None
 
-            # Stop debug recording
-            if self.stt_audio_writer is not None:
-                self.stt_audio_writer.stop()
+            self._debug_recording_stop(self.stt_audio_writer)
 
             # Stop pipeline
             await self.event_to_server(AudioStop().event())
@@ -1162,6 +1165,11 @@ def __init__(self, settings: SatelliteSettings) -> None:
         self._wake_info: Optional[Info] = None
         self._wake_info_ready = asyncio.Event()
 
+    def _debug_recording_start(self, writer: Optional[DebugAudioWriter]):
+        # Override the base method to set a timestamp
+        if writer is not None:
+            writer.start(timestamp=self._debug_recording_timestamp)
+
     async def event_from_server(self, event: Event) -> None:
         # Only check event types once
         is_run_satellite = False
@@ -1185,10 +1193,6 @@ async def event_from_server(self, event: Event) -> None:
             # play the "done" WAV.
             self.is_streaming = False
 
-            # Stop debug recording (stt)
-            if self.stt_audio_writer is not None:
-                self.stt_audio_writer.stop()
-
         await super().event_from_server(event)
 
         if is_run_satellite or is_transcript or is_error or is_pause_satellite:
@@ -1209,19 +1213,14 @@ async def event_from_server(self, event: Event) -> None:
 
             # Start debug recording (wake)
             self._debug_recording_timestamp = time.monotonic_ns()
-            if self.wake_audio_writer is not None:
-                self.wake_audio_writer.start(
-                    timestamp=self._debug_recording_timestamp
-                )
+            self._debug_recording_start(self.wake_audio_writer)
 
     async def trigger_server_disonnected(self) -> None:
         await super().trigger_server_disonnected()
 
         self.is_streaming = False
 
-        # Stop debug recording (stt)
-        if self.stt_audio_writer is not None:
-            self.stt_audio_writer.stop()
+        self._debug_recording_stop(self.stt_audio_writer)
 
         await self.trigger_streaming_stop()
 
@@ -1236,16 +1235,8 @@ async def event_from_mic(
             return
 
         # Debug audio recording
-        if (self.wake_audio_writer is not None) or (self.stt_audio_writer is not None):
-            if audio_bytes is None:
-                chunk = AudioChunk.from_event(event)
-                audio_bytes = chunk.audio
-
-            if self.wake_audio_writer is not None:
-                self.wake_audio_writer.write(audio_bytes)
-
-            if self.stt_audio_writer is not None:
-                self.stt_audio_writer.write(audio_bytes)
+        self._debug_recording_write(self.wake_audio_writer, event, audio_bytes)
+        self._debug_recording_write(self.stt_audio_writer, event, audio_bytes)
 
         if self.is_streaming:
             # Forward to server
@@ -1276,12 +1267,7 @@ async def event_from_wake(self, event: Event) -> None:
             return
 
         # Stop debug recording (wake)
-        if self.wake_audio_writer is not None:
-            self.wake_audio_writer.stop()
-
-        # Start debug recording (stt)
-        if self.stt_audio_writer is not None:
-            self.stt_audio_writer.start(timestamp=self._debug_recording_timestamp)
+        self._debug_recording_stop(self.wake_audio_writer)
 
         _LOGGER.debug(detection)
 
diff --git a/wyoming_satellite/settings.py b/wyoming_satellite/settings.py
index 903ff27..9d7fabc 100644
--- a/wyoming_satellite/settings.py
+++ b/wyoming_satellite/settings.py
@@ -161,6 +161,7 @@ class EventSettings(ServiceSettings):
     detect: Optional[List[str]] = None
     detection: Optional[List[str]] = None
     played: Optional[List[str]] = None
+    transcribe: Optional[List[str]] = None
     transcript: Optional[List[str]] = None
     stt_start: Optional[List[str]] = None
     stt_stop: Optional[List[str]] = None
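Example usage (illustrative sketch, not part of the patch): with the new `--transcribe-command`, the WAV cues can be replaced by any audio player, mirroring the README guidance above. The sound file paths and the `mpg123` player are placeholders; adjust the mic/snd commands for your hardware.

```sh
# Play an MP3 when speech-to-text starts (replaces --awake-wav) and another
# when the transcript is returned (replaces --done-wav).
script/run \
  --name 'my satellite' \
  --uri 'tcp://0.0.0.0:10700' \
  --mic-command 'arecord -r 16000 -c 1 -f S16_LE -t raw' \
  --snd-command 'aplay -r 22050 -c 1 -f S16_LE -t raw' \
  --transcribe-command 'mpg123 sounds/awake.mp3' \
  --transcript-command 'mpg123 sounds/done.mp3'
```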