From c82f6c0df9e2af19003b16c049a8d549c6d199b2 Mon Sep 17 00:00:00 2001 From: Mike Gray Date: Mon, 6 Nov 2023 00:08:13 -0600 Subject: [PATCH] add stt/tts, rearrange interface --- neon_iris/client.py | 3 +- neon_iris/web_client.py | 108 +++++++++++++++++++++------------- requirements/requirements.txt | 3 +- 3 files changed, 72 insertions(+), 42 deletions(-) diff --git a/neon_iris/client.py b/neon_iris/client.py index 1dd598b..4d08583 100644 --- a/neon_iris/client.py +++ b/neon_iris/client.py @@ -283,7 +283,8 @@ def _send_audio(self, audio_file: str, lang: str, audio_data = encode_file_to_base64_string(audio_file) message = self._build_message("neon.audio_input", {"lang": lang, - "audio_data": audio_data}, + "audio_data": audio_data, + "utterances": []}, username, user_profiles) serialized = {"msg_type": message.msg_type, "data": message.data, diff --git a/neon_iris/web_client.py b/neon_iris/web_client.py index 1cf930d..f7127f8 100644 --- a/neon_iris/web_client.py +++ b/neon_iris/web_client.py @@ -23,8 +23,9 @@ # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from os.path import isfile -from typing import List +from os.path import isfile, join +from time import time +from typing import List, Optional import gradio @@ -32,11 +33,12 @@ from ovos_bus_client import Message from ovos_config import Configuration from ovos_utils import LOG - from neon_utils.file_utils import decode_base64_string_to_file +from ovos_utils.xdg_utils import xdg_data_home from neon_iris.client import NeonAIClient - +import librosa +import soundfile as sf class GradIOClient(NeonAIClient): def __init__(self, lang: str = None): @@ -45,6 +47,7 @@ def __init__(self, lang: str = None): NeonAIClient.__init__(self, config.get("MQ")) self._await_response = Event() self._response = None + self._current_tts = None self.lang = lang or self.config.get('default_lang') or \ self.config.get('languages', ['en-us'])[0] self.chat_ui = gradio.Blocks() @@ -72,39 +75,66 @@ def update_profile(self, stt_lang: str, tts_lang: str, tts_lang_2: str): from neon_utils.user_utils import apply_local_user_profile_updates apply_local_user_profile_updates(profile_update, self._user_config) + def send_audio(self, audio_file: str, lang: str = "en-us", + username: Optional[str] = None, + user_profiles: Optional[list] = None): + """ + @param audio_file: path to audio file to send to speech module + @param lang: language code associated with request + @param username: username associated with request + @param user_profiles: user profiles expecting a response + """ + audio_file = self.convert_audio(audio_file) + self._send_audio(audio_file, lang, username, user_profiles) + + def convert_audio(self, audio_file: str, target_sr=16000, target_channels=1, dtype='int16') -> str: + """ + @param audio_file: path to audio file to convert for speech model + @returns: path to converted audio file + """ + # Load the audio file + y, sr = librosa.load(audio_file, sr=None, mono=False) # Load without changing sample rate or channels + + # If the file has more than one channel, mix it down to one channel + if y.ndim > 1 and target_channels == 1: + y = librosa.to_mono(y) + + # Resample the audio to the target sample rate + y_resampled = librosa.resample(y, orig_sr=sr, target_sr=target_sr) + + # Ensure the audio array is in the correct format (int16 for 2-byte samples) + y_resampled = (y_resampled * (2**(8*2 - 1))).astype(dtype) + + output_path = join(join(xdg_data_home(), "iris", "stt"), f"{time()}.wav") + # Save the audio file with the new sample rate and sample width + sf.write(output_path, y_resampled, target_sr, format='WAV', subtype='PCM_16') + LOG.info(f"Converted audio file to {output_path}") + return output_path + def on_user_input(self, utterance: str, *args, **kwargs) -> str: """ Callback to handle textual user input @param utterance: String utterance submitted by the user @returns: String response from Neon (or "ERROR") """ - LOG.info(utterance) LOG.info(args) LOG.info(kwargs) self._await_response.clear() self._response = None - self.send_utterance(utterance, self.lang) + if utterance: + LOG.info(f"Sending utterance: {utterance}") + self.send_utterance(utterance, self.lang) + else: + LOG.info(f"Sending audio: {args[1]} with lang: {self.lang}") + self.send_audio(args[1], self.lang) self._await_response.wait(30) self._response = self._response or "ERROR" - LOG.info(f"Response={self._response}") + LOG.info(f"Got response={self._response}") return self._response - def on_user_speech(self, audio_path: str, *args, **kwargs) -> str: - """ - Callback to handle audio user input - @param audio_path: String path of recording created by the user - @returns: String response from Neon (or "ERROR") - """ - LOG.info(f"Received audio from user speech: {audio_path}") - LOG.info(args) - LOG.info(kwargs) - self._await_response.clear() - self._response = None - # self.send_utterance(utterance, self.lang) - # self._await_response.wait(30) - self._response = self._response or "NOT YET IMPLEMENTED" - LOG.info(f"Response={self._response}") - return self._response + def play_tts(self): + LOG.info(f"Playing most recent TTS file {self._current_tts}") + return self._current_tts def run(self): """ @@ -117,24 +147,28 @@ def run(self): address = self.config.get("server_address") or "0.0.0.0" port = self.config.get("server_port") or 7860 - audio_input = gradio.Audio(source="microphone", type="filepath", label="Talk to NEON") - audio_output = gradio.Audio(source="upload", type="filepath", label="NEON's response", interactive=False, autoplay=True) chatbot = gradio.Chatbot(label=description) textbox = gradio.Textbox(placeholder=placeholder) with self.chat_ui as blocks: # Define primary UI + audio_input = gradio.Audio(source="microphone", + type="filepath", + label="Talk to NEON", + auto_submit=True) gradio.ChatInterface(self.on_user_input, chatbot=chatbot, textbox=textbox, - additional_inputs=[audio_input, audio_output], - additional_inputs_accordion_name="Talk to NEON", + additional_inputs=[audio_input], title=title, retry_btn=None, - undo_btn=None) + undo_btn=None,) + tts_audio = gradio.Audio(autoplay=True, visible=True, + label="Neon's Response") + tts_button = gradio.Button("Play TTS") + tts_button.click(self.play_tts, + outputs=[tts_audio]) # Define settings UI - with gradio.Row(): - submit = gradio.Button("Update User Settings") with gradio.Row(): with gradio.Column(): stt_lang = gradio.Radio(label="Input Language", @@ -147,6 +181,7 @@ def run(self): choices=[None] + self.supported_languages, value=None) + submit = gradio.Button("Update User Settings") with gradio.Column(): # TODO: Unit settings pass @@ -158,7 +193,6 @@ def run(self): pass submit.click(self.update_profile, inputs=[stt_lang, tts_lang, tts_lang_2]) - audio_input.stop_recording(self.on_user_speech, inputs=[audio_input]) blocks.launch(server_name=address, server_port=port) def handle_klat_response(self, message: Message): @@ -177,10 +211,12 @@ def handle_klat_response(self, message: Message): for gender, data in response["audio"].items(): filepath = "/".join([self.audio_cache_dir] + response[gender].split('/')[-4:]) + # TODO: This only plays the most recent, so it doesn't support + # multiple languages + self._current_tts = filepath files.append(filepath) if not isfile(filepath): decode_base64_string_to_file(data, filepath) - self._play_tts_responses(filepath) self._response = "\n".join(sentences) self._await_response.set() @@ -224,11 +260,3 @@ def clear_media(self, message: Message): @param message: Message requesting media deletion """ pass - - def _play_tts_response(self, filepath: str): - """ - Play a single TTS response - @param filepath: Path to TTS audio file - """ - LOG.debug(f"Playing TTS file {filepath}") - # TODO: Figure out how to play audio in the web UI \ No newline at end of file diff --git a/requirements/requirements.txt b/requirements/requirements.txt index f8a89d2..832fb13 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -3,4 +3,5 @@ click-default-group~=1.2 neon-utils~=1.0 pyyaml>=5.4,<7.0.0 neon-mq-connector~=0.7,>=0.7.1a4 -ovos-bus-client~=0.0.3 \ No newline at end of file +ovos-bus-client~=0.0.3 +librosa~=0.10.1 \ No newline at end of file