From c53a7c0a11e783c8746c03d5d873ec884fd3e99d Mon Sep 17 00:00:00 2001 From: hello-amal Date: Mon, 15 Jul 2024 14:05:28 -0700 Subject: [PATCH] Operator-to-robot Text-to-Speech (#64) * [WIP] write a ROS node for TTS * [WIP] pyttsx3 mostly works but sounds awful * created abstract class to allow easy switching of engines * gTTS works and overrides work * [WIP] basic UI arrangement works * UI layout done * Play and Stop function providers work * Finished implementing function providers * Add scroll to too-big dropdown popups, check whether text exists as you type. * Update comments * Update requirements * Fixes from testing * Remove unnecessary logs * Auto-select all text on click * Trim whitespace before storing * Removed unnecessary logs * updated style of dropdown input to be consistent with the regular dropdown * Add launch arg for tts engine * Update color scheme --------- Co-authored-by: Vinitha Ranganeni --- CMakeLists.txt | 2 + launch/web_interface.launch.py | 16 + launch_interface.sh | 9 +- msg/TextToSpeech.msg | 16 + nodes/text_to_speech.py | 181 ++++++++++ requirements.txt | 7 + src/pages/operator/css/MovementRecorder.css | 5 + src/pages/operator/css/Operator.css | 4 +- src/pages/operator/css/TextToSpeech.css | 60 ++++ src/pages/operator/css/basic_components.css | 78 ++++- src/pages/operator/tsx/Operator.tsx | 22 ++ .../tsx/basic_components/Dropdown.tsx | 18 + .../tsx/basic_components/DropdownInput.tsx | 128 +++++++ .../tsx/default_layouts/SIMPLE_LAYOUT.tsx | 1 + .../TextToSpeechFunctionProvider.tsx | 42 +++ src/pages/operator/tsx/index.tsx | 5 + .../tsx/layout_components/CameraView.tsx | 1 - .../tsx/layout_components/TextToSpeech.tsx | 131 ++++++++ .../tsx/static_components/Sidebar.tsx | 11 + .../FirebaseStorageHandler.tsx | 33 ++ .../storage_handler/LocalStorageHandler.tsx | 30 ++ .../tsx/storage_handler/StorageHandler.tsx | 18 + .../tsx/utils/component_definitions.tsx | 1 + src/pages/robot/tsx/index.tsx | 11 + src/pages/robot/tsx/robot.tsx | 37 ++- src/shared/commands.tsx | 15 +- src/shared/remoterobot.tsx | 33 ++ .../text_to_speech_helpers.py | 311 ++++++++++++++++++ 28 files changed, 1209 insertions(+), 17 deletions(-) create mode 100644 msg/TextToSpeech.msg create mode 100755 nodes/text_to_speech.py create mode 100644 src/pages/operator/css/TextToSpeech.css create mode 100644 src/pages/operator/tsx/basic_components/DropdownInput.tsx create mode 100644 src/pages/operator/tsx/function_providers/TextToSpeechFunctionProvider.tsx create mode 100644 src/pages/operator/tsx/layout_components/TextToSpeech.tsx create mode 100644 stretch_web_teleop_helpers/text_to_speech_helpers.py diff --git a/CMakeLists.txt b/CMakeLists.txt index fd8beda7..15e92a44 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ find_package(rosidl_default_generators REQUIRED) ############################## rosidl_generate_interfaces(${PROJECT_NAME} "action/MoveToPregrasp.action" + "msg/TextToSpeech.msg" DEPENDENCIES geometry_msgs ) @@ -40,6 +41,7 @@ install(PROGRAMS nodes/configure_video_streams.py nodes/move_to_pregrasp.py nodes/navigation_camera.py + nodes/text_to_speech.py DESTINATION lib/${PROJECT_NAME} ) diff --git a/launch/web_interface.launch.py b/launch/web_interface.launch.py index 8f07da90..3b21051c 100644 --- a/launch/web_interface.launch.py +++ b/launch/web_interface.launch.py @@ -212,6 +212,11 @@ def generate_launch_description(): map_yaml = DeclareLaunchArgument( "map_yaml", description="filepath to previously captured map", default_value="" ) + tts_engine = DeclareLaunchArgument( + "tts_engine", + description="name of the TTS engine. Either pyttsx3 or gtts.", + default_value="gtts", + ) certfile_arg = DeclareLaunchArgument( "certfile", default_value=stretch_serial_no + "+6.pem" ) @@ -230,6 +235,7 @@ def generate_launch_description(): ld = LaunchDescription( [ map_yaml, + tts_engine, nav2_params_file_param, params_file, certfile_arg, @@ -531,4 +537,14 @@ def generate_launch_description(): ) ld.add_action(move_to_pregrasp_node) + # Text to speech + text_to_speech_node = Node( + package="stretch_web_teleop", + executable="text_to_speech.py", + output="screen", + arguments=[LaunchConfiguration("tts_engine")], + parameters=[], + ) + ld.add_action(text_to_speech_node) + return ld diff --git a/launch_interface.sh b/launch_interface.sh index 25c35524..b95b8580 100755 --- a/launch_interface.sh +++ b/launch_interface.sh @@ -7,6 +7,13 @@ if getopts ":m:" opt && [[ $opt == "m" && -f $OPTARG ]]; then MAP_ARG="map_yaml:=$OPTARG" fi +# Usage: ./launch_interface.sh -t pyttsx3 +TTS_ARG="" +if getopts ":t:" opt && [[ $opt == "t" ]]; then + echo "Setting tts engine..." + TTS_ARG="tts_engine:=$OPTARG" +fi + stretch_free_robot_process.py; ./stop_interface.sh sudo udevadm control --reload-rules && sudo udevadm trigger @@ -14,6 +21,6 @@ source /opt/ros/humble/setup.bash source ~/ament_ws/install/setup.bash source /usr/share/colcon_cd/function/colcon_cd.sh sleep 2; -screen -dm -S "web_teleop_ros" ros2 launch stretch_web_teleop web_interface.launch.py $MAP_ARG +screen -dm -S "web_teleop_ros" ros2 launch stretch_web_teleop web_interface.launch.py $MAP_ARG $TTS_ARG sleep 3; ~/ament_ws/src/stretch_web_teleop/start_web_server_and_robot_browser.sh diff --git a/msg/TextToSpeech.msg b/msg/TextToSpeech.msg new file mode 100644 index 00000000..e6f2205a --- /dev/null +++ b/msg/TextToSpeech.msg @@ -0,0 +1,16 @@ +# The text to say +string text + +# The voice to use. Valid options for this depend on the engine. +string voice + +# Whether to speak slow or not. +bool is_slow + +# If a message is already being spoken, this flag controls what to do with this message: +# add it to a queue to be executed sequentially (Default), or interrupt the +# current message and queue to speak this message (in this case, the old queue gets +# discarded). +uint8 OVERRIDE_BEHAVIOR_QUEUE = 0 +uint8 OVERRIDE_BEHAVIOR_INTERRUPT = 1 +uint8 override_behavior diff --git a/nodes/text_to_speech.py b/nodes/text_to_speech.py new file mode 100755 index 00000000..7ee2b90e --- /dev/null +++ b/nodes/text_to_speech.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 + +# Standard imports +import sys +import threading +from typing import List, Optional + +# Third-party imports +import rclpy +import sounddevice # suppress ALSA warnings # noqa: F401 +from rclpy.node import Node +from rclpy.qos import QoSProfile, ReliabilityPolicy + +# Local Imports +from stretch_web_teleop.msg import TextToSpeech +from stretch_web_teleop_helpers.text_to_speech_helpers import ( + GTTS, + PyTTSx3, + TextToSpeechEngine, + TextToSpeechEngineType, +) + + +class TextToSpeechNode(Node): + """ + The TextToSpeech node subscribes to a stream of text-to-speech commands + from a topic and executes them. + """ + + def __init__( + self, + engine_type: TextToSpeechEngineType = TextToSpeechEngineType.PYTTSX3, + rate_hz: float = 5.0, + ): + """ + Initialize the TextToSpeechNode. + + Parameters + ---------- + engine_type : TextToSpeechEngineType + The text-to-speech engine to use. + rate_hz : float + The rate at which to run the text-to-speech engine. + """ + # Initialize the node + super().__init__("text_to_speech") + + # Declare the attributes for the text-to-speech engine + self.engine_type = engine_type + self.engine: Optional[TextToSpeechEngine] = None + self.initialized = False + + # Declare the attributes for the run thread + self.rate_hz = rate_hz + self.queue: List[TextToSpeech] = [] + self.queue_lock = threading.Lock() + + # Create the subscription + self.create_subscription( + TextToSpeech, + "text_to_speech", + self.text_to_speech_callback, + QoSProfile(depth=1, reliability=ReliabilityPolicy.RELIABLE), + callback_group=rclpy.callback_groups.MutuallyExclusiveCallbackGroup(), + ) + + def initialize(self): + """ + Initialize the text-to-speech engine. + """ + if self.engine_type == TextToSpeechEngineType.PYTTSX3: + self.engine = PyTTSx3(self.get_logger()) + self.initialized = True + elif self.engine_type == TextToSpeechEngineType.GTTS: + self.engine = GTTS(self.get_logger()) + self.initialized = True + else: + self.get_logger().error(f"Unsupported text-to-speech {self.engine_type}") + + def text_to_speech_callback(self, msg: TextToSpeech): + """ + Callback for the text-to-speech topic. + + Parameters + ---------- + msg : TextToSpeech + The message containing the text to speak. + """ + self.get_logger().info(f"Received: {msg}") + # Interrupt if requested + if msg.override_behavior == TextToSpeech.OVERRIDE_BEHAVIOR_INTERRUPT: + if self.engine._can_say_async: + self.engine.stop() + with self.queue_lock: + self.queue.clear() + else: + self.get_logger().warn("Engine does not support interrupting speech") + + # Queue the text + if len(msg.text) > 0: + with self.queue_lock: + self.queue.append(msg) + + def run(self): + """ + Run the text-to-speech engine. + """ + rate = self.create_rate(self.rate_hz) + while rclpy.ok(): + # Sleep + rate.sleep() + + # Send a single queued utterance to the text-to-speech engine + if not self.engine.is_speaking(): + msg = None + with self.queue_lock: + if len(self.queue) > 0: + msg = self.queue.pop(0) + if msg is not None: + # Process the voice + if len(msg.voice) > 0: + if msg.voice != self.engine.voice_id: + self.engine.voice_id = msg.voice + + # Process the speed + if msg.is_slow != self.engine.is_slow: + self.engine.is_slow = msg.is_slow + + # Speak the text + if self.engine._can_say_async: + self.engine.say_async(msg.text) + else: + self.engine.say(msg.text) + self.get_logger().info(f"Saying: {msg.text}") + + +def main(): + # Check the arguments + tts_engine = sys.argv[1] + tts_engine = tts_engine.lower() + tts_engine_map = {e.name.lower(): e for e in TextToSpeechEngineType} + if tts_engine not in tts_engine_map: + print(f"Invalid text-to-speech engine: {tts_engine}") + print(f"Options: {list(tts_engine_map.keys())}") + print("Defaulting to gtts") + tts_engine = "gtts" + + rclpy.init() + + node = TextToSpeechNode( + engine_type=tts_engine_map[tts_engine], + ) + node.get_logger().info("Created!") + + # Spin in the background, as the node initializes + executor = rclpy.executors.MultiThreadedExecutor(num_threads=4) + spin_thread = threading.Thread( + target=rclpy.spin, + args=(node,), + kwargs={"executor": executor}, + daemon=True, + ) + spin_thread.start() + + # Run text-to-speech + try: + node.initialize() + node.get_logger().info("Running!") + node.run() + except KeyboardInterrupt: + pass + + # Spin in the foreground + spin_thread.join() + + node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt index ef061733..a065d358 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,14 @@ +gTTS loguru # numpy 1.23.2 is not required for the web teleop interface, but is required # for stretch_body. If we diden't include it here, pin would update # to the latest version of numpy, breaking stretch_body. numpy==1.23.2 pin +PyAudio==0.2.14 +pydub +# TODO: is pyquaternion still needed/used? pyquaternion +pyttsx3 +simpleaudio +sounddevice diff --git a/src/pages/operator/css/MovementRecorder.css b/src/pages/operator/css/MovementRecorder.css index fa813a00..4073246a 100644 --- a/src/pages/operator/css/MovementRecorder.css +++ b/src/pages/operator/css/MovementRecorder.css @@ -5,6 +5,11 @@ justify-content: center; } +/* The below buttons' CSS likely gets overridden by the TextToSpeech CSS + * for the same class names. It is not an issue now because they use the + * same styles, but may be an issue in the future if we change styles for + * one of the components. + */ .play-btn { background-color: var(--selected-color); display: flex; diff --git a/src/pages/operator/css/Operator.css b/src/pages/operator/css/Operator.css index b0cdb988..3222a7e0 100644 --- a/src/pages/operator/css/Operator.css +++ b/src/pages/operator/css/Operator.css @@ -79,11 +79,12 @@ .operator-voice, .operator-pose-library, .operator-pose-recorder, +.operator-text-to-speech, .operator-aruco-markers { background-color: whitesmoke; box-shadow: var(--shadow); height: 6rem; - width: 40rem; + width: 50rem; display: inline-grid; align-items: center; justify-content: center; @@ -113,6 +114,7 @@ .operator-voice[hidden], .operator-pose-library[hidden], .operator-pose-recorder[hidden], +.operator-text-to-speech[hidden], .operator-aruco-markers[hidden] { display: none; } diff --git a/src/pages/operator/css/TextToSpeech.css b/src/pages/operator/css/TextToSpeech.css new file mode 100644 index 00000000..f7bbbcae --- /dev/null +++ b/src/pages/operator/css/TextToSpeech.css @@ -0,0 +1,60 @@ +#text-to-speech-container { + display: flex; + gap: 15px; + align-items: center; + justify-content: center; +} + +.play-btn { + background-color: var(--selected-color); + display: flex; +} + +.save-btn { + background-color: var(--btn-turquoise); + display: flex; +} + +.stop-btn { + background-color: #cd0b0b; + color: white; + display: flex; +} + +.delete-btn { + background-color: var(--btn-red); + display: flex; +} + +@media (max-width: 1300px) { + #text-to-speech-container { + font-size: smaller; + } +} + +.mobile-text-save-btn { + border-radius: 13px; + border: 5px solid whitesmoke; + font-size: 25px; + padding: 10px; + margin: 1rem; + text-align: center; + background: #06c7e1; + vertical-align: middle; + display: flex; + justify-content: center; +} + +.mobile-text-play-btn { + /* width: 97%; */ + border-radius: 13px; + border: 5px solid whitesmoke; + font-size: 20px; + padding: 10px; + margin: 0.5rem; + text-align: center; + background: #06c7e1; + vertical-align: middle; + display: flex; + justify-content: center; +} diff --git a/src/pages/operator/css/basic_components.css b/src/pages/operator/css/basic_components.css index b78219dc..df24a91d 100644 --- a/src/pages/operator/css/basic_components.css +++ b/src/pages/operator/css/basic_components.css @@ -107,7 +107,8 @@ position: relative; } -.dropdown-button { +.dropdown-button, +.dropdown-input-button { display: flex; align-items: center; justify-content: space-between; @@ -118,13 +119,15 @@ color: black; } -.dropdown-button.expanded.bottom { +.dropdown-button.expanded.bottom, +.dropdown-input-button.expanded.bottom { border-bottom-left-radius: 0; border-bottom-right-radius: 0; color: black; } -.dropdown-button.expanded.top { +.dropdown-button.expanded.top, +.dropdown-input-button.expanded.top { border-top-left-radius: 0; border-top-right-radius: 0; margin: 0 !important; @@ -132,29 +135,37 @@ } /* Flip the dropdown arrow when active */ -.dropdown-button span { +.dropdown-button span, +.dropdown-input-button span { transition: transform 0.2s linear; } -.dropdown-button.expanded span { +.dropdown-button.expanded span, +.dropdown-input-button.expanded span { transform: scaleY(-1); } -.dropdown-popup { +.dropdown-popup, +.dropdown-input-popup { position: absolute; min-width: 100%; z-index: 3; box-shadow: var(--shadow); border-radius: 0 0 var(--btn-brdr-radius) var(--btn-brdr-radius); + top: 100%; + bottom: auto; + overflow-y: auto; /* Make it scrollable */ } -.dropdown-popup.top { +.dropdown-popup.top, +.dropdown-input-popup.top { top: auto; bottom: 100%; box-shadow: var(--shadow-bottom); } -.dropdown-option { +.dropdown-option, +.dropdown-input-option { padding-top: 1rem; padding-bottom: 1rem; cursor: pointer; @@ -166,22 +177,65 @@ color: black; } -.dropdown-option.active { +.dropdown-option.active, +.dropdown-input-option.active { filter: brightness(80%); } -.dropdown-popup.top .dropdown-option:first-of-type { +.dropdown-popup.top .dropdown-option:first-of-type, +.dropdown-input-popup.top .dropdown-option:first-of-type { border-radius: var(--btn-brdr-radius) var(--btn-brdr-radius) 0 0; } -.dropdown-popup.top .dropdown-option:last-of-type { +.dropdown-popup.top .dropdown-option:last-of-type, +.dropdown-input-popup.top .dropdown-option:last-of-type { box-shadow: none; } -.dropdown-popup.bottom .dropdown-option:last-of-type { +.dropdown-popup.bottom .dropdown-option:last-of-type, +.dropdown-input-popup.bottom .dropdown-option:last-of-type { border-radius: 0 0 var(--btn-brdr-radius) var(--btn-brdr-radius); } +/* Dropdown Input **************************************************************/ + +.dropdown-input { + position: relative; + display: flex; + align-items: stretch; + justify-content: center; +} + +.dropdown-input:focus-within { + border: 1px solid black; + border-radius: 0 var(--btn-brdr-radius) var(--btn-brdr-radius) 0; +} + +.dropdown-input-textarea { + border-right-width: 0px !important; + resize: none; + border-radius: var(--btn-brdr-radius) 0 0 var(--btn-brdr-radius); + box-shadow: var(--shadow); + border: none; + padding: 0.25rem 0 0.25rem 0.5rem; +} + +.dropdown-input-textarea:focus { + border-radius: var(--btn-brdr-radius) 0 0 var(--btn-brdr-radius); + outline: none; +} + +.dropdown-input-button { + width: auto !important; + padding-top: 0rem; + padding-bottom: 0rem; + box-shadow: none; + border-radius: 0 var(--btn-brdr-radius) var(--btn-brdr-radius) 0; + box-shadow: var(--shadow); + /* border: 1px solid light-dark(rgb(118, 118, 118), rgb(133, 133, 133)); */ + border-left-width: 0px !important; +} + /* CheckToggleButton **********************************************************/ .check-toggle-button { diff --git a/src/pages/operator/tsx/Operator.tsx b/src/pages/operator/tsx/Operator.tsx index cb3a4c5d..309427d5 100644 --- a/src/pages/operator/tsx/Operator.tsx +++ b/src/pages/operator/tsx/Operator.tsx @@ -38,6 +38,7 @@ import { import { MovementRecorder } from "./layout_components/MovementRecorder"; import { Alert } from "./basic_components/Alert"; import "operator/css/Operator.css"; +import { TextToSpeech } from "./layout_components/TextToSpeech"; /** Operator interface webpage */ export const Operator = (props: { @@ -153,6 +154,17 @@ export const Operator = (props: { updateLayout(); } + /** + * Sets the text-to-speech component to display or hidden. + * + * @param displayTextToSpeech whether the text-to-speech component should + * be displayed. + */ + function setDisplayTextToSpeech(displayTextToSpeech: boolean) { + layout.current.displayTextToSpeech = displayTextToSpeech; + updateLayout(); + } + /** * Sets the display labels property to display or hidden. * @@ -259,8 +271,10 @@ export const Operator = (props: { /** Properties for the global options area of the sidebar */ const globalOptionsProps: GlobalOptionsProps = { displayMovementRecorder: layout.current.displayMovementRecorder, + displayTextToSpeech: layout.current.displayTextToSpeech, displayLabels: layout.current.displayLabels, setDisplayMovementRecorder: setDisplayMovementRecorder, + setDisplayTextToSpeech: setDisplayTextToSpeech, setDisplayLabels: setDisplayLabels, defaultLayouts: Object.keys(DEFAULT_LAYOUTS), customLayouts: props.storageHandler.getCustomLayoutNames(), @@ -360,6 +374,14 @@ export const Operator = (props: { > +
diff --git a/src/pages/operator/tsx/basic_components/Dropdown.tsx b/src/pages/operator/tsx/basic_components/Dropdown.tsx index e674d71f..6178c212 100644 --- a/src/pages/operator/tsx/basic_components/Dropdown.tsx +++ b/src/pages/operator/tsx/basic_components/Dropdown.tsx @@ -13,6 +13,7 @@ export const Dropdown = (props: { const [showDropdown, setShowDropdown] = React.useState(false); const [placement, setPlacement] = React.useState(props.placement); const inputRef = React.useRef(null); + const dropdownPopupRef = React.useRef(null); if (props.selectedIndex === undefined && !props.placeholderText) throw Error("both selectedOption and placeholderText undefined"); @@ -31,6 +32,7 @@ export const Dropdown = (props: { } }); + // Function to convert each possible option into a button function mapFunc(option: T, idx: number) { const active = idx === props.selectedIndex; if (active && !props.showActive) return null; @@ -48,6 +50,21 @@ export const Dropdown = (props: { ); } + // Set the max-height of the popup to the screen height minus the top of the popup + function resizeDropdownPopup() { + if (dropdownPopupRef.current) { + const top = dropdownPopupRef.current.getBoundingClientRect().top; + dropdownPopupRef.current.style.maxHeight = `calc(100vh - ${top}px)`; + } + } + React.useEffect(resizeDropdownPopup, [showDropdown]); + React.useEffect(() => { + window.addEventListener("resize", resizeDropdownPopup); + return () => { + window.removeEventListener("resize", resizeDropdownPopup); + }; + }); + return (
diff --git a/src/pages/operator/tsx/basic_components/DropdownInput.tsx b/src/pages/operator/tsx/basic_components/DropdownInput.tsx new file mode 100644 index 00000000..f2a84316 --- /dev/null +++ b/src/pages/operator/tsx/basic_components/DropdownInput.tsx @@ -0,0 +1,128 @@ +import React from "react"; +import { className } from "shared/util"; +import "operator/css/basic_components.css"; +import e from "express"; +import { text } from "stream/consumers"; + +export const DropdownInput = (props: { + text: string; + setText: (text: string) => void; + selectedIndex?: number; + setSelectedIndex: (index?: number) => void; + possibleOptions: T[]; + placeholderText: string; + placement: string; + rows: number; +}) => { + const [showDropdown, setShowDropdown] = React.useState(false); + const componentRef = React.useRef(null); + const dropdownPopupRef = React.useRef(null); + + // Handler to close dropdown when click outside + React.useEffect(() => { + const handler = (e: any) => { + if (componentRef.current && !componentRef.current.contains(e.target)) { + setShowDropdown(false); + } + }; + if (showDropdown) { + window.addEventListener("click", handler); + return () => { + window.removeEventListener("click", handler); + }; + } + }); + + // Handler to update the selected index if the possible options or text changes + React.useEffect(() => { + let text = props.text.trim(); + if (props.possibleOptions.includes(text as T)) { + props.setSelectedIndex(props.possibleOptions.indexOf(text as T)); + } else { + props.setSelectedIndex(undefined); + } + }, [props.possibleOptions, props.text]); + + // Function to convert each possible option into a button + function mapFunc(option: T, idx: number) { + const active = idx === props.selectedIndex; + return ( + + ); + } + + // Set the max-height of the popup to the screen height minus the top of the popup + function resizeDropdownPopup() { + if (dropdownPopupRef.current) { + const top = dropdownPopupRef.current.getBoundingClientRect().top; + dropdownPopupRef.current.style.maxHeight = `calc(100vh - ${top}px)`; + } + } + React.useEffect(resizeDropdownPopup, [showDropdown]); + React.useEffect(() => { + window.addEventListener("resize", resizeDropdownPopup); + return () => { + window.removeEventListener("resize", resizeDropdownPopup); + }; + }); + + return ( +
+