Operator-to-robot Text-to-Speech (#64)

* [WIP] write a ROS node for TTS * [WIP] pyttsx3 mostly works but sounds awful * created abstract class to allow easy switching of engines * gTTS works and overrides work * [WIP] basic UI arrangement works * UI layout done * Play and Stop function providers work * Finished implementing function providers * Add scroll to too-big dropdown popups, check whether text exists as you type. * Update comments * Update requirements * Fixes from testing * Remove unnecessary logs * Auto-select all text on click * Trim whitespace before storing * Removed unnecessary logs * updated style of dropdown input to be consistent with the regular dropdown * Add launch arg for tts engine * Update color scheme --------- Co-authored-by: Vinitha Ranganeni <[email protected]>
hello-robot · Jul 15, 2024 · c53a7c0 · c53a7c0
1 parent f0ce7cb
commit c53a7c0
Show file tree

Hide file tree

Showing 28 changed files with 1,209 additions and 17 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -20,6 +20,7 @@ find_package(rosidl_default_generators REQUIRED)
 ##############################
 rosidl_generate_interfaces(${PROJECT_NAME}
   "action/MoveToPregrasp.action"
+  "msg/TextToSpeech.msg"
   DEPENDENCIES geometry_msgs
 )
 
@@ -40,6 +41,7 @@ install(PROGRAMS
   nodes/configure_video_streams.py
   nodes/move_to_pregrasp.py
   nodes/navigation_camera.py
+  nodes/text_to_speech.py
   DESTINATION lib/${PROJECT_NAME}
 )
 

diff --git a/launch/web_interface.launch.py b/launch/web_interface.launch.py
@@ -212,6 +212,11 @@ def generate_launch_description():
     map_yaml = DeclareLaunchArgument(
         "map_yaml", description="filepath to previously captured map", default_value=""
     )
+    tts_engine = DeclareLaunchArgument(
+        "tts_engine",
+        description="name of the TTS engine. Either pyttsx3 or gtts.",
+        default_value="gtts",
+    )
     certfile_arg = DeclareLaunchArgument(
         "certfile", default_value=stretch_serial_no + "+6.pem"
     )
@@ -230,6 +235,7 @@ def generate_launch_description():
     ld = LaunchDescription(
         [
             map_yaml,
+            tts_engine,
             nav2_params_file_param,
             params_file,
             certfile_arg,
@@ -531,4 +537,14 @@ def generate_launch_description():
         )
         ld.add_action(move_to_pregrasp_node)
 
+    # Text to speech
+    text_to_speech_node = Node(
+        package="stretch_web_teleop",
+        executable="text_to_speech.py",
+        output="screen",
+        arguments=[LaunchConfiguration("tts_engine")],
+        parameters=[],
+    )
+    ld.add_action(text_to_speech_node)
+
     return ld
diff --git a/launch_interface.sh b/launch_interface.sh
@@ -7,13 +7,20 @@ if getopts ":m:" opt && [[ $opt == "m" && -f $OPTARG ]]; then
     MAP_ARG="map_yaml:=$OPTARG"
 fi
 
+# Usage: ./launch_interface.sh -t pyttsx3
+TTS_ARG=""
+if getopts ":t:" opt && [[ $opt == "t" ]]; then
+    echo "Setting tts engine..."
+    TTS_ARG="tts_engine:=$OPTARG"
+fi
+
 stretch_free_robot_process.py;
 ./stop_interface.sh
 sudo udevadm control --reload-rules && sudo udevadm trigger
 source /opt/ros/humble/setup.bash
 source ~/ament_ws/install/setup.bash
 source /usr/share/colcon_cd/function/colcon_cd.sh
 sleep 2;
-screen -dm -S "web_teleop_ros" ros2 launch stretch_web_teleop web_interface.launch.py $MAP_ARG
+screen -dm -S "web_teleop_ros" ros2 launch stretch_web_teleop web_interface.launch.py $MAP_ARG $TTS_ARG
 sleep 3;
 ~/ament_ws/src/stretch_web_teleop/start_web_server_and_robot_browser.sh
diff --git a/msg/TextToSpeech.msg b/msg/TextToSpeech.msg
@@ -0,0 +1,16 @@
+# The text to say
+string text
+
+# The voice to use. Valid options for this depend on the engine.
+string voice
+
+# Whether to speak slow or not.
+bool is_slow
+
+# If a message is already being spoken, this flag controls what to do with this message:
+# add it to a queue to be executed sequentially (Default), or interrupt the
+# current message and queue to speak this message (in this case, the old queue gets
+# discarded).
+uint8 OVERRIDE_BEHAVIOR_QUEUE = 0
+uint8 OVERRIDE_BEHAVIOR_INTERRUPT = 1
+uint8 override_behavior
diff --git a/nodes/text_to_speech.py b/nodes/text_to_speech.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+
+# Standard imports
+import sys
+import threading
+from typing import List, Optional
+
+# Third-party imports
+import rclpy
+import sounddevice  # suppress ALSA warnings # noqa: F401
+from rclpy.node import Node
+from rclpy.qos import QoSProfile, ReliabilityPolicy
+
+# Local Imports
+from stretch_web_teleop.msg import TextToSpeech
+from stretch_web_teleop_helpers.text_to_speech_helpers import (
+    GTTS,
+    PyTTSx3,
+    TextToSpeechEngine,
+    TextToSpeechEngineType,
+)
+
+
+class TextToSpeechNode(Node):
+    """
+    The TextToSpeech node subscribes to a stream of text-to-speech commands
+    from a topic and executes them.
+    """
+
+    def __init__(
+        self,
+        engine_type: TextToSpeechEngineType = TextToSpeechEngineType.PYTTSX3,
+        rate_hz: float = 5.0,
+    ):
+        """
+        Initialize the TextToSpeechNode.
+
+        Parameters
+        ----------
+        engine_type : TextToSpeechEngineType
+            The text-to-speech engine to use.
+        rate_hz : float
+            The rate at which to run the text-to-speech engine.
+        """
+        # Initialize the node
+        super().__init__("text_to_speech")
+
+        # Declare the attributes for the text-to-speech engine
+        self.engine_type = engine_type
+        self.engine: Optional[TextToSpeechEngine] = None
+        self.initialized = False
+
+        # Declare the attributes for the run thread
+        self.rate_hz = rate_hz
+        self.queue: List[TextToSpeech] = []
+        self.queue_lock = threading.Lock()
+
+        # Create the subscription
+        self.create_subscription(
+            TextToSpeech,
+            "text_to_speech",
+            self.text_to_speech_callback,
+            QoSProfile(depth=1, reliability=ReliabilityPolicy.RELIABLE),
+            callback_group=rclpy.callback_groups.MutuallyExclusiveCallbackGroup(),
+        )
+
+    def initialize(self):
+        """
+        Initialize the text-to-speech engine.
+        """
+        if self.engine_type == TextToSpeechEngineType.PYTTSX3:
+            self.engine = PyTTSx3(self.get_logger())
+            self.initialized = True
+        elif self.engine_type == TextToSpeechEngineType.GTTS:
+            self.engine = GTTS(self.get_logger())
+            self.initialized = True
+        else:
+            self.get_logger().error(f"Unsupported text-to-speech {self.engine_type}")
+
+    def text_to_speech_callback(self, msg: TextToSpeech):
+        """
+        Callback for the text-to-speech topic.
+
+        Parameters
+        ----------
+        msg : TextToSpeech
+            The message containing the text to speak.
+        """
+        self.get_logger().info(f"Received: {msg}")
+        # Interrupt if requested
+        if msg.override_behavior == TextToSpeech.OVERRIDE_BEHAVIOR_INTERRUPT:
+            if self.engine._can_say_async:
+                self.engine.stop()
+                with self.queue_lock:
+                    self.queue.clear()
+            else:
+                self.get_logger().warn("Engine does not support interrupting speech")
+
+        # Queue the text
+        if len(msg.text) > 0:
+            with self.queue_lock:
+                self.queue.append(msg)
+
+    def run(self):
+        """
+        Run the text-to-speech engine.
+        """
+        rate = self.create_rate(self.rate_hz)
+        while rclpy.ok():
+            # Sleep
+            rate.sleep()
+
+            # Send a single queued utterance to the text-to-speech engine
+            if not self.engine.is_speaking():
+                msg = None
+                with self.queue_lock:
+                    if len(self.queue) > 0:
+                        msg = self.queue.pop(0)
+                if msg is not None:
+                    # Process the voice
+                    if len(msg.voice) > 0:
+                        if msg.voice != self.engine.voice_id:
+                            self.engine.voice_id = msg.voice
+
+                    # Process the speed
+                    if msg.is_slow != self.engine.is_slow:
+                        self.engine.is_slow = msg.is_slow
+
+                    # Speak the text
+                    if self.engine._can_say_async:
+                        self.engine.say_async(msg.text)
+                    else:
+                        self.engine.say(msg.text)
+                    self.get_logger().info(f"Saying: {msg.text}")
+
+
+def main():
+    # Check the arguments
+    tts_engine = sys.argv[1]
+    tts_engine = tts_engine.lower()
+    tts_engine_map = {e.name.lower(): e for e in TextToSpeechEngineType}
+    if tts_engine not in tts_engine_map:
+        print(f"Invalid text-to-speech engine: {tts_engine}")
+        print(f"Options: {list(tts_engine_map.keys())}")
+        print("Defaulting to gtts")
+        tts_engine = "gtts"
+
+    rclpy.init()
+
+    node = TextToSpeechNode(
+        engine_type=tts_engine_map[tts_engine],
+    )
+    node.get_logger().info("Created!")
+
+    # Spin in the background, as the node initializes
+    executor = rclpy.executors.MultiThreadedExecutor(num_threads=4)
+    spin_thread = threading.Thread(
+        target=rclpy.spin,
+        args=(node,),
+        kwargs={"executor": executor},
+        daemon=True,
+    )
+    spin_thread.start()
+
+    # Run text-to-speech
+    try:
+        node.initialize()
+        node.get_logger().info("Running!")
+        node.run()
+    except KeyboardInterrupt:
+        pass
+
+    # Spin in the foreground
+    spin_thread.join()
+
+    node.destroy_node()
+    rclpy.shutdown()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,14 @@
+gTTS
 loguru
 # numpy 1.23.2 is not required for the web teleop interface, but is required
 # for stretch_body. If we diden't include it here, pin would update
 # to the latest version of numpy, breaking stretch_body.
 numpy==1.23.2
 pin
+PyAudio==0.2.14
+pydub
+# TODO: is pyquaternion still needed/used?
 pyquaternion
+pyttsx3
+simpleaudio
+sounddevice
diff --git a/src/pages/operator/css/MovementRecorder.css b/src/pages/operator/css/MovementRecorder.css
@@ -5,6 +5,11 @@
   justify-content: center;
 }
 
+/* The below buttons' CSS likely gets overridden by the TextToSpeech CSS
+ * for the same class names. It is not an issue now because they use the
+ * same styles, but may be an issue in the future if we change styles for
+ * one of the components.
+ */
 .play-btn {
   background-color: var(--selected-color);
   display: flex;

diff --git a/src/pages/operator/css/Operator.css b/src/pages/operator/css/Operator.css
@@ -79,11 +79,12 @@
 .operator-voice,
 .operator-pose-library,
 .operator-pose-recorder,
+.operator-text-to-speech,
 .operator-aruco-markers {
   background-color: whitesmoke;
   box-shadow: var(--shadow);
   height: 6rem;
-  width: 40rem;
+  width: 50rem;
   display: inline-grid;
   align-items: center;
   justify-content: center;
@@ -113,6 +114,7 @@
 .operator-voice[hidden],
 .operator-pose-library[hidden],
 .operator-pose-recorder[hidden],
+.operator-text-to-speech[hidden],
 .operator-aruco-markers[hidden] {
   display: none;
 }

diff --git a/src/pages/operator/css/TextToSpeech.css b/src/pages/operator/css/TextToSpeech.css
@@ -0,0 +1,60 @@
+#text-to-speech-container {
+  display: flex;
+  gap: 15px;
+  align-items: center;
+  justify-content: center;
+}
+
+.play-btn {
+  background-color: var(--selected-color);
+  display: flex;
+}
+
+.save-btn {
+  background-color: var(--btn-turquoise);
+  display: flex;
+}
+
+.stop-btn {
+  background-color: #cd0b0b;
+  color: white;
+  display: flex;
+}
+
+.delete-btn {
+  background-color: var(--btn-red);
+  display: flex;
+}
+
+@media (max-width: 1300px) {
+  #text-to-speech-container {
+    font-size: smaller;
+  }
+}
+
+.mobile-text-save-btn {
+  border-radius: 13px;
+  border: 5px solid whitesmoke;
+  font-size: 25px;
+  padding: 10px;
+  margin: 1rem;
+  text-align: center;
+  background: #06c7e1;
+  vertical-align: middle;
+  display: flex;
+  justify-content: center;
+}
+
+.mobile-text-play-btn {
+  /* width: 97%; */
+  border-radius: 13px;
+  border: 5px solid whitesmoke;
+  font-size: 20px;
+  padding: 10px;
+  margin: 0.5rem;
+  text-align: center;
+  background: #06c7e1;
+  vertical-align: middle;
+  display: flex;
+  justify-content: center;
+}