Skip to content

Commit

Permalink
Operator-to-robot Text-to-Speech (#64)
Browse files Browse the repository at this point in the history
* [WIP] write a ROS node for TTS

* [WIP] pyttsx3 mostly works but sounds awful

* created abstract class to allow easy switching of engines

* gTTS works and overrides work

* [WIP] basic UI arrangement works

* UI layout done

* Play and Stop function providers work

* Finished implementing function providers

* Add scroll to too-big dropdown popups, check whether text exists as you type.

* Update comments

* Update requirements

* Fixes from testing

* Remove unnecessary logs

* Auto-select all text on click

* Trim whitespace before storing

* Removed unnecessary logs

* updated style of dropdown input to be consistent with the regular dropdown

* Add launch arg for tts engine

* Update color scheme

---------

Co-authored-by: Vinitha Ranganeni <[email protected]>
  • Loading branch information
hello-amal and hello-vinitha authored Jul 15, 2024
1 parent f0ce7cb commit c53a7c0
Show file tree
Hide file tree
Showing 28 changed files with 1,209 additions and 17 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ find_package(rosidl_default_generators REQUIRED)
##############################
rosidl_generate_interfaces(${PROJECT_NAME}
"action/MoveToPregrasp.action"
"msg/TextToSpeech.msg"
DEPENDENCIES geometry_msgs
)

Expand All @@ -40,6 +41,7 @@ install(PROGRAMS
nodes/configure_video_streams.py
nodes/move_to_pregrasp.py
nodes/navigation_camera.py
nodes/text_to_speech.py
DESTINATION lib/${PROJECT_NAME}
)

Expand Down
16 changes: 16 additions & 0 deletions launch/web_interface.launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,11 @@ def generate_launch_description():
map_yaml = DeclareLaunchArgument(
"map_yaml", description="filepath to previously captured map", default_value=""
)
tts_engine = DeclareLaunchArgument(
"tts_engine",
description="name of the TTS engine. Either pyttsx3 or gtts.",
default_value="gtts",
)
certfile_arg = DeclareLaunchArgument(
"certfile", default_value=stretch_serial_no + "+6.pem"
)
Expand All @@ -230,6 +235,7 @@ def generate_launch_description():
ld = LaunchDescription(
[
map_yaml,
tts_engine,
nav2_params_file_param,
params_file,
certfile_arg,
Expand Down Expand Up @@ -531,4 +537,14 @@ def generate_launch_description():
)
ld.add_action(move_to_pregrasp_node)

# Text to speech
text_to_speech_node = Node(
package="stretch_web_teleop",
executable="text_to_speech.py",
output="screen",
arguments=[LaunchConfiguration("tts_engine")],
parameters=[],
)
ld.add_action(text_to_speech_node)

return ld
9 changes: 8 additions & 1 deletion launch_interface.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,20 @@ if getopts ":m:" opt && [[ $opt == "m" && -f $OPTARG ]]; then
MAP_ARG="map_yaml:=$OPTARG"
fi

# Usage: ./launch_interface.sh -t pyttsx3
TTS_ARG=""
# Rescan every option from the beginning. An earlier single getopts call that
# only knows about -m consumes a leading -t as an "invalid option", so simply
# calling getopts once more here would miss -t unless -m was given first.
OPTIND=1
while getopts ":m:t:" opt; do
    if [[ $opt == "t" ]]; then
        echo "Setting tts engine..."
        TTS_ARG="tts_engine:=$OPTARG"
    fi
done

stretch_free_robot_process.py;
./stop_interface.sh
sudo udevadm control --reload-rules && sudo udevadm trigger
source /opt/ros/humble/setup.bash
source ~/ament_ws/install/setup.bash
source /usr/share/colcon_cd/function/colcon_cd.sh
sleep 2;
screen -dm -S "web_teleop_ros" ros2 launch stretch_web_teleop web_interface.launch.py $MAP_ARG
screen -dm -S "web_teleop_ros" ros2 launch stretch_web_teleop web_interface.launch.py $MAP_ARG $TTS_ARG
sleep 3;
~/ament_ws/src/stretch_web_teleop/start_web_server_and_robot_browser.sh
16 changes: 16 additions & 0 deletions msg/TextToSpeech.msg
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# The text to say
string text

# The voice to use. Valid options for this depend on the engine.
string voice

# Whether to speak slowly.
bool is_slow

# If a message is already being spoken, this flag controls what to do with this message:
# add it to a queue to be executed sequentially (Default), or interrupt the
# current message and queue to speak this message (in this case, the old queue gets
# discarded).
uint8 OVERRIDE_BEHAVIOR_QUEUE = 0
uint8 OVERRIDE_BEHAVIOR_INTERRUPT = 1
uint8 override_behavior
181 changes: 181 additions & 0 deletions nodes/text_to_speech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
#!/usr/bin/env python3

# Standard imports
import sys
import threading
from typing import List, Optional

# Third-party imports
import rclpy
import sounddevice # suppress ALSA warnings # noqa: F401
from rclpy.node import Node
from rclpy.qos import QoSProfile, ReliabilityPolicy

# Local Imports
from stretch_web_teleop.msg import TextToSpeech
from stretch_web_teleop_helpers.text_to_speech_helpers import (
GTTS,
PyTTSx3,
TextToSpeechEngine,
TextToSpeechEngineType,
)


class TextToSpeechNode(Node):
    """
    The TextToSpeech node subscribes to a stream of text-to-speech commands
    from a topic and executes them.

    Incoming messages are appended to an internal queue by the subscription
    callback; `run` drains that queue one utterance at a time.
    """

    def __init__(
        self,
        engine_type: TextToSpeechEngineType = TextToSpeechEngineType.PYTTSX3,
        rate_hz: float = 5.0,
    ) -> None:
        """
        Initialize the TextToSpeechNode.

        Parameters
        ----------
        engine_type : TextToSpeechEngineType
            The text-to-speech engine to use.
        rate_hz : float
            The rate at which the run loop polls the queue.
        """
        # Initialize the node
        super().__init__("text_to_speech")

        # Declare the attributes for the text-to-speech engine. The engine
        # itself is only created in `initialize`, so it starts out as None.
        self.engine_type = engine_type
        self.engine: Optional[TextToSpeechEngine] = None
        self.initialized = False

        # Declare the attributes for the run thread. The queue is shared
        # between the subscription callback and `run`, hence the lock.
        self.rate_hz = rate_hz
        self.queue: List[TextToSpeech] = []
        self.queue_lock = threading.Lock()

        # Create the subscription
        self.create_subscription(
            TextToSpeech,
            "text_to_speech",
            self.text_to_speech_callback,
            QoSProfile(depth=1, reliability=ReliabilityPolicy.RELIABLE),
            callback_group=rclpy.callback_groups.MutuallyExclusiveCallbackGroup(),
        )

    def initialize(self) -> None:
        """
        Initialize the text-to-speech engine.

        On success `self.engine` is set and `self.initialized` becomes True.
        For an unsupported engine type an error is logged and both attributes
        keep their initial values.
        """
        if self.engine_type == TextToSpeechEngineType.PYTTSX3:
            self.engine = PyTTSx3(self.get_logger())
        elif self.engine_type == TextToSpeechEngineType.GTTS:
            self.engine = GTTS(self.get_logger())
        else:
            self.get_logger().error(f"Unsupported text-to-speech {self.engine_type}")
            return
        self.initialized = True

    def text_to_speech_callback(self, msg: TextToSpeech) -> None:
        """
        Callback for the text-to-speech topic.

        Parameters
        ----------
        msg : TextToSpeech
            The message containing the text to speak.
        """
        self.get_logger().info(f"Received: {msg}")

        # Interrupt if requested
        if msg.override_behavior == TextToSpeech.OVERRIDE_BEHAVIOR_INTERRUPT:
            # Snapshot the engine: this callback may fire before `initialize`
            # has created it, because the node can be spun before then.
            engine = self.engine
            if engine is None:
                # No engine yet; only the pending messages need to be
                # discarded.
                with self.queue_lock:
                    self.queue.clear()
            elif engine._can_say_async:
                # Clear the queue *before* stopping so that the run loop
                # cannot pick up a stale queued message in between.
                with self.queue_lock:
                    self.queue.clear()
                engine.stop()
            else:
                self.get_logger().warn("Engine does not support interrupting speech")

        # Queue the text (empty messages are ignored)
        if msg.text:
            with self.queue_lock:
                self.queue.append(msg)

    def run(self) -> None:
        """
        Run the text-to-speech engine.

        Loops at `rate_hz` while rclpy is ok and, whenever the engine reports
        that it is not speaking, sends the oldest queued message to it.

        Raises
        ------
        RuntimeError
            If the engine has not been created by `initialize`.
        """
        engine = self.engine
        if engine is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise RuntimeError(
                "Text-to-speech engine is not initialized; "
                "initialize() must succeed before run() is called."
            )

        rate = self.create_rate(self.rate_hz)
        while rclpy.ok():
            # Sleep
            rate.sleep()

            # Only send a new utterance once the engine is idle
            if engine.is_speaking():
                continue

            # Take a single queued utterance, if any
            with self.queue_lock:
                msg = self.queue.pop(0) if self.queue else None
            if msg is None:
                continue

            # Process the voice (an empty voice keeps the current one)
            if msg.voice and msg.voice != engine.voice_id:
                engine.voice_id = msg.voice

            # Process the speed
            if msg.is_slow != engine.is_slow:
                engine.is_slow = msg.is_slow

            # Log before speaking so the log line is not delayed by a
            # blocking `say` call.
            self.get_logger().info(f"Saying: {msg.text}")

            # Speak the text
            if engine._can_say_async:
                engine.say_async(msg.text)
            else:
                engine.say(msg.text)


def main():
    """
    Entry point: create the text-to-speech node, spin it, and run the engine.

    The engine name is read from the first command-line argument
    (case-insensitive). A missing or unrecognized name falls back to "gtts".
    """
    # Check the arguments. The positional argument may be absent when the
    # script is started by hand, so do not index sys.argv unconditionally.
    tts_engine = sys.argv[1].lower() if len(sys.argv) > 1 else ""
    tts_engine_map = {e.name.lower(): e for e in TextToSpeechEngineType}
    if tts_engine not in tts_engine_map:
        if tts_engine:
            print(f"Invalid text-to-speech engine: {tts_engine}")
        else:
            print("No text-to-speech engine specified")
        print(f"Options: {list(tts_engine_map.keys())}")
        print("Defaulting to gtts")
        tts_engine = "gtts"

    rclpy.init()

    node = TextToSpeechNode(
        engine_type=tts_engine_map[tts_engine],
    )
    node.get_logger().info("Created!")

    # Spin in the background, as the node initializes
    executor = rclpy.executors.MultiThreadedExecutor(num_threads=4)
    spin_thread = threading.Thread(
        target=rclpy.spin,
        args=(node,),
        kwargs={"executor": executor},
        daemon=True,
    )
    spin_thread.start()

    # Run text-to-speech
    try:
        node.initialize()
        node.get_logger().info("Running!")
        node.run()
    except KeyboardInterrupt:
        pass

    # Spin in the foreground
    spin_thread.join()

    node.destroy_node()
    rclpy.shutdown()


if __name__ == "__main__":
main()
7 changes: 7 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
gTTS
loguru
# numpy 1.23.2 is not required for the web teleop interface, but is required
# for stretch_body. If we didn't include it here, pin would update
# to the latest version of numpy, breaking stretch_body.
numpy==1.23.2
pin
PyAudio==0.2.14
pydub
# TODO: is pyquaternion still needed/used?
pyquaternion
pyttsx3
simpleaudio
sounddevice
5 changes: 5 additions & 0 deletions src/pages/operator/css/MovementRecorder.css
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
justify-content: center;
}

/* The below buttons' CSS likely gets overridden by the TextToSpeech CSS
* for the same class names. It is not an issue now because they use the
* same styles, but may be an issue in the future if we change styles for
* one of the components.
*/
.play-btn {
background-color: var(--selected-color);
display: flex;
Expand Down
4 changes: 3 additions & 1 deletion src/pages/operator/css/Operator.css
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,12 @@
.operator-voice,
.operator-pose-library,
.operator-pose-recorder,
.operator-text-to-speech,
.operator-aruco-markers {
background-color: whitesmoke;
box-shadow: var(--shadow);
height: 6rem;
width: 40rem;
width: 50rem;
display: inline-grid;
align-items: center;
justify-content: center;
Expand Down Expand Up @@ -113,6 +114,7 @@
.operator-voice[hidden],
.operator-pose-library[hidden],
.operator-pose-recorder[hidden],
.operator-text-to-speech[hidden],
.operator-aruco-markers[hidden] {
display: none;
}
Expand Down
60 changes: 60 additions & 0 deletions src/pages/operator/css/TextToSpeech.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#text-to-speech-container {
display: flex;
gap: 15px;
align-items: center;
justify-content: center;
}

.play-btn {
background-color: var(--selected-color);
display: flex;
}

.save-btn {
background-color: var(--btn-turquoise);
display: flex;
}

.stop-btn {
background-color: #cd0b0b;
color: white;
display: flex;
}

.delete-btn {
background-color: var(--btn-red);
display: flex;
}

@media (max-width: 1300px) {
#text-to-speech-container {
font-size: smaller;
}
}

.mobile-text-save-btn {
border-radius: 13px;
border: 5px solid whitesmoke;
font-size: 25px;
padding: 10px;
margin: 1rem;
text-align: center;
background: #06c7e1;
vertical-align: middle;
display: flex;
justify-content: center;
}

.mobile-text-play-btn {
/* width: 97%; */
border-radius: 13px;
border: 5px solid whitesmoke;
font-size: 20px;
padding: 10px;
margin: 0.5rem;
text-align: center;
background: #06c7e1;
vertical-align: middle;
display: flex;
justify-content: center;
}
Loading

0 comments on commit c53a7c0

Please sign in to comment.