Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Operator-to-robot Text-to-Speech #64

Merged
merged 20 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ find_package(rosidl_default_generators REQUIRED)
##############################
rosidl_generate_interfaces(${PROJECT_NAME}
"action/MoveToPregrasp.action"
"msg/TextToSpeech.msg"
DEPENDENCIES geometry_msgs
)

Expand All @@ -40,6 +41,7 @@ install(PROGRAMS
nodes/configure_video_streams.py
nodes/move_to_pregrasp.py
nodes/navigation_camera.py
nodes/text_to_speech.py
DESTINATION lib/${PROJECT_NAME}
)

Expand Down
16 changes: 16 additions & 0 deletions launch/web_interface.launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,11 @@ def generate_launch_description():
map_yaml = DeclareLaunchArgument(
"map_yaml", description="filepath to previously captured map", default_value=""
)
tts_engine = DeclareLaunchArgument(
"tts_engine",
description="name of the TTS engine. Either pyttsx3 or gtts.",
default_value="gtts",
)
certfile_arg = DeclareLaunchArgument(
"certfile", default_value=stretch_serial_no + "+6.pem"
)
Expand All @@ -230,6 +235,7 @@ def generate_launch_description():
ld = LaunchDescription(
[
map_yaml,
tts_engine,
nav2_params_file_param,
params_file,
certfile_arg,
Expand Down Expand Up @@ -531,4 +537,14 @@ def generate_launch_description():
)
ld.add_action(move_to_pregrasp_node)

# Text to speech
text_to_speech_node = Node(
package="stretch_web_teleop",
executable="text_to_speech.py",
output="screen",
arguments=[LaunchConfiguration("tts_engine")],
parameters=[],
)
ld.add_action(text_to_speech_node)

return ld
9 changes: 8 additions & 1 deletion launch_interface.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,20 @@ if getopts ":m:" opt && [[ $opt == "m" && -f $OPTARG ]]; then
MAP_ARG="map_yaml:=$OPTARG"
fi

# Usage: ./launch_interface.sh -t pyttsx3
TTS_ARG=""
if getopts ":t:" opt && [[ $opt == "t" ]]; then
echo "Setting tts engine..."
TTS_ARG="tts_engine:=$OPTARG"
fi

stretch_free_robot_process.py;
./stop_interface.sh
sudo udevadm control --reload-rules && sudo udevadm trigger
source /opt/ros/humble/setup.bash
source ~/ament_ws/install/setup.bash
source /usr/share/colcon_cd/function/colcon_cd.sh
sleep 2;
screen -dm -S "web_teleop_ros" ros2 launch stretch_web_teleop web_interface.launch.py $MAP_ARG
screen -dm -S "web_teleop_ros" ros2 launch stretch_web_teleop web_interface.launch.py $MAP_ARG $TTS_ARG
sleep 3;
~/ament_ws/src/stretch_web_teleop/start_web_server_and_robot_browser.sh
16 changes: 16 additions & 0 deletions msg/TextToSpeech.msg
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# The text to say
string text

# The voice to use. Valid options for this depend on the engine.
string voice

# Whether to speak slow or not.
bool is_slow

# If a message is already being spoken, this flag controls what to do with this message:
# add it to a queue to be executed sequentially (Default), or interrupt the
# current message and queue to speak this message (in this case, the old queue gets
# discarded).
uint8 OVERRIDE_BEHAVIOR_QUEUE = 0
uint8 OVERRIDE_BEHAVIOR_INTERRUPT = 1
uint8 override_behavior
181 changes: 181 additions & 0 deletions nodes/text_to_speech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
#!/usr/bin/env python3

# Standard imports
import sys
import threading
from typing import List, Optional

# Third-party imports
import rclpy
import sounddevice # suppress ALSA warnings # noqa: F401
from rclpy.node import Node
from rclpy.qos import QoSProfile, ReliabilityPolicy

# Local Imports
from stretch_web_teleop.msg import TextToSpeech
from stretch_web_teleop_helpers.text_to_speech_helpers import (
GTTS,
PyTTSx3,
TextToSpeechEngine,
TextToSpeechEngineType,
)


class TextToSpeechNode(Node):
"""
The TextToSpeech node subscribes to a stream of text-to-speech commands
from a topic and executes them.
"""

def __init__(
self,
engine_type: TextToSpeechEngineType = TextToSpeechEngineType.PYTTSX3,
rate_hz: float = 5.0,
):
"""
Initialize the TextToSpeechNode.

Parameters
----------
engine_type : TextToSpeechEngineType
The text-to-speech engine to use.
rate_hz : float
The rate at which to run the text-to-speech engine.
"""
# Initialize the node
super().__init__("text_to_speech")

# Declare the attributes for the text-to-speech engine
self.engine_type = engine_type
self.engine: Optional[TextToSpeechEngine] = None
self.initialized = False

# Declare the attributes for the run thread
self.rate_hz = rate_hz
self.queue: List[TextToSpeech] = []
self.queue_lock = threading.Lock()

# Create the subscription
self.create_subscription(
TextToSpeech,
"text_to_speech",
self.text_to_speech_callback,
QoSProfile(depth=1, reliability=ReliabilityPolicy.RELIABLE),
callback_group=rclpy.callback_groups.MutuallyExclusiveCallbackGroup(),
)

def initialize(self):
"""
Initialize the text-to-speech engine.
"""
if self.engine_type == TextToSpeechEngineType.PYTTSX3:
self.engine = PyTTSx3(self.get_logger())
self.initialized = True
elif self.engine_type == TextToSpeechEngineType.GTTS:
self.engine = GTTS(self.get_logger())
self.initialized = True
else:
self.get_logger().error(f"Unsupported text-to-speech {self.engine_type}")

def text_to_speech_callback(self, msg: TextToSpeech):
"""
Callback for the text-to-speech topic.

Parameters
----------
msg : TextToSpeech
The message containing the text to speak.
"""
self.get_logger().info(f"Received: {msg}")
# Interrupt if requested
if msg.override_behavior == TextToSpeech.OVERRIDE_BEHAVIOR_INTERRUPT:
if self.engine._can_say_async:
self.engine.stop()
with self.queue_lock:
self.queue.clear()
else:
self.get_logger().warn("Engine does not support interrupting speech")

# Queue the text
if len(msg.text) > 0:
with self.queue_lock:
self.queue.append(msg)

def run(self):
"""
Run the text-to-speech engine.
"""
rate = self.create_rate(self.rate_hz)
while rclpy.ok():
# Sleep
rate.sleep()

# Send a single queued utterance to the text-to-speech engine
if not self.engine.is_speaking():
msg = None
with self.queue_lock:
if len(self.queue) > 0:
msg = self.queue.pop(0)
if msg is not None:
# Process the voice
if len(msg.voice) > 0:
if msg.voice != self.engine.voice_id:
self.engine.voice_id = msg.voice

# Process the speed
if msg.is_slow != self.engine.is_slow:
self.engine.is_slow = msg.is_slow

# Speak the text
if self.engine._can_say_async:
self.engine.say_async(msg.text)
else:
self.engine.say(msg.text)
self.get_logger().info(f"Saying: {msg.text}")


def main():
# Check the arguments
tts_engine = sys.argv[1]
tts_engine = tts_engine.lower()
tts_engine_map = {e.name.lower(): e for e in TextToSpeechEngineType}
if tts_engine not in tts_engine_map:
print(f"Invalid text-to-speech engine: {tts_engine}")
print(f"Options: {list(tts_engine_map.keys())}")
print("Defaulting to gtts")
tts_engine = "gtts"

rclpy.init()

node = TextToSpeechNode(
engine_type=tts_engine_map[tts_engine],
)
node.get_logger().info("Created!")

# Spin in the background, as the node initializes
executor = rclpy.executors.MultiThreadedExecutor(num_threads=4)
spin_thread = threading.Thread(
target=rclpy.spin,
args=(node,),
kwargs={"executor": executor},
daemon=True,
)
spin_thread.start()

# Run text-to-speech
try:
node.initialize()
node.get_logger().info("Running!")
node.run()
except KeyboardInterrupt:
pass

# Spin in the foreground
spin_thread.join()

node.destroy_node()
rclpy.shutdown()


if __name__ == "__main__":
main()
7 changes: 7 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
gTTS
loguru
# numpy 1.23.2 is not required for the web teleop interface, but is required
# for stretch_body. If we diden't include it here, pin would update
# to the latest version of numpy, breaking stretch_body.
numpy==1.23.2
pin
PyAudio==0.2.14
pydub
# TODO: is pyquaternion still needed/used?
pyquaternion
pyttsx3
simpleaudio
sounddevice
5 changes: 5 additions & 0 deletions src/pages/operator/css/MovementRecorder.css
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
justify-content: center;
}

/* The below buttons' CSS likely gets overridden by the TextToSpeech CSS
* for the same class names. It is not an issue now because they use the
* same styles, but may be an issue in the future if we change styles for
* one of the components.
*/
.play-btn {
background-color: var(--selected-color);
display: flex;
Expand Down
4 changes: 3 additions & 1 deletion src/pages/operator/css/Operator.css
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,12 @@
.operator-voice,
.operator-pose-library,
.operator-pose-recorder,
.operator-text-to-speech,
.operator-aruco-markers {
background-color: whitesmoke;
box-shadow: var(--shadow);
height: 6rem;
width: 40rem;
width: 50rem;
display: inline-grid;
align-items: center;
justify-content: center;
Expand Down Expand Up @@ -113,6 +114,7 @@
.operator-voice[hidden],
.operator-pose-library[hidden],
.operator-pose-recorder[hidden],
.operator-text-to-speech[hidden],
.operator-aruco-markers[hidden] {
display: none;
}
Expand Down
60 changes: 60 additions & 0 deletions src/pages/operator/css/TextToSpeech.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#text-to-speech-container {
display: flex;
gap: 15px;
align-items: center;
justify-content: center;
}

.play-btn {
background-color: var(--selected-color);
display: flex;
}

.save-btn {
background-color: var(--btn-turquoise);
display: flex;
}

.stop-btn {
background-color: #cd0b0b;
color: white;
display: flex;
}

.delete-btn {
background-color: var(--btn-red);
display: flex;
}

@media (max-width: 1300px) {
#text-to-speech-container {
font-size: smaller;
}
}

.mobile-text-save-btn {
border-radius: 13px;
border: 5px solid whitesmoke;
font-size: 25px;
padding: 10px;
margin: 1rem;
text-align: center;
background: #06c7e1;
vertical-align: middle;
display: flex;
justify-content: center;
}

.mobile-text-play-btn {
/* width: 97%; */
border-radius: 13px;
border: 5px solid whitesmoke;
font-size: 20px;
padding: 10px;
margin: 0.5rem;
text-align: center;
background: #06c7e1;
vertical-align: middle;
display: flex;
justify-content: center;
}
Loading