-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Operator-to-robot Text-to-Speech (#64)
* [WIP] write a ROS node for TTS * [WIP] pyttsx3 mostly works but sounds awful * created abstract class to allow easy switching of engines * gTTS works and overrides work * [WIP] basic UI arrangement works * UI layout done * Play and Stop function providers work * Finished implementing function providers * Add scroll to too-big dropdown popups, check whether text exists as you type. * Update comments * Update requirements * Fixes from testing * Remove unnecessary logs * Auto-select all text on click * Trim whitespace before storing * Removed unnecessary logs * updated style of dropdown input to be consistent with the regular dropdown * Add launch arg for tts engine * Update color scheme --------- Co-authored-by: Vinitha Ranganeni <[email protected]>
- Loading branch information
1 parent
f0ce7cb
commit c53a7c0
Showing
28 changed files
with
1,209 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# The text to say | ||
string text | ||
|
||
# The voice to use. Valid options for this depend on the engine. | ||
string voice | ||
|
||
# Whether to speak slowly. | ||
bool is_slow | ||
|
||
# If a message is already being spoken, this flag controls what to do with this message: | ||
# add it to a queue to be executed sequentially (Default), or interrupt the | ||
# current message and queue to speak this message (in this case, the old queue gets | ||
# discarded). | ||
uint8 OVERRIDE_BEHAVIOR_QUEUE = 0 | ||
uint8 OVERRIDE_BEHAVIOR_INTERRUPT = 1 | ||
uint8 override_behavior |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
#!/usr/bin/env python3 | ||
|
||
# Standard imports | ||
import sys | ||
import threading | ||
from typing import List, Optional | ||
|
||
# Third-party imports | ||
import rclpy | ||
import sounddevice # suppress ALSA warnings # noqa: F401 | ||
from rclpy.node import Node | ||
from rclpy.qos import QoSProfile, ReliabilityPolicy | ||
|
||
# Local Imports | ||
from stretch_web_teleop.msg import TextToSpeech | ||
from stretch_web_teleop_helpers.text_to_speech_helpers import ( | ||
GTTS, | ||
PyTTSx3, | ||
TextToSpeechEngine, | ||
TextToSpeechEngineType, | ||
) | ||
|
||
|
||
class TextToSpeechNode(Node):
    """
    ROS 2 node that speaks text received on the ``text_to_speech`` topic.

    The subscription callback appends incoming ``TextToSpeech`` messages to an
    internal queue (guarded by ``queue_lock``); ``run`` drains that queue one
    utterance at a time and hands each one to the configured engine.
    """

    def __init__(
        self,
        engine_type: TextToSpeechEngineType = TextToSpeechEngineType.PYTTSX3,
        rate_hz: float = 5.0,
    ):
        """
        Initialize the TextToSpeechNode.

        The engine itself is not constructed here; call ``initialize`` before
        calling ``run``.

        Parameters
        ----------
        engine_type : TextToSpeechEngineType
            The text-to-speech engine to use.
        rate_hz : float
            The rate at which ``run`` polls the queue of pending utterances.
        """
        # Initialize the node
        super().__init__("text_to_speech")

        # Attributes for the text-to-speech engine. ``engine`` stays ``None``
        # until ``initialize`` succeeds.
        self.engine_type = engine_type
        self.engine: Optional[TextToSpeechEngine] = None
        self.initialized = False

        # Attributes shared between the subscription callback and ``run``
        self.rate_hz = rate_hz
        self.queue: List[TextToSpeech] = []
        self.queue_lock = threading.Lock()

        # Create the subscription
        self.create_subscription(
            TextToSpeech,
            "text_to_speech",
            self.text_to_speech_callback,
            QoSProfile(depth=1, reliability=ReliabilityPolicy.RELIABLE),
            callback_group=rclpy.callback_groups.MutuallyExclusiveCallbackGroup(),
        )

    def initialize(self):
        """
        Construct the text-to-speech engine selected by ``engine_type``.

        On success ``engine`` is set and ``initialized`` becomes True. For an
        unsupported engine type an error is logged and both are left unchanged.
        """
        if self.engine_type == TextToSpeechEngineType.PYTTSX3:
            self.engine = PyTTSx3(self.get_logger())
        elif self.engine_type == TextToSpeechEngineType.GTTS:
            self.engine = GTTS(self.get_logger())
        else:
            self.get_logger().error(f"Unsupported text-to-speech {self.engine_type}")
            return
        self.initialized = True

    def text_to_speech_callback(self, msg: TextToSpeech):
        """
        Callback for the text-to-speech topic.

        Handles an interrupt request (if any) and then queues the message,
        unless its text is empty.

        Parameters
        ----------
        msg : TextToSpeech
            The message containing the text to speak.
        """
        self.get_logger().info(f"Received: {msg}")

        # Interrupt if requested
        if msg.override_behavior == TextToSpeech.OVERRIDE_BEHAVIOR_INTERRUPT:
            engine = self.engine
            if engine is None:
                # The node can receive messages before ``initialize`` has
                # built the engine. Nothing can be speaking yet, so only the
                # pending queue needs to be discarded.
                with self.queue_lock:
                    self.queue.clear()
            elif engine._can_say_async:
                engine.stop()
                with self.queue_lock:
                    self.queue.clear()
            else:
                self.get_logger().warn("Engine does not support interrupting speech")

        # Queue the text (empty messages are dropped)
        if msg.text:
            with self.queue_lock:
                self.queue.append(msg)

    def run(self):
        """
        Run the text-to-speech loop until rclpy is shut down.

        Each cycle sleeps on a rate of ``rate_hz`` and, if the engine is idle,
        sends at most one queued utterance to it.

        Raises
        ------
        RuntimeError
            If no engine is available, i.e. ``initialize`` was not called or
            did not succeed.
        """
        if self.engine is None:
            raise RuntimeError(
                "Text-to-speech engine is not initialized; "
                "call initialize() successfully before run()"
            )

        # NOTE(review): the rate presumably only ticks while the node is being
        # spun elsewhere (main spins it in a background thread) -- confirm.
        rate = self.create_rate(self.rate_hz)
        while rclpy.ok():
            # Sleep
            rate.sleep()

            # Only one utterance at a time: wait until the engine is idle
            if self.engine.is_speaking():
                continue

            # Take the oldest pending message, if any
            with self.queue_lock:
                msg = self.queue.pop(0) if self.queue else None
            if msg is None:
                continue

            # Process the voice; an empty voice keeps the current one
            if msg.voice and msg.voice != self.engine.voice_id:
                self.engine.voice_id = msg.voice

            # Process the speed
            if msg.is_slow != self.engine.is_slow:
                self.engine.is_slow = msg.is_slow

            # Speak the text
            if self.engine._can_say_async:
                self.engine.say_async(msg.text)
            else:
                self.engine.say(msg.text)
            self.get_logger().info(f"Saying: {msg.text}")
|
||
|
||
def main():
    """
    Entry point: create the text-to-speech node, spin it, and run the speech loop.

    The engine name is read from the first command-line argument
    (case-insensitive, matched against the ``TextToSpeechEngineType`` member
    names). A missing or unknown name falls back to ``gtts``.
    """
    # Check the arguments
    default_engine = "gtts"
    tts_engine_map = {e.name.lower(): e for e in TextToSpeechEngineType}
    if len(sys.argv) > 1:
        tts_engine = sys.argv[1].lower()
        if tts_engine not in tts_engine_map:
            print(f"Invalid text-to-speech engine: {tts_engine}")
            print(f"Options: {list(tts_engine_map.keys())}")
            print("Defaulting to gtts")
            tts_engine = default_engine
    else:
        # No argument given: fall back instead of failing with an IndexError
        print("No text-to-speech engine specified")
        print(f"Options: {list(tts_engine_map.keys())}")
        print("Defaulting to gtts")
        tts_engine = default_engine

    rclpy.init()

    node = TextToSpeechNode(
        engine_type=tts_engine_map[tts_engine],
    )
    node.get_logger().info("Created!")

    # Spin in the background, as the node initializes
    executor = rclpy.executors.MultiThreadedExecutor(num_threads=4)
    spin_thread = threading.Thread(
        target=rclpy.spin,
        args=(node,),
        kwargs={"executor": executor},
        daemon=True,
    )
    spin_thread.start()

    # Run text-to-speech
    try:
        node.initialize()
        node.get_logger().info("Running!")
        node.run()
    except KeyboardInterrupt:
        pass

    # Spin in the foreground
    spin_thread.join()

    node.destroy_node()
    # The run loop only returns once rclpy.ok() is False, so the context is
    # normally already shut down by the time we get here; skip the second
    # shutdown in that case.
    if rclpy.ok():
        rclpy.shutdown()


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,14 @@ | ||
gTTS | ||
loguru | ||
# numpy 1.23.2 is not required for the web teleop interface, but is required | ||
# for stretch_body. If we didn't include it here, pin would update | ||
# to the latest version of numpy, breaking stretch_body. | ||
numpy==1.23.2 | ||
pin | ||
PyAudio==0.2.14 | ||
pydub | ||
# TODO: is pyquaternion still needed/used? | ||
pyquaternion | ||
pyttsx3 | ||
simpleaudio | ||
sounddevice |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
/* Row holding the text-to-speech controls, centered on both axes. */
#text-to-speech-container {
    display: flex;
    align-items: center;
    justify-content: center;
    gap: 15px;
}

/* All action buttons lay out their content with flexbox. */
.play-btn,
.save-btn,
.stop-btn,
.delete-btn {
    display: flex;
}

/* Per-button colors. */
.play-btn {
    background-color: var(--selected-color);
}

.save-btn {
    background-color: var(--btn-turquoise);
}

.stop-btn {
    background-color: #cd0b0b;
    color: white;
}

.delete-btn {
    background-color: var(--btn-red);
}

/* Use a smaller font on viewports up to 1300px wide. */
@media (max-width: 1300px) {
    #text-to-speech-container {
        font-size: smaller;
    }
}

/* Shared look of the mobile save and play buttons. */
.mobile-text-save-btn,
.mobile-text-play-btn {
    border-radius: 13px;
    border: 5px solid whitesmoke;
    padding: 10px;
    text-align: center;
    background: #06c7e1;
    vertical-align: middle;
    display: flex;
    justify-content: center;
}

/* The save button is the larger of the two. */
.mobile-text-save-btn {
    font-size: 25px;
    margin: 1rem;
}

.mobile-text-play-btn {
    /* width: 97%; */
    font-size: 20px;
    margin: 0.5rem;
}
Oops, something went wrong.