+
+ {/* Play the text */}
+
+
+
+ {/* Stop the playing text */}
+
+
+
+ {/* If the selected text is a saved text, show a delete button; otherwise, show a save button. */}
+ {selectedIdx != undefined ? (
+
+
+
+ ) : (
+
+
+
+ )}
+
+
+ ) : (
+
+
+ Text-to-speech not yet implemented for mobile
+
+
+ );
+};
diff --git a/src/pages/operator/tsx/static_components/Sidebar.tsx b/src/pages/operator/tsx/static_components/Sidebar.tsx
index 02b6a035..b53a834d 100644
--- a/src/pages/operator/tsx/static_components/Sidebar.tsx
+++ b/src/pages/operator/tsx/static_components/Sidebar.tsx
@@ -108,6 +108,10 @@ export type GlobalOptionsProps = {
displayMovementRecorder: boolean;
setDisplayMovementRecorder: (displayMovementRecorder: boolean) => void;
+ /** If the text-to-speech component should be displayed */
+ displayTextToSpeech: boolean;
+ setDisplayTextToSpeech: (displayTextToSpeech: boolean) => void;
+
/** If the button text labels should be displayed */
displayLabels: boolean;
setDisplayLabels: (displayLabels: boolean) => void;
@@ -152,6 +156,13 @@ const SidebarGlobalOptions = (props: GlobalOptionsProps) => {
}
label="Display movement recorder"
/>
+
+ props.setDisplayTextToSpeech(!props.displayTextToSpeech)
+ }
+ label="Display text-to-speech"
+ />
diff --git a/src/pages/operator/tsx/storage_handler/FirebaseStorageHandler.tsx b/src/pages/operator/tsx/storage_handler/FirebaseStorageHandler.tsx
index a61e154b..882e8b94 100644
--- a/src/pages/operator/tsx/storage_handler/FirebaseStorageHandler.tsx
+++ b/src/pages/operator/tsx/storage_handler/FirebaseStorageHandler.tsx
@@ -43,6 +43,7 @@ export class FirebaseStorageHandler extends StorageHandler {
private mapPoses: { [name: string]: ROSLIB.Transform };
private mapPoseTypes: { [name: string]: string };
private recordings: { [name: string]: RobotPose[] };
+ private textToSpeech: string[];
private markerNames: string[];
private markerIDs: string[];
private markerInfo: ArucoMarkersInfo;
@@ -66,6 +67,7 @@ export class FirebaseStorageHandler extends StorageHandler {
this.mapPoses = {};
this.mapPoseTypes = {};
this.recordings = {};
+ this.textToSpeech = [];
this.markerNames = [];
this.markerIDs = [];
this.markerInfo = {} as ArucoMarkersInfo;
@@ -86,6 +88,7 @@ export class FirebaseStorageHandler extends StorageHandler {
this.mapPoses = userData.map_poses;
this.mapPoseTypes = userData.map_pose_types;
this.recordings = userData.recordings;
+ this.textToSpeech = userData.text_to_speech;
this.onReadyCallback();
})
@@ -254,4 +257,34 @@ export class FirebaseStorageHandler extends StorageHandler {
delete this.recordings[recordingName];
this.writeRecordings(this.recordings);
}
+
+ /**
+ * NOTE: The four text-to-speech functions below have NOT been tested.
+ */
+
+ public getSavedTexts(): string[] {
+ if (!this.textToSpeech) return [];
+ return this.textToSpeech;
+ }
+
+ public saveText(text: string): void {
+ if (this.textToSpeech.includes(text)) return;
+ this.textToSpeech.push(text);
+ this.writeTextToSpeech(this.textToSpeech);
+ }
+
+ private async writeTextToSpeech(textToSpeech: string[]) {
+ this.textToSpeech = textToSpeech;
+
+ let updates: any = {};
+ updates["/users/" + this.uid + "/text_to_speech"] = textToSpeech;
+ return update(ref(this.database), updates);
+ }
+
+ public deleteText(text: string): void {
+ if (!this.textToSpeech.includes(text)) return;
+ const index = this.textToSpeech.indexOf(text);
+ this.textToSpeech.splice(index, 1);
+ this.writeTextToSpeech(this.textToSpeech);
+ }
}
diff --git a/src/pages/operator/tsx/storage_handler/LocalStorageHandler.tsx b/src/pages/operator/tsx/storage_handler/LocalStorageHandler.tsx
index ad4768f2..4ce660f8 100644
--- a/src/pages/operator/tsx/storage_handler/LocalStorageHandler.tsx
+++ b/src/pages/operator/tsx/storage_handler/LocalStorageHandler.tsx
@@ -11,6 +11,7 @@ export class LocalStorageHandler extends StorageHandler {
public static MAP_POSE_NAMES_KEY = "user_map_pose_names";
public static MAP_POSE_TYPES_KEY = "user_map_pose_types";
public static POSE_RECORDING_NAMES_KEY = "user_pose_recording_names";
+ public static TEXT_TO_SPEECH_KEY = "text_to_speech";
constructor(onStorageHandlerReadyCallback: () => void) {
super(onStorageHandlerReadyCallback);
@@ -173,4 +174,33 @@ export class LocalStorageHandler extends StorageHandler {
JSON.stringify(recordingNames),
);
}
+
+ public getSavedTexts(): string[] {
+ const storedJson = localStorage.getItem(
+ LocalStorageHandler.TEXT_TO_SPEECH_KEY,
+ );
+ if (!storedJson) return [];
+ return JSON.parse(storedJson);
+ }
+
+ public saveText(text: string): void {
+ const texts = this.getSavedTexts();
+ if (texts.includes(text)) return;
+ texts.push(text);
+ localStorage.setItem(
+ LocalStorageHandler.TEXT_TO_SPEECH_KEY,
+ JSON.stringify(texts),
+ );
+ }
+
+ public deleteText(text: string): void {
+ const texts = this.getSavedTexts();
+ if (!texts.includes(text)) return;
+ const index = texts.indexOf(text);
+ texts.splice(index, 1);
+ localStorage.setItem(
+ LocalStorageHandler.TEXT_TO_SPEECH_KEY,
+ JSON.stringify(texts),
+ );
+ }
}
diff --git a/src/pages/operator/tsx/storage_handler/StorageHandler.tsx b/src/pages/operator/tsx/storage_handler/StorageHandler.tsx
index b4a85153..79aa7b16 100644
--- a/src/pages/operator/tsx/storage_handler/StorageHandler.tsx
+++ b/src/pages/operator/tsx/storage_handler/StorageHandler.tsx
@@ -133,6 +133,24 @@ export abstract class StorageHandler {
*/
public abstract deleteRecording(recordingName: string): void;
+ /**
+ * Gets all the text to speech messages saved by the user.
+ * @returns list of all saved text to speech messages
+ */
+ public abstract getSavedTexts(): string[];
+
+ /**
+ * Saves a text to speech message to the storage device.
+ * @param text the text to save
+ */
+ public abstract saveText(text: string): void;
+
+ /**
+ * Deletes a text to speech message from the storage device.
+ * @param text the text to delete
+ */
+ public abstract deleteText(text: string): void;
+
/**
* Gets the last saved state from the user's layout, or gets the default
* layout if the user has no saved state.
diff --git a/src/pages/operator/tsx/utils/component_definitions.tsx b/src/pages/operator/tsx/utils/component_definitions.tsx
index b3834dea..06eb9cde 100644
--- a/src/pages/operator/tsx/utils/component_definitions.tsx
+++ b/src/pages/operator/tsx/utils/component_definitions.tsx
@@ -88,6 +88,7 @@ export type ParentComponentDefinition = ComponentDefinition & {
export type LayoutDefinition = ComponentDefinition & {
displayMovementRecorder: boolean;
+ displayTextToSpeech: boolean;
displayLabels: boolean;
actionMode: ActionMode;
children: LayoutGridDefinition[];
diff --git a/src/pages/robot/tsx/index.tsx b/src/pages/robot/tsx/index.tsx
index 33ab8846..3d90be88 100644
--- a/src/pages/robot/tsx/index.tsx
+++ b/src/pages/robot/tsx/index.tsx
@@ -297,6 +297,17 @@ function handleMessage(message: WebRTCMessage) {
break;
case "getStretchTool":
robot.getStretchTool();
+ break;
+ case "playTextToSpeech":
+ robot.playTextToSpeech(
+ message.text,
+ message.override_behavior,
+ message.is_slow,
+ );
+ break;
+ case "stopTextToSpeech":
+ robot.stopTextToSpeech();
+ break;
}
}
diff --git a/src/pages/robot/tsx/robot.tsx b/src/pages/robot/tsx/robot.tsx
index 4ae54c60..a4ed6775 100644
--- a/src/pages/robot/tsx/robot.tsx
+++ b/src/pages/robot/tsx/robot.tsx
@@ -69,6 +69,7 @@ export class Robot extends React.Component {
private subscriptions: ROSLIB.Topic[] = [];
private hasBetaTeleopKitParam: ROSLIB.Param;
private stretchToolParam: ROSLIB.Param;
+ private textToSpeechTopic?: ROSLIB.Topic;
constructor(props: {
jointStateCallback: (
@@ -154,6 +155,7 @@ export class Robot extends React.Component {
this.createMapFrameTFClient();
this.subscribeToHeadTiltTF();
this.subscribeToMapTF();
+ this.createTextToSpeechTopic();
return Promise.resolve();
}
@@ -397,6 +399,14 @@ export class Robot extends React.Component {
});
}
+ createTextToSpeechTopic() {
+ this.textToSpeechTopic = new ROSLIB.Topic({
+ ros: this.ros,
+ name: "/text_to_speech",
+ messageType: "stretch_web_teleop/msg/TextToSpeech",
+ });
+ }
+
createSwitchToNavigationService() {
this.switchToNavigationService = new ROSLIB.Service({
ros: this.ros,
@@ -574,7 +584,7 @@ export class Robot extends React.Component {
z: props.angVel,
},
});
- if (!this.cmdVelTopic) throw "trajectoryClient is undefined";
+ if (!this.cmdVelTopic) throw "cmdVelTopic is undefined";
console.log("Publishing base velocity twist message");
this.cmdVelTopic.publish(twist);
};
@@ -946,4 +956,29 @@ export class Robot extends React.Component {
return inCollision;
}
+
+ playTextToSpeech(
+ text: string,
+ override_behavior: number = 0,
+ is_slow: boolean = false,
+ ) {
+ if (!this.textToSpeechTopic) throw "textToSpeechTopic is undefined";
+ if (override_behavior != 0 && override_behavior != 1) {
+ console.log(
+ "override behavior must be 0 (queue) or 1 (interrupt). Setting to 0.",
+ );
+ override_behavior = 0;
+ }
+ let message = new ROSLIB.Message({
+ text: text,
+ is_slow: is_slow,
+ override_behavior: override_behavior,
+ });
+ this.textToSpeechTopic.publish(message);
+ }
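+
+ // Illustrative usage (hypothetical values, not call sites in this PR): queue a
+ // phrase at normal speed, or interrupt the current speech and speak slowly.
+ //   robot.playTextToSpeech("Hello there", 0, false);
+ //   robot.playTextToSpeech("Stopping now", 1, true);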
+
+ stopTextToSpeech() {
+ // Send an empty string and override behavior 1 to interrupt the current speech
+ this.playTextToSpeech("", 1);
+ }
}
diff --git a/src/shared/commands.tsx b/src/shared/commands.tsx
index 90f3f2f8..585a97a3 100644
--- a/src/shared/commands.tsx
+++ b/src/shared/commands.tsx
@@ -19,7 +19,9 @@ export type cmd =
| PlaybackPosesCommand
| GetBatteryVoltageCommand
| GetHasBetaTeleopKit
- | GetStretchTool;
+ | GetStretchTool
+ | PlayTextToSpeech
+ | StopTextToSpeech;
export interface VelocityCommand {
stop: () => void;
@@ -112,3 +114,14 @@ export interface StopMoveToPregraspCommand {
export interface GetBatteryVoltageCommand {
type: "getBatteryVoltage";
}
+
+export interface PlayTextToSpeech {
+ type: "playTextToSpeech";
+ text: string;
+ override_behavior: number;
+ is_slow: boolean;
+}
+
+export interface StopTextToSpeech {
+ type: "stopTextToSpeech";
+}
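+
+// Illustrative only (nothing in this file sends these): the wire shape of the
+// new commands as they travel over the robot channel.
+//   { type: "playTextToSpeech", text: "Hello", override_behavior: 0, is_slow: false }
+//   { type: "stopTextToSpeech" }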
diff --git a/src/shared/remoterobot.tsx b/src/shared/remoterobot.tsx
index 1f0b9908..b68bcd96 100644
--- a/src/shared/remoterobot.tsx
+++ b/src/shared/remoterobot.tsx
@@ -13,6 +13,8 @@ import {
GetOccupancyGrid,
MoveBaseCommand,
PlaybackPosesCommand,
+ PlayTextToSpeech,
+ StopTextToSpeech,
} from "shared/commands";
import {
ValidJointStateDict,
@@ -219,6 +221,37 @@ export class RemoteRobot extends React.Component<{}, any> {
stopMoveToPregrasp() {
this.robotChannel({ type: "stopMoveToPregrasp" });
}
+
+ /**
+ * Speak the specified text.
+ *
+ * @param text text to speak
+ * @param override_behavior 0 to queue, 1 to interrupt
+ * @param is_slow False for normal speed, True for slow speed
+ */
+ playTextToSpeech(
+ text: string,
+ override_behavior: number = 0,
+ is_slow: boolean = false,
+ ) {
+ let cmd: PlayTextToSpeech = {
+ type: "playTextToSpeech",
+ text: text,
+ override_behavior: override_behavior,
+ is_slow: is_slow,
+ };
+ this.robotChannel(cmd);
+ }
+
+ /**
+ * Stop the text that is currently being spoken.
+ */
+ stopTextToSpeech() {
+ let cmd: StopTextToSpeech = {
+ type: "stopTextToSpeech",
+ };
+ this.robotChannel(cmd);
+ }
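+
+ // Sketch of how an operator-side component might drive these methods
+ // (hypothetical component code, not part of this PR):
+ //   remoteRobot.playTextToSpeech(textAreaValue, /* override_behavior */ 1, /* is_slow */ false);
+ //   remoteRobot.stopTextToSpeech();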
}
class RobotSensors extends React.Component {
diff --git a/stretch_web_teleop_helpers/text_to_speech_helpers.py b/stretch_web_teleop_helpers/text_to_speech_helpers.py
new file mode 100644
index 00000000..fc177df0
--- /dev/null
+++ b/stretch_web_teleop_helpers/text_to_speech_helpers.py
@@ -0,0 +1,311 @@
+# Standard imports
+from abc import ABC, abstractmethod
+from enum import Enum
+from io import BytesIO
+from typing import List, Optional
+
+# Third-party imports
+import pyttsx3
+import simpleaudio
+import sounddevice # suppress ALSA warnings # noqa: F401
+from gtts import gTTS
+from pydub import AudioSegment
+from rclpy.impl.rcutils_logger import RcutilsLogger
+
+
+class TextToSpeechEngineType(Enum):
+ """
+ The TextToSpeechEngineType class enumerates the possible text-to-speech
+ engines.
+ """
+
+ PYTTSX3 = 1
+ GTTS = 2
+
+
+class TextToSpeechEngine(ABC):
+ """
+ Abstract base class for a text-to-speech engine that supports:
+ - Setting the voice ID.
+ - Setting the speed to default or slow.
+ - Asynchronously speaking text.
+ - Interrupting speech.
+ """
+
+ def __init__(self, logger: RcutilsLogger):
+ """
+ Initialize the text-to-speech engine.
+
+ Parameters
+ ----------
+ logger : Logger
+ The logger to use for logging messages.
+ """
+ self._logger = logger
+ self._voice_ids: List[str] = []
+ self._voice_id = ""
+ self._is_slow = False
+
+ # Whether this engine can speak asynchronously.
+ self._can_say_async = False
+
+ @property
+ def voice_ids(self) -> List[str]:
+ """
+ Get the list of voice IDs available for the text-to-speech engine.
+ """
+ return self._voice_ids
+
+ @property
+ def voice_id(self) -> str:
+ """
+ Get the current voice ID for the text-to-speech engine.
+ """
+ return self._voice_id
+
+ @voice_id.setter
+ def voice_id(self, voice_id: str) -> None:
+ """
+ Set the current voice ID for the text-to-speech engine.
+ """
+ if voice_id in self._voice_ids:
+ self._voice_id = voice_id
+ else:
+ self._logger.error(f"Invalid voice ID: {voice_id}")
+
+ @property
+ def is_slow(self) -> bool:
+ """
+ Get whether the text-to-speech engine is set to speak slowly.
+ """
+ return self._is_slow
+
+ @is_slow.setter
+ def is_slow(self, is_slow: bool):
+ """
+ Set whether the text-to-speech engine is set to speak slowly.
+ """
+ self._is_slow = is_slow
+
+ @abstractmethod
+ def say_async(self, text: str):
+ """
+ Speak the given text asynchronously.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def is_speaking(self) -> bool:
+ """
+ Return whether the text-to-speech engine is currently speaking.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def say(self, text: str):
+ """
+ Speak the given text synchronously.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def stop(self):
+ """
+ Stop speaking the current text.
+ """
+ raise NotImplementedError
+
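+# Illustrative caller contract (a sketch, not code used by this PR): given a
+# constructed engine that supports async playback, a caller could do roughly:
+#
+#     engine.is_slow = False
+#     engine.say_async("Hello")      # returns immediately
+#     while engine.is_speaking():
+#         pass                       # poll, or do other work
+#     engine.stop()                  # interrupt playback if still speaking
+#
+# Engines that cannot speak asynchronously (e.g., PyTTSx3 below) should be
+# driven with the blocking `say` instead.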
+
+class PyTTSx3(TextToSpeechEngine):
+ """
+ Text-to-speech engine using pyttsx3. A big benefit of pyttsx3 compared
+ to other engines is that it runs offline. However, its Linux voices tend
+ to be less natural than other engines.
+ """
+
+ def __init__(self, logger: RcutilsLogger):
+ """
+ Initialize the text-to-speech engine.
+
+ Parameters
+ ----------
+ logger : Logger
+ The logger to use for logging messages.
+ """
+ super().__init__(logger)
+ self._engine = pyttsx3.init()
+
+ # Initialize the voices
+ voices = self._engine.getProperty("voices")
+ # Variants documentation: https://espeak.sourceforge.net/languages.html
+ variants = [
+ "m1",
+ "m2",
+ "m3",
+ "m4",
+ "m5",
+ "m6",
+ "m7",
+ "f1",
+ "f2",
+ "f3",
+ "f4",
+ "croak",
+ "whisper",
+ ]
+ for voice in voices:
+ self._voice_ids.append(voice.id)
+ for variant in variants:
+ self._voice_ids.append(voice.id + "+" + variant)
+ self.voice_id = "default"
+
+ # Initialize the speeds
+ self.slow_speed = 100 # wpm
+ self.default_speed = 150 # wpm
+
+ @TextToSpeechEngine.voice_id.setter # type: ignore
+ def voice_id(self, voice_id: str) -> None:
+ """
+ Set the current voice ID for the text-to-speech engine.
+ """
+ self._voice_id = voice_id
+ self._engine.setProperty("voice", voice_id)
+
+ @TextToSpeechEngine.is_slow.setter # type: ignore
+ def is_slow(self, is_slow: bool):
+ """
+ Set whether the text-to-speech engine is set to speak slowly.
+ """
+ self._is_slow = is_slow
+ if is_slow:
+ self._engine.setProperty("rate", self.slow_speed)
+ else:
+ self._engine.setProperty("rate", self.default_speed)
+
+ def say_async(self, text: str):
+ """
+ Speak the given text asynchronously.
+ """
+ self._logger.warn(
+ "Asynchronous speaking is not supported for PyTTSx3 on Linux."
+ )
+
+ def is_speaking(self) -> bool:
+ """
+ Return whether the text-to-speech engine is currently speaking.
+ """
+ # Because asynchronous speaking is not supported in pyttsx3 on Linux,
+ # this function assumes the engine is not speaking whenever it is called.
+ # This holds as long as `is_speaking` and `say` are called from the
+ # same thread.
+ return False
+
+ def say(self, text: str):
+ """
+ Speak the given text synchronously.
+ """
+ self._engine.say(text)
+ self._engine.runAndWait()
+
+ def stop(self):
+ """
+ Stop speaking the current text.
+ """
+ # Although interruptions are nominally supported in pyttsx3
+ # (https://pyttsx3.readthedocs.io/en/latest/engine.html#examples),
+ # in practice, the Linux implementation spins off an ffmpeg process
+ # which can't be interrupted in its current implementation:
+ # https://github.com/nateshmbhat/pyttsx3/blob/5d3755b060a980f48fcaf81df018dd06cbd17a8f/pyttsx3/drivers/espeak.py#L175 # noqa: E501
+ self._logger.warn(
+ "Asynchronous stopping is not supported for PyTTSx3 on Linux."
+ )
+
+
+class GTTS(TextToSpeechEngine):
+ """
+ Text-to-speech engine using gTTS.
+ """
+
+ def __init__(self, logger: RcutilsLogger):
+ """
+ Initialize the text-to-speech engine.
+
+ Parameters
+ ----------
+ logger : Logger
+ The logger to use for logging messages.
+ """
+ super().__init__(logger)
+ self._can_say_async = True
+
+ # Initialize the voices.
+ # https://gtts.readthedocs.io/en/latest/module.html#gtts.lang.tts_langs
+ self._voice_ids = [
+ "com", # Default
+ "us", # United States
+ "com.au", # Australia
+ "co.uk", # United Kingdom
+ "ca", # Canada
+ "co.in", # India
+ "ie", # Ireland
+ "co.za", # South Africa
+ "com.ng", # Nigeria
+ ]
+ self.voice_id = "com"
+ self._playback: Optional[simpleaudio.PlayObject] = None
+
+ def __synthesize_and_play_text(self, text: str) -> None:
+ """
+ Synthesize the given text and start playing it, storing the resulting
+ playback object in `self._playback`.
+
+ Parameters
+ ----------
+ text : str
+ The text to speak.
+ """
+ tts = gTTS(text=text, lang="en", tld=self.voice_id, slow=self.is_slow)
+ fp = BytesIO()
+ tts.write_to_fp(fp)
+ fp.seek(0)
+ audio = AudioSegment.from_file(fp, format="mp3")
+ self._playback = simpleaudio.play_buffer(
+ audio.raw_data, audio.channels, audio.sample_width, audio.frame_rate
+ )
+
+ def say_async(self, text: str):
+ """
+ Speak the given text asynchronously.
+ """
+ self.__synthesize_and_play_text(text)
+
+ def is_speaking(self) -> bool:
+ """
+ Return whether the text-to-speech engine is currently speaking.
+ """
+ if self._playback is None:
+ return False
+ if not self._playback.is_playing():
+ self._playback = None
+ return False
+ return True
+
+ def say(self, text: str):
+ """
+ Speak the given text synchronously.
+ """
+ self.__synthesize_and_play_text(text)
+ self._playback.wait_done()
+ self._playback = None
+
+ def stop(self):
+ """
+ Stop speaking the current text.
+ """
+ if self._playback is not None:
+ self._playback.stop()
+ self._playback = None
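+
+
+# Illustrative usage of the gTTS-backed engine (a sketch; it assumes a ROS 2
+# logger is available, and the node wiring is not part of this file):
+#
+#     engine = GTTS(node.get_logger())
+#     engine.voice_id = "co.uk"
+#     engine.say_async("The arm is about to move.")
+#     if engine.is_speaking():
+#         engine.stop()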