From df2923016fdaeb3a768821993d033352afcd5786 Mon Sep 17 00:00:00 2001
From: DJDavid98 <djdavid98@protonmail.com>
Date: Mon, 4 Mar 2024 02:04:31 +0100
Subject: [PATCH] Add theoretical PlayHT TTS support

---
 src/js/chat/Chat.tsx                          |   9 +-
 src/js/chat/TtsHealth.tsx                     |  31 +--
 src/js/hooks/use-eleven-labs-tts.ts           | 151 +++++++++++
 src/js/hooks/use-play-ht-tts.ts               | 125 +++++++++
 src/js/hooks/use-tts.ts                       | 246 +++++++++---------
 src/js/model/eleven-labs.ts                   |   2 +-
 src/js/model/play-ht.ts                       |  42 +++
 src/js/model/settings.ts                      |   7 +
 src/js/model/tts.ts                           |  56 ++++
 .../pages/SettingsPageChatOverlay.tsx         | 151 +++++++++--
 src/js/utils/chat-messages.ts                 |   5 +
 src/js/utils/settings.ts                      |  19 ++
 12 files changed, 658 insertions(+), 186 deletions(-)
 create mode 100644 src/js/hooks/use-eleven-labs-tts.ts
 create mode 100644 src/js/hooks/use-play-ht-tts.ts
 create mode 100644 src/js/model/play-ht.ts
 create mode 100644 src/js/model/tts.ts
diff --git a/src/js/chat/Chat.tsx b/src/js/chat/Chat.tsx
index 6b55a6c..51e5f56 100644
--- a/src/js/chat/Chat.tsx
+++ b/src/js/chat/Chat.tsx
@@ -15,24 +15,23 @@ import { ChatMessage } from './ChatMessage';
 import DurationUnitFormat from 'intl-unofficial-duration-unit-format';
 import { useSettings } from '../contexts/settings-context';
 import { SettingName } from '../model/settings';
-import { useTts } from '../hooks/use-tts';
 import { TtsHealth } from './TtsHealth';
 import { RemovableElement } from '../RemovableElement';
 import { RemovableElementId } from '../model/removable-element-id';
+import { useTts } from '../hooks/use-tts';
 
 const MAX_MESSAGE_COUNT = 12;
 
 export const Chat: FC = () => {
     const {
         settings: {
-            [SettingName.ELEVEN_LABS_TOKEN]: elevenLabsToken,
             [SettingName.TTS_ENABLED]: ttsEnabled,
             [SettingName.CHAT_SONG_PREVIEWS]: chatSongPreviews,
         }
     } = useSettings();
     const [messages, setMessages] = useState<Array<DisplayableMessage>>(() => []);
     const socket = useSocket();
-    const tts = useTts(elevenLabsToken, ttsEnabled);
+    const tts = useTts();
     const df = useMemo(() => new DurationUnitFormat('en-US', {
         style: DurationUnitFormat.styles.LONG,
         format: '{days} {hour} {minutes} {seconds}'
@@ -202,8 +201,8 @@ export const Chat: FC = () => {
     }, [addMessage, chatSongPreviews, df, socket, tts]);
 
     return <Fragment>
-        {ttsEnabled && elevenLabsToken &&
-            <RemovableElement id={RemovableElementId.TTS_HEALTH}><TtsHealth token={elevenLabsToken} /></RemovableElement>}
+        {ttsEnabled && tts.limitProviderHook &&
+            <RemovableElement id={RemovableElementId.TTS_HEALTH}><TtsHealth useLimitProvider={tts.limitProviderHook} /></RemovableElement>}
         {messages.map(message => <ChatMessage key={message.id} message={message} />)}
     </Fragment>;
 };
diff --git a/src/js/chat/TtsHealth.tsx b/src/js/chat/TtsHealth.tsx
index 813f987..104379f 100644
--- a/src/js/chat/TtsHealth.tsx
+++ b/src/js/chat/TtsHealth.tsx
@@ -1,14 +1,14 @@
 import { FC, useMemo } from 'react';
-import useSWR from 'swr';
 import * as styles from '../../scss/modules/TtsHealth.module.scss';
+import { TtsLimitProviderHook } from '../model/tts';
 
 export const ELEVEN_LABS_SUBSCRIPTION_ENDPOINT = 'https://api.elevenlabs.io/v1/user/subscription';
 
 export interface TtsHealthProps {
-    token: string;
+    useLimitProvider: TtsLimitProviderHook;
 }
 
-export const TtsHealth: FC<TtsHealthProps> = ({ token }) => {
+export const TtsHealth: FC<TtsHealthProps> = ({ useLimitProvider }) => {
     const pf = useMemo(() => new Intl.NumberFormat('en-US', {
         style: 'percent',
         minimumFractionDigits: 0,
@@ -16,30 +16,7 @@ export const TtsHealth: FC<TtsHealthProps> = ({ token }) => {
     }), []);
     const nf = useMemo(() => new Intl.NumberFormat('en-US'), []);
 
-    const { data: subscriptionData } = useSWR(ELEVEN_LABS_SUBSCRIPTION_ENDPOINT, (key: string) => fetch(key, {
-        method: 'GET',
-        headers: {
-            'xi-api-key': token,
-        },
-    }).then(r => r.json()), {
-        refreshInterval: 60e3,
-        revalidateOnFocus: false,
-        keepPreviousData: true,
-    });
-
-    const limits = useMemo(() => {
-        let maxChars = 0;
-        let usedChars = 0;
-        if (typeof subscriptionData === 'object' && subscriptionData !== null) {
-            if ('character_limit' in subscriptionData && typeof subscriptionData.character_limit === 'number') {
-                maxChars = subscriptionData.character_limit;
-            }
-            if ('character_count' in subscriptionData && typeof subscriptionData.character_count === 'number') {
-                usedChars = subscriptionData.character_count;
-            }
-        }
-        return { maxChars, usedChars };
-    }, [subscriptionData]);
+    const limits = useLimitProvider();
 
     const ttsUsedPercent = limits.maxChars > 0 ? limits.usedChars / limits.maxChars : 1;
     const charsAvailable = limits.maxChars - limits.usedChars;
diff --git a/src/js/hooks/use-eleven-labs-tts.ts b/src/js/hooks/use-eleven-labs-tts.ts
new file mode 100644
index 0000000..e89eb5f
--- /dev/null
+++ b/src/js/hooks/use-eleven-labs-tts.ts
@@ -0,0 +1,196 @@
+import { useCallback, useMemo, useRef } from 'react';
+import { ElevenLabsVoiceData } from '../model/eleven-labs';
+import { mapPronounsToGender, ttsInputToText } from '../utils/chat-messages';
+import useSWR, { useSWRConfig } from 'swr';
+import { ELEVEN_LABS_SUBSCRIPTION_ENDPOINT } from '../chat/TtsHealth';
+import { TtsApi, TtsHookOptions, TtsInput, TtsLimits } from '../model/tts';
+import { useSettings } from '../contexts/settings-context';
+import { SettingName } from '../model/settings';
+
+/**
+ * Limit provider hook for ElevenLabs: polls the subscription endpoint with the configured token
+ * and reports the `character_limit` / `character_count` fields of the response.
+ * Either value stays 0 while the response is missing or does not contain a numeric field.
+ */
+const useElevenLabsLimits = (): TtsLimits => {
+    const { settings: { [SettingName.ELEVEN_LABS_TOKEN]: token } } = useSettings();
+
+    const { data: subscriptionData } = useSWR(ELEVEN_LABS_SUBSCRIPTION_ENDPOINT, async (key: string) => {
+        // No request can be authenticated without a token
+        if (!token) {
+            return null;
+        }
+
+        const r = await fetch(key, {
+            method: 'GET',
+            headers: {
+                'xi-api-key': token,
+            },
+        });
+        return await r.json();
+    }, {
+        refreshInterval: 60e3,
+        revalidateOnFocus: false,
+        keepPreviousData: true,
+    });
+
+    return useMemo((): TtsLimits => {
+        let maxChars = 0;
+        let usedChars = 0;
+        if (typeof subscriptionData === 'object' && subscriptionData !== null) {
+            if ('character_limit' in subscriptionData && typeof subscriptionData.character_limit === 'number') {
+                maxChars = subscriptionData.character_limit;
+            }
+            if ('character_count' in subscriptionData && typeof subscriptionData.character_count === 'number') {
+                usedChars = subscriptionData.character_count;
+            }
+        }
+        return { maxChars, usedChars };
+    }, [subscriptionData]);
+};
+
+/**
+ * ElevenLabs implementation of the TTS API.
+ *
+ * The input queue and the audio player are owned by the caller and only reached through the
+ * callbacks and refs passed in; this hook adds voice selection and the speech synthesis requests.
+ */
+export const useElevenLabsTts = ({
+    token,
+    enabled,
+    lastReadTextRef,
+    currentlyReadingRef,
+    pickQueueItem,
+    requestPlayer,
+    readFirstInQueue,
+    setAudioSource,
+    clearPlayingAudio,
+    clearQueue,
+    clearIdsFromQueue,
+    queueText,
+}: TtsHookOptions): TtsApi => {
+    const voicesRef = useRef<ElevenLabsVoiceData['voices']>([]);
+    // First fetched voice labelled young + narration whose gender matches the input's pronouns
+    const getVoiceId = useCallback((ttsInput?: TtsInput): string | undefined => {
+        const targetGender = mapPronounsToGender(ttsInput?.pronouns);
+        const matchingVoice = voicesRef.current.find(voice => {
+            const { age, gender, 'use case': useCase } = voice.labels;
+            return age === 'young' && gender === targetGender && useCase === 'narration';
+        });
+        return matchingVoice ? matchingVoice.voice_id : undefined;
+    }, []);
+
+    const { mutate } = useSWRConfig();
+
+    /**
+     * Reads out the first queued input, then keeps going until the queue is drained.
+     * Does nothing while disabled, unconfigured, without a matching voice or while audio is playing.
+     *
+     * @param debugSource label of the call site, only used in log output
+     */
+    const processQueue = useCallback(async (debugSource: string): Promise<void> => {
+        if (!enabled) return;
+
+        if (!token) {
+            console.error('Token is missing (%s)', debugSource);
+            return;
+        }
+
+        const firstQueueItem = pickQueueItem();
+        if (!firstQueueItem) {
+            return;
+        }
+
+        const voiceId = getVoiceId(firstQueueItem);
+        if (!voiceId) {
+            console.error('No voice found (%s)', debugSource);
+            return;
+        }
+
+        if (!requestPlayer()) {
+            return;
+        }
+
+        const ttsInput = readFirstInQueue();
+        const textToRead = ttsInputToText(ttsInput, lastReadTextRef.current);
+        let audioUrl: string | undefined;
+        try {
+            const response = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream`, {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json',
+                    'xi-api-key': token,
+                    'accept': 'audio/mpeg'
+                },
+                body: JSON.stringify({ text: textToRead })
+            });
+            // An error response carries no audio, do not hand it to the player
+            if (!response.ok) {
+                throw new Error(`${response.status} ${response.statusText}`);
+            }
+            const audioBlob = await response.blob();
+            void mutate(ELEVEN_LABS_SUBSCRIPTION_ENDPOINT);
+            audioUrl = URL.createObjectURL(audioBlob);
+        } catch (error) {
+            console.error('Error generating audio (%s):', debugSource, error);
+        }
+
+        if (!audioUrl) {
+            // Give up the player claimed above so that later inputs are not rejected as "already playing"
+            clearPlayingAudio();
+            return;
+        }
+
+        await setAudioSource(audioUrl);
+        clearPlayingAudio(ttsInput);
+        return processQueue('ended handler');
+    }, [clearPlayingAudio, enabled, getVoiceId, lastReadTextRef, mutate, pickQueueItem, readFirstInQueue, requestPlayer, setAudioSource, token]);
+
+    const readText = useCallback((text: TtsInput) => {
+        if (!enabled) return;
+
+        queueText(text);
+        void processQueue('readText');
+    }, [enabled, queueText, processQueue]);
+
+    const clearIds = useCallback((clearedIds: string[]) => {
+        clearIdsFromQueue(clearedIds);
+
+        // Resume with the rest of the queue unless something is still being read
+        if (!currentlyReadingRef.current) {
+            void processQueue('clearIds');
+        }
+    }, [clearIdsFromQueue, currentlyReadingRef, processQueue]);
+
+    const fetchVoices = useCallback(() => {
+        if (!enabled || !token || voicesRef.current.length) return;
+
+        fetch('https://api.elevenlabs.io/v1/voices', {
+            method: 'GET',
+            headers: { accept: 'application/json' },
+        }).then(async (r) => {
+            if (!r.ok) {
+                throw new Error(`${r.status} ${r.statusText}`);
+            }
+            // TODO validate the individual voice entries as well
+            const voiceData = await r.json();
+            const voices: unknown = voiceData?.voices;
+            if (!Array.isArray(voices)) {
+                throw new Error('Voice list is missing from the response');
+            }
+            voicesRef.current = voices;
+            void processQueue('voices fetching');
+        }).catch((error: unknown) => {
+            console.error('Error fetching voices:', error);
+        });
+    }, [enabled, processQueue, token]);
+
+    // Memoized so that consumers depending on the API object are not re-triggered on every render
+    return useMemo((): TtsApi => ({
+        readText,
+        fetchVoices,
+        clearQueue,
+        clearIds,
+        limitProviderHook: useElevenLabsLimits,
+    }), [clearIds, clearQueue, fetchVoices, readText]);
+};
diff --git a/src/js/hooks/use-play-ht-tts.ts b/src/js/hooks/use-play-ht-tts.ts
new file mode 100644
index 0000000..598f205
--- /dev/null
+++ b/src/js/hooks/use-play-ht-tts.ts
@@ -0,0 +1,177 @@
+import { useCallback, useMemo, useRef } from 'react';
+import { TtsApi, TtsHookOptions, TtsInput } from '../model/tts';
+import { mapPronounsToGender, ttsInputToText } from '../utils/chat-messages';
+import { PlayHtVoiceData } from '../model/play-ht';
+
+/**
+ * Options of the Play.ht hook: the shared TTS hook options plus the user ID
+ * that is sent along with the token on every API request.
+ */
+export interface PlayHtTtsParams extends TtsHookOptions {
+    userId: string | null;
+}
+
+/**
+ * Play.ht implementation of the TTS API.
+ *
+ * The input queue and the audio player are owned by the caller and only reached through the
+ * callbacks and refs passed in; this hook adds voice selection and the speech synthesis requests.
+ */
+export const usePlayHtTts = ({
+    token,
+    enabled,
+    userId,
+    lastReadTextRef,
+    currentlyReadingRef,
+    requestPlayer,
+    setAudioSource,
+    readFirstInQueue,
+    pickQueueItem,
+    clearPlayingAudio,
+    clearQueue,
+    clearIdsFromQueue,
+    queueText,
+}: PlayHtTtsParams): TtsApi => {
+    const voicesRef = useRef<PlayHtVoiceData[]>([]);
+    // First fetched youth / narrative / neutral-loudness voice whose gender matches the input's pronouns
+    const getVoiceId = useCallback((ttsInput?: TtsInput): string | undefined => {
+        const targetGender = mapPronounsToGender(ttsInput?.pronouns);
+        const matchingVoice = voicesRef.current.find(voice => {
+            const { age, gender, style, loudness } = voice;
+            return age === 'youth' && gender === targetGender && style === 'narrative' && loudness === 'neutral';
+        });
+        return matchingVoice ? matchingVoice.id : undefined;
+    }, []);
+    // Each credential header is only attached when the corresponding value is set
+    const apiAuthHeaders = useMemo(() => {
+        const authHeaders: Record<string, string> = {};
+        if (token) authHeaders['Authorization'] = token;
+        if (userId) authHeaders['X-USER-ID'] = userId;
+        return authHeaders;
+    }, [token, userId]);
+
+    /**
+     * Reads out the first queued input, then keeps going until the queue is drained.
+     * Does nothing while disabled, unconfigured, without a matching voice or while audio is playing.
+     *
+     * @param debugSource label of the call site, only used in log output
+     */
+    const processQueue = useCallback(async (debugSource: string): Promise<void> => {
+        if (!enabled) return;
+
+        if (!token) {
+            console.error('Token is missing (%s)', debugSource);
+            return;
+        }
+
+        if (!userId) {
+            console.error('User ID is missing (%s)', debugSource);
+            return;
+        }
+
+        const firstQueueItem = pickQueueItem();
+        if (!firstQueueItem) {
+            return;
+        }
+
+        const voiceId = getVoiceId(firstQueueItem);
+        if (!voiceId) {
+            console.error('No voice found (%s)', debugSource);
+            return;
+        }
+
+        if (!requestPlayer()) {
+            return;
+        }
+
+        const ttsInput = readFirstInQueue();
+        const textToRead = ttsInputToText(ttsInput, lastReadTextRef.current);
+        let audioUrl: string | null = null;
+        try {
+            // NOTE(review): expects the generated audio's URL in the Location header - confirm against the Play.ht API
+            const response = await fetch('https://api.play.ht/api/v2/tts', {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json',
+                    'Accept': 'audio/mpeg',
+                    ...apiAuthHeaders,
+                },
+                body: JSON.stringify({
+                    text: textToRead,
+                    voice: voiceId,
+                    output_format: 'mp3',
+                    speed: 1,
+                    sample_rate: 44100,
+                    voice_engine: 'PlayHT2.0-turbo'
+                }),
+            });
+
+            if (!response.ok) {
+                throw new Error(`${response.status} ${response.statusText}`);
+            }
+            audioUrl = response.headers.get('Location');
+            if (!audioUrl) {
+                throw new Error('Response did not include an audio URL');
+            }
+        } catch (error) {
+            console.error('Error generating audio:', error);
+        }
+
+        if (!audioUrl) {
+            // Give up the player claimed above so that later inputs are not rejected as "already playing"
+            clearPlayingAudio();
+            return;
+        }
+
+        await setAudioSource(audioUrl);
+        // Playback finished: free the player and move on to the next queued input
+        clearPlayingAudio(ttsInput);
+        return processQueue('ended handler');
+    }, [apiAuthHeaders, clearPlayingAudio, enabled, getVoiceId, lastReadTextRef, pickQueueItem, readFirstInQueue, requestPlayer, setAudioSource, token, userId]);
+
+    const fetchVoices = useCallback(() => {
+        // Voices are of no use without the user ID either, processQueue refuses to run without it
+        if (!enabled || !token || !userId || voicesRef.current.length) return;
+
+        fetch('https://api.play.ht/api/v2/voices', {
+            method: 'GET',
+            headers: { accept: 'application/json', ...apiAuthHeaders },
+        }).then(async (r) => {
+            if (!r.ok) {
+                throw new Error(`${r.status} ${r.statusText}`);
+            }
+            // TODO validate the individual voice entries as well
+            const voices: unknown = await r.json();
+            if (!Array.isArray(voices)) {
+                throw new Error('Voice list is missing from the response');
+            }
+            voicesRef.current = voices;
+            void processQueue('voices fetching');
+        }).catch((error: unknown) => {
+            console.error('Error fetching voices:', error);
+        });
+    }, [apiAuthHeaders, enabled, processQueue, token, userId]);
+
+    const readText = useCallback((text: TtsInput) => {
+        if (!enabled) return;
+
+        queueText(text);
+        void processQueue('readText');
+    }, [enabled, queueText, processQueue]);
+
+    const clearIds = useCallback((clearedIds: string[]) => {
+        clearIdsFromQueue(clearedIds);
+
+        // Resume with the rest of the queue unless something is still being read
+        if (!currentlyReadingRef.current) {
+            void processQueue('clearIds');
+        }
+    }, [clearIdsFromQueue, currentlyReadingRef, processQueue]);
+
+    // Memoized so that consumers depending on the API object are not re-triggered on every render
+    return useMemo((): TtsApi => ({
+        readText,
+        fetchVoices,
+        clearQueue,
+        clearIds,
+    }), [clearIds, clearQueue, fetchVoices, readText]);
+};
diff --git a/src/js/hooks/use-tts.ts b/src/js/hooks/use-tts.ts
index a6e6255..0b6c8e2 100644
--- a/src/js/hooks/use-tts.ts
+++ b/src/js/hooks/use-tts.ts
@@ -1,127 +1,93 @@
-import { useCallback, useEffect, useRef } from 'react';
-import { VoiceData } from '../model/eleven-labs';
-import {
-    mapPronounsToGender,
-    ttsMessageSubstitutions,
-    ttsNameSubstitutions,
-    VoiceGender
-} from '../utils/chat-messages';
-import { useSWRConfig } from 'swr';
-import { ELEVEN_LABS_SUBSCRIPTION_ENDPOINT } from '../chat/TtsHealth';
-
-interface TtsInput {
-    id?: string;
-    name?: string;
-    message: string;
-    pronouns?: string[];
-}
-
-export interface TtsApi {
-    readText: (input: TtsInput) => Promise<void>;
-    clearQueue: VoidFunction;
-    clearIds: (ids: string[]) => void;
-}
-
-export const useTts = (token: string | null, enabled: boolean | null): TtsApi => {
-    const voicesRef = useRef<VoiceData['voices']>([]);
-    const mountedRef = useRef(true);
-    const textQueueRef = useRef<TtsInput[]>([]);
-    const lastReadTextRef = useRef<TtsInput | null>(null);
-    const currentlyReadingRef = useRef<TtsInput | null>(null);
-    const audioPlayerRef = useRef<HTMLAudioElement | null>(null);
-    const getVoice = useCallback((targetGender: VoiceGender) => {
-        return voicesRef.current.find(voice => {
-            const { age, gender, 'use case': useCase } = voice.labels;
-            return age === 'young' && gender === targetGender && useCase === 'narration';
-        });
-    }, []);
-    const { mutate } = useSWRConfig();
+import { TtsApi, TtsInput, TtsProvider } from '../model/tts';
+import { SettingName } from '../model/settings';
+import { useSettings } from '../contexts/settings-context';
+import { useElevenLabsTts } from './use-eleven-labs-tts';
+import { usePlayHtTts } from './use-play-ht-tts';
+import { useCallback, useEffect, useMemo, useRef } from 'react';
+
+const noopTts: TtsApi = { // inert API that useTts returns when TTS is disabled or no known provider is selected
+    clearIds: () => undefined,
+    clearQueue: () => undefined,
+    fetchVoices: () => undefined,
+    readText: () => Promise.resolve(),
+};
 
-    const clearPlayingAudio = useCallback((lastRead: TtsInput | null = null) => {
-        if (audioPlayerRef.current) {
-            audioPlayerRef.current.pause();
-            const currentSource = audioPlayerRef.current.src;
-            if (currentSource) {
-                URL.revokeObjectURL(currentSource);
-            }
-            audioPlayerRef.current = null;
+export const useTts = (): TtsApi => {
+    const {
+        settings: {
+            [SettingName.ELEVEN_LABS_TOKEN]: elevenLabsToken,
+            [SettingName.TTS_ENABLED]: ttsEnabled,
+            [SettingName.PLAY_HT_TOKEN]: playHtToken,
+            [SettingName.PLAY_HT_USER_ID]: playHtUserId,
+            [SettingName.TTS_PROVIDER]: ttsProvider,
         }
-        if (currentlyReadingRef.current) {
-            currentlyReadingRef.current = null;
-        }
-        lastReadTextRef.current = lastRead;
-    }, []);
-
-    const processQueue = useCallback(async (debugSource: string) => {
-        if (!enabled) return;
+    } = useSettings();
+    const lastReadTextRef = useRef<TtsInput | null>(null);
 
-        if (!token) {
-            console.error('Token is missing (%s)', debugSource);
-            return;
-        }
-        if (textQueueRef.current.length === 0) {
-            console.info('TTS queue is empty (%s)', debugSource);
-            return;
-        }
+    const mountedRef = useRef(true);
+    const inputQueueRef = useRef<TtsInput[]>([]);
+    const currentlyReadingRef = useRef<TtsInput | null>(null);
+    const audioPlayerRef = useRef<HTMLAudioElement | null>(null);
 
-        const firstQueueItem = textQueueRef.current[0] as TtsInput;
-        const voice = getVoice(mapPronounsToGender(firstQueueItem?.pronouns));
-        if (!voice) {
-            console.error('No voice found (%s)', debugSource);
-            return;
+    const pickQueueItem = useCallback(() => {
+        if (inputQueueRef.current.length === 0) {
+            console.info('TTS queue is empty');
+            return null;
         }
-        const { voice_id } = voice;
 
+        return inputQueueRef.current[0] as TtsInput;
+    }, []);
 
+    const requestPlayer = useCallback((): boolean => { // claims the audio player, false when one is already in use
         if (audioPlayerRef.current) {
-            console.info('TTS already playing (%s)', debugSource);
-            return;
+            console.info('TTS already playing');
+            return false;
         }
         audioPlayerRef.current = new Audio();
+        return true;
+    }, []); // only refs are touched, so the identity stays stable for hooks listing this as a dependency
 
-        const ttsInput = textQueueRef.current.shift() as TtsInput;
+    const readFirstInQueue = useCallback(() => { // removes the first queued input and marks it as being read
+        const ttsInput = inputQueueRef.current.shift() as TtsInput;
         currentlyReadingRef.current = ttsInput;
-        // Do not repeat the name if it was the last one that was fully read out
-        const textToRead = (ttsInput.name && lastReadTextRef.current?.name !== ttsInput.name ? `${ttsNameSubstitutions(ttsInput.name)}. ` : '') + ttsMessageSubstitutions(ttsInput.message);
-        const response = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${voice_id}/stream`, {
-            method: 'POST',
-            headers: {
-                'Content-Type': 'application/json',
-                'xi-api-key': token,
-                'accept': 'audio/mpeg'
-            },
-            body: JSON.stringify({ text: textToRead })
-        });
-        const audioBlob = await response.blob();
-        void mutate(ELEVEN_LABS_SUBSCRIPTION_ENDPOINT);
-
+        return ttsInput;
+    }, []); // only refs are touched, so the identity stays stable for hooks listing this as a dependency
+    const setAudioSource = useCallback(async (src: string) => { // plays `src`, resolves on the player's 'ended' event
         if (!audioPlayerRef.current) {
             audioPlayerRef.current = new Audio();
         }
 
-        audioPlayerRef.current.src = URL.createObjectURL(audioBlob);
+        audioPlayerRef.current.src = src;
         audioPlayerRef.current.play();
-        return new Promise(resolve => {
+
+        return new Promise<void>(resolve => {
             audioPlayerRef.current?.addEventListener('ended', () => {
-                clearPlayingAudio(ttsInput);
-                processQueue('ended handler').then(resolve);
+                resolve();
             });
         });
-    }, [enabled, token, getVoice, mutate, clearPlayingAudio]);
-
-    const readText = useCallback(async (text: TtsInput) => {
-        if (!enabled) return;
+    }, []); // only refs are touched, so the identity stays stable for hooks listing this as a dependency
 
-        textQueueRef.current.push(text);
-        void processQueue('readText');
-    }, [enabled, processQueue]);
+    const clearPlayingAudio = useCallback((lastRead: TtsInput | null = null) => {
+        const player = audioPlayerRef.current;
+        if (player) {
+            player.pause();
+            // The source may be an object URL, which has to be released explicitly
+            if (player.src) {
+                URL.revokeObjectURL(player.src);
+            }
+            audioPlayerRef.current = null;
+        }
+        // Nothing is being read any more; remember which input (if any) was read last
+        currentlyReadingRef.current = null;
+        lastReadTextRef.current = lastRead;
+    }, [audioPlayerRef, currentlyReadingRef, lastReadTextRef]);
 
     const clearQueue = useCallback(() => {
         mountedRef.current = false;
         clearPlayingAudio();
-    }, [clearPlayingAudio]);
+    }, [clearPlayingAudio, mountedRef]);
 
-    const clearIds = useCallback((clearedIds: string[]) => {
+    const clearIdsFromQueue = useCallback((clearedIds: string[]) => {
         if (clearedIds.length === 0) return;
 
         const clearedIdsSet = new Set(clearedIds);
@@ -132,42 +98,68 @@ export const useTts = (token: string | null, enabled: boolean | null): TtsApi =>
             }
         }
 
-        if (textQueueRef.current.length > 0) {
-            textQueueRef.current = textQueueRef.current.filter(queueItem => {
+        if (inputQueueRef.current.length > 0) {
+            inputQueueRef.current = inputQueueRef.current.filter(queueItem => {
                 return !queueItem.id || !clearedIdsSet.has(queueItem.id);
             });
         }
+    }, [clearPlayingAudio]);
 
-        if (!currentlyReadingRef.current) {
-            void processQueue('clearIds');
-        }
-    }, [clearPlayingAudio, processQueue]);
 
-    useEffect(() => {
-        if (!enabled || !token || voicesRef.current.length) return;
-
-        fetch('https://api.elevenlabs.io/v1/voices', {
-            method: 'GET',
-            headers: { accept: 'application/json' },
-        }).then(async (r) => {
-            // TODO data validation
-            const voiceData = await r.json();
-            voicesRef.current = voiceData['voices'];
-            void processQueue('voices fetching');
-        });
-    }, [enabled, processQueue, token]);
+    const queueText = useCallback((text: TtsInput) => { // appends the input to the end of the shared queue
+        inputQueueRef.current.push(text);
+    }, [inputQueueRef]);
+
+    const elevenLabsTts = useElevenLabsTts({
+        token: elevenLabsToken,
+        enabled: ttsEnabled && ttsProvider === TtsProvider.ELEVEN_LABS,
+        lastReadTextRef,
+        currentlyReadingRef,
+        inputQueueRef,
+        pickQueueItem,
+        requestPlayer,
+        readFirstInQueue,
+        setAudioSource,
+        clearPlayingAudio,
+        clearQueue,
+        clearIdsFromQueue,
+        queueText,
+    });
+    const playHtTts = usePlayHtTts({
+        token: playHtToken,
+        userId: playHtUserId,
+        enabled: ttsEnabled && ttsProvider === TtsProvider.PLAY_HT,
+        lastReadTextRef,
+        currentlyReadingRef,
+        inputQueueRef,
+        pickQueueItem,
+        requestPlayer,
+        readFirstInQueue,
+        setAudioSource,
+        clearPlayingAudio,
+        clearQueue,
+        clearIdsFromQueue,
+        queueText,
+    });
+
+    const chosenApi = useMemo(() => {
+        // Anything other than an enabled TTS with a known provider
+        // ends up with the inert implementation
+        if (!ttsEnabled) {
+            return noopTts;
+        }
+        if (ttsProvider === TtsProvider.PLAY_HT) {
+            return playHtTts;
+        }
+        return ttsProvider === TtsProvider.ELEVEN_LABS ? elevenLabsTts : noopTts;
+    }, [elevenLabsTts, ttsEnabled, playHtTts, ttsProvider]);
 
-    // Clear the queue on unmount
     useEffect(() => {
-        if (!enabled) return;
-
         mountedRef.current = true;
-        return clearQueue;
-    }, [clearQueue, enabled]);
+        chosenApi.fetchVoices();
+        // Clear the queue on unmount
+        return chosenApi.clearQueue;
+    }, [chosenApi, mountedRef]);
 
-    return {
-        readText,
-        clearQueue,
-        clearIds,
-    };
+    return chosenApi;
 };
diff --git a/src/js/model/eleven-labs.ts b/src/js/model/eleven-labs.ts
index f727a0c..24bdb8a 100644
--- a/src/js/model/eleven-labs.ts
+++ b/src/js/model/eleven-labs.ts
@@ -1,4 +1,4 @@
-export interface VoiceData {
+export interface ElevenLabsVoiceData {
     voices: Array<{
         available_for_tiers: string[],
         category: string,
diff --git a/src/js/model/play-ht.ts b/src/js/model/play-ht.ts
new file mode 100644
index 0000000..7b82b97
--- /dev/null
+++ b/src/js/model/play-ht.ts
@@ -0,0 +1,42 @@
+export interface PlayHtVoiceData { // one entry of the voice list fetched from the Play.ht voices endpoint
+    /**
+     * The unique ID for a PlayHT or Cloned Voice.
+     */
+    id: string;
+    /**
+     * The name of the voice.
+     */
+    name: string;
+    language: string; // NOTE(review): presumably a human readable language name - confirm against the Play.ht API
+    language_code: string; // NOTE(review): presumably the locale code belonging to `language` - confirm
+
+    sample?: string | null; // NOTE(review): presumably the URL of an audio sample of the voice - confirm
+    /**
+     * @example `american` `australian` `british` `canadian`
+     */
+    accent?: string | null;
+    /**
+     * @example `adult` `old` `youth`
+     */
+    age?: string | null;
+    /**
+     * @example `female` `male`
+     */
+    gender?: string | null;
+    /**
+     * @example `low` `neutral` `whisper` `high`
+     */
+    loudness?: string | null;
+    /**
+     * @example `narrative` `videos` `training` `advertising` `meditation`
+     */
+    style?: string | null;
+    /**
+     * @example `neutral` `slow` `fast`
+     */
+    tempo?: string | null;
+    /**
+     * @example `gravelly` `smooth` `round` `thick`
+     */
+    texture?: string | null;
+}
diff --git a/src/js/model/settings.ts b/src/js/model/settings.ts
index 10d6527..ed6d9da 100644
--- a/src/js/model/settings.ts
+++ b/src/js/model/settings.ts
@@ -1,6 +1,7 @@
 import { RemovableElementId } from './removable-element-id';
 import { BeatSaberDataSource } from '../beat-saber/BeatSaber';
 import { FC } from 'react';
+import { TtsProvider } from './tts';
 
 export enum SettingName {
     PULSOID_TOKEN = 'pulsoidToken',
@@ -11,6 +12,9 @@ export enum SettingName {
     CHAT_SOCKET_ROOM = 'chatSocketRoom',
     ELEVEN_LABS_TOKEN = 'elevenLabsToken',
     TTS_ENABLED = 'ttsEnabled',
+    TTS_PROVIDER = 'ttsProvider',
+    PLAY_HT_TOKEN = 'playHtToken',
+    PLAY_HT_USER_ID = 'playHtUserId',
     BEAT_SABER_DATA_SOURCE = 'beatSaberDataSource',
     BEAT_SABER_BASE_FONT_SIZE = 'beatSaberBaseFontSize',
     BEAT_SABER_NOTES_PILE_ENABLED = 'beatSaberNotesPileEnabled',
@@ -33,11 +37,14 @@ export interface SettingTypes {
     [SettingName.BEAT_SABER_NOTES_PILE_ENABLED]: boolean;
     [SettingName.ELEVEN_LABS_TOKEN]: string;
     [SettingName.TTS_ENABLED]: boolean;
+    [SettingName.PLAY_HT_TOKEN]: string;
+    [SettingName.PLAY_HT_USER_ID]: string;
     [SettingName.OBS_PRIMARY_SCENE]: string;
     [SettingName.OBS_BRB_SCENE]: string;
     [SettingName.OBS_FAREWELL_SCENE]: string;
     [SettingName.OUTRO_SONG_BSR]: string;
     [SettingName.CHAT_SONG_PREVIEWS]: boolean;
+    [SettingName.TTS_PROVIDER]: TtsProvider;
 }
 
 export type SettingsObject = {
diff --git a/src/js/model/tts.ts b/src/js/model/tts.ts
new file mode 100644
index 0000000..e147c49
--- /dev/null
+++ b/src/js/model/tts.ts
@@ -0,0 +1,83 @@
+import { MutableRefObject } from 'react';
+
+/**
+ * Shared queue/player helpers and state handed to every provider specific TTS hook.
+ */
+export interface TtsHookOptions {
+    /** API token of the provider */
+    token: string | null;
+    /** Provider hooks ignore inputs and skip fetching voices while this is falsy */
+    enabled: boolean | null;
+    /** Returns the first queued input without removing it, `null` when the queue is empty */
+    pickQueueItem: () => TtsInput | null;
+    /** Removes the first queued input, marks it as currently being read and returns it */
+    readFirstInQueue: () => TtsInput;
+    /** Claims the audio player, returns `false` when audio is already playing */
+    requestPlayer: (logOnFail?: boolean) => boolean;
+    /**
+     * @returns promise that resolves when source audio has finished playing
+     */
+    setAudioSource: (src: string) => Promise<void>;
+    /** Stops and releases the audio player and records which input was read last */
+    clearPlayingAudio: (lastRead?: TtsInput | null) => void;
+    clearQueue: VoidFunction;
+    /** Removes the inputs with the given IDs from the queue */
+    clearIdsFromQueue: (clearedIds: string[]) => void;
+    /** Appends an input to the queue */
+    queueText: (text: TtsInput) => void;
+    lastReadTextRef: MutableRefObject<TtsInput | null>;
+    inputQueueRef: MutableRefObject<TtsInput[]>;
+    currentlyReadingRef: MutableRefObject<TtsInput | null>;
+}
+
+/**
+ * A single piece of text to be read out.
+ */
+export interface TtsInput {
+    /** Lets the input be removed from the queue again by ID */
+    id?: string;
+    name?: string;
+    message: string;
+    /** Used by the provider hooks to pick a voice of a matching gender */
+    pronouns?: string[];
+}
+
+/**
+ * What a TTS provider hook exposes to its consumers.
+ */
+export interface TtsApi {
+    readText: (input: TtsInput) => void;
+    clearQueue: VoidFunction;
+    /**
+     * Function to initiate fetching of the voices for the provider (may use promises internally but shall not return them)
+     */
+    fetchVoices: VoidFunction;
+    clearIds: (ids: string[]) => void;
+    /** Hook reporting the usage limits of the provider, absent when the provider has none to report */
+    limitProviderHook?: TtsLimitProviderHook;
+}
+
+export type TtsLimitProviderHook = () => TtsLimits;
+
+/**
+ * Character quota of a TTS provider.
+ */
+export interface TtsLimits {
+    maxChars: number;
+    usedChars: number;
+}
+
+// A regular enum rather than a `const enum`, so the members also exist at runtime
+// for build setups that transpile every file in isolation
+export enum TtsProvider {
+    NOOP = 'NOOP',
+    ELEVEN_LABS = 'ElevenLabs',
+    PLAY_HT = 'PlayHt',
+}
+
+/**
+ * Type guard accepting the `ELEVEN_LABS` and `PLAY_HT` values; `NOOP` is not accepted.
+ */
+export const isValidTtsProvider = (input: string): input is TtsProvider =>
+    input === TtsProvider.ELEVEN_LABS
+    || input === TtsProvider.PLAY_HT;
diff --git a/src/js/settings/pages/SettingsPageChatOverlay.tsx b/src/js/settings/pages/SettingsPageChatOverlay.tsx
index 30ae86a..b76fbec 100644
--- a/src/js/settings/pages/SettingsPageChatOverlay.tsx
+++ b/src/js/settings/pages/SettingsPageChatOverlay.tsx
@@ -1,9 +1,11 @@
 import {
     ChangeEventHandler,
-    FC, FormEvent,
+    FC,
+    FormEvent,
     FormEventHandler,
     useCallback,
-    useEffect, useId,
+    useEffect,
+    useId,
     useRef,
     useState
 } from 'react';
@@ -11,6 +13,8 @@ import { useSettings } from '../../contexts/settings-context';
 import { SettingName } from '../../model/settings';
 import { BeatSaverMap } from '../../BeatSaverMap';
 import { ExternalLink } from '../../ExternalLink';
+import { LabelledInput } from '../LabelledInput';
+import { isValidTtsProvider, TtsProvider } from '../../model/tts';
 
 export const SettingsPageChatOverlay: FC = () => {
     const {
@@ -19,14 +23,20 @@ export const SettingsPageChatOverlay: FC = () => {
             [SettingName.CHAT_SOCKET_ROOM]: room,
             [SettingName.ELEVEN_LABS_TOKEN]: elevenLabsToken,
             [SettingName.TTS_ENABLED]: ttsEnabled,
+            [SettingName.TTS_PROVIDER]: ttsProvider,
+            [SettingName.PLAY_HT_USER_ID]: playHtUserId,
+            [SettingName.PLAY_HT_TOKEN]: playHtToken,
             [SettingName.CHAT_SONG_PREVIEWS]: songPreviews,
         },
         setSetting,
     } = useSettings();
     const [serverUrlInputValue, setServerUrlInputValue] = useState<string>('');
     const [roomInputValue, setRoomInputValue] = useState<string>('');
-    const [tokenInputValue, setTokenInputValue] = useState<string>('');
+    const [elevenLabsTokenInputValue, setElevenLabsTokenInputValue] = useState<string>('');
+    const [playHtTokenInputValue, setPlayHtTokenInputValue] = useState<string>('');
+    const [playHtUserIdInputValue, setPlayHtUserIdInputValue] = useState<string>('');
     const [ttsEnabledInputValue, setTtsEnabledInputValue] = useState(false);
+    const [ttsProviderInputValue, setTtsProviderInputValue] = useState<TtsProvider | null>(null);
     const [songPreviewsInputValue, setSongPreviewsInputValue] = useState(false);
     const firstInputRef = useRef<HTMLInputElement>(null);
 
@@ -36,10 +46,13 @@ export const SettingsPageChatOverlay: FC = () => {
     const updateInputValue = useCallback(() => {
         setServerUrlInputValue(serverUrl ?? '');
         setRoomInputValue(room ?? '');
-        setTokenInputValue(elevenLabsToken ?? '');
+        setElevenLabsTokenInputValue(elevenLabsToken ?? '');
+        setPlayHtTokenInputValue(playHtToken ?? '');
+        setPlayHtUserIdInputValue(playHtUserId ?? '');
         setTtsEnabledInputValue(ttsEnabled ?? false);
+        setTtsProviderInputValue(ttsProvider);
         setSongPreviewsInputValue(songPreviews ?? false);
-    }, [elevenLabsToken, room, serverUrl, songPreviews, ttsEnabled]);
+    }, [elevenLabsToken, playHtToken, playHtUserId, room, serverUrl, songPreviews, ttsEnabled, ttsProvider]);
     const changeHost = useCallback(() => {
         setSetting(SettingName.CHAT_SOCKET_SERVER_URL, serverUrlInputValue.trim());
     }, [serverUrlInputValue, setSetting]);
@@ -52,11 +65,23 @@ export const SettingsPageChatOverlay: FC = () => {
     const handlePathInputChange: ChangeEventHandler<HTMLInputElement> = useCallback((e) => {
         setRoomInputValue(e.target.value);
     }, []);
-    const updateToken = useCallback(() => {
-        setSetting(SettingName.ELEVEN_LABS_TOKEN, tokenInputValue.trim());
-    }, [setSetting, tokenInputValue]);
-    const handleTokenInputChange: ChangeEventHandler<HTMLInputElement> = useCallback((e) => {
-        setTokenInputValue(e.target.value);
+    const updateElevenLabsToken = useCallback(() => {
+        setSetting(SettingName.ELEVEN_LABS_TOKEN, elevenLabsTokenInputValue.trim());
+    }, [setSetting, elevenLabsTokenInputValue]);
+    const handleElevenLabsTokenInputChange: ChangeEventHandler<HTMLInputElement> = useCallback((e) => {
+        setElevenLabsTokenInputValue(e.target.value);
+    }, []);
+    const updatePlayHtToken = useCallback(() => {
+        setSetting(SettingName.PLAY_HT_TOKEN, playHtTokenInputValue.trim());
+    }, [setSetting, playHtTokenInputValue]);
+    const handlePlayHtTokenInputChange: ChangeEventHandler<HTMLInputElement> = useCallback((e) => {
+        setPlayHtTokenInputValue(e.target.value);
+    }, []);
+    const updatePlayHtUserId = useCallback(() => {
+        setSetting(SettingName.PLAY_HT_USER_ID, playHtUserIdInputValue.trim());
+    }, [setSetting, playHtUserIdInputValue]);
+    const handlePlayHtUserIdInputChange: ChangeEventHandler<HTMLInputElement> = useCallback((e) => {
+        setPlayHtUserIdInputValue(e.target.value);
     }, []);
     const changeTtsEnabled = useCallback(() => {
         setSetting(SettingName.TTS_ENABLED, ttsEnabledInputValue);
@@ -70,6 +95,13 @@ export const SettingsPageChatOverlay: FC = () => {
     const handleSongPreviewsInputChange: ChangeEventHandler<HTMLInputElement> = useCallback((e) => {
         setSongPreviewsInputValue(e.target.checked);
     }, []);
+    const updateTtsProvider = useCallback(() => {
+        setSetting(SettingName.TTS_PROVIDER, ttsProviderInputValue);
+    }, [setSetting, ttsProviderInputValue]);
+    const handleTtsProviderInputChange: ChangeEventHandler<HTMLInputElement> = useCallback((e) => {
+        const { value } = e.target;
+        setTtsProviderInputValue(isValidTtsProvider(value) ? value : null);
+    }, []);
 
     const handleSubmit: FormEventHandler<HTMLFormElement> = useCallback((e) => {
         e.preventDefault();
@@ -77,8 +109,11 @@ export const SettingsPageChatOverlay: FC = () => {
         changePath();
         changeSongPreviewsEnabled();
         changeTtsEnabled();
-        updateToken();
-    }, [changeHost, changePath, changeSongPreviewsEnabled, changeTtsEnabled, updateToken]);
+        updateTtsProvider();
+        updateElevenLabsToken();
+        updatePlayHtToken();
+        updatePlayHtUserId();
+    }, [changeHost, changePath, changeSongPreviewsEnabled, changeTtsEnabled, updateElevenLabsToken, updatePlayHtToken, updatePlayHtUserId, updateTtsProvider]);
 
     // Used to reset the state of the page on mount/reset button click
     const init = useCallback((e?: FormEvent) => {
@@ -92,6 +127,9 @@ export const SettingsPageChatOverlay: FC = () => {
         // eslint-disable-next-line react-hooks/exhaustive-deps -- This effect should only be called on mount
     }, []);
 
+    const isPlayHtTts = ttsProviderInputValue === TtsProvider.PLAY_HT;
+    const isElevenLabsTts = ttsProviderInputValue === TtsProvider.ELEVEN_LABS;
+
     return <form onSubmit={handleSubmit} onReset={init}>
         <details open>
             <summary>
@@ -156,26 +194,87 @@ export const SettingsPageChatOverlay: FC = () => {
                 id={ttsInputId}
                 checked={ttsEnabledInputValue}
                 onChange={handleTtsEnabledInputChange}
-            /> <label htmlFor={ttsInputId}>Read chat messages via ElevenLabs TTS</label></p>
+            /> <label htmlFor={ttsInputId}>Read chat messages via TTS</label></p>
             <p>Usernames and messages are processed before being passed to the API, e.g. omitting
                 large numbers from names, adding spaces before capital letters in names, expansion
                 of certain acronyms and slang words, etc.</p>
             <p>Emotes are only read out if they have a hard-coded text representation defined in the
                 overlay.</p>
 
-            <h3>ElevenLabs API Key</h3>
-            <p>Generate a token on <ExternalLink
-                href="https://elevenlabs.io/"
-            >elevenlabs.io</ExternalLink> and paste it below.</p>
-            <p>This requires a registered account with a verified e-mail address.</p>
-            <p>Leave the input empty to remove an already stored API key.</p>
-            <input
-                type="password"
-                name="elevenlabs-api-key"
-                autoComplete="off"
-                value={tokenInputValue}
-                onChange={handleTokenInputChange}
-            />
+            <details open={ttsEnabledInputValue}>
+                <summary>
+                    <h3>Provider</h3>
+                </summary>
+                <p>The overlay supports a few different services for text-to-speech synthesis.</p>
+                <LabelledInput
+                    type="radio"
+                    name="tts-provider"
+                    value={TtsProvider.PLAY_HT}
+                    displayName="PlayHT"
+                    checked={isPlayHtTts}
+                    onChange={handleTtsProviderInputChange}
+                    disabled
+                >
+                    <p>Uses the <ExternalLink href="https://play.ht/">PlayHT</ExternalLink> API
+                        (currently does not work due to API limitations)</p>
+                </LabelledInput>
+                <LabelledInput
+                    type="radio"
+                    name="tts-provider"
+                    value={TtsProvider.ELEVEN_LABS}
+                    displayName="ElevenLabs"
+                    checked={isElevenLabsTts}
+                    onChange={handleTtsProviderInputChange}
+                >
+                    <p>Uses
+                        the <ExternalLink href="https://elevenlabs.io/">ElevenLabs</ExternalLink> API
+                    </p>
+                </LabelledInput>
+
+                {isElevenLabsTts && <>
+                    <h3>ElevenLabs API Key</h3>
+                    <p>Generate a token on <ExternalLink
+                        href="https://elevenlabs.io/"
+                    >elevenlabs.io</ExternalLink> and paste it below.</p>
+                    <p>This requires a registered account with a verified e-mail address.</p>
+                    <p>Leave the input empty to remove an already stored API key.</p>
+                    <input
+                        type="password"
+                        name="elevenlabs-api-key"
+                        autoComplete="off"
+                        value={elevenLabsTokenInputValue}
+                        onChange={handleElevenLabsTokenInputChange}
+                    />
+                </>}
+
+                {isPlayHtTts && <>
+                    <h3>PlayHT User ID</h3>
+                    <p>Visit the <ExternalLink
+                        href="https://play.ht/studio/api-access"
+                    >API Access</ExternalLink> menu, copy the User ID and paste it below.</p>
+                    <p>This requires registering an account.</p>
+                    <input
+                        type="text"
+                        name="playht-user-id"
+                        value={playHtUserIdInputValue}
+                        onChange={handlePlayHtUserIdInputChange}
+                    />
+
+                    <h3>PlayHT Secret Key</h3>
+                    <p>Visit the <ExternalLink
+                        href="https://play.ht/studio/api-access"
+                    >API Access</ExternalLink> menu, generate a secret key if you have not done so
+                        already, then paste it below.</p>
+                    <p>Leave the input empty to remove an already stored secret key.</p>
+                    <input
+                        type="password"
+                        name="playht-token"
+                        autoComplete="off"
+                        value={playHtTokenInputValue}
+                        onChange={handlePlayHtTokenInputChange}
+                    />
+                </>}
+            </details>
         </details>
         <button type="submit">Save</button>
         <button type="reset">Reset</button>
diff --git a/src/js/utils/chat-messages.ts b/src/js/utils/chat-messages.ts
index 83bdd0a..f6fac4f 100644
--- a/src/js/utils/chat-messages.ts
+++ b/src/js/utils/chat-messages.ts
@@ -2,6 +2,7 @@ import { ChatWebsocketMessage } from '../model/app-scoket';
 import { isValid, parseISO } from 'date-fns';
 import { ChatEmoteProps } from '../chat/ChatEmote';
 import { BeatSaverMapProps } from '../BeatSaverMap';
+import { TtsInput } from '../model/tts';
 
 export enum SystemMessageType {
     INFO,
@@ -313,3 +314,7 @@ export const mapPronounsToGender = (pronouns?: string[]): VoiceGender => {
             return 'male';
     }
 };
+
+export const ttsInputToText = (ttsInput: TtsInput, lastRead: TtsInput | null): string =>
+    // Do not repeat the name if it was the last one that was fully read out
+    (ttsInput.name && lastRead?.name !== ttsInput.name ? `${ttsNameSubstitutions(ttsInput.name)}. ` : '') + ttsMessageSubstitutions(ttsInput.message);
diff --git a/src/js/utils/settings.ts b/src/js/utils/settings.ts
index 5696278..b83cae8 100644
--- a/src/js/utils/settings.ts
+++ b/src/js/utils/settings.ts
@@ -1,6 +1,7 @@
 import { isRemovableElementId } from '../model/removable-element-id';
 import { SettingName, SettingsObject, SettingTypes } from '../model/settings';
 import { isValidBeatSaberDataSource } from '../beat-saber/BeatSaber';
+import { isValidTtsProvider } from '../model/tts';
 
 export const settingValidators: { [k in SettingName]: (input: unknown) => SettingTypes[k] | null } = {
     [SettingName.PULSOID_TOKEN]: (input) => {
@@ -63,6 +64,18 @@ export const settingValidators: { [k in SettingName]: (input: unknown) => Settin
         }
         return null;
     },
+    [SettingName.PLAY_HT_USER_ID]: input => {
+        if (typeof input === 'string' && /^[a-z\d]+$/i.test(input)) {
+            return input;
+        }
+        return null;
+    },
+    [SettingName.PLAY_HT_TOKEN]: input => {
+        if (typeof input === 'string' && /^[a-f\d]+$/.test(input)) {
+            return input;
+        }
+        return null;
+    },
     [SettingName.TTS_ENABLED]: input => {
         if (typeof input === 'boolean') {
             return input;
@@ -98,6 +111,12 @@ export const settingValidators: { [k in SettingName]: (input: unknown) => Settin
             return input;
         }
         return null;
+    },
+    [SettingName.TTS_PROVIDER]: input => {
+        if (typeof input === 'string' && isValidTtsProvider(input)) {
+            return input;
+        }
+        return null;
     }
 };