Add theoretical PlayHT TTS support

DJDavid98 · Mar 4, 2024 · df29230 · df29230
1 parent 8b2fb5a
commit df29230
Show file tree

Hide file tree

Showing 12 changed files with 658 additions and 186 deletions.
diff --git a/src/js/chat/Chat.tsx b/src/js/chat/Chat.tsx
@@ -15,24 +15,23 @@ import { ChatMessage } from './ChatMessage';
 import DurationUnitFormat from 'intl-unofficial-duration-unit-format';
 import { useSettings } from '../contexts/settings-context';
 import { SettingName } from '../model/settings';
-import { useTts } from '../hooks/use-tts';
 import { TtsHealth } from './TtsHealth';
 import { RemovableElement } from '../RemovableElement';
 import { RemovableElementId } from '../model/removable-element-id';
+import { useTts } from '../hooks/use-tts';
 
 const MAX_MESSAGE_COUNT = 12;
 
 export const Chat: FC = () => {
     const {
         settings: {
-            [SettingName.ELEVEN_LABS_TOKEN]: elevenLabsToken,
             [SettingName.TTS_ENABLED]: ttsEnabled,
             [SettingName.CHAT_SONG_PREVIEWS]: chatSongPreviews,
         }
     } = useSettings();
     const [messages, setMessages] = useState<Array<DisplayableMessage>>(() => []);
     const socket = useSocket();
-    const tts = useTts(elevenLabsToken, ttsEnabled);
+    const tts = useTts();
     const df = useMemo(() => new DurationUnitFormat('en-US', {
         style: DurationUnitFormat.styles.LONG,
         format: '{days} {hour} {minutes} {seconds}'
@@ -202,8 +201,8 @@ export const Chat: FC = () => {
     }, [addMessage, chatSongPreviews, df, socket, tts]);
 
     return <Fragment>
-        {ttsEnabled && elevenLabsToken &&
-            <RemovableElement id={RemovableElementId.TTS_HEALTH}><TtsHealth token={elevenLabsToken} /></RemovableElement>}
+        {ttsEnabled && tts.limitProviderHook &&
+            <RemovableElement id={RemovableElementId.TTS_HEALTH}><TtsHealth useLimitProvider={tts.limitProviderHook} /></RemovableElement>}
         {messages.map(message => <ChatMessage key={message.id} message={message} />)}
     </Fragment>;
 };
diff --git a/src/js/chat/TtsHealth.tsx b/src/js/chat/TtsHealth.tsx
@@ -1,45 +1,22 @@
 import { FC, useMemo } from 'react';
-import useSWR from 'swr';
 import * as styles from '../../scss/modules/TtsHealth.module.scss';
+import { TtsLimitProviderHook } from '../model/tts';
 
 export const ELEVEN_LABS_SUBSCRIPTION_ENDPOINT = 'https://api.elevenlabs.io/v1/user/subscription';
 
 export interface TtsHealthProps {
-    token: string;
+    useLimitProvider: TtsLimitProviderHook;
 }
 
-export const TtsHealth: FC<TtsHealthProps> = ({ token }) => {
+export const TtsHealth: FC<TtsHealthProps> = ({ useLimitProvider }) => {
     const pf = useMemo(() => new Intl.NumberFormat('en-US', {
         style: 'percent',
         minimumFractionDigits: 0,
         maximumFractionDigits: 0
     }), []);
     const nf = useMemo(() => new Intl.NumberFormat('en-US'), []);
 
-    const { data: subscriptionData } = useSWR(ELEVEN_LABS_SUBSCRIPTION_ENDPOINT, (key: string) => fetch(key, {
-        method: 'GET',
-        headers: {
-            'xi-api-key': token,
-        },
-    }).then(r => r.json()), {
-        refreshInterval: 60e3,
-        revalidateOnFocus: false,
-        keepPreviousData: true,
-    });
-
-    const limits = useMemo(() => {
-        let maxChars = 0;
-        let usedChars = 0;
-        if (typeof subscriptionData === 'object' && subscriptionData !== null) {
-            if ('character_limit' in subscriptionData && typeof subscriptionData.character_limit === 'number') {
-                maxChars = subscriptionData.character_limit;
-            }
-            if ('character_count' in subscriptionData && typeof subscriptionData.character_count === 'number') {
-                usedChars = subscriptionData.character_count;
-            }
-        }
-        return { maxChars, usedChars };
-    }, [subscriptionData]);
+    const limits = useLimitProvider();
 
     const ttsUsedPercent = limits.maxChars > 0 ? limits.usedChars / limits.maxChars : 1;
     const charsAvailable = limits.maxChars - limits.usedChars;

diff --git a/src/js/hooks/use-eleven-labs-tts.ts b/src/js/hooks/use-eleven-labs-tts.ts
@@ -0,0 +1,151 @@
+import { useCallback, useMemo, useRef } from 'react';
+import { ElevenLabsVoiceData } from '../model/eleven-labs';
+import { mapPronounsToGender, ttsInputToText } from '../utils/chat-messages';
+import useSWR, { useSWRConfig } from 'swr';
+import { ELEVEN_LABS_SUBSCRIPTION_ENDPOINT } from '../chat/TtsHealth';
+import { TtsApi, TtsHookOptions, TtsInput, TtsLimits } from '../model/tts';
+import { useSettings } from '../contexts/settings-context';
+import { SettingName } from '../model/settings';
+
+/**
+ * Limit-provider hook for ElevenLabs.
+ *
+ * Polls the subscription endpoint through SWR (every 60 seconds, no revalidation on focus,
+ * previous data kept while reloading) and reduces the response to the character quota and the
+ * characters already used. Both values fall back to 0 when the response does not carry them
+ * as numbers, or when no token is configured (the fetcher then resolves to `null` without
+ * touching the network).
+ */
+const useElevenLabsLimits = (): TtsLimits => {
+    const { settings } = useSettings();
+    const apiKey = settings[SettingName.ELEVEN_LABS_TOKEN];
+
+    const subscription = useSWR(
+        ELEVEN_LABS_SUBSCRIPTION_ENDPOINT,
+        async (endpoint: string) => {
+            // Without a token there is nothing to ask the API for
+            if (!apiKey) {
+                return null;
+            }
+
+            const response = await fetch(endpoint, {
+                method: 'GET',
+                headers: { 'xi-api-key': apiKey },
+            });
+            return await response.json();
+        },
+        {
+            refreshInterval: 60e3,
+            revalidateOnFocus: false,
+            keepPreviousData: true,
+        }
+    );
+    const payload = subscription.data;
+
+    return useMemo((): TtsLimits => {
+        const limits: TtsLimits = { maxChars: 0, usedChars: 0 };
+        if (typeof payload !== 'object' || payload === null) {
+            return limits;
+        }
+
+        // Only trust the fields when they are actually numeric
+        if ('character_limit' in payload && typeof payload.character_limit === 'number') {
+            limits.maxChars = payload.character_limit;
+        }
+        if ('character_count' in payload && typeof payload.character_count === 'number') {
+            limits.usedChars = payload.character_count;
+        }
+        return limits;
+    }, [payload]);
+};
+
+/**
+ * TTS provider hook backed by the ElevenLabs API.
+ *
+ * Wires the shared queue/player primitives passed in via {@link TtsHookOptions} to the
+ * ElevenLabs streaming text-to-speech endpoint and exposes the resulting {@link TtsApi}.
+ *
+ * @returns `readText` / `clearIds` / `clearQueue` queue controls, `fetchVoices` to load the
+ * voice list, and `limitProviderHook` for reading the account's character quota.
+ */
+export const useElevenLabsTts = ({
+    token,
+    enabled,
+    lastReadTextRef,
+    currentlyReadingRef,
+    pickQueueItem,
+    requestPlayer,
+    readFirstInQueue,
+    setAudioSource,
+    clearPlayingAudio,
+    clearQueue,
+    clearIdsFromQueue,
+    queueText,
+}: TtsHookOptions): TtsApi => {
+    const voicesRef = useRef<ElevenLabsVoiceData['voices']>([]);
+
+    /** Picks the first loaded young narration voice whose gender matches the input's pronouns. */
+    const getVoiceId = useCallback((ttsInput?: TtsInput): string | undefined => {
+        const targetGender = mapPronounsToGender(ttsInput?.pronouns);
+        const matchingVoice = voicesRef.current.find(voice => {
+            const { age, gender, 'use case': useCase } = voice.labels;
+            return age === 'young' && gender === targetGender && useCase === 'narration';
+        });
+        return matchingVoice ? matchingVoice.voice_id : undefined;
+    }, []);
+
+    const { mutate } = useSWRConfig();
+
+    /**
+     * Reads the next queued item aloud, then recurses until the queue is drained.
+     *
+     * Does nothing when TTS is disabled, the token is missing, the queue is empty, no voice
+     * matches, or the player cannot be claimed. Failures are logged instead of rejecting,
+     * because every caller fires this with `void`.
+     *
+     * @param debugSource label of the call site, only used in log messages
+     */
+    const processQueue = useCallback(async (debugSource: string): Promise<void> => {
+        if (!enabled) return;
+
+        if (!token) {
+            console.error('Token is missing (%s)', debugSource);
+            return;
+        }
+
+        const firstQueueItem = pickQueueItem();
+        if (!firstQueueItem) {
+            return;
+        }
+
+        const voiceId = getVoiceId(firstQueueItem);
+        if (!voiceId) {
+            console.error('No voice found (%s)', debugSource);
+            return;
+        }
+
+        if (!requestPlayer()) {
+            return;
+        }
+
+        const ttsInput = readFirstInQueue();
+        const textToRead = ttsInputToText(ttsInput, lastReadTextRef.current);
+        let audioUrl: string | undefined;
+        let played = false;
+        try {
+            const response = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream`, {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json',
+                    'xi-api-key': token,
+                    'accept': 'audio/mpeg'
+                },
+                body: JSON.stringify({ text: textToRead })
+            });
+            // An error response carries a JSON body, not audio: never hand it to the player
+            if (!response.ok) {
+                throw new Error(`Unexpected response: ${response.status} ${response.statusText}`);
+            }
+            const audioBlob = await response.blob();
+            // The request consumed characters, so refresh the cached subscription data
+            void mutate(ELEVEN_LABS_SUBSCRIPTION_ENDPOINT);
+
+            audioUrl = URL.createObjectURL(audioBlob);
+            await setAudioSource(audioUrl);
+            played = true;
+        } catch (error) {
+            console.error('Error generating audio (%s):', debugSource, error);
+        } finally {
+            // Object URLs pin their blob in memory until revoked
+            if (audioUrl) URL.revokeObjectURL(audioUrl);
+            // NOTE(review): assumes clearPlayingAudio releases the player claimed by
+            // requestPlayer() above, so a failed item cannot block the queue; confirm in use-tts
+            clearPlayingAudio(ttsInput);
+        }
+
+        // After a failure the queue is left for the next readText/clearIds call rather than
+        // being retried immediately, so a failing API is not hit once per queued item
+        if (played) {
+            await processQueue('ended handler');
+        }
+    }, [clearPlayingAudio, enabled, getVoiceId, lastReadTextRef, mutate, pickQueueItem, readFirstInQueue, requestPlayer, setAudioSource, token]);
+
+    /** Queues the given input and kicks the queue; does nothing while TTS is disabled. */
+    const readText = useCallback((text: TtsInput) => {
+        if (!enabled) return;
+
+        queueText(text);
+        void processQueue('readText');
+    }, [enabled, queueText, processQueue]);
+
+    /** Removes the given message IDs from the queue and restarts it if nothing is being read. */
+    const clearIds = useCallback((clearedIds: string[]) => {
+        clearIdsFromQueue(clearedIds);
+
+        if (!currentlyReadingRef.current) {
+            void processQueue('clearIds');
+        }
+    }, [clearIdsFromQueue, currentlyReadingRef, processQueue]);
+
+    /**
+     * Loads the voice list once (skipped when disabled, without a token, or when voices are
+     * already loaded) and then kicks the queue, since items may have been waiting for a voice.
+     */
+    const fetchVoices = useCallback(() => {
+        if (!enabled || !token || voicesRef.current.length) return;
+
+        fetch('https://api.elevenlabs.io/v1/voices', {
+            method: 'GET',
+            headers: { accept: 'application/json' },
+        }).then(async (r) => {
+            if (!r.ok) {
+                throw new Error(`Unexpected response: ${r.status} ${r.statusText}`);
+            }
+            // TODO validate the individual voice entries
+            const voiceData = await r.json();
+            const voices = voiceData?.voices;
+            // Storing a non-array here would make every later .length/.find call throw
+            if (!Array.isArray(voices)) {
+                throw new Error('Voices response does not contain a voices array');
+            }
+            voicesRef.current = voices;
+            void processQueue('voices fetching');
+        }).catch((error: unknown) => {
+            console.error('Error fetching voices:', error);
+        });
+    }, [enabled, processQueue, token]);
+
+    return {
+        readText,
+        fetchVoices,
+        clearQueue,
+        clearIds,
+        limitProviderHook: useElevenLabsLimits,
+    };
+};
diff --git a/src/js/hooks/use-play-ht-tts.ts b/src/js/hooks/use-play-ht-tts.ts
@@ -0,0 +1,125 @@
+import { useCallback, useMemo, useRef } from 'react';
+import { TtsApi, TtsHookOptions, TtsInput } from '../model/tts';
+import { mapPronounsToGender, ttsInputToText } from '../utils/chat-messages';
+import { PlayHtVoiceData } from '../model/play-ht';
+
+/**
+ * Options accepted by `usePlayHtTts`: the shared TTS hook options plus the Play.ht user ID.
+ */
+export interface PlayHtTtsParams extends TtsHookOptions {
+    /**
+     * Play.ht user ID, sent as the `X-USER-ID` request header.
+     * When it is missing, the hook logs an error and does not process the queue.
+     */
+    userId: string | null;
+}
+
+/**
+ * TTS provider hook backed by the Play.ht API.
+ *
+ * Wires the shared queue/player primitives passed in via {@link PlayHtTtsParams} to the
+ * Play.ht text-to-speech endpoint and exposes the resulting {@link TtsApi}. Unlike the
+ * ElevenLabs hook it provides no `limitProviderHook`.
+ *
+ * @returns `readText` / `clearIds` / `clearQueue` queue controls and `fetchVoices` to load
+ * the voice list.
+ */
+export const usePlayHtTts = ({
+    token,
+    enabled,
+    userId,
+    lastReadTextRef,
+    currentlyReadingRef,
+    requestPlayer,
+    setAudioSource,
+    clearPlayingAudio,
+    readFirstInQueue,
+    pickQueueItem,
+    clearQueue,
+    clearIdsFromQueue,
+    queueText,
+}: PlayHtTtsParams): TtsApi => {
+    const voicesRef = useRef<PlayHtVoiceData[]>([]);
+
+    /** Picks the first loaded youth/narrative/neutral voice whose gender matches the input's pronouns. */
+    const getVoiceId = useCallback((ttsInput?: TtsInput): string | undefined => {
+        const targetGender = mapPronounsToGender(ttsInput?.pronouns);
+        const matchingVoice = voicesRef.current.find(voice => {
+            const { age, gender, style, loudness } = voice;
+            return age === 'youth' && gender === targetGender && style === 'narrative' && loudness === 'neutral';
+        });
+        return matchingVoice ? matchingVoice.id : undefined;
+    }, []);
+
+    // Credential headers shared by every Play.ht request; missing credentials are simply omitted
+    const apiAuthHeaders = useMemo(() => {
+        const authHeaders: Record<string, string> = {};
+        if (token) authHeaders['Authorization'] = token;
+        if (userId) authHeaders['X-USER-ID'] = userId;
+        return authHeaders;
+    }, [token, userId]);
+
+    /**
+     * Reads the next queued item aloud, then recurses until the queue is drained.
+     *
+     * Does nothing when TTS is disabled, a credential is missing, the queue is empty, no voice
+     * matches, or the player cannot be claimed. Failures are logged instead of rejecting,
+     * because every caller fires this with `void`.
+     *
+     * @param debugSource label of the call site, only used in log messages
+     */
+    const processQueue = useCallback(async (debugSource: string): Promise<void> => {
+        if (!enabled) return;
+
+        if (!token) {
+            console.error('Token is missing (%s)', debugSource);
+            return;
+        }
+
+        if (!userId) {
+            console.error('User ID is missing (%s)', debugSource);
+            return;
+        }
+
+        const firstQueueItem = pickQueueItem();
+        if (!firstQueueItem) {
+            return;
+        }
+
+        const voiceId = getVoiceId(firstQueueItem);
+        if (!voiceId) {
+            console.error('No voice found (%s)', debugSource);
+            return;
+        }
+
+        if (!requestPlayer()) {
+            return;
+        }
+
+        const ttsInput = readFirstInQueue();
+        const textToRead = ttsInputToText(ttsInput, lastReadTextRef.current);
+        let played = false;
+        try {
+            // Make API request to Play.ht (adjust URL and headers)
+            const response = await fetch('https://api.play.ht/api/v2/tts', {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json',
+                    'Accept': 'audio/mpeg',
+                    ...apiAuthHeaders,
+                },
+                body: JSON.stringify({
+                    text: textToRead,
+                    // Reuse the voice validated above instead of looking it up a second time
+                    voice: voiceId,
+                    output_format: 'mp3',
+                    speed: 1,
+                    sample_rate: 44100,
+                    voice_engine: 'PlayHT2.0-turbo'
+                }),
+            });
+
+            if (!response.ok) {
+                throw new Error(`Unexpected response: ${response.status} ${response.statusText}`);
+            }
+            const audioUrl = response.headers.get('Location');
+            if (!audioUrl) {
+                throw new Error('Response is missing the Location header');
+            }
+
+            await setAudioSource(audioUrl);
+            played = true;
+        } catch (error) {
+            console.error('Error generating audio:', error);
+        } finally {
+            // NOTE(review): mirrors the ElevenLabs hook; assumes clearPlayingAudio releases the
+            // player claimed by requestPlayer() above, so later items can play; confirm in use-tts
+            clearPlayingAudio(ttsInput);
+        }
+
+        // After a failure the queue is left for the next readText/clearIds call rather than
+        // being retried immediately, so a failing API is not hit once per queued item
+        if (played) {
+            await processQueue('ended handler');
+        }
+    }, [apiAuthHeaders, clearPlayingAudio, enabled, getVoiceId, lastReadTextRef, pickQueueItem, readFirstInQueue, requestPlayer, setAudioSource, token, userId]);
+
+    /**
+     * Loads the voice list once (skipped when disabled, without credentials, or when voices are
+     * already loaded) and then kicks the queue, since items may have been waiting for a voice.
+     */
+    const fetchVoices = useCallback(() => {
+        if (!enabled || !token || !userId || voicesRef.current.length) return;
+
+        fetch('https://api.play.ht/api/v2/voices', {
+            method: 'GET',
+            headers: { accept: 'application/json', ...apiAuthHeaders },
+        }).then(async (r) => {
+            if (!r.ok) {
+                throw new Error(`Unexpected response: ${r.status} ${r.statusText}`);
+            }
+            // TODO validate the individual voice entries
+            const voices = await r.json();
+            // Storing a non-array here would make every later .length/.find call throw
+            if (!Array.isArray(voices)) {
+                throw new Error('Voices response is not an array');
+            }
+            voicesRef.current = voices;
+            void processQueue('voices fetching');
+        }).catch((error: unknown) => {
+            console.error('Error fetching voices:', error);
+        });
+    }, [apiAuthHeaders, enabled, processQueue, token, userId]);
+
+    /** Queues the given input and kicks the queue; does nothing while TTS is disabled. */
+    const readText = useCallback((text: TtsInput) => {
+        if (!enabled) return;
+
+        queueText(text);
+        void processQueue('readText');
+    }, [enabled, queueText, processQueue]);
+
+    /** Removes the given message IDs from the queue and restarts it if nothing is being read. */
+    const clearIds = useCallback((clearedIds: string[]) => {
+        clearIdsFromQueue(clearedIds);
+
+        if (!currentlyReadingRef.current) {
+            void processQueue('clearIds');
+        }
+    }, [clearIdsFromQueue, currentlyReadingRef, processQueue]);
+
+    return {
+        readText,
+        fetchVoices,
+        clearQueue,
+        clearIds,
+    };
+};