From 4009bd1970f3ecc8d8516ab94dada9cdd2ee21c0 Mon Sep 17 00:00:00 2001 From: Yogish Baliga Date: Fri, 10 Jan 2025 09:23:10 -0800 Subject: [PATCH 1/5] OpenAPI spec for audio api request/response --- openapi.yaml | 179 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 152 insertions(+), 27 deletions(-) diff --git a/openapi.yaml b/openapi.yaml index 1a87057..78a6989 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -640,6 +640,48 @@ paths: schema: $ref: "#/components/schemas/ErrorData" deprecated: false + /audio/speech: + post: + tags: ["Audio"] + summary: Create audio generation request + description: Generate audio from input text + operationId: audio-speech + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/AudioSpeechRequest" + responses: + "200": + description: "OK" + content: + application/octet-stream: + schema: + type: string + format: binary + audio/wav: + schema: + type: string + format: binary + audio/mpeg: + schema: + type: string + format: binary + text/event-stream: + schema: + $ref: "#/components/schemas/AudioSpeechStreamResponse" + "400": + description: "BadRequest" + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorData" + "429": + description: "RateLimit" + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorData" components: securitySchemes: bearerAuth: @@ -682,21 +724,21 @@ components: example: Our solar system orbits the Milky Way galaxy at about 515,000 mph example: - { - "title": "Llama", - "text": "The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era.", - } + "title": "Llama", + "text": "The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era.", + } - { - "title": "Panda", - "text": "The giant panda (Ailuropoda melanoleuca), also known as the panda bear or simply panda, is a bear species endemic to China.", - } + "title": "Panda", + "text": "The giant panda (Ailuropoda melanoleuca), also known as the panda bear or simply panda, is a bear species endemic to China.", + } - { - "title": "Guanaco", - "text": "The guanaco is a camelid native to South America, closely related to the llama. Guanacos are one of two wild South American camelids; the other species is the vicuña, which lives at higher elevations.", - } + "title": "Guanaco", + "text": "The guanaco is a camelid native to South America, closely related to the llama. Guanacos are one of two wild South American camelids; the other species is the vicuña, which lives at higher elevations.", + } - { - "title": "Wild Bactrian camel", - "text": "The wild Bactrian camel (Camelus ferus) is an endangered species of camel endemic to Northwest China and southwestern Mongolia.", - } + "title": "Wild Bactrian camel", + "text": "The wild Bactrian camel (Camelus ferus) is an endangered species of camel endemic to Northwest China and southwestern Mongolia.", + } top_n: type: integer description: The number of top results to return. @@ -756,21 +798,21 @@ components: nullable: true example: - { - "index": 0, - "relevance_score": 0.29980177813003117, - "document": - { - "text": '{"title":"Llama","text":"The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era."}', - }, - } + "index": 0, + "relevance_score": 0.29980177813003117, + "document": + { + "text": '{"title":"Llama","text":"The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era."}', + }, + } - { - "index": 2, - "relevance_score": 0.2752447527354349, - "document": - { - "text": '{"title":"Guanaco","text":"The guanaco is a camelid native to South America, closely related to the llama. Guanacos are one of two wild South American camelids; the other species is the vicuña, which lives at higher elevations."}', - }, - } + "index": 2, + "relevance_score": 0.2752447527354349, + "document": + { + "text": '{"title":"Guanaco","text":"The guanaco is a camelid native to South America, closely related to the llama. Guanacos are one of two wild South American camelids; the other species is the vicuña, which lives at higher elevations."}', + }, + } usage: $ref: "#/components/schemas/UsageData" example: @@ -1485,6 +1527,89 @@ components: - $ref: "#/components/schemas/UsageData" - nullable: true + AudioSpeechRequest: + type: object + required: + - model + - input + properties: + model: + description: > + The name of the model to query.
+
+ [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#audio-models) + example: cartesia/audio-v1 + anyOf: + - type: string + - type: string + enum: + - Cartesia/Audio-v1 + input: + type: string + description: Input text to generate the audio for + maxLength: 4096 + voice: + type: string + description: The voice to use for generating the audi. Supported voices are listed [here](https://together.ai/docs/voices) + response_format: + type: string + description: The format of audio output + default: wav + enum: + - mp3 + - wav + - raw + language: + type: string + description: Language of input text + default: en + enum: + - en + response_encoding: + type: string + description: Audio encoding of response + default: pcm_f32le + enum: + - pcm_f32le + - pcm_s16le + - pcm_mulaw + - pcm_alaw + sample_rate: + type: number + default: 44100 + description: Sampling rate to use for the output audio + stream: + type: boolean + default: false + description: "If true, output is streamed for several characters at a time instead of waiting for the full response. The stream terminates with `data: [DONE]`. If false, return the encoded audio as octet stream" + + AudioSpeechStreamResponse: + oneOf: + - $ref: "#/components/schemas/AudioSpeechStreamEvent" + - $ref: "#/components/schemas/StreamSentinel" + + AudioSpeechStreamEvent: + type: object + required: [data] + properties: + data: + $ref: "#/components/schemas/AudioSpeechStreamChunk" + + AudioSpeechStreamChunk: + type: object + required: [object, model, b64] + properties: + object: + type: string + enum: + - audio.tts.chunk + model: + type: string + example: suno/bark + b64: + type: string + description: base64 encoded audio stream + StreamSentinel: type: object required: [data] @@ -2139,4 +2264,4 @@ components: type: number format: float default: 0.0 - description: The ratio of the final learning rate to the peak learning rate + description: The ratio of the final learning rate to the peak learning rate \ No newline at end of file From 465ca184cbc4907fe675eb508309867bb45c05ce Mon Sep 17 00:00:00 2001 From: Yogish Baliga Date: Wed, 22 Jan 2025 15:06:17 -0800 Subject: [PATCH 2/5] adding enum for languages and model name --- openapi.yaml | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/openapi.yaml b/openapi.yaml index 78a6989..32360a7 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -1538,16 +1538,14 @@ components: The name of the model to query.

[See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#audio-models) - example: cartesia/audio-v1 + example: cartesia/sonic anyOf: - - type: string - type: string enum: - - Cartesia/Audio-v1 + - cartesia/sonic input: type: string description: Input text to generate the audio for - maxLength: 4096 voice: type: string description: The voice to use for generating the audi. Supported voices are listed [here](https://together.ai/docs/voices) @@ -1565,6 +1563,20 @@ components: default: en enum: - en + - de + - fr + - es + - hi + - it + - ja + - ko + - nl + - pl + - pt + - ru + - sv + - tr + - zh response_encoding: type: string description: Audio encoding of response @@ -2264,4 +2276,4 @@ components: type: number format: float default: 0.0 - description: The ratio of the final learning rate to the peak learning rate \ No newline at end of file + description: The ratio of the final learning rate to the peak learning rate From 38a4d0e5db940934ee9e69e7612fb9eb9dc67044 Mon Sep 17 00:00:00 2001 From: Yogish Baliga Date: Wed, 22 Jan 2025 15:15:57 -0800 Subject: [PATCH 3/5] updating example --- openapi.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/openapi.yaml b/openapi.yaml index 32360a7..95e16b5 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -1617,11 +1617,10 @@ components: - audio.tts.chunk model: type: string - example: suno/bark + example: cartesia/sonic b64: type: string description: base64 encoded audio stream - StreamSentinel: type: object required: [data] From bb74fc0d7cc7261e39e0fbc0422c01e5a2db0a10 Mon Sep 17 00:00:00 2001 From: Sam Selikoff Date: Thu, 23 Jan 2025 13:21:12 -0500 Subject: [PATCH 4/5] Add some enums for voice --- openapi.yaml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/openapi.yaml b/openapi.yaml index 95e16b5..a3131c3 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -1547,8 +1547,14 @@ components: type: string description: Input text to generate the audio for voice: - type: string - description: The voice to use for generating the audi. Supported voices are listed [here](https://together.ai/docs/voices) + description: The voice to use for generating the audio. [View all supported voices here](https://docs.together.ai/docs/text-to-speech#voices-available). + anyOf: + - type: string + enum: + - laidback woman + - polite man + - storyteller lady + - friendly sidekick response_format: type: string description: The format of audio output From 9ccc2a154039dd4d5a448bba7f76a4b71bf8fd13 Mon Sep 17 00:00:00 2001 From: Sam Selikoff Date: Thu, 23 Jan 2025 13:30:28 -0500 Subject: [PATCH 5/5] voice is required --- openapi.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/openapi.yaml b/openapi.yaml index a3131c3..e25c678 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -1532,6 +1532,7 @@ components: required: - model - input + - voice properties: model: description: >