From 4009bd1970f3ecc8d8516ab94dada9cdd2ee21c0 Mon Sep 17 00:00:00 2001
From: Yogish Baliga <yogish@together.ai>
Date: Fri, 10 Jan 2025 09:23:10 -0800
Subject: [PATCH 1/5] OpenAPI spec for audio api request/response

---
 openapi.yaml | 179 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 152 insertions(+), 27 deletions(-)

diff --git a/openapi.yaml b/openapi.yaml
index 1a87057..78a6989 100644
--- a/openapi.yaml
+++ b/openapi.yaml
@@ -640,6 +640,48 @@ paths:
               schema:
                 $ref: "#/components/schemas/ErrorData"
       deprecated: false
+  /audio/speech:
+    post:
+      tags: ["Audio"]
+      summary: Create audio generation request
+      description: Generate audio from input text
+      operationId: audio-speech
+      requestBody:  # NOTE(review): no `required: true`, so OpenAPI treats the body as optional - confirm intent
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/AudioSpeechRequest"
+      responses:
+        "200":
+          description: "OK"
+          content:  # one entry per media type; presumably selected by response_format / stream - TODO confirm
+            application/octet-stream:
+              schema:
+                type: string
+                format: binary
+            audio/wav:
+              schema:
+                type: string
+                format: binary
+            audio/mpeg:
+              schema:
+                type: string
+                format: binary
+            text/event-stream:  # each event matches AudioSpeechStreamResponse (chunk event or StreamSentinel)
+              schema:
+                $ref: "#/components/schemas/AudioSpeechStreamResponse"
+        "400":
+          description: "BadRequest"
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorData"
+        "429":
+          description: "RateLimit"
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorData"
 components:
   securitySchemes:
     bearerAuth:
@@ -682,21 +724,21 @@ components:
                 example: Our solar system orbits the Milky Way galaxy at about 515,000 mph
           example:
             - {
-                "title": "Llama",
-                "text": "The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era.",
-              }
+              "title": "Llama",
+              "text": "The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era.",
+            }
             - {
-                "title": "Panda",
-                "text": "The giant panda (Ailuropoda melanoleuca), also known as the panda bear or simply panda, is a bear species endemic to China.",
-              }
+              "title": "Panda",
+              "text": "The giant panda (Ailuropoda melanoleuca), also known as the panda bear or simply panda, is a bear species endemic to China.",
+            }
             - {
-                "title": "Guanaco",
-                "text": "The guanaco is a camelid native to South America, closely related to the llama. Guanacos are one of two wild South American camelids; the other species is the vicuña, which lives at higher elevations.",
-              }
+              "title": "Guanaco",
+              "text": "The guanaco is a camelid native to South America, closely related to the llama. Guanacos are one of two wild South American camelids; the other species is the vicuña, which lives at higher elevations.",
+            }
             - {
-                "title": "Wild Bactrian camel",
-                "text": "The wild Bactrian camel (Camelus ferus) is an endangered species of camel endemic to Northwest China and southwestern Mongolia.",
-              }
+              "title": "Wild Bactrian camel",
+              "text": "The wild Bactrian camel (Camelus ferus) is an endangered species of camel endemic to Northwest China and southwestern Mongolia.",
+            }
         top_n:
           type: integer
           description: The number of top results to return.
@@ -756,21 +798,21 @@ components:
                     nullable: true
           example:
             - {
-                "index": 0,
-                "relevance_score": 0.29980177813003117,
-                "document":
-                  {
-                    "text": '{"title":"Llama","text":"The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era."}',
-                  },
-              }
+              "index": 0,
+              "relevance_score": 0.29980177813003117,
+              "document":
+                {
+                  "text": '{"title":"Llama","text":"The llama is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era."}',
+                },
+            }
             - {
-                "index": 2,
-                "relevance_score": 0.2752447527354349,
-                "document":
-                  {
-                    "text": '{"title":"Guanaco","text":"The guanaco is a camelid native to South America, closely related to the llama. Guanacos are one of two wild South American camelids; the other species is the vicuña, which lives at higher elevations."}',
-                  },
-              }
+              "index": 2,
+              "relevance_score": 0.2752447527354349,
+              "document":
+                {
+                  "text": '{"title":"Guanaco","text":"The guanaco is a camelid native to South America, closely related to the llama. Guanacos are one of two wild South American camelids; the other species is the vicuña, which lives at higher elevations."}',
+                },
+            }
         usage:
           $ref: "#/components/schemas/UsageData"
           example:
@@ -1485,6 +1527,89 @@ components:
             - $ref: "#/components/schemas/UsageData"
             - nullable: true
 
+    AudioSpeechRequest:  # JSON request body for POST /audio/speech
+      type: object
+      required:
+        - model
+        - input
+      properties:
+        model:
+          description: >
+            The name of the model to query.<br>
+            <br>
+            [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#audio-models)
+          example: cartesia/audio-v1
+          anyOf:
+            - type: string
+            - type: string
+              enum:
+                - Cartesia/Audio-v1
+        input:
+          type: string
+          description: Input text to generate the audio for
+          maxLength: 4096
+        voice:
+          type: string
+          description: The voice to use for generating the audi. Supported voices are listed [here](https://together.ai/docs/voices)
+        response_format:
+          type: string
+          description: The format of audio output
+          default: wav
+          enum:
+            - mp3
+            - wav
+            - raw
+        language:
+          type: string
+          description: Language of input text
+          default: en
+          enum:
+            - en
+        response_encoding:
+          type: string
+          description: Audio encoding of response
+          default: pcm_f32le  # NOTE(review): presumably only meaningful for raw/wav output - confirm
+          enum:
+            - pcm_f32le
+            - pcm_s16le
+            - pcm_mulaw
+            - pcm_alaw
+        sample_rate:
+          type: integer  # whole samples per second; `number` would also admit fractional rates
+          default: 44100
+          description: Sampling rate to use for the output audio
+        stream:
+          type: boolean
+          default: false
+          description: "If true, output is streamed for several characters at a time instead of waiting for the full response. The stream terminates with `data: [DONE]`. If false, return the encoded audio as octet stream"
+
+    AudioSpeechStreamResponse:  # schema of the text/event-stream response of POST /audio/speech
+      oneOf:  # exactly one of: an audio chunk event, or the StreamSentinel
+        - $ref: "#/components/schemas/AudioSpeechStreamEvent"
+        - $ref: "#/components/schemas/StreamSentinel"
+
+    AudioSpeechStreamEvent:  # stream event wrapping one audio chunk
+      type: object
+      required: [data]
+      properties:
+        data:  # the chunk payload; see AudioSpeechStreamChunk
+          $ref: "#/components/schemas/AudioSpeechStreamChunk"
+
+    AudioSpeechStreamChunk:  # payload carried in the `data` field of AudioSpeechStreamEvent
+      type: object
+      required: [object, model, b64]
+      properties:
+        object:
+          type: string
+          enum:  # single allowed value
+            - audio.tts.chunk
+        model:
+          type: string
+          example: suno/bark
+        b64:
+          type: string
+          description: base64 encoded audio stream
+
     StreamSentinel:
       type: object
       required: [data]
@@ -2139,4 +2264,4 @@ components:
           type: number
           format: float
           default: 0.0
-          description: The ratio of the final learning rate to the peak learning rate
+          description: The ratio of the final learning rate to the peak learning rate
\ No newline at end of file

From 465ca184cbc4907fe675eb508309867bb45c05ce Mon Sep 17 00:00:00 2001
From: Yogish Baliga <yogish@together.ai>
Date: Wed, 22 Jan 2025 15:06:17 -0800
Subject: [PATCH 2/5] adding enum for languages and model name; drop input maxLength

---
 openapi.yaml | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/openapi.yaml b/openapi.yaml
index 78a6989..32360a7 100644
--- a/openapi.yaml
+++ b/openapi.yaml
@@ -1538,16 +1538,14 @@ components:
             The name of the model to query.<br>
             <br>
             [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#audio-models)
-          example: cartesia/audio-v1
+          example: cartesia/sonic
           anyOf:
-            - type: string
             - type: string
               enum:
-                - Cartesia/Audio-v1
+                - cartesia/sonic
         input:
           type: string
           description: Input text to generate the audio for
-          maxLength: 4096
         voice:
           type: string
           description: The voice to use for generating the audi. Supported voices are listed [here](https://together.ai/docs/voices)
@@ -1565,6 +1563,20 @@ components:
           default: en
           enum:
             - en
+            - de
+            - fr
+            - es
+            - hi
+            - it
+            - ja
+            - ko
+            - nl
+            - pl
+            - pt
+            - ru
+            - sv
+            - tr
+            - zh
         response_encoding:
           type: string
           description: Audio encoding of response
@@ -2264,4 +2276,4 @@ components:
           type: number
           format: float
           default: 0.0
-          description: The ratio of the final learning rate to the peak learning rate
\ No newline at end of file
+          description: The ratio of the final learning rate to the peak learning rate

From 38a4d0e5db940934ee9e69e7612fb9eb9dc67044 Mon Sep 17 00:00:00 2001
From: Yogish Baliga <yogish@together.ai>
Date: Wed, 22 Jan 2025 15:15:57 -0800
Subject: [PATCH 3/5] updating example

---
 openapi.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/openapi.yaml b/openapi.yaml
index 32360a7..95e16b5 100644
--- a/openapi.yaml
+++ b/openapi.yaml
@@ -1617,11 +1617,10 @@ components:
             - audio.tts.chunk
         model:
           type: string
-          example: suno/bark
+          example: cartesia/sonic
         b64:
           type: string
           description: base64 encoded audio stream
-
     StreamSentinel:
       type: object
       required: [data]

From bb74fc0d7cc7261e39e0fbc0422c01e5a2db0a10 Mon Sep 17 00:00:00 2001
From: Sam Selikoff <sam.selikoff@gmail.com>
Date: Thu, 23 Jan 2025 13:21:12 -0500
Subject: [PATCH 4/5] Add some enums for voice

---
 openapi.yaml | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/openapi.yaml b/openapi.yaml
index 95e16b5..a3131c3 100644
--- a/openapi.yaml
+++ b/openapi.yaml
@@ -1547,8 +1547,14 @@ components:
           type: string
           description: Input text to generate the audio for
         voice:
-          type: string
-          description: The voice to use for generating the audi. Supported voices are listed [here](https://together.ai/docs/voices)
+          description: The voice to use for generating the audio. [View all supported voices here](https://docs.together.ai/docs/text-to-speech#voices-available).
+          anyOf:
+            - type: string
+              enum:
+                - laidback woman
+                - polite man
+                - storyteller lady
+                - friendly sidekick
         response_format:
           type: string
           description: The format of audio output

From 9ccc2a154039dd4d5a448bba7f76a4b71bf8fd13 Mon Sep 17 00:00:00 2001
From: Sam Selikoff <sam.selikoff@gmail.com>
Date: Thu, 23 Jan 2025 13:30:28 -0500
Subject: [PATCH 5/5] voice is required

---
 openapi.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/openapi.yaml b/openapi.yaml
index a3131c3..e25c678 100644
--- a/openapi.yaml
+++ b/openapi.yaml
@@ -1532,6 +1532,7 @@ components:
       required:
         - model
         - input
+        - voice
       properties:
         model:
           description: >