Skip to content

Commit

Permalink
Include cartesia's voice controls on docs + update synthesizer (#674)
Browse files Browse the repository at this point in the history
* docs: include cartesia's voice controls

* fix lint and add padding

* fix: end of sentence issues

* lint and upgrade cartesia

* fix: pydantic casting issue

* lockfile

* fix pinecone lint
  • Loading branch information
sauhardjain authored Aug 7, 2024
1 parent 2ecc631 commit c1148e3
Show file tree
Hide file tree
Showing 7 changed files with 1,068 additions and 926 deletions.
48 changes: 40 additions & 8 deletions docs/open-source/using-synthesizers.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ Vocode currently supports the following synthesizers:

1. Azure (Microsoft)
2. Google
3. Eleven Labs
4. Rime
5. Play.ht
6. GTTS (Google Text-to-Speech)
7. Stream Elements
8. Bark
9. Amazon Polly
3. Cartesia
4. Eleven Labs
5. Rime
6. Play.ht
7. GTTS (Google Text-to-Speech)
8. Stream Elements
9. Bark
10. Amazon Polly

These synthesizers are defined using their respective configuration classes, which are subclasses of the `SynthesizerConfig` class.

Expand Down Expand Up @@ -83,7 +84,38 @@ synthesizer_config=PlayHtSynthesizerConfig.from_telephone_output_device(
...
```

### Example 2: Using Azure in StreamingConversation locally
### Example 2: Using Cartesia's streaming synthesizer

We support Cartesia's [low-latency streaming API](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-websocket) enabled by WebSockets. You can use the `CartesiaSynthesizer` with the `CartesiaSynthesizerConfig` to enable this feature.

#### Telephony

```python
synthesizer_config=CartesiaSynthesizerConfig.from_telephone_output_device(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id=os.getenv("CARTESIA_VOICE_ID"),
)
```

In this example, the `CartesiaSynthesizerConfig.from_telephone_output_device()` method is used to create a configuration object for the Cartesia synthesizer, with the `sampling_rate` and `audio_encoding` set to match telephone audio output.

#### Controlling Speed & Emotions

You can set the `speed` and `emotion` parameters in the `CartesiaSynthesizerConfig` object to control the speed and emotions of the agent's voice! See [this page](https://docs.cartesia.ai/user-guides/voice-control) for more details.

```python
CartesiaSynthesizerConfig(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id=os.getenv("CARTESIA_VOICE_ID"),
experimental_voice_controls={
"speed": "slow",
"emotion": "positivity: high"
}
)
```

### Example 3: Using Azure in StreamingConversation locally

```python
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
Expand Down
1,879 changes: 973 additions & 906 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ groq = { version = "^0.9.0", optional = true }
# Synthesizers
google-cloud-texttospeech = { version = "^2.16.3", optional = true }
pvkoala = { version = "^2.0.1", optional = true }
cartesia = { version = "^1.0.7", optional = true }
cartesia = { version = "^1.0.10", optional = true }

# Transcribers
google-cloud-speech = { version = "^2.26.0", optional = true }
Expand Down
1 change: 1 addition & 0 deletions roadmap.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ Our roadmap; if an issue has been created or the feature is currently being work
- [x] Bark
- [x] [Play.ht](http://Play.ht)
- [x] Eleven Labs
- [x] Cartesia
- [ ] Amazon Polly
- [ ] Resemble.AI

Expand Down
12 changes: 6 additions & 6 deletions vocode/streaming/models/synthesizer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, TypedDict
from typing import Any, Dict, List, Literal, Optional, Union

from pydantic.v1 import validator

Expand Down Expand Up @@ -230,18 +230,18 @@ class PollySynthesizerConfig(SynthesizerConfig, type=SynthesizerType.POLLY.value


DEFAULT_CARTESIA_MODEL_ID = "sonic-english"
DEFAULT_CARTESIA_VOICE_ID = "829ccd10-f8b3-43cd-b8a0-4aeaa81f3b30"
DEFAULT_CARTESIA_VOICE_ID = "f9836c6e-a0bd-460e-9d3c-f7299fa60f94"


class CartesiaVoiceControls(TypedDict):
class CartesiaVoiceControls(BaseModel):
"""See https://docs.cartesia.ai/user-guides/voice-control"""

speed: str
emotion: List[str]
speed: Optional[Union[float, str]] = None
emotion: Optional[List[str]] = None


class CartesiaSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.CARTESIA.value): # type: ignore
api_key: Optional[str] = None
model_id: str = DEFAULT_CARTESIA_MODEL_ID
voice_id: str = DEFAULT_CARTESIA_VOICE_ID
_experimental_voice_controls: Optional[CartesiaVoiceControls] = None
experimental_voice_controls: Optional[CartesiaVoiceControls] = None
39 changes: 38 additions & 1 deletion vocode/streaming/synthesizer/cartesia_synthesizer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import asyncio
import hashlib

from loguru import logger
Expand Down Expand Up @@ -28,7 +29,11 @@ def __init__(

self.cartesia_tts = AsyncCartesia

self._experimental_voice_controls = synthesizer_config._experimental_voice_controls
self._experimental_voice_controls = None
if synthesizer_config.experimental_voice_controls:
self._experimental_voice_controls = (
synthesizer_config.experimental_voice_controls.dict()
)

if synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
match synthesizer_config.sampling_rate:
Expand Down Expand Up @@ -85,6 +90,8 @@ def __init__(
self.client = self.cartesia_tts(api_key=self.api_key)
self.ws = None
self.ctx = None
self.no_more_inputs_task = None
self.no_more_inputs_lock = asyncio.Lock()

async def initialize_ws(self):
if self.ws is None:
Expand All @@ -96,9 +103,25 @@ async def initialize_ctx(self, is_first_text_chunk: bool):
self.ctx = self.ws.context()
else:
if is_first_text_chunk:
if self.no_more_inputs_task:
self.no_more_inputs_task.cancel()
await self.ctx.no_more_inputs()
self.ctx = self.ws.context()

# This workaround is necessary to prevent the last chunk getting delayed.
# In the future, `create_speech_uncached` should be modified to handle this properly by adding a flag to the last chunk.
def refresh_no_more_inputs_task(self):
    """(Re)arm a watchdog that flushes the current Cartesia context.

    Cancels any previously scheduled flush, then schedules a new task that,
    after a 1-second quiet period with no further text chunks, calls
    ``no_more_inputs()`` on the active context so Cartesia emits the final
    audio chunk instead of waiting indefinitely for more input.
    """
    if self.no_more_inputs_task:
        # A flush is already pending from an earlier chunk; cancel it so the
        # quiet-period timer restarts from now.
        self.no_more_inputs_task.cancel()

    async def delayed_no_more_inputs():
        # Quiet period: if no new chunk re-arms the watchdog within 1 second,
        # signal Cartesia that the utterance is complete.
        await asyncio.sleep(1)
        # Lock guards self.ctx while we flush it.
        # NOTE(review): assumes other writers of self.ctx also take
        # self.no_more_inputs_lock — confirm at call sites.
        async with self.no_more_inputs_lock:
            if self.ctx:
                await self.ctx.no_more_inputs()

    self.no_more_inputs_task = asyncio.create_task(delayed_no_more_inputs())

async def create_speech_uncached(
self,
message: BaseMessage,
Expand All @@ -123,6 +146,11 @@ async def create_speech_uncached(
output_format=self.output_format,
_experimental_voice_controls=self._experimental_voice_controls,
)
if not is_sole_text_chunk:
try:
self.refresh_no_more_inputs_task()
except Exception as e:
logger.info(f"Caught error while sending no more inputs: {e}")

async def chunk_generator(context):
buffer = bytearray()
Expand All @@ -143,6 +171,13 @@ async def chunk_generator(context):
)
self.ctx._close()
if buffer:
# pad the leftover buffer with silence
if len(buffer) < chunk_size:
padding_size = chunk_size - len(buffer)
if self.output_format["encoding"] == "pcm_mulaw":
buffer.extend(b"\x7f" * padding_size) # 127 is silence in mu-law
elif self.output_format["encoding"] == "pcm_s16le":
buffer.extend(b"\x00\x00" * padding_size) # 0 is silence in s16le
yield SynthesisResult.ChunkResult(chunk=buffer, is_last_chunk=True)

return SynthesisResult(
Expand All @@ -167,6 +202,8 @@ def get_voice_identifier(cls, synthesizer_config: CartesiaSynthesizerConfig):

async def tear_down(self):
await super().tear_down()
if self.no_more_inputs_task:
self.no_more_inputs_task.cancel()
if self.ctx:
self.ctx._close()
await self.ws.close()
Expand Down
13 changes: 9 additions & 4 deletions vocode/streaming/vector_db/pinecone.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import uuid
from typing import Iterable, List, Optional, Tuple
from typing import Any, Iterable, List, Optional, Tuple, TypeGuard

from langchain.docstore.document import Document
from loguru import logger
Expand All @@ -9,15 +9,20 @@
from vocode.streaming.vector_db.base_vector_db import VectorDB


def is_non_empty_string(value: Any) -> TypeGuard[str]:
return isinstance(value, str) and len(value) > 0


class PineconeDB(VectorDB):
def __init__(self, config: PineconeConfig, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.config = config

self.index_name = self.config.index
self.pinecone_api_key = getenv("PINECONE_API_KEY") or self.config.api_key
if not self.pinecone_api_key:
raise ValueError("Pinecone API key not set")
pinecone_api_key = getenv("PINECONE_API_KEY") or self.config.api_key
if not is_non_empty_string(pinecone_api_key):
raise ValueError("Pinecone API key not set or invalid")
self.pinecone_api_key = pinecone_api_key
self.pinecone_environment = getenv("PINECONE_ENVIRONMENT") or self.config.api_environment
self.pinecone_url = f"https://{self.index_name}.svc.{self.pinecone_environment}.pinecone.io"
self._text_key = "text"
Expand Down

0 comments on commit c1148e3

Please sign in to comment.