Module livekit.plugins.cartesia
Classes
class ChunkedStream (tts: TTS, text: str, opts: _TTSOptions, session: aiohttp.ClientSession)
-
Synthesize chunked text using the bytes endpoint
Expand source code
class ChunkedStream(tts.ChunkedStream):
    """Synthesize chunked text using the bytes endpoint"""

    def __init__(
        self, tts: TTS, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
    ) -> None:
        super().__init__(tts, text)
        self._opts, self._session = opts, session

    async def _main_task(self) -> None:
        """POST the text to Cartesia's /tts/bytes endpoint and stream frames.

        Emits ``tts.SynthesizedAudio`` events on ``self._event_ch`` as PCM
        chunks arrive, then flushes any buffered remainder. Transport errors
        are mapped onto the plugin's API exception types.
        """
        request_id = utils.shortuuid()
        bstream = utils.audio.AudioByteStream(
            sample_rate=self._opts.sample_rate, num_channels=NUM_CHANNELS
        )
        # Renamed from `json` to avoid shadowing the stdlib json module.
        payload = _to_cartesia_options(self._opts)
        payload["transcript"] = self._input_text

        headers = {
            API_AUTH_HEADER: self._opts.api_key,
            API_VERSION_HEADER: API_VERSION,
        }

        try:
            async with self._session.post(
                "https://api.cartesia.ai/tts/bytes",
                headers=headers,
                json=payload,
            ) as resp:
                resp.raise_for_status()
                async for data, _ in resp.content.iter_chunks():
                    for frame in bstream.write(data):
                        self._event_ch.send_nowait(
                            tts.SynthesizedAudio(
                                request_id=request_id,
                                frame=frame,
                            )
                        )

                # Emit whatever is left buffered below one full frame.
                for frame in bstream.flush():
                    self._event_ch.send_nowait(
                        tts.SynthesizedAudio(request_id=request_id, frame=frame)
                    )
        except asyncio.TimeoutError as e:
            raise APITimeoutError() from e
        except aiohttp.ClientResponseError as e:
            raise APIStatusError(
                message=e.message,
                status_code=e.status,
                request_id=None,
                body=None,
            ) from e
        except Exception as e:
            raise APIConnectionError() from e
Ancestors
- ChunkedStream
- abc.ABC
Inherited members
class TTS (*, model: TTSModels | str = 'sonic-english', language: str = 'en', encoding: TTSEncoding = 'pcm_s16le', voice: str | list[float] = 'c2ac25f9-ecc4-4f56-9095-651354df60c0', speed: TTSVoiceSpeed | float | None = None, emotion: list[TTSVoiceEmotion | str] | None = None, sample_rate: int = 24000, api_key: str | None = None, http_session: aiohttp.ClientSession | None = None)
-
Helper class that provides a standard way to create an ABC using inheritance.
Create a new instance of Cartesia TTS.
See https://docs.cartesia.ai/reference/web-socket/stream-speech/stream-speech for more details on the Cartesia API.
Args
model
:TTSModels
, optional- The Cartesia TTS model to use. Defaults to "sonic-english".
language
:str
, optional- The language code for synthesis. Defaults to "en".
encoding
:TTSEncoding
, optional- The audio encoding format. Defaults to "pcm_s16le".
voice
:str | list[float]
, optional- The voice ID or embedding array.
speed
:TTSVoiceSpeed | float
, optional- Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
emotion
:list[TTSVoiceEmotion]
, optional- Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
sample_rate
:int
, optional- The audio sample rate in Hz. Defaults to 24000.
api_key
:str
, optional- The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
http_session
:aiohttp.ClientSession | None
, optional- An existing aiohttp ClientSession to use. If not provided, a new session will be created.
Expand source code
class TTS(tts.TTS):
    def __init__(
        self,
        *,
        model: TTSModels | str = "sonic-english",
        language: str = "en",
        encoding: TTSEncoding = "pcm_s16le",
        voice: str | list[float] = TTSDefaultVoiceId,
        speed: TTSVoiceSpeed | float | None = None,
        emotion: list[TTSVoiceEmotion | str] | None = None,
        sample_rate: int = 24000,
        api_key: str | None = None,
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        """
        Create a new instance of Cartesia TTS.

        See https://docs.cartesia.ai/reference/web-socket/stream-speech/stream-speech
        for more details on the Cartesia API.

        Args:
            model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-english".
            language (str, optional): The language code for synthesis. Defaults to "en".
            encoding (TTSEncoding, optional): The audio encoding format. Defaults to "pcm_s16le".
            voice (str | list[float], optional): The voice ID or embedding array.
            speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
            emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
            sample_rate (int, optional): The audio sample rate in Hz. Defaults to 24000.
            api_key (str, optional): The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
            http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.

        Raises:
            ValueError: If no API key is provided and CARTESIA_API_KEY is unset.
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=True),
            sample_rate=sample_rate,
            num_channels=NUM_CHANNELS,
        )
        api_key = api_key or os.environ.get("CARTESIA_API_KEY")
        if not api_key:
            raise ValueError("CARTESIA_API_KEY must be set")

        self._opts = _TTSOptions(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
            voice=voice,
            speed=speed,
            emotion=emotion,
            api_key=api_key,
        )
        self._session = http_session

    def _ensure_session(self) -> aiohttp.ClientSession:
        # Lazily fall back to the shared http-context session when the
        # caller did not supply one.
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    def update_options(
        self,
        *,
        model: TTSModels | None = None,
        language: str | None = None,
        voice: str | list[float] | None = None,
        speed: TTSVoiceSpeed | float | None = None,
        emotion: list[TTSVoiceEmotion | str] | None = None,
    ) -> None:
        """
        Update the Text-to-Speech (TTS) configuration options.

        This method allows updating the TTS settings, including model type,
        language, voice, speed, and emotion. If any parameter is not provided,
        the existing value will be retained.

        Args:
            model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-english".
            language (str, optional): The language code for synthesis. Defaults to "en".
            voice (str | list[float], optional): The voice ID or embedding array.
            speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
            emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
        """
        # Explicit `is not None` checks (instead of `x or old`) so that
        # falsy-but-valid values such as speed=0.0 or an empty embedding
        # list are not silently ignored; this also matches how `emotion`
        # was already handled.
        if model is not None:
            self._opts.model = model
        if language is not None:
            self._opts.language = language
        if voice is not None:
            self._opts.voice = voice
        if speed is not None:
            self._opts.speed = speed
        if emotion is not None:
            self._opts.emotion = emotion

    def synthesize(self, text: str) -> "ChunkedStream":
        return ChunkedStream(self, text, self._opts, self._ensure_session())

    def stream(self) -> "SynthesizeStream":
        return SynthesizeStream(self, self._opts, self._ensure_session())
Ancestors
- TTS
- abc.ABC
- EventEmitter
- typing.Generic
Methods
def stream(self) ‑> livekit.plugins.cartesia.tts.SynthesizeStream
def synthesize(self, text: str) ‑> livekit.plugins.cartesia.tts.ChunkedStream
def update_options(self, *, model: TTSModels | None = None, language: str | None = None, voice: str | list[float] | None = None, speed: TTSVoiceSpeed | float | None = None, emotion: list[TTSVoiceEmotion | str] | None = None) ‑> None
-
Update the Text-to-Speech (TTS) configuration options.
This method allows updating the TTS settings, including model type, language, voice, speed, and emotion. If any parameter is not provided, the existing value will be retained.
Args
model
:TTSModels
, optional- The Cartesia TTS model to use. Defaults to "sonic-english".
language
:str
, optional- The language code for synthesis. Defaults to "en".
voice
:str | list[float]
, optional- The voice ID or embedding array.
speed
:TTSVoiceSpeed | float
, optional- Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
emotion
:list[TTSVoiceEmotion]
, optional- Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
Inherited members