Module livekit.plugins.cartesia
Cartesia plugin for LiveKit Agents
See https://docs.livekit.io/agents/integrations/tts/cartesia/ for more information.
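A minimal usage sketch (assuming the livekit-agents 1.x AgentSession API and a CARTESIA_API_KEY set in the environment; the LLM/VAD wiring a real agent needs is omitted):

from livekit.agents import AgentSession
from livekit.plugins import cartesia

# Both classes fall back to the CARTESIA_API_KEY environment variable
# when api_key is not passed explicitly.
session = AgentSession(
    stt=cartesia.STT(model="ink-whisper", language="en"),
    tts=cartesia.TTS(model="sonic-2", sample_rate=24000),
)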
Classes
class ChunkedStream (*,
tts: TTS,
input_text: str,
conn_options: APIConnectOptions)
class ChunkedStream(tts.ChunkedStream):
    """Synthesize chunked text using the bytes endpoint"""

    def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._tts: TTS = tts
        self._opts = replace(tts._opts)

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        json = _to_cartesia_options(self._opts, streaming=False)
        json["transcript"] = self._input_text

        try:
            async with self._tts._ensure_session().post(
                self._opts.get_http_url("/tts/bytes"),
                headers={
                    API_AUTH_HEADER: self._opts.api_key,
                    API_VERSION_HEADER: API_VERSION,
                },
                json=json,
                timeout=aiohttp.ClientTimeout(total=30, sock_connect=self._conn_options.timeout),
            ) as resp:
                resp.raise_for_status()
                output_emitter.initialize(
                    request_id=utils.shortuuid(),
                    sample_rate=self._opts.sample_rate,
                    num_channels=1,
                    mime_type="audio/pcm",
                )

                async for data, _ in resp.content.iter_chunks():
                    output_emitter.push(data)

                output_emitter.flush()
        except asyncio.TimeoutError:
            raise APITimeoutError() from None
        except aiohttp.ClientResponseError as e:
            raise APIStatusError(
                message=e.message, status_code=e.status, request_id=None, body=None
            ) from None
        except Exception as e:
            raise APIConnectionError() from e
Synthesize chunked text using the bytes endpoint
Ancestors
- livekit.agents.tts.tts.ChunkedStream
- abc.ABC
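ChunkedStream is what TTS.synthesize() returns; it is an async iterable of synthesized audio. A sketch of draining one, assuming the standard livekit.agents.tts.SynthesizedAudio event with a frame attribute:

async def collect_frames(tts_engine: cartesia.TTS, text: str) -> list:
    frames = []
    async for ev in tts_engine.synthesize(text):  # a ChunkedStream under the hood
        frames.append(ev.frame)  # rtc.AudioFrame of decoded PCM audio
    return frames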
class STT (*,
model: STTModels | str = 'ink-whisper',
language: STTLanguages | str = 'en',
encoding: STTEncoding = 'pcm_s16le',
sample_rate: int = 16000,
api_key: str | None = None,
http_session: aiohttp.ClientSession | None = None,
base_url: str = 'https://api.cartesia.ai')
class STT(stt.STT):
    def __init__(
        self,
        *,
        model: STTModels | str = "ink-whisper",
        language: STTLanguages | str = "en",
        encoding: STTEncoding = "pcm_s16le",
        sample_rate: int = 16000,
        api_key: str | None = None,
        http_session: aiohttp.ClientSession | None = None,
        base_url: str = "https://api.cartesia.ai",
    ) -> None:
        """
        Create a new instance of Cartesia STT.

        Args:
            model: The Cartesia STT model to use. Defaults to "ink-whisper".
            language: The language code for recognition. Defaults to "en".
            encoding: The audio encoding format. Defaults to "pcm_s16le".
            sample_rate: The sample rate of the audio in Hz. Defaults to 16000.
            api_key: The Cartesia API key. If not provided, it will be read from
                the CARTESIA_API_KEY environment variable.
            http_session: Optional aiohttp ClientSession to use for requests.
            base_url: The base URL for the Cartesia API.
                Defaults to "https://api.cartesia.ai".

        Raises:
            ValueError: If no API key is provided or found in environment variables.
        """
        super().__init__(capabilities=stt.STTCapabilities(streaming=True, interim_results=False))

        cartesia_api_key = api_key or os.environ.get("CARTESIA_API_KEY")
        if not cartesia_api_key:
            raise ValueError("CARTESIA_API_KEY must be set")

        self._opts = STTOptions(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
            api_key=cartesia_api_key,
            base_url=base_url,
        )
        self._session = http_session
        self._streams = weakref.WeakSet[SpeechStream]()

    def _ensure_session(self) -> aiohttp.ClientSession:
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    async def _recognize_impl(
        self,
        buffer: utils.AudioBuffer,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions,
    ) -> stt.SpeechEvent:
        raise NotImplementedError(
            "Cartesia STT does not support batch recognition, use stream() instead"
        )

    def stream(
        self,
        *,
        language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> SpeechStream:
        """Create a streaming transcription session."""
        config = self._sanitize_options(language=language)
        stream = SpeechStream(
            stt=self,
            opts=config,
            conn_options=conn_options,
        )
        self._streams.add(stream)
        return stream

    def update_options(
        self,
        *,
        model: NotGivenOr[STTModels | str] = NOT_GIVEN,
        language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
    ) -> None:
        """Update STT configuration options."""
        if is_given(model):
            self._opts.model = model
        if is_given(language):
            self._opts.language = language

        # Update all active streams
        for stream in self._streams:
            stream.update_options(
                model=model,
                language=language,
            )

    def _sanitize_options(
        self, *, language: NotGivenOr[STTLanguages | str] = NOT_GIVEN
    ) -> STTOptions:
        """Create a sanitized copy of options with language override if provided."""
        config = STTOptions(
            model=self._opts.model,
            language=self._opts.language,
            encoding=self._opts.encoding,
            sample_rate=self._opts.sample_rate,
            api_key=self._opts.api_key,
            base_url=self._opts.base_url,
        )
        if is_given(language):
            config.language = language
        return config
Create a new instance of Cartesia STT.
Args
model
- The Cartesia STT model to use. Defaults to "ink-whisper".
language
- The language code for recognition. Defaults to "en".
encoding
- The audio encoding format. Defaults to "pcm_s16le".
sample_rate
- The sample rate of the audio in Hz. Defaults to 16000.
api_key
- The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
http_session
- Optional aiohttp ClientSession to use for requests.
base_url
- The base URL for the Cartesia API. Defaults to "https://api.cartesia.ai".
Raises
ValueError
- If no API key is provided or found in environment variables.
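For example, constructing the recognizer with the documented defaults made explicit:

from livekit.plugins import cartesia

stt_engine = cartesia.STT(
    model="ink-whisper",
    language="en",
    encoding="pcm_s16le",
    sample_rate=16000,
    # api_key omitted: read from the CARTESIA_API_KEY environment variable
)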
Ancestors
- livekit.agents.stt.stt.STT
- abc.ABC
- EventEmitter
- typing.Generic
Methods
def stream(self,
*,
language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) -> livekit.plugins.cartesia.stt.SpeechStream
def stream(
    self,
    *,
    language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> SpeechStream:
    """Create a streaming transcription session."""
    config = self._sanitize_options(language=language)
    stream = SpeechStream(
        stt=self,
        opts=config,
        conn_options=conn_options,
    )
    self._streams.add(stream)
    return stream
Create a streaming transcription session.
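Since batch recognition is unsupported (_recognize_impl raises NotImplementedError), all audio goes through this stream. A sketch, assuming frames is an iterable of rtc.AudioFrame matching the configured sample rate:

from livekit.agents import stt

async def transcribe(stt_engine: cartesia.STT, frames) -> None:
    stream = stt_engine.stream()
    for frame in frames:
        stream.push_frame(frame)  # feed PCM audio incrementally
    stream.end_input()            # signal that no more audio will be pushed

    async for ev in stream:
        if ev.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
            print(ev.alternatives[0].text)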
def update_options(self,
*,
model: NotGivenOr[STTModels | str] = NOT_GIVEN,
language: NotGivenOr[STTLanguages | str] = NOT_GIVEN) -> None
def update_options(
    self,
    *,
    model: NotGivenOr[STTModels | str] = NOT_GIVEN,
    language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
) -> None:
    """Update STT configuration options."""
    if is_given(model):
        self._opts.model = model
    if is_given(language):
        self._opts.language = language

    # Update all active streams
    for stream in self._streams:
        stream.update_options(
            model=model,
            language=language,
        )
Update STT configuration options.
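For example, switching recognition to Spanish on the fly; per the loop above, the change is also pushed to every active SpeechStream:

stt_engine.update_options(language="es")  # model stays unchanged (NOT_GIVEN)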
Inherited members
class TTS (*,
api_key: str | None = None,
model: TTSModels | str = 'sonic-2',
language: str = 'en',
encoding: TTSEncoding = 'pcm_s16le',
voice: str | list[float] = '794f9389-aac1-45b6-b726-9d9369183238',
speed: TTSVoiceSpeed | float | None = None,
emotion: list[TTSVoiceEmotion | str] | None = None,
sample_rate: int = 24000,
word_timestamps: bool = True,
http_session: aiohttp.ClientSession | None = None,
tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
text_pacing: tts.SentenceStreamPacer | bool = False,
base_url: str = 'https://api.cartesia.ai')
class TTS(tts.TTS):
    def __init__(
        self,
        *,
        api_key: str | None = None,
        model: TTSModels | str = "sonic-2",
        language: str = "en",
        encoding: TTSEncoding = "pcm_s16le",
        voice: str | list[float] = TTSDefaultVoiceId,
        speed: TTSVoiceSpeed | float | None = None,
        emotion: list[TTSVoiceEmotion | str] | None = None,
        sample_rate: int = 24000,
        word_timestamps: bool = True,
        http_session: aiohttp.ClientSession | None = None,
        tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
        text_pacing: tts.SentenceStreamPacer | bool = False,
        base_url: str = "https://api.cartesia.ai",
    ) -> None:
        """
        Create a new instance of Cartesia TTS.

        See https://docs.cartesia.ai/reference/web-socket/stream-speech/stream-speech
        for more details on the Cartesia API.

        Args:
            model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-2".
            language (str, optional): The language code for synthesis. Defaults to "en".
            encoding (TTSEncoding, optional): The audio encoding format. Defaults to "pcm_s16le".
            voice (str | list[float], optional): The voice ID or embedding array.
            speed (TTSVoiceSpeed | float, optional): Voice Control - Speed
                (https://docs.cartesia.ai/user-guides/voice-control)
            emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion
                (https://docs.cartesia.ai/user-guides/voice-control)
            sample_rate (int, optional): The audio sample rate in Hz. Defaults to 24000.
            word_timestamps (bool, optional): Whether to add word timestamps to the output.
                Defaults to True.
            api_key (str, optional): The Cartesia API key. If not provided, it will be read
                from the CARTESIA_API_KEY environment variable.
            http_session (aiohttp.ClientSession | None, optional): An existing aiohttp
                ClientSession to use. If not provided, a new session will be created.
            tokenizer (tokenize.SentenceTokenizer, optional): The tokenizer to use. Defaults to
                tokenize.basic.SentenceTokenizer(min_sentence_len=BUFFERED_WORDS_COUNT).
            text_pacing (tts.SentenceStreamPacer | bool, optional): Stream pacer for the TTS.
                Set to True to use the default pacer, False to disable.
            base_url (str, optional): The base URL for the Cartesia API.
                Defaults to "https://api.cartesia.ai".
        """  # noqa: E501
        super().__init__(
            capabilities=tts.TTSCapabilities(
                streaming=True,
                aligned_transcript=word_timestamps,
            ),
            sample_rate=sample_rate,
            num_channels=1,
        )
        cartesia_api_key = api_key or os.environ.get("CARTESIA_API_KEY")
        if not cartesia_api_key:
            raise ValueError("CARTESIA_API_KEY must be set")

        if (speed or emotion) and model != "sonic-2-2025-03-07":
            logger.warning(
                "speed and emotion controls are only supported for model 'sonic-2-2025-03-07', "
                "see https://docs.cartesia.ai/developer-tools/changelog for details",
                extra={"model": model, "speed": speed, "emotion": emotion},
            )

        self._opts = _TTSOptions(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
            voice=voice,
            speed=speed,
            emotion=emotion,
            api_key=cartesia_api_key,
            base_url=base_url,
            word_timestamps=word_timestamps,
        )
        self._session = http_session
        self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
            connect_cb=self._connect_ws,
            close_cb=self._close_ws,
            max_session_duration=300,
            mark_refreshed_on_get=True,
        )
        self._streams = weakref.WeakSet[SynthesizeStream]()
        self._sentence_tokenizer = (
            tokenizer if is_given(tokenizer) else tokenize.blingfire.SentenceTokenizer()
        )
        self._stream_pacer: tts.SentenceStreamPacer | None = None
        if text_pacing is True:
            self._stream_pacer = tts.SentenceStreamPacer()
        elif isinstance(text_pacing, tts.SentenceStreamPacer):
            self._stream_pacer = text_pacing

    async def _connect_ws(self, timeout: float) -> aiohttp.ClientWebSocketResponse:
        session = self._ensure_session()
        url = self._opts.get_ws_url(
            f"/tts/websocket?api_key={self._opts.api_key}&cartesia_version={API_VERSION}"
        )
        return await asyncio.wait_for(session.ws_connect(url), timeout)

    async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None:
        await ws.close()

    def _ensure_session(self) -> aiohttp.ClientSession:
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    def prewarm(self) -> None:
        self._pool.prewarm()

    def update_options(
        self,
        *,
        model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
        language: NotGivenOr[str] = NOT_GIVEN,
        voice: NotGivenOr[str | list[float]] = NOT_GIVEN,
        speed: NotGivenOr[TTSVoiceSpeed | float | None] = NOT_GIVEN,
        emotion: NotGivenOr[list[TTSVoiceEmotion | str] | None] = NOT_GIVEN,
    ) -> None:
        """
        Update the Text-to-Speech (TTS) configuration options.

        This method allows updating the TTS settings, including model type, language,
        voice, speed, and emotion. If any parameter is not provided, the existing value
        will be retained.

        Args:
            model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-2".
            language (str, optional): The language code for synthesis. Defaults to "en".
            voice (str | list[float], optional): The voice ID or embedding array.
            speed (TTSVoiceSpeed | float, optional): Voice Control - Speed
                (https://docs.cartesia.ai/user-guides/voice-control)
            emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion
                (https://docs.cartesia.ai/user-guides/voice-control)
        """
        if is_given(model):
            self._opts.model = model
        if is_given(language):
            self._opts.language = language
        if is_given(voice):
            self._opts.voice = cast(Union[str, list[float]], voice)
        if is_given(speed):
            self._opts.speed = cast(Optional[Union[TTSVoiceSpeed, float]], speed)
        if is_given(emotion):
            self._opts.emotion = emotion

        if (speed or emotion) and self._opts.model != "sonic-2-2025-03-07":
            logger.warning(
                "speed and emotion controls are only supported for model 'sonic-2-2025-03-07', "
                "see https://docs.cartesia.ai/developer-tools/changelog for details",
                extra={"model": self._opts.model, "speed": speed, "emotion": emotion},
            )

    def synthesize(
        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> ChunkedStream:
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

    def stream(
        self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> SynthesizeStream:
        return SynthesizeStream(tts=self, conn_options=conn_options)

    async def aclose(self) -> None:
        for stream in list(self._streams):
            await stream.aclose()
        self._streams.clear()
        await self._pool.aclose()
Create a new instance of Cartesia TTS.
See https://docs.cartesia.ai/reference/web-socket/stream-speech/stream-speech for more details on the Cartesia API.
Args
model : TTSModels, optional
- The Cartesia TTS model to use. Defaults to "sonic-2".
language : str, optional
- The language code for synthesis. Defaults to "en".
encoding : TTSEncoding, optional
- The audio encoding format. Defaults to "pcm_s16le".
voice : str | list[float], optional
- The voice ID or embedding array.
speed : TTSVoiceSpeed | float, optional
- Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
emotion : list[TTSVoiceEmotion], optional
- Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
sample_rate : int, optional
- The audio sample rate in Hz. Defaults to 24000.
word_timestamps : bool, optional
- Whether to add word timestamps to the output. Defaults to True.
api_key : str, optional
- The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
http_session : aiohttp.ClientSession | None, optional
- An existing aiohttp ClientSession to use. If not provided, a new session will be created.
tokenizer : tokenize.SentenceTokenizer, optional
- The tokenizer to use. Defaults to tokenize.basic.SentenceTokenizer(min_sentence_len=BUFFERED_WORDS_COUNT).
text_pacing : tts.SentenceStreamPacer | bool, optional
- Stream pacer for the TTS. Set to True to use the default pacer, False to disable.
base_url : str, optional
- The base URL for the Cartesia API. Defaults to "https://api.cartesia.ai".
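As a concrete example, constructing the synthesizer with the documented defaults spelled out (the voice ID shown is the default from the signature above):

from livekit.plugins import cartesia

tts_engine = cartesia.TTS(
    model="sonic-2",
    language="en",
    voice="794f9389-aac1-45b6-b726-9d9369183238",  # default voice ID from the signature
    sample_rate=24000,
    word_timestamps=True,
    text_pacing=True,  # True enables the default SentenceStreamPacer
)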
Ancestors
- livekit.agents.tts.tts.TTS
- abc.ABC
- EventEmitter
- typing.Generic
Methods
async def aclose(self) -> None
async def aclose(self) -> None:
    for stream in list(self._streams):
        await stream.aclose()
    self._streams.clear()
    await self._pool.aclose()
def prewarm(self) -> None
def prewarm(self) -> None:
    self._pool.prewarm()
Pre-warm connection to the TTS service
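Because the WebSocket connection pool is only dialed on first use, calling prewarm() at startup moves the connection handshake off the first synthesis. A minimal sketch:

tts_engine = cartesia.TTS()
tts_engine.prewarm()  # opens a pooled WebSocket before the first stream() call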
def stream(self,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) -> livekit.plugins.cartesia.tts.SynthesizeStream
def stream(
    self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
) -> SynthesizeStream:
    return SynthesizeStream(tts=self, conn_options=conn_options)
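The returned SynthesizeStream accepts incremental text, which makes it a natural sink for LLM token streams. A sketch, assuming the push_text/flush/end_input interface of livekit.agents.tts.SynthesizeStream and a hypothetical play() audio sink:

async def speak_tokens(tts_engine: cartesia.TTS, tokens: list[str]) -> None:
    stream = tts_engine.stream()
    for token in tokens:
        stream.push_text(token)  # text is sentence-tokenized (and paced, if enabled)
    stream.flush()      # mark the current segment as complete
    stream.end_input()  # signal that no more text will be pushed

    async for ev in stream:   # audio arrives as tts.SynthesizedAudio events
        await play(ev.frame)  # play() is a placeholder for your audio sink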
def synthesize(self,
text: str,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) -> livekit.plugins.cartesia.tts.ChunkedStream
def synthesize(
    self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
) -> ChunkedStream:
    return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
def update_options(self,
*,
model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
language: NotGivenOr[str] = NOT_GIVEN,
voice: NotGivenOr[str | list[float]] = NOT_GIVEN,
speed: NotGivenOr[TTSVoiceSpeed | float | None] = NOT_GIVEN,
emotion: NotGivenOr[list[TTSVoiceEmotion | str] | None] = NOT_GIVEN) -> None
def update_options(
    self,
    *,
    model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
    language: NotGivenOr[str] = NOT_GIVEN,
    voice: NotGivenOr[str | list[float]] = NOT_GIVEN,
    speed: NotGivenOr[TTSVoiceSpeed | float | None] = NOT_GIVEN,
    emotion: NotGivenOr[list[TTSVoiceEmotion | str] | None] = NOT_GIVEN,
) -> None:
    """
    Update the Text-to-Speech (TTS) configuration options.

    This method allows updating the TTS settings, including model type, language,
    voice, speed, and emotion. If any parameter is not provided, the existing value
    will be retained.

    Args:
        model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-2".
        language (str, optional): The language code for synthesis. Defaults to "en".
        voice (str | list[float], optional): The voice ID or embedding array.
        speed (TTSVoiceSpeed | float, optional): Voice Control - Speed
            (https://docs.cartesia.ai/user-guides/voice-control)
        emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion
            (https://docs.cartesia.ai/user-guides/voice-control)
    """
    if is_given(model):
        self._opts.model = model
    if is_given(language):
        self._opts.language = language
    if is_given(voice):
        self._opts.voice = cast(Union[str, list[float]], voice)
    if is_given(speed):
        self._opts.speed = cast(Optional[Union[TTSVoiceSpeed, float]], speed)
    if is_given(emotion):
        self._opts.emotion = emotion

    if (speed or emotion) and self._opts.model != "sonic-2-2025-03-07":
        logger.warning(
            "speed and emotion controls are only supported for model 'sonic-2-2025-03-07', "
            "see https://docs.cartesia.ai/developer-tools/changelog for details",
            extra={"model": self._opts.model, "speed": speed, "emotion": emotion},
        )
Update the Text-to-Speech (TTS) configuration options.
This method allows updating the TTS settings, including model type, language, voice, speed, and emotion. If any parameter is not provided, the existing value will be retained.
Args
model : TTSModels, optional
- The Cartesia TTS model to use. Defaults to "sonic-2".
language : str, optional
- The language code for synthesis. Defaults to "en".
voice : str | list[float], optional
- The voice ID or embedding array.
speed : TTSVoiceSpeed | float, optional
- Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
emotion : list[TTSVoiceEmotion], optional
- Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
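For instance, changing the voice at runtime (the voice ID below is hypothetical; per the code above, setting speed or emotion logs a warning unless the model is 'sonic-2-2025-03-07'):

tts_engine.update_options(
    voice="a1b2c3d4-voice-id",  # hypothetical voice ID, for illustration only
    speed="fast",               # warns unless model == "sonic-2-2025-03-07"
)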
Inherited members