Module livekit.plugins.cartesia

Cartesia plugin for LiveKit Agents

See https://docs.livekit.io/agents/integrations/tts/cartesia/ for more information.

Classes

class ChunkedStream (*,
tts: TTS,
input_text: str,
conn_options: APIConnectOptions)
Expand source code
class ChunkedStream(tts.ChunkedStream):
    """Synthesize chunked text using the bytes endpoint.

    A single HTTP POST to ``/tts/bytes`` is issued for the whole input text and
    the response body is forwarded to the audio emitter chunk by chunk.
    """

    def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
        """Create a stream for one synthesis request.

        Args:
            tts: The Cartesia TTS instance that owns the HTTP session and options.
            input_text: The text to synthesize.
            conn_options: Connection options; ``timeout`` bounds the socket connect.
        """
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._tts: TTS = tts
        # Copy of the TTS options taken when the stream is created.
        # NOTE(review): presumably dataclasses.replace (shallow copy) — confirm.
        self._opts = replace(tts._opts)

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        """Send the request and push the returned audio into ``output_emitter``.

        Raises:
            APITimeoutError: If the request times out.
            APIStatusError: If Cartesia answers with an HTTP error status.
            APIConnectionError: For any other failure while performing the request.
        """
        # named `payload` so the stdlib `json` module name is not shadowed
        payload = _to_cartesia_options(self._opts, streaming=False)
        payload["transcript"] = self._input_text

        try:
            async with self._tts._ensure_session().post(
                self._opts.get_http_url("/tts/bytes"),
                headers={
                    API_AUTH_HEADER: self._opts.api_key,
                    # Use the API version configured on the TTS instance, as the
                    # WebSocket path does, instead of the module-level default.
                    API_VERSION_HEADER: self._opts.api_version,
                    "User-Agent": USER_AGENT,
                },
                json=payload,
                # 30 s cap on the whole request; connect phase bounded by conn_options
                timeout=aiohttp.ClientTimeout(total=30, sock_connect=self._conn_options.timeout),
            ) as resp:
                resp.raise_for_status()

                output_emitter.initialize(
                    request_id=utils.shortuuid(),
                    sample_rate=self._opts.sample_rate,
                    num_channels=1,
                    mime_type="audio/pcm",
                )

                async for data, _ in resp.content.iter_chunks():
                    output_emitter.push(data)

                output_emitter.flush()
        except asyncio.TimeoutError:
            raise APITimeoutError() from None
        except aiohttp.ClientResponseError as e:
            raise APIStatusError(
                message=e.message, status_code=e.status, request_id=None, body=None
            ) from None
        except Exception as e:
            raise APIConnectionError() from e

Synthesize chunked text using the bytes endpoint

Ancestors

  • livekit.agents.tts.tts.ChunkedStream
  • abc.ABC
class STT (*,
model: NotGivenOr[STTModels | str] = NOT_GIVEN,
sample_rate: int = 16000,
api_key: str | None = None,
audio_chunk_duration_ms: int = 160,
http_session: aiohttp.ClientSession | None = None,
base_url: str = 'https://api.cartesia.ai',
language: STTLanguages | str | None = None,
encoding: STTEncoding = 'pcm_s16le')
Expand source code
class STT(stt.STT):
    """Cartesia speech to text.

    Model ``ink-2`` supports:
        - Streaming
        - Turn detection
        - Interim results

    Model ``ink-whisper`` supports:
        - Streaming
        - Word aligned transcripts

    See also:
        https://docs.cartesia.ai/build-with-cartesia/stt-models/latest

    Examples:

        ```# Turn detecting
        from livekit.agents import AgentSession
        from livekit.plugins import cartesia

        session = AgentSession(
            stt=cartesia.STT(),
            llm=LLM(),  # choose your favorite LLM
            tts=cartesia.TTS(),
            turn_handling={
                "turn_detection": "stt",
            },
        )
        ```
    """

    def __init__(
        self,
        *,
        model: NotGivenOr[STTModels | str] = NOT_GIVEN,
        sample_rate: int = 16000,
        api_key: str | None = None,
        audio_chunk_duration_ms: int = 160,
        http_session: aiohttp.ClientSession | None = None,
        base_url: str = "https://api.cartesia.ai",
        language: STTLanguages | str | None = None,
        encoding: STTEncoding = AUDIO_ENCODING,
    ) -> None:
        """
        Create a new instance of Cartesia STT.

        Model ``ink-2`` supports:
            - Streaming
            - Turn detection
            - Interim results

        Model ``ink-whisper`` supports:
            - Streaming
            - Word aligned transcripts

        See also:
            https://docs.cartesia.ai/build-with-cartesia/stt-models/latest

        Args:
            model: The Cartesia STT model to use.
                Defaults to ``ink-2`` if language is ``en``.
                Defaults to ``ink-whisper`` for other languages.
            sample_rate: The sample rate of the audio in Hz. Defaults to 16 kHz.
            api_key: The Cartesia API key. If not provided, it will be read from
                the ``CARTESIA_API_KEY`` environment variable.
            audio_chunk_duration_ms: Duration in milliseconds of each audio chunk
                sent to the Cartesia STT websocket. Defaults to 160 ms.
            http_session: Optional aiohttp ClientSession to use for requests.
            base_url: The base URL for the Cartesia API.
                Defaults to ``https://api.cartesia.ai``.
            language: The language code for recognition.
                This plugin only supports ``en`` for ``ink-2``.
            encoding: The audio encoding format. Must be ``pcm_s16le``.

        Raises:
            ValueError: If no API key is provided or found in environment variables.

        Examples:

            ```# Turn detecting
            from livekit.agents import AgentSession
            from livekit.plugins import cartesia

            session = AgentSession(
                stt=cartesia.STT(),
                llm=LLM(),  # choose your favorite LLM
                tts=cartesia.TTS(),
                turn_handling={
                    "turn_detection": "stt",
                },
            )
            ```
        """
        resolved_api_key = api_key or os.environ.get("CARTESIA_API_KEY")
        if not resolved_api_key:
            raise ValueError(
                "Cartesia API key is required, either as argument or set"
                " CARTESIA_API_KEY environment variable"
            )

        language_code = None if language is None else LanguageCode(language)

        # TODO: default all languages to ink-2 once they are supported
        if utils.is_given(model):
            resolved_model = model
        elif language_code is None or language_code.language == "en":
            resolved_model = "ink-2"
        else:
            resolved_model = "ink-whisper"

        is_whisper = _is_whisper_model(resolved_model)

        resolved_final_transcript_mode: _ResolvedFinalTranscriptMode
        if is_whisper:
            resolved_final_transcript_mode = "legacy"
        else:
            resolved_final_transcript_mode = "auto"

        super().__init__(
            capabilities=stt.STTCapabilities(
                streaming=True,
                interim_results=resolved_final_transcript_mode != "legacy",
                aligned_transcript="word" if is_whisper else False,
                offline_recognize=False,
                diarization=False,
            )
        )

        self._api_key = resolved_api_key
        self._audio_chunk_duration_ms = audio_chunk_duration_ms
        self._final_transcript_mode: _ResolvedFinalTranscriptMode = resolved_final_transcript_mode
        self._encoding: STTEncoding = encoding
        self._language = language_code
        self._model = resolved_model
        self._sample_rate = sample_rate
        self._session = http_session
        self._ws_base_url = _base_url_to_ws_base_url(base_url=base_url)

        self._streams = weakref.WeakSet[CartesiaRecognizeStream]()

        self._warn_on_unexpected_args(language=self._language)

    @property
    def model(self) -> str:
        return self._model

    @property
    def provider(self) -> str:
        return "Cartesia"

    async def _recognize_impl(
        self,
        buffer: utils.AudioBuffer,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions,
    ) -> stt.SpeechEvent:
        raise NotImplementedError(
            "Cartesia STT does not support batch recognition, use stream() instead"
        )

    def stream(
        self,
        *,
        language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> CartesiaRecognizeStream:
        if utils.is_given(language):
            resolved_language = LanguageCode(language)
        elif self._language is not None:
            resolved_language = LanguageCode(self._language)
        else:
            resolved_language = None

        self._warn_on_unexpected_args(language=resolved_language)

        if self._session is None:
            session = utils.http_context.http_session()
            self._session = session
        else:
            session = self._session

        stream: CartesiaRecognizeStream
        match self._final_transcript_mode:
            case "auto":
                stream = AutoFinalizeRecognizeStream(
                    stt=self,
                    conn_options=conn_options,
                    sample_rate=self._sample_rate,
                    encoding=self._encoding,
                    audio_chunk_duration_ms=self._audio_chunk_duration_ms,
                    model=self._model,
                    api_key=self._api_key,
                    ws_base_url=self._ws_base_url,
                    session=session,
                    language=resolved_language or LanguageCode("en"),
                )
            case "legacy":
                stream = LegacyRecognizeStream(
                    stt=self,
                    conn_options=conn_options,
                    sample_rate=self._sample_rate,
                    encoding=self._encoding,
                    audio_chunk_duration_ms=self._audio_chunk_duration_ms,
                    model=self._model,
                    api_key=self._api_key,
                    ws_base_url=self._ws_base_url,
                    session=session,
                    language=resolved_language,
                )
            case _:
                _exhaustive_check: Never = self._final_transcript_mode
                raise RuntimeError(
                    f"Cartesia STT has unexpected final_transcript_mode={_exhaustive_check}"
                )

        self._streams.add(stream)
        return stream

    async def aclose(self) -> None:
        """Close every stream created by :meth:`stream`.

        The HTTP session is left open: it is either supplied by the caller or
        owned by the shared HTTP context, so it is not ours to close.
        """
        streams = list(self._streams)
        self._streams.clear()
        # return_exceptions=True so one stream failing to close doesn't abandon the rest
        await asyncio.gather(*(stream.aclose() for stream in streams), return_exceptions=True)

    def update_options(
        self,
        *,
        language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
        model: NotGivenOr[STTModels | str] = NOT_GIVEN,
    ) -> None:
        """Change Cartesia STT options.

        Also propagates changes to all :class:`SpeechStream` created by :meth:`stream`.

        Args:
            language: Used to change the language to match what the user is speaking.
                Ink 2 does not have multi-lingual support yet and only works with English.
            model: Deprecated. This is a no-op. Construct a new STT instance to change the model.
        """
        if utils.is_given(model) and model != self._model:
            logger.warning(
                "Cartesia STT update_options() ignores the model kwarg. Construct a new STT instance to change the model."
            )

        if utils.is_given(language):
            self._language = LanguageCode(language)
            self._warn_on_unexpected_args(language=self._language)

        for stream in self._streams:
            # do not update model since this is likely user error
            stream.update_options(language=language)

    def _warn_on_unexpected_args(self, language: LanguageCode | None) -> None:
        """Logs a warning when arguments are unexpected.

        This is not necessarily an error since the API may support languages in the future.
        """
        if self._final_transcript_mode == "auto" and language and language.language != "en":
            logger.warning(
                f'Cartesia STT model="{self._model}" currently only supports English. You provided {language}, which may produce unexpected results.'
            )

Cartesia speech to text.

Model ink-2 supports:

  • Streaming
  • Turn detection
  • Interim results

Model ink-whisper supports:

  • Streaming
  • Word aligned transcripts

See also: https://docs.cartesia.ai/build-with-cartesia/stt-models/latest

Examples

```python
# Turn detecting
from livekit.agents import AgentSession
from livekit.plugins import cartesia

session = AgentSession(
    stt=cartesia.STT(),
    llm=LLM(),  # choose your favorite LLM
    tts=cartesia.TTS(),
    turn_handling={
        "turn_detection": "stt",
    },
)
```


Create a new instance of Cartesia STT.

Model ``ink-2`` supports:
    - Streaming
    - Turn detection
    - Interim results

Model ``ink-whisper`` supports:
    - Streaming
    - Word aligned transcripts

See also:
    <https://docs.cartesia.ai/build-with-cartesia/stt-models/latest>


Args
-----
**```model```**
:   The Cartesia STT model to use.
    Defaults to ``ink-2`` if language is <code>en</code>.
    Defaults to ``ink-whisper`` for other languages.


**```sample_rate```**
:   The sample rate of the audio in Hz. Defaults to 16 kHz.


**```api_key```**
:   The Cartesia API key. If not provided, it will be read from
    the <code>CARTESIA\_API\_KEY</code> environment variable.


**```audio_chunk_duration_ms```**
:   Duration in milliseconds of each audio chunk
    sent to the Cartesia STT websocket. Defaults to 160 ms.


**```http_session```**
:   Optional aiohttp ClientSession to use for requests.


**```base_url```**
:   The base URL for the Cartesia API.
    Defaults to ``https://api.cartesia.ai``.


**```language```**
:   The language code for recognition.
    This plugin only supports <code>en</code> for ``ink-2``.


**```encoding```**
:   The audio encoding format. Must be <code>pcm\_s16le</code>.



Raises
-----
<code>ValueError</code>
:   If no API key is provided or found in environment variables.



Examples
-----

```python
# Turn detecting
from livekit.agents import AgentSession
from livekit.plugins import cartesia

session = AgentSession(
    stt=cartesia.STT(),
    llm=LLM(),  # choose your favorite LLM
    tts=cartesia.TTS(),
    turn_handling={
        "turn_detection": "stt",
    },
)
```

Ancestors

  • livekit.agents.stt.stt.STT
  • abc.ABC
  • EventEmitter
  • typing.Generic

Instance variables

prop model : str
Expand source code
@property
def model(self) -> str:
    """Return the model name stored on this STT instance (``self._model``)."""
    return self._model

Get the model name/identifier for this STT instance.

Returns

The model name if available, "unknown" otherwise.

Note

Plugins should override this property to provide their model information.

prop provider : str
Expand source code
@property
def provider(self) -> str:
    """Return the provider name, always ``"Cartesia"``."""
    return "Cartesia"

Get the provider name/identifier for this STT instance.

Returns

The provider name if available, "unknown" otherwise.

Note

Plugins should override this property to provide their provider information.

Methods

async def aclose(self) ‑> None
Expand source code
async def aclose(self) -> None:
    """Shut down all streams handed out by :meth:`stream`.

    The HTTP session stays open on purpose: it belongs either to the caller
    or to the shared HTTP context, never to this object.
    """
    # Snapshot first, then forget the streams, so the set is already empty
    # while the individual closes are in flight.
    open_streams = [*self._streams]
    self._streams.clear()

    closers = [open_stream.aclose() for open_stream in open_streams]
    # Collect failures instead of raising, so every stream gets closed.
    await asyncio.gather(*closers, return_exceptions=True)

Close every stream created by `stream()`.

The HTTP session is left open: it is either supplied by the caller or owned by the shared HTTP context, so it is not ours to close.

def stream(self,
*,
language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> CartesiaRecognizeStream
Expand source code
def stream(
    self,
    *,
    language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> CartesiaRecognizeStream:
    """Create a streaming recognition session.

    Args:
        language: Language for this stream; the instance language is used
            when it is not given.
        conn_options: Connection options passed to the stream.

    Returns:
        An ``AutoFinalizeRecognizeStream`` in "auto" transcript mode, or a
        ``LegacyRecognizeStream`` in "legacy" mode.

    Raises:
        RuntimeError: If the instance holds an unknown transcript mode.
    """
    # A per-call language wins over the one stored on the instance.
    resolved_language = None
    if utils.is_given(language):
        resolved_language = LanguageCode(language)
    elif self._language is not None:
        resolved_language = LanguageCode(self._language)

    self._warn_on_unexpected_args(language=resolved_language)

    # Resolve the HTTP session on first use and keep it for later streams.
    if self._session is None:
        self._session = utils.http_context.http_session()
    session = self._session

    # Choose the stream class and the language it receives, then build once.
    mode = self._final_transcript_mode
    if mode == "auto":
        stream_cls = AutoFinalizeRecognizeStream
        # "auto" mode always gets a language; English when none is set
        stream_language = resolved_language or LanguageCode("en")
    elif mode == "legacy":
        stream_cls = LegacyRecognizeStream
        stream_language = resolved_language
    else:
        _exhaustive_check: Never = mode
        raise RuntimeError(
            f"Cartesia STT has unexpected final_transcript_mode={_exhaustive_check}"
        )

    recognize_stream: CartesiaRecognizeStream = stream_cls(
        stt=self,
        conn_options=conn_options,
        sample_rate=self._sample_rate,
        encoding=self._encoding,
        audio_chunk_duration_ms=self._audio_chunk_duration_ms,
        model=self._model,
        api_key=self._api_key,
        ws_base_url=self._ws_base_url,
        session=session,
        language=stream_language,
    )

    self._streams.add(recognize_stream)
    return recognize_stream
def update_options(self,
*,
language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
model: NotGivenOr[STTModels | str] = NOT_GIVEN) ‑> None
Expand source code
def update_options(
    self,
    *,
    language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
    model: NotGivenOr[STTModels | str] = NOT_GIVEN,
) -> None:
    """Change Cartesia STT options.

    A language change is also propagated to every live stream created by
    :meth:`stream`. When no language is given, nothing is changed.

    Args:
        language: Used to change the language to match what the user is speaking.
            Ink 2 does not have multi-lingual support yet and only works with English.
        model: Deprecated. This is a no-op. Construct a new STT instance to change the model.
    """
    if utils.is_given(model) and model != self._model:
        logger.warning(
            "Cartesia STT update_options() ignores the model kwarg. Construct a new STT instance to change the model."
        )

    if not utils.is_given(language):
        # Nothing to apply: do not hand the NOT_GIVEN sentinel to live streams.
        return

    self._language = LanguageCode(language)
    self._warn_on_unexpected_args(language=self._language)

    for stream in self._streams:
        # do not update model since this is likely user error
        stream.update_options(language=language)

Change Cartesia STT options.

Also propagates changes to every stream created by `stream()`.

Args

language
Used to change the language to match what the user is speaking. Ink 2 does not have multi-lingual support yet and only works with English.
model
Deprecated. This is a no-op. Construct a new STT instance to change the model.

Inherited members

class TTS (*,
api_key: str | None = None,
model: TTSModels | str = 'sonic-3',
language: str | None = 'en',
encoding: TTSEncoding = 'pcm_s16le',
voice: str | list[float] = 'f786b574-daa5-4673-aa0c-cbe3e8534c02',
speed: TTSVoiceSpeed | float | None = None,
emotion: TTSVoiceEmotion | str | list[TTSVoiceEmotion | str] | None = None,
volume: float | None = None,
sample_rate: int = 24000,
word_timestamps: bool = True,
pronunciation_dict_id: str | None = None,
http_session: aiohttp.ClientSession | None = None,
tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
text_pacing: tts.SentenceStreamPacer | bool = False,
base_url: str = 'https://api.cartesia.ai',
api_version: str = '2025-04-16')
Expand source code
class TTS(tts.TTS):
    """Cartesia text to speech.

    Supports one-shot synthesis over HTTP (:meth:`synthesize`) and streaming
    synthesis over pooled WebSocket connections (:meth:`stream`).
    """

    def __init__(
        self,
        *,
        api_key: str | None = None,
        model: TTSModels | str = "sonic-3",
        language: str | None = "en",
        encoding: TTSEncoding = "pcm_s16le",
        voice: str | list[float] = TTSDefaultVoiceId,
        speed: TTSVoiceSpeed | float | None = None,
        emotion: TTSVoiceEmotion | str | list[TTSVoiceEmotion | str] | None = None,
        volume: float | None = None,
        sample_rate: int = 24000,
        word_timestamps: bool = True,
        pronunciation_dict_id: str | None = None,
        http_session: aiohttp.ClientSession | None = None,
        tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
        text_pacing: tts.SentenceStreamPacer | bool = False,
        base_url: str = "https://api.cartesia.ai",
        api_version: str = API_VERSION,
    ) -> None:
        """
        Create a new instance of Cartesia TTS.

        See https://docs.cartesia.ai/reference/web-socket/stream-speech/stream-speech for more details on the Cartesia API.

        Args:
            model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-3".
            language (str, optional): The language code for synthesis. Defaults to "en".
            encoding (TTSEncoding, optional): The audio encoding format. Defaults to "pcm_s16le".
            voice (str | list[float], optional): The voice ID or embedding array.
            speed (TTSVoiceSpeed | float, optional): Speed of speech, with sonic-3, the value is valid between 0.6 and 2.0 (https://docs.cartesia.ai/api-reference/tts/bytes#body-generation-config-speed)
            emotion (list[TTSVoiceEmotion], optional): Emotion of the speech (https://docs.cartesia.ai/api-reference/tts/bytes#body-generation-config-emotion)
            volume (float, optional): Volume of the speech, with sonic-3, the value is valid between 0.5 and 2.0
            sample_rate (int, optional): The audio sample rate in Hz. Defaults to 24000.
            word_timestamps (bool, optional): Whether to add word timestamps to the output. Defaults to True.
            pronunciation_dict_id (str, optional): The pronunciation dictionary ID to use for custom pronunciations. Defaults to None.
            api_key (str, optional): The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
            http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, the session from `utils.http_context.http_session()` is used on first need.
            tokenizer (tokenize.SentenceTokenizer, optional): The tokenizer to use. Defaults to `livekit.agents.tokenize.blingfire.SentenceTokenizer`.
            text_pacing (tts.SentenceStreamPacer | bool, optional): Stream pacer for the TTS. Set to True to use the default pacer, False to disable.
            base_url (str, optional): The base URL for the Cartesia API. Defaults to "https://api.cartesia.ai".
            api_version (str, optional): The Cartesia API version to use. Defaults to the module's `API_VERSION`.

        Raises:
            ValueError: If no API key is provided or found in the environment, or if
                `speed` is set to a non-float value for a sonic-3 model.
        """  # noqa: E501

        super().__init__(
            capabilities=tts.TTSCapabilities(
                streaming=True,
                aligned_transcript=word_timestamps,
            ),
            sample_rate=sample_rate,
            num_channels=1,
        )
        cartesia_api_key = api_key or os.environ.get("CARTESIA_API_KEY")
        if not cartesia_api_key:
            raise ValueError(
                "Cartesia API key is required, either as argument or set"
                " CARTESIA_API_KEY environment variable"
            )

        # A single emotion string is normalized to a one-element list.
        if isinstance(emotion, str):
            emotion = [emotion]

        self._opts = _TTSOptions(
            model=model,
            language=LanguageCode(language) if language else None,
            encoding=encoding,
            sample_rate=sample_rate,
            voice=voice,
            speed=speed,
            emotion=emotion,
            volume=volume,
            api_key=cartesia_api_key,
            base_url=base_url,
            word_timestamps=word_timestamps,
            api_version=api_version,
            pronunciation_dict_id=pronunciation_dict_id,
        )

        # Only validate when at least one generation control was actually set.
        if speed or emotion or volume or pronunciation_dict_id:
            self._check_generation_config()

        self._session = http_session
        self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
            connect_cb=self._connect_ws,
            close_cb=self._close_ws,
            max_session_duration=300,
            mark_refreshed_on_get=True,
        )
        # Weak references: a stream dropped by its user does not stay alive here.
        self._streams = weakref.WeakSet[SynthesizeStream]()
        self._sentence_tokenizer = (
            tokenizer if is_given(tokenizer) else tokenize.blingfire.SentenceTokenizer()
        )
        self._stream_pacer: tts.SentenceStreamPacer | None = None
        if text_pacing is True:
            self._stream_pacer = tts.SentenceStreamPacer()
        elif isinstance(text_pacing, tts.SentenceStreamPacer):
            self._stream_pacer = text_pacing

        if word_timestamps:
            if "preview" not in self._opts.model and (
                self._opts.language is not None
                and self._opts.language.language
                not in {
                    "en",
                    "de",
                    "es",
                    "fr",
                }
            ):
                # https://docs.cartesia.ai/api-reference/tts/compare-tts-endpoints
                logger.warning(
                    "word_timestamps is only supported for languages en, de, es, and fr with `sonic` models"
                    " or all languages with `preview` models"
                )

    @property
    def model(self) -> str:
        """The currently configured model name."""
        return self._opts.model

    @property
    def provider(self) -> str:
        """The provider name, always ``"Cartesia"``."""
        return "Cartesia"

    async def _connect_ws(self, timeout: float) -> aiohttp.ClientWebSocketResponse:
        """Open a WebSocket to the Cartesia TTS endpoint (connection pool callback).

        Args:
            timeout: Maximum number of seconds to wait for the connection.
        """
        session = self._ensure_session()
        url = self._opts.get_ws_url(f"/tts/websocket?cartesia_version={self._opts.api_version}")
        ws = await asyncio.wait_for(
            session.ws_connect(
                url,
                headers={
                    "User-Agent": USER_AGENT,
                    API_AUTH_HEADER: self._opts.api_key,
                },
            ),
            timeout,
        )
        # The request id from the handshake response is logged for debugging.
        c_request_id = ws._response.headers.get(REQUEST_ID_HEADER)
        logger.debug(
            "Established new Cartesia TTS WebSocket connection",
            extra={"cartesia_request_id": c_request_id},
        )
        return ws

    async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None:
        """Close a pooled WebSocket (connection pool callback)."""
        await ws.close()

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the HTTP session, fetching it from the HTTP context on first use."""
        if not self._session:
            self._session = utils.http_context.http_session()

        return self._session

    def prewarm(self) -> None:
        """Ask the connection pool to prewarm."""
        self._pool.prewarm()

    def update_options(
        self,
        *,
        model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
        language: NotGivenOr[str | None] = NOT_GIVEN,
        voice: NotGivenOr[str | list[float]] = NOT_GIVEN,
        speed: NotGivenOr[TTSVoiceSpeed | float] = NOT_GIVEN,
        emotion: NotGivenOr[TTSVoiceEmotion | str | list[TTSVoiceEmotion | str]] = NOT_GIVEN,
        volume: NotGivenOr[float] = NOT_GIVEN,
        pronunciation_dict_id: NotGivenOr[str] = NOT_GIVEN,
        api_version: NotGivenOr[str] = NOT_GIVEN,
    ) -> None:
        """
        Update the Text-to-Speech (TTS) configuration options.

        Only the parameters that are given are changed; every other option keeps
        its current value.

        Args:
            model (TTSModels, optional): The Cartesia TTS model to use.
            language (str | None, optional): The language code for synthesis. An empty value clears the language.
            voice (str | list[float], optional): The voice ID or embedding array.
            speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
            emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control). A single string is stored as a one-element list.
            volume (float, optional): Volume of the speech.
            pronunciation_dict_id (str, optional): The pronunciation dictionary ID to use for custom pronunciations.
            api_version (str, optional): The Cartesia API version to use.

        Raises:
            ValueError: If a generation control is given and the stored speed is a
                non-float value while the model is a sonic-3 model.
        """
        if is_given(model):
            self._opts.model = model
        if is_given(language):
            self._opts.language = LanguageCode(language) if language else None
        if is_given(voice):
            self._opts.voice = voice
        if is_given(speed):
            self._opts.speed = cast(TTSVoiceSpeed | float, speed)
        if is_given(emotion):
            emotion = [emotion] if isinstance(emotion, str) else emotion
            self._opts.emotion = emotion
        if is_given(volume):
            self._opts.volume = volume
        if is_given(pronunciation_dict_id):
            self._opts.pronunciation_dict_id = pronunciation_dict_id
        if is_given(api_version):
            self._opts.api_version = api_version

        # Re-validate only when a generation control was part of this update.
        if speed or emotion or volume or pronunciation_dict_id:
            self._check_generation_config()

    def synthesize(
        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> ChunkedStream:
        """Create a :class:`ChunkedStream` that synthesizes ``text``."""
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

    def stream(
        self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> SynthesizeStream:
        """Create a :class:`SynthesizeStream` and track it so :meth:`aclose` can close it."""
        stream = SynthesizeStream(tts=self, conn_options=conn_options)
        self._streams.add(stream)
        return stream

    async def aclose(self) -> None:
        """Close every stream created by :meth:`stream`, then the connection pool.

        Every stream is closed even if one of them fails, and the connection
        pool is always closed. The first stream error, if any, is re-raised
        once cleanup is done.
        """
        streams = list(self._streams)
        self._streams.clear()
        try:
            # return_exceptions=True so one stream failing to close doesn't abandon the rest
            results = await asyncio.gather(
                *(stream.aclose() for stream in streams), return_exceptions=True
            )
        finally:
            # Runs on cancellation too, so pooled WebSockets are never leaked.
            await self._pool.aclose()

        for result in results:
            if isinstance(result, Exception):
                raise result

    def _check_generation_config(self) -> None:
        """Check speed, volume, emotion and pronunciation settings against the model.

        Out-of-range or unsupported settings are only logged as warnings.

        Raises:
            ValueError: If the model is sonic-3 and a speed is set that is not a float.
        """
        if _is_sonic_3(self._opts.model):
            if self._opts.speed:
                if not isinstance(self._opts.speed, float):
                    raise ValueError("speed must be a float for sonic-3")
                if not 0.6 <= self._opts.speed <= 2.0:
                    logger.warning("speed must be between 0.6 and 2.0 for sonic-3")
            if self._opts.volume is not None and not 0.5 <= self._opts.volume <= 2.0:
                logger.warning("volume must be between 0.5 and 2.0 for sonic-3")
        elif (
            self._opts.api_version != API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS
            or self._opts.model != MODEL_ID_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS
        ):
            logger.warning(
                f"speed and emotion controls are only supported for model '{MODEL_ID_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS}', and API version '{API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS}', "
                "see https://docs.cartesia.ai/developer-tools/changelog for details",
                extra={
                    "model": self._opts.model,
                    "speed": self._opts.speed,
                    "emotion": self._opts.emotion,
                },
            )

        if self._opts.pronunciation_dict_id and not _is_sonic_3(self._opts.model):
            logger.warning(
                "pronunciation_dict_id is only supported for sonic-3 models",
                extra={
                    "model": self._opts.model,
                    "pronunciation_dict_id": self._opts.pronunciation_dict_id,
                },
            )

Cartesia text to speech.

Create a new instance of Cartesia TTS.

See https://docs.cartesia.ai/reference/web-socket/stream-speech/stream-speech for more details on the Cartesia API.

Args

model : TTSModels, optional
The Cartesia TTS model to use. Defaults to "sonic-3".
language : str, optional
The language code for synthesis. Defaults to "en".
encoding : TTSEncoding, optional
The audio encoding format. Defaults to "pcm_s16le".
voice : str | list[float], optional
The voice ID or embedding array.
speed : TTSVoiceSpeed | float, optional
Speed of speech, with sonic-3, the value is valid between 0.6 and 2.0 (https://docs.cartesia.ai/api-reference/tts/bytes#body-generation-config-speed)
emotion : list[TTSVoiceEmotion], optional
Emotion of the speech (https://docs.cartesia.ai/api-reference/tts/bytes#body-generation-config-emotion)
volume : float, optional
Volume of the speech, with sonic-3, the value is valid between 0.5 and 2.0
sample_rate : int, optional
The audio sample rate in Hz. Defaults to 24000.
word_timestamps : bool, optional
Whether to add word timestamps to the output. Defaults to True.
pronunciation_dict_id : str, optional
The pronunciation dictionary ID to use for custom pronunciations. Defaults to None.
api_key : str, optional
The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
http_session : aiohttp.ClientSession | None, optional
An existing aiohttp ClientSession to use. If not provided, a new session will be created.
tokenizer : tokenize.SentenceTokenizer, optional
The tokenizer to use. Defaults to SentenceTokenizer.
text_pacing : tts.SentenceStreamPacer | bool, optional
Stream pacer for the TTS. Set to True to use the default pacer, False to disable.
base_url : str, optional
The base URL for the Cartesia API. Defaults to "https://api.cartesia.ai".

Ancestors

  • livekit.agents.tts.tts.TTS
  • abc.ABC
  • EventEmitter
  • typing.Generic

Instance variables

prop model : str
Expand source code
@property
def model(self) -> str:
    """Return the model identifier stored in this instance's options."""
    opts = self._opts
    return opts.model

Get the model name/identifier for this TTS instance.

Returns

The model name if available, "unknown" otherwise.

Note

Plugins should override this property to provide their model information.

prop provider : str
Expand source code
@property
def provider(self) -> str:
    """Return the fixed provider name reported by this TTS implementation."""
    provider_name = "Cartesia"
    return provider_name

Get the provider name/identifier for this TTS instance.

Returns

The provider name if available, "unknown" otherwise.

Note

Plugins should override this property to provide their provider information.

Methods

async def aclose(self) ‑> None
Expand source code
async def aclose(self) -> None:
    """Close every tracked stream, forget them, then close the connection pool."""
    # Walk a snapshot so the loop is unaffected if the underlying collection
    # changes while a stream's aclose() is being awaited.
    tracked = tuple(self._streams)
    for open_stream in tracked:
        await open_stream.aclose()

    self._streams.clear()

    pool = self._pool
    await pool.aclose()
def prewarm(self) ‑> None
Expand source code
def prewarm(self) -> None:
    """Ask the connection pool to pre-warm itself."""
    pool = self._pool
    pool.prewarm()

Pre-warm connection to the TTS service

def stream(self,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.cartesia.tts.SynthesizeStream
Expand source code
def stream(
    self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
) -> SynthesizeStream:
    """Create a SynthesizeStream bound to this TTS and track it.

    Args:
        conn_options: Connection options forwarded to the new stream.

    Returns:
        The newly created stream, which has also been added to ``self._streams``.
    """
    new_stream = SynthesizeStream(tts=self, conn_options=conn_options)
    # Keep a reference in the tracked set so aclose() can close it later.
    self._streams.add(new_stream)
    return new_stream
def synthesize(self,
text: str,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.cartesia.tts.ChunkedStream
Expand source code
def synthesize(
    self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
) -> ChunkedStream:
    """Create a ChunkedStream that synthesizes ``text`` with this TTS.

    Args:
        text: The text passed to the stream as its input text.
        conn_options: Connection options forwarded to the stream.

    Returns:
        A new ChunkedStream for the given text.
    """
    chunked = ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
    return chunked
def update_options(self,
*,
model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
language: NotGivenOr[str | None] = NOT_GIVEN,
voice: NotGivenOr[str | list[float]] = NOT_GIVEN,
speed: NotGivenOr[TTSVoiceSpeed | float] = NOT_GIVEN,
emotion: NotGivenOr[TTSVoiceEmotion | str | list[TTSVoiceEmotion | str]] = NOT_GIVEN,
volume: NotGivenOr[float] = NOT_GIVEN,
pronunciation_dict_id: NotGivenOr[str] = NOT_GIVEN,
api_version: NotGivenOr[str] = NOT_GIVEN) ‑> None
Expand source code
def update_options(
    self,
    *,
    model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
    language: NotGivenOr[str | None] = NOT_GIVEN,
    voice: NotGivenOr[str | list[float]] = NOT_GIVEN,
    speed: NotGivenOr[TTSVoiceSpeed | float] = NOT_GIVEN,
    emotion: NotGivenOr[TTSVoiceEmotion | str | list[TTSVoiceEmotion | str]] = NOT_GIVEN,
    volume: NotGivenOr[float] = NOT_GIVEN,
    pronunciation_dict_id: NotGivenOr[str] = NOT_GIVEN,
    api_version: NotGivenOr[str] = NOT_GIVEN,
) -> None:
    """
    Update the Text-to-Speech (TTS) configuration options.

    Only parameters that are explicitly passed are applied; any parameter left as
    NOT_GIVEN keeps its existing value.

    Args:
        model (TTSModels | str, optional): The Cartesia TTS model to use.
        language (str | None, optional): The language code for synthesis. A falsy value
            (None or an empty string) clears the configured language.
        voice (str | list[float], optional): The voice ID or embedding array.
        speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
        emotion (TTSVoiceEmotion | str | list, optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control).
            A single string is wrapped into a one-element list.
        volume (float, optional): Volume of the speech.
        pronunciation_dict_id (str, optional): The pronunciation dictionary ID to use for custom pronunciations.
        api_version (str, optional): The API version value to store on the options.
    """
    if is_given(model):
        self._opts.model = model
    if is_given(language):
        self._opts.language = LanguageCode(language) if language else None
    if is_given(voice):
        self._opts.voice = voice
    if is_given(speed):
        self._opts.speed = cast(TTSVoiceSpeed | float, speed)
    if is_given(emotion):
        # Normalize a single emotion string to the list form stored on the options.
        self._opts.emotion = [emotion] if isinstance(emotion, str) else emotion
    if is_given(volume):
        self._opts.volume = volume
    if is_given(pronunciation_dict_id):
        self._opts.pronunciation_dict_id = pronunciation_dict_id
    if is_given(api_version):
        self._opts.api_version = api_version

    # Re-validate whenever a generation-config option was explicitly passed.
    # is_given() is used instead of truthiness so that falsy-but-given values
    # (e.g. speed=0.0, volume=0.0, emotion=[]) are still validated rather than
    # being stored unchecked.
    generation_options = (speed, emotion, volume, pronunciation_dict_id)
    if any(is_given(option) for option in generation_options):
        self._check_generation_config()

Update the Text-to-Speech (TTS) configuration options.

This method allows updating the TTS settings, including model, language, voice, speed, emotion, volume, pronunciation dictionary, and API version. Any parameter that is not provided keeps its existing value.

Args

model : TTSModels, optional
The Cartesia TTS model to use. Defaults to "sonic-3".
language : str, optional
The language code for synthesis. Defaults to "en".
voice : str | list[float], optional
The voice ID or embedding array.
speed : TTSVoiceSpeed | float, optional
Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
emotion : list[TTSVoiceEmotion], optional
Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
pronunciation_dict_id : str, optional
The pronunciation dictionary ID to use for custom pronunciations.

Inherited members