Module livekit.plugins.cartesia

Cartesia plugin for LiveKit Agents

See https://docs.livekit.io/agents/integrations/tts/cartesia/ for more information.

Classes

class ChunkedStream (*,
tts: TTS,
input_text: str,
conn_options: APIConnectOptions)
class ChunkedStream(tts.ChunkedStream):
    """Synthesize chunked text using the bytes endpoint"""

    def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._tts: TTS = tts
        self._opts = replace(tts._opts)

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        json = _to_cartesia_options(self._opts, streaming=False)
        json["transcript"] = self._input_text

        try:
            async with self._tts._ensure_session().post(
                self._opts.get_http_url("/tts/bytes"),
                headers={
                    API_AUTH_HEADER: self._opts.api_key,
                    API_VERSION_HEADER: API_VERSION,
                },
                json=json,
                timeout=aiohttp.ClientTimeout(total=30, sock_connect=self._conn_options.timeout),
            ) as resp:
                resp.raise_for_status()

                output_emitter.initialize(
                    request_id=utils.shortuuid(),
                    sample_rate=self._opts.sample_rate,
                    num_channels=1,
                    mime_type="audio/pcm",
                )

                async for data, _ in resp.content.iter_chunks():
                    output_emitter.push(data)

                output_emitter.flush()
        except asyncio.TimeoutError:
            raise APITimeoutError() from None
        except aiohttp.ClientResponseError as e:
            raise APIStatusError(
                message=e.message, status_code=e.status, request_id=None, body=None
            ) from None
        except Exception as e:
            raise APIConnectionError() from e

Synthesize chunked text using the bytes endpoint

Ancestors

  • livekit.agents.tts.tts.ChunkedStream
  • abc.ABC
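
ChunkedStream instances are created by TTS.synthesize() rather than constructed directly. A minimal consumption sketch (assuming CARTESIA_API_KEY is set and a TTS instance configured as shown later on this page):

    from livekit.plugins import cartesia

    tts = cartesia.TTS()

    async def synthesize_once() -> None:
        stream = tts.synthesize("Hello from Cartesia!")
        async for audio in stream:  # SynthesizedAudio events
            ...  # consume audio.frame (raw PCM at the configured sample rate)
        await stream.aclose()
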
class STT (*,
model: STTModels | str = 'ink-whisper',
language: STTLanguages | str = 'en',
encoding: STTEncoding = 'pcm_s16le',
sample_rate: int = 16000,
api_key: str | None = None,
http_session: aiohttp.ClientSession | None = None,
base_url: str = 'https://api.cartesia.ai')
class STT(stt.STT):
    def __init__(
        self,
        *,
        model: STTModels | str = "ink-whisper",
        language: STTLanguages | str = "en",
        encoding: STTEncoding = "pcm_s16le",
        sample_rate: int = 16000,
        api_key: str | None = None,
        http_session: aiohttp.ClientSession | None = None,
        base_url: str = "https://api.cartesia.ai",
    ) -> None:
        """
        Create a new instance of Cartesia STT.

        Args:
            model: The Cartesia STT model to use. Defaults to "ink-whisper".
            language: The language code for recognition. Defaults to "en".
            encoding: The audio encoding format. Defaults to "pcm_s16le".
            sample_rate: The sample rate of the audio in Hz. Defaults to 16000.
            api_key: The Cartesia API key. If not provided, it will be read from
                the CARTESIA_API_KEY environment variable.
            http_session: Optional aiohttp ClientSession to use for requests.
            base_url: The base URL for the Cartesia API.
                Defaults to "https://api.cartesia.ai".

        Raises:
            ValueError: If no API key is provided or found in environment variables.
        """
        super().__init__(capabilities=stt.STTCapabilities(streaming=True, interim_results=False))

        cartesia_api_key = api_key or os.environ.get("CARTESIA_API_KEY")
        if not cartesia_api_key:
            raise ValueError("CARTESIA_API_KEY must be set")

        self._opts = STTOptions(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
            api_key=cartesia_api_key,
            base_url=base_url,
        )
        self._session = http_session
        self._streams = weakref.WeakSet[SpeechStream]()

    def _ensure_session(self) -> aiohttp.ClientSession:
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    async def _recognize_impl(
        self,
        buffer: utils.AudioBuffer,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions,
    ) -> stt.SpeechEvent:
        raise NotImplementedError(
            "Cartesia STT does not support batch recognition, use stream() instead"
        )

    def stream(
        self,
        *,
        language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> SpeechStream:
        """Create a streaming transcription session."""
        config = self._sanitize_options(language=language)
        stream = SpeechStream(
            stt=self,
            opts=config,
            conn_options=conn_options,
        )
        self._streams.add(stream)
        return stream

    def update_options(
        self,
        *,
        model: NotGivenOr[STTModels | str] = NOT_GIVEN,
        language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
    ) -> None:
        """Update STT configuration options."""
        if is_given(model):
            self._opts.model = model
        if is_given(language):
            self._opts.language = language

        # Update all active streams
        for stream in self._streams:
            stream.update_options(
                model=model,
                language=language,
            )

    def _sanitize_options(
        self, *, language: NotGivenOr[STTLanguages | str] = NOT_GIVEN
    ) -> STTOptions:
        """Create a sanitized copy of options with language override if provided."""
        config = STTOptions(
            model=self._opts.model,
            language=self._opts.language,
            encoding=self._opts.encoding,
            sample_rate=self._opts.sample_rate,
            api_key=self._opts.api_key,
            base_url=self._opts.base_url,
        )

        if is_given(language):
            config.language = language

        return config


Create a new instance of Cartesia STT.

Args

model
The Cartesia STT model to use. Defaults to "ink-whisper".
language
The language code for recognition. Defaults to "en".
encoding
The audio encoding format. Defaults to "pcm_s16le".
sample_rate
The sample rate of the audio in Hz. Defaults to 16000.
api_key
The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
http_session
Optional aiohttp ClientSession to use for requests.
base_url
The base URL for the Cartesia API. Defaults to "https://api.cartesia.ai".

Raises

ValueError
If no API key is provided or found in environment variables.
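
A minimal construction sketch (assuming CARTESIA_API_KEY is set in the environment):

    from livekit.plugins import cartesia

    stt = cartesia.STT(
        model="ink-whisper",
        language="en",
        sample_rate=16000,
    )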

Ancestors

  • livekit.agents.stt.stt.STT
  • abc.ABC
  • EventEmitter
  • typing.Generic

Methods

def stream(self,
*,
language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.cartesia.stt.SpeechStream
def stream(
    self,
    *,
    language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> SpeechStream:
    """Create a streaming transcription session."""
    config = self._sanitize_options(language=language)
    stream = SpeechStream(
        stt=self,
        opts=config,
        conn_options=conn_options,
    )
    self._streams.add(stream)
    return stream

Create a streaming transcription session.
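
A sketch of a streaming session, assuming `frames` is an async iterable of rtc.AudioFrame from your audio source:

    import asyncio
    from livekit.agents import stt as agent_stt

    async def transcribe(frames) -> None:
        stream = stt.stream()

        async def _push() -> None:
            async for frame in frames:
                stream.push_frame(frame)
            stream.end_input()

        push_task = asyncio.create_task(_push())
        async for event in stream:
            if event.type == agent_stt.SpeechEventType.FINAL_TRANSCRIPT:
                print(event.alternatives[0].text)
        await push_task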

def update_options(self,
*,
model: NotGivenOr[STTModels | str] = NOT_GIVEN,
language: NotGivenOr[STTLanguages | str] = NOT_GIVEN) ‑> None
def update_options(
    self,
    *,
    model: NotGivenOr[STTModels | str] = NOT_GIVEN,
    language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
) -> None:
    """Update STT configuration options."""
    if is_given(model):
        self._opts.model = model
    if is_given(language):
        self._opts.language = language

    # Update all active streams
    for stream in self._streams:
        stream.update_options(
            model=model,
            language=language,
        )

Update STT configuration options.
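
For example, switching recognition language mid-session (active streams pick up the change):

    stt.update_options(language="es")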

Inherited members

class TTS (*,
api_key: str | None = None,
model: TTSModels | str = 'sonic-2',
language: str = 'en',
encoding: TTSEncoding = 'pcm_s16le',
voice: str | list[float] = '794f9389-aac1-45b6-b726-9d9369183238',
speed: TTSVoiceSpeed | float | None = None,
emotion: list[TTSVoiceEmotion | str] | None = None,
sample_rate: int = 24000,
word_timestamps: bool = True,
http_session: aiohttp.ClientSession | None = None,
tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
text_pacing: tts.SentenceStreamPacer | bool = False,
base_url: str = 'https://api.cartesia.ai')
class TTS(tts.TTS):
    def __init__(
        self,
        *,
        api_key: str | None = None,
        model: TTSModels | str = "sonic-2",
        language: str = "en",
        encoding: TTSEncoding = "pcm_s16le",
        voice: str | list[float] = TTSDefaultVoiceId,
        speed: TTSVoiceSpeed | float | None = None,
        emotion: list[TTSVoiceEmotion | str] | None = None,
        sample_rate: int = 24000,
        word_timestamps: bool = True,
        http_session: aiohttp.ClientSession | None = None,
        tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
        text_pacing: tts.SentenceStreamPacer | bool = False,
        base_url: str = "https://api.cartesia.ai",
    ) -> None:
        """
        Create a new instance of Cartesia TTS.

        See https://docs.cartesia.ai/reference/web-socket/stream-speech/stream-speech for more details on the Cartesia API.

        Args:
            model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-2".
            language (str, optional): The language code for synthesis. Defaults to "en".
            encoding (TTSEncoding, optional): The audio encoding format. Defaults to "pcm_s16le".
            voice (str | list[float], optional): The voice ID or embedding array.
            speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
            emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
            sample_rate (int, optional): The audio sample rate in Hz. Defaults to 24000.
            word_timestamps (bool, optional): Whether to add word timestamps to the output. Defaults to True.
            api_key (str, optional): The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
            http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.
            tokenizer (tokenize.SentenceTokenizer, optional): The tokenizer to use. Defaults to tokenize.blingfire.SentenceTokenizer().
            text_pacing (tts.SentenceStreamPacer | bool, optional): Stream pacer for the TTS. Set to True to use the default pacer, False to disable.
            base_url (str, optional): The base URL for the Cartesia API. Defaults to "https://api.cartesia.ai".
        """  # noqa: E501

        super().__init__(
            capabilities=tts.TTSCapabilities(
                streaming=True,
                aligned_transcript=word_timestamps,
            ),
            sample_rate=sample_rate,
            num_channels=1,
        )
        cartesia_api_key = api_key or os.environ.get("CARTESIA_API_KEY")
        if not cartesia_api_key:
            raise ValueError("CARTESIA_API_KEY must be set")

        if (speed or emotion) and model != "sonic-2-2025-03-07":
            logger.warning(
                "speed and emotion controls are only supported for model 'sonic-2-2025-03-07', "
                "see https://docs.cartesia.ai/developer-tools/changelog for details",
                extra={"model": model, "speed": speed, "emotion": emotion},
            )

        self._opts = _TTSOptions(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
            voice=voice,
            speed=speed,
            emotion=emotion,
            api_key=cartesia_api_key,
            base_url=base_url,
            word_timestamps=word_timestamps,
        )
        self._session = http_session
        self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
            connect_cb=self._connect_ws,
            close_cb=self._close_ws,
            max_session_duration=300,
            mark_refreshed_on_get=True,
        )
        self._streams = weakref.WeakSet[SynthesizeStream]()
        self._sentence_tokenizer = (
            tokenizer if is_given(tokenizer) else tokenize.blingfire.SentenceTokenizer()
        )
        self._stream_pacer: tts.SentenceStreamPacer | None = None
        if text_pacing is True:
            self._stream_pacer = tts.SentenceStreamPacer()
        elif isinstance(text_pacing, tts.SentenceStreamPacer):
            self._stream_pacer = text_pacing

    async def _connect_ws(self, timeout: float) -> aiohttp.ClientWebSocketResponse:
        session = self._ensure_session()
        url = self._opts.get_ws_url(
            f"/tts/websocket?api_key={self._opts.api_key}&cartesia_version={API_VERSION}"
        )
        return await asyncio.wait_for(session.ws_connect(url), timeout)

    async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None:
        await ws.close()

    def _ensure_session(self) -> aiohttp.ClientSession:
        if not self._session:
            self._session = utils.http_context.http_session()

        return self._session

    def prewarm(self) -> None:
        self._pool.prewarm()

    def update_options(
        self,
        *,
        model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
        language: NotGivenOr[str] = NOT_GIVEN,
        voice: NotGivenOr[str | list[float]] = NOT_GIVEN,
        speed: NotGivenOr[TTSVoiceSpeed | float | None] = NOT_GIVEN,
        emotion: NotGivenOr[list[TTSVoiceEmotion | str] | None] = NOT_GIVEN,
    ) -> None:
        """
        Update the Text-to-Speech (TTS) configuration options.

        This method allows updating the TTS settings, including model type, language, voice, speed,
        and emotion. If any parameter is not provided, the existing value will be retained.

        Args:
            model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-2".
            language (str, optional): The language code for synthesis. Defaults to "en".
            voice (str | list[float], optional): The voice ID or embedding array.
            speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
            emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
        """
        if is_given(model):
            self._opts.model = model
        if is_given(language):
            self._opts.language = language
        if is_given(voice):
            self._opts.voice = cast(Union[str, list[float]], voice)
        if is_given(speed):
            self._opts.speed = cast(Optional[Union[TTSVoiceSpeed, float]], speed)
        if is_given(emotion):
            self._opts.emotion = emotion

        if (speed or emotion) and self._opts.model != "sonic-2-2025-03-07":
            logger.warning(
                "speed and emotion controls are only supported for model 'sonic-2-2025-03-07', "
                "see https://docs.cartesia.ai/developer-tools/changelog for details",
                extra={"model": self._opts.model, "speed": speed, "emotion": emotion},
            )

    def synthesize(
        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> ChunkedStream:
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

    def stream(
        self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> SynthesizeStream:
        return SynthesizeStream(tts=self, conn_options=conn_options)

    async def aclose(self) -> None:
        for stream in list(self._streams):
            await stream.aclose()

        self._streams.clear()
        await self._pool.aclose()


Create a new instance of Cartesia TTS.

See https://docs.cartesia.ai/reference/web-socket/stream-speech/stream-speech for more details on the Cartesia API.

Args

model : TTSModels, optional
The Cartesia TTS model to use. Defaults to "sonic-2".
language : str, optional
The language code for synthesis. Defaults to "en".
encoding : TTSEncoding, optional
The audio encoding format. Defaults to "pcm_s16le".
voice : str | list[float], optional
The voice ID or embedding array.
speed : TTSVoiceSpeed | float, optional
Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
emotion : list[TTSVoiceEmotion], optional
Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
sample_rate : int, optional
The audio sample rate in Hz. Defaults to 24000.
word_timestamps : bool, optional
Whether to add word timestamps to the output. Defaults to True.
api_key : str, optional
The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
http_session : aiohttp.ClientSession | None, optional
An existing aiohttp ClientSession to use. If not provided, a new session will be created.
tokenizer : tokenize.SentenceTokenizer, optional
The tokenizer to use. Defaults to tokenize.blingfire.SentenceTokenizer().
text_pacing : tts.SentenceStreamPacer | bool, optional
Stream pacer for the TTS. Set to True to use the default pacer, False to disable.
base_url : str, optional
The base URL for the Cartesia API. Defaults to "https://api.cartesia.ai".
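
A minimal construction sketch (assuming CARTESIA_API_KEY is set in the environment):

    from livekit.plugins import cartesia

    tts = cartesia.TTS(
        model="sonic-2",
        language="en",
        sample_rate=24000,
    )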

Ancestors

  • livekit.agents.tts.tts.TTS
  • abc.ABC
  • EventEmitter
  • typing.Generic

Methods

async def aclose(self) ‑> None
async def aclose(self) -> None:
    for stream in list(self._streams):
        await stream.aclose()

    self._streams.clear()
    await self._pool.aclose()
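
Close all active synthesis streams and release the pooled WebSocket connections.
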
def prewarm(self) ‑> None
def prewarm(self) -> None:
    self._pool.prewarm()

Pre-warm connection to the TTS service
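
Calling this before the first synthesis request can hide connection latency, e.g.:

    tts.prewarm()  # opens a pooled WebSocket ahead of the first request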

def stream(self,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.cartesia.tts.SynthesizeStream
def stream(
    self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
) -> SynthesizeStream:
    return SynthesizeStream(tts=self, conn_options=conn_options)
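
Open a streaming synthesis session over the Cartesia WebSocket API. A usage sketch (the pushed text is illustrative):

    stream = tts.stream()
    stream.push_text("Hello, ")
    stream.push_text("world!")
    stream.flush()
    stream.end_input()

    async for audio in stream:
        ...  # consume audio.frame as it is synthesized
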
def synthesize(self,
text: str,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.cartesia.tts.ChunkedStream
def synthesize(
    self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
) -> ChunkedStream:
    return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
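
Synthesize the given text in a single request via the /tts/bytes endpoint, returning a ChunkedStream (see the consumption sketch under ChunkedStream above). For example:

    async for audio in tts.synthesize("Hi there!"):
        ...  # raw PCM frames at the configured sample rate
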
def update_options(self,
*,
model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
language: NotGivenOr[str] = NOT_GIVEN,
voice: NotGivenOr[str | list[float]] = NOT_GIVEN,
speed: NotGivenOr[TTSVoiceSpeed | float | None] = NOT_GIVEN,
emotion: NotGivenOr[list[TTSVoiceEmotion | str] | None] = NOT_GIVEN) ‑> None
def update_options(
    self,
    *,
    model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
    language: NotGivenOr[str] = NOT_GIVEN,
    voice: NotGivenOr[str | list[float]] = NOT_GIVEN,
    speed: NotGivenOr[TTSVoiceSpeed | float | None] = NOT_GIVEN,
    emotion: NotGivenOr[list[TTSVoiceEmotion | str] | None] = NOT_GIVEN,
) -> None:
    """
    Update the Text-to-Speech (TTS) configuration options.

    This method allows updating the TTS settings, including model type, language, voice, speed,
    and emotion. If any parameter is not provided, the existing value will be retained.

    Args:
        model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-2".
        language (str, optional): The language code for synthesis. Defaults to "en".
        voice (str | list[float], optional): The voice ID or embedding array.
        speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
        emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
    """
    if is_given(model):
        self._opts.model = model
    if is_given(language):
        self._opts.language = language
    if is_given(voice):
        self._opts.voice = cast(Union[str, list[float]], voice)
    if is_given(speed):
        self._opts.speed = cast(Optional[Union[TTSVoiceSpeed, float]], speed)
    if is_given(emotion):
        self._opts.emotion = emotion

    if (speed or emotion) and self._opts.model != "sonic-2-2025-03-07":
        logger.warning(
            "speed and emotion controls are only supported for model 'sonic-2-2025-03-07', "
            "see https://docs.cartesia.ai/developer-tools/changelog for details",
            extra={"model": self._opts.model, "speed": speed, "emotion": emotion},
        )

Update the Text-to-Speech (TTS) configuration options.

This method allows updating the TTS settings, including model type, language, voice, speed, and emotion. If any parameter is not provided, the existing value will be retained.

Args

model : TTSModels, optional
The Cartesia TTS model to use. Defaults to "sonic-2".
language : str, optional
The language code for synthesis. Defaults to "en".
voice : str | list[float], optional
The voice ID or embedding array.
speed : TTSVoiceSpeed | float, optional
Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
emotion : list[TTSVoiceEmotion], optional
Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
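
For example, switching to a different voice at runtime (the ID below is a placeholder, not a real voice):

    tts.update_options(voice="your-voice-id")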

Inherited members