Module livekit.plugins.cartesia

Classes

class ChunkedStream (text: str, opts: _TTSOptions, session: aiohttp.ClientSession)

Synthesize chunked text using the bytes endpoint

Expand source code
class ChunkedStream(tts.ChunkedStream):
    """Synthesize chunked text using the bytes endpoint"""

    def __init__(
        self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
    ) -> None:
        super().__init__()
        self._text, self._opts, self._session = text, opts, session

    @utils.log_exceptions(logger=logger)
    async def _main_task(self) -> None:
        """POST the transcript to the Cartesia /tts/bytes endpoint and emit the
        returned PCM audio as ``tts.SynthesizedAudio`` frames on the event channel.

        Raises:
            aiohttp.ClientResponseError: if the API responds with a non-2xx status.
        """
        # Re-chunks the raw PCM byte stream into fixed-size audio frames.
        bstream = utils.audio.AudioByteStream(
            sample_rate=self._opts.sample_rate, num_channels=NUM_CHANNELS
        )
        request_id, segment_id = utils.shortuuid(), utils.shortuuid()

        json_payload = _to_cartesia_options(self._opts)
        json_payload["transcript"] = self._text

        async with self._session.post(
            "https://api.cartesia.ai/tts/bytes",
            headers={
                API_AUTH_HEADER: self._opts.api_key,
                API_VERSION_HEADER: API_VERSION,
            },
            json=json_payload,
        ) as resp:
            # Fail fast on an API error instead of feeding the error body
            # into the audio byte stream as if it were PCM data.
            resp.raise_for_status()

            # `chunk` (not `data`) so the request payload above isn't shadowed.
            async for chunk, _ in resp.content.iter_chunks():
                for frame in bstream.write(chunk):
                    self._event_ch.send_nowait(
                        tts.SynthesizedAudio(
                            request_id=request_id, segment_id=segment_id, frame=frame
                        )
                    )

            # Emit any remaining buffered bytes as a final (possibly partial) frame.
            for frame in bstream.flush():
                self._event_ch.send_nowait(
                    tts.SynthesizedAudio(
                        request_id=request_id, segment_id=segment_id, frame=frame
                    )
                )

Ancestors

Inherited members

class TTS (*, model: TTSModels = 'sonic-english', language: str = 'en', encoding: TTSEncoding = 'pcm_s16le', voice: str | list[float] = 'c2ac25f9-ecc4-4f56-9095-651354df60c0', speed: TTSVoiceSpeed | float | None = None, emotion: list[TTSVoiceEmotion | str] | None = None, sample_rate: int = 24000, api_key: str | None = None, http_session: aiohttp.ClientSession | None = None)

Helper class that provides a standard way to create an ABC using inheritance.

Create a new instance of Cartesia TTS.

See https://docs.cartesia.ai/reference/web-socket/stream-speech/stream-speech for more details on the Cartesia API.

Args

model : TTSModels, optional
The Cartesia TTS model to use. Defaults to "sonic-english".
language : str, optional
The language code for synthesis. Defaults to "en".
encoding : TTSEncoding, optional
The audio encoding format. Defaults to "pcm_s16le".
voice : str | list[float], optional
The voice ID or embedding array.
speed : TTSVoiceSpeed | float, optional
Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
emotion : list[TTSVoiceEmotion], optional
Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
sample_rate : int, optional
The audio sample rate in Hz. Defaults to 24000.
api_key : str, optional
The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
http_session : aiohttp.ClientSession | None, optional
An existing aiohttp ClientSession to use. If not provided, a new session will be created.
Expand source code
class TTS(tts.TTS):
    def __init__(
        self,
        *,
        model: TTSModels = "sonic-english",
        language: str = "en",
        encoding: TTSEncoding = "pcm_s16le",
        voice: str | list[float] = TTSDefaultVoiceId,
        speed: TTSVoiceSpeed | float | None = None,
        emotion: list[TTSVoiceEmotion | str] | None = None,
        sample_rate: int = 24000,
        api_key: str | None = None,
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        """
        Create a new instance of Cartesia TTS.

        See https://docs.cartesia.ai/reference/web-socket/stream-speech/stream-speech
        for more details on the Cartesia API.

        Args:
            model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-english".
            language (str, optional): The language code for synthesis. Defaults to "en".
            encoding (TTSEncoding, optional): The audio encoding format. Defaults to "pcm_s16le".
            voice (str | list[float], optional): The voice ID or embedding array.
            speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
            emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
            sample_rate (int, optional): The audio sample rate in Hz. Defaults to 24000.
            api_key (str, optional): The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
            http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.
        """

        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=True),
            sample_rate=sample_rate,
            num_channels=NUM_CHANNELS,
        )

        # Fall back to the environment when no key was passed explicitly.
        resolved_key = api_key or os.environ.get("CARTESIA_API_KEY")
        if not resolved_key:
            raise ValueError("CARTESIA_API_KEY must be set")

        self._opts = _TTSOptions(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
            voice=voice,
            speed=speed,
            emotion=emotion,
            api_key=resolved_key,
        )
        # May be None; a shared session is created lazily in _ensure_session().
        self._session = http_session

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the HTTP session, lazily borrowing the shared one if none was given."""
        session = self._session
        if not session:
            session = utils.http_context.http_session()
            self._session = session
        return session

    def synthesize(self, text: str) -> "ChunkedStream":
        """One-shot synthesis of *text* via the bytes endpoint."""
        return ChunkedStream(text, self._opts, self._ensure_session())

    def stream(self) -> "SynthesizeStream":
        """Incremental synthesis via the websocket streaming endpoint."""
        return SynthesizeStream(self._opts, self._ensure_session())

Ancestors

Methods

def stream(self) ‑> livekit.plugins.cartesia.tts.SynthesizeStream
def synthesize(self, text: str) ‑> livekit.plugins.cartesia.tts.ChunkedStream