Module livekit.plugins.cartesia
Classes
class ChunkedStream (text: str, opts: _TTSOptions, session: aiohttp.ClientSession)
Synthesize chunked text using the bytes endpoint
Source code

class ChunkedStream(tts.ChunkedStream):
    """Synthesize chunked text using the bytes endpoint"""

    def __init__(
        self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
    ) -> None:
        super().__init__()
        self._text, self._opts, self._session = text, opts, session

    @utils.log_exceptions(logger=logger)
    async def _main_task(self):
        bstream = utils.audio.AudioByteStream(
            sample_rate=self._opts.sample_rate, num_channels=NUM_CHANNELS
        )
        request_id, segment_id = utils.shortuuid(), utils.shortuuid()

        data = _to_cartesia_options(self._opts)
        data["transcript"] = self._text

        async with self._session.post(
            "https://api.cartesia.ai/tts/bytes",
            headers={
                API_AUTH_HEADER: self._opts.api_key,
                API_VERSION_HEADER: API_VERSION,
            },
            json=data,
        ) as resp:
            async for data, _ in resp.content.iter_chunks():
                for frame in bstream.write(data):
                    self._event_ch.send_nowait(
                        tts.SynthesizedAudio(
                            request_id=request_id, segment_id=segment_id, frame=frame
                        )
                    )

            for frame in bstream.flush():
                self._event_ch.send_nowait(
                    tts.SynthesizedAudio(
                        request_id=request_id, segment_id=segment_id, frame=frame
                    )
                )
Ancestors
- ChunkedStream
- abc.ABC
Inherited members
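For orientation, here is a minimal consumption sketch. It assumes that, as with other livekit.agents TTS plugins, the ChunkedStream returned by TTS.synthesize() (documented below) can be iterated asynchronously to receive the tts.SynthesizedAudio events emitted via _event_ch in the source above; the text, session handling, and print statement are illustrative only.

import asyncio

import aiohttp

from livekit.plugins import cartesia


async def collect_audio() -> None:
    # An explicit aiohttp session is passed in because, outside of an agents
    # job context, the plugin's fallback http_context session may not be
    # available (an assumption based on _ensure_session in the TTS source below).
    async with aiohttp.ClientSession() as session:
        tts_client = cartesia.TTS(http_session=session)  # reads CARTESIA_API_KEY

        stream = tts_client.synthesize("Hello from the bytes endpoint.")

        frames = []
        # Assumed: the stream is async-iterable and yields tts.SynthesizedAudio
        # events, each carrying a request_id, segment_id and an audio frame.
        async for audio in stream:
            frames.append(audio.frame)

        print(f"received {len(frames)} audio frames")


if __name__ == "__main__":
    asyncio.run(collect_audio())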
class TTS (*, model: TTSModels = 'sonic-english', language: str = 'en', encoding: TTSEncoding = 'pcm_s16le', voice: str | list[float] = 'c2ac25f9-ecc4-4f56-9095-651354df60c0', speed: TTSVoiceSpeed | float | None = None, emotion: list[TTSVoiceEmotion | str] | None = None, sample_rate: int = 24000, api_key: str | None = None, http_session: aiohttp.ClientSession | None = None)
Create a new instance of Cartesia TTS.
See https://docs.cartesia.ai/reference/web-socket/stream-speech/stream-speech for more details on the Cartesia API.
Args
- model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-english".
- language (str, optional): The language code for synthesis. Defaults to "en".
- encoding (TTSEncoding, optional): The audio encoding format. Defaults to "pcm_s16le".
- voice (str | list[float], optional): The voice ID or embedding array. Defaults to "c2ac25f9-ecc4-4f56-9095-651354df60c0".
- speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control).
- emotion (list[TTSVoiceEmotion | str], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control).
- sample_rate (int, optional): The audio sample rate in Hz. Defaults to 24000.
- api_key (str, optional): The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
- http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.
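A hedged configuration sketch follows. The voice ID is the default from the signature above, while the speed and emotion values are illustrative assumptions; check TTSVoiceSpeed, TTSVoiceEmotion and the linked voice-control guide for the values actually accepted.

from livekit.plugins import cartesia

tts_client = cartesia.TTS(
    model="sonic-english",
    language="en",
    encoding="pcm_s16le",
    voice="c2ac25f9-ecc4-4f56-9095-651354df60c0",  # default voice ID from the signature
    speed="fast",                                  # assumed TTSVoiceSpeed value
    emotion=["positivity:high"],                   # assumed tag; plain strings are accepted by the signature
    sample_rate=24000,
    api_key="your-cartesia-api-key",               # or rely on CARTESIA_API_KEY
)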
Source code

class TTS(tts.TTS):
    def __init__(
        self,
        *,
        model: TTSModels = "sonic-english",
        language: str = "en",
        encoding: TTSEncoding = "pcm_s16le",
        voice: str | list[float] = TTSDefaultVoiceId,
        speed: TTSVoiceSpeed | float | None = None,
        emotion: list[TTSVoiceEmotion | str] | None = None,
        sample_rate: int = 24000,
        api_key: str | None = None,
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        """
        Create a new instance of Cartesia TTS.

        See https://docs.cartesia.ai/reference/web-socket/stream-speech/stream-speech for more details on the Cartesia API.

        Args:
            model (TTSModels, optional): The Cartesia TTS model to use. Defaults to "sonic-english".
            language (str, optional): The language code for synthesis. Defaults to "en".
            encoding (TTSEncoding, optional): The audio encoding format. Defaults to "pcm_s16le".
            voice (str | list[float], optional): The voice ID or embedding array.
            speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
            emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
            sample_rate (int, optional): The audio sample rate in Hz. Defaults to 24000.
            api_key (str, optional): The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
            http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=True),
            sample_rate=sample_rate,
            num_channels=NUM_CHANNELS,
        )

        api_key = api_key or os.environ.get("CARTESIA_API_KEY")
        if not api_key:
            raise ValueError("CARTESIA_API_KEY must be set")

        self._opts = _TTSOptions(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
            voice=voice,
            speed=speed,
            emotion=emotion,
            api_key=api_key,
        )
        self._session = http_session

    def _ensure_session(self) -> aiohttp.ClientSession:
        if not self._session:
            self._session = utils.http_context.http_session()

        return self._session

    def synthesize(self, text: str) -> "ChunkedStream":
        return ChunkedStream(text, self._opts, self._ensure_session())

    def stream(self) -> "SynthesizeStream":
        return SynthesizeStream(self._opts, self._ensure_session())
Ancestors
- TTS
- abc.ABC
Methods
def stream(self) ‑> livekit.plugins.cartesia.tts.SynthesizeStream
    Create a SynthesizeStream for streaming synthesis.
def synthesize(self, text: str) ‑> livekit.plugins.cartesia.tts.ChunkedStream
    Synthesize the given text in a single request via the bytes endpoint and return a ChunkedStream of audio frames.
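For completeness, a rough sketch of the streaming path. It assumes the SynthesizeStream returned by stream() follows the livekit.agents streaming-TTS interface (push_text(), end_input(), async iteration over SynthesizedAudio events); those method names come from the agents framework rather than this module's documented source, so treat them as assumptions.

import asyncio

import aiohttp

from livekit.plugins import cartesia


async def stream_tokens() -> None:
    async with aiohttp.ClientSession() as session:
        tts_client = cartesia.TTS(http_session=session)

        stream = tts_client.stream()
        # Text is pushed incrementally, e.g. as tokens arrive from an LLM.
        stream.push_text("Streaming synthesis sends text ")
        stream.push_text("as it becomes available.")
        stream.end_input()  # assumed: signals that no further text will be pushed

        async for audio in stream:
            # audio.frame is assumed to be an audio frame ready for playback or publishing
            handle(audio.frame)


def handle(frame) -> None:
    # Placeholder sink; a real application would publish or play the frame.
    pass


if __name__ == "__main__":
    asyncio.run(stream_tokens())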