Module `livekit.plugins.aws.tts`

Classes

class ChunkedStream (*, tts: TTS, text: str, session: aioboto3.Session, conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0), opts: _TTSOptions)

Expand source code

class ChunkedStream(tts.ChunkedStream):
    """Single non-streamed AWS Polly synthesis request.

    The input text is sent to Polly's ``synthesize_speech`` call with MP3
    output, the response body is fed into an ``AudioStreamDecoder``, and the
    decoded frames are pushed to the event channel through a
    ``SynthesizedAudioEmitter``.
    """

    def __init__(
        self,
        *,
        tts: TTS,
        text: str,
        session: aioboto3.Session,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
        opts: _TTSOptions,
    ) -> None:
        """Store the session and synthesis options used later by ``_run``.

        Args:
            tts: The TTS instance this stream belongs to.
            text: Text to synthesize.
            session: aioboto3 session used to create the Polly client.
            conn_options: Connection options forwarded to the base class.
            opts: Voice, engine, language and sample-rate settings.
        """
        super().__init__(tts=tts, input_text=text, conn_options=conn_options)
        self._session = session
        self._opts = opts
        self._segment_id = utils.shortuuid()

    async def _run(self):
        """Call Polly and emit the decoded audio frames.

        Raises:
            APITimeoutError: When an ``asyncio.TimeoutError`` is raised.
            APIStatusError: When an ``aiohttp.ClientResponseError`` is raised.
            APIConnectionError: For any other exception.
        """
        request_id = utils.shortuuid()

        try:
            async with self._session.client("polly") as polly:
                opts = self._opts

                # Options left as "not given" fall back to the module defaults;
                # the language is dropped from the request entirely.
                engine = (
                    opts.speech_engine if is_given(opts.speech_engine) else DEFAULT_SPEECH_ENGINE
                )
                voice_id = opts.voice if is_given(opts.voice) else DEFAULT_VOICE
                language_code = opts.language if is_given(opts.language) else None

                request = {
                    "Text": self._input_text,
                    "OutputFormat": "mp3",
                    "Engine": engine,
                    "VoiceId": voice_id,
                    "TextType": "text",
                    "SampleRate": str(opts.sample_rate),
                    "LanguageCode": language_code,
                }
                response = await polly.synthesize_speech(**_strip_nones(request))
                if "AudioStream" not in response:
                    # Without an audio stream in the response nothing is emitted.
                    return

                decoder = utils.codecs.AudioStreamDecoder(
                    sample_rate=self._opts.sample_rate,
                    num_channels=1,
                )

                async def feed_decoder():
                    """Copy the response body into the decoder chunk by chunk."""
                    try:
                        async with response["AudioStream"] as body:
                            async for chunk, _ in body.content.iter_chunks():
                                decoder.push(chunk)
                    finally:
                        # Always mark the input as finished, including when
                        # reading the body fails (presumably so the frame loop
                        # below can terminate -- confirm against the decoder).
                        decoder.end_input()

                # Feed the decoder in the background while frames are consumed here.
                feeder = asyncio.create_task(feed_decoder())

                try:
                    emitter = tts.SynthesizedAudioEmitter(
                        event_ch=self._event_ch,
                        request_id=request_id,
                        segment_id=self._segment_id,
                    )
                    async for frame in decoder:
                        emitter.push(frame)
                    emitter.flush()
                    # Awaiting the feeder re-raises any error hit while reading.
                    await feeder
                finally:
                    await utils.aio.gracefully_cancel(feeder)

        except asyncio.TimeoutError:
            raise APITimeoutError() from None
        except aiohttp.ClientResponseError as e:
            raise APIStatusError(
                message=e.message,
                status_code=e.status,
                request_id=request_id,
                body=None,
            ) from None
        except Exception as e:
            raise APIConnectionError() from e

Used by the non-streamed synthesize API; some providers support chunked HTTP responses.

Ancestors

livekit.agents.tts.tts.ChunkedStream
abc.ABC

class TTS (*, voice: NotGivenOr[str] = NOT_GIVEN, language: NotGivenOr[TTS_LANGUAGE | str] = NOT_GIVEN, speech_engine: NotGivenOr[TTS_SPEECH_ENGINE] = NOT_GIVEN, sample_rate: int = 16000, region: NotGivenOr[str] = NOT_GIVEN, api_key: NotGivenOr[str] = NOT_GIVEN, api_secret: NotGivenOr[str] = NOT_GIVEN, session: aioboto3.Session | None = None)

Expand source code

class TTS(tts.TTS):
    def __init__(
        self,
        *,
        voice: NotGivenOr[str] = NOT_GIVEN,
        language: NotGivenOr[TTS_LANGUAGE | str] = NOT_GIVEN,
        speech_engine: NotGivenOr[TTS_SPEECH_ENGINE] = NOT_GIVEN,
        sample_rate: int = DEFAULT_SAMPLE_RATE,
        region: NotGivenOr[str] = NOT_GIVEN,
        api_key: NotGivenOr[str] = NOT_GIVEN,
        api_secret: NotGivenOr[str] = NOT_GIVEN,
        session: aioboto3.Session | None = None,
    ) -> None:
        """
        Create a new instance of AWS Polly TTS.

        ``api_key`` and ``api_secret`` must be set to your AWS access key ID and
        secret access key, either by passing the arguments or by setting the
        ``AWS_ACCESS_KEY_ID`` and ``AWS_SECRET_ACCESS_KEY`` environment variables.
        When ``session`` is supplied it is used as-is and ``api_key``,
        ``api_secret`` and ``region`` are not used to build a session.

        For more details on AWS Polly speech synthesis, see
        https://docs.aws.amazon.com/polly/latest/dg/API_SynthesizeSpeech.html

        Args:
            voice (str, optional): Voice ID to use for the synthesis. Defaults to "Ruth".
            language (TTS_LANGUAGE, optional): Language code for the Synthesize Speech
                request. This is only necessary if using a bilingual voice, such as
                Aditi, which can be used for either Indian English (en-IN) or Hindi
                (hi-IN).
            speech_engine (TTS_SPEECH_ENGINE, optional): The engine to use for the
                synthesis. Defaults to "generative".
            sample_rate (int, optional): The audio frequency specified in Hz.
                Defaults to 16000.
            region (str, optional): The region to use for the synthesis. Defaults to
                "us-east-1".
            api_key (str, optional): AWS access key ID.
            api_secret (str, optional): AWS secret access key.
            session (aioboto3.Session, optional): Optional aioboto3 session to use.
        """
        capabilities = tts.TTSCapabilities(streaming=False)
        super().__init__(
            capabilities=capabilities,
            sample_rate=sample_rate,
            num_channels=TTS_NUM_CHANNELS,
        )

        # Only build a session when the caller did not provide one; values
        # left as "not given" are passed on as None.
        if not session:
            session = get_aws_async_session(
                api_key=api_key if is_given(api_key) else None,
                api_secret=api_secret if is_given(api_secret) else None,
                region=region if is_given(region) else None,
            )
        self._session = session

        self._opts = _TTSOptions(
            voice=voice,
            speech_engine=speech_engine,
            region=region,
            language=language,
            sample_rate=sample_rate,
        )

    def synthesize(
        self,
        text: str,
        *,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> ChunkedStream:
        """Create a ``ChunkedStream`` that synthesizes ``text``.

        Args:
            text: Text to synthesize.
            conn_options: Connection options passed on to the stream.

        Returns:
            A ``ChunkedStream`` sharing this instance's session and options.
        """
        stream = ChunkedStream(
            tts=self,
            text=text,
            conn_options=conn_options,
            session=self._session,
            opts=self._opts,
        )
        return stream

Helper class that provides a standard way to create an ABC using inheritance.

Create a new instance of AWS Polly TTS.

api_key and api_secret must be set to your AWS access key ID and secret access key, either by passing the arguments or by setting the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables.

See https://docs.aws.amazon.com/polly/latest/dg/API_SynthesizeSpeech.html for more details on the AWS Polly TTS.

Args

voice : str, optional: Voice ID to use for the synthesis. Defaults to "Ruth".
language : TTS_LANGUAGE, optional: Language code for the Synthesize Speech request. This is only necessary if using a bilingual voice, such as Aditi, which can be used for either Indian English (en-IN) or Hindi (hi-IN).
sample_rate : int, optional: The audio frequency specified in Hz. Defaults to 16000.
speech_engine : TTS_SPEECH_ENGINE, optional: The engine to use for the synthesis. Defaults to "generative".
region : str, optional: The region to use for the synthesis. Defaults to "us-east-1".
api_key : str, optional: AWS access key ID.
api_secret : str, optional: AWS secret access key.
session : aioboto3.Session, optional: Optional aioboto3 session to use.

Ancestors

livekit.agents.tts.tts.TTS
abc.ABC
EventEmitter
typing.Generic

Methods

def synthesize(self, text: str, *, conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> ChunkedStream

Expand source code

def synthesize(
    self,
    text: str,
    *,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> ChunkedStream:
    """Create a ``ChunkedStream`` that synthesizes ``text``.

    Args:
        text: Text to synthesize.
        conn_options: Connection options passed on to the stream.

    Returns:
        A ``ChunkedStream`` sharing this instance's session and options.
    """
    stream = ChunkedStream(
        tts=self,
        text=text,
        conn_options=conn_options,
        session=self._session,
        opts=self._opts,
    )
    return stream

Inherited members

EventEmitter:
- emit
- off
- on
- once