Module livekit.plugins.inworld
Inworld plugin for LiveKit Agents
See https://docs.livekit.io/agents/integrations/tts/inworld/ and https://docs.livekit.io/agents/models/stt/inworld/ for more information.
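For orientation, a minimal sketch of wiring the plugin into an agent (assuming the standard LiveKit Agents 1.x entrypoint pattern and INWORLD_API_KEY set in the environment; an LLM would normally be supplied as well):

from livekit import agents
from livekit.plugins import inworld

async def entrypoint(ctx: agents.JobContext) -> None:
    # STT and TTS both read INWORLD_API_KEY from the environment by default.
    session = agents.AgentSession(
        stt=inworld.STT(),
        tts=inworld.TTS(voice="Ashley"),  # "Ashley" is the documented default voice
        # llm=...  (any LiveKit-supported LLM)
    )
    await session.start(
        room=ctx.room,
        agent=agents.Agent(instructions="You are a helpful voice assistant."),
    )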
Classes
class ChunkedStream (*,
tts: TTS,
input_text: str,
conn_options: APIConnectOptions)-
class ChunkedStream(tts.ChunkedStream):
    def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._tts: TTS = tts
        self._opts = replace(tts._opts)

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        try:
            audio_config: dict[str, Any] = {
                "audioEncoding": self._opts.encoding,
                "bitrate": self._opts.bit_rate,
                "sampleRateHertz": self._opts.sample_rate,
                "temperature": self._opts.temperature,
                "speakingRate": self._opts.speaking_rate,
            }
            body_params: dict[str, Any] = {
                "text": self._input_text,
                "voiceId": self._opts.voice,
                "modelId": self._opts.model,
                "audioConfig": audio_config,
            }
            if utils.is_given(self._opts.timestamp_type):
                body_params["timestampType"] = self._opts.timestamp_type
            if utils.is_given(self._opts.text_normalization):
                body_params["applyTextNormalization"] = self._opts.text_normalization
            body_params["timestampTransportStrategy"] = self._opts.timestamp_transport_strategy

            x_request_id = str(uuid.uuid4())
            async with self._tts._ensure_session().post(
                urljoin(self._tts._base_url, "/tts/v1/voice:stream"),
                headers={
                    "Authorization": self._tts._authorization,
                    "X-User-Agent": USER_AGENT,
                    "X-Request-Id": x_request_id,
                },
                json=body_params,
                timeout=aiohttp.ClientTimeout(sock_connect=self._conn_options.timeout),
                # large read_bufsize to avoid `ValueError: Chunk too big`
                read_bufsize=10 * 1024 * 1024,
            ) as resp:
                resp.raise_for_status()

                request_id = utils.shortuuid()
                output_emitter.initialize(
                    request_id=request_id,
                    sample_rate=self._opts.sample_rate,
                    num_channels=NUM_CHANNELS,
                    mime_type=self._opts.mime_type,
                )

                async for raw_line in resp.content:
                    line = raw_line.strip()
                    if not line:
                        continue
                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError:
                        logger.warning("failed to parse Inworld response line: %s", line)
                        continue

                    if result := data.get("result"):
                        # Handle timestamp info if present
                        if timestamp_info := result.get("timestampInfo"):
                            timed_strings = _parse_timestamp_info(timestamp_info)
                            if timed_strings:
                                output_emitter.push_timed_transcript(timed_strings)
                        if audio_content := result.get("audioContent"):
                            output_emitter.push(base64.b64decode(audio_content))
                            if self._opts.encoding == "PCM":
                                output_emitter.flush()
                    elif error := data.get("error"):
                        raise APIStatusError(
                            message=error.get("message"),
                            status_code=error.get("code"),
                            request_id=x_request_id,
                            body=None,
                        )
        except asyncio.TimeoutError:
            raise APITimeoutError() from None
        except aiohttp.ClientResponseError as e:
            raise APIStatusError(
                message=e.message, status_code=e.status, request_id=x_request_id, body=None
            ) from None
        except Exception as e:
            raise APIConnectionError() from e

Used by the non-streamed synthesize API; some providers support chunked HTTP responses.
Ancestors
- livekit.agents.tts.tts.ChunkedStream
- abc.ABC
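ChunkedStream is not normally constructed directly; TTS.synthesize() (documented below) returns one. A minimal consumption sketch, assuming a tts_engine built as shown later on this page:

tts_engine = inworld.TTS()
stream = tts_engine.synthesize("Hello from Inworld!")  # returns a ChunkedStream
frames = []
async for audio in stream:  # yields SynthesizedAudio chunks as they arrive
    frames.append(audio.frame)  # audio.frame is an rtc.AudioFrame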
class STT (*,
api_key: NotGivenOr[str] = NOT_GIVEN,
model: NotGivenOr[str] = NOT_GIVEN,
language: NotGivenOr[str] = NOT_GIVEN,
sample_rate: NotGivenOr[int] = NOT_GIVEN,
num_channels: NotGivenOr[int] = NOT_GIVEN,
enable_voice_profile: bool = True,
voice_profile_top_n: int = 1,
vad_threshold: NotGivenOr[float] = NOT_GIVEN,
min_end_of_turn_silence_when_confident: int = 200,
end_of_turn_confidence_threshold: float = 0.3,
base_url: str = 'https://api.inworld.ai/',
http_session: aiohttp.ClientSession | None = None)-
class STT(stt.STT):
    def __init__(
        self,
        *,
        api_key: NotGivenOr[str] = NOT_GIVEN,
        model: NotGivenOr[str] = NOT_GIVEN,
        language: NotGivenOr[str] = NOT_GIVEN,
        sample_rate: NotGivenOr[int] = NOT_GIVEN,
        num_channels: NotGivenOr[int] = NOT_GIVEN,
        enable_voice_profile: bool = True,
        voice_profile_top_n: int = 1,
        vad_threshold: NotGivenOr[float] = NOT_GIVEN,
        min_end_of_turn_silence_when_confident: int = 200,
        end_of_turn_confidence_threshold: float = 0.3,
        base_url: str = DEFAULT_API_URL,
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        """Create a new instance of Inworld STT.

        Args:
            api_key: Inworld API key. If not provided, reads from INWORLD_API_KEY env var.
            model: STT model identifier. Any model string supported by the Inworld STT API
                is accepted (e.g. "inworld/inworld-stt-1",
                "assemblyai/universal-streaming-multilingual", "soniox/stt-rt-v4").
                Defaults to "inworld/inworld-stt-1".
            language: Language code. Defaults to "en-US".
            sample_rate: Audio sample rate in Hz. Defaults to 16000.
            num_channels: Number of audio channels. Defaults to 1.
            enable_voice_profile: Enable voice profiling (age, gender, emotion, accent).
                Defaults to True.
            voice_profile_top_n: Number of top voice profile results per category.
            vad_threshold: VAD sensitivity threshold.
            min_end_of_turn_silence_when_confident: Minimum silence (ms) before end-of-turn
                when confidence is high. Defaults to 200.
            end_of_turn_confidence_threshold: Confidence threshold for end-of-turn detection.
                Lower values trigger end-of-turn more eagerly. Defaults to 0.3.
            base_url: Base URL for the Inworld API. Defaults to "https://api.inworld.ai/".
            http_session: Optional aiohttp.ClientSession to use.
        """
        super().__init__(
            capabilities=stt.STTCapabilities(
                streaming=True, interim_results=True, offline_recognize=False
            ),
        )
        api_key = api_key if utils.is_given(api_key) else os.getenv("INWORLD_API_KEY", "")
        if not api_key:
            raise ValueError("Inworld API key required. Set INWORLD_API_KEY or provide api_key.")
        self._authorization = f"Basic {api_key}"
        self._base_url = base_url
        self._http_session = http_session
        self._streams: weakref.WeakSet[SpeechStream] = weakref.WeakSet()
        self._opts = _STTOptions(
            model=model if utils.is_given(model) else DEFAULT_MODEL,
            language=language if utils.is_given(language) else DEFAULT_LANGUAGE,
            sample_rate=sample_rate if utils.is_given(sample_rate) else DEFAULT_SAMPLE_RATE,
            num_channels=num_channels if utils.is_given(num_channels) else DEFAULT_NUM_CHANNELS,
            enable_voice_profile=enable_voice_profile,
            voice_profile_top_n=voice_profile_top_n,
            vad_threshold=vad_threshold,
            min_end_of_turn_silence_when_confident=min_end_of_turn_silence_when_confident,
            end_of_turn_confidence_threshold=end_of_turn_confidence_threshold,
        )

    @property
    def model(self) -> str:
        return self._opts.model

    @property
    def provider(self) -> str:
        return "Inworld"

    def update_options(
        self,
        *,
        model: NotGivenOr[str] = NOT_GIVEN,
        language: NotGivenOr[str] = NOT_GIVEN,
        enable_voice_profile: NotGivenOr[bool] = NOT_GIVEN,
        voice_profile_top_n: NotGivenOr[int] = NOT_GIVEN,
        vad_threshold: NotGivenOr[float] = NOT_GIVEN,
        min_end_of_turn_silence_when_confident: NotGivenOr[int] = NOT_GIVEN,
        end_of_turn_confidence_threshold: NotGivenOr[float] = NOT_GIVEN,
    ) -> None:
        """Update STT options. Changes apply to new streams only.

        Args:
            model: STT model identifier (e.g. "inworld/inworld-stt-1",
                "assemblyai/universal-streaming-multilingual"). Any model string is accepted.
            language: Language code (e.g. "en-US").
            enable_voice_profile: Enable voice profiling.
            voice_profile_top_n: Number of top voice profile results.
            vad_threshold: VAD sensitivity threshold.
            min_end_of_turn_silence_when_confident: Min silence (ms) for end-of-turn.
            end_of_turn_confidence_threshold: Confidence threshold for end-of-turn.
        """
        if utils.is_given(model):
            self._opts.model = model
        if utils.is_given(language):
            self._opts.language = language
        if utils.is_given(enable_voice_profile):
            self._opts.enable_voice_profile = enable_voice_profile
        if utils.is_given(voice_profile_top_n):
            self._opts.voice_profile_top_n = voice_profile_top_n
        if utils.is_given(vad_threshold):
            self._opts.vad_threshold = vad_threshold
        if utils.is_given(min_end_of_turn_silence_when_confident):
            self._opts.min_end_of_turn_silence_when_confident = (
                min_end_of_turn_silence_when_confident
            )
        if utils.is_given(end_of_turn_confidence_threshold):
            self._opts.end_of_turn_confidence_threshold = end_of_turn_confidence_threshold

    def _ensure_session(self) -> aiohttp.ClientSession:
        if not self._http_session:
            self._http_session = utils.http_context.http_session()
        return self._http_session

    async def _recognize_impl(
        self,
        buffer: utils.AudioBuffer,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions,
    ) -> stt.SpeechEvent:
        raise NotImplementedError(
            "Inworld STT does not support batch recognition — use streaming via stream()"
        )

    def stream(
        self,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> SpeechStream:
        stream = SpeechStream(
            stt=self,
            conn_options=conn_options,
            language=language if utils.is_given(language) else self._opts.language,
        )
        self._streams.add(stream)
        return stream
Create a new instance of Inworld STT.
Args
api_key- Inworld API key. If not provided, reads from INWORLD_API_KEY env var.
model- STT model identifier. Any model string supported by the Inworld STT API is accepted (e.g. "inworld/inworld-stt-1", "assemblyai/universal-streaming-multilingual", "soniox/stt-rt-v4"). Defaults to "inworld/inworld-stt-1".
language- Language code. Defaults to "en-US".
sample_rate- Audio sample rate in Hz. Defaults to 16000.
num_channels- Number of audio channels. Defaults to 1.
enable_voice_profile- Enable voice profiling (age, gender, emotion, accent). Defaults to True.
voice_profile_top_n- Number of top voice profile results per category.
vad_threshold- VAD sensitivity threshold.
min_end_of_turn_silence_when_confident- Minimum silence (ms) before end-of-turn when confidence is high. Defaults to 200.
end_of_turn_confidence_threshold- Confidence threshold for end-of-turn detection. Lower values trigger end-of-turn more eagerly. Defaults to 0.3.
base_url- Base URL for the Inworld API. Defaults to "https://api.inworld.ai/".
http_session- Optional aiohttp.ClientSession to use.
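A minimal construction sketch; every argument shown is optional and falls back to its documented default:

stt_engine = inworld.STT(
    model="inworld/inworld-stt-1",         # default model
    language="en-US",
    enable_voice_profile=True,             # age/gender/emotion/accent metadata
    end_of_turn_confidence_threshold=0.3,
)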
Ancestors
- livekit.agents.stt.stt.STT
- abc.ABC
- EventEmitter
- typing.Generic
Instance variables
prop model : str-
@property
def model(self) -> str:
    return self._opts.model

Get the model name/identifier for this STT instance.
Returns
The model name if available, "unknown" otherwise.
Note
Plugins should override this property to provide their model information.
prop provider : str-
@property
def provider(self) -> str:
    return "Inworld"

Get the provider name/identifier for this STT instance.
Returns
The provider name if available, "unknown" otherwise.
Note
Plugins should override this property to provide their provider information.
Methods
def stream(self,
*,
language: NotGivenOr[str] = NOT_GIVEN,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.inworld.stt.SpeechStream-
def stream(
    self,
    *,
    language: NotGivenOr[str] = NOT_GIVEN,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> SpeechStream:
    stream = SpeechStream(
        stt=self,
        conn_options=conn_options,
        language=language if utils.is_given(language) else self._opts.language,
    )
    self._streams.add(stream)
    return stream

def update_options(self,
*,
model: NotGivenOr[str] = NOT_GIVEN,
language: NotGivenOr[str] = NOT_GIVEN,
enable_voice_profile: NotGivenOr[bool] = NOT_GIVEN,
voice_profile_top_n: NotGivenOr[int] = NOT_GIVEN,
vad_threshold: NotGivenOr[float] = NOT_GIVEN,
min_end_of_turn_silence_when_confident: NotGivenOr[int] = NOT_GIVEN,
end_of_turn_confidence_threshold: NotGivenOr[float] = NOT_GIVEN) ‑> None-
def update_options(
    self,
    *,
    model: NotGivenOr[str] = NOT_GIVEN,
    language: NotGivenOr[str] = NOT_GIVEN,
    enable_voice_profile: NotGivenOr[bool] = NOT_GIVEN,
    voice_profile_top_n: NotGivenOr[int] = NOT_GIVEN,
    vad_threshold: NotGivenOr[float] = NOT_GIVEN,
    min_end_of_turn_silence_when_confident: NotGivenOr[int] = NOT_GIVEN,
    end_of_turn_confidence_threshold: NotGivenOr[float] = NOT_GIVEN,
) -> None:
    """Update STT options. Changes apply to new streams only.

    Args:
        model: STT model identifier (e.g. "inworld/inworld-stt-1",
            "assemblyai/universal-streaming-multilingual"). Any model string is accepted.
        language: Language code (e.g. "en-US").
        enable_voice_profile: Enable voice profiling.
        voice_profile_top_n: Number of top voice profile results.
        vad_threshold: VAD sensitivity threshold.
        min_end_of_turn_silence_when_confident: Min silence (ms) for end-of-turn.
        end_of_turn_confidence_threshold: Confidence threshold for end-of-turn.
    """
    if utils.is_given(model):
        self._opts.model = model
    if utils.is_given(language):
        self._opts.language = language
    if utils.is_given(enable_voice_profile):
        self._opts.enable_voice_profile = enable_voice_profile
    if utils.is_given(voice_profile_top_n):
        self._opts.voice_profile_top_n = voice_profile_top_n
    if utils.is_given(vad_threshold):
        self._opts.vad_threshold = vad_threshold
    if utils.is_given(min_end_of_turn_silence_when_confident):
        self._opts.min_end_of_turn_silence_when_confident = (
            min_end_of_turn_silence_when_confident
        )
    if utils.is_given(end_of_turn_confidence_threshold):
        self._opts.end_of_turn_confidence_threshold = end_of_turn_confidence_threshold

Update STT options. Changes apply to new streams only.
Args
model- STT model identifier (e.g. "inworld/inworld-stt-1", "assemblyai/universal-streaming-multilingual"). Any model string is accepted.
language- Language code (e.g. "en-US").
enable_voice_profile- Enable voice profiling.
voice_profile_top_n- Number of top voice profile results.
vad_threshold- VAD sensitivity threshold.
min_end_of_turn_silence_when_confident- Min silence (ms) for end-of-turn.
end_of_turn_confidence_threshold- Confidence threshold for end-of-turn.
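A short sketch of changing options at runtime; per the docstring, streams opened before the call keep their original options:

stt_engine.update_options(
    model="assemblyai/universal-streaming-multilingual",
    language="es-ES",
)
# only streams created after this call pick up the new settings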
Inherited members
class SpeechStream (*,
stt: STT,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0),
language: str = 'en-US')-
class SpeechStream(stt.SpeechStream):
    def __init__(
        self,
        *,
        stt: STT,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
        language: str = DEFAULT_LANGUAGE,
    ) -> None:
        super().__init__(stt=stt, conn_options=conn_options, sample_rate=stt._opts.sample_rate)
        self._stt: STT = stt
        self._language = language
        self._ws: aiohttp.ClientWebSocketResponse | None = None
        self._reconnect_event = asyncio.Event()
        self._speaking = False
        self._request_id = ""
        self._audio_duration_collector: PeriodicCollector[float] = PeriodicCollector(
            callback=self._on_audio_duration_report,
            duration=5.0,
        )

    def _ensure_session(self) -> aiohttp.ClientSession:
        if not self._stt._http_session:
            self._stt._http_session = utils.http_context.http_session()
        return self._stt._http_session

    def _build_transcribe_config(self) -> dict:
        opts = self._stt._opts
        config: dict = {
            "modelId": opts.model,
            "audioEncoding": "LINEAR16",
            "sampleRateHertz": opts.sample_rate,
            "numberOfChannels": opts.num_channels,
            "language": self._language,
        }
        if opts.enable_voice_profile:
            config["voiceProfileConfig"] = {
                "enableVoiceProfile": True,
                "topN": opts.voice_profile_top_n,
            }
        config["endOfTurnConfidenceThreshold"] = opts.end_of_turn_confidence_threshold
        inworld_v1_config: dict = {
            "minEndOfTurnSilenceWhenConfident": opts.min_end_of_turn_silence_when_confident,
        }
        if utils.is_given(opts.vad_threshold):
            inworld_v1_config["vadThreshold"] = opts.vad_threshold
        config["inworldSttV1Config"] = inworld_v1_config
        return config

    async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
        ws_url = self._stt._base_url.replace("https://", "wss://").replace("http://", "ws://")
        ws_url = ws_url.rstrip("/") + "/" + WS_ENDPOINT
        ws = await asyncio.wait_for(
            self._ensure_session().ws_connect(
                ws_url,
                headers={"Authorization": self._stt._authorization},
            ),
            timeout=self._conn_options.timeout,
        )
        self._request_id = utils.shortuuid()
        await ws.send_str(json.dumps({"transcribeConfig": self._build_transcribe_config()}))
        logger.debug("Inworld STT WebSocket connection established")
        return ws

    async def _run(self) -> None:
        while True:
            try:
                ws = await self._connect_ws()
                self._ws = ws
                tasks = [
                    asyncio.create_task(self._send_audio_task()),
                    asyncio.create_task(self._recv_messages_task()),
                ]
                wait_reconnect_task = asyncio.create_task(self._reconnect_event.wait())
                tasks_group: asyncio.Future[Any] = asyncio.gather(*tasks)
                try:
                    done, _ = await asyncio.wait(
                        [tasks_group, wait_reconnect_task],
                        return_when=asyncio.FIRST_COMPLETED,
                    )
                    for task in done:
                        if task != wait_reconnect_task:
                            task.result()
                    if wait_reconnect_task not in done:
                        break
                    self._reconnect_event.clear()
                finally:
                    await utils.aio.gracefully_cancel(*tasks, wait_reconnect_task)
                    tasks_group.cancel()
                    tasks_group.exception()
            except asyncio.TimeoutError as e:
                logger.error(f"Timeout during Inworld STT connection: {e}")
                raise APITimeoutError("Timeout connecting to Inworld STT API") from e
            except aiohttp.ClientResponseError as e:
                logger.error(f"Inworld STT status error: {e.status} {e.message}")
                raise APIStatusError(
                    message=e.message, status_code=e.status, request_id=None, body=None
                ) from e
            except aiohttp.ClientError as e:
                logger.error(f"Inworld STT connection error: {e}")
                raise APIConnectionError(f"Inworld STT connection error: {e}") from e
            except Exception as e:
                logger.exception(f"Unexpected error in Inworld STT: {e}")
                raise APIConnectionError(f"Unexpected error: {e}") from e
            finally:
                if self._ws is not None:
                    await self._ws.close()
                    self._ws = None

    async def _send_audio_task(self) -> None:
        if not self._ws:
            return
        async for data in self._input_ch:
            if isinstance(data, self._FlushSentinel):
                self._audio_duration_collector.flush()
                try:
                    await self._ws.send_str(json.dumps({"endTurn": {}}))
                except Exception as e:
                    logger.error(f"Error sending endTurn: {e}")
                    break
            elif isinstance(data, rtc.AudioFrame):
                self._audio_duration_collector.push(data.duration)
                pcm_bytes = data.data.tobytes()
                audio_b64 = base64.b64encode(pcm_bytes).decode()
                try:
                    await self._ws.send_str(json.dumps({"audioChunk": {"content": audio_b64}}))
                except Exception as e:
                    logger.error(f"Error sending audio chunk: {e}")
                    break
        self._audio_duration_collector.flush()
        # Input channel closed — tell the server to close the stream
        if self._ws:
            try:
                await self._ws.send_str(json.dumps({"closeStream": {}}))
            except Exception:
                pass

    def _on_audio_duration_report(self, duration: float) -> None:
        self._event_ch.send_nowait(
            stt.SpeechEvent(
                type=SpeechEventType.RECOGNITION_USAGE,
                request_id=self._request_id,
                alternatives=[],
                recognition_usage=stt.RecognitionUsage(audio_duration=duration),
            )
        )

    async def _recv_messages_task(self) -> None:
        if not self._ws:
            return
        try:
            async for msg in self._ws:
                if msg.type == aiohttp.WSMsgType.TEXT:
                    try:
                        data = json.loads(msg.data)
                    except json.JSONDecodeError:
                        continue
                    self._process_stream_event(data)
                elif msg.type in (
                    aiohttp.WSMsgType.CLOSED,
                    aiohttp.WSMsgType.CLOSE,
                    aiohttp.WSMsgType.CLOSING,
                ):
                    return
                elif msg.type == aiohttp.WSMsgType.ERROR:
                    logger.error(f"Inworld STT WebSocket error: {self._ws.exception()}")
                    return
        except aiohttp.ClientError as e:
            logger.error(f"WebSocket error while receiving: {e}")
            raise
        except Exception as e:
            logger.error(f"Unexpected error receiving messages: {e}")
            raise

    def _process_stream_event(self, data: dict) -> None:
        result = data.get("result", {})
        if "speechStarted" in result and not self._speaking:
            self._speaking = True
            self._event_ch.send_nowait(
                stt.SpeechEvent(
                    type=SpeechEventType.START_OF_SPEECH,
                    request_id=self._request_id,
                )
            )
            return
        t = result.get("transcription", {})
        if not t:
            return
        text = t.get("transcript", "")
        is_final = t.get("isFinal", False)
        voice_profile = t.get("voiceProfile")
        # An empty-text final (VAD false positive, unrecognizable noise) must still
        # reach the END_OF_SPEECH emission below — otherwise _speaking stays True
        # and subsequent speechStarted events are ignored, wedging the stream.
        if not text and not is_final:
            return
        if text:
            event_type = (
                SpeechEventType.FINAL_TRANSCRIPT
                if is_final
                else SpeechEventType.INTERIM_TRANSCRIPT
            )
            metadata = None
            if voice_profile:
                metadata = {"voice_profile": voice_profile}
                if is_final:
                    logger.info(f"Inworld voice profile: {voice_profile}")
            self._event_ch.send_nowait(
                stt.SpeechEvent(
                    type=event_type,
                    request_id=self._request_id,
                    alternatives=[
                        stt.SpeechData(
                            text=text,
                            language=LanguageCode(self._language),
                            metadata=metadata,
                        )
                    ],
                )
            )
        if is_final and self._speaking:
            self._speaking = False
            self._event_ch.send_nowait(
                stt.SpeechEvent(
                    type=SpeechEventType.END_OF_SPEECH,
                    request_id=self._request_id,
                )
            )
Args
sample_rate:int or None, optional- The desired sample rate for the audio input. If specified, the audio input will be automatically resampled to match the given sample rate before being processed for Speech-to-Text. If not provided (None), the input will retain its original sample rate.
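A sketch of driving a SpeechStream directly; AgentSession normally does this for you. Here `frame` stands in for an rtc.AudioFrame from your own capture source, and `stt` refers to `livekit.agents.stt`:

stream = stt_engine.stream()
stream.push_frame(frame)  # feed 16 kHz mono PCM audio
stream.end_input()
async for event in stream:
    if event.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
        print(event.alternatives[0].text)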
Ancestors
- livekit.agents.stt.stt.RecognizeStream
- abc.ABC
class SynthesizeStream (*,
tts: TTS,
conn_options: APIConnectOptions)-
class SynthesizeStream(tts.SynthesizeStream):
    def __init__(self, *, tts: TTS, conn_options: APIConnectOptions):
        super().__init__(tts=tts, conn_options=conn_options)
        self._tts: TTS = tts
        self._opts = replace(tts._opts)

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        request_id = utils.shortuuid()
        sent_tokenizer_stream = self._tts._sentence_tokenizer.stream()
        output_emitter.initialize(
            request_id=request_id,
            sample_rate=self._opts.sample_rate,
            num_channels=NUM_CHANNELS,
            mime_type=self._opts.mime_type,
            stream=True,
        )
        pool = await self._tts._get_pool()
        context_id, waiter, connection = await pool.acquire_context(
            emitter=output_emitter,
            opts=self._opts,
            timeout=self._conn_options.timeout,
        )

        async def _input_task() -> None:
            async for data in self._input_ch:
                if isinstance(data, self._FlushSentinel):
                    sent_tokenizer_stream.flush()
                    continue
                sent_tokenizer_stream.push_text(data)
            sent_tokenizer_stream.end_input()

        async def _send_task() -> None:
            async for ev in sent_tokenizer_stream:
                text = ev.token
                # Chunk to stay within Inworld's 1000 char limit
                for i in range(0, len(text), 1000):
                    connection.send_text(context_id, text[i : i + 1000])
                self._mark_started()
            connection.flush_context(context_id)
            connection.close_context(context_id)

        tasks = [
            asyncio.create_task(_input_task()),
            asyncio.create_task(_send_task()),
        ]
        try:
            await asyncio.wait_for(waiter, timeout=self._conn_options.timeout + 60)
        except asyncio.TimeoutError:
            connection.close_context(context_id)
            raise APITimeoutError() from None
        except asyncio.CancelledError:
            connection.close_context(context_id)
            raise
        except APIError:
            raise
        except Exception as e:
            logger.error("Inworld stream error", extra={"context_id": context_id, "error": e})
            connection.close_context(context_id)
            raise APIConnectionError() from e
        finally:
            await utils.aio.gracefully_cancel(*tasks)
            await sent_tokenizer_stream.aclose()
            output_emitter.end_input()
Ancestors
- livekit.agents.tts.tts.SynthesizeStream
- abc.ABC
class TTS (*,
api_key: NotGivenOr[str] = NOT_GIVEN,
voice: NotGivenOr[str] = NOT_GIVEN,
model: NotGivenOr[str] = NOT_GIVEN,
encoding: NotGivenOr[Encoding] = NOT_GIVEN,
bit_rate: NotGivenOr[int] = NOT_GIVEN,
sample_rate: NotGivenOr[int] = NOT_GIVEN,
speaking_rate: NotGivenOr[float] = NOT_GIVEN,
temperature: NotGivenOr[float] = NOT_GIVEN,
timestamp_type: NotGivenOr[TimestampType] = NOT_GIVEN,
text_normalization: NotGivenOr[TextNormalization] = NOT_GIVEN,
timestamp_transport_strategy: NotGivenOr[TimestampTransportStrategy] = NOT_GIVEN,
buffer_char_threshold: NotGivenOr[int] = NOT_GIVEN,
max_buffer_delay_ms: NotGivenOr[int] = NOT_GIVEN,
base_url: str = 'https://api.inworld.ai/',
ws_url: str = 'wss://api.inworld.ai/',
http_session: aiohttp.ClientSession | None = None,
tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
retain_format: NotGivenOr[bool] = NOT_GIVEN,
max_connections: int = 20,
idle_connection_timeout: float = 300.0)-
class TTS(tts.TTS):
    def __init__(
        self,
        *,
        api_key: NotGivenOr[str] = NOT_GIVEN,
        voice: NotGivenOr[str] = NOT_GIVEN,
        model: NotGivenOr[str] = NOT_GIVEN,
        encoding: NotGivenOr[Encoding] = NOT_GIVEN,
        bit_rate: NotGivenOr[int] = NOT_GIVEN,
        sample_rate: NotGivenOr[int] = NOT_GIVEN,
        speaking_rate: NotGivenOr[float] = NOT_GIVEN,
        temperature: NotGivenOr[float] = NOT_GIVEN,
        timestamp_type: NotGivenOr[TimestampType] = NOT_GIVEN,
        text_normalization: NotGivenOr[TextNormalization] = NOT_GIVEN,
        timestamp_transport_strategy: NotGivenOr[TimestampTransportStrategy] = NOT_GIVEN,
        buffer_char_threshold: NotGivenOr[int] = NOT_GIVEN,
        max_buffer_delay_ms: NotGivenOr[int] = NOT_GIVEN,
        base_url: str = DEFAULT_URL,
        ws_url: str = DEFAULT_WS_URL,
        http_session: aiohttp.ClientSession | None = None,
        tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
        retain_format: NotGivenOr[bool] = NOT_GIVEN,
        max_connections: int = DEFAULT_MAX_CONNECTIONS,
        idle_connection_timeout: float = DEFAULT_IDLE_CONNECTION_TIMEOUT,
    ) -> None:
        """Create a new instance of Inworld TTS.

        Args:
            api_key (str, optional): The Inworld API key. If not provided, it will be
                read from the INWORLD_API_KEY environment variable.
            voice (str, optional): The voice to use. Defaults to "Ashley".
            model (str, optional): The Inworld model to use. Defaults to "inworld-tts-1.5-max".
            encoding (str, optional): The encoding to use. Defaults to "PCM".
            bit_rate (int, optional): Bits per second of the audio. Defaults to 64000.
            sample_rate (int, optional): The audio sample rate in Hz. Defaults to 24000.
            speaking_rate (float, optional): The speed of the voice, in the range [0.5, 1.5].
                Defaults to 1.0.
            temperature (float, optional): Determines the degree of randomness when sampling
                audio tokens to generate the response. Range (0, 2]. Defaults to 1.0.
            timestamp_type (str, optional): Controls timestamp metadata returned with the
                audio. Use "WORD" for word-level timestamps or "CHARACTER" for
                character-level. Useful for karaoke-style captions, word highlighting,
                and lipsync.
            text_normalization (str, optional): Controls text normalization. When "ON",
                numbers, dates, and abbreviations are expanded (e.g., "Dr." -> "Doctor").
                When "OFF", text is read exactly as written. Defaults to automatic.
            timestamp_transport_strategy (str, optional): Controls how timestamp info is
                transported relative to audio data. "SYNC" returns timestamps in the same
                message as audio data. "ASYNC" allows timestamps to return in trailing
                messages after the audio data. Defaults to "ASYNC".
            buffer_char_threshold (int, optional): For streaming, the minimum number of
                characters in the buffer that automatically triggers audio generation.
                Defaults to 1000.
            max_buffer_delay_ms (int, optional): For streaming, the maximum time in ms to
                buffer before starting generation. Defaults to 3000.
            base_url (str, optional): The base URL for the Inworld TTS API.
                Defaults to "https://api.inworld.ai/".
            ws_url (str, optional): The WebSocket URL for streaming TTS.
                Defaults to "wss://api.inworld.ai/".
            http_session (aiohttp.ClientSession, optional): The HTTP session to use.
            tokenizer (tokenize.SentenceTokenizer, optional): The tokenizer to use for
                streaming. Defaults to `livekit.agents.tokenize.blingfire.SentenceTokenizer`.
            retain_format (bool, optional): Whether to retain the format of the text when
                tokenizing. Defaults to True.
            max_connections (int, optional): Maximum number of concurrent WebSocket
                connections. Each connection supports up to 5 concurrent synthesis
                streams. Defaults to 20.
            idle_connection_timeout (float, optional): Time in seconds after which idle
                connections are closed. Defaults to 300 (5 minutes).
        """
        if not is_given(sample_rate):
            sample_rate = DEFAULT_SAMPLE_RATE
        super().__init__(
            capabilities=tts.TTSCapabilities(
                streaming=True,
                aligned_transcript=is_given(timestamp_type)
                and timestamp_type != "TIMESTAMP_TYPE_UNSPECIFIED",
            ),
            sample_rate=sample_rate,
            num_channels=NUM_CHANNELS,
        )
        key = api_key if is_given(api_key) else os.getenv("INWORLD_API_KEY")
        if not key:
            raise ValueError(
                "Inworld API key is required, either as argument or set"
                " INWORLD_API_KEY environment variable"
            )
        self._authorization = f"Basic {key}"
        self._base_url = base_url
        self._ws_url = ws_url
        self._session = http_session
        if is_given(encoding):
            _validate_str_param(encoding, "encoding", Encoding)
        if is_given(timestamp_type):
            _validate_str_param(timestamp_type, "timestamp_type", TimestampType)
        if is_given(text_normalization):
            text_normalization = _resolve_text_normalization(text_normalization)
        if is_given(timestamp_transport_strategy):
            _validate_str_param(
                timestamp_transport_strategy,
                "timestamp_transport_strategy",
                TimestampTransportStrategy,
            )
        self._opts = _TTSOptions(
            voice=voice if is_given(voice) else DEFAULT_VOICE,
            model=model if is_given(model) else DEFAULT_MODEL,
            encoding=encoding if is_given(encoding) else DEFAULT_ENCODING,
            bit_rate=bit_rate if is_given(bit_rate) else DEFAULT_BIT_RATE,
            sample_rate=sample_rate if is_given(sample_rate) else DEFAULT_SAMPLE_RATE,
            speaking_rate=speaking_rate if is_given(speaking_rate) else DEFAULT_SPEAKING_RATE,
            temperature=temperature if is_given(temperature) else DEFAULT_TEMPERATURE,
            timestamp_type=timestamp_type,
            text_normalization=text_normalization,
            timestamp_transport_strategy=timestamp_transport_strategy
            if is_given(timestamp_transport_strategy)
            else DEFAULT_TIMESTAMP_TRANSPORT_STRATEGY,
            buffer_char_threshold=buffer_char_threshold
            if is_given(buffer_char_threshold)
            else DEFAULT_BUFFER_CHAR_THRESHOLD,
            max_buffer_delay_ms=max_buffer_delay_ms
            if is_given(max_buffer_delay_ms)
            else DEFAULT_MAX_BUFFER_DELAY_MS,
        )
        self._max_connections = max_connections
        self._idle_connection_timeout = idle_connection_timeout
        self._pool: _ConnectionPool | None = None
        self._pool_lock = asyncio.Lock()
        self._streams = weakref.WeakSet[SynthesizeStream]()
        self._sentence_tokenizer = (
            tokenizer
            if is_given(tokenizer)
            else tokenize.blingfire.SentenceTokenizer(
                retain_format=retain_format if is_given(retain_format) else True
            )
        )

    @property
    def model(self) -> str:
        return self._opts.model

    @property
    def provider(self) -> str:
        return "Inworld"

    async def _get_pool(self) -> _ConnectionPool:
        """Get the connection pool, creating if needed."""
        async with self._pool_lock:
            if self._pool is None or self._pool._closed:
                self._pool = _ConnectionPool(
                    session=self._ensure_session(),
                    ws_url=self._ws_url,
                    authorization=self._authorization,
                    max_connections=self._max_connections,
                    idle_timeout=self._idle_connection_timeout,
                )
            return self._pool

    def update_options(
        self,
        *,
        voice: NotGivenOr[str] = NOT_GIVEN,
        model: NotGivenOr[str] = NOT_GIVEN,
        encoding: NotGivenOr[Encoding] = NOT_GIVEN,
        bit_rate: NotGivenOr[int] = NOT_GIVEN,
        sample_rate: NotGivenOr[int] = NOT_GIVEN,
        speaking_rate: NotGivenOr[float] = NOT_GIVEN,
        temperature: NotGivenOr[float] = NOT_GIVEN,
        timestamp_type: NotGivenOr[TimestampType] = NOT_GIVEN,
        text_normalization: NotGivenOr[TextNormalization] = NOT_GIVEN,
        timestamp_transport_strategy: NotGivenOr[TimestampTransportStrategy] = NOT_GIVEN,
        buffer_char_threshold: NotGivenOr[int] = NOT_GIVEN,
        max_buffer_delay_ms: NotGivenOr[int] = NOT_GIVEN,
    ) -> None:
        """Update the TTS configuration options.

        Args:
            voice (str, optional): The voice to use.
            model (str, optional): The Inworld model to use.
            encoding (str, optional): The encoding to use.
            bit_rate (int, optional): Bits per second of the audio.
            sample_rate (int, optional): The audio sample rate in Hz.
            speaking_rate (float, optional): The speed of the voice.
            temperature (float, optional): Determines the degree of randomness when
                sampling audio tokens to generate the response.
            timestamp_type (str, optional): Controls timestamp metadata ("WORD" or "CHARACTER").
            text_normalization (str, optional): Controls text normalization ("ON" or "OFF").
            timestamp_transport_strategy (str, optional): Controls timestamp transport
                strategy ("SYNC" or "ASYNC").
            buffer_char_threshold (int, optional): For streaming, min characters before triggering.
            max_buffer_delay_ms (int, optional): For streaming, max time to buffer.
        """
        if is_given(voice):
            self._opts.voice = voice
        if is_given(model):
            self._opts.model = model
        if is_given(encoding):
            _validate_str_param(encoding, "encoding", Encoding)
            self._opts.encoding = encoding
        if is_given(bit_rate):
            self._opts.bit_rate = bit_rate
        if is_given(sample_rate):
            self._opts.sample_rate = sample_rate
        if is_given(speaking_rate):
            self._opts.speaking_rate = speaking_rate
        if is_given(temperature):
            self._opts.temperature = temperature
        if is_given(timestamp_type):
            _validate_str_param(timestamp_type, "timestamp_type", TimestampType)
            self._opts.timestamp_type = timestamp_type
        if is_given(text_normalization):
            self._opts.text_normalization = _resolve_text_normalization(text_normalization)
        if is_given(timestamp_transport_strategy):
            _validate_str_param(
                timestamp_transport_strategy,
                "timestamp_transport_strategy",
                TimestampTransportStrategy,
            )
            self._opts.timestamp_transport_strategy = timestamp_transport_strategy
        if is_given(buffer_char_threshold):
            self._opts.buffer_char_threshold = buffer_char_threshold
        if is_given(max_buffer_delay_ms):
            self._opts.max_buffer_delay_ms = max_buffer_delay_ms

    def _ensure_session(self) -> aiohttp.ClientSession:
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    def prewarm(self) -> None:
        asyncio.create_task(self._prewarm_impl())

    async def _prewarm_impl(self) -> None:
        # Just ensure the pool is created - first acquire will establish a connection
        await self._get_pool()

    def synthesize(
        self,
        text: str,
        *,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> tts.ChunkedStream:
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

    def stream(
        self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> SynthesizeStream:
        stream = SynthesizeStream(tts=self, conn_options=conn_options)
        self._streams.add(stream)
        return stream

    async def aclose(self) -> None:
        for stream in list(self._streams):
            await stream.aclose()
        self._streams.clear()
        if self._pool:
            await self._pool.aclose()
            self._pool = None

    async def list_voices(self, language: str | None = None) -> list[dict[str, Any]]:
        """List all available voices in the workspace associated with the API key.

        Args:
            language (str, optional): ISO 639-1 language code to filter voices
                (e.g., 'en', 'es', 'fr').
        """
        url = urljoin(self._base_url, "tts/v1/voices")
        params = {}
        if language:
            params["filter"] = f"language={language}"
        async with self._ensure_session().get(
            url,
            headers={
                "Authorization": self._authorization,
                "X-User-Agent": USER_AGENT,
                "X-Request-Id": str(uuid.uuid4()),
            },
            params=params,
        ) as resp:
            if not resp.ok:
                error_body = await resp.json()
                raise APIStatusError(
                    message=error_body.get("message"),
                    status_code=resp.status,
                    request_id=None,
                    body=None,
                )
            data = await resp.json()
            return cast(list[dict[str, Any]], data.get("voices", []))
Create a new instance of Inworld TTS.
Args
api_key:str, optional- The Inworld API key. If not provided, it will be read from the INWORLD_API_KEY environment variable.
voice:str, optional- The voice to use. Defaults to "Ashley".
model:str, optional- The Inworld model to use. Defaults to "inworld-tts-1.5-max".
encoding:str, optional- The encoding to use. Defaults to "PCM".
bit_rate:int, optional- Bits per second of the audio. Defaults to 64000.
sample_rate:int, optional- The audio sample rate in Hz. Defaults to 24000.
speaking_rate:float, optional- The speed of the voice, in the range [0.5, 1.5]. Defaults to 1.0.
temperature:float, optional- Determines the degree of randomness when sampling audio tokens to generate the response. Range (0, 2]. Defaults to 1.0.
timestamp_type:str, optional- Controls timestamp metadata returned with the audio. Use "WORD" for word-level timestamps or "CHARACTER" for character-level. Useful for karaoke-style captions, word highlighting, and lipsync.
text_normalization:str, optional- Controls text normalization. When "ON", numbers, dates, and abbreviations are expanded (e.g., "Dr." -> "Doctor"). When "OFF", text is read exactly as written. Defaults to automatic.
timestamp_transport_strategy:str, optional- Controls how timestamp info is transported relative to audio data. "SYNC" returns timestamps in the same message as audio data. "ASYNC" allows timestamps to return in trailing messages after the audio data. Defaults to "ASYNC".
buffer_char_threshold:int, optional- For streaming, the minimum number of characters in the buffer that automatically triggers audio generation. Defaults to 1000.
max_buffer_delay_ms:int, optional- For streaming, the maximum time in ms to buffer before starting generation. Defaults to 3000.
base_url:str, optional- The base URL for the Inworld TTS API. Defaults to "https://api.inworld.ai/".
ws_url:str, optional- The WebSocket URL for streaming TTS. Defaults to "wss://api.inworld.ai/".
http_session:aiohttp.ClientSession, optional- The HTTP session to use.
tokenizer:tokenize.SentenceTokenizer, optional- The tokenizer to use for streaming. Defaults to livekit.agents.tokenize.blingfire.SentenceTokenizer.
retain_format:bool, optional- Whether to retain the format of the text when tokenizing. Defaults to True.
max_connections:int, optional- Maximum number of concurrent WebSocket connections. Each connection supports up to 5 concurrent synthesis streams. Defaults to 20.
idle_connection_timeout:float, optional- Time in seconds after which idle connections are closed. Defaults to 300 (5 minutes).
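A minimal construction sketch using only documented arguments and defaults:

tts_engine = inworld.TTS(
    voice="Ashley",
    model="inworld-tts-1.5-max",
    sample_rate=24000,
    timestamp_type="WORD",  # word-level timestamps for captions/lipsync
    speaking_rate=1.0,      # valid range [0.5, 1.5]
)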
Ancestors
- livekit.agents.tts.tts.TTS
- abc.ABC
- EventEmitter
- typing.Generic
Instance variables
prop model : str-
@property
def model(self) -> str:
    return self._opts.model

Get the model name/identifier for this TTS instance.
Returns
The model name if available, "unknown" otherwise.
Note
Plugins should override this property to provide their model information.
prop provider : str-
@property
def provider(self) -> str:
    return "Inworld"

Get the provider name/identifier for this TTS instance.
Returns
The provider name if available, "unknown" otherwise.
Note
Plugins should override this property to provide their provider information.
Methods
async def aclose(self) ‑> None-
async def aclose(self) -> None:
    for stream in list(self._streams):
        await stream.aclose()
    self._streams.clear()
    if self._pool:
        await self._pool.aclose()
        self._pool = None

async def list_voices(self, language: str | None = None) ‑> list[dict[str, typing.Any]]-
async def list_voices(self, language: str | None = None) -> list[dict[str, Any]]:
    """List all available voices in the workspace associated with the API key.

    Args:
        language (str, optional): ISO 639-1 language code to filter voices
            (e.g., 'en', 'es', 'fr').
    """
    url = urljoin(self._base_url, "tts/v1/voices")
    params = {}
    if language:
        params["filter"] = f"language={language}"
    async with self._ensure_session().get(
        url,
        headers={
            "Authorization": self._authorization,
            "X-User-Agent": USER_AGENT,
            "X-Request-Id": str(uuid.uuid4()),
        },
        params=params,
    ) as resp:
        if not resp.ok:
            error_body = await resp.json()
            raise APIStatusError(
                message=error_body.get("message"),
                status_code=resp.status,
                request_id=None,
                body=None,
            )
        data = await resp.json()
        return cast(list[dict[str, Any]], data.get("voices", []))

List all available voices in the workspace associated with the API key.
Args
language:str, optional- ISO 639-1 language code to filter voices (e.g., 'en', 'es', 'fr').
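A short usage sketch; each entry is a raw API dict, and the exact keys are not documented on this page, so inspect an entry before relying on specific fields:

voices = await tts_engine.list_voices(language="en")
for v in voices:
    print(v)  # inspect the returned voice dict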
def prewarm(self) ‑> None-
def prewarm(self) -> None:
    asyncio.create_task(self._prewarm_impl())

Pre-warm the connection to the TTS service.
def stream(self,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.inworld.tts.SynthesizeStream-
def stream(
    self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
) -> SynthesizeStream:
    stream = SynthesizeStream(tts=self, conn_options=conn_options)
    self._streams.add(stream)
    return stream

def synthesize(self,
text: str,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.agents.tts.tts.ChunkedStream-
def synthesize(
    self,
    text: str,
    *,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> tts.ChunkedStream:
    return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

def update_options(self,
*,
voice: NotGivenOr[str] = NOT_GIVEN,
model: NotGivenOr[str] = NOT_GIVEN,
encoding: NotGivenOr[Encoding] = NOT_GIVEN,
bit_rate: NotGivenOr[int] = NOT_GIVEN,
sample_rate: NotGivenOr[int] = NOT_GIVEN,
speaking_rate: NotGivenOr[float] = NOT_GIVEN,
temperature: NotGivenOr[float] = NOT_GIVEN,
timestamp_type: NotGivenOr[TimestampType] = NOT_GIVEN,
text_normalization: NotGivenOr[TextNormalization] = NOT_GIVEN,
timestamp_transport_strategy: NotGivenOr[TimestampTransportStrategy] = NOT_GIVEN,
buffer_char_threshold: NotGivenOr[int] = NOT_GIVEN,
max_buffer_delay_ms: NotGivenOr[int] = NOT_GIVEN) ‑> None-
def update_options(
    self,
    *,
    voice: NotGivenOr[str] = NOT_GIVEN,
    model: NotGivenOr[str] = NOT_GIVEN,
    encoding: NotGivenOr[Encoding] = NOT_GIVEN,
    bit_rate: NotGivenOr[int] = NOT_GIVEN,
    sample_rate: NotGivenOr[int] = NOT_GIVEN,
    speaking_rate: NotGivenOr[float] = NOT_GIVEN,
    temperature: NotGivenOr[float] = NOT_GIVEN,
    timestamp_type: NotGivenOr[TimestampType] = NOT_GIVEN,
    text_normalization: NotGivenOr[TextNormalization] = NOT_GIVEN,
    timestamp_transport_strategy: NotGivenOr[TimestampTransportStrategy] = NOT_GIVEN,
    buffer_char_threshold: NotGivenOr[int] = NOT_GIVEN,
    max_buffer_delay_ms: NotGivenOr[int] = NOT_GIVEN,
) -> None:
    """Update the TTS configuration options.

    Args:
        voice (str, optional): The voice to use.
        model (str, optional): The Inworld model to use.
        encoding (str, optional): The encoding to use.
        bit_rate (int, optional): Bits per second of the audio.
        sample_rate (int, optional): The audio sample rate in Hz.
        speaking_rate (float, optional): The speed of the voice.
        temperature (float, optional): Determines the degree of randomness when
            sampling audio tokens to generate the response.
        timestamp_type (str, optional): Controls timestamp metadata ("WORD" or "CHARACTER").
        text_normalization (str, optional): Controls text normalization ("ON" or "OFF").
        timestamp_transport_strategy (str, optional): Controls timestamp transport
            strategy ("SYNC" or "ASYNC").
        buffer_char_threshold (int, optional): For streaming, min characters before triggering.
        max_buffer_delay_ms (int, optional): For streaming, max time to buffer.
    """
    if is_given(voice):
        self._opts.voice = voice
    if is_given(model):
        self._opts.model = model
    if is_given(encoding):
        _validate_str_param(encoding, "encoding", Encoding)
        self._opts.encoding = encoding
    if is_given(bit_rate):
        self._opts.bit_rate = bit_rate
    if is_given(sample_rate):
        self._opts.sample_rate = sample_rate
    if is_given(speaking_rate):
        self._opts.speaking_rate = speaking_rate
    if is_given(temperature):
        self._opts.temperature = temperature
    if is_given(timestamp_type):
        _validate_str_param(timestamp_type, "timestamp_type", TimestampType)
        self._opts.timestamp_type = timestamp_type
    if is_given(text_normalization):
        self._opts.text_normalization = _resolve_text_normalization(text_normalization)
    if is_given(timestamp_transport_strategy):
        _validate_str_param(
            timestamp_transport_strategy,
            "timestamp_transport_strategy",
            TimestampTransportStrategy,
        )
        self._opts.timestamp_transport_strategy = timestamp_transport_strategy
    if is_given(buffer_char_threshold):
        self._opts.buffer_char_threshold = buffer_char_threshold
    if is_given(max_buffer_delay_ms):
        self._opts.max_buffer_delay_ms = max_buffer_delay_ms

Update the TTS configuration options.
Args
voice:str, optional- The voice to use.
model:str, optional- The Inworld model to use.
encoding:str, optional- The encoding to use.
bit_rate:int, optional- Bits per second of the audio.
sample_rate:int, optional- The audio sample rate in Hz.
speaking_rate:float, optional- The speed of the voice.
temperature:float, optional- Determines the degree of randomness when sampling audio tokens to generate the response.
timestamp_type:str, optional- Controls timestamp metadata ("WORD" or "CHARACTER").
text_normalization:str, optional- Controls text normalization ("ON" or "OFF").
timestamp_transport_strategy:str, optional- Controls timestamp transport strategy ("SYNC" or "ASYNC").
buffer_char_threshold:int, optional- For streaming, min characters before triggering.
max_buffer_delay_ms:int, optional- For streaming, max time to buffer.
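A short sketch of adjusting options on an existing instance, using only documented arguments and their stated ranges:

tts_engine.update_options(
    voice="Ashley",
    speaking_rate=0.9,  # must stay within [0.5, 1.5]
    temperature=0.8,    # valid range (0, 2]
)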
Inherited members