Module livekit.plugins.lmnt

LMNT plugin for LiveKit Agents

See https://docs.livekit.io/agents/integrations/tts/lmnt/ for more information.

Classes

class ChunkedStream (*,
tts: TTS,
input_text: str,
conn_options: APIConnectOptions)
Expand source code
class ChunkedStream(tts.ChunkedStream):
    """Synthesize text to speech in chunks.

    Posts the full input text to the LMNT speech-bytes endpoint and streams
    the returned audio chunks into the provided ``AudioEmitter``.
    """

    def __init__(
        self,
        *,
        tts: TTS,
        input_text: str,
        conn_options: APIConnectOptions,
    ) -> None:
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._tts = tts
        # Snapshot the parent's options so a later update_options() call on
        # the TTS instance cannot mutate an in-flight request.
        self._opts = replace(tts._opts)

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        # Request payload for LMNT's synthesize-speech-bytes endpoint.
        payload = {
            "text": self._input_text,
            "voice": self._opts.voice,
            "language": self._opts.language,
            "sample_rate": self._opts.sample_rate,
            "model": self._opts.model,
            "format": self._opts.format,
            "temperature": self._opts.temperature,
            "top_p": self._opts.top_p,
        }

        try:
            async with self._tts._ensure_session().post(
                LMNT_BASE_URL,
                headers={
                    "Content-Type": "application/json",
                    "X-API-Key": self._opts.api_key,
                },
                json=payload,
                # NOTE(review): total is hard-coded to 30s while only the
                # connect phase honors conn_options.timeout — presumably
                # intentional for long syntheses; confirm against callers.
                timeout=aiohttp.ClientTimeout(
                    total=30,
                    sock_connect=self._conn_options.timeout,
                ),
            ) as resp:
                resp.raise_for_status()
                output_emitter.initialize(
                    request_id=utils.shortuuid(),
                    sample_rate=self._opts.sample_rate,
                    num_channels=NUM_CHANNELS,
                    mime_type=MIME_TYPE[self._opts.format],
                )
                # Fix: the loop variable previously reused the name `data`,
                # shadowing the request payload dict above.
                async for chunk, _ in resp.content.iter_chunks():
                    output_emitter.push(chunk)

                output_emitter.flush()
        except asyncio.TimeoutError:
            raise APITimeoutError() from None
        except aiohttp.ClientResponseError as e:
            raise APIStatusError(
                message=e.message,
                status_code=e.status,
                request_id=None,
                body=None,
            ) from None
        except Exception as e:
            raise APIConnectionError() from e

Synthesize text to speech in chunks.

Ancestors

  • livekit.agents.tts.tts.ChunkedStream
  • abc.ABC
class TTS (*,
model: LMNTModels = 'blizzard',
voice: str = 'leah',
language: LMNTLanguages | None = None,
format: LMNTAudioFormats = 'mp3',
sample_rate: LMNTSampleRate = 24000,
api_key: str | None = None,
http_session: aiohttp.ClientSession | None = None,
temperature: float = 1.0,
top_p: float = 0.8)
Expand source code
class TTS(tts.TTS):
    """Text-to-Speech (TTS) plugin for LMNT."""

    def __init__(
        self,
        *,
        model: LMNTModels = "blizzard",
        voice: str = "leah",
        language: LMNTLanguages | None = None,
        format: LMNTAudioFormats = "mp3",
        sample_rate: LMNTSampleRate = 24000,
        api_key: str | None = None,
        http_session: aiohttp.ClientSession | None = None,
        temperature: float = 1.0,
        top_p: float = 0.8,
    ) -> None:
        """
        Create a new instance of LMNT TTS.

        See: https://docs.lmnt.com/api-reference/speech/synthesize-speech-bytes

        Args:
            model: Synthesis model, "blizzard" by default.
                Learn more at: https://docs.lmnt.com/guides/models
            voice: Voice ID, "leah" by default. Browse voices at https://app.lmnt.com/
            language: Two-letter ISO 639-1 code; when omitted, "auto" is used
                for the "blizzard" model and "en" otherwise.
                See: https://docs.lmnt.com/api-reference/speech/synthesize-speech-bytes#body-language
            format: Output audio format (aac, mp3, mulaw, raw, wav). Default "mp3".
            sample_rate: Output sample rate in Hz, 24000 by default.
                See: https://docs.lmnt.com/api-reference/speech/synthesize-speech-bytes#body-sample-rate
            api_key: LMNT API key; falls back to the LMNT_API_KEY environment variable.
            http_session: Optional aiohttp ClientSession; one is created lazily if absent.
            temperature: Expressiveness control — lower values (e.g. 0.3) give
                neutral, consistent delivery; higher values (e.g. 1.0) allow a
                wider emotional range. Default 1.0.
            top_p: Stability control — lower values (e.g. 0.3) give more
                consistent speech; higher values (e.g. 0.9) allow more varied
                phrasing at the cost of occasional odd intonation. Default 0.8.
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=False),
            sample_rate=sample_rate,
            num_channels=NUM_CHANNELS,
        )
        key = api_key or os.environ.get("LMNT_API_KEY")
        if not key:
            raise ValueError(
                "LMNT API key is required. "
                "Set it via environment variable or pass it as an argument."
            )

        # Resolve the language default per model when none was supplied.
        lang = language
        if not lang:
            lang = "auto" if model == "blizzard" else "en"

        self._opts = _TTSOptions(
            model=model,
            sample_rate=sample_rate,
            num_channels=NUM_CHANNELS,
            language=lang,
            voice=voice,
            format=format,
            api_key=key,
            temperature=temperature,
            top_p=top_p,
        )
        self._session = http_session

    def synthesize(
        self,
        text: str,
        *,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> ChunkedStream:
        """Return a chunked synthesis stream for *text*."""
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

    def update_options(
        self,
        *,
        model: NotGivenOr[LMNTModels] = NOT_GIVEN,
        voice: NotGivenOr[str] = NOT_GIVEN,
        language: NotGivenOr[LMNTLanguages] = NOT_GIVEN,
        format: NotGivenOr[LMNTAudioFormats] = NOT_GIVEN,
        sample_rate: NotGivenOr[LMNTSampleRate] = NOT_GIVEN,
        temperature: NotGivenOr[float] = NOT_GIVEN,
        top_p: NotGivenOr[float] = NOT_GIVEN,
    ) -> None:
        """
        Update the TTS options.

        Only arguments that were explicitly given are applied; everything
        else keeps its current value.

        Args:
            model: Synthesis model. Learn more at: https://docs.lmnt.com/guides/models
            voice: Voice ID to switch to.
            language: Two-letter ISO 639-1 code.
                See: https://docs.lmnt.com/api-reference/speech/synthesize-speech-bytes#body-language
            format: Audio output format (aac, mp3, mulaw, raw, wav).
            sample_rate: Output sample rate in Hz.
            temperature: Expressiveness of the speech, between 0.0 and 1.0.
            top_p: Stability of the generated speech, between 0.0 and 1.0.
        """
        candidates = {
            "model": model,
            "voice": voice,
            "language": language,
            "format": format,
            "sample_rate": sample_rate,
            "temperature": temperature,
            "top_p": top_p,
        }
        for field_name, value in candidates.items():
            if is_given(value):
                setattr(self._opts, field_name, value)

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the configured session, creating a shared one on first use."""
        if not self._session:
            self._session = utils.http_context.http_session()

        return self._session

Text-to-Speech (TTS) plugin for LMNT.

Create a new instance of LMNT TTS.

See: https://docs.lmnt.com/api-reference/speech/synthesize-speech-bytes

Args

model
The model to use for synthesis. Default is "blizzard". Learn more at: https://docs.lmnt.com/guides/models
voice
The voice ID to use. Default is "leah". Find more amazing voices at https://app.lmnt.com/
language
Two-letter ISO 639-1 language code. Defaults to None. See: https://docs.lmnt.com/api-reference/speech/synthesize-speech-bytes#body-language
format
Output file format. Options: aac, mp3, mulaw, raw, wav. Default is "mp3".
sample_rate
Output sample rate in Hz. Default is 24000. See: https://docs.lmnt.com/api-reference/speech/synthesize-speech-bytes#body-sample-rate
api_key
API key for authentication. Defaults to the LMNT_API_KEY environment variable.
http_session
Optional aiohttp ClientSession. A new session is created if not provided.
temperature
Influences how expressive and emotionally varied the speech becomes. Lower values (like 0.3) create more neutral, consistent speaking styles. Higher values (like 1.0) allow for more dynamic emotional range and speaking styles. Default is 1.0.
top_p
Controls the stability of the generated speech. A lower value (like 0.3) produces more consistent, reliable speech. A higher value (like 0.9) gives more flexibility in how words are spoken, but might occasionally produce unusual intonations or speech patterns. Default is 0.8.

Ancestors

  • livekit.agents.tts.tts.TTS
  • abc.ABC
  • EventEmitter
  • typing.Generic

Methods

def synthesize(self,
text: str,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.lmnt.tts.ChunkedStream
Expand source code
def synthesize(
    self,
    text: str,
    *,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> ChunkedStream:
    """Create and return a chunked synthesis stream for *text*."""
    stream = ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
    return stream
def update_options(self,
*,
model: NotGivenOr[LMNTModels] = NOT_GIVEN,
voice: NotGivenOr[str] = NOT_GIVEN,
language: NotGivenOr[LMNTLanguages] = NOT_GIVEN,
format: NotGivenOr[LMNTAudioFormats] = NOT_GIVEN,
sample_rate: NotGivenOr[LMNTSampleRate] = NOT_GIVEN,
temperature: NotGivenOr[float] = NOT_GIVEN,
top_p: NotGivenOr[float] = NOT_GIVEN) ‑> None
Expand source code
def update_options(
    self,
    *,
    model: NotGivenOr[LMNTModels] = NOT_GIVEN,
    voice: NotGivenOr[str] = NOT_GIVEN,
    language: NotGivenOr[LMNTLanguages] = NOT_GIVEN,
    format: NotGivenOr[LMNTAudioFormats] = NOT_GIVEN,
    sample_rate: NotGivenOr[LMNTSampleRate] = NOT_GIVEN,
    temperature: NotGivenOr[float] = NOT_GIVEN,
    top_p: NotGivenOr[float] = NOT_GIVEN,
) -> None:
    """
    Update the TTS options.

    Only explicitly supplied arguments are applied; the rest are untouched.

    Args:
        model: The model to use for synthesis. Learn more at: https://docs.lmnt.com/guides/models
        voice: The voice ID to update.
        language: Two-letter ISO 639-1 code.
            See: https://docs.lmnt.com/api-reference/speech/synthesize-speech-bytes#body-language
        format: Audio output format. Options: aac, mp3, mulaw, raw, wav.
        sample_rate: Output sample rate in Hz.
        temperature: Controls the expressiveness of the speech. A number between 0.0 and 1.0.
        top_p: Controls the stability of the generated speech. A number between 0.0 and 1.0.
    """
    candidates = {
        "model": model,
        "voice": voice,
        "language": language,
        "format": format,
        "sample_rate": sample_rate,
        "temperature": temperature,
        "top_p": top_p,
    }
    for field_name, value in candidates.items():
        if is_given(value):
            setattr(self._opts, field_name, value)

Update the TTS options.

Args

model
The model to use for synthesis. Learn more at: https://docs.lmnt.com/guides/models
voice
The voice ID to update.
language
Two-letter ISO 639-1 code. See: https://docs.lmnt.com/api-reference/speech/synthesize-speech-bytes#body-language
format
Audio output format. Options: aac, mp3, mulaw, raw, wav.
sample_rate
Output sample rate in Hz.
temperature
Controls the expressiveness of the speech. A number between 0.0 and 1.0.
top_p
Controls the stability of the generated speech. A number between 0.0 and 1.0.

Inherited members