Module `livekit.plugins.minimax`

Minimax plugin for LiveKit Agents

See the LiveKit Agents documentation for more information (a dedicated plugin docs URL is not yet available).

Classes

class TTS (*, model: TTSModel | str = 'speech-02-turbo', voice: TTSVoice | str = 'socialmedia_female_2_v1', emotion: TTSEmotion | None = None, speed: float = 1.0, vol: float = 1.0, pitch: int = 0, text_normalization: bool = False, audio_format: TTSAudioFormat = 'mp3', pronunciation_dict: dict[str, list[str]] | None = None, intensity: int | None = None, timbre: int | None = None, sample_rate: TTSSampleRate = 24000, bitrate: TTSBitRate = 128000, tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN, text_pacing: tts.SentenceStreamPacer | bool = False, api_key: str | None = None, base_url: NotGivenOr[str] = NOT_GIVEN, http_session: aiohttp.ClientSession | None = None)

Expand source code

class TTS(tts.TTS):
    """MiniMax text-to-speech client exposing chunked and streaming synthesis."""

    def __init__(
        self,
        *,
        model: TTSModel | str = DEFAULT_MODEL,
        voice: TTSVoice | str = DEFAULT_VOICE_ID,
        emotion: TTSEmotion | None = None,
        speed: float = 1.0,
        vol: float = 1.0,
        pitch: int = 0,
        text_normalization: bool = False,
        audio_format: TTSAudioFormat = "mp3",
        pronunciation_dict: dict[str, list[str]] | None = None,
        intensity: int | None = None,
        timbre: int | None = None,
        sample_rate: TTSSampleRate = 24000,
        bitrate: TTSBitRate = 128000,
        tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
        text_pacing: tts.SentenceStreamPacer | bool = False,
        api_key: str | None = None,
        base_url: NotGivenOr[str] = NOT_GIVEN,
        http_session: aiohttp.ClientSession | None = None,
    ):
        """Minimax TTS plugin

        Args:
            model (TTSModel | str, optional): The Minimax TTS model to use. Defaults to DEFAULT_MODEL.
                Available models: speech-2.6-hd, speech-2.6-turbo, speech-2.5-hd-preview,
                speech-2.5-turbo-preview, speech-02-hd, speech-02-turbo, speech-01-hd, speech-01-turbo.
            voice (TTSVoice | str, optional): The voice to use. Defaults to DEFAULT_VOICE_ID.
            emotion (TTSEmotion | None, optional): Emotion control for speech synthesis.
                Options: "happy", "sad", "angry", "fearful", "disgusted", "surprised", "neutral", "fluent".
                Note: "fluent" emotion is only supported by speech-2.6-* models. Defaults to None.
            speed (float, optional): Speech speed, higher values speak faster. Range is [0.5, 2.0].
            vol (float, optional): Speech volume, range is [0, 10].
            pitch (int, optional): Speech pitch adjustment, range is [-12, 12].
            text_normalization (bool, optional): Enable text normalization (Chinese/English). Improves performance
                in digit-reading scenarios at the cost of slightly higher latency. Defaults to False.
            audio_format (TTSAudioFormat, optional): The audio format to use. Defaults to "mp3".
            pronunciation_dict (dict[str, list[str]] | None, optional): Defines pronunciation rules for specific characters or symbols.
            intensity (int | None, optional): Corresponds to the "Strong/Softer" slider on the official page. Range [-100, 100].
            timbre (int | None, optional): Corresponds to the "Nasal/Crisp" slider on the official page. Range: [-100, 100].
            sample_rate (TTSSampleRate, optional): The audio sample rate in Hz. Defaults to 24000.
            bitrate (TTSBitRate, optional): The audio bitrate in bits per second (bps). Defaults to 128000.
            tokenizer (NotGivenOr[tokenize.SentenceTokenizer], optional): The sentence tokenizer to use. Defaults to `livekit.agents.tokenize.basic.SentenceTokenizer`.
            text_pacing (tts.SentenceStreamPacer | bool, optional): Pass True to use a default
                `tts.SentenceStreamPacer`, or pass a pacer instance to use it directly. Defaults to False (no pacing).
            api_key (str | None, optional): The Minimax API key. When omitted, the MINIMAX_API_KEY
                environment variable is used.
            base_url (NotGivenOr[str], optional): The base URL for the Minimax API. When not given, the
                MINIMAX_BASE_URL environment variable is used, falling back to DEFAULT_BASE_URL.
            http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use.
                If not provided, one is obtained from `utils.http_context.http_session()` on first use.

        Raises:
            ValueError: If no API key is available, if `speed` is outside [0.5, 2.0], if `intensity`
                or `timbre` is outside [-100, 100], or if the "fluent" emotion is requested with a
                model that is not speech-2.6-*.
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=True, aligned_transcript=False),
            sample_rate=sample_rate,
            num_channels=1,
        )

        # Explicit argument wins, then the environment, then the built-in default.
        base_url = (
            base_url
            if utils.is_given(base_url)
            else os.environ.get("MINIMAX_BASE_URL", DEFAULT_BASE_URL)
        )

        minimax_api_key = api_key or os.environ.get("MINIMAX_API_KEY")
        if not minimax_api_key:
            raise ValueError("MINIMAX_API_KEY must be set")

        if not (0.5 <= speed <= 2.0):
            raise ValueError(f"speed must be between 0.5 and 2.0, but got {speed}")
        if intensity is not None and not (-100 <= intensity <= 100):
            raise ValueError(f"intensity must be between -100 and 100, but got {intensity}")
        if timbre is not None and not (-100 <= timbre <= 100):
            raise ValueError(f"timbre must be between -100 and 100, but got {timbre}")

        # Validate fluent emotion is only used with speech-2.6-* models
        if emotion == "fluent" and not model.startswith("speech-2.6"):
            raise ValueError(
                '"fluent" emotion is only supported by speech-2.6-* models, '
                f'but got model "{model}". Please use speech-2.6-hd or speech-2.6-turbo.'
            )

        self._sentence_tokenizer = (
            tokenizer if utils.is_given(tokenizer) else tokenize.basic.SentenceTokenizer()
        )

        # `True` means "use a default pacer"; a pacer instance is used as-is;
        # anything else leaves pacing disabled.
        self._stream_pacer: tts.SentenceStreamPacer | None = None
        if text_pacing is True:
            self._stream_pacer = tts.SentenceStreamPacer()
        elif isinstance(text_pacing, tts.SentenceStreamPacer):
            self._stream_pacer = text_pacing

        self._opts = _TTSOptions(
            model=model,
            voice_id=voice,
            api_key=minimax_api_key,
            base_url=base_url,
            sample_rate=sample_rate,
            emotion=emotion,
            bitrate=bitrate,
            speed=speed,
            pitch=pitch,
            vol=vol,
            text_normalization=text_normalization,
            timbre=timbre,
            pronunciation_dict=pronunciation_dict,
            intensity=intensity,
            audio_format=audio_format,
        )

        self._session = http_session
        # Weak references: tracking a stream here must not keep it alive.
        self._streams = weakref.WeakSet[SynthesizeStream]()

    @property
    def model(self) -> str:
        """Return the model identifier held in this instance's options."""
        options = self._opts
        return options.model

    @property
    def provider(self) -> str:
        """Return the provider name, which is always ``"MiniMax"``."""
        provider_name = "MiniMax"
        return provider_name

    def update_options(
        self,
        *,
        model: NotGivenOr[TTSModel | str] = NOT_GIVEN,
        voice: NotGivenOr[TTSVoice | str] = NOT_GIVEN,
        emotion: NotGivenOr[TTSEmotion | None] = NOT_GIVEN,
        speed: NotGivenOr[float] = NOT_GIVEN,
        vol: NotGivenOr[float] = NOT_GIVEN,
        pitch: NotGivenOr[int] = NOT_GIVEN,
        text_normalization: NotGivenOr[bool] = NOT_GIVEN,
        audio_format: NotGivenOr[TTSAudioFormat] = NOT_GIVEN,
        pronunciation_dict: NotGivenOr[dict[str, list[str]]] = NOT_GIVEN,
        intensity: NotGivenOr[int] = NOT_GIVEN,
        timbre: NotGivenOr[int] = NOT_GIVEN,
    ) -> None:
        """Update the TTS configuration options.

        Only arguments that are explicitly given are applied; omitted ones keep
        their current value. New values are checked against the same rules the
        constructor enforces before anything is stored, so a rejected call
        leaves the options unchanged.

        Raises:
            ValueError: If ``speed`` is outside [0.5, 2.0], if ``intensity`` or
                ``timbre`` is outside [-100, 100], or if the resulting
                model/emotion pair would use "fluent" with a model that is not
                speech-2.6-*.
        """
        # Validate everything up front so a failure never leaves a partial update.
        if utils.is_given(speed) and not (0.5 <= speed <= 2.0):
            raise ValueError(f"speed must be between 0.5 and 2.0, but got {speed}")
        # `None` is tolerated for intensity/timbre because the options store them as optional.
        if (
            utils.is_given(intensity)
            and intensity is not None
            and not (-100 <= intensity <= 100)
        ):
            raise ValueError(f"intensity must be between -100 and 100, but got {intensity}")
        if utils.is_given(timbre) and timbre is not None and not (-100 <= timbre <= 100):
            raise ValueError(f"timbre must be between -100 and 100, but got {timbre}")

        # The "fluent" restriction applies to the model/emotion pair that will be
        # in effect after this call, so combine new values with the stored ones.
        if utils.is_given(model) or utils.is_given(emotion):
            next_model = model if utils.is_given(model) else self._opts.model
            next_emotion = emotion if utils.is_given(emotion) else self._opts.emotion
            if next_emotion == "fluent" and not next_model.startswith("speech-2.6"):
                raise ValueError(
                    '"fluent" emotion is only supported by speech-2.6-* models, '
                    f'but got model "{next_model}". Please use speech-2.6-hd or speech-2.6-turbo.'
                )

        if utils.is_given(model):
            self._opts.model = model

        if utils.is_given(voice):
            self._opts.voice_id = voice

        if utils.is_given(emotion):
            self._opts.emotion = cast(Optional[TTSEmotion], emotion)

        if utils.is_given(speed):
            self._opts.speed = speed

        if utils.is_given(vol):
            self._opts.vol = vol

        if utils.is_given(pitch):
            self._opts.pitch = pitch

        if utils.is_given(text_normalization):
            self._opts.text_normalization = text_normalization

        if utils.is_given(audio_format):
            self._opts.audio_format = cast(TTSAudioFormat, audio_format)

        if utils.is_given(pronunciation_dict):
            self._opts.pronunciation_dict = pronunciation_dict

        if utils.is_given(intensity):
            self._opts.intensity = intensity

        if utils.is_given(timbre):
            self._opts.timbre = timbre

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the HTTP session, fetching one from the shared HTTP context if none is set."""
        if not self._session:
            self._session = utils.http_context.http_session()

        return self._session

    async def _connect_ws(self, timeout: float) -> aiohttp.ClientWebSocketResponse:
        """Open a WebSocket connection to the ``/ws/v1/t2a_v2`` endpoint.

        Args:
            timeout: Maximum number of seconds to wait for the connection.

        Returns:
            The connected WebSocket response.

        Raises:
            asyncio.TimeoutError: If the connection is not established in time.
        """
        # Drop trailing slashes so the joined path never contains "//".
        url = self._opts.base_url.rstrip("/")
        if url.startswith("http"):
            # Replacing only the first "http" maps http -> ws and https -> wss.
            url = url.replace("http", "ws", 1)
        url = f"{url}/ws/v1/t2a_v2"

        headers = {"Authorization": f"Bearer {self._opts.api_key}"}
        session = self._ensure_session()
        ws = await asyncio.wait_for(session.ws_connect(url, headers=headers), timeout)

        # Log WebSocket connection establishment (lazy formatting: no work when DEBUG is off)
        logger.debug("MiniMax WebSocket connected to %s", url)

        return ws

    async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None:
        """Close the given WebSocket connection."""
        await ws.close()

    def synthesize(
        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> ChunkedStream:
        """Create a ``ChunkedStream`` for ``text`` bound to this TTS instance.

        Args:
            text: The input text handed to the chunked stream.
            conn_options: Connection options forwarded to the stream.
        """
        chunked = ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
        return chunked

    def stream(
        self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> SynthesizeStream:
        """Create a ``SynthesizeStream`` and register it so ``aclose`` can close it.

        Args:
            conn_options: Connection options forwarded to the stream.
        """
        new_stream = SynthesizeStream(tts=self, conn_options=conn_options)
        self._streams.add(new_stream)
        return new_stream

    async def aclose(self) -> None:
        """Close every tracked stream one after another, then forget them all."""
        # Iterate over a snapshot rather than the live collection.
        open_streams = tuple(self._streams)
        for active in open_streams:
            await active.aclose()

        self._streams.clear()

Helper class that provides a standard way to create an ABC using inheritance.

Minimax TTS plugin

Args

model : TTSModel | str, optional: The Minimax TTS model to use. Defaults to DEFAULT_MODEL. Available models: speech-2.6-hd, speech-2.6-turbo, speech-2.5-hd-preview, speech-2.5-turbo-preview, speech-02-hd, speech-02-turbo, speech-01-hd, speech-01-turbo.
voice : TTSVoice | str, optional: The voice to use. Defaults to DEFAULT_VOICE_ID.
emotion : TTSEmotion | None, optional: Emotion control for speech synthesis. Options: "happy", "sad", "angry", "fearful", "disgusted", "surprised", "neutral", "fluent". Note: "fluent" emotion is only supported by speech-2.6-* models. Defaults to None.
speed : float, optional: Speech speed, higher values speak faster. Range is [0.5, 2.0].
vol : float, optional: Speech volume, range is [0, 10].
pitch : int, optional: Speech pitch adjustment, range is [-12, 12].
text_normalization : bool, optional: Enable text normalization (Chinese/English). Improves performance in digit-reading scenarios at the cost of slightly higher latency. Defaults to False.
audio_format : TTSAudioFormat, optional: The audio format to use. Defaults to "mp3".
pronunciation_dict : dict[str, list[str]] | None, optional: Defines pronunciation rules for specific characters or symbols.
intensity : int | None, optional: Corresponds to the "Strong/Softer" slider on the official page. Range [-100, 100].
timbre : int | None, optional: Corresponds to the "Nasal/Crisp" slider on the official page. Range: [-100, 100].
sample_rate : TTSSampleRate, optional: The audio sample rate in Hz. Defaults to 24000.
bitrate : TTSBitRate, optional: The audio bitrate in bits per second (bps). Defaults to 128000.
tokenizer : NotGivenOr[tokenize.SentenceTokenizer], optional: The sentence tokenizer to use. Defaults to SentenceTokenizer.
text_pacing : tts.SentenceStreamPacer | bool, optional: Enable text pacing for sentence-level timing control. Defaults to False.
api_key : str | None, optional: The Minimax API key. Defaults to None.
base_url : NotGivenOr[str], optional: The base URL for the Minimax API. Defaults to NOT_GIVEN.
http_session : aiohttp.ClientSession | None, optional: An existing aiohttp ClientSession to use. If not provided, a new session will be created.

Ancestors

livekit.agents.tts.tts.TTS
abc.ABC
EventEmitter
typing.Generic

Instance variables

prop model : str

Expand source code

@property
def model(self) -> str:
    """Return the model identifier held in this instance's options."""
    options = self._opts
    return options.model

Get the model name/identifier for this TTS instance.

Returns

The model name if available, "unknown" otherwise.

Note

Plugins should override this property to provide their model information.

prop provider : str

Expand source code

@property
def provider(self) -> str:
    """Return the provider name, which is always ``"MiniMax"``."""
    provider_name = "MiniMax"
    return provider_name

Get the provider name/identifier for this TTS instance.

Returns

The provider name if available, "unknown" otherwise.

Note

Plugins should override this property to provide their provider information.

Methods

async def aclose(self) ‑> None

Expand source code

async def aclose(self) -> None:
    """Close every tracked stream one after another, then forget them all."""
    # Iterate over a snapshot rather than the live collection.
    open_streams = tuple(self._streams)
    for active in open_streams:
        await active.aclose()

    self._streams.clear()

def stream(self, *, conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.minimax.tts.SynthesizeStream

Expand source code

def stream(
    self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
) -> SynthesizeStream:
    """Create a ``SynthesizeStream`` and add it to this instance's tracked streams.

    Args:
        conn_options: Connection options forwarded to the stream.
    """
    new_stream = SynthesizeStream(tts=self, conn_options=conn_options)
    self._streams.add(new_stream)
    return new_stream

def synthesize(self, text: str, *, conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.minimax.tts.ChunkedStream

Expand source code

def synthesize(
    self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
) -> ChunkedStream:
    """Create a ``ChunkedStream`` for ``text`` bound to this TTS instance.

    Args:
        text: The input text handed to the chunked stream.
        conn_options: Connection options forwarded to the stream.
    """
    chunked = ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
    return chunked

def update_options(self, *, model: NotGivenOr[TTSModel | str] = NOT_GIVEN, voice: NotGivenOr[TTSVoice | str] = NOT_GIVEN, emotion: NotGivenOr[TTSEmotion | None] = NOT_GIVEN, speed: NotGivenOr[float] = NOT_GIVEN, vol: NotGivenOr[float] = NOT_GIVEN, pitch: NotGivenOr[int] = NOT_GIVEN, text_normalization: NotGivenOr[bool] = NOT_GIVEN, audio_format: NotGivenOr[TTSAudioFormat] = NOT_GIVEN, pronunciation_dict: NotGivenOr[dict[str, list[str]]] = NOT_GIVEN, intensity: NotGivenOr[int] = NOT_GIVEN, timbre: NotGivenOr[int] = NOT_GIVEN) ‑> None

Expand source code

def update_options(
    self,
    *,
    model: NotGivenOr[TTSModel | str] = NOT_GIVEN,
    voice: NotGivenOr[TTSVoice | str] = NOT_GIVEN,
    emotion: NotGivenOr[TTSEmotion | None] = NOT_GIVEN,
    speed: NotGivenOr[float] = NOT_GIVEN,
    vol: NotGivenOr[float] = NOT_GIVEN,
    pitch: NotGivenOr[int] = NOT_GIVEN,
    text_normalization: NotGivenOr[bool] = NOT_GIVEN,
    audio_format: NotGivenOr[TTSAudioFormat] = NOT_GIVEN,
    pronunciation_dict: NotGivenOr[dict[str, list[str]]] = NOT_GIVEN,
    intensity: NotGivenOr[int] = NOT_GIVEN,
    timbre: NotGivenOr[int] = NOT_GIVEN,
) -> None:
    """Update the TTS configuration options.

    Only arguments that are explicitly given are applied; omitted ones keep
    their current value. New values are checked against the same rules the
    constructor enforces before anything is stored, so a rejected call
    leaves the options unchanged.

    Raises:
        ValueError: If ``speed`` is outside [0.5, 2.0], if ``intensity`` or
            ``timbre`` is outside [-100, 100], or if the resulting
            model/emotion pair would use "fluent" with a model that is not
            speech-2.6-*.
    """
    # Validate everything up front so a failure never leaves a partial update.
    if utils.is_given(speed) and not (0.5 <= speed <= 2.0):
        raise ValueError(f"speed must be between 0.5 and 2.0, but got {speed}")
    # `None` is tolerated for intensity/timbre because the options store them as optional.
    if (
        utils.is_given(intensity)
        and intensity is not None
        and not (-100 <= intensity <= 100)
    ):
        raise ValueError(f"intensity must be between -100 and 100, but got {intensity}")
    if utils.is_given(timbre) and timbre is not None and not (-100 <= timbre <= 100):
        raise ValueError(f"timbre must be between -100 and 100, but got {timbre}")

    # The "fluent" restriction applies to the model/emotion pair that will be
    # in effect after this call, so combine new values with the stored ones.
    if utils.is_given(model) or utils.is_given(emotion):
        next_model = model if utils.is_given(model) else self._opts.model
        next_emotion = emotion if utils.is_given(emotion) else self._opts.emotion
        if next_emotion == "fluent" and not next_model.startswith("speech-2.6"):
            raise ValueError(
                '"fluent" emotion is only supported by speech-2.6-* models, '
                f'but got model "{next_model}". Please use speech-2.6-hd or speech-2.6-turbo.'
            )

    if utils.is_given(model):
        self._opts.model = model

    if utils.is_given(voice):
        self._opts.voice_id = voice

    if utils.is_given(emotion):
        self._opts.emotion = cast(Optional[TTSEmotion], emotion)

    if utils.is_given(speed):
        self._opts.speed = speed

    if utils.is_given(vol):
        self._opts.vol = vol

    if utils.is_given(pitch):
        self._opts.pitch = pitch

    if utils.is_given(text_normalization):
        self._opts.text_normalization = text_normalization

    if utils.is_given(audio_format):
        self._opts.audio_format = cast(TTSAudioFormat, audio_format)

    if utils.is_given(pronunciation_dict):
        self._opts.pronunciation_dict = pronunciation_dict

    if utils.is_given(intensity):
        self._opts.intensity = intensity

    if utils.is_given(timbre):
        self._opts.timbre = timbre

Update the TTS configuration options.

Inherited members

EventEmitter:
- emit
- off
- on
- once