Module livekit.plugins.minimax

Minimax plugin for LiveKit Agents

See [Plugin Docs URL - when available] for more information.

Classes

class TTS (*,
model: TTSModel | str = 'speech-02-turbo',
voice: TTSVoice | str = 'English_radiant_girl',
emotion: TTSEmotion | None = None,
speed: float = 1.0,
vol: float = 1.0,
pitch: int = 0,
english_normalization: bool = False,
audio_format: TTSAudioFormat = 'mp3',
pronunciation_dict: dict[str, list[str]] | None = None,
intensity: int | None = None,
timbre: int | None = None,
sample_rate: TTSSampleRate = 24000,
bitrate: TTSBitRate = 128000,
tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
text_pacing: tts.SentenceStreamPacer | bool = False,
api_key: str | None = None,
base_url: NotGivenOr[str] = NOT_GIVEN,
http_session: aiohttp.ClientSession | None = None)
Expand source code
class TTS(tts.TTS):
    """Minimax text-to-speech plugin for LiveKit Agents.

    Stores the synthesis options, resolves the HTTP session on demand, and
    creates ``ChunkedStream`` / ``SynthesizeStream`` objects for callers.
    """

    def __init__(
        self,
        *,
        model: TTSModel | str = DEFAULT_MODEL,
        voice: TTSVoice | str = DEFAULT_VOICE_ID,
        emotion: TTSEmotion | None = None,
        speed: float = 1.0,
        vol: float = 1.0,
        pitch: int = 0,
        english_normalization: bool = False,
        audio_format: TTSAudioFormat = "mp3",
        pronunciation_dict: dict[str, list[str]] | None = None,
        intensity: int | None = None,
        timbre: int | None = None,
        sample_rate: TTSSampleRate = 24000,
        bitrate: TTSBitRate = 128000,
        tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
        text_pacing: tts.SentenceStreamPacer | bool = False,
        api_key: str | None = None,
        base_url: NotGivenOr[str] = NOT_GIVEN,
        http_session: aiohttp.ClientSession | None = None,
    ):
        """Minimax TTS plugin

        Args:
            model (TTSModel | str, optional): The Minimax TTS model to use. Defaults to DEFAULT_MODEL.
            voice (TTSVoice | str, optional): The voice to use. Defaults to DEFAULT_VOICE_ID.
            emotion (TTSEmotion | None, optional): Emotion control for speech synthesis. Defaults to None.
            speed (float, optional): Speech speed, higher values speak faster. Range is [0.5, 2.0].
            vol (float, optional): Speech volume, range is [0, 10].
            pitch (int, optional): Speech pitch adjustment, range is [-12, 12].
            english_normalization (bool, optional): Enable text normalization in English. Improves performance
                in digit-reading scenarios at the cost of slightly higher latency. Defaults to False.
            audio_format (TTSAudioFormat, optional): The audio format to use. Defaults to "mp3".
            pronunciation_dict (dict[str, list[str]] | None, optional): Defines pronunciation rules for specific characters or symbols.
            intensity (int | None, optional): Corresponds to the "Strong/Softer" slider on the official page. Range: [-100, 100].
            timbre (int | None, optional): Corresponds to the "Nasal/Crisp" slider on the official page. Range: [-100, 100].
            sample_rate (TTSSampleRate, optional): The audio sample rate in Hz. Defaults to 24000.
            bitrate (TTSBitRate, optional): The audio bitrate in bps (the default 128000 is 128 kbps). Defaults to 128000.
            tokenizer (NotGivenOr[tokenize.SentenceTokenizer], optional): The sentence tokenizer to use. Defaults to NOT_GIVEN.
            text_pacing (tts.SentenceStreamPacer | bool, optional): Enable text pacing for sentence-level timing control. Defaults to False.
            api_key (str | None, optional): The Minimax API key. Falls back to the MINIMAX_API_KEY environment variable. Defaults to None.
            base_url (NotGivenOr[str], optional): The base URL for the Minimax API. Falls back to the MINIMAX_BASE_URL
                environment variable, then DEFAULT_BASE_URL. Defaults to NOT_GIVEN.
            http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.

        Raises:
            ValueError: If no API key is available, if ``speed`` is outside [0.5, 2.0],
                or if ``intensity`` / ``timbre`` is outside [-100, 100].
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=True, aligned_transcript=False),
            sample_rate=sample_rate,
            num_channels=1,
        )

        base_url = (
            base_url
            if utils.is_given(base_url)
            else os.environ.get("MINIMAX_BASE_URL", DEFAULT_BASE_URL)
        )

        minimax_api_key = api_key or os.environ.get("MINIMAX_API_KEY")
        if not minimax_api_key:
            raise ValueError("MINIMAX_API_KEY must be set")

        # NOTE: vol and pitch ranges are documented above but are not checked here.
        self._validate_speed(speed)
        self._validate_slider("intensity", intensity)
        self._validate_slider("timbre", timbre)

        self._sentence_tokenizer = (
            tokenizer if utils.is_given(tokenizer) else tokenize.basic.SentenceTokenizer()
        )

        # text_pacing=True builds a default pacer; an explicit pacer instance is used
        # as-is; any other value leaves pacing disabled.
        self._stream_pacer: tts.SentenceStreamPacer | None = None
        if text_pacing is True:
            self._stream_pacer = tts.SentenceStreamPacer()
        elif isinstance(text_pacing, tts.SentenceStreamPacer):
            self._stream_pacer = text_pacing

        self._opts = _TTSOptions(
            model=model,
            voice_id=voice,
            api_key=minimax_api_key,
            base_url=base_url,
            sample_rate=sample_rate,
            emotion=emotion,
            bitrate=bitrate,
            speed=speed,
            pitch=pitch,
            vol=vol,
            english_normalization=english_normalization,
            timbre=timbre,
            pronunciation_dict=pronunciation_dict,
            intensity=intensity,
            audio_format=audio_format,
        )

        self._session = http_session
        # Weak references: streams are tracked for aclose() without being kept alive.
        self._streams = weakref.WeakSet[SynthesizeStream]()

    @staticmethod
    def _validate_speed(speed: float) -> None:
        """Raise ValueError unless ``speed`` lies within [0.5, 2.0]."""
        if not (0.5 <= speed <= 2.0):
            raise ValueError(f"speed must be between 0.5 and 2.0, but got {speed}")

    @staticmethod
    def _validate_slider(name: str, value: int | None) -> None:
        """Raise ValueError unless ``value`` is None or lies within [-100, 100].

        ``name`` is the option name used in the error message.
        """
        if value is not None and not (-100 <= value <= 100):
            raise ValueError(f"{name} must be between -100 and 100, but got {value}")

    def update_options(
        self,
        *,
        model: NotGivenOr[TTSModel | str] = NOT_GIVEN,
        voice: NotGivenOr[TTSVoice | str] = NOT_GIVEN,
        emotion: NotGivenOr[TTSEmotion | None] = NOT_GIVEN,
        speed: NotGivenOr[float] = NOT_GIVEN,
        vol: NotGivenOr[float] = NOT_GIVEN,
        pitch: NotGivenOr[int] = NOT_GIVEN,
        english_normalization: NotGivenOr[bool] = NOT_GIVEN,
        audio_format: NotGivenOr[TTSAudioFormat] = NOT_GIVEN,
        pronunciation_dict: NotGivenOr[dict[str, list[str]]] = NOT_GIVEN,
        intensity: NotGivenOr[int] = NOT_GIVEN,
        timbre: NotGivenOr[int] = NOT_GIVEN,
    ) -> None:
        """Update the TTS configuration options.

        Only arguments that are explicitly given are applied; every other option
        keeps its current value.

        Raises:
            ValueError: If ``speed`` is outside [0.5, 2.0], or ``intensity`` /
                ``timbre`` is outside [-100, 100] (the same limits as the constructor).
        """
        # Validate everything first so a rejected call leaves the options untouched.
        if utils.is_given(speed):
            self._validate_speed(speed)
        if utils.is_given(intensity):
            self._validate_slider("intensity", intensity)
        if utils.is_given(timbre):
            self._validate_slider("timbre", timbre)

        if utils.is_given(model):
            self._opts.model = model

        if utils.is_given(voice):
            self._opts.voice_id = voice

        if utils.is_given(emotion):
            self._opts.emotion = cast(Optional[TTSEmotion], emotion)

        if utils.is_given(speed):
            self._opts.speed = speed

        if utils.is_given(vol):
            self._opts.vol = vol

        if utils.is_given(pitch):
            self._opts.pitch = pitch

        if utils.is_given(english_normalization):
            self._opts.english_normalization = english_normalization

        if utils.is_given(audio_format):
            self._opts.audio_format = cast(TTSAudioFormat, audio_format)

        if utils.is_given(pronunciation_dict):
            self._opts.pronunciation_dict = pronunciation_dict

        if utils.is_given(intensity):
            self._opts.intensity = intensity

        if utils.is_given(timbre):
            self._opts.timbre = timbre

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the stored session, fetching one from ``utils.http_context`` if unset."""
        if not self._session:
            self._session = utils.http_context.http_session()

        return self._session

    async def _connect_ws(self, timeout: float) -> aiohttp.ClientWebSocketResponse:
        """Open a WebSocket to the ``/ws/v1/t2a_v2`` endpoint under the base URL.

        Args:
            timeout: Maximum number of seconds to wait for the connection.
        """
        url = self._opts.base_url
        if url.startswith("http"):
            # Replacing only the leading "http" maps http -> ws and https -> wss.
            url = url.replace("http", "ws", 1)
        url = f"{url}/ws/v1/t2a_v2"

        headers = {"Authorization": f"Bearer {self._opts.api_key}"}
        session = self._ensure_session()
        ws = await asyncio.wait_for(session.ws_connect(url, headers=headers), timeout)
        return ws

    async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None:
        """Close the given WebSocket connection."""
        await ws.close()

    def synthesize(
        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> ChunkedStream:
        """Return a ``ChunkedStream`` that synthesizes ``text`` with this TTS instance."""
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

    def stream(
        self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> SynthesizeStream:
        """Create a ``SynthesizeStream`` and register it so ``aclose`` can close it."""
        stream = SynthesizeStream(tts=self, conn_options=conn_options)
        self._streams.add(stream)
        return stream

    async def aclose(self) -> None:
        """Close every tracked stream, then clear the tracking set."""
        # Iterate over a snapshot: the weak set may change while streams are closing.
        for stream in list(self._streams):
            await stream.aclose()

        self._streams.clear()

Helper class that provides a standard way to create an ABC using inheritance.

Minimax TTS plugin

Args

model : TTSModel | str, optional
The Minimax TTS model to use. Defaults to DEFAULT_MODEL.
voice : TTSVoice | str, optional
The voice to use. Defaults to DEFAULT_VOICE_ID.
emotion : TTSEmotion | None, optional
Emotion control for speech synthesis. Defaults to None.
speed : float, optional
Speech speed, higher values speak faster. Range is [0.5, 2.0].
vol : float, optional
Speech volume, range is [0, 10].
pitch : int, optional
Speech pitch adjustment, range is [-12, 12].
english_normalization : bool, optional
Enable text normalization in English. Improves performance in digit-reading scenarios at the cost of slightly higher latency. Defaults to False.
audio_format : TTSAudioFormat, optional
The audio format to use. Defaults to "mp3".
pronunciation_dict : dict[str, list[str]] | None, optional
Defines pronunciation rules for specific characters or symbols.
intensity : int | None, optional
Corresponds to the "Strong/Softer" slider on the official page. Range: [-100, 100].
timbre : int | None, optional
Corresponds to the "Nasal/Crisp" slider on the official page. Range: [-100, 100].
sample_rate : TTSSampleRate, optional
The audio sample rate in Hz. Defaults to 24000.
bitrate : TTSBitRate, optional
The audio bitrate in bps (the default 128000 is 128 kbps). Defaults to 128000.
tokenizer : NotGivenOr[tokenize.SentenceTokenizer], optional
The sentence tokenizer to use. Defaults to NOT_GIVEN.
text_pacing : tts.SentenceStreamPacer | bool, optional
Enable text pacing for sentence-level timing control. Defaults to False.
api_key : str | None, optional
The Minimax API key. Defaults to None.
base_url : NotGivenOr[str], optional
The base URL for the Minimax API. Defaults to NOT_GIVEN.
http_session : aiohttp.ClientSession | None, optional
An existing aiohttp ClientSession to use. If not provided, a new session will be created.

Ancestors

  • livekit.agents.tts.tts.TTS
  • abc.ABC
  • EventEmitter
  • typing.Generic

Methods

async def aclose(self) ‑> None
Expand source code
async def aclose(self) -> None:
    """Close every tracked stream one after another, then empty the tracking set."""
    # Snapshot first so the collection is not iterated while it may be changing.
    tracked = tuple(self._streams)
    for active_stream in tracked:
        await active_stream.aclose()
    self._streams.clear()
def stream(self,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.minimax.tts.SynthesizeStream
Expand source code
def stream(
    self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
) -> SynthesizeStream:
    """Create a ``SynthesizeStream`` bound to this TTS and add it to the tracked streams."""
    new_stream = SynthesizeStream(conn_options=conn_options, tts=self)
    # Tracking the stream here is what lets aclose() find and close it later.
    self._streams.add(new_stream)
    return new_stream
def synthesize(self,
text: str,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.minimax.tts.ChunkedStream
Expand source code
def synthesize(
    self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
) -> ChunkedStream:
    """Return a ``ChunkedStream`` that synthesizes ``text`` using this TTS instance."""
    chunked = ChunkedStream(conn_options=conn_options, input_text=text, tts=self)
    return chunked
def update_options(self,
*,
model: NotGivenOr[TTSModel | str] = NOT_GIVEN,
voice: NotGivenOr[TTSVoice | str] = NOT_GIVEN,
emotion: NotGivenOr[TTSEmotion | None] = NOT_GIVEN,
speed: NotGivenOr[float] = NOT_GIVEN,
vol: NotGivenOr[float] = NOT_GIVEN,
pitch: NotGivenOr[int] = NOT_GIVEN,
english_normalization: NotGivenOr[bool] = NOT_GIVEN,
audio_format: NotGivenOr[TTSAudioFormat] = NOT_GIVEN,
pronunciation_dict: NotGivenOr[dict[str, list[str]]] = NOT_GIVEN,
intensity: NotGivenOr[int] = NOT_GIVEN,
timbre: NotGivenOr[int] = NOT_GIVEN) ‑> None
Expand source code
def update_options(
    self,
    *,
    model: NotGivenOr[TTSModel | str] = NOT_GIVEN,
    voice: NotGivenOr[TTSVoice | str] = NOT_GIVEN,
    emotion: NotGivenOr[TTSEmotion | None] = NOT_GIVEN,
    speed: NotGivenOr[float] = NOT_GIVEN,
    vol: NotGivenOr[float] = NOT_GIVEN,
    pitch: NotGivenOr[int] = NOT_GIVEN,
    english_normalization: NotGivenOr[bool] = NOT_GIVEN,
    audio_format: NotGivenOr[TTSAudioFormat] = NOT_GIVEN,
    pronunciation_dict: NotGivenOr[dict[str, list[str]]] = NOT_GIVEN,
    intensity: NotGivenOr[int] = NOT_GIVEN,
    timbre: NotGivenOr[int] = NOT_GIVEN,
) -> None:
    """Update the TTS configuration options.

    Only arguments that are explicitly given are applied; every other option
    keeps its current value.

    Raises:
        ValueError: If ``speed`` is outside [0.5, 2.0], or ``intensity`` /
            ``timbre`` is outside [-100, 100] (the same limits as the constructor).
    """
    # Validate everything first so a rejected call leaves the options untouched.
    if utils.is_given(speed) and not (0.5 <= speed <= 2.0):
        raise ValueError(f"speed must be between 0.5 and 2.0, but got {speed}")
    if utils.is_given(intensity) and intensity is not None and not (-100 <= intensity <= 100):
        raise ValueError(f"intensity must be between -100 and 100, but got {intensity}")
    if utils.is_given(timbre) and timbre is not None and not (-100 <= timbre <= 100):
        raise ValueError(f"timbre must be between -100 and 100, but got {timbre}")

    if utils.is_given(model):
        self._opts.model = model

    if utils.is_given(voice):
        self._opts.voice_id = voice

    if utils.is_given(emotion):
        self._opts.emotion = cast(Optional[TTSEmotion], emotion)

    if utils.is_given(speed):
        self._opts.speed = speed

    if utils.is_given(vol):
        self._opts.vol = vol

    if utils.is_given(pitch):
        self._opts.pitch = pitch

    if utils.is_given(english_normalization):
        self._opts.english_normalization = english_normalization

    if utils.is_given(audio_format):
        self._opts.audio_format = cast(TTSAudioFormat, audio_format)

    if utils.is_given(pronunciation_dict):
        self._opts.pronunciation_dict = pronunciation_dict

    if utils.is_given(intensity):
        self._opts.intensity = intensity

    if utils.is_given(timbre):
        self._opts.timbre = timbre

Update the TTS configuration options.

Inherited members