Module livekit.plugins.minimax
Minimax plugin for LiveKit Agents
See the LiveKit Agents documentation for more information (a dedicated plugin docs URL is not yet available).
Classes
class TTS (*,
model: TTSModel | str = 'speech-02-turbo',
voice: TTSVoice | str = 'English_radiant_girl',
emotion: TTSEmotion | None = None,
speed: float = 1.0,
vol: float = 1.0,
pitch: int = 0,
english_normalization: bool = False,
audio_format: TTSAudioFormat = 'mp3',
pronunciation_dict: dict[str, list[str]] | None = None,
intensity: int | None = None,
timbre: int | None = None,
sample_rate: TTSSampleRate = 24000,
bitrate: TTSBitRate = 128000,
tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
text_pacing: tts.SentenceStreamPacer | bool = False,
api_key: str | None = None,
base_url: NotGivenOr[str] = NOT_GIVEN,
http_session: aiohttp.ClientSession | None = None)-
Expand source code
class TTS(tts.TTS):
    """Minimax text-to-speech plugin for LiveKit Agents."""

    def __init__(
        self,
        *,
        model: TTSModel | str = DEFAULT_MODEL,
        voice: TTSVoice | str = DEFAULT_VOICE_ID,
        emotion: TTSEmotion | None = None,
        speed: float = 1.0,
        vol: float = 1.0,
        pitch: int = 0,
        english_normalization: bool = False,
        audio_format: TTSAudioFormat = "mp3",
        pronunciation_dict: dict[str, list[str]] | None = None,
        intensity: int | None = None,
        timbre: int | None = None,
        sample_rate: TTSSampleRate = 24000,
        bitrate: TTSBitRate = 128000,
        tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
        text_pacing: tts.SentenceStreamPacer | bool = False,
        api_key: str | None = None,
        base_url: NotGivenOr[str] = NOT_GIVEN,
        http_session: aiohttp.ClientSession | None = None,
    ):
        """Minimax TTS plugin

        Args:
            model (TTSModel | str, optional): The Minimax TTS model to use. Defaults to DEFAULT_MODEL.
            voice (TTSVoice | str, optional): The voice to use. Defaults to DEFAULT_VOICE_ID.
            emotion (TTSEmotion | None, optional): Emotion control for speech synthesis. Defaults to None.
            speed (float, optional): Speech speed, higher values speak faster. Range is [0.5, 2.0].
            vol (float, optional): Speech volume, range is [0, 10].
            pitch (int, optional): Speech pitch adjustment, range is [-12, 12].
            english_normalization (bool, optional): Enable text normalization in English. Improves performance
                in digit-reading scenarios at the cost of slightly higher latency. Defaults to False.
            audio_format (TTSAudioFormat, optional): The audio format to use. Defaults to "mp3".
            pronunciation_dict (dict[str, list[str]] | None, optional): Defines pronunciation rules for specific characters or symbols.
            intensity (int | None, optional): Corresponds to the "Strong/Softer" slider on the official page. Range [-100, 100].
            timbre (int | None, optional): Corresponds to the "Nasal/Crisp" slider on the official page. Range: [-100, 100].
            sample_rate (TTSSampleRate, optional): The audio sample rate in Hz. Defaults to 24000.
            bitrate (TTSBitRate, optional): The audio bitrate in bits per second. Defaults to 128000.
            tokenizer (NotGivenOr[tokenize.SentenceTokenizer], optional): The sentence tokenizer to use. Defaults to NOT_GIVEN.
            text_pacing (tts.SentenceStreamPacer | bool, optional): Enable text pacing for sentence-level timing control. Defaults to False.
            api_key (str | None, optional): The Minimax API key. Falls back to the MINIMAX_API_KEY
                environment variable. Defaults to None.
            base_url (NotGivenOr[str], optional): The base URL for the Minimax API. Falls back to the
                MINIMAX_BASE_URL environment variable, then DEFAULT_BASE_URL. Defaults to NOT_GIVEN.
            http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.

        Raises:
            ValueError: If no API key is available, or if ``speed``, ``intensity`` or ``timbre``
                is outside its allowed range.
        """  # noqa: E501
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=True, aligned_transcript=False),
            sample_rate=sample_rate,
            num_channels=1,
        )

        base_url = (
            base_url
            if utils.is_given(base_url)
            else os.environ.get("MINIMAX_BASE_URL", DEFAULT_BASE_URL)
        )

        minimax_api_key = api_key or os.environ.get("MINIMAX_API_KEY")
        if not minimax_api_key:
            raise ValueError("MINIMAX_API_KEY must be set")

        self._validate_ranges(speed=speed, intensity=intensity, timbre=timbre)

        self._sentence_tokenizer = (
            tokenizer if utils.is_given(tokenizer) else tokenize.basic.SentenceTokenizer()
        )

        # text_pacing=True selects a default pacer; an explicit pacer instance is used as-is;
        # anything else leaves pacing disabled.
        self._stream_pacer: tts.SentenceStreamPacer | None = None
        if text_pacing is True:
            self._stream_pacer = tts.SentenceStreamPacer()
        elif isinstance(text_pacing, tts.SentenceStreamPacer):
            self._stream_pacer = text_pacing

        self._opts = _TTSOptions(
            model=model,
            voice_id=voice,
            api_key=minimax_api_key,
            base_url=base_url,
            sample_rate=sample_rate,
            emotion=emotion,
            bitrate=bitrate,
            speed=speed,
            pitch=pitch,
            vol=vol,
            english_normalization=english_normalization,
            timbre=timbre,
            pronunciation_dict=pronunciation_dict,
            intensity=intensity,
            audio_format=audio_format,
        )

        self._session = http_session
        # Weak references: streams are tracked for aclose() without being kept alive by this set.
        self._streams = weakref.WeakSet[SynthesizeStream]()

    @staticmethod
    def _validate_ranges(
        *,
        speed: NotGivenOr[float] = NOT_GIVEN,
        intensity: NotGivenOr[int | None] = NOT_GIVEN,
        timbre: NotGivenOr[int | None] = NOT_GIVEN,
    ) -> None:
        """Range-check the voice parameters shared by ``__init__`` and ``update_options``.

        Values left as NOT_GIVEN are skipped; ``None`` is accepted for ``intensity`` and ``timbre``.

        Raises:
            ValueError: If a supplied value is outside its allowed range.
        """
        if utils.is_given(speed) and not (0.5 <= speed <= 2.0):
            raise ValueError(f"speed must be between 0.5 and 2.0, but got {speed}")
        if utils.is_given(intensity) and intensity is not None and not (-100 <= intensity <= 100):
            raise ValueError(f"intensity must be between -100 and 100, but got {intensity}")
        if utils.is_given(timbre) and timbre is not None and not (-100 <= timbre <= 100):
            raise ValueError(f"timbre must be between -100 and 100, but got {timbre}")

    def update_options(
        self,
        *,
        model: NotGivenOr[TTSModel | str] = NOT_GIVEN,
        voice: NotGivenOr[TTSVoice | str] = NOT_GIVEN,
        emotion: NotGivenOr[TTSEmotion | None] = NOT_GIVEN,
        speed: NotGivenOr[float] = NOT_GIVEN,
        vol: NotGivenOr[float] = NOT_GIVEN,
        pitch: NotGivenOr[int] = NOT_GIVEN,
        english_normalization: NotGivenOr[bool] = NOT_GIVEN,
        audio_format: NotGivenOr[TTSAudioFormat] = NOT_GIVEN,
        pronunciation_dict: NotGivenOr[dict[str, list[str]]] = NOT_GIVEN,
        intensity: NotGivenOr[int] = NOT_GIVEN,
        timbre: NotGivenOr[int] = NOT_GIVEN,
    ) -> None:
        """Update the TTS configuration options.

        Only arguments that are explicitly given are applied; the rest keep their current value.

        Raises:
            ValueError: If ``speed``, ``intensity`` or ``timbre`` is outside the range enforced
                by the constructor. Nothing is modified in that case.
        """
        # Validate before assigning anything so a rejected call cannot leave a partial update.
        self._validate_ranges(speed=speed, intensity=intensity, timbre=timbre)

        if utils.is_given(model):
            self._opts.model = model
        if utils.is_given(voice):
            self._opts.voice_id = voice
        if utils.is_given(emotion):
            self._opts.emotion = cast(Optional[TTSEmotion], emotion)
        if utils.is_given(speed):
            self._opts.speed = speed
        if utils.is_given(vol):
            self._opts.vol = vol
        if utils.is_given(pitch):
            self._opts.pitch = pitch
        if utils.is_given(english_normalization):
            self._opts.english_normalization = english_normalization
        if utils.is_given(audio_format):
            self._opts.audio_format = cast(TTSAudioFormat, audio_format)
        if utils.is_given(pronunciation_dict):
            self._opts.pronunciation_dict = pronunciation_dict
        if utils.is_given(intensity):
            self._opts.intensity = intensity
        if utils.is_given(timbre):
            self._opts.timbre = timbre

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the HTTP session, obtaining one from the http context on first use."""
        if not self._session:
            self._session = utils.http_context.http_session()

        return self._session

    async def _connect_ws(self, timeout: float) -> aiohttp.ClientWebSocketResponse:
        """Open the T2A websocket against the configured base URL within ``timeout`` seconds."""
        url = self._opts.base_url
        if url.startswith("http"):
            # Replacing only the first "http" maps http -> ws and https -> wss.
            url = url.replace("http", "ws", 1)
        url = f"{url}/ws/v1/t2a_v2"

        headers = {"Authorization": f"Bearer {self._opts.api_key}"}
        session = self._ensure_session()
        ws = await asyncio.wait_for(session.ws_connect(url, headers=headers), timeout)
        return ws

    async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None:
        """Close the given websocket."""
        await ws.close()

    def synthesize(
        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> ChunkedStream:
        """Return a ChunkedStream that synthesizes ``text`` with this TTS instance."""
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

    def stream(
        self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> SynthesizeStream:
        """Create a SynthesizeStream for this TTS instance and track it for ``aclose``."""
        stream = SynthesizeStream(tts=self, conn_options=conn_options)
        self._streams.add(stream)
        return stream

    async def aclose(self) -> None:
        """Close every tracked stream, then clear the tracking set."""
        # Iterate over a snapshot: the weak set may change while streams are being closed.
        for stream in list(self._streams):
            await stream.aclose()

        self._streams.clear()
Helper class that provides a standard way to create an ABC using inheritance.
Minimax TTS plugin
Args
model
:TTSModel | str
, optional- The Minimax TTS model to use. Defaults to DEFAULT_MODEL.
voice
:TTSVoice | str
, optional- The voice to use. Defaults to DEFAULT_VOICE_ID.
emotion
:TTSEmotion | None
, optional- Emotion control for speech synthesis. Defaults to None.
speed
:float
, optional- Speech speed, higher values speak faster. Range is [0.5, 2.0].
vol
:float
, optional- Speech volume, range is [0, 10].
pitch
:int
, optional- Speech pitch adjustment, range is [-12, 12].
english_normalization
:bool
, optional- Enable text normalization in English. Improves performance in digit-reading scenarios at the cost of slightly higher latency. Defaults to False.
audio_format
:TTSAudioFormat
, optional- The audio format to use. Defaults to "mp3".
pronunciation_dict
:dict[str, list[str]] | None
, optional- Defines pronunciation rules for specific characters or symbols.
intensity
:int | None
, optional- Corresponds to the "Strong/Softer" slider on the official page. Range [-100, 100].
timbre
:int | None
, optional- Corresponds to the "Nasal/Crisp" slider on the official page. Range: [-100, 100].
sample_rate
:TTSSampleRate
, optional- The audio sample rate in Hz. Defaults to 24000.
bitrate
:TTSBitRate
, optional- The audio bitrate in bits per second. Defaults to 128000.
tokenizer
:NotGivenOr[tokenize.SentenceTokenizer]
, optional- The sentence tokenizer to use. Defaults to NOT_GIVEN.
text_pacing
:tts.SentenceStreamPacer | bool
, optional- Enable text pacing for sentence-level timing control. Defaults to False.
api_key
:str | None
, optional- The Minimax API key. Defaults to None.
base_url
:NotGivenOr[str]
, optional- The base URL for the Minimax API. Defaults to NOT_GIVEN.
http_session
:aiohttp.ClientSession | None
, optional- An existing aiohttp ClientSession to use. If not provided, a new session will be created.
Ancestors
- livekit.agents.tts.tts.TTS
- abc.ABC
- EventEmitter
- typing.Generic
Methods
async def aclose(self) ‑> None
-
Expand source code
async def aclose(self) -> None:
    """Close each stream currently held in ``self._streams``, then empty the set."""
    # Take a snapshot first so the collection is not iterated while it may be changing.
    open_streams = tuple(self._streams)
    for active in open_streams:
        await active.aclose()
    self._streams.clear()
def stream(self,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.minimax.tts.SynthesizeStream-
Expand source code
def stream(
    self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
) -> SynthesizeStream:
    """Build a SynthesizeStream for this TTS instance and register it in ``self._streams``."""
    new_stream = SynthesizeStream(conn_options=conn_options, tts=self)
    # Registered so that aclose() can close it later.
    self._streams.add(new_stream)
    return new_stream
def synthesize(self,
text: str,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.minimax.tts.ChunkedStream-
Expand source code
def synthesize(
    self,
    text: str,
    *,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> ChunkedStream:
    """Return a ChunkedStream for ``text`` using this TTS instance and ``conn_options``."""
    chunked = ChunkedStream(conn_options=conn_options, input_text=text, tts=self)
    return chunked
def update_options(self,
*,
model: NotGivenOr[TTSModel | str] = NOT_GIVEN,
voice: NotGivenOr[TTSVoice | str] = NOT_GIVEN,
emotion: NotGivenOr[TTSEmotion | None] = NOT_GIVEN,
speed: NotGivenOr[float] = NOT_GIVEN,
vol: NotGivenOr[float] = NOT_GIVEN,
pitch: NotGivenOr[int] = NOT_GIVEN,
english_normalization: NotGivenOr[bool] = NOT_GIVEN,
audio_format: NotGivenOr[TTSAudioFormat] = NOT_GIVEN,
pronunciation_dict: NotGivenOr[dict[str, list[str]]] = NOT_GIVEN,
intensity: NotGivenOr[int] = NOT_GIVEN,
timbre: NotGivenOr[int] = NOT_GIVEN) ‑> None-
Expand source code
def update_options(
    self,
    *,
    model: NotGivenOr[TTSModel | str] = NOT_GIVEN,
    voice: NotGivenOr[TTSVoice | str] = NOT_GIVEN,
    emotion: NotGivenOr[TTSEmotion | None] = NOT_GIVEN,
    speed: NotGivenOr[float] = NOT_GIVEN,
    vol: NotGivenOr[float] = NOT_GIVEN,
    pitch: NotGivenOr[int] = NOT_GIVEN,
    english_normalization: NotGivenOr[bool] = NOT_GIVEN,
    audio_format: NotGivenOr[TTSAudioFormat] = NOT_GIVEN,
    pronunciation_dict: NotGivenOr[dict[str, list[str]]] = NOT_GIVEN,
    intensity: NotGivenOr[int] = NOT_GIVEN,
    timbre: NotGivenOr[int] = NOT_GIVEN,
) -> None:
    """Update the TTS configuration options.

    Only arguments that are explicitly given are applied; the rest keep their current value.

    Raises:
        ValueError: If ``speed`` is outside [0.5, 2.0], or ``intensity`` / ``timbre`` is
            outside [-100, 100] (the same limits the constructor enforces). Nothing is
            modified in that case.
    """
    # Range-check up front so a rejected call cannot leave a partially applied update.
    if utils.is_given(speed) and not (0.5 <= speed <= 2.0):
        raise ValueError(f"speed must be between 0.5 and 2.0, but got {speed}")
    if utils.is_given(intensity) and intensity is not None and not (-100 <= intensity <= 100):
        raise ValueError(f"intensity must be between -100 and 100, but got {intensity}")
    if utils.is_given(timbre) and timbre is not None and not (-100 <= timbre <= 100):
        raise ValueError(f"timbre must be between -100 and 100, but got {timbre}")

    if utils.is_given(model):
        self._opts.model = model
    if utils.is_given(voice):
        self._opts.voice_id = voice
    if utils.is_given(emotion):
        self._opts.emotion = cast(Optional[TTSEmotion], emotion)
    if utils.is_given(speed):
        self._opts.speed = speed
    if utils.is_given(vol):
        self._opts.vol = vol
    if utils.is_given(pitch):
        self._opts.pitch = pitch
    if utils.is_given(english_normalization):
        self._opts.english_normalization = english_normalization
    if utils.is_given(audio_format):
        self._opts.audio_format = cast(TTSAudioFormat, audio_format)
    if utils.is_given(pronunciation_dict):
        self._opts.pronunciation_dict = pronunciation_dict
    if utils.is_given(intensity):
        self._opts.intensity = intensity
    if utils.is_given(timbre):
        self._opts.timbre = timbre
Update the TTS configuration options.
Inherited members