Module livekit.plugins.simplismart
SimpliSmart plugin for LiveKit Agents
Support for speech-to-text and text-to-speech with SimpliSmart.
SimpliSmart hosts a range of STT and TTS models, including Whisper-based transcription and TTS models such as Orpheus and Qwen 3 TTS.
For API access, visit https://simplismart.ai/
Classes
class STT (*,
base_url: str = 'https://api.simplismart.live/predict',
api_key: str | None = None,
streaming: bool = False,
model: Literal['openai/whisper-large-v2', 'openai/whisper-large-v3', 'openai/whisper-large-v3-turbo'] | str = 'openai/whisper-large-v3-turbo',
language: str = 'en',
task: Literal['transcribe', 'translate'] = 'transcribe',
without_timestamps: bool = True,
vad_model: Literal['silero', 'frame'] = 'frame',
vad_filter: bool = True,
vad_onset: float | None = 0.5,
vad_offset: float | None = None,
min_speech_duration_ms: int = 0,
max_speech_duration_s: float = 30,
min_silence_duration_ms: int = 2000,
speech_pad_ms: int = 400,
initial_prompt: str | None = None,
hotwords: str | None = None,
num_speakers: int = 0,
compression_ratio_threshold: float | None = 2.4,
beam_size: int = 4,
temperature: float = 0.0,
multilingual: bool = False,
max_tokens: float | None = 400,
log_prob_threshold: float | None = -1.0,
length_penalty: int = 1,
repetition_penalty: float = 1.01,
strict_hallucination_reduction: bool = False,
http_session: aiohttp.client.ClientSession | None = None)
class STT(stt.STT):
    def __init__(
        self,
        *,
        base_url: str = SIMPLISMART_BASE_URL,
        api_key: str | None = None,
        streaming: bool = False,
        model: STTModels | str = "openai/whisper-large-v3-turbo",
        language: str = "en",
        task: Literal["transcribe", "translate"] = "transcribe",
        without_timestamps: bool = True,
        vad_model: Literal["silero", "frame"] = "frame",
        vad_filter: bool = True,
        vad_onset: float | None = 0.5,
        vad_offset: float | None = None,
        min_speech_duration_ms: int = 0,
        max_speech_duration_s: float = 30,
        min_silence_duration_ms: int = 2000,
        speech_pad_ms: int = 400,
        initial_prompt: str | None = None,
        hotwords: str | None = None,
        num_speakers: int = 0,
        compression_ratio_threshold: float | None = 2.4,
        beam_size: int = 4,
        temperature: float = 0.0,
        multilingual: bool = False,
        max_tokens: float | None = 400,
        log_prob_threshold: float | None = -1.0,
        length_penalty: int = 1,
        repetition_penalty: float = 1.01,
        strict_hallucination_reduction: bool = False,
        http_session: aiohttp.ClientSession | None = None,
    ):
        """Configuration options for the SimpliSmart STT (Speech-to-Text) engine.

        Note: Streaming transcription is not publicly available at this time.

        Args:
            language (str): Language code for transcription (default: "en").
            task (Literal["transcribe", "translate"]): Operation to perform, either "transcribe" or "translate".
            model (STTModels | str): Model identifier for the backend STT model.
            without_timestamps (bool): If True, disables timestamp generation in transcripts.
            vad_model (Literal["silero", "frame"]): Voice Activity Detection model to use ("silero" or "frame").
            vad_filter (bool): Whether to apply VAD to filter input audio.
            vad_onset (float | None): Time (in seconds) for VAD onset boundary.
            vad_offset (float | None): Time (in seconds) for VAD offset boundary.
            min_speech_duration_ms (int): Minimum duration (ms) for a valid speech segment.
            max_speech_duration_s (float): Maximum speech segment duration (seconds).
            min_silence_duration_ms (int): Minimum silence duration (ms) to split speech.
            speech_pad_ms (int): Padding (ms) added to boundaries of detected speech.
            initial_prompt (str | None): Optional initial prompt for contextual biasing.
            hotwords (str | None): Comma-separated list of hotwords to bias recognition.
            num_speakers (int): Number of speakers for diarization.
            compression_ratio_threshold (float | None): Threshold for output compression ratio.
            beam_size (int): Beam size for the decoder.
            temperature (float): Decoding temperature (affects randomness).
            multilingual (bool): Whether to permit multilingual recognition.
            max_tokens (float | None): Maximum number of output tokens for the model.
            log_prob_threshold (float | None): Log probability threshold for word filtering.
            length_penalty (int): Penalty for longer transcriptions.
            repetition_penalty (float): Penalty for repeated words during decoding.
            strict_hallucination_reduction (bool): Whether to apply hallucination reduction.
        """
        if streaming:
            base_url = f"wss://{urlparse(base_url).netloc}/ws/audio"
        super().__init__(
            capabilities=stt.STTCapabilities(
                streaming=streaming,
                interim_results=False,
                aligned_transcript="word",
            )
        )
        self._api_key = api_key or os.environ.get("SIMPLISMART_API_KEY")
        if not self._api_key:
            raise ValueError("SIMPLISMART_API_KEY is not set")
        self._model = model
        self._opts = SimplismartSTTOptions(
            language=LanguageCode(language),
            task=task,
            without_timestamps=without_timestamps,
            vad_model=vad_model,
            vad_filter=vad_filter,
            vad_onset=vad_onset,
            vad_offset=vad_offset,
            min_speech_duration_ms=min_speech_duration_ms,
            max_speech_duration_s=max_speech_duration_s,
            min_silence_duration_ms=min_silence_duration_ms,
            speech_pad_ms=speech_pad_ms,
            initial_prompt=initial_prompt,
            hotwords=hotwords,
            num_speakers=num_speakers,
            compression_ratio_threshold=compression_ratio_threshold,
            beam_size=beam_size,
            temperature=temperature,
            multilingual=multilingual,
            max_tokens=max_tokens,
            log_prob_threshold=log_prob_threshold,
            length_penalty=length_penalty,
            repetition_penalty=repetition_penalty,
            strict_hallucination_reduction=strict_hallucination_reduction,
        )
        self._base_url = base_url
        self._session = http_session
        self._streams = weakref.WeakSet[SpeechStream]()

    @property
    def provider(self) -> str:
        return "Simplismart"

    @property
    def model(self) -> str:
        return self._model

    def _ensure_session(self) -> aiohttp.ClientSession:
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    async def _recognize_impl(
        self,
        buffer: AudioBuffer,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> stt.SpeechEvent:
        resolved_language: str | None = language if is_given(language) else self._opts.language
        wav_bytes = rtc.combine_audio_frames(buffer).to_wav_bytes()
        audio_b64 = base64.b64encode(wav_bytes).decode("utf-8")
        payload = self._opts.model_dump()
        payload["audio_data"] = audio_b64
        payload["language"] = resolved_language
        payload["model"] = self._model
        try:
            async with self._ensure_session().post(
                self._base_url,
                json=payload,
                headers={
                    "Authorization": f"Bearer {self._api_key}",
                    "Content-Type": "application/json",
                },
                timeout=aiohttp.ClientTimeout(
                    total=conn_options.timeout,
                ),
            ) as res:
                if res.status != 200:
                    error_text = await res.text()
                    logger.error(f"Simplismart API error: {res.status} - {error_text}")
                    raise APIStatusError(
                        message=f"Simplismart API Error: {error_text}",
                        status_code=res.status,
                        request_id=None,
                        body=error_text,
                    )
                response_json = await res.json()
                timestamps = response_json.get("timestamps", [])
                transcription = response_json.get("transcription", [])
                info = response_json.get("info", {})
                detected_language = LanguageCode(info.get("language", resolved_language or "en"))
                start_time = timestamps[0][0] if timestamps else 0.0
                end_time = timestamps[-1][1] if timestamps else 0.0
                request_id = response_json.get("request_id", "")
                text = "".join(transcription)
                alternatives = [
                    stt.SpeechData(
                        language=detected_language,
                        text=text,
                        start_time=start_time,
                        end_time=end_time,
                    ),
                ]
                return stt.SpeechEvent(
                    type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                    request_id=request_id,
                    alternatives=alternatives,
                )
        except asyncio.TimeoutError as e:
            logger.error(f"Simplismart API timeout: {e}")
            raise APITimeoutError("Simplismart API request timed out") from e
        except aiohttp.ClientError as e:
            logger.error(f"Simplismart API client error: {e}")
            raise APIConnectionError(f"Simplismart API connection error: {e}") from e
        except APIStatusError:
            raise
        except Exception as e:
            logger.error(f"Error during Simplismart STT processing: {traceback.format_exc()}")
            raise APIConnectionError(f"Unexpected error in Simplismart STT: {e}") from e

    def stream(
        self,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
        **kwargs: Any,
    ) -> "SpeechStream":
        """Create a streaming transcription session."""
        opts_language = LanguageCode(language) if is_given(language) else self._opts.language
        # Create options for the stream
        stream_opts = SimplismartSTTOptions(language=opts_language)
        # Create a fresh session for this stream to avoid conflicts
        stream_session = aiohttp.ClientSession()
        if self._api_key is None:
            raise ValueError("API key cannot be None")
        stream = SpeechStream(
            stt=self,
            opts=stream_opts,
            conn_options=conn_options,
            api_key=self._api_key,
            http_session=stream_session,
        )
        self._streams.add(stream)
        return stream
Configuration options for the SimpliSmart STT (Speech-to-Text) engine.
Note
Streaming transcription is not publicly available at this time.
Args
language : str - Language code for transcription (default: "en").
task : Literal["transcribe", "translate"] - Operation to perform, either "transcribe" or "translate".
model : STTModels | str - Model identifier for the backend STT model.
without_timestamps : bool - If True, disables timestamp generation in transcripts.
vad_model : Literal["silero", "frame"] - Voice Activity Detection model to use ("silero" or "frame").
vad_filter : bool - Whether to apply VAD to filter input audio.
vad_onset : float | None - Time (in seconds) for VAD onset boundary.
vad_offset : float | None - Time (in seconds) for VAD offset boundary.
min_speech_duration_ms : int - Minimum duration (ms) for a valid speech segment.
max_speech_duration_s : float - Maximum speech segment duration (seconds).
min_silence_duration_ms : int - Minimum silence duration (ms) to split speech.
speech_pad_ms : int - Padding (ms) added to boundaries of detected speech.
initial_prompt : str | None - Optional initial prompt for contextual biasing.
hotwords : str | None - Comma-separated list of hotwords to bias recognition.
num_speakers : int - Number of speakers for diarization.
compression_ratio_threshold : float | None - Threshold for output compression ratio.
beam_size : int - Beam size for the decoder.
temperature : float - Decoding temperature (affects randomness).
multilingual : bool - Whether to permit multilingual recognition.
max_tokens : float | None - Maximum number of output tokens for the model.
log_prob_threshold : float | None - Log probability threshold for word filtering.
length_penalty : int - Penalty for longer transcriptions.
repetition_penalty : float - Penalty for repeated words during decoding.
strict_hallucination_reduction : bool - Whether to apply hallucination reduction.
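For reference, the options above end up in a JSON request: the recognizer base64-encodes the captured WAV audio and POSTs it, together with the option values, under a Bearer token. A minimal stdlib-only sketch (`build_stt_payload` is a hypothetical helper name; the real plugin serializes the full option set internally):

```python
import base64

def build_stt_payload(
    wav_bytes: bytes, api_key: str, *, model: str, language: str = "en"
) -> tuple[dict, dict]:
    # Audio is sent inline as base64 text inside the JSON body, not as a
    # multipart upload; model and language ride alongside it.
    payload = {
        "audio_data": base64.b64encode(wav_bytes).decode("utf-8"),
        "language": language,
        "model": model,
    }
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    return payload, headers

# "sk-demo" is a placeholder key, not a real credential format
payload, headers = build_stt_payload(
    b"\x00" * 8, "sk-demo", model="openai/whisper-large-v3-turbo"
)
```

The same shape is visible in the `_recognize_impl` source below, where the remaining VAD and decoding options are merged into the payload as well.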
Ancestors
- livekit.agents.stt.stt.STT
- abc.ABC
- EventEmitter
- typing.Generic
Instance variables
prop model : str
@property
def model(self) -> str:
    return self._model

Get the model name/identifier for this STT instance.
Returns
The model name if available, "unknown" otherwise.
Note
Plugins should override this property to provide their model information.
prop provider : str
@property
def provider(self) -> str:
    return "Simplismart"

Get the provider name/identifier for this STT instance.
Returns
The provider name if available, "unknown" otherwise.
Note
Plugins should override this property to provide their provider information.
Methods
def stream(self,
*,
language: str | livekit.agents.types.NotGiven = NOT_GIVEN,
conn_options: livekit.agents.types.APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0),
**kwargs: Any) -> livekit.plugins.simplismart.stt.SpeechStream
def stream(
    self,
    *,
    language: NotGivenOr[str] = NOT_GIVEN,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    **kwargs: Any,
) -> "SpeechStream":
    """Create a streaming transcription session."""
    opts_language = LanguageCode(language) if is_given(language) else self._opts.language
    # Create options for the stream
    stream_opts = SimplismartSTTOptions(language=opts_language)
    # Create a fresh session for this stream to avoid conflicts
    stream_session = aiohttp.ClientSession()
    if self._api_key is None:
        raise ValueError("API key cannot be None")
    stream = SpeechStream(
        stt=self,
        opts=stream_opts,
        conn_options=conn_options,
        api_key=self._api_key,
        http_session=stream_session,
    )
    self._streams.add(stream)
    return stream

Create a streaming transcription session.
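When `streaming=True`, the constructor derives the websocket endpoint from the REST `base_url` by reusing its host and switching to the `/ws/audio` path. A small stdlib-only sketch of that transformation:

```python
from urllib.parse import urlparse

def streaming_url(base_url: str) -> str:
    # Mirrors the constructor logic: keep the host of the REST base URL,
    # swap the scheme to secure websockets, and target the audio endpoint.
    return f"wss://{urlparse(base_url).netloc}/ws/audio"

print(streaming_url("https://api.simplismart.live/predict"))
# wss://api.simplismart.live/ws/audio
```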
Inherited members
class TTS (*,
base_url: str | None = None,
model: TTSModels | str = 'canopylabs/orpheus-3b-0.1-ft',
voice: str | None = None,
api_key: str | None = None,
http_session: aiohttp.ClientSession | None = None,
sample_rate: int = 24000,
temperature: float = 0.7,
top_p: float = 0.9,
repetition_penalty: float = 1.5,
max_tokens: int = 1000,
language: str = 'English',
leading_silence: bool = True)
class TTS(tts.TTS):
    def __init__(
        self,
        *,
        base_url: str | None = None,
        model: TTSModels | str = DEFAULT_ORPHEUS_MODEL,
        voice: str | None = None,
        api_key: str | None = None,
        http_session: aiohttp.ClientSession | None = None,
        # sample_rate is used by the audio framework for playback; not sent to the server
        sample_rate: int = 24000,
        # Simplismart TTS options
        temperature: float = 0.7,
        top_p: float = 0.9,
        repetition_penalty: float = 1.5,
        max_tokens: int = 1000,
        # Qwen 3 TTS options
        language: str = "English",
        leading_silence: bool = True,
    ) -> None:
        """Initialize SimpliSmart TTS.

        SimpliSmart hosts multiple TTS models. The model name determines which
        endpoint and payload format to use. Defaults are set for the Orpheus
        model (``"canopylabs/orpheus-3b-0.1-ft"``).

        Args:
            base_url: Base URL for the TTS endpoint.
            model: TTS model identifier.
            voice: Voice/speaker identifier.
            api_key: API key for authentication (defaults to ``SIMPLISMART_API_KEY`` env var).
            http_session: Optional aiohttp session for reuse.
            sample_rate: Expected sample rate of the returned PCM audio (default: 24000).
                Used by the framework for playback; not sent to the server.
            temperature: Controls output randomness.
            top_p: Nucleus sampling threshold.
            repetition_penalty: Penalty for repeated tokens.
            max_tokens: Maximum number of output tokens.
            language: Qwen 3 TTS only — language for synthesis (default: ``"English"``).
            leading_silence: Qwen 3 TTS only — whether to include leading silence (default: ``True``).
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=False),
            sample_rate=sample_rate,
            num_channels=1,
        )
        is_qwen = _is_qwen_model(model)
        self._base_url = (
            base_url
            if base_url is not None
            else (QWEN_BASE_URL if is_qwen else SIMPLISMART_BASE_URL)
        )
        self._opts = _TTSOptions(
            model=model,
            voice=voice
            if voice is not None
            else (DEFAULT_QWEN_VOICE if is_qwen else DEFAULT_ORPHEUS_VOICE),
        )
        if is_qwen:
            self._opts.qwen_options = _QwenTTSOptions(
                language=language,
                leading_silence=leading_silence,
            )
        else:
            self._opts.simplismart_options = _SimplismartTTSOptions(
                temperature=temperature,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                max_tokens=max_tokens,
            )
        self._api_key = api_key or os.environ.get("SIMPLISMART_API_KEY")
        if not self._api_key:
            raise ValueError("SIMPLISMART_API_KEY is not set")
        self._session = http_session

    @property
    def model(self) -> str:
        return self._opts.model

    @property
    def provider(self) -> str:
        return "SimpliSmart"

    def _ensure_session(self) -> aiohttp.ClientSession:
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    def synthesize(
        self,
        text: str,
        *,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> ChunkedStream:
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
Initialize SimpliSmart TTS.
SimpliSmart hosts multiple TTS models. The model name determines which endpoint and payload format to use. Defaults are set for the Orpheus model ("canopylabs/orpheus-3b-0.1-ft").
Args
base_url - Base URL for the TTS endpoint.
model - TTS model identifier.
voice - Voice/speaker identifier.
api_key - API key for authentication (defaults to the SIMPLISMART_API_KEY env var).
http_session - Optional aiohttp session for reuse.
sample_rate - Expected sample rate of the returned PCM audio (default: 24000). Used by the framework for playback; not sent to the server.
temperature - Controls output randomness.
top_p - Nucleus sampling threshold.
repetition_penalty - Penalty for repeated tokens.
max_tokens - Maximum number of output tokens.
language - Qwen 3 TTS only; language for synthesis (default: "English").
leading_silence - Qwen 3 TTS only; whether to include leading silence (default: True).
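The per-model option routing described above can be sketched in plain Python. This page does not show `_is_qwen_model`, so a case-insensitive name check stands in for it here; the option groups mirror the two families documented in the Args list:

```python
def build_tts_options(model: str, **kwargs) -> dict:
    # Assumption: a substring check on the model name stands in for the
    # plugin's private _is_qwen_model helper, whose source is not shown here.
    is_qwen = "qwen" in model.lower()
    if is_qwen:
        # Qwen 3 TTS payload options
        return {
            "language": kwargs.get("language", "English"),
            "leading_silence": kwargs.get("leading_silence", True),
        }
    # Orpheus-style sampling options
    return {
        "temperature": kwargs.get("temperature", 0.7),
        "top_p": kwargs.get("top_p", 0.9),
        "repetition_penalty": kwargs.get("repetition_penalty", 1.5),
        "max_tokens": kwargs.get("max_tokens", 1000),
    }
```

Options from the other family are simply ignored for the selected model, which matches the constructor's either/or assignment of `qwen_options` vs. `simplismart_options`.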
Ancestors
- livekit.agents.tts.tts.TTS
- abc.ABC
- EventEmitter
- typing.Generic
Instance variables
prop model : str
@property
def model(self) -> str:
    return self._opts.model

Get the model name/identifier for this TTS instance.
Returns
The model name if available, "unknown" otherwise.
Note
Plugins should override this property to provide their model information.
prop provider : str
@property
def provider(self) -> str:
    return "SimpliSmart"

Get the provider name/identifier for this TTS instance.
Returns
The provider name if available, "unknown" otherwise.
Note
Plugins should override this property to provide their provider information.
Methods
def synthesize(self,
text: str,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) -> livekit.plugins.simplismart.tts.ChunkedStream
def synthesize(
    self,
    text: str,
    *,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> ChunkedStream:
    return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
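The returned audio is raw PCM that the framework plays back at the configured `sample_rate`. A quick sketch of how playback duration follows from that rate; the single channel matches the `num_channels=1` passed to the base class, while the 16-bit sample width is an assumption not stated on this page:

```python
def pcm_duration_seconds(pcm: bytes, sample_rate: int = 24000) -> float:
    # Assumes 16-bit (2-byte) samples and one channel; the exact wire
    # format of the returned audio is not specified in this documentation.
    bytes_per_second = sample_rate * 2
    return len(pcm) / bytes_per_second

# one second of silence at 24 kHz, 16-bit mono
assert pcm_duration_seconds(b"\x00\x00" * 24000) == 1.0
```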
Inherited members