Module livekit.plugins.simplismart
SimpliSmart plugin for LiveKit Agents
Support for speech-to-text and text-to-speech with SimpliSmart.
SimpliSmart provides high-quality STT and TTS for Indian languages.
For API access, visit https://simplismart.ai/
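A minimal end-to-end sketch is shown below. It assumes the plugin is used from a LiveKit Agents application where AgentSession accepts stt and tts instances, and that the SIMPLISMART_API_KEY environment variable is set; treat it as an illustrative wiring example rather than canonical usage.

from livekit.agents import AgentSession
from livekit.plugins import simplismart

# Both classes read SIMPLISMART_API_KEY from the environment when api_key
# is not passed explicitly.
session = AgentSession(
    stt=simplismart.STT(language="en"),
    tts=simplismart.TTS(voice="tara"),
)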
Classes
class STT (*,
base_url: str = 'https://api.simplismart.live/predict',
api_key: str | None = None,
streaming: bool = False,
model: Literal['openai/whisper-large-v2', 'openai/whisper-large-v3', 'openai/whisper-large-v3-turbo'] | str = 'openai/whisper-large-v3-turbo',
language: str = 'en',
task: Literal['transcribe', 'translate'] = 'transcribe',
without_timestamps: bool = True,
vad_model: Literal['silero', 'frame'] = 'frame',
vad_filter: bool = True,
vad_onset: float | None = 0.5,
vad_offset: float | None = None,
min_speech_duration_ms: int = 0,
max_speech_duration_s: float = 30,
min_silence_duration_ms: int = 2000,
speech_pad_ms: int = 400,
initial_prompt: str | None = None,
hotwords: str | None = None,
num_speakers: int = 0,
compression_ratio_threshold: float | None = 2.4,
beam_size: int = 4,
temperature: float = 0.0,
multilingual: bool = False,
max_tokens: float | None = 400,
log_prob_threshold: float | None = -1.0,
length_penalty: int = 1,
repetition_penalty: float = 1.01,
strict_hallucination_reduction: bool = False,
http_session: aiohttp.client.ClientSession | None = None)
Source code

class STT(stt.STT):
    def __init__(
        self,
        *,
        base_url: str = SIMPLISMART_BASE_URL,
        api_key: str | None = None,
        streaming: bool = False,
        model: STTModels | str = "openai/whisper-large-v3-turbo",
        language: str = "en",
        task: Literal["transcribe", "translate"] = "transcribe",
        without_timestamps: bool = True,
        vad_model: Literal["silero", "frame"] = "frame",
        vad_filter: bool = True,
        vad_onset: float | None = 0.5,
        vad_offset: float | None = None,
        min_speech_duration_ms: int = 0,
        max_speech_duration_s: float = 30,
        min_silence_duration_ms: int = 2000,
        speech_pad_ms: int = 400,
        initial_prompt: str | None = None,
        hotwords: str | None = None,
        num_speakers: int = 0,
        compression_ratio_threshold: float | None = 2.4,
        beam_size: int = 4,
        temperature: float = 0.0,
        multilingual: bool = False,
        max_tokens: float | None = 400,
        log_prob_threshold: float | None = -1.0,
        length_penalty: int = 1,
        repetition_penalty: float = 1.01,
        strict_hallucination_reduction: bool = False,
        http_session: aiohttp.ClientSession | None = None,
    ):
        """Configuration options for the SimpliSmart STT (Speech-to-Text) engine.

        Note: Streaming transcription is not publicly available at this time.
        (The full parameter documentation is rendered in the Args section below.)
        """
        if streaming:
            base_url = f"wss://{urlparse(base_url).netloc}/ws/audio"
        super().__init__(
            capabilities=stt.STTCapabilities(
                streaming=streaming,
                interim_results=False,
                aligned_transcript="word",
            )
        )
        self._api_key = api_key or os.environ.get("SIMPLISMART_API_KEY")
        if not self._api_key:
            raise ValueError("SIMPLISMART_API_KEY is not set")
        self._model = model
        self._opts = SimplismartSTTOptions(
            language=language,
            task=task,
            without_timestamps=without_timestamps,
            vad_model=vad_model,
            vad_filter=vad_filter,
            vad_onset=vad_onset,
            vad_offset=vad_offset,
            min_speech_duration_ms=min_speech_duration_ms,
            max_speech_duration_s=max_speech_duration_s,
            min_silence_duration_ms=min_silence_duration_ms,
            speech_pad_ms=speech_pad_ms,
            initial_prompt=initial_prompt,
            hotwords=hotwords,
            num_speakers=num_speakers,
            compression_ratio_threshold=compression_ratio_threshold,
            beam_size=beam_size,
            temperature=temperature,
            multilingual=multilingual,
            max_tokens=max_tokens,
            log_prob_threshold=log_prob_threshold,
            length_penalty=length_penalty,
            repetition_penalty=repetition_penalty,
            strict_hallucination_reduction=strict_hallucination_reduction,
        )
        self._base_url = base_url
        self._session = http_session
        self._streams = weakref.WeakSet[SpeechStream]()

    @property
    def provider(self) -> str:
        return "Simplismart"

    @property
    def model(self) -> str:
        return self._model

    def _ensure_session(self) -> aiohttp.ClientSession:
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    async def _recognize_impl(
        self,
        buffer: AudioBuffer,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> stt.SpeechEvent:
        resolved_language: str | None = language if is_given(language) else self._opts.language
        wav_bytes = rtc.combine_audio_frames(buffer).to_wav_bytes()
        audio_b64 = base64.b64encode(wav_bytes).decode("utf-8")
        payload = self._opts.model_dump()
        payload["audio_data"] = audio_b64
        payload["language"] = resolved_language
        payload["model"] = self._model
        try:
            async with self._ensure_session().post(
                self._base_url,
                json=payload,
                headers={
                    "Authorization": f"Bearer {self._api_key}",
                    "Content-Type": "application/json",
                },
                timeout=aiohttp.ClientTimeout(total=conn_options.timeout),
            ) as res:
                if res.status != 200:
                    error_text = await res.text()
                    logger.error(f"Simplismart API error: {res.status} - {error_text}")
                    raise APIStatusError(
                        message=f"Simplismart API Error: {error_text}",
                        status_code=res.status,
                        request_id=None,
                        body=error_text,
                    )
                response_json = await res.json()
                timestamps = response_json.get("timestamps", [])
                transcription = response_json.get("transcription", [])
                info = response_json.get("info", {})
                detected_language = info.get("language", resolved_language or "en")
                start_time = timestamps[0][0] if timestamps else 0.0
                end_time = timestamps[-1][1] if timestamps else 0.0
                request_id = response_json.get("request_id", "")
                text = "".join(transcription)
                alternatives = [
                    stt.SpeechData(
                        language=detected_language,
                        text=text,
                        start_time=start_time,
                        end_time=end_time,
                    ),
                ]
                return stt.SpeechEvent(
                    type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                    request_id=request_id,
                    alternatives=alternatives,
                )
        except asyncio.TimeoutError as e:
            logger.error(f"Simplismart API timeout: {e}")
            raise APITimeoutError("Simplismart API request timed out") from e
        except aiohttp.ClientError as e:
            logger.error(f"Simplismart API client error: {e}")
            raise APIConnectionError(f"Simplismart API connection error: {e}") from e
        except APIStatusError:
            raise
        except Exception as e:
            logger.error(f"Error during Simplismart STT processing: {traceback.format_exc()}")
            raise APIConnectionError(f"Unexpected error in Simplismart STT: {e}") from e

    def stream(
        self,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
        **kwargs: Any,
    ) -> "SpeechStream":
        """Create a streaming transcription session."""
        opts_language = language if is_given(language) else self._opts.language
        # Create options for the stream
        stream_opts = SimplismartSTTOptions(language=opts_language)
        # Create a fresh session for this stream to avoid conflicts
        stream_session = aiohttp.ClientSession()
        if self._api_key is None:
            raise ValueError("API key cannot be None")
        stream = SpeechStream(
            stt=self,
            opts=stream_opts,
            conn_options=conn_options,
            api_key=self._api_key,
            http_session=stream_session,
        )
        self._streams.add(stream)
        return stream
Configuration options for the SimpliSmart STT (Speech-to-Text) engine.
Note
Streaming transcription is not publicly available at this time.
Args
language (str): Language code for transcription (default: "en").
task (Literal["transcribe", "translate"]): Operation to perform, either "transcribe" or "translate".
model (STTModels | str): Model identifier for the backend STT model.
without_timestamps (bool): If True, disables timestamp generation in transcripts.
vad_model (Literal["silero", "frame"]): Voice Activity Detection model to use ("silero" or "frame").
vad_filter (bool): Whether to apply VAD to filter input audio.
vad_onset (float | None): Time (in seconds) for the VAD onset boundary.
vad_offset (float | None): Time (in seconds) for the VAD offset boundary.
min_speech_duration_ms (int): Minimum duration (ms) for a valid speech segment.
max_speech_duration_s (float): Maximum speech segment duration (seconds).
min_silence_duration_ms (int): Minimum silence duration (ms) to split speech.
speech_pad_ms (int): Padding (ms) added to the boundaries of detected speech.
initial_prompt (str | None): Optional initial prompt for contextual biasing.
hotwords (str | None): Comma-separated list of hotwords to bias recognition.
num_speakers (int): Number of speakers for diarization.
compression_ratio_threshold (float | None): Threshold for the output compression ratio.
beam_size (int): Beam size for the decoder.
temperature (float): Decoding temperature (affects randomness).
multilingual (bool): Whether to permit multilingual recognition.
max_tokens (float | None): Maximum number of output tokens for the model.
log_prob_threshold (float | None): Log probability threshold for word filtering.
length_penalty (int): Penalty for longer transcriptions.
repetition_penalty (float): Penalty for repeated words during decoding.
strict_hallucination_reduction (bool): Whether to apply hallucination reduction.
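To illustrate the arguments above, here is a construction sketch for a non-streaming transcriber; the parameter names come from the signature documented above, while the specific values are arbitrary examples.

from livekit.plugins import simplismart

# Assumes SIMPLISMART_API_KEY is exported; api_key may also be passed directly.
stt = simplismart.STT(
    language="hi",                        # Hindi transcription
    model="openai/whisper-large-v3-turbo",
    vad_filter=True,                      # drop non-speech audio before decoding
    hotwords="LiveKit,SimpliSmart",       # comma-separated biasing terms
    initial_prompt="Customer support call transcript.",
)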
Ancestors
- livekit.agents.stt.stt.STT
- abc.ABC
- EventEmitter
- typing.Generic
Instance variables
prop model : str
Source code

@property
def model(self) -> str:
    return self._model

Get the model name/identifier for this STT instance.
Returns
The model name if available, "unknown" otherwise.
Note
Plugins should override this property to provide their model information.
prop provider : str
Source code

@property
def provider(self) -> str:
    return "Simplismart"

Get the provider name/identifier for this STT instance.
Returns
The provider name if available, "unknown" otherwise.
Note
Plugins should override this property to provide their provider information.
Methods
def stream(self,
*,
language: str | livekit.agents.types.NotGiven = NOT_GIVEN,
conn_options: livekit.agents.types.APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0),
**kwargs: Any) -> livekit.plugins.simplismart.stt.SpeechStream
Source code

def stream(
    self,
    *,
    language: NotGivenOr[str] = NOT_GIVEN,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    **kwargs: Any,
) -> "SpeechStream":
    """Create a streaming transcription session."""
    opts_language = language if is_given(language) else self._opts.language
    # Create options for the stream
    stream_opts = SimplismartSTTOptions(language=opts_language)
    # Create a fresh session for this stream to avoid conflicts
    stream_session = aiohttp.ClientSession()
    if self._api_key is None:
        raise ValueError("API key cannot be None")
    stream = SpeechStream(
        stt=self,
        opts=stream_opts,
        conn_options=conn_options,
        api_key=self._api_key,
        http_session=stream_session,
    )
    self._streams.add(stream)
    return stream

Create a streaming transcription session.
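Since the class docstring notes that streaming transcription is not publicly available at this time, a common pattern with non-streaming STT backends in LiveKit Agents is to wrap them in a VAD-based stream adapter. The sketch below assumes the StreamAdapter class from livekit.agents.stt and the silero VAD plugin; it describes typical usage outside this plugin, not part of its API.

from livekit.agents import stt as agents_stt
from livekit.plugins import silero, simplismart

# The VAD segments incoming audio, and each detected utterance is sent to
# SimpliSmart as a single non-streaming recognize request.
vad = silero.VAD.load()
streaming_stt = agents_stt.StreamAdapter(
    stt=simplismart.STT(language="en"),
    vad=vad,
)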
Inherited members
class TTS (*,
base_url: str = 'https://api.simplismart.live/tts',
model: Literal['canopylabs/orpheus-3b-0.1-ft', 'maya-research/Veena'] | str = 'canopylabs/orpheus-3b-0.1-ft',
voice: str = 'tara',
api_key: str | None = None,
http_session: aiohttp.client.ClientSession | None = None,
temperature: float = 0.7,
top_p: float = 0.9,
repetition_penalty: float = 1.5,
max_tokens: int = 1000)
Source code

class TTS(tts.TTS):
    def __init__(
        self,
        *,
        base_url: str = SIMPLISMART_BASE_URL,
        model: TTSModels | str = "canopylabs/orpheus-3b-0.1-ft",
        voice: str = "tara",
        api_key: str | None = None,
        http_session: aiohttp.ClientSession | None = None,
        temperature: float = 0.7,
        top_p: float = 0.9,
        repetition_penalty: float = 1.5,
        max_tokens: int = 1000,
    ) -> None:
        """Configuration options for SimpliSmart TTS (Text-to-Speech).

        Attributes:
            temperature (float): Controls the randomness in the model output.
                Lower values make output more deterministic.
            top_p (float): Nucleus sampling probability threshold. Limits the
                sampling pool of predicted tokens.
            repetition_penalty (float): Penalty applied to repeated text to
                reduce repetition.
            max_tokens (int): Maximum number of output tokens allowed in the
                synthesized speech.
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=False),
            sample_rate=24000,
            num_channels=1,
        )
        self._base_url = base_url
        self._model = model
        self._voice = voice
        self._api_key = api_key or os.environ.get("SIMPLISMART_API_KEY")
        if not self._api_key:
            raise ValueError("SIMPLISMART_API_KEY is not set")
        self._session = http_session
        self._opts = SimplismartTTSOptions(
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            max_tokens=max_tokens,
        )

    @property
    def model(self) -> str:
        return self._model

    @property
    def provider(self) -> str:
        return "SimpliSmart"

    def _ensure_session(self) -> aiohttp.ClientSession:
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    def synthesize(
        self,
        text: str,
        *,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> "ChunkedStream":
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
Configuration options for SimpliSmart TTS (Text-to-Speech).
Attributes
temperature (float): Controls the randomness in the model output. Lower values make output more deterministic.
top_p (float): Nucleus sampling probability threshold. Limits the sampling pool of predicted tokens.
repetition_penalty (float): Penalty applied to repeated text to reduce repetition.
max_tokens (int): Maximum number of output tokens allowed in the synthesized speech.
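As a sketch of how these options combine, the example below constructs a TTS instance; the parameter names come from the signature above, and SIMPLISMART_API_KEY is assumed to be set in the environment.

from livekit.plugins import simplismart

tts = simplismart.TTS(
    model="canopylabs/orpheus-3b-0.1-ft",
    voice="tara",
    temperature=0.5,          # lower values -> more deterministic output
    top_p=0.9,
    repetition_penalty=1.5,
    max_tokens=1000,
)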
Ancestors
- livekit.agents.tts.tts.TTS
- abc.ABC
- EventEmitter
- typing.Generic
Instance variables
prop model : str
Source code

@property
def model(self) -> str:
    return self._model

Get the model name/identifier for this TTS instance.
Returns
The model name if available, "unknown" otherwise.
Note
Plugins should override this property to provide their model information.
prop provider : str
Source code

@property
def provider(self) -> str:
    return "SimpliSmart"

Get the provider name/identifier for this TTS instance.
Returns
The provider name if available, "unknown" otherwise.
Note
Plugins should override this property to provide their provider information.
Methods
def synthesize(self,
text: str,
*,
conn_options: livekit.agents.types.APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) -> livekit.plugins.simplismart.tts.ChunkedStream
Source code

def synthesize(
    self,
    text: str,
    *,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> "ChunkedStream":
    return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
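For reference, a hedged sketch of consuming the returned ChunkedStream follows. It assumes the stream behaves as an async iterator of SynthesizedAudio events (each exposing an rtc.AudioFrame via .frame), as in the livekit.agents TTS interface, and passes an explicit aiohttp session so the example can run outside of an agent job context.

import asyncio

import aiohttp

from livekit.plugins import simplismart


async def main() -> None:
    async with aiohttp.ClientSession() as session:
        tts = simplismart.TTS(http_session=session)
        frames = []
        # Collect the synthesized audio frames as they are produced.
        async for audio in tts.synthesize("Hello from SimpliSmart!"):
            frames.append(audio.frame)


asyncio.run(main())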
Inherited members