Module livekit.plugins.sarvam
Sarvam.ai plugin for LiveKit Agents
Support for speech-to-text and text-to-speech with Sarvam.ai.
Sarvam.ai provides high-quality STT and TTS for Indian languages.
For API access, visit https://sarvam.ai/
Classes
class STT (*,
language: str,
model: SarvamSTTModels | str = 'saarika:v2.5',
api_key: str | None = None,
base_url: str = 'https://api.sarvam.ai/speech-to-text',
http_session: aiohttp.ClientSession | None = None)
Expand source code
class STT(stt.STT):
    """Sarvam.ai Speech-to-Text implementation.

    This class provides speech-to-text functionality using the Sarvam.ai API.
    Sarvam.ai specializes in high-quality STT for Indian languages.

    Args:
        language: BCP-47 language code, e.g., "hi-IN", "en-IN"
        model: The Sarvam STT model to use
        api_key: Sarvam.ai API key (falls back to SARVAM_API_KEY env var)
        base_url: API endpoint URL
        http_session: Optional aiohttp session to use
    """

    def __init__(
        self,
        *,
        language: str,
        model: SarvamSTTModels | str = "saarika:v2.5",
        api_key: str | None = None,
        base_url: str = SARVAM_STT_BASE_URL,
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        # Sarvam STT is request/response only: no streaming, no interim results.
        super().__init__(capabilities=stt.STTCapabilities(streaming=False, interim_results=False))

        self._api_key = api_key or os.environ.get("SARVAM_API_KEY")
        if not self._api_key:
            raise ValueError(
                "Sarvam API key is required. "
                "Provide it directly or set SARVAM_API_KEY environment variable."
            )

        self._opts = SarvamSTTOptions(
            language=language,
            model=model,
            api_key=self._api_key,
            base_url=base_url,
        )
        self._session = http_session
        self._logger = logger.getChild(self.__class__.__name__)

    def _ensure_session(self) -> aiohttp.ClientSession:
        # Lazily fall back to the shared session from the agents http context
        # when the caller did not supply one.
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    async def _recognize_impl(
        self,
        buffer: AudioBuffer,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        model: NotGivenOr[SarvamSTTModels | str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> stt.SpeechEvent:
        """Recognize speech using Sarvam.ai API.

        Args:
            buffer: Audio buffer containing speech data
            language: BCP-47 language code (overrides the one set in constructor)
            model: Sarvam model to use (overrides the one set in constructor)
            conn_options: Connection options for API requests

        Returns:
            A SpeechEvent containing the transcription result

        Raises:
            APIConnectionError: On network connection errors
            APIStatusError: On API errors (non-200 status)
            APITimeoutError: On API timeout
        """
        # NOT_GIVEN is a module-level sentinel; identity comparison is the
        # canonical way to detect "argument not supplied".
        opts_language = self._opts.language if language is NOT_GIVEN else language
        opts_model = self._opts.model if model is NOT_GIVEN else model

        wav_bytes = rtc.combine_audio_frames(buffer).to_wav_bytes()

        form_data = aiohttp.FormData()
        form_data.add_field("file", wav_bytes, filename="audio.wav", content_type="audio/wav")

        # Add model and language_code to the form data if specified
        # Sarvam API docs state language_code is optional for saarika:v2x but mandatory for v1
        # Model is also optional, defaults to saarika:v2.5
        if opts_language:
            form_data.add_field("language_code", opts_language)
        if opts_model:
            form_data.add_field("model", str(opts_model))

        headers = {"api-subscription-key": self._opts.api_key}

        try:
            async with self._ensure_session().post(
                url=self._opts.base_url,
                data=form_data,
                headers=headers,
                timeout=aiohttp.ClientTimeout(
                    total=conn_options.timeout,
                    sock_connect=conn_options.timeout,
                ),
            ) as res:
                if res.status != 200:
                    error_text = await res.text()
                    self._logger.error(f"Sarvam API error: {res.status} - {error_text}")
                    raise APIStatusError(
                        message=f"Sarvam API Error: {error_text}",
                        status_code=res.status,
                    )

                response_json = await res.json()
                self._logger.debug(f"Sarvam API response: {response_json}")

                transcript_text = response_json.get("transcript", "")
                request_id = response_json.get("request_id", "")
                detected_language = response_json.get("language_code")
                if not isinstance(detected_language, str):
                    detected_language = opts_language or ""

                start_time = 0.0
                end_time = 0.0
                # Try to get timestamps if available
                timestamps_data = response_json.get("timestamps")
                if timestamps_data and isinstance(timestamps_data, dict):
                    words_ts_start = timestamps_data.get("start_time_seconds")
                    words_ts_end = timestamps_data.get("end_time_seconds")
                    if isinstance(words_ts_start, list) and len(words_ts_start) > 0:
                        start_time = words_ts_start[0]
                    if isinstance(words_ts_end, list) and len(words_ts_end) > 0:
                        end_time = words_ts_end[-1]

                # If start/end times are still 0, use buffer duration as an estimate for end_time
                if start_time == 0.0 and end_time == 0.0:
                    # Calculate duration from buffer - AudioBuffer can be list[AudioFrame]
                    # or AudioFrame
                    try:
                        if isinstance(buffer, list):
                            # Calculate total duration from all frames
                            total_samples = sum(frame.samples_per_channel for frame in buffer)
                            if buffer and total_samples > 0:
                                sample_rate = buffer[0].sample_rate
                                end_time = total_samples / sample_rate
                        elif hasattr(buffer, "duration"):
                            end_time = buffer.duration / 1000.0  # buffer.duration is in ms
                        elif hasattr(buffer, "samples_per_channel") and hasattr(
                            buffer, "sample_rate"
                        ):
                            # Single AudioFrame
                            end_time = buffer.samples_per_channel / buffer.sample_rate
                    except Exception as duration_error:
                        self._logger.warning(
                            f"Could not calculate audio duration: {duration_error}"
                        )
                        end_time = 0.0

                alternatives = [
                    stt.SpeechData(
                        language=detected_language,
                        text=transcript_text,
                        start_time=start_time,
                        end_time=end_time,
                        confidence=1.0,  # Sarvam doesn't provide confidence score in this response
                    )
                ]

                return stt.SpeechEvent(
                    type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                    request_id=request_id,
                    alternatives=alternatives,
                )
        except APIStatusError:
            # FIX: previously the broad `except Exception` below caught the
            # APIStatusError raised for non-200 responses and re-wrapped it as
            # APIConnectionError, losing the status code and breaking the
            # documented contract. Re-raise it untouched instead.
            raise
        except asyncio.TimeoutError as e:
            self._logger.error(f"Sarvam API timeout: {e}")
            raise APITimeoutError("Sarvam API request timed out") from e
        except aiohttp.ClientError as e:
            self._logger.error(f"Sarvam API client error: {e}")
            raise APIConnectionError(f"Sarvam API connection error: {e}") from e
        except Exception as e:
            # Last-resort boundary: surface truly unexpected failures as a
            # connection error with the original exception chained.
            self._logger.error(f"Error during Sarvam STT processing: {e}")
            raise APIConnectionError(f"Unexpected error in Sarvam STT: {e}") from e
Sarvam.ai Speech-to-Text implementation.
This class provides speech-to-text functionality using the Sarvam.ai API. Sarvam.ai specializes in high-quality STT for Indian languages.
Args
language
- BCP-47 language code, e.g., "hi-IN", "en-IN"
model
- The Sarvam STT model to use
api_key
- Sarvam.ai API key (falls back to SARVAM_API_KEY env var)
base_url
- API endpoint URL
http_session
- Optional aiohttp session to use
Ancestors
- livekit.agents.stt.stt.STT
- abc.ABC
- EventEmitter
- typing.Generic
Inherited members
class TTS (*,
target_language_code: SarvamTTSLanguages | str,
model: SarvamTTSModels | str = 'bulbul:v2',
speaker: SarvamTTSSpeakers | str = 'anushka',
speech_sample_rate: int = 22050,
num_channels: int = 1,
pitch: float = 0.0,
pace: float = 1.0,
loudness: float = 1.0,
enable_preprocessing: bool = False,
api_key: str | None = None,
base_url: str = 'https://api.sarvam.ai/text-to-speech',
http_session: aiohttp.ClientSession | None = None)
Expand source code
class TTS(tts.TTS):
    """Sarvam.ai Text-to-Speech implementation.

    This class provides text-to-speech functionality using the Sarvam.ai API.
    Sarvam.ai specializes in high-quality TTS for Indian languages.

    Args:
        target_language_code: BCP-47 language code for supported Indian languages
        model: Sarvam TTS model to use (only bulbul:v2 supported)
        speaker: Voice to use for synthesis
        speech_sample_rate: Audio sample rate in Hz
        num_channels: Number of audio channels (Sarvam outputs mono)
        pitch: Voice pitch adjustment (-20.0 to 20.0)
        pace: Speech rate multiplier (0.5 to 2.0)
        loudness: Volume multiplier (0.5 to 2.0)
        enable_preprocessing: Whether to use text preprocessing
        api_key: Sarvam.ai API key (required)
        base_url: API endpoint URL
        http_session: Optional aiohttp session to use
    """

    def __init__(
        self,
        *,
        target_language_code: SarvamTTSLanguages | str,
        model: SarvamTTSModels | str = "bulbul:v2",
        speaker: SarvamTTSSpeakers | str = "anushka",
        speech_sample_rate: int = 22050,
        num_channels: int = 1,  # Sarvam output is mono WAV
        pitch: float = 0.0,
        pace: float = 1.0,
        loudness: float = 1.0,
        enable_preprocessing: bool = False,
        api_key: str | None = None,
        base_url: str = SARVAM_TTS_BASE_URL,
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        # Non-streaming synthesis: the whole audio clip is produced per request.
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=False),
            sample_rate=speech_sample_rate,
            num_channels=num_channels,
        )

        # Explicit key wins; otherwise fall back to the environment.
        resolved_key = api_key or os.environ.get("SARVAM_API_KEY")
        if not resolved_key:
            raise ValueError(
                "Sarvam API key is required. Provide it directly or set SARVAM_API_KEY env var."
            )
        self._api_key = resolved_key

        # Validate model-speaker compatibility
        if not validate_model_speaker_compatibility(model, speaker):
            allowed = MODEL_SPEAKER_COMPATIBILITY.get(model, {}).get("all", [])
            raise ValueError(
                f"Speaker '{speaker}' is not compatible with model '{model}'. "
                f"Please choose a compatible speaker from: {', '.join(allowed)}"
            )

        self._opts = SarvamTTSOptions(
            target_language_code=target_language_code,
            model=model,
            speaker=speaker,
            speech_sample_rate=speech_sample_rate,
            pitch=pitch,
            pace=pace,
            loudness=loudness,
            enable_preprocessing=enable_preprocessing,
            api_key=self._api_key,
            base_url=base_url,
        )
        self._session = http_session

    def _ensure_session(self) -> aiohttp.ClientSession:
        # Reuse the caller-provided session when present; otherwise borrow the
        # shared session from the agents http context.
        if self._session:
            return self._session
        self._session = utils.http_context.http_session()
        return self._session

    # Implement the abstract synthesize method
    def synthesize(
        self, text: str, *, conn_options: APIConnectOptions | None = None
    ) -> ChunkedStream:
        """Synthesize text to audio using Sarvam.ai TTS API."""
        effective_opts = DEFAULT_API_CONNECT_OPTIONS if conn_options is None else conn_options
        return ChunkedStream(tts=self, input_text=text, conn_options=effective_opts)
Sarvam.ai Text-to-Speech implementation.
This class provides text-to-speech functionality using the Sarvam.ai API. Sarvam.ai specializes in high-quality TTS for Indian languages.
Args
target_language_code
- BCP-47 language code for supported Indian languages
model
- Sarvam TTS model to use (only bulbul:v2 supported)
speaker
- Voice to use for synthesis
speech_sample_rate
- Audio sample rate in Hz
num_channels
- Number of audio channels (Sarvam outputs mono)
pitch
- Voice pitch adjustment (-20.0 to 20.0)
pace
- Speech rate multiplier (0.5 to 2.0)
loudness
- Volume multiplier (0.5 to 2.0)
enable_preprocessing
- Whether to use text preprocessing
api_key
- Sarvam.ai API key (required)
base_url
- API endpoint URL
http_session
- Optional aiohttp session to use
Ancestors
- livekit.agents.tts.tts.TTS
- abc.ABC
- EventEmitter
- typing.Generic
Methods
def synthesize(self, text: str, *, conn_options: APIConnectOptions | None = None) -> livekit.plugins.sarvam.tts.ChunkedStream
-
Expand source code
def synthesize( self, text: str, *, conn_options: APIConnectOptions | None = None ) -> ChunkedStream: """Synthesize text to audio using Sarvam.ai TTS API.""" if conn_options is None: conn_options = DEFAULT_API_CONNECT_OPTIONS return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
Synthesize text to audio using Sarvam.ai TTS API.
Inherited members