Module livekit.plugins.azure
Classes
class STT (*, speech_key: str | None = None, speech_region: str | None = None, speech_host: str | None = None, sample_rate: int = 16000, num_channels: int = 1, segmentation_silence_timeout_ms: int | None = None, segmentation_max_time_ms: int | None = None, segmentation_strategy: str | None = None, languages: list[str] = [])
-
Helper class that provides a standard way to create an ABC using inheritance.
Create a new instance of Azure STT.
Either speech_host, or speech_key and speech_region, must be set — either using arguments or by setting the AZURE_SPEECH_HOST, AZURE_SPEECH_KEY and AZURE_SPEECH_REGION environment variables, respectively.

Expand source code
class STT(stt.STT): def __init__( self, *, speech_key: str | None = None, speech_region: str | None = None, speech_host: str | None = None, sample_rate: int = 16000, num_channels: int = 1, segmentation_silence_timeout_ms: int | None = None, segmentation_max_time_ms: int | None = None, segmentation_strategy: str | None = None, languages: list[str] = [], # when empty, auto-detect the language ): """ Create a new instance of Azure STT. Either ``speech_host`` or ``speech_key`` and ``speech_region`` must be set, either using arguments or by setting the ``AZURE_SPEECH_HOST``, ``AZURE_SPEECH_KEY`` and ``AZURE_SPEECH_REGION`` environmental variables, respectively. """ super().__init__( capabilities=stt.STTCapabilities(streaming=True, interim_results=True) ) speech_host = speech_host or os.environ.get("AZURE_SPEECH_HOST") speech_key = speech_key or os.environ.get("AZURE_SPEECH_KEY") speech_region = speech_region or os.environ.get("AZURE_SPEECH_REGION") if not speech_host and (not speech_key or not speech_region): raise ValueError( "AZURE_SPEECH_HOST or AZURE_SPEECH_KEY and AZURE_SPEECH_REGION must be set" ) self._config = STTOptions( speech_key=speech_key, speech_region=speech_region, speech_host=speech_host, languages=languages, sample_rate=sample_rate, num_channels=num_channels, segmentation_silence_timeout_ms=segmentation_silence_timeout_ms, segmentation_max_time_ms=segmentation_max_time_ms, segmentation_strategy=segmentation_strategy, ) async def _recognize_impl( self, buffer: utils.AudioBuffer, *, language: str | None = None ) -> stt.SpeechEvent: raise NotImplementedError("Azure STT does not support single frame recognition") def stream(self, *, language: str | None = None) -> "SpeechStream": return SpeechStream(self, self._config)
Ancestors
- STT
- abc.ABC
- EventEmitter
- typing.Generic
Methods
def stream(self, *, language: str | None = None) ‑> livekit.plugins.azure.stt.SpeechStream
Inherited members
class SpeechStream (stt: STT, opts: STTOptions)
-
Helper class that provides a standard way to create an ABC using inheritance.
Args:
    sample_rate (int | None, optional): The desired sample rate for the audio input. If specified, the audio input will be automatically resampled to match the given sample rate before being processed for Speech-to-Text. If not provided (None), the input will retain its original sample rate.
Expand source code
class SpeechStream(stt.SpeechStream):
    def __init__(self, stt: STT, opts: STTOptions) -> None:
        """Streaming recognizer backed by the Azure Speech SDK's
        continuous-recognition API.

        Audio frames received on the input channel are forwarded to a
        ``PushAudioInputStream``; SDK callbacks fire on SDK-owned worker
        threads and are marshalled back onto the asyncio event loop via
        ``_threadsafe_send``.
        """
        super().__init__(stt, sample_rate=opts.sample_rate)
        self._opts = opts
        # Tracks speaking state so START/END_OF_SPEECH events are emitted
        # exactly once per utterance, even if the SDK fires duplicates.
        self._speaking = False
        self._stream = speechsdk.audio.PushAudioInputStream(
            stream_format=speechsdk.audio.AudioStreamFormat(
                samples_per_second=self._opts.sample_rate,
                bits_per_sample=16,
                channels=self._opts.num_channels,
            )
        )
        self._recognizer = _create_speech_recognizer(
            config=self._opts, stream=self._stream
        )
        # NOTE: these handlers run on Azure SDK threads, NOT the asyncio
        # loop — they must never touch loop state directly.
        self._recognizer.recognizing.connect(self._on_recognizing)
        self._recognizer.recognized.connect(self._on_recognized)
        self._recognizer.speech_start_detected.connect(self._on_speech_start)
        self._recognizer.speech_end_detected.connect(self._on_speech_end)
        self._recognizer.session_stopped.connect(self._on_session_stopped)
        self._recognizer.start_continuous_recognition()
        # Set (thread-safely) once the SDK reports the session has stopped.
        self._done_event = asyncio.Event()
        self._loop = asyncio.get_running_loop()

    @utils.log_exceptions(logger=logger)
    async def _main_task(self) -> None:
        try:
            # Pump audio frames from the input channel into the SDK stream.
            async for input in self._input_ch:
                if isinstance(input, rtc.AudioFrame):
                    self._stream.write(input.data.tobytes())
            # Input exhausted: close the push stream so the SDK flushes any
            # pending results, then wait for session_stopped to fire.
            self._stream.close()
            await self._done_event.wait()
        finally:

            def _cleanup():
                # stop_continuous_recognition is a blocking SDK call, so it
                # is run in a worker thread to avoid stalling the loop.
                self._recognizer.stop_continuous_recognition()
                del self._recognizer

            await asyncio.to_thread(_cleanup)

    def _on_recognized(self, evt: speechsdk.SpeechRecognitionEventArgs):
        # Final transcript for a completed utterance (SDK thread).
        detected_lg = speechsdk.AutoDetectSourceLanguageResult(evt.result).language
        text = evt.result.text.strip()
        if not text:
            # skip empty results so downstream never sees blank transcripts
            return
        final_data = stt.SpeechData(
            language=detected_lg, confidence=1.0, text=evt.result.text
        )
        self._threadsafe_send(
            stt.SpeechEvent(
                type=stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives=[final_data]
            )
        )

    def _on_recognizing(self, evt: speechsdk.SpeechRecognitionEventArgs):
        # Interim (partial) transcript while the user is still speaking
        # (SDK thread); confidence 0.0 marks it as non-final.
        detected_lg = speechsdk.AutoDetectSourceLanguageResult(evt.result).language
        text = evt.result.text.strip()
        if not text:
            return
        interim_data = stt.SpeechData(
            language=detected_lg, confidence=0.0, text=evt.result.text
        )
        self._threadsafe_send(
            stt.SpeechEvent(
                type=stt.SpeechEventType.INTERIM_TRANSCRIPT, alternatives=[interim_data]
            )
        )

    def _on_speech_start(self, evt: speechsdk.SpeechRecognitionEventArgs):
        # Debounced: only the first start since the last end is reported.
        if self._speaking:
            return
        self._speaking = True
        self._threadsafe_send(stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH))

    def _on_speech_end(self, evt: speechsdk.SpeechRecognitionEventArgs):
        # Debounced: ignored unless a START_OF_SPEECH was emitted first.
        if not self._speaking:
            return
        self._speaking = False
        self._threadsafe_send(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))

    def _on_session_stopped(self, evt: speechsdk.SpeechRecognitionEventArgs):
        # Unblocks _main_task's wait; must hop to the loop thread first.
        self._loop.call_soon_threadsafe(self._done_event.set)

    def _threadsafe_send(self, evt: stt.SpeechEvent):
        # Bridge from SDK worker threads to the asyncio event channel.
        self._loop.call_soon_threadsafe(self._event_ch.send_nowait, evt)
Ancestors
- SpeechStream
- abc.ABC
Inherited members
class TTS (*, voice: str | None = None, language: str | None = None, prosody: ProsodyConfig | None = None, speech_key: str | None = None, speech_region: str | None = None, speech_host: str | None = None, endpoint_id: str | None = None)
-
Helper class that provides a standard way to create an ABC using inheritance.
Create a new instance of Azure TTS.
speech_key and speech_region must be set — either using arguments or by setting the AZURE_SPEECH_KEY and AZURE_SPEECH_REGION environment variables, respectively.

Expand source code
class TTS(tts.TTS): def __init__( self, *, voice: str | None = None, language: str | None = None, prosody: ProsodyConfig | None = None, speech_key: str | None = None, speech_region: str | None = None, speech_host: str | None = None, endpoint_id: str | None = None, ) -> None: """ Create a new instance of Azure TTS. ``speech_key`` and ``speech_region`` must be set, either using arguments or by setting the ``AZURE_SPEECH_KEY`` and ``AZURE_SPEECH_REGION`` environmental variables, respectively. """ super().__init__( capabilities=tts.TTSCapabilities( streaming=False, ), sample_rate=AZURE_SAMPLE_RATE, num_channels=AZURE_NUM_CHANNELS, ) speech_host = speech_host or os.environ.get("AZURE_SPEECH_HOST") speech_key = speech_key or os.environ.get("AZURE_SPEECH_KEY") speech_region = speech_region or os.environ.get("AZURE_SPEECH_REGION") if not speech_host and not (speech_key and speech_region): raise ValueError( "AZURE_SPEECH_HOST or AZURE_SPEECH_KEY and AZURE_SPEECH_REGION must be set" ) if prosody: prosody.validate() self._opts = _TTSOptions( speech_key=speech_key, speech_region=speech_region, voice=voice, endpoint_id=endpoint_id, language=language, prosody=prosody, ) def update_options( self, *, voice: str | None = None, language: str | None = None, prosody: ProsodyConfig | None = None, ) -> None: self._opts.voice = voice or self._opts.voice self._opts.language = language or self._opts.language self._opts.prosody = prosody or self._opts.prosody def synthesize(self, text: str) -> "ChunkedStream": return ChunkedStream(self, text, self._opts)
Ancestors
- TTS
- abc.ABC
- EventEmitter
- typing.Generic
Methods
def synthesize(self, text: str) ‑> livekit.plugins.azure.tts.ChunkedStream
def update_options(self, *, voice: str | None = None, language: str | None = None, prosody: ProsodyConfig | None = None) ‑> None
Inherited members