Module livekit.plugins.azure
Classes
class STT (*, speech_key: str | None = None, speech_region: str | None = None, sample_rate: int = 48000, num_channels: int = 1, languages: list[str] = [])
-
Helper class that provides a standard way to create an ABC using inheritance.
Create a new instance of Azure STT.
`speech_key` and `speech_region` must be set, either using arguments or by
setting the `AZURE_SPEECH_KEY` and `AZURE_SPEECH_REGION` environment
variables, respectively.

Expand source code
class STT(stt.STT):
    def __init__(
        self,
        *,
        speech_key: str | None = None,
        speech_region: str | None = None,
        sample_rate: int = 48000,
        num_channels: int = 1,
        languages: list[str] | None = None,  # when None or empty, auto-detect the language
    ):
        """
        Create a new instance of Azure STT.

        ``speech_key`` and ``speech_region`` must be set, either using arguments or
        by setting the ``AZURE_SPEECH_KEY`` and ``AZURE_SPEECH_REGION``
        environmental variables, respectively.

        Args:
            speech_key: Azure Speech subscription key; falls back to the
                ``AZURE_SPEECH_KEY`` environment variable.
            speech_region: Azure Speech region; falls back to the
                ``AZURE_SPEECH_REGION`` environment variable.
            sample_rate: input audio sample rate in Hz.
            num_channels: number of input audio channels.
            languages: candidate languages for recognition; when omitted or
                empty the service auto-detects the language.

        Raises:
            ValueError: if the key or region cannot be resolved.
        """
        super().__init__(
            capabilities=stt.STTCapabilities(streaming=True, interim_results=True)
        )

        # `or` (not `is None`) so an empty-string argument also falls back to the env.
        speech_key = speech_key or os.environ.get("AZURE_SPEECH_KEY")
        if not speech_key:
            raise ValueError("AZURE_SPEECH_KEY must be set")

        speech_region = speech_region or os.environ.get("AZURE_SPEECH_REGION")
        if not speech_region:
            raise ValueError("AZURE_SPEECH_REGION must be set")

        # NOTE: the previous signature used a mutable default (`languages=[]`),
        # which is shared across calls; a None sentinel avoids that pitfall.
        self._config = STTOptions(
            speech_key=speech_key,
            speech_region=speech_region,
            languages=[] if languages is None else languages,
            sample_rate=sample_rate,
            num_channels=num_channels,
        )

    async def recognize(
        self, buffer: utils.AudioBuffer, *, language: str | None = None
    ) -> stt.SpeechEvent:
        # Azure's continuous-recognition API is stream-oriented; one-shot
        # recognition of a single buffer is intentionally unsupported.
        raise NotImplementedError("Azure STT does not support single frame recognition")

    def stream(self, *, language: str | None = None) -> "SpeechStream":
        """Open a new streaming recognition session using this STT's options."""
        return SpeechStream(self._config)
Ancestors
- STT
- abc.ABC
Methods
async def recognize(self, buffer: utils.AudioBuffer, *, language: str | None = None) ‑> SpeechEvent
def stream(self, *, language: str | None = None) ‑> livekit.plugins.azure.stt.SpeechStream
Inherited members
class SpeechStream (opts: STTOptions)
-
Helper class that provides a standard way to create an ABC using inheritance.
Expand source code
class SpeechStream(stt.SpeechStream):
    """Streaming speech-to-text session backed by Azure continuous recognition.

    Audio frames pushed into the stream are forwarded to an Azure
    ``PushAudioInputStream``; recognition results arrive via SDK callbacks
    (which run on SDK-owned threads) and are marshalled back onto the asyncio
    event loop with ``call_soon_threadsafe``.
    """

    def __init__(self, opts: STTOptions) -> None:
        super().__init__()
        self._opts = opts
        # Tracks whether we are currently inside a speech segment, so that
        # START/END_OF_SPEECH events are emitted exactly once per segment.
        self._speaking = False
        # Push stream the SDK pulls PCM16 audio from; format must match the
        # frames written in _main_task.
        self._stream = speechsdk.audio.PushAudioInputStream(
            stream_format=speechsdk.audio.AudioStreamFormat(
                samples_per_second=self._opts.sample_rate,
                bits_per_sample=16,
                channels=self._opts.num_channels,
            )
        )
        self._recognizer = _create_speech_recognizer(
            config=self._opts, stream=self._stream
        )
        # Wire up SDK callbacks. NOTE: these fire on Azure SDK threads, not on
        # the asyncio loop — hence _threadsafe_send below.
        self._recognizer.recognizing.connect(self._on_recognizing)
        self._recognizer.recognized.connect(self._on_recognized)
        self._recognizer.speech_start_detected.connect(self._on_speech_start)
        self._recognizer.speech_end_detected.connect(self._on_speech_end)
        self._recognizer.session_stopped.connect(self._on_session_stopped)
        self._recognizer.start_continuous_recognition()
        # Set (threadsafely) when the SDK session stops; awaited in _main_task.
        self._done_event = asyncio.Event()
        self._loop = asyncio.get_running_loop()

    @utils.log_exceptions(logger=logger)
    async def _main_task(self) -> None:
        """Forward incoming audio frames to Azure, then wait for session end."""
        try:
            async for input in self._input_ch:
                if isinstance(input, rtc.AudioFrame):
                    self._stream.write(input.data.tobytes())

            # Input exhausted: closing the push stream signals end-of-audio to
            # the SDK; wait until it reports the session stopped.
            self._stream.close()
            await self._done_event.wait()
        finally:

            def _cleanup():
                # stop_continuous_recognition blocks, so run it off-loop.
                self._recognizer.stop_continuous_recognition()
                del self._recognizer

            await asyncio.to_thread(_cleanup)

    def _on_recognized(self, evt: speechsdk.SpeechRecognitionEventArgs):
        """SDK callback: a final recognition result is available."""
        detected_lg = speechsdk.AutoDetectSourceLanguageResult(evt.result).language
        text = evt.result.text.strip()
        if not text:
            return  # ignore empty results

        final_data = stt.SpeechData(
            language=detected_lg, confidence=1.0, text=evt.result.text
        )
        self._threadsafe_send(
            stt.SpeechEvent(
                type=stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives=[final_data]
            )
        )

    def _on_recognizing(self, evt: speechsdk.SpeechRecognitionEventArgs):
        """SDK callback: an interim (partial) recognition result is available."""
        detected_lg = speechsdk.AutoDetectSourceLanguageResult(evt.result).language
        text = evt.result.text.strip()
        if not text:
            return  # ignore empty results

        interim_data = stt.SpeechData(
            language=detected_lg, confidence=0.0, text=evt.result.text
        )
        self._threadsafe_send(
            stt.SpeechEvent(
                type=stt.SpeechEventType.INTERIM_TRANSCRIPT, alternatives=[interim_data]
            )
        )

    def _on_speech_start(self, evt: speechsdk.SpeechRecognitionEventArgs):
        """SDK callback: speech detected — emit START_OF_SPEECH once."""
        if self._speaking:
            return
        self._speaking = True
        self._threadsafe_send(stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH))

    def _on_speech_end(self, evt: speechsdk.SpeechRecognitionEventArgs):
        """SDK callback: speech ended — emit END_OF_SPEECH once."""
        if not self._speaking:
            return
        self._speaking = False
        self._threadsafe_send(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))

    def _on_session_stopped(self, evt: speechsdk.SpeechRecognitionEventArgs):
        """SDK callback: session over — unblock _main_task."""
        self._loop.call_soon_threadsafe(self._done_event.set)

    def _threadsafe_send(self, evt: stt.SpeechEvent | None):
        # Called from SDK threads; hop onto the asyncio loop before touching
        # the event channel.
        self._loop.call_soon_threadsafe(self._event_ch.send_nowait, evt)
Ancestors
- SpeechStream
- abc.ABC
Inherited members
class TTS (*, speech_key: str | None = None, speech_region: str | None = None, voice: str | None = None, endpoint_id: str | None = None)
-
Helper class that provides a standard way to create an ABC using inheritance.
Create a new instance of Azure TTS.
`speech_key` and `speech_region` must be set, either using arguments or by
setting the `AZURE_SPEECH_KEY` and `AZURE_SPEECH_REGION` environment
variables, respectively.

Expand source code
class TTS(tts.TTS):
    def __init__(
        self,
        *,
        speech_key: str | None = None,
        speech_region: str | None = None,
        voice: str | None = None,
        endpoint_id: str | None = None,
    ) -> None:
        """
        Create a new instance of Azure TTS.

        ``speech_key`` and ``speech_region`` must be set, either using arguments or
        by setting the ``AZURE_SPEECH_KEY`` and ``AZURE_SPEECH_REGION``
        environmental variables, respectively.
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=False),
            sample_rate=AZURE_SAMPLE_RATE,
            num_channels=AZURE_NUM_CHANNELS,
        )

        # Resolve credentials, falling back to the environment when an
        # argument is missing or empty, and fail fast if still unset.
        speech_key = speech_key or os.environ.get("AZURE_SPEECH_KEY")
        speech_region = speech_region or os.environ.get("AZURE_SPEECH_REGION")

        if not speech_key:
            raise ValueError("AZURE_SPEECH_KEY must be set")
        if not speech_region:
            raise ValueError("AZURE_SPEECH_REGION must be set")

        self._opts = _TTSOptions(
            speech_key=speech_key,
            speech_region=speech_region,
            voice=voice,
            endpoint_id=endpoint_id,
        )

    def synthesize(self, text: str) -> "ChunkedStream":
        """Synthesize *text*, returning a chunked audio stream."""
        return ChunkedStream(text, self._opts)
Ancestors
- TTS
- abc.ABC
Methods
def synthesize(self, text: str) ‑> livekit.plugins.azure.tts.ChunkedStream