Module livekit.plugins.azure

Classes

class STT (*, speech_key: str | None = None, speech_region: str | None = None, sample_rate: int = 48000, num_channels: int = 1, languages: list[str] = [])

Speech-to-text (STT) backed by the Azure Speech service. Streaming recognition with interim results is supported; single-frame recognition is not.

Create a new instance of Azure STT.

speech_key and speech_region must be set, either as arguments or via the AZURE_SPEECH_KEY and AZURE_SPEECH_REGION environment variables, respectively.
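For instance, a minimal construction sketch (the key and region values are placeholders; pinning languages is optional):

from livekit.plugins import azure

# option 1: credentials taken from the environment
# (assumes AZURE_SPEECH_KEY / AZURE_SPEECH_REGION are already exported)
stt = azure.STT()

# option 2: explicit credentials, with recognition languages pinned
stt = azure.STT(
    speech_key="<your-speech-key>",  # placeholder
    speech_region="<your-region>",   # placeholder, e.g. "westus2"
    languages=["en-US"],             # empty list -> auto-detect
)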

Source code
class STT(stt.STT):
    def __init__(
        self,
        *,
        speech_key: str | None = None,
        speech_region: str | None = None,
        sample_rate: int = 48000,
        num_channels: int = 1,
        languages: list[str] = [],  # when empty, auto-detect the language
    ):
        """
        Create a new instance of Azure STT.

        ``speech_key`` and ``speech_region`` must be set, either using arguments or by setting the
        ``AZURE_SPEECH_KEY`` and ``AZURE_SPEECH_REGION`` environment variables, respectively.
        """

        super().__init__(
            capabilities=stt.STTCapabilities(streaming=True, interim_results=True)
        )

        speech_key = speech_key or os.environ.get("AZURE_SPEECH_KEY")
        if not speech_key:
            raise ValueError("AZURE_SPEECH_KEY must be set")

        speech_region = speech_region or os.environ.get("AZURE_SPEECH_REGION")
        if not speech_region:
            raise ValueError("AZURE_SPEECH_REGION must be set")

        self._config = STTOptions(
            speech_key=speech_key,
            speech_region=speech_region,
            languages=languages,
            sample_rate=sample_rate,
            num_channels=num_channels,
        )

    async def recognize(
        self, buffer: utils.AudioBuffer, *, language: str | None = None
    ) -> stt.SpeechEvent:
        raise NotImplementedError("Azure STT does not support single frame recognition")

    def stream(self, *, language: str | None = None) -> "SpeechStream":
        return SpeechStream(self._config)

Ancestors

stt.STT

Methods

async def recognize(self, buffer: utils.AudioBuffer, *, language: str | None = None) ‑> SpeechEvent

Not supported: Azure STT raises NotImplementedError for single-frame recognition; use stream() instead.

def stream(self, *, language: str | None = None) ‑> livekit.plugins.azure.stt.SpeechStream

Open a streaming recognition session using the options supplied to the constructor.
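A usage sketch for the streaming path. This assumes the push_frame()/end_input() interface and async event iteration of the livekit-agents stt.SpeechStream base class, and the frames argument is a stand-in for audio captured elsewhere:

import asyncio

from livekit import rtc
from livekit.agents import stt
from livekit.plugins import azure

async def transcribe(frames: list[rtc.AudioFrame]) -> None:
    speech_stream = azure.STT().stream()  # credentials from the environment

    async def feed() -> None:
        for frame in frames:  # 16-bit PCM, matching the configured sample rate/channels
            speech_stream.push_frame(frame)
        speech_stream.end_input()  # no more audio: lets Azure flush final results

    feed_task = asyncio.create_task(feed())
    async for event in speech_stream:
        if event.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
            print(event.alternatives[0].text)
    await feed_task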


class SpeechStream (opts: STTOptions)

Streaming recognition session backed by the Azure Speech SDK. Audio frames pushed into the stream are forwarded to a continuous recognizer, and the SDK's callbacks are translated into START_OF_SPEECH, INTERIM_TRANSCRIPT, FINAL_TRANSCRIPT, and END_OF_SPEECH events.

Source code
class SpeechStream(stt.SpeechStream):
    def __init__(self, opts: STTOptions) -> None:
        super().__init__()
        self._opts = opts
        self._speaking = False

        self._stream = speechsdk.audio.PushAudioInputStream(
            stream_format=speechsdk.audio.AudioStreamFormat(
                samples_per_second=self._opts.sample_rate,
                bits_per_sample=16,
                channels=self._opts.num_channels,
            )
        )
        self._recognizer = _create_speech_recognizer(
            config=self._opts, stream=self._stream
        )
        self._recognizer.recognizing.connect(self._on_recognizing)
        self._recognizer.recognized.connect(self._on_recognized)
        self._recognizer.speech_start_detected.connect(self._on_speech_start)
        self._recognizer.speech_end_detected.connect(self._on_speech_end)
        self._recognizer.session_stopped.connect(self._on_session_stopped)
        self._recognizer.start_continuous_recognition()
        self._done_event = asyncio.Event()
        # Azure SDK callbacks fire on SDK worker threads; keep the running
        # loop so events can be marshalled back via call_soon_threadsafe
        self._loop = asyncio.get_running_loop()

    @utils.log_exceptions(logger=logger)
    async def _main_task(self) -> None:
        try:
            async for input in self._input_ch:
                if isinstance(input, rtc.AudioFrame):
                    self._stream.write(input.data.tobytes())

            # input exhausted: close the push stream so Azure flushes any
            # pending results, then wait for the session_stopped callback
            self._stream.close()
            await self._done_event.wait()
        finally:

            def _cleanup():
                self._recognizer.stop_continuous_recognition()
                del self._recognizer

            await asyncio.to_thread(_cleanup)

    def _on_recognized(self, evt: speechsdk.SpeechRecognitionEventArgs):
        detected_lg = speechsdk.AutoDetectSourceLanguageResult(evt.result).language
        text = evt.result.text.strip()
        if not text:
            return

        final_data = stt.SpeechData(
            language=detected_lg, confidence=1.0, text=evt.result.text
        )

        self._threadsafe_send(
            stt.SpeechEvent(
                type=stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives=[final_data]
            )
        )

    def _on_recognizing(self, evt: speechsdk.SpeechRecognitionEventArgs):
        detected_lg = speechsdk.AutoDetectSourceLanguageResult(evt.result).language
        text = evt.result.text.strip()
        if not text:
            return

        interim_data = stt.SpeechData(
            language=detected_lg, confidence=0.0, text=evt.result.text
        )

        self._threadsafe_send(
            stt.SpeechEvent(
                type=stt.SpeechEventType.INTERIM_TRANSCRIPT, alternatives=[interim_data]
            )
        )

    def _on_speech_start(self, evt: speechsdk.SpeechRecognitionEventArgs):
        if self._speaking:
            return

        self._speaking = True
        self._threadsafe_send(stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH))

    def _on_speech_end(self, evt: speechsdk.SpeechRecognitionEventArgs):
        if not self._speaking:
            return

        self._speaking = False
        self._threadsafe_send(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))

    def _on_session_stopped(self, evt: speechsdk.SpeechRecognitionEventArgs):
        self._loop.call_soon_threadsafe(self._done_event.set)

    def _threadsafe_send(self, evt: stt.SpeechEvent | None):
        self._loop.call_soon_threadsafe(self._event_ch.send_nowait, evt)

Ancestors

stt.SpeechStream


class TTS (*, speech_key: str | None = None, speech_region: str | None = None, voice: str | None = None, endpoint_id: str | None = None)

Text-to-speech (TTS) backed by the Azure Speech service. Synthesis is non-streaming (chunked) at a fixed sample rate and channel count.

Create a new instance of Azure TTS.

speech_key and speech_region must be set, either as arguments or via the AZURE_SPEECH_KEY and AZURE_SPEECH_REGION environment variables, respectively.
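A minimal construction sketch (the voice name is illustrative; any Azure neural voice identifier works, and voice may be omitted entirely):

from livekit.plugins import azure

# credentials from AZURE_SPEECH_KEY / AZURE_SPEECH_REGION
tts = azure.TTS(voice="en-US-JennyNeural")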

Source code
class TTS(tts.TTS):
    def __init__(
        self,
        *,
        speech_key: str | None = None,
        speech_region: str | None = None,
        voice: str | None = None,
        endpoint_id: str | None = None,
    ) -> None:
        """
        Create a new instance of Azure TTS.

        ``speech_key`` and ``speech_region`` must be set, either using arguments or by setting the
        ``AZURE_SPEECH_KEY`` and ``AZURE_SPEECH_REGION`` environment variables, respectively.
        """

        super().__init__(
            capabilities=tts.TTSCapabilities(
                streaming=False,
            ),
            sample_rate=AZURE_SAMPLE_RATE,
            num_channels=AZURE_NUM_CHANNELS,
        )

        speech_key = speech_key or os.environ.get("AZURE_SPEECH_KEY")
        if not speech_key:
            raise ValueError("AZURE_SPEECH_KEY must be set")

        speech_region = speech_region or os.environ.get("AZURE_SPEECH_REGION")
        if not speech_region:
            raise ValueError("AZURE_SPEECH_REGION must be set")

        self._opts = _TTSOptions(
            speech_key=speech_key,
            speech_region=speech_region,
            voice=voice,
            endpoint_id=endpoint_id,
        )

    def synthesize(self, text: str) -> "ChunkedStream":
        return ChunkedStream(text, self._opts)

Ancestors

tts.TTS

Methods

def synthesize(self, text: str) ‑> livekit.plugins.azure.tts.ChunkedStream

Synthesize text and return a ChunkedStream of audio as it is produced.
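A usage sketch, assuming ChunkedStream follows the livekit-agents tts interface and can be iterated asynchronously as audio chunks arrive; the handling of each chunk is left as a placeholder:

import asyncio

from livekit.plugins import azure

async def speak() -> None:
    tts = azure.TTS()  # credentials from the environment
    async for chunk in tts.synthesize("Hello from Azure"):
        ...  # e.g. push the chunk's audio frame to an rtc.AudioSource

asyncio.run(speak())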