Module livekit.plugins.azure

Classes

class STT (*, speech_key: str | None = None, speech_region: str | None = None, speech_host: str | None = None, sample_rate: int = 16000, num_channels: int = 1, segmentation_silence_timeout_ms: int | None = None, segmentation_max_time_ms: int | None = None, segmentation_strategy: str | None = None, languages: list[str] = [])


Create a new instance of Azure STT.

Either speech_host, or both speech_key and speech_region, must be set, either using arguments or by setting the AZURE_SPEECH_HOST, AZURE_SPEECH_KEY and AZURE_SPEECH_REGION environment variables, respectively.
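
For example, a minimal construction sketch (assuming the plugin is installed as livekit-plugins-azure; the environment-variable form and the explicit form are equivalent):

import os
from livekit.plugins import azure

# Relies on AZURE_SPEECH_KEY / AZURE_SPEECH_REGION being exported.
stt = azure.STT()

# Or pass credentials explicitly and pin recognition to one language
# instead of auto-detecting.
stt = azure.STT(
    speech_key=os.environ["AZURE_SPEECH_KEY"],
    speech_region=os.environ["AZURE_SPEECH_REGION"],
    languages=["en-US"],
)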

Source code
class STT(stt.STT):
    def __init__(
        self,
        *,
        speech_key: str | None = None,
        speech_region: str | None = None,
        speech_host: str | None = None,
        sample_rate: int = 16000,
        num_channels: int = 1,
        segmentation_silence_timeout_ms: int | None = None,
        segmentation_max_time_ms: int | None = None,
        segmentation_strategy: str | None = None,
        languages: list[str] = [],  # when empty, auto-detect the language
    ):
        """
        Create a new instance of Azure STT.

        Either ``speech_host``, or both ``speech_key`` and ``speech_region``, must be
        set, either using arguments or by setting the ``AZURE_SPEECH_HOST``,
        ``AZURE_SPEECH_KEY`` and ``AZURE_SPEECH_REGION`` environment variables, respectively.
        """

        super().__init__(
            capabilities=stt.STTCapabilities(streaming=True, interim_results=True)
        )
        speech_host = speech_host or os.environ.get("AZURE_SPEECH_HOST")
        speech_key = speech_key or os.environ.get("AZURE_SPEECH_KEY")
        speech_region = speech_region or os.environ.get("AZURE_SPEECH_REGION")

        if not speech_host and (not speech_key or not speech_region):
            raise ValueError(
                "AZURE_SPEECH_HOST or AZURE_SPEECH_KEY and AZURE_SPEECH_REGION must be set"
            )

        self._config = STTOptions(
            speech_key=speech_key,
            speech_region=speech_region,
            speech_host=speech_host,
            languages=languages,
            sample_rate=sample_rate,
            num_channels=num_channels,
            segmentation_silence_timeout_ms=segmentation_silence_timeout_ms,
            segmentation_max_time_ms=segmentation_max_time_ms,
            segmentation_strategy=segmentation_strategy,
        )

    async def _recognize_impl(
        self, buffer: utils.AudioBuffer, *, language: str | None = None
    ) -> stt.SpeechEvent:
        raise NotImplementedError("Azure STT does not support single frame recognition")

    def stream(self, *, language: str | None = None) -> "SpeechStream":
        # The per-call ``language`` argument is currently unused; recognition
        # languages come from the ``languages`` option passed to ``__init__``.
        return SpeechStream(self, self._config)

Methods

def stream(self, *, language: str | None = None) ‑> livekit.plugins.azure.stt.SpeechStream
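
A hypothetical consumption loop, where STT is the class above; push_frame, end_input, and async iteration are assumed from the livekit-agents SpeechStream base class rather than defined in this module:

from livekit import rtc
from livekit.agents import stt as agents_stt

async def transcribe(azure_stt: STT, frames: list[rtc.AudioFrame]) -> None:
    stream = azure_stt.stream()
    for frame in frames:
        stream.push_frame(frame)  # assumed base-class API
    stream.end_input()  # assumed: signals end of input so iteration can finish
    async for event in stream:
        if event.type == agents_stt.SpeechEventType.FINAL_TRANSCRIPT:
            print(event.alternatives[0].text)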

class SpeechStream (stt: STT, opts: STTOptions)


Args:
    sample_rate (int | None, optional): The desired sample rate for the audio
        input. If specified, the audio input is automatically resampled to this
        rate before speech-to-text processing. If None, the input retains its
        original sample rate.

Source code
class SpeechStream(stt.SpeechStream):
    def __init__(self, stt: STT, opts: STTOptions) -> None:
        super().__init__(stt, sample_rate=opts.sample_rate)
        self._opts = opts
        self._speaking = False

        self._stream = speechsdk.audio.PushAudioInputStream(
            stream_format=speechsdk.audio.AudioStreamFormat(
                samples_per_second=self._opts.sample_rate,
                bits_per_sample=16,
                channels=self._opts.num_channels,
            )
        )
        self._recognizer = _create_speech_recognizer(
            config=self._opts, stream=self._stream
        )
        self._recognizer.recognizing.connect(self._on_recognizing)
        self._recognizer.recognized.connect(self._on_recognized)
        self._recognizer.speech_start_detected.connect(self._on_speech_start)
        self._recognizer.speech_end_detected.connect(self._on_speech_end)
        self._recognizer.session_stopped.connect(self._on_session_stopped)
        self._recognizer.start_continuous_recognition()
        self._done_event = asyncio.Event()
        self._loop = asyncio.get_running_loop()

    @utils.log_exceptions(logger=logger)
    async def _main_task(self) -> None:
        try:
            async for input in self._input_ch:
                if isinstance(input, rtc.AudioFrame):
                    self._stream.write(input.data.tobytes())

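            # Closing the push stream signals end-of-audio to the Azure SDK;
            # _on_session_stopped then sets _done_event once recognition finishes.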
            self._stream.close()
            await self._done_event.wait()
        finally:

            def _cleanup():
                self._recognizer.stop_continuous_recognition()
                del self._recognizer

            await asyncio.to_thread(_cleanup)

    def _on_recognized(self, evt: speechsdk.SpeechRecognitionEventArgs):
        detected_lg = speechsdk.AutoDetectSourceLanguageResult(evt.result).language
        text = evt.result.text.strip()
        if not text:
            return

        final_data = stt.SpeechData(
            language=detected_lg, confidence=1.0, text=evt.result.text
        )

        self._threadsafe_send(
            stt.SpeechEvent(
                type=stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives=[final_data]
            )
        )

    def _on_recognizing(self, evt: speechsdk.SpeechRecognitionEventArgs):
        detected_lg = speechsdk.AutoDetectSourceLanguageResult(evt.result).language
        text = evt.result.text.strip()
        if not text:
            return

        interim_data = stt.SpeechData(
            language=detected_lg, confidence=0.0, text=evt.result.text
        )

        self._threadsafe_send(
            stt.SpeechEvent(
                type=stt.SpeechEventType.INTERIM_TRANSCRIPT, alternatives=[interim_data]
            )
        )

    def _on_speech_start(self, evt: speechsdk.SpeechRecognitionEventArgs):
        if self._speaking:
            return

        self._speaking = True
        self._threadsafe_send(stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH))

    def _on_speech_end(self, evt: speechsdk.SpeechRecognitionEventArgs):
        if not self._speaking:
            return

        self._speaking = False
        self._threadsafe_send(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))

    def _on_session_stopped(self, evt: speechsdk.SpeechRecognitionEventArgs):
        self._loop.call_soon_threadsafe(self._done_event.set)

    def _threadsafe_send(self, evt: stt.SpeechEvent):
        self._loop.call_soon_threadsafe(self._event_ch.send_nowait, evt)
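
Note that the Azure Speech SDK invokes the recognizer callbacks above on its own worker threads rather than on the asyncio event loop; _threadsafe_send and _on_session_stopped therefore hand events back to the loop captured in __init__ via call_soon_threadsafe instead of touching the event channel directly.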

class TTS (*, voice: str | None = None, language: str | None = None, prosody: ProsodyConfig | None = None, speech_key: str | None = None, speech_region: str | None = None, speech_host: str | None = None, endpoint_id: str | None = None)


Create a new instance of Azure TTS.

Either speech_host, or both speech_key and speech_region, must be set, either using arguments or by setting the AZURE_SPEECH_HOST, AZURE_SPEECH_KEY and AZURE_SPEECH_REGION environment variables, respectively.
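
For example, a minimal construction sketch (the voice name is an illustrative Azure neural voice, and the ProsodyConfig field name is an assumption about this plugin's options):

from livekit.plugins import azure

# Relies on AZURE_SPEECH_KEY / AZURE_SPEECH_REGION being exported.
tts = azure.TTS(
    voice="en-US-JennyNeural",  # illustrative voice name
    language="en-US",
    prosody=azure.ProsodyConfig(rate="fast"),  # field name assumed
)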

Source code
class TTS(tts.TTS):
    def __init__(
        self,
        *,
        voice: str | None = None,
        language: str | None = None,
        prosody: ProsodyConfig | None = None,
        speech_key: str | None = None,
        speech_region: str | None = None,
        speech_host: str | None = None,
        endpoint_id: str | None = None,
    ) -> None:
        """
        Create a new instance of Azure TTS.

        Either ``speech_host``, or both ``speech_key`` and ``speech_region``, must be set,
        either using arguments or by setting the ``AZURE_SPEECH_HOST``, ``AZURE_SPEECH_KEY``
        and ``AZURE_SPEECH_REGION`` environment variables, respectively.
        """

        super().__init__(
            capabilities=tts.TTSCapabilities(
                streaming=False,
            ),
            sample_rate=AZURE_SAMPLE_RATE,
            num_channels=AZURE_NUM_CHANNELS,
        )

        speech_host = speech_host or os.environ.get("AZURE_SPEECH_HOST")
        speech_key = speech_key or os.environ.get("AZURE_SPEECH_KEY")
        speech_region = speech_region or os.environ.get("AZURE_SPEECH_REGION")

        if not speech_host and not (speech_key and speech_region):
            raise ValueError(
                "AZURE_SPEECH_HOST or AZURE_SPEECH_KEY and AZURE_SPEECH_REGION must be set"
            )

        if prosody:
            prosody.validate()

        self._opts = _TTSOptions(
            speech_key=speech_key,
            speech_region=speech_region,
            voice=voice,
            endpoint_id=endpoint_id,
            language=language,
            prosody=prosody,
        )

    def update_options(
        self,
        *,
        voice: str | None = None,
        language: str | None = None,
        prosody: ProsodyConfig | None = None,
    ) -> None:
        self._opts.voice = voice or self._opts.voice
        self._opts.language = language or self._opts.language
        self._opts.prosody = prosody or self._opts.prosody

    def synthesize(self, text: str) -> "ChunkedStream":
        return ChunkedStream(self, text, self._opts)

Methods

def synthesize(self, text: str) ‑> livekit.plugins.azure.tts.ChunkedStream
def update_options(self, *, voice: str | None = None, language: str | None = None, prosody: ProsodyConfig | None = None) ‑> None
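
A brief usage sketch; iterating the ChunkedStream and the .frame attribute on the yielded audio events are assumed from the livekit-agents TTS base classes, and process is a hypothetical consumer:

# Swap the voice at runtime; arguments left as None keep their current values.
tts.update_options(voice="en-US-GuyNeural")  # illustrative voice name

async def speak(tts: TTS, text: str) -> None:
    async for audio in tts.synthesize(text):
        process(audio.frame)  # hypothetical consumer of the audio frames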
