Module `livekit.agents.stt`

Sub-modules

livekit.agents.stt.stream_adapter
livekit.agents.stt.stt

Classes

class STT (*, capabilities: STTCapabilities)

Helper class that provides a standard way to create an ABC using inheritance.

Expand source code

class STT(ABC):
    """Abstract interface shared by speech-to-text engines.

    Subclasses must implement :meth:`recognize`. :meth:`stream` and
    :meth:`aclose` ship with default implementations that subclasses may
    override.
    """

    def __init__(self, *, capabilities: STTCapabilities) -> None:
        """Store the capabilities advertised by the concrete engine."""
        self._capabilities = capabilities

    @property
    def capabilities(self) -> STTCapabilities:
        """Capabilities object supplied at construction time."""
        return self._capabilities

    @abstractmethod
    async def recognize(
        self, buffer: AudioBuffer, *, language: str | None = None
    ) -> SpeechEvent:
        """Recognize speech in ``buffer`` and return a speech event.

        Abstract: every concrete STT has to provide this coroutine.
        """

    def stream(self, *, language: str | None = None) -> "SpeechStream":
        """Open a streaming session.

        The base implementation always raises ``NotImplementedError``.
        """
        message = "streaming is not supported by this STT, please use a different STT or use a StreamAdapter"
        raise NotImplementedError(message)

    async def aclose(self) -> None:
        """Close the STT and every stream or request associated with it.

        The base implementation does nothing.
        """

Ancestors

abc.ABC

Subclasses

StreamAdapter
livekit.plugins.azure.stt.STT
livekit.plugins.deepgram.stt.STT
livekit.plugins.google.stt.STT
livekit.plugins.openai.stt.STT

Instance variables

prop capabilities : STTCapabilities

Expand source code

@property
def capabilities(self) -> STTCapabilities:
    """Return the capabilities object stored on this STT at construction."""
    return self._capabilities

Methods

async def aclose(self) ‑> None: Close the STT and every stream or request associated with it
async def recognize(self, buffer: AudioBuffer, *, language: str | None = None) ‑> SpeechEvent
def stream(self, *, language: str | None = None) ‑> SpeechStream

class STTCapabilities (streaming: bool, interim_results: bool)

STTCapabilities(streaming: 'bool', interim_results: 'bool')

Expand source code

@dataclass
class STTCapabilities:
    """Feature flags describing what an STT implementation supports."""

    # Whether the STT can be used through a stream. StreamAdapter sets this
    # to True for the STT it wraps.
    streaming: bool
    # Presumably whether the STT emits INTERIM_TRANSCRIPT events before the
    # final transcript. StreamAdapter, which only emits final transcripts,
    # sets this to False.
    interim_results: bool

Class variables

var interim_results : bool
var streaming : bool

class SpeechData (language: str, text: str, start_time: float = 0.0, end_time: float = 0.0, confidence: float = 0.0)

SpeechData(language: 'str', text: 'str', start_time: 'float' = 0.0, end_time: 'float' = 0.0, confidence: 'float' = 0.0)

Expand source code

@dataclass
class SpeechData:
    """A single recognition hypothesis carried by a SpeechEvent."""

    # Language of the recognized speech (exact format not shown here --
    # presumably a language code; confirm with the STT plugin in use).
    language: str
    # Recognized text.
    text: str
    # Start/end timestamps of the recognized speech; both default to 0.0.
    # NOTE(review): units and reference point are not visible in this module.
    start_time: float = 0.0
    end_time: float = 0.0
    confidence: float = 0.0  # [0, 1]

Class variables

var confidence : float
var end_time : float
var language : str
var start_time : float
var text : str

class SpeechEvent (type: SpeechEventType, alternatives: List[SpeechData] = <factory>)

SpeechEvent(type: 'SpeechEventType', alternatives: 'List[SpeechData]' = <factory>)

Expand source code

@dataclass
class SpeechEvent:
    """An event produced by an STT: an event type plus optional transcripts."""

    # Kind of event (start/end of speech, interim or final transcript).
    type: SpeechEventType
    # Recognition hypotheses attached to the event; defaults to a fresh empty
    # list per instance. StreamAdapterWrapper treats index 0 as the primary
    # alternative -- presumably the list is ordered best-first.
    alternatives: List[SpeechData] = field(default_factory=list)

Class variables

var alternatives : List[SpeechData]
var type : SpeechEventType

class SpeechEventType (*args, **kwds)

str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str

Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to sys.getdefaultencoding(). errors defaults to 'strict'.

Expand source code

@unique
class SpeechEventType(str, Enum):
    """Kinds of events an STT can emit while processing speech.

    The enum mixes in ``str``, so every member is also a plain string equal
    to its value; ``@unique`` rejects duplicate values.
    """

    START_OF_SPEECH = "start_of_speech"
    """indicate the start of speech
    if the STT doesn't support this event, this will be emitted as the same time as the first INTERIM_TRANSCRIPT"""
    INTERIM_TRANSCRIPT = "interim_transcript"
    """interim transcript, useful for real-time transcription"""
    FINAL_TRANSCRIPT = "final_transcript"
    """final transcript, emitted when the STT is confident enough that a certain
    portion of speech will not change"""
    END_OF_SPEECH = "end_of_speech"
    """indicate the end of speech, emitted when the user stops speaking"""

Ancestors

builtins.str
enum.Enum

Class variables

var END_OF_SPEECH: indicate the end of speech, emitted when the user stops speaking
var FINAL_TRANSCRIPT: final transcript, emitted when the STT is confident enough that a certain portion of speech will not change
var INTERIM_TRANSCRIPT: interim transcript, useful for real-time transcription
var START_OF_SPEECH: indicate the start of speech; if the STT doesn't support this event, this will be emitted at the same time as the first INTERIM_TRANSCRIPT

class SpeechStream

Helper class that provides a standard way to create an ABC using inheritance.

Expand source code

class SpeechStream(ABC):
    """Base class for a streaming speech-to-text session.

    Audio is queued with :meth:`push_frame` and segment boundaries with
    :meth:`flush`; the resulting :class:`SpeechEvent` objects are consumed by
    iterating the stream with ``async for``. Subclasses implement
    :meth:`_main_task`, which is expected to read ``self._input_ch`` and
    write events to ``self._event_ch``.
    """

    class _FlushSentinel:
        """Marker queued on the input channel by :meth:`flush`."""

    def __init__(self):
        """Create the channels and start the worker task.

        ``asyncio.create_task`` is called here, so the stream must be
        constructed while an event loop is running.
        """
        # The input channel carries audio frames interleaved with flush markers.
        self._input_ch = aio.Chan[Union[rtc.AudioFrame, SpeechStream._FlushSentinel]]()
        self._event_ch = aio.Chan[SpeechEvent]()
        self._task = asyncio.create_task(self._main_task())
        # Once the worker finishes (normally or not), close the event channel
        # so the stream is reported as closed by `_check_not_closed`.
        self._task.add_done_callback(lambda _: self._event_ch.close())

    @abstractmethod
    async def _main_task(self) -> None:
        """Worker coroutine run for the lifetime of the stream.

        Declared ``async`` because ``__init__`` hands its result directly to
        ``asyncio.create_task``, which requires a coroutine.
        """
        ...

    def push_frame(self, frame: rtc.AudioFrame) -> None:
        """Push audio to be recognized.

        Raises:
            RuntimeError: if input has already ended or the stream is closed.
        """
        self._check_input_not_ended()
        self._check_not_closed()
        self._input_ch.send_nowait(frame)

    def flush(self) -> None:
        """Mark the end of the current segment.

        Raises:
            RuntimeError: if input has already ended or the stream is closed.
        """
        self._check_input_not_ended()
        self._check_not_closed()
        self._input_ch.send_nowait(self._FlushSentinel())

    def end_input(self) -> None:
        """Mark the end of input; no more audio will be pushed.

        Flushes the current segment first, so this raises ``RuntimeError``
        under the same conditions as :meth:`flush` (including when input has
        already been ended).
        """
        self.flush()
        self._input_ch.close()

    async def aclose(self) -> None:
        """Close the stream immediately, cancelling the worker task."""
        self._input_ch.close()
        await aio.gracefully_cancel(self._task)
        self._event_ch.close()

    async def __anext__(self) -> SpeechEvent:
        """Return the next event by delegating to the event channel."""
        return await self._event_ch.__anext__()

    def __aiter__(self) -> AsyncIterator[SpeechEvent]:
        """Return ``self``; the stream is its own async iterator."""
        return self

    def _check_not_closed(self) -> None:
        """Raise ``RuntimeError`` if the event channel has been closed."""
        if self._event_ch.closed:
            cls = type(self)
            raise RuntimeError(f"{cls.__module__}.{cls.__name__} is closed")

    def _check_input_not_ended(self) -> None:
        """Raise ``RuntimeError`` if the input channel has been closed."""
        if self._input_ch.closed:
            cls = type(self)
            raise RuntimeError(f"{cls.__module__}.{cls.__name__} input ended")

Ancestors

abc.ABC

Subclasses

StreamAdapterWrapper
livekit.plugins.azure.stt.SpeechStream
livekit.plugins.deepgram.stt.SpeechStream
livekit.plugins.google.stt.SpeechStream

Methods

async def aclose(self) ‑> None: Close the stream immediately
def end_input(self) ‑> None: Mark the end of input, no more audio will be pushed
def flush(self) ‑> None: Mark the end of the current segment
def push_frame(self, frame: rtc.AudioFrame) ‑> None: Push audio to be recognized

class StreamAdapter (*, stt: STT, vad: VAD)

Helper class that provides a standard way to create an ABC using inheritance.

Expand source code

class StreamAdapter(STT):
    """STT wrapper that adds streaming on top of another STT by using a VAD.

    The adapter advertises ``streaming=True`` and ``interim_results=False``.
    One-shot recognition is delegated unchanged to the wrapped STT.
    """

    def __init__(self, *, stt: STT, vad: VAD) -> None:
        """Wrap ``stt`` and keep ``vad`` for the streams created later."""
        adapter_caps = STTCapabilities(streaming=True, interim_results=False)
        super().__init__(capabilities=adapter_caps)
        self._stt = stt
        self._vad = vad

    @property
    def wrapped_stt(self) -> STT:
        """The STT instance this adapter delegates to."""
        return self._stt

    async def recognize(
        self, buffer: utils.AudioBuffer, *, language: str | None = None
    ):
        """Forward the recognition request to the wrapped STT."""
        result = await self._stt.recognize(buffer=buffer, language=language)
        return result

    def stream(self, *, language: str | None = None) -> SpeechStream:
        """Create a VAD-driven stream around the wrapped STT."""
        wrapper = StreamAdapterWrapper(self._vad, self._stt, language=language)
        return wrapper

Ancestors

STT
abc.ABC

Instance variables

prop wrapped_stt : STT

Expand source code

@property
def wrapped_stt(self) -> STT:
    """Return the underlying STT that this adapter delegates recognition to."""
    return self._stt

Methods

async def recognize(self, buffer: utils.AudioBuffer, *, language: str | None = None)
def stream(self, *, language: str | None = None) ‑> SpeechStream

Inherited members

STT:
- aclose

class StreamAdapterWrapper (vad: VAD, stt: STT, *args: Any, **kwargs: Any)

Helper class that provides a standard way to create an ABC using inheritance.

Expand source code

class StreamAdapterWrapper(SpeechStream):
    """Speech stream that emulates streaming for a non-streaming STT.

    Incoming audio is forwarded to a VAD stream. When the VAD reports the end
    of a speech segment, the segment's frames are merged and passed to the
    wrapped STT's ``recognize``; the first alternative of the result is
    emitted as a ``FINAL_TRANSCRIPT`` event.
    """

    def __init__(self, vad: VAD, stt: STT, *args: Any, **kwargs: Any) -> None:
        """Create the wrapper.

        Args:
            vad: VAD used to segment the incoming audio.
            stt: STT whose ``recognize`` is called for each speech segment.
            *args: Extra positional arguments forwarded to ``stt.recognize``
                after the audio buffer.
            **kwargs: Extra keyword arguments forwarded to ``stt.recognize``.
        """
        # The base initializer schedules `_main_task`, but the task cannot
        # start running before this synchronous __init__ has returned, so the
        # attributes assigned below are in place by then.
        super().__init__()
        self._vad = vad
        self._stt = stt
        self._vad_stream = self._vad.stream()
        self._args = args
        self._kwargs = kwargs

    @utils.log_exceptions(logger=logger)
    async def _main_task(self) -> None:
        """Pump audio into the VAD and turn VAD events into speech events."""

        async def _forward_input():
            """forward input to vad"""
            async for data in self._input_ch:
                if isinstance(data, self._FlushSentinel):
                    self._vad_stream.flush()
                    continue
                self._vad_stream.push_frame(data)

            # Input channel closed: propagate the end of input to the VAD.
            self._vad_stream.end_input()

        async def _recognize():
            """recognize speech from vad"""
            async for event in self._vad_stream:
                if event.type == VADEventType.START_OF_SPEECH:
                    self._event_ch.send_nowait(
                        SpeechEvent(SpeechEventType.START_OF_SPEECH)
                    )
                elif event.type == VADEventType.END_OF_SPEECH:
                    # END_OF_SPEECH is emitted before recognition runs, so
                    # consumers see it ahead of the final transcript.
                    self._event_ch.send_nowait(
                        SpeechEvent(
                            type=SpeechEventType.END_OF_SPEECH,
                        )
                    )

                    merged_frames = utils.merge_frames(event.frames)
                    # Pass the buffer positionally: with `buffer=` as a
                    # keyword, any forwarded positional argument would bind
                    # to `buffer` as well and raise TypeError.
                    t_event = await self._stt.recognize(
                        merged_frames, *self._args, **self._kwargs
                    )

                    # Skip results without alternatives or with empty text.
                    alternatives = t_event.alternatives
                    if not alternatives or not alternatives[0].text:
                        continue

                    self._event_ch.send_nowait(
                        SpeechEvent(
                            type=SpeechEventType.FINAL_TRANSCRIPT,
                            alternatives=[alternatives[0]],
                        )
                    )

        tasks = [
            asyncio.create_task(_forward_input(), name="forward_input"),
            asyncio.create_task(_recognize(), name="recognize"),
        ]
        try:
            await asyncio.gather(*tasks)
        finally:
            await utils.aio.gracefully_cancel(*tasks)
            # Release the VAD stream created in __init__ so it does not
            # outlive this wrapper.
            # NOTE(review): assumes the VAD stream exposes `aclose()` like
            # SpeechStream does -- confirm against the VAD API.
            await self._vad_stream.aclose()

Ancestors

SpeechStream
abc.ABC

Inherited members

SpeechStream:
- aclose
- end_input
- flush
- push_frame