Module livekit.agents.inference.vad

Classes

class VAD (*,
model: VADModels = 'silero',
min_speech_duration: float = 0.05,
min_silence_duration: float = 0.25,
prefix_padding_duration: float = 0.5,
max_buffered_speech: float = 60.0,
activation_threshold: float = 0.5,
deactivation_threshold: NotGivenOr[float] = NOT_GIVEN)
Expand source code
class VAD(vad.VAD):
    """Voice Activity Detection backed by ``livekit-local-inference``.

    The native model singleton is loaded once at module import (via the
    pybind11 ``.so`` constructor); each stream allocates its own per-instance
    LSTM/context state.

    The options given to the constructor seed every stream created through
    :meth:`stream`. :meth:`update_options` changes them afterwards and
    forwards the same change to the streams that are still alive.
    """

    def __init__(
        self,
        *,
        model: VADModels = "silero",
        min_speech_duration: float = 0.05,
        min_silence_duration: float = 0.25,
        prefix_padding_duration: float = 0.5,
        max_buffered_speech: float = 60.0,
        activation_threshold: float = 0.5,
        deactivation_threshold: NotGivenOr[float] = NOT_GIVEN,
    ) -> None:
        """Create a VAD with the given detection options.

        Args:
            model: Model identifier. Only ``"silero"`` is accepted.
            min_speech_duration: Stored unchanged in the options
                (presumably seconds -- confirm against ``_VADOptions``).
            min_silence_duration: Stored unchanged in the options
                (presumably seconds -- confirm against ``_VADOptions``).
            prefix_padding_duration: Stored unchanged in the options
                (presumably seconds -- confirm against ``_VADOptions``).
            max_buffered_speech: Stored unchanged in the options
                (presumably seconds -- confirm against ``_VADOptions``).
            activation_threshold: Stored unchanged in the options.
            deactivation_threshold: Stored unchanged when given. When
                omitted it is derived as
                ``max(activation_threshold - 0.15, 0.01)``.

        Raises:
            ValueError: If ``model`` is not ``"silero"``, or if
                ``deactivation_threshold`` is given and is not greater
                than 0.
        """
        super().__init__(capabilities=vad.VADCapabilities(update_interval=0.032))
        if model != "silero":
            raise ValueError(f"Unknown VAD model: {model!r}. Supported: 'silero'.")
        if is_given(deactivation_threshold) and deactivation_threshold <= 0:
            raise ValueError("deactivation_threshold must be greater than 0")
        self._model = model
        self._opts = _VADOptions(
            min_speech_duration=min_speech_duration,
            min_silence_duration=min_silence_duration,
            prefix_padding_duration=prefix_padding_duration,
            max_buffered_speech=max_buffered_speech,
            activation_threshold=activation_threshold,
            # Fall back to a value slightly below the activation threshold,
            # floored at 0.01 so the derived default is always positive.
            deactivation_threshold=deactivation_threshold
            if is_given(deactivation_threshold)
            else max(activation_threshold - 0.15, 0.01),
        )
        # Weak references only: tracking a stream here must not keep it alive.
        self._streams: weakref.WeakSet[_VADStream] = weakref.WeakSet()

    @property
    def model(self) -> str:
        """Model identifier this instance was constructed with."""
        return self._model

    @property
    def provider(self) -> str:
        """Name of the inference provider backing this VAD."""
        return "livekit-local-inference"

    def stream(self) -> vad.VADStream:
        """Create a stream seeded with a copy of the current options.

        The new stream is tracked (weakly) so that later calls to
        :meth:`update_options` are forwarded to it.
        """
        # Each stream owns its own _VADOptions snapshot so that
        # _VADStream.update_options() can read the prior value of
        # max_buffered_speech before mutating it. Sharing the dataclass would
        # let VAD.update_options() mutate the stream's view first, and the
        # stream would never observe an increase.
        stream = _VADStream(self, replace(self._opts))
        self._streams.add(stream)
        return stream

    def update_options(
        self,
        *,
        min_speech_duration: NotGivenOr[float] = NOT_GIVEN,
        min_silence_duration: NotGivenOr[float] = NOT_GIVEN,
        prefix_padding_duration: NotGivenOr[float] = NOT_GIVEN,
        max_buffered_speech: NotGivenOr[float] = NOT_GIVEN,
        activation_threshold: NotGivenOr[float] = NOT_GIVEN,
        deactivation_threshold: NotGivenOr[float] = NOT_GIVEN,
    ) -> None:
        """Update the stored options and forward the change to live streams.

        Only arguments that are explicitly given overwrite the stored
        value. Every argument, given or not, is passed through to each
        tracked stream's ``update_options``.

        Raises:
            ValueError: If ``deactivation_threshold`` is given and is not
                greater than 0.
        """
        # Apply the same check as __init__, and do it before touching any
        # state so a rejected call leaves this instance and its streams
        # unchanged.
        if is_given(deactivation_threshold) and deactivation_threshold <= 0:
            raise ValueError("deactivation_threshold must be greater than 0")

        if is_given(min_speech_duration):
            self._opts.min_speech_duration = min_speech_duration
        if is_given(min_silence_duration):
            self._opts.min_silence_duration = min_silence_duration
        if is_given(prefix_padding_duration):
            self._opts.prefix_padding_duration = prefix_padding_duration
        if is_given(max_buffered_speech):
            self._opts.max_buffered_speech = max_buffered_speech
        if is_given(activation_threshold):
            self._opts.activation_threshold = activation_threshold
        if is_given(deactivation_threshold):
            self._opts.deactivation_threshold = deactivation_threshold

        # Streams hold their own option snapshots, so each one has to be
        # told about the change explicitly.
        for stream in self._streams:
            stream.update_options(
                min_speech_duration=min_speech_duration,
                min_silence_duration=min_silence_duration,
                prefix_padding_duration=prefix_padding_duration,
                max_buffered_speech=max_buffered_speech,
                activation_threshold=activation_threshold,
                deactivation_threshold=deactivation_threshold,
            )

    @property
    def min_silence_duration(self) -> float | None:
        """Currently stored ``min_silence_duration`` option."""
        return self._opts.min_silence_duration

Voice Activity Detection backed by livekit-local-inference.

The native model singleton is loaded once at module import (via the pybind11 .so constructor); each stream allocates its own per-instance LSTM/context state.

Ancestors

Instance variables

prop min_silence_duration : float | None
Expand source code
@property
def min_silence_duration(self) -> float | None:
    return self._opts.min_silence_duration
prop model : str
Expand source code
@property
def model(self) -> str:
    """Model identifier this instance was constructed with."""
    model_name = self._model
    return model_name
prop provider : str
Expand source code
@property
def provider(self) -> str:
    """Name of the inference provider backing this VAD."""
    provider_name = "livekit-local-inference"
    return provider_name

Methods

def stream(self) ‑> VADStream
Expand source code
def stream(self) -> vad.VADStream:
    """Create a stream seeded with a private copy of the current options.

    The copy matters: _VADStream.update_options() needs to read the prior
    value of max_buffered_speech before mutating it. If the dataclass were
    shared, VAD.update_options() would mutate the stream's view first and
    the stream would never observe an increase.
    """
    options_snapshot = replace(self._opts)
    new_stream = _VADStream(self, options_snapshot)
    # Track the stream so later option updates can be forwarded to it.
    self._streams.add(new_stream)
    return new_stream
def update_options(self,
*,
min_speech_duration: NotGivenOr[float] = NOT_GIVEN,
min_silence_duration: NotGivenOr[float] = NOT_GIVEN,
prefix_padding_duration: NotGivenOr[float] = NOT_GIVEN,
max_buffered_speech: NotGivenOr[float] = NOT_GIVEN,
activation_threshold: NotGivenOr[float] = NOT_GIVEN,
deactivation_threshold: NotGivenOr[float] = NOT_GIVEN) ‑> None
Expand source code
def update_options(
    self,
    *,
    min_speech_duration: NotGivenOr[float] = NOT_GIVEN,
    min_silence_duration: NotGivenOr[float] = NOT_GIVEN,
    prefix_padding_duration: NotGivenOr[float] = NOT_GIVEN,
    max_buffered_speech: NotGivenOr[float] = NOT_GIVEN,
    activation_threshold: NotGivenOr[float] = NOT_GIVEN,
    deactivation_threshold: NotGivenOr[float] = NOT_GIVEN,
) -> None:
    """Update the stored options and forward the change to live streams.

    Only arguments that are explicitly given overwrite the stored value.
    Every argument, given or not, is passed through to each tracked
    stream's ``update_options``.

    Raises:
        ValueError: If ``deactivation_threshold`` is given and is not
            greater than 0.
    """
    # Apply the same check as __init__, and do it before touching any state
    # so a rejected call leaves this instance and its streams unchanged.
    if is_given(deactivation_threshold) and deactivation_threshold <= 0:
        raise ValueError("deactivation_threshold must be greater than 0")

    if is_given(min_speech_duration):
        self._opts.min_speech_duration = min_speech_duration
    if is_given(min_silence_duration):
        self._opts.min_silence_duration = min_silence_duration
    if is_given(prefix_padding_duration):
        self._opts.prefix_padding_duration = prefix_padding_duration
    if is_given(max_buffered_speech):
        self._opts.max_buffered_speech = max_buffered_speech
    if is_given(activation_threshold):
        self._opts.activation_threshold = activation_threshold
    if is_given(deactivation_threshold):
        self._opts.deactivation_threshold = deactivation_threshold

    # Streams hold their own option snapshots, so each one has to be told
    # about the change explicitly.
    for stream in self._streams:
        stream.update_options(
            min_speech_duration=min_speech_duration,
            min_silence_duration=min_silence_duration,
            prefix_padding_duration=prefix_padding_duration,
            max_buffered_speech=max_buffered_speech,
            activation_threshold=activation_threshold,
            deactivation_threshold=deactivation_threshold,
        )

Inherited members