Module `livekit.agents.voice`

Sub-modules

livekit.agents.voice.avatar
livekit.agents.voice.background_audio
livekit.agents.voice.io
livekit.agents.voice.report
livekit.agents.voice.room_io
livekit.agents.voice.run_result

Classes

class Agent (*, instructions: str, id: str | None = None, chat_ctx: NotGivenOr[llm.ChatContext | None] = NOT_GIVEN, tools: list[llm.FunctionTool | llm.RawFunctionTool] | None = None, turn_detection: NotGivenOr[TurnDetectionMode | None] = NOT_GIVEN, stt: NotGivenOr[stt.STT | STTModels | str | None] = NOT_GIVEN, vad: NotGivenOr[vad.VAD | None] = NOT_GIVEN, llm: NotGivenOr[llm.LLM | llm.RealtimeModel | LLMModels | str | None] = NOT_GIVEN, tts: NotGivenOr[tts.TTS | TTSModels | str | None] = NOT_GIVEN, mcp_servers: NotGivenOr[list[mcp.MCPServer] | None] = NOT_GIVEN, allow_interruptions: NotGivenOr[bool] = NOT_GIVEN, min_consecutive_speech_delay: NotGivenOr[float] = NOT_GIVEN, use_tts_aligned_transcript: NotGivenOr[bool] = NOT_GIVEN, min_endpointing_delay: NotGivenOr[float] = NOT_GIVEN, max_endpointing_delay: NotGivenOr[float] = NOT_GIVEN)

Expand source code

class Agent:
    def __init__(
        self,
        *,
        instructions: str,
        id: str | None = None,
        chat_ctx: NotGivenOr[llm.ChatContext | None] = NOT_GIVEN,
        tools: list[llm.FunctionTool | llm.RawFunctionTool] | None = None,
        turn_detection: NotGivenOr[TurnDetectionMode | None] = NOT_GIVEN,
        stt: NotGivenOr[stt.STT | STTModels | str | None] = NOT_GIVEN,
        vad: NotGivenOr[vad.VAD | None] = NOT_GIVEN,
        llm: NotGivenOr[llm.LLM | llm.RealtimeModel | LLMModels | str | None] = NOT_GIVEN,
        tts: NotGivenOr[tts.TTS | TTSModels | str | None] = NOT_GIVEN,
        mcp_servers: NotGivenOr[list[mcp.MCPServer] | None] = NOT_GIVEN,
        allow_interruptions: NotGivenOr[bool] = NOT_GIVEN,
        min_consecutive_speech_delay: NotGivenOr[float] = NOT_GIVEN,
        use_tts_aligned_transcript: NotGivenOr[bool] = NOT_GIVEN,
        min_endpointing_delay: NotGivenOr[float] = NOT_GIVEN,
        max_endpointing_delay: NotGivenOr[float] = NOT_GIVEN,
    ) -> None:
        tools = tools or []
        if type(self) is Agent:
            self._id = "default_agent"
        else:
            self._id = id or misc.camel_to_snake_case(type(self).__name__)

        self._instructions = instructions
        self._tools = tools.copy() + find_function_tools(self)
        self._chat_ctx = chat_ctx.copy(tools=self._tools) if chat_ctx else ChatContext.empty()
        self._turn_detection = turn_detection

        if isinstance(stt, str):
            stt = inference.STT.from_model_string(stt)

        if isinstance(llm, str):
            llm = inference.LLM.from_model_string(llm)

        if isinstance(tts, str):
            tts = inference.TTS.from_model_string(tts)

        self._stt = stt
        self._llm = llm
        self._tts = tts
        self._vad = vad
        self._allow_interruptions = allow_interruptions
        self._min_consecutive_speech_delay = min_consecutive_speech_delay
        self._use_tts_aligned_transcript = use_tts_aligned_transcript
        self._min_endpointing_delay = min_endpointing_delay
        self._max_endpointing_delay = max_endpointing_delay

        if isinstance(mcp_servers, list) and len(mcp_servers) == 0:
            mcp_servers = None  # treat empty list as None (but keep NOT_GIVEN)

        self._mcp_servers = mcp_servers
        self._activity: AgentActivity | None = None

    @property
    def id(self) -> str:
        return self._id

    @property
    def label(self) -> str:
        return self.id

    @property
    def instructions(self) -> str:
        """
        Returns:
            str: The core instructions that guide the agent's behavior.
        """
        return self._instructions

    @property
    def tools(self) -> list[llm.FunctionTool | llm.RawFunctionTool]:
        """
        Returns:
            list[llm.FunctionTool | llm.RawFunctionTool]:
                A list of function tools available to the agent.
        """
        return self._tools.copy()

    @property
    def chat_ctx(self) -> llm.ChatContext:
        """
        Provides a read-only view of the agent's current chat context.

        Returns:
            llm.ChatContext: A read-only version of the agent's conversation history.

        See Also:
            update_chat_ctx: Method to update the internal chat context.
        """
        return _ReadOnlyChatContext(self._chat_ctx.items)

    async def update_instructions(self, instructions: str) -> None:
        """
        Updates the agent's instructions.

        If the agent is running in realtime mode, this method also updates
        the instructions for the ongoing realtime session.

        Args:
            instructions (str):
                The new instructions to set for the agent.

        Raises:
            llm.RealtimeError: If updating the realtime session instructions fails.
        """
        if self._activity is None:
            self._instructions = instructions
            return

        await self._activity.update_instructions(instructions)

    async def update_tools(self, tools: list[llm.FunctionTool | llm.RawFunctionTool]) -> None:
        """
        Updates the agent's available function tools.

        If the agent is running in realtime mode, this method also updates
        the tools for the ongoing realtime session.

        Args:
            tools (list[llm.FunctionTool]):
                The new list of function tools available to the agent.

        Raises:
            llm.RealtimeError: If updating the realtime session tools fails.
        """
        invalid = [t for t in tools if not (is_function_tool(t) or is_raw_function_tool(t))]
        if invalid:
            kinds = ", ".join(sorted({type(t).__name__ for t in invalid}))
            raise TypeError(
                f"Invalid tool type(s): {kinds}. Expected FunctionTool or RawFunctionTool."
            )

        if self._activity is None:
            self._tools = list(set(tools))
            self._chat_ctx = self._chat_ctx.copy(tools=self._tools)
            return

        await self._activity.update_tools(tools)

    async def update_chat_ctx(
        self, chat_ctx: llm.ChatContext, *, exclude_invalid_function_calls: bool = True
    ) -> None:
        """
        Updates the agent's chat context.

        If the agent is running in realtime mode, this method also updates
        the chat context for the ongoing realtime session.

        Args:
            chat_ctx (llm.ChatContext):
                The new or updated chat context for the agent.
            exclude_invalid_function_calls (bool): Whether to exclude function calls
                and outputs not from the agent's tools.

        Raises:
            llm.RealtimeError: If updating the realtime session chat context fails.
        """
        if self._activity is None:
            self._chat_ctx = chat_ctx.copy(
                tools=self._tools if exclude_invalid_function_calls else NOT_GIVEN
            )
            return

        await self._activity.update_chat_ctx(
            chat_ctx, exclude_invalid_function_calls=exclude_invalid_function_calls
        )

    # -- Pipeline nodes --
    # They can all be overriden by subclasses, by default they use the STT/LLM/TTS specified in the
    # constructor of the VoiceAgent

    async def on_enter(self) -> None:
        """Called when the task is entered"""
        pass

    async def on_exit(self) -> None:
        """Called when the task is exited"""
        pass

    async def on_user_turn_completed(
        self, turn_ctx: llm.ChatContext, new_message: llm.ChatMessage
    ) -> None:
        """Called when the user has finished speaking, and the LLM is about to respond

        This is a good opportunity to update the chat context or edit the new message before it is
        sent to the LLM.
        """
        pass

    def stt_node(
        self, audio: AsyncIterable[rtc.AudioFrame], model_settings: ModelSettings
    ) -> (
        AsyncIterable[stt.SpeechEvent | str]
        | Coroutine[Any, Any, AsyncIterable[stt.SpeechEvent | str]]
        | Coroutine[Any, Any, None]
    ):
        """
        A node in the processing pipeline that transcribes audio frames into speech events.

        By default, this node uses a Speech-To-Text (STT) capability from the current agent.
        If the STT implementation does not support streaming natively, a VAD (Voice Activity
        Detection) mechanism is required to wrap the STT.

        You can override this node with your own implementation for more flexibility (e.g.,
        custom pre-processing of audio, additional buffering, or alternative STT strategies).

        Args:
            audio (AsyncIterable[rtc.AudioFrame]): An asynchronous stream of audio frames.
            model_settings (ModelSettings): Configuration and parameters for model execution.

        Yields:
            stt.SpeechEvent: An event containing transcribed text or other STT-related data.
        """
        return Agent.default.stt_node(self, audio, model_settings)

    def llm_node(
        self,
        chat_ctx: llm.ChatContext,
        tools: list[FunctionTool | RawFunctionTool],
        model_settings: ModelSettings,
    ) -> (
        AsyncIterable[llm.ChatChunk | str | FlushSentinel]
        | Coroutine[Any, Any, AsyncIterable[llm.ChatChunk | str | FlushSentinel]]
        | Coroutine[Any, Any, str]
        | Coroutine[Any, Any, llm.ChatChunk]
        | Coroutine[Any, Any, None]
    ):
        """
        A node in the processing pipeline that processes text generation with an LLM.

        By default, this node uses the agent's LLM to process the provided context. It may yield
        plain text (as `str`) for straightforward text generation, or `llm.ChatChunk` objects that
        can include text and optional tool calls. `ChatChunk` is helpful for capturing more complex
        outputs such as function calls, usage statistics, or other metadata.

        You can override this node to customize how the LLM is used or how tool invocations
        and responses are handled.

        Args:
            chat_ctx (llm.ChatContext): The context for the LLM (the conversation history).
            tools (list[FunctionTool]): A list of callable tools that the LLM may invoke.
            model_settings (ModelSettings): Configuration and parameters for model execution.

        Yields/Returns:
            str: Plain text output from the LLM.
            llm.ChatChunk: An object that can contain both text and optional tool calls.
        """
        return Agent.default.llm_node(self, chat_ctx, tools, model_settings)

    def transcription_node(
        self, text: AsyncIterable[str | TimedString], model_settings: ModelSettings
    ) -> (
        AsyncIterable[str | TimedString]
        | Coroutine[Any, Any, AsyncIterable[str | TimedString]]
        | Coroutine[Any, Any, None]
    ):
        """
        A node in the processing pipeline that finalizes transcriptions from text segments.

        This node can be used to adjust or post-process text coming from an LLM (or any other
        source) into a final transcribed form. For instance, you might clean up formatting, fix
        punctuation, or perform any other text transformations here.

        You can override this node to customize post-processing logic according to your needs.

        Args:
            text (AsyncIterable[str | TimedString]): An asynchronous stream of text segments.
            model_settings (ModelSettings): Configuration and parameters for model execution.

        Yields:
            str: Finalized or post-processed text segments.
        """
        return Agent.default.transcription_node(self, text, model_settings)

    def tts_node(
        self, text: AsyncIterable[str], model_settings: ModelSettings
    ) -> (
        AsyncIterable[rtc.AudioFrame]
        | Coroutine[Any, Any, AsyncIterable[rtc.AudioFrame]]
        | Coroutine[Any, Any, None]
    ):
        """
        A node in the processing pipeline that synthesizes audio from text segments.

        By default, this node converts incoming text into audio frames using the Text-To-Speech
        from the agent.
        If the TTS implementation does not support streaming natively, it uses a sentence tokenizer
        to split text for incremental synthesis.

        You can override this node to provide different text chunking behavior, a custom TTS engine,
        or any other specialized processing.

        Args:
            text (AsyncIterable[str]): An asynchronous stream of text segments to be synthesized.
            model_settings (ModelSettings): Configuration and parameters for model execution.

        Yields:
            rtc.AudioFrame: Audio frames synthesized from the provided text.
        """
        return Agent.default.tts_node(self, text, model_settings)

    def realtime_audio_output_node(
        self, audio: AsyncIterable[rtc.AudioFrame], model_settings: ModelSettings
    ) -> (
        AsyncIterable[rtc.AudioFrame]
        | Coroutine[Any, Any, AsyncIterable[rtc.AudioFrame]]
        | Coroutine[Any, Any, None]
    ):
        """A node processing the audio from the realtime LLM session before it is played out."""
        return Agent.default.realtime_audio_output_node(self, audio, model_settings)

    def _get_activity_or_raise(self) -> AgentActivity:
        """Get the current activity context for this task (internal)"""
        if self._activity is None:
            raise RuntimeError("no activity context found, the agent is not running")

        return self._activity

    class default:
        @staticmethod
        async def stt_node(
            agent: Agent, audio: AsyncIterable[rtc.AudioFrame], model_settings: ModelSettings
        ) -> AsyncGenerator[stt.SpeechEvent, None]:
            """Default implementation for `Agent.stt_node`"""
            activity = agent._get_activity_or_raise()
            assert activity.stt is not None, "stt_node called but no STT node is available"

            wrapped_stt = activity.stt

            if not activity.stt.capabilities.streaming:
                if not activity.vad:
                    raise RuntimeError(
                        f"The STT ({activity.stt.label}) does not support streaming, add a VAD to the AgentTask/VoiceAgent to enable streaming"  # noqa: E501
                        "Or manually wrap your STT in a stt.StreamAdapter"
                    )

                wrapped_stt = stt.StreamAdapter(stt=wrapped_stt, vad=activity.vad)

            conn_options = activity.session.conn_options.stt_conn_options
            async with wrapped_stt.stream(conn_options=conn_options) as stream:

                @utils.log_exceptions(logger=logger)
                async def _forward_input() -> None:
                    async for frame in audio:
                        stream.push_frame(frame)

                forward_task = asyncio.create_task(_forward_input())
                try:
                    async for event in stream:
                        yield event
                finally:
                    await utils.aio.cancel_and_wait(forward_task)

        @staticmethod
        async def llm_node(
            agent: Agent,
            chat_ctx: llm.ChatContext,
            tools: list[FunctionTool | RawFunctionTool],
            model_settings: ModelSettings,
        ) -> AsyncGenerator[llm.ChatChunk | str | FlushSentinel, None]:
            """Default implementation for `Agent.llm_node`"""
            activity = agent._get_activity_or_raise()
            assert activity.llm is not None, "llm_node called but no LLM node is available"
            assert isinstance(activity.llm, llm.LLM), (
                "llm_node should only be used with LLM (non-multimodal/realtime APIs) nodes"
            )

            tool_choice = model_settings.tool_choice if model_settings else NOT_GIVEN
            activity_llm = activity.llm

            conn_options = activity.session.conn_options.llm_conn_options
            async with activity_llm.chat(
                chat_ctx=chat_ctx, tools=tools, tool_choice=tool_choice, conn_options=conn_options
            ) as stream:
                async for chunk in stream:
                    yield chunk

        @staticmethod
        async def tts_node(
            agent: Agent, text: AsyncIterable[str], model_settings: ModelSettings
        ) -> AsyncGenerator[rtc.AudioFrame, None]:
            """Default implementation for `Agent.tts_node`"""
            activity = agent._get_activity_or_raise()
            assert activity.tts is not None, "tts_node called but no TTS node is available"

            wrapped_tts = activity.tts

            if not activity.tts.capabilities.streaming:
                wrapped_tts = tts.StreamAdapter(
                    tts=wrapped_tts,
                    sentence_tokenizer=tokenize.blingfire.SentenceTokenizer(retain_format=True),
                )

            conn_options = activity.session.conn_options.tts_conn_options
            async with wrapped_tts.stream(conn_options=conn_options) as stream:

                async def _forward_input() -> None:
                    async for chunk in text:
                        stream.push_text(chunk)

                    stream.end_input()

                forward_task = asyncio.create_task(_forward_input())
                try:
                    async for ev in stream:
                        yield ev.frame
                finally:
                    await utils.aio.cancel_and_wait(forward_task)

        @staticmethod
        async def transcription_node(
            agent: Agent, text: AsyncIterable[str | TimedString], model_settings: ModelSettings
        ) -> AsyncGenerator[str | TimedString, None]:
            """Default implementation for `Agent.transcription_node`"""
            async for delta in text:
                yield delta

        @staticmethod
        async def realtime_audio_output_node(
            agent: Agent, audio: AsyncIterable[rtc.AudioFrame], model_settings: ModelSettings
        ) -> AsyncGenerator[rtc.AudioFrame, None]:
            """Default implementation for `Agent.realtime_audio_output_node`"""
            activity = agent._get_activity_or_raise()
            assert activity.realtime_llm_session is not None, (
                "realtime_audio_output_node called but no realtime LLM session is available"
            )

            async for frame in audio:
                yield frame

    @property
    def realtime_llm_session(self) -> llm.RealtimeSession:
        """
        Retrieve the realtime LLM session associated with the current agent.

        Raises:
            RuntimeError: If the agent is not running or the realtime LLM session is not available
        """
        if (rt_session := self._get_activity_or_raise().realtime_llm_session) is None:
            raise RuntimeError("no realtime LLM session")

        return rt_session

    @property
    def turn_detection(self) -> NotGivenOr[TurnDetectionMode | None]:
        """
        Retrieves the turn detection mode for identifying conversational turns.

        If this property was not set at Agent creation, but an ``AgentSession`` provides a turn detection,
        the session's turn detection mode will be used at runtime instead.

        Returns:
            NotGivenOr[TurnDetectionMode | None]: An optional turn detection mode for managing conversation flow.
        """  # noqa: E501
        return self._turn_detection

    @turn_detection.setter
    def turn_detection(self, value: TurnDetectionMode | None) -> None:
        self._turn_detection = value

        if self._activity is not None:
            self._activity.update_options(turn_detection=value)

    @property
    def stt(self) -> NotGivenOr[stt.STT | None]:
        """
        Retrieves the Speech-To-Text component for the agent.

        If this property was not set at Agent creation, but an ``AgentSession`` provides an STT component,
        the session's STT will be used at runtime instead.

        Returns:
            NotGivenOr[stt.STT | None]: An optional STT component.
        """  # noqa: E501
        return self._stt

    @property
    def llm(self) -> NotGivenOr[llm.LLM | llm.RealtimeModel | None]:
        """
        Retrieves the Language Model or RealtimeModel used for text generation.

        If this property was not set at Agent creation, but an ``AgentSession`` provides an LLM or RealtimeModel,
        the session's model will be used at runtime instead.

        Returns:
            NotGivenOr[llm.LLM | llm.RealtimeModel | None]: The language model for text generation.
        """  # noqa: E501
        return self._llm

    @property
    def tts(self) -> NotGivenOr[tts.TTS | None]:
        """
        Retrieves the Text-To-Speech component for the agent.

        If this property was not set at Agent creation, but an ``AgentSession`` provides a TTS component,
        the session's TTS will be used at runtime instead.

        Returns:
            NotGivenOr[tts.TTS | None]: An optional TTS component for generating audio output.
        """  # noqa: E501
        return self._tts

    @property
    def mcp_servers(self) -> NotGivenOr[list[mcp.MCPServer] | None]:
        """
        Retrieves the list of Model Context Protocol (MCP) servers providing external tools.

        If this property was not set at Agent creation, but an ``AgentSession`` provides MCP servers,
        the session's MCP servers will be used at runtime instead.

        Returns:
            NotGivenOr[list[mcp.MCPServer]]: An optional list of MCP servers.
        """  # noqa: E501
        return self._mcp_servers

    @property
    def vad(self) -> NotGivenOr[vad.VAD | None]:
        """
        Retrieves the Voice Activity Detection component for the agent.

        If this property was not set at Agent creation, but an ``AgentSession`` provides a VAD component,
        the session's VAD will be used at runtime instead.

        Returns:
            NotGivenOr[vad.VAD | None]: An optional VAD component for detecting voice activity.
        """  # noqa: E501
        return self._vad

    @property
    def allow_interruptions(self) -> NotGivenOr[bool]:
        """
        Indicates whether interruptions (e.g., stopping TTS playback) are allowed.

        If this property was not set at Agent creation, but an ``AgentSession`` provides a value for
        allowing interruptions, the session's value will be used at runtime instead.

        Returns:
            NotGivenOr[bool]: Whether interruptions are permitted.
        """
        return self._allow_interruptions

    @property
    def min_endpointing_delay(self) -> NotGivenOr[float]:
        """
        Minimum time-in-seconds the agent must wait after a potential end-of-utterance signal
        before it declares the user’s turn complete.

        If this property was set at Agent creation, it will be used at runtime instead of the session's value.
        """
        return self._min_endpointing_delay

    @property
    def max_endpointing_delay(self) -> NotGivenOr[float]:
        """
        Maximum time-in-seconds the agent will wait before terminating the turn.

        If this property was set at Agent creation, it will be used at runtime instead of the session's value.
        """
        return self._max_endpointing_delay

    @property
    def min_consecutive_speech_delay(self) -> NotGivenOr[float]:
        """
        Retrieves the minimum consecutive speech delay for the agent.

        If this property was not set at Agent creation, but an ``AgentSession`` provides a value for
        the minimum consecutive speech delay, the session's value will be used at runtime instead.

        Returns:
            NotGivenOr[float]: The minimum consecutive speech delay.
        """
        return self._min_consecutive_speech_delay

    @property
    def use_tts_aligned_transcript(self) -> NotGivenOr[bool]:
        """
        Indicates whether to use TTS-aligned transcript as the input of
        the ``transcription_node``.

        If this property was not set at Agent creation, but an ``AgentSession`` provides a value for
        the use of TTS-aligned transcript, the session's value will be used at runtime instead.

        Returns:
            NotGivenOr[bool]: Whether to use TTS-aligned transcript.
        """
        return self._use_tts_aligned_transcript

    @property
    def session(self) -> AgentSession:
        """
        Retrieve the VoiceAgent associated with the current agent.

        Raises:
            RuntimeError: If the agent is not running
        """
        return self._get_activity_or_raise().session

Subclasses

livekit.agents.voice.agent.AgentTask

Class variables

var default

Instance variables

prop allow_interruptions : NotGivenOr[bool]

Expand source code

@property
def allow_interruptions(self) -> NotGivenOr[bool]:
    """
    Indicates whether interruptions (e.g., stopping TTS playback) are allowed.

    If this property was not set at Agent creation, but an ``AgentSession`` provides a value for
    allowing interruptions, the session's value will be used at runtime instead.

    Returns:
        NotGivenOr[bool]: Whether interruptions are permitted.
    """
    return self._allow_interruptions

Indicates whether interruptions (e.g., stopping TTS playback) are allowed.

If this property was not set at Agent creation, but an AgentSession provides a value for allowing interruptions, the session's value will be used at runtime instead.

Returns

NotGivenOr[bool]: Whether interruptions are permitted.

prop chat_ctx : llm.ChatContext

Expand source code

@property
def chat_ctx(self) -> llm.ChatContext:
    """
    Provides a read-only view of the agent's current chat context.

    Returns:
        llm.ChatContext: A read-only version of the agent's conversation history.

    See Also:
        update_chat_ctx: Method to update the internal chat context.
    """
    return _ReadOnlyChatContext(self._chat_ctx.items)

Provides a read-only view of the agent's current chat context.

Returns

llm.ChatContext: A read-only version of the agent's conversation history.

See Also: update_chat_ctx: Method to update the internal chat context.

prop id : str

Expand source code

@property
def id(self) -> str:
    return self._id

prop instructions : str

Expand source code

@property
def instructions(self) -> str:
    """
    Returns:
        str: The core instructions that guide the agent's behavior.
    """
    return self._instructions

Returns

str: The core instructions that guide the agent's behavior.

prop label : str

Expand source code

@property
def label(self) -> str:
    return self.id

prop llm : NotGivenOr[llm.LLM | llm.RealtimeModel | None]

Expand source code

@property
def llm(self) -> NotGivenOr[llm.LLM | llm.RealtimeModel | None]:
    """
    Retrieves the Language Model or RealtimeModel used for text generation.

    If this property was not set at Agent creation, but an ``AgentSession`` provides an LLM or RealtimeModel,
    the session's model will be used at runtime instead.

    Returns:
        NotGivenOr[llm.LLM | llm.RealtimeModel | None]: The language model for text generation.
    """  # noqa: E501
    return self._llm

Retrieves the Language Model or RealtimeModel used for text generation.

If this property was not set at Agent creation, but an AgentSession provides an LLM or RealtimeModel, the session's model will be used at runtime instead.

Returns

NotGivenOr[llm.LLM | llm.RealtimeModel | None]: The language model for text generation.

prop max_endpointing_delay : NotGivenOr[float]

Expand source code

@property
def max_endpointing_delay(self) -> NotGivenOr[float]:
    """
    Maximum time-in-seconds the agent will wait before terminating the turn.

    If this property was set at Agent creation, it will be used at runtime instead of the session's value.
    """
    return self._max_endpointing_delay

Maximum time-in-seconds the agent will wait before terminating the turn.

If this property was set at Agent creation, it will be used at runtime instead of the session's value.

prop mcp_servers : NotGivenOr[list[mcp.MCPServer] | None]

Expand source code

@property
def mcp_servers(self) -> NotGivenOr[list[mcp.MCPServer] | None]:
    """
    Retrieves the list of Model Context Protocol (MCP) servers providing external tools.

    If this property was not set at Agent creation, but an ``AgentSession`` provides MCP servers,
    the session's MCP servers will be used at runtime instead.

    Returns:
        NotGivenOr[list[mcp.MCPServer]]: An optional list of MCP servers.
    """  # noqa: E501
    return self._mcp_servers

Retrieves the list of Model Context Protocol (MCP) servers providing external tools.

If this property was not set at Agent creation, but an AgentSession provides MCP servers, the session's MCP servers will be used at runtime instead.

Returns

NotGivenOr[list[mcp.MCPServer]]: An optional list of MCP servers.

prop min_consecutive_speech_delay : NotGivenOr[float]

Expand source code

@property
def min_consecutive_speech_delay(self) -> NotGivenOr[float]:
    """
    Retrieves the minimum consecutive speech delay for the agent.

    If this property was not set at Agent creation, but an ``AgentSession`` provides a value for
    the minimum consecutive speech delay, the session's value will be used at runtime instead.

    Returns:
        NotGivenOr[float]: The minimum consecutive speech delay.
    """
    return self._min_consecutive_speech_delay

Retrieves the minimum consecutive speech delay for the agent.

If this property was not set at Agent creation, but an AgentSession provides a value for the minimum consecutive speech delay, the session's value will be used at runtime instead.

Returns

NotGivenOr[float]: The minimum consecutive speech delay.

prop min_endpointing_delay : NotGivenOr[float]

Expand source code

@property
def min_endpointing_delay(self) -> NotGivenOr[float]:
    """
    Minimum time-in-seconds the agent must wait after a potential end-of-utterance signal
    before it declares the user’s turn complete.

    If this property was set at Agent creation, it will be used at runtime instead of the session's value.
    """
    return self._min_endpointing_delay

Minimum time-in-seconds the agent must wait after a potential end-of-utterance signal before it declares the user’s turn complete.

If this property was set at Agent creation, it will be used at runtime instead of the session's value.

prop realtime_llm_session : llm.RealtimeSession

Expand source code

@property
def realtime_llm_session(self) -> llm.RealtimeSession:
    """
    Retrieve the realtime LLM session associated with the current agent.

    Raises:
        RuntimeError: If the agent is not running or the realtime LLM session is not available
    """
    if (rt_session := self._get_activity_or_raise().realtime_llm_session) is None:
        raise RuntimeError("no realtime LLM session")

    return rt_session

Retrieve the realtime LLM session associated with the current agent.

Raises

RuntimeError: If the agent is not running or the realtime LLM session is not available

prop session : AgentSession

Expand source code

@property
def session(self) -> AgentSession:
    """
    Retrieve the VoiceAgent associated with the current agent.

    Raises:
        RuntimeError: If the agent is not running
    """
    return self._get_activity_or_raise().session

Retrieve the VoiceAgent associated with the current agent.

Raises

RuntimeError: If the agent is not running

prop stt : NotGivenOr[stt.STT | None]

Expand source code

@property
def stt(self) -> NotGivenOr[stt.STT | None]:
    """
    Retrieves the Speech-To-Text component for the agent.

    If this property was not set at Agent creation, but an ``AgentSession`` provides an STT component,
    the session's STT will be used at runtime instead.

    Returns:
        NotGivenOr[stt.STT | None]: An optional STT component.
    """  # noqa: E501
    return self._stt

Retrieves the Speech-To-Text component for the agent.

If this property was not set at Agent creation, but an AgentSession provides an STT component, the session's STT will be used at runtime instead.

Returns

NotGivenOr[stt.STT | None]: An optional STT component.

prop tools : list[llm.FunctionTool | llm.RawFunctionTool]

Expand source code

@property
def tools(self) -> list[llm.FunctionTool | llm.RawFunctionTool]:
    """
    Returns:
        list[llm.FunctionTool | llm.RawFunctionTool]:
            A list of function tools available to the agent.
    """
    return self._tools.copy()

Returns

list[llm.FunctionTool | llm.RawFunctionTool]: A list of function tools available to the agent.

prop tts : NotGivenOr[tts.TTS | None]

Expand source code

@property
def tts(self) -> NotGivenOr[tts.TTS | None]:
    """
    Retrieves the Text-To-Speech component for the agent.

    If this property was not set at Agent creation, but an ``AgentSession`` provides a TTS component,
    the session's TTS will be used at runtime instead.

    Returns:
        NotGivenOr[tts.TTS | None]: An optional TTS component for generating audio output.
    """  # noqa: E501
    return self._tts

Retrieves the Text-To-Speech component for the agent.

If this property was not set at Agent creation, but an AgentSession provides a TTS component, the session's TTS will be used at runtime instead.

Returns

NotGivenOr[tts.TTS | None]: An optional TTS component for generating audio output.

prop turn_detection : NotGivenOr[TurnDetectionMode | None]

Expand source code

@property
def turn_detection(self) -> NotGivenOr[TurnDetectionMode | None]:
    """
    Retrieves the turn detection mode for identifying conversational turns.

    If this property was not set at Agent creation, but an ``AgentSession`` provides a turn detection,
    the session's turn detection mode will be used at runtime instead.

    Returns:
        NotGivenOr[TurnDetectionMode | None]: An optional turn detection mode for managing conversation flow.
    """  # noqa: E501
    return self._turn_detection

Retrieves the turn detection mode for identifying conversational turns.

If this property was not set at Agent creation, but an AgentSession provides a turn detection, the session's turn detection mode will be used at runtime instead.

Returns

NotGivenOr[TurnDetectionMode | None]: An optional turn detection mode for managing conversation flow.

prop use_tts_aligned_transcript : NotGivenOr[bool]

Expand source code

@property
def use_tts_aligned_transcript(self) -> NotGivenOr[bool]:
    """
    Indicates whether to use TTS-aligned transcript as the input of
    the ``transcription_node``.

    If this property was not set at Agent creation, but an ``AgentSession`` provides a value for
    the use of TTS-aligned transcript, the session's value will be used at runtime instead.

    Returns:
        NotGivenOr[bool]: Whether to use TTS-aligned transcript.
    """
    return self._use_tts_aligned_transcript

Indicates whether to use TTS-aligned transcript as the input of the transcription_node.

If this property was not set at Agent creation, but an AgentSession provides a value for the use of TTS-aligned transcript, the session's value will be used at runtime instead.

Returns

NotGivenOr[bool]: Whether to use TTS-aligned transcript.

prop vad : NotGivenOr[vad.VAD | None]

Expand source code

@property
def vad(self) -> NotGivenOr[vad.VAD | None]:
    """
    Retrieves the Voice Activity Detection component for the agent.

    If this property was not set at Agent creation, but an ``AgentSession`` provides a VAD component,
    the session's VAD will be used at runtime instead.

    Returns:
        NotGivenOr[vad.VAD | None]: An optional VAD component for detecting voice activity.
    """  # noqa: E501
    return self._vad

Retrieves the Voice Activity Detection component for the agent.

If this property was not set at Agent creation, but an AgentSession provides a VAD component, the session's VAD will be used at runtime instead.

Returns

NotGivenOr[vad.VAD | None]: An optional VAD component for detecting voice activity.

Methods

def llm_node(self, chat_ctx: llm.ChatContext, tools: list[FunctionTool | RawFunctionTool], model_settings: ModelSettings) ‑> collections.abc.AsyncIterable[livekit.agents.llm.llm.ChatChunk | str | livekit.agents.types.FlushSentinel] | collections.abc.Coroutine[typing.Any, typing.Any, collections.abc.AsyncIterable[livekit.agents.llm.llm.ChatChunk | str | livekit.agents.types.FlushSentinel]] | collections.abc.Coroutine[typing.Any, typing.Any, str] | collections.abc.Coroutine[typing.Any, typing.Any, livekit.agents.llm.llm.ChatChunk] | collections.abc.Coroutine[typing.Any, typing.Any, None]

Expand source code

def llm_node(
    self,
    chat_ctx: llm.ChatContext,
    tools: list[FunctionTool | RawFunctionTool],
    model_settings: ModelSettings,
) -> (
    AsyncIterable[llm.ChatChunk | str | FlushSentinel]
    | Coroutine[Any, Any, AsyncIterable[llm.ChatChunk | str | FlushSentinel]]
    | Coroutine[Any, Any, str]
    | Coroutine[Any, Any, llm.ChatChunk]
    | Coroutine[Any, Any, None]
):
    """
    A node in the processing pipeline that processes text generation with an LLM.

    By default, this node uses the agent's LLM to process the provided context. It may yield
    plain text (as `str`) for straightforward text generation, or `llm.ChatChunk` objects that
    can include text and optional tool calls. `ChatChunk` is helpful for capturing more complex
    outputs such as function calls, usage statistics, or other metadata.

    You can override this node to customize how the LLM is used or how tool invocations
    and responses are handled.

    Args:
        chat_ctx (llm.ChatContext): The context for the LLM (the conversation history).
        tools (list[FunctionTool]): A list of callable tools that the LLM may invoke.
        model_settings (ModelSettings): Configuration and parameters for model execution.

    Yields/Returns:
        str: Plain text output from the LLM.
        llm.ChatChunk: An object that can contain both text and optional tool calls.
    """
    return Agent.default.llm_node(self, chat_ctx, tools, model_settings)

A node in the processing pipeline that processes text generation with an LLM.

By default, this node uses the agent's LLM to process the provided context. It may yield plain text (as str) for straightforward text generation, or llm.ChatChunk objects that can include text and optional tool calls. ChatChunk is helpful for capturing more complex outputs such as function calls, usage statistics, or other metadata.

You can override this node to customize how the LLM is used or how tool invocations and responses are handled.

Args

chat_ctx : llm.ChatContext: The context for the LLM (the conversation history).
tools : list[FunctionTool]: A list of callable tools that the LLM may invoke.
model_settings : ModelSettings: Configuration and parameters for model execution.

Yields/Returns: str: Plain text output from the LLM. llm.ChatChunk: An object that can contain both text and optional tool calls.

async def on_enter(self) ‑> None

Expand source code

async def on_enter(self) -> None:
    """Called when the task is entered"""
    pass

Called when the task is entered

async def on_exit(self) ‑> None

Expand source code

async def on_exit(self) -> None:
    """Called when the task is exited"""
    pass

Called when the task is exited

async def on_user_turn_completed(self, turn_ctx: llm.ChatContext, new_message: llm.ChatMessage) ‑> None

Expand source code

async def on_user_turn_completed(
    self, turn_ctx: llm.ChatContext, new_message: llm.ChatMessage
) -> None:
    """Called when the user has finished speaking, and the LLM is about to respond

    This is a good opportunity to update the chat context or edit the new message before it is
    sent to the LLM.
    """
    pass

Called when the user has finished speaking, and the LLM is about to respond

This is a good opportunity to update the chat context or edit the new message before it is sent to the LLM.

def realtime_audio_output_node(self, audio: AsyncIterable[rtc.AudioFrame], model_settings: ModelSettings) ‑> collections.abc.AsyncIterable[AudioFrame] | collections.abc.Coroutine[typing.Any, typing.Any, collections.abc.AsyncIterable[AudioFrame]] | collections.abc.Coroutine[typing.Any, typing.Any, None]

Expand source code

def realtime_audio_output_node(
    self, audio: AsyncIterable[rtc.AudioFrame], model_settings: ModelSettings
) -> (
    AsyncIterable[rtc.AudioFrame]
    | Coroutine[Any, Any, AsyncIterable[rtc.AudioFrame]]
    | Coroutine[Any, Any, None]
):
    """A node processing the audio from the realtime LLM session before it is played out."""
    return Agent.default.realtime_audio_output_node(self, audio, model_settings)

A node processing the audio from the realtime LLM session before it is played out.

def stt_node(self, audio: AsyncIterable[rtc.AudioFrame], model_settings: ModelSettings) ‑> collections.abc.AsyncIterable[livekit.agents.stt.stt.SpeechEvent | str] | collections.abc.Coroutine[typing.Any, typing.Any, collections.abc.AsyncIterable[livekit.agents.stt.stt.SpeechEvent | str]] | collections.abc.Coroutine[typing.Any, typing.Any, None]

Expand source code

def stt_node(
    self, audio: AsyncIterable[rtc.AudioFrame], model_settings: ModelSettings
) -> (
    AsyncIterable[stt.SpeechEvent | str]
    | Coroutine[Any, Any, AsyncIterable[stt.SpeechEvent | str]]
    | Coroutine[Any, Any, None]
):
    """
    A node in the processing pipeline that transcribes audio frames into speech events.

    By default, this node uses a Speech-To-Text (STT) capability from the current agent.
    If the STT implementation does not support streaming natively, a VAD (Voice Activity
    Detection) mechanism is required to wrap the STT.

    You can override this node with your own implementation for more flexibility (e.g.,
    custom pre-processing of audio, additional buffering, or alternative STT strategies).

    Args:
        audio (AsyncIterable[rtc.AudioFrame]): An asynchronous stream of audio frames.
        model_settings (ModelSettings): Configuration and parameters for model execution.

    Yields:
        stt.SpeechEvent: An event containing transcribed text or other STT-related data.
    """
    return Agent.default.stt_node(self, audio, model_settings)

A node in the processing pipeline that transcribes audio frames into speech events.

By default, this node uses a Speech-To-Text (STT) capability from the current agent. If the STT implementation does not support streaming natively, a VAD (Voice Activity Detection) mechanism is required to wrap the STT.

You can override this node with your own implementation for more flexibility (e.g., custom pre-processing of audio, additional buffering, or alternative STT strategies).

Args

audio : AsyncIterable[rtc.AudioFrame]: An asynchronous stream of audio frames.
model_settings : ModelSettings: Configuration and parameters for model execution.

Yields

stt.SpeechEvent: An event containing transcribed text or other STT-related data.

Expand source code

def transcription_node(
    self, text: AsyncIterable[str | TimedString], model_settings: ModelSettings
) -> (
    AsyncIterable[str | TimedString]
    | Coroutine[Any, Any, AsyncIterable[str | TimedString]]
    | Coroutine[Any, Any, None]
):
    """
    A node in the processing pipeline that finalizes transcriptions from text segments.

    This node can be used to adjust or post-process text coming from an LLM (or any other
    source) into a final transcribed form. For instance, you might clean up formatting, fix
    punctuation, or perform any other text transformations here.

    You can override this node to customize post-processing logic according to your needs.

    Args:
        text (AsyncIterable[str | TimedString]): An asynchronous stream of text segments.
        model_settings (ModelSettings): Configuration and parameters for model execution.

    Yields:
        str: Finalized or post-processed text segments.
    """
    return Agent.default.transcription_node(self, text, model_settings)

A node in the processing pipeline that finalizes transcriptions from text segments.

This node can be used to adjust or post-process text coming from an LLM (or any other source) into a final transcribed form. For instance, you might clean up formatting, fix punctuation, or perform any other text transformations here.

You can override this node to customize post-processing logic according to your needs.

Args

text : AsyncIterable[str | TimedString]: An asynchronous stream of text segments.
model_settings : ModelSettings: Configuration and parameters for model execution.

Yields

str: Finalized or post-processed text segments.

def tts_node(self, text: AsyncIterable[str], model_settings: ModelSettings) ‑> collections.abc.AsyncIterable[AudioFrame] | collections.abc.Coroutine[typing.Any, typing.Any, collections.abc.AsyncIterable[AudioFrame]] | collections.abc.Coroutine[typing.Any, typing.Any, None]

Expand source code

def tts_node(
    self, text: AsyncIterable[str], model_settings: ModelSettings
) -> (
    AsyncIterable[rtc.AudioFrame]
    | Coroutine[Any, Any, AsyncIterable[rtc.AudioFrame]]
    | Coroutine[Any, Any, None]
):
    """
    A node in the processing pipeline that synthesizes audio from text segments.

    By default, this node converts incoming text into audio frames using the Text-To-Speech
    from the agent.
    If the TTS implementation does not support streaming natively, it uses a sentence tokenizer
    to split text for incremental synthesis.

    You can override this node to provide different text chunking behavior, a custom TTS engine,
    or any other specialized processing.

    Args:
        text (AsyncIterable[str]): An asynchronous stream of text segments to be synthesized.
        model_settings (ModelSettings): Configuration and parameters for model execution.

    Yields:
        rtc.AudioFrame: Audio frames synthesized from the provided text.
    """
    return Agent.default.tts_node(self, text, model_settings)

A node in the processing pipeline that synthesizes audio from text segments.

By default, this node converts incoming text into audio frames using the Text-To-Speech from the agent. If the TTS implementation does not support streaming natively, it uses a sentence tokenizer to split text for incremental synthesis.

You can override this node to provide different text chunking behavior, a custom TTS engine, or any other specialized processing.

Args

text : AsyncIterable[str]: An asynchronous stream of text segments to be synthesized.
model_settings : ModelSettings: Configuration and parameters for model execution.

Yields

rtc.AudioFrame: Audio frames synthesized from the provided text.

async def update_chat_ctx(self, chat_ctx: llm.ChatContext, *, exclude_invalid_function_calls: bool = True) ‑> None

Expand source code

async def update_chat_ctx(
    self, chat_ctx: llm.ChatContext, *, exclude_invalid_function_calls: bool = True
) -> None:
    """
    Updates the agent's chat context.

    If the agent is running in realtime mode, this method also updates
    the chat context for the ongoing realtime session.

    Args:
        chat_ctx (llm.ChatContext):
            The new or updated chat context for the agent.
        exclude_invalid_function_calls (bool): Whether to exclude function calls
            and outputs not from the agent's tools.

    Raises:
        llm.RealtimeError: If updating the realtime session chat context fails.
    """
    if self._activity is None:
        self._chat_ctx = chat_ctx.copy(
            tools=self._tools if exclude_invalid_function_calls else NOT_GIVEN
        )
        return

    await self._activity.update_chat_ctx(
        chat_ctx, exclude_invalid_function_calls=exclude_invalid_function_calls
    )

Updates the agent's chat context.

If the agent is running in realtime mode, this method also updates the chat context for the ongoing realtime session.

Args

chat_ctx (llm.ChatContext):
The new or updated chat context for the agent.
exclude_invalid_function_calls : bool: Whether to exclude function calls and outputs not from the agent's tools.

Raises

llm.RealtimeError: If updating the realtime session chat context fails.

async def update_instructions(self, instructions: str) ‑> None

Expand source code

async def update_instructions(self, instructions: str) -> None:
    """
    Updates the agent's instructions.

    If the agent is running in realtime mode, this method also updates
    the instructions for the ongoing realtime session.

    Args:
        instructions (str):
            The new instructions to set for the agent.

    Raises:
        llm.RealtimeError: If updating the realtime session instructions fails.
    """
    if self._activity is None:
        self._instructions = instructions
        return

    await self._activity.update_instructions(instructions)

Updates the agent's instructions.

If the agent is running in realtime mode, this method also updates the instructions for the ongoing realtime session.

Args

instructions (str): The new instructions to set for the agent.

Raises

llm.RealtimeError: If updating the realtime session instructions fails.

async def update_tools(self, tools: list[llm.FunctionTool | llm.RawFunctionTool]) ‑> None

Expand source code

async def update_tools(self, tools: list[llm.FunctionTool | llm.RawFunctionTool]) -> None:
    """
    Updates the agent's available function tools.

    If the agent is running in realtime mode, this method also updates
    the tools for the ongoing realtime session.

    Args:
        tools (list[llm.FunctionTool]):
            The new list of function tools available to the agent.

    Raises:
        llm.RealtimeError: If updating the realtime session tools fails.
    """
    invalid = [t for t in tools if not (is_function_tool(t) or is_raw_function_tool(t))]
    if invalid:
        kinds = ", ".join(sorted({type(t).__name__ for t in invalid}))
        raise TypeError(
            f"Invalid tool type(s): {kinds}. Expected FunctionTool or RawFunctionTool."
        )

    if self._activity is None:
        self._tools = list(set(tools))
        self._chat_ctx = self._chat_ctx.copy(tools=self._tools)
        return

    await self._activity.update_tools(tools)

Updates the agent's available function tools.

If the agent is running in realtime mode, this method also updates the tools for the ongoing realtime session.

Args

tools (list[llm.FunctionTool]): The new list of function tools available to the agent.

Raises

llm.RealtimeError: If updating the realtime session tools fails.

class AgentFalseInterruptionEvent (**data: Any)

Expand source code

class AgentFalseInterruptionEvent(BaseModel):
    type: Literal["agent_false_interruption"] = "agent_false_interruption"
    resumed: bool
    """Whether the false interruption was resumed automatically."""
    created_at: float = Field(default_factory=time.time)

    # deprecated
    message: ChatMessage | None = None
    extra_instructions: str | None = None

    def __getattribute__(self, name: str) -> Any:
        if name in ["message", "extra_instructions"]:
            logger.warning(
                f"AgentFalseInterruptionEvent.{name} is deprecated, automatic resume is now supported"
            )
        return super().__getattribute__(name)

Usage docs: https://docs.pydantic.dev/2.10/concepts/models/

A base class for creating Pydantic models.

Attributes

__class_vars__: The names of the class variables defined on the model.
__private_attributes__: Metadata about the private attributes of the model.
__signature__: The synthesized __init__ [Signature][inspect.Signature] of the model.
__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom __init__ function.
__pydantic_decorators__: Metadata containing the decorators defined on the model. This replaces Model.__validators__ and Model.__root_validators__ from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to args, origin, parameters in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [RootModel][pydantic.root_model.RootModel].
__pydantic_serializer__: The pydantic-core SchemaSerializer used to dump instances of the model.
__pydantic_validator__: The pydantic-core SchemaValidator used to validate instances of the model.
__pydantic_fields__: A dictionary of field names and their corresponding [FieldInfo][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [ComputedFieldInfo][pydantic.fields.ComputedFieldInfo] objects.
__pydantic_extra__: A dictionary containing extra values, if [extra][pydantic.config.ConfigDict.extra] is set to 'allow'.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

Ancestors

pydantic.main.BaseModel

Class variables

var created_at : float
var extra_instructions : str | None
var message : livekit.agents.llm.chat_context.ChatMessage | None
var model_config
var resumed : bool: Whether the false interruption was resumed automatically.
var type : Literal['agent_false_interruption']

class AgentSession (*, turn_detection: NotGivenOr[TurnDetectionMode] = NOT_GIVEN, stt: NotGivenOr[stt.STT | STTModels | str] = NOT_GIVEN, vad: NotGivenOr[vad.VAD] = NOT_GIVEN, llm: NotGivenOr[llm.LLM | llm.RealtimeModel | LLMModels | str] = NOT_GIVEN, tts: NotGivenOr[tts.TTS | TTSModels | str] = NOT_GIVEN, tools: NotGivenOr[list[llm.FunctionTool | llm.RawFunctionTool]] = NOT_GIVEN, mcp_servers: NotGivenOr[list[mcp.MCPServer]] = NOT_GIVEN, userdata: NotGivenOr[Userdata_T] = NOT_GIVEN, allow_interruptions: bool = True, discard_audio_if_uninterruptible: bool = True, min_interruption_duration: float = 0.5, min_interruption_words: int = 0, min_endpointing_delay: float = 0.5, max_endpointing_delay: float = 3.0, max_tool_steps: int = 3, video_sampler: NotGivenOr[_VideoSampler | None] = NOT_GIVEN, user_away_timeout: float | None = 15.0, false_interruption_timeout: float | None = 2.0, resume_false_interruption: bool = True, min_consecutive_speech_delay: float = 0.0, use_tts_aligned_transcript: NotGivenOr[bool] = NOT_GIVEN, tts_text_transforms: NotGivenOr[Sequence[TextTransforms] | None] = NOT_GIVEN, preemptive_generation: bool = False, ivr_detection: bool = False, conn_options: NotGivenOr[SessionConnectOptions] = NOT_GIVEN, loop: asyncio.AbstractEventLoop | None = None, agent_false_interruption_timeout: NotGivenOr[float | None] = NOT_GIVEN)

Expand source code

class AgentSession(rtc.EventEmitter[EventTypes], Generic[Userdata_T]):
    def __init__(
        self,
        *,
        turn_detection: NotGivenOr[TurnDetectionMode] = NOT_GIVEN,
        stt: NotGivenOr[stt.STT | STTModels | str] = NOT_GIVEN,
        vad: NotGivenOr[vad.VAD] = NOT_GIVEN,
        llm: NotGivenOr[llm.LLM | llm.RealtimeModel | LLMModels | str] = NOT_GIVEN,
        tts: NotGivenOr[tts.TTS | TTSModels | str] = NOT_GIVEN,
        tools: NotGivenOr[list[llm.FunctionTool | llm.RawFunctionTool]] = NOT_GIVEN,
        mcp_servers: NotGivenOr[list[mcp.MCPServer]] = NOT_GIVEN,
        userdata: NotGivenOr[Userdata_T] = NOT_GIVEN,
        allow_interruptions: bool = True,
        discard_audio_if_uninterruptible: bool = True,
        min_interruption_duration: float = 0.5,
        min_interruption_words: int = 0,
        min_endpointing_delay: float = 0.5,
        max_endpointing_delay: float = 3.0,
        max_tool_steps: int = 3,
        video_sampler: NotGivenOr[_VideoSampler | None] = NOT_GIVEN,
        user_away_timeout: float | None = 15.0,
        false_interruption_timeout: float | None = 2.0,
        resume_false_interruption: bool = True,
        min_consecutive_speech_delay: float = 0.0,
        use_tts_aligned_transcript: NotGivenOr[bool] = NOT_GIVEN,
        tts_text_transforms: NotGivenOr[Sequence[TextTransforms] | None] = NOT_GIVEN,
        preemptive_generation: bool = False,
        ivr_detection: bool = False,
        conn_options: NotGivenOr[SessionConnectOptions] = NOT_GIVEN,
        loop: asyncio.AbstractEventLoop | None = None,
        # deprecated
        agent_false_interruption_timeout: NotGivenOr[float | None] = NOT_GIVEN,
    ) -> None:
        """`AgentSession` is the LiveKit Agents runtime that glues together
        media streams, speech/LLM components, and tool orchestration into a
        single real-time voice agent.

        It links audio, video, and text I/O with STT, VAD, TTS, and the LLM;
        handles turn detection, endpointing, interruptions, and multi-step
        tool calls; and exposes everything through event callbacks so you can
        focus on writing function tools and simple hand-offs rather than
        low-level streaming logic.

        Args:
            turn_detection (TurnDetectionMode, optional): Strategy for deciding
                when the user has finifshed speaking.

                * ``"stt"`` – rely on speech-to-text end-of-utterance cues
                * ``"vad"`` – rely on Voice Activity Detection start/stop cues
                * ``"realtime_llm"`` – use server-side detection from a
                  realtime LLM
                * ``"manual"`` – caller controls turn boundaries explicitly
                * ``_TurnDetector`` instance – plug-in custom detector

                If *NOT_GIVEN*, the session chooses the best available mode in
                priority order ``realtime_llm → vad → stt → manual``; it
                automatically falls back if the necessary model is missing.
            stt (stt.STT | str, optional): Speech-to-text backend.
            vad (vad.VAD, optional): Voice-activity detector
            llm (llm.LLM | llm.RealtimeModel | str, optional): LLM or RealtimeModel
            tts (tts.TTS | str, optional): Text-to-speech engine.
            tools (list[llm.FunctionTool | llm.RawFunctionTool], optional): List of
                tools shared by every agent in the agent session.
            mcp_servers (list[mcp.MCPServer], optional): List of MCP servers
                providing external tools for the agent to use.
            userdata (Userdata_T, optional): Arbitrary per-session user data.
            allow_interruptions (bool): Whether the user can interrupt the
                agent mid-utterance. Default ``True``.
            discard_audio_if_uninterruptible (bool): When ``True``, buffered
                audio is dropped while the agent is speaking and cannot be
                interrupted. Default ``True``.
            min_interruption_duration (float): Minimum speech length (s) to
                register as an interruption. Default ``0.5`` s.
            min_interruption_words (int): Minimum number of words to consider
                an interruption, only used if stt enabled. Default ``0``.
            min_endpointing_delay (float): Minimum time-in-seconds the agent
                must wait after a potential end-of-utterance signal (from VAD
                or an EOU model) before it declares the user’s turn complete.
                Default ``0.5`` s.
            max_endpointing_delay (float): Maximum time-in-seconds the agent
                will wait before terminating the turn. Default ``3.0`` s.
            max_tool_steps (int): Maximum consecutive tool calls per LLM turn.
                Default ``3``.
            video_sampler (_VideoSampler, optional): Uses
                :class:`VoiceActivityVideoSampler` when *NOT_GIVEN*; that sampler
                captures video at ~1 fps while the user is speaking and ~0.3 fps
                when silent by default.
            user_away_timeout (float, optional): If set, set the user state as
                "away" after this amount of time after user and agent are silent.
                Default ``15.0`` s, set to ``None`` to disable.
            false_interruption_timeout (float, optional): If set, emit an
                `agent_false_interruption` event after this amount of time if
                the user is silent and no user transcript is detected after
                the interruption. Set to ``None`` to disable. Default ``2.0`` s.
            resume_false_interruption (bool): Whether to resume the false interruption
                after the false_interruption_timeout. Default ``True``.
            min_consecutive_speech_delay (float, optional): The minimum delay between
                consecutive speech. Default ``0.0`` s.
            use_tts_aligned_transcript (bool, optional): Whether to use TTS-aligned
                transcript as the input of the ``transcription_node``. Only applies
                if ``TTS.capabilities.aligned_transcript`` is ``True`` or ``streaming``
                is ``False``. When NOT_GIVEN, it's disabled.
            tts_text_transforms (Sequence[TextTransforms], optional): The transforms to apply
                to the tts input text, available built-in transforms: ``"filter_markdown"``, ``"filter_emoji"``.
                Set to ``None`` to disable. When NOT_GIVEN, all filters will be applied.
            preemptive_generation (bool):
                Whether to speculatively begin LLM and TTS requests before an end-of-turn is
                detected. When True, the agent sends inference calls as soon as a user
                transcript is received rather than waiting for a definitive turn boundary. This
                can reduce response latency by overlapping model inference with user audio,
                but may incur extra compute if the user interrupts or revises mid-utterance.
                Defaults to ``False``.
            ivr_detection (bool): Whether to detect if the agent is interacting with an IVR system.
                Default ``False``.
            conn_options (SessionConnectOptions, optional): Connection options for
                stt, llm, and tts.
            loop (asyncio.AbstractEventLoop, optional): Event loop to bind the
                session to. Falls back to :pyfunc:`asyncio.get_event_loop()`.
        """
        super().__init__()
        self._loop = loop or asyncio.get_event_loop()

        if is_given(agent_false_interruption_timeout):
            logger.warning(
                "`agent_false_interruption_timeout` is deprecated, use `false_interruption_timeout` instead"  # noqa: E501
            )
            false_interruption_timeout = agent_false_interruption_timeout

        if not is_given(video_sampler):
            video_sampler = VoiceActivityVideoSampler(speaking_fps=1.0, silent_fps=0.3)

        self._video_sampler = video_sampler

        # This is the "global" chat_context, it holds the entire conversation history
        self._chat_ctx = ChatContext.empty()
        self._opts = AgentSessionOptions(
            allow_interruptions=allow_interruptions,
            discard_audio_if_uninterruptible=discard_audio_if_uninterruptible,
            min_interruption_duration=min_interruption_duration,
            min_interruption_words=min_interruption_words,
            min_endpointing_delay=min_endpointing_delay,
            max_endpointing_delay=max_endpointing_delay,
            max_tool_steps=max_tool_steps,
            user_away_timeout=user_away_timeout,
            false_interruption_timeout=false_interruption_timeout,
            resume_false_interruption=resume_false_interruption,
            min_consecutive_speech_delay=min_consecutive_speech_delay,
            tts_text_transforms=(
                tts_text_transforms
                if is_given(tts_text_transforms)
                else DEFAULT_TTS_TEXT_TRANSFORMS
            ),
            preemptive_generation=preemptive_generation,
            ivr_detection=ivr_detection,
            use_tts_aligned_transcript=use_tts_aligned_transcript
            if is_given(use_tts_aligned_transcript)
            else None,
        )
        self._conn_options = conn_options or SessionConnectOptions()
        self._started = False
        self._turn_detection = turn_detection or None

        if isinstance(stt, str):
            stt = inference.STT.from_model_string(stt)

        if isinstance(llm, str):
            llm = inference.LLM.from_model_string(llm)

        if isinstance(tts, str):
            tts = inference.TTS.from_model_string(tts)

        self._stt = stt or None
        self._vad = vad or None
        self._llm = llm or None
        self._tts = tts or None
        self._mcp_servers = mcp_servers or None
        self._tools = tools if is_given(tools) else []

        # unrecoverable error counts, reset after agent speaking
        self._llm_error_counts = 0
        self._tts_error_counts = 0

        # configurable IO
        self._input = io.AgentInput(self._on_video_input_changed, self._on_audio_input_changed)
        self._output = io.AgentOutput(
            self._on_video_output_changed,
            self._on_audio_output_changed,
            self._on_text_output_changed,
        )

        self._forward_audio_atask: asyncio.Task[None] | None = None
        self._forward_video_atask: asyncio.Task[None] | None = None
        self._update_activity_atask: asyncio.Task[None] | None = None
        self._activity_lock = asyncio.Lock()
        self._lock = asyncio.Lock()

        # used to keep a reference to the room io
        self._room_io: room_io.RoomIO | None = None
        self._recorder_io: RecorderIO | None = None

        self._agent: Agent | None = None
        self._activity: AgentActivity | None = None
        self._next_activity: AgentActivity | None = None
        self._user_state: UserState = "listening"
        self._agent_state: AgentState = "initializing"
        self._user_away_timer: asyncio.TimerHandle | None = None

        self._userdata: Userdata_T | None = userdata if is_given(userdata) else None
        self._closing_task: asyncio.Task[None] | None = None
        self._closing: bool = False
        self._job_context_cb_registered: bool = False

        self._global_run_state: RunResult | None = None

        # trace
        self._user_speaking_span: trace.Span | None = None
        self._agent_speaking_span: trace.Span | None = None
        self._session_span: trace.Span | None = None
        self._root_span_context: otel_context.Context | None = None
        self._session_ctx_token: Token[otel_context.Context] | None = None

        self._recorded_events: list[AgentEvent] = []
        self._enable_recording: bool = False
        self._started_at: float | None = None

        # ivr activity
        self._ivr_activity: IVRActivity | None = None

    def emit(self, event: EventTypes, arg: AgentEvent) -> None:  # type: ignore
        self._recorded_events.append(arg)
        super().emit(event, arg)

    @property
    def userdata(self) -> Userdata_T:
        if self._userdata is None:
            raise ValueError("AgentSession userdata is not set")

        return self._userdata

    @userdata.setter
    def userdata(self, value: Userdata_T) -> None:
        self._userdata = value

    @property
    def turn_detection(self) -> TurnDetectionMode | None:
        return self._turn_detection

    @property
    def mcp_servers(self) -> list[mcp.MCPServer] | None:
        return self._mcp_servers

    @property
    def input(self) -> io.AgentInput:
        return self._input

    @property
    def output(self) -> io.AgentOutput:
        return self._output

    @property
    def options(self) -> AgentSessionOptions:
        return self._opts

    @property
    def conn_options(self) -> SessionConnectOptions:
        return self._conn_options

    @property
    def history(self) -> llm.ChatContext:
        return self._chat_ctx

    @property
    def current_speech(self) -> SpeechHandle | None:
        return self._activity.current_speech if self._activity is not None else None

    @property
    def user_state(self) -> UserState:
        return self._user_state

    @property
    def agent_state(self) -> AgentState:
        return self._agent_state

    @property
    def current_agent(self) -> Agent:
        if self._agent is None:
            raise RuntimeError("VoiceAgent isn't running")

        return self._agent

    @property
    def tools(self) -> list[llm.FunctionTool | llm.RawFunctionTool]:
        return self._tools

    def run(self, *, user_input: str, output_type: type[Run_T] | None = None) -> RunResult[Run_T]:
        if self._global_run_state is not None and not self._global_run_state.done():
            raise RuntimeError("nested runs are not supported")

        run_state = RunResult(user_input=user_input, output_type=output_type)
        self._global_run_state = run_state
        self.generate_reply(user_input=user_input)
        return run_state

    @overload
    async def start(
        self,
        agent: Agent,
        *,
        capture_run: Literal[True],
        room: NotGivenOr[rtc.Room] = NOT_GIVEN,
        room_options: NotGivenOr[room_io.RoomOptions] = NOT_GIVEN,
        # deprecated
        room_input_options: NotGivenOr[room_io.RoomInputOptions] = NOT_GIVEN,
        room_output_options: NotGivenOr[room_io.RoomOutputOptions] = NOT_GIVEN,
        record: bool = True,
    ) -> RunResult: ...

    @overload
    async def start(
        self,
        agent: Agent,
        *,
        capture_run: Literal[False] = False,
        room: NotGivenOr[rtc.Room] = NOT_GIVEN,
        room_options: NotGivenOr[room_io.RoomOptions] = NOT_GIVEN,
        # deprecated
        room_input_options: NotGivenOr[room_io.RoomInputOptions] = NOT_GIVEN,
        room_output_options: NotGivenOr[room_io.RoomOutputOptions] = NOT_GIVEN,
        record: bool = True,
    ) -> None: ...

    async def start(
        self,
        agent: Agent,
        *,
        capture_run: bool = False,
        room: NotGivenOr[rtc.Room] = NOT_GIVEN,
        room_options: NotGivenOr[room_io.RoomOptions] = NOT_GIVEN,
        # deprecated
        room_input_options: NotGivenOr[room_io.RoomInputOptions] = NOT_GIVEN,
        room_output_options: NotGivenOr[room_io.RoomOutputOptions] = NOT_GIVEN,
        record: NotGivenOr[bool] = NOT_GIVEN,
    ) -> RunResult | None:
        """Start the voice agent.

        Create a default RoomIO if the input or output audio is not already set.
        If the console flag is provided, start a ChatCLI.

        Args:
            capture_run: Whether to return a RunResult and capture the run result during session start.
            room: The room to use for input and output
            room_input_options: Options for the room input
            room_output_options: Options for the room output
            record: Whether to record the audio
        """
        async with self._lock:
            if self._started:
                return None

            self._started_at = time.time()

            # configure observability first
            job_ctx: JobContext | None = None
            try:
                job_ctx = get_job_context()
                if not is_given(record):
                    record = job_ctx.job.enable_recording

                self._enable_recording = record

                if self._enable_recording:
                    job_ctx.init_recording()

            except RuntimeError:
                # JobContext is not available in evals
                pass

            self._session_span = current_span = tracer.start_span("agent_session")
            # we detach here to avoid context issues since tokens need to be detached
            # in the same context as it was created
            if self._session_ctx_token is not None:
                otel_context.detach(self._session_ctx_token)
                self._session_ctx_token = None
            ctx = trace.set_span_in_context(current_span)
            self._session_ctx_token = otel_context.attach(ctx)

            self._recorded_events = []
            self._room_io = None
            self._recorder_io = None

            self._closing = False
            self._root_span_context = otel_context.get_current()
            current_span = trace.get_current_span()
            current_span.set_attribute(trace_types.ATTR_AGENT_LABEL, agent.label)

            self._agent = agent
            self._update_agent_state("initializing")

            tasks: list[asyncio.Task[None]] = []

            c = cli.AgentsConsole.get_instance()
            if c.enabled and not c.io_acquired:
                if self.input.audio is not None or self.output.audio is not None:
                    logger.warning(
                        "agent started with the console subcommand, but input.audio/output.audio "
                        "is already set, overriding..."
                    )

                c.acquire_io(loop=self._loop, session=self)
            elif is_given(room) and not self._room_io:
                room_options = room_io.RoomOptions._ensure_options(
                    room_options,
                    room_input_options=room_input_options,
                    room_output_options=room_output_options,
                )
                room_options = copy.copy(room_options)  # shadow copy is enough

                if self.input.audio is not None:
                    if room_options.audio_input:
                        logger.warning(
                            "RoomIO audio input is enabled but input.audio is already set, ignoring.."  # noqa: E501
                        )
                    room_options.audio_input = False

                if self.output.audio is not None:
                    if room_options.audio_output:
                        logger.warning(
                            "RoomIO audio output is enabled but output.audio is already set, ignoring.."  # noqa: E501
                        )
                    room_options.audio_output = False

                if self.output.transcription is not None:
                    if room_options.text_output:
                        logger.warning(
                            "RoomIO transcription output is enabled but output.transcription is already set, ignoring.."  # noqa: E501
                        )
                    room_options.text_output = False

                self._room_io = room_io.RoomIO(room=room, agent_session=self, options=room_options)
                await self._room_io.start()

            if job_ctx:
                # these aren't relevant during eval mode, as they require job context and/or room_io
                if self.input.audio and self.output.audio:
                    if self._enable_recording:
                        self._recorder_io = RecorderIO(agent_session=self)
                        self.input.audio = self._recorder_io.record_input(self.input.audio)
                        self.output.audio = self._recorder_io.record_output(self.output.audio)

                        if (c.enabled and c.record) or not c.enabled:
                            task = asyncio.create_task(
                                self._recorder_io.start(
                                    output_path=job_ctx.session_directory / "audio.ogg"
                                )
                            )
                            tasks.append(task)

                if job_ctx._primary_agent_session is None:
                    job_ctx._primary_agent_session = self
                elif self._enable_recording:
                    raise RuntimeError(
                        "Only one `AgentSession` can be the primary at a time. "
                        "If you want to ignore primary designation, use session.start(record=False)."
                    )

                if self.options.ivr_detection:
                    self._ivr_activity = IVRActivity(self)

                    # inject the IVR activity tools into the session tools
                    self._tools.extend(self._ivr_activity.tools)

                    tasks.append(
                        asyncio.create_task(self._ivr_activity.start(), name="_ivr_activity_start")
                    )

                current_span.set_attribute(trace_types.ATTR_ROOM_NAME, job_ctx.room.name)
                current_span.set_attribute(trace_types.ATTR_JOB_ID, job_ctx.job.id)
                current_span.set_attribute(trace_types.ATTR_AGENT_NAME, job_ctx.job.agent_name)
                if self._room_io:
                    # automatically connect to the room when room io is used
                    tasks.append(asyncio.create_task(job_ctx.connect(), name="_job_ctx_connect"))

                # session can be restarted, register the callbacks only once
                if not self._job_context_cb_registered:
                    job_ctx.add_shutdown_callback(
                        lambda: self._aclose_impl(reason=CloseReason.JOB_SHUTDOWN)
                    )
                    self._job_context_cb_registered = True

            run_state: RunResult | None = None
            if capture_run:
                if self._global_run_state is not None and not self._global_run_state.done():
                    raise RuntimeError("nested runs are not supported")

                run_state = RunResult(output_type=None)
                self._global_run_state = run_state

            # it is ok to await it directly, there is no previous task to drain
            tasks.append(
                asyncio.create_task(self._update_activity(self._agent, wait_on_enter=False))
            )

            try:
                await asyncio.gather(*tasks)
            finally:
                await utils.aio.cancel_and_wait(*tasks)

            # important: no await should be done after this!

            if self.input.audio is not None:
                self._forward_audio_atask = asyncio.create_task(
                    self._forward_audio_task(), name="_forward_audio_task"
                )

            if self.input.video is not None:
                self._forward_video_atask = asyncio.create_task(
                    self._forward_video_task(), name="_forward_video_task"
                )

            self._started = True
            self._update_agent_state("listening")
            if self._room_io and self._room_io.subscribed_fut:

                def on_room_io_subscribed(_: asyncio.Future[None]) -> None:
                    if self._user_state == "listening" and self._agent_state == "listening":
                        self._set_user_away_timer()

                self._room_io.subscribed_fut.add_done_callback(on_room_io_subscribed)

            # log used IO
            def _collect_source(
                inp: io.AudioInput | io.VideoInput | None,
            ) -> list[io.AudioInput | io.VideoInput]:
                return [] if inp is None else [inp] + _collect_source(inp.source)

            def _collect_chain(
                out: io.TextOutput | io.VideoOutput | io.AudioOutput | None,
            ) -> list[io.VideoOutput | io.AudioOutput | io.TextOutput]:
                return [] if out is None else [out] + _collect_chain(out.next_in_chain)

            audio_input = _collect_source(self.input.audio)[::-1]
            video_input = _collect_source(self.input.video)[::-1]

            audio_output = _collect_chain(self.output.audio)
            video_output = _collect_chain(self.output.video)
            transcript_output = _collect_chain(self.output.transcription)

            logger.debug(
                "using audio io: %s -> `AgentSession` -> %s",
                " -> ".join([f"`{out.label}`" for out in audio_input]) or "(none)",
                " -> ".join([f"`{out.label}`" for out in audio_output]) or "(none)",
            )
            if (
                self._opts.resume_false_interruption
                and self.output.audio
                and not self.output.audio.can_pause
            ):
                logger.warning(
                    "resume_false_interruption is enabled but audio output does not support pause, it will be ignored",
                    extra={"audio_output": self.output.audio.label},
                )

            logger.debug(
                "using transcript io: `AgentSession` -> %s",
                " -> ".join([f"`{out.label}`" for out in transcript_output]) or "(none)",
            )

            if video_input or video_output:
                logger.debug(
                    "using video io: %s > `AgentSession` > %s",
                    " -> ".join([f"`{out.label}`" for out in video_input]) or "(none)",
                    " -> ".join([f"`{out.label}`" for out in video_output]) or "(none)",
                )

            if run_state:
                await run_state

            return run_state

    async def drain(self) -> None:
        if self._activity is None:
            raise RuntimeError("AgentSession isn't running")

        await self._activity.drain()

    @property
    def room_io(self) -> room_io.RoomIO:
        if not self._room_io:
            raise RuntimeError(
                "Cannot access room_io: the AgentSession was not started with a room."
            )

        return self._room_io

    def _close_soon(
        self,
        *,
        reason: CloseReason,
        drain: bool = False,
        error: llm.LLMError | stt.STTError | tts.TTSError | llm.RealtimeModelError | None = None,
    ) -> None:
        if self._closing_task:
            return
        self._closing_task = asyncio.create_task(
            self._aclose_impl(error=error, drain=drain, reason=reason)
        )

    def shutdown(self, *, drain: bool = True) -> None:
        self._close_soon(error=None, drain=drain, reason=CloseReason.USER_INITIATED)

    @utils.log_exceptions(logger=logger)
    async def _aclose_impl(
        self,
        *,
        reason: CloseReason,
        drain: bool = False,
        error: llm.LLMError | stt.STTError | tts.TTSError | llm.RealtimeModelError | None = None,
    ) -> None:
        if self._root_span_context:
            # make `activity.drain` and `on_exit` under the root span
            otel_context.attach(self._root_span_context)

        async with self._lock:
            if not self._started:
                return

            self._closing = True
            self._cancel_user_away_timer()

            if self._activity is not None:
                if not drain:
                    try:
                        await self._activity.interrupt()
                    except RuntimeError:
                        # uninterruptible speech
                        # TODO(long): force interrupt or wait for it to finish?
                        # it might be an audio played from the error callback
                        pass
                await self._activity.drain()

                # wait any uninterruptible speech to finish
                if self._activity.current_speech:
                    await self._activity.current_speech

                # detach the inputs and outputs
                self.input.audio = None
                self.input.video = None
                self.output.audio = None
                self.output.transcription = None

                if (
                    reason != CloseReason.ERROR
                    and (audio_recognition := self._activity._audio_recognition) is not None
                ):
                    # wait for the user transcript to be committed
                    audio_recognition.commit_user_turn(audio_detached=True, transcript_timeout=2.0)

                await self._activity.aclose()
                self._activity = None

            if self._agent_speaking_span:
                self._agent_speaking_span.end()
                self._agent_speaking_span = None

            if self._user_speaking_span:
                self._user_speaking_span.end()
                self._user_speaking_span = None

            if self._forward_audio_atask is not None:
                await utils.aio.cancel_and_wait(self._forward_audio_atask)

            if self._recorder_io:
                await self._recorder_io.aclose()

            if self._ivr_activity is not None:
                await self._ivr_activity.aclose()

            if self._session_span:
                self._session_span.end()
                self._session_span = None

            self._started = False

            self.emit("close", CloseEvent(error=error, reason=reason))

            if self._room_io:
                # close room io after close event is emitted, ensure the room io's close callback is called
                await self._room_io.aclose()

            self._cancel_user_away_timer()
            self._user_state = "listening"
            self._agent_state = "initializing"
            self._llm_error_counts = 0
            self._tts_error_counts = 0
            self._root_span_context = None

            # close room io after close event is emitted
            if self._room_io:
                await self._room_io.aclose()
                self._room_io = None

        logger.debug("session closed", extra={"reason": reason.value, "error": error})

    async def aclose(self) -> None:
        await self._aclose_impl(reason=CloseReason.USER_INITIATED)

    def update_options(
        self,
        *,
        min_endpointing_delay: NotGivenOr[float] = NOT_GIVEN,
        max_endpointing_delay: NotGivenOr[float] = NOT_GIVEN,
        turn_detection: NotGivenOr[TurnDetectionMode | None] = NOT_GIVEN,
    ) -> None:
        """
        Update the options for the agent session.

        Args:
            min_endpointing_delay (NotGivenOr[float], optional): The minimum endpointing delay.
            max_endpointing_delay (NotGivenOr[float], optional): The maximum endpointing delay.
            turn_detection (NotGivenOr[TurnDetectionMode | None], optional): Strategy for deciding
                when the user has finished speaking. ``None`` reverts to automatic selection.
        """
        if is_given(min_endpointing_delay):
            self._opts.min_endpointing_delay = min_endpointing_delay
        if is_given(max_endpointing_delay):
            self._opts.max_endpointing_delay = max_endpointing_delay

        if is_given(turn_detection):
            self._turn_detection = cast(Optional[TurnDetectionMode], turn_detection)

        if self._activity is not None:
            self._activity.update_options(
                min_endpointing_delay=min_endpointing_delay,
                max_endpointing_delay=max_endpointing_delay,
                turn_detection=turn_detection,
            )

    def say(
        self,
        text: str | AsyncIterable[str],
        *,
        audio: NotGivenOr[AsyncIterable[rtc.AudioFrame]] = NOT_GIVEN,
        allow_interruptions: NotGivenOr[bool] = NOT_GIVEN,
        add_to_chat_ctx: bool = True,
    ) -> SpeechHandle:
        if self._activity is None:
            raise RuntimeError("AgentSession isn't running")

        run_state = self._global_run_state
        activity = self._next_activity if self._activity.scheduling_paused else self._activity

        if activity is None:
            raise RuntimeError("AgentSession is closing, cannot use say()")

        # attach to the session span if called outside of the AgentSession
        use_span: AbstractContextManager[trace.Span | None] = nullcontext()
        if trace.get_current_span() is trace.INVALID_SPAN and self._session_span is not None:
            use_span = trace.use_span(self._session_span, end_on_exit=False)

        with use_span:
            handle = activity.say(
                text,
                audio=audio,
                allow_interruptions=allow_interruptions,
                add_to_chat_ctx=add_to_chat_ctx,
            )
            if run_state:
                run_state._watch_handle(handle)

        return handle

    def generate_reply(
        self,
        *,
        user_input: NotGivenOr[str] = NOT_GIVEN,
        instructions: NotGivenOr[str] = NOT_GIVEN,
        tool_choice: NotGivenOr[llm.ToolChoice] = NOT_GIVEN,
        allow_interruptions: NotGivenOr[bool] = NOT_GIVEN,
        chat_ctx: NotGivenOr[ChatContext] = NOT_GIVEN,
    ) -> SpeechHandle:
        """Generate a reply for the agent to speak to the user.

        Args:
            user_input (NotGivenOr[str], optional): The user's input that may influence the reply,
                such as answering a question.
            instructions (NotGivenOr[str], optional): Additional instructions for generating the reply.
            tool_choice (NotGivenOr[llm.ToolChoice], optional): Specifies the external tool to use when
                generating the reply. If generate_reply is invoked within a function_tool, defaults to "none".
            allow_interruptions (NotGivenOr[bool], optional): Indicates whether the user can interrupt this speech.

        Returns:
            SpeechHandle: A handle to the generated reply.
        """  # noqa: E501
        if self._activity is None:
            raise RuntimeError("AgentSession isn't running")

        user_message = (
            llm.ChatMessage(role="user", content=[user_input])
            if is_given(user_input)
            else NOT_GIVEN
        )

        run_state = self._global_run_state
        activity = self._next_activity if self._activity.scheduling_paused else self._activity

        if activity is None:
            raise RuntimeError("AgentSession is closing, cannot use generate_reply()")

        # attach to the session span if called outside of the AgentSession
        use_span: AbstractContextManager[trace.Span | None] = nullcontext()
        if trace.get_current_span() is trace.INVALID_SPAN and self._session_span is not None:
            use_span = trace.use_span(self._session_span, end_on_exit=False)

        with use_span:
            handle = activity._generate_reply(
                user_message=user_message if user_message else None,
                instructions=instructions,
                tool_choice=tool_choice,
                allow_interruptions=allow_interruptions,
                chat_ctx=chat_ctx,
            )
            if run_state:
                run_state._watch_handle(handle)

        return handle

    def interrupt(self, *, force: bool = False) -> asyncio.Future[None]:
        """Interrupt the current speech generation.

        Returns:
            An asyncio.Future that completes when the interruption is fully processed
            and chat context has been updated.
        """
        if self._activity is None:
            raise RuntimeError("AgentSession isn't running")

        return self._activity.interrupt(force=force)

    def clear_user_turn(self) -> None:
        # clear the transcription or input audio buffer of the user turn
        if self._activity is None:
            raise RuntimeError("AgentSession isn't running")

        self._activity.clear_user_turn()

    def commit_user_turn(
        self, *, transcript_timeout: float = 2.0, stt_flush_duration: float = 2.0
    ) -> None:
        """Commit the user turn and generate a reply.

        Args:
            transcript_timeout (float, optional): The timeout for the final transcript
                to be received after committing the user turn.
                Increase this value if the STT is slow to respond.
            stt_flush_duration (float, optional): The duration of the silence to be appended to the STT
                to flush the buffer and generate the final transcript.

        Raises:
            RuntimeError: If the AgentSession isn't running.
        """
        if self._activity is None:
            raise RuntimeError("AgentSession isn't running")

        self._activity.commit_user_turn(
            transcript_timeout=transcript_timeout, stt_flush_duration=stt_flush_duration
        )

    def update_agent(self, agent: Agent) -> None:
        self._agent = agent

        if self._started:
            self._update_activity_atask = task = asyncio.create_task(
                self._update_activity_task(self._update_activity_atask, self._agent),
                name="_update_activity_task",
            )
            run_state = self._global_run_state
            if run_state:
                # don't mark the RunResult as done, if there is currently an agent transition happening.  # noqa: E501
                # (used to make sure we're correctly adding the AgentHandoffResult before completion)  # noqa: E501
                run_state._watch_handle(task)

    async def _update_activity(
        self,
        agent: Agent,
        *,
        previous_activity: Literal["close", "pause"] = "close",
        new_activity: Literal["start", "resume"] = "start",
        blocked_tasks: list[asyncio.Task] | None = None,
        wait_on_enter: bool = True,
    ) -> None:
        async with self._activity_lock:
            # _update_activity is called directly sometimes, update for redundancy
            self._agent = agent

            if new_activity == "start":
                previous_agent = self._activity.agent if self._activity else None
                if agent._activity is not None and (
                    # allow updating the same agent that is running
                    agent is not previous_agent or previous_activity != "close"
                ):
                    raise RuntimeError("cannot start agent: an activity is already running")

                self._next_activity = AgentActivity(agent, self)
            elif new_activity == "resume":
                if agent._activity is None:
                    raise RuntimeError("cannot resume agent: no existing active activity to resume")

                self._next_activity = agent._activity

            if self._root_span_context is not None:
                # restore the root span context so on_exit, on_enter, and future turns
                # are direct children of the root span, not nested under a tool call.
                otel_context.attach(self._root_span_context)

            previous_activity_v = self._activity
            if self._activity is not None:
                if previous_activity == "close":
                    await self._activity.drain()
                    await self._activity.aclose()
                elif previous_activity == "pause":
                    await self._activity.pause(blocked_tasks=blocked_tasks or [])

            self._activity = self._next_activity
            self._next_activity = None

            run_state = self._global_run_state
            handoff_item = AgentHandoff(
                old_agent_id=previous_activity_v.agent.id if previous_activity_v else None,
                new_agent_id=self._activity.agent.id,
            )
            if run_state:
                run_state._agent_handoff(
                    item=handoff_item,
                    old_agent=previous_activity_v.agent if previous_activity_v else None,
                    new_agent=self._activity.agent,
                )
            self._chat_ctx.insert(handoff_item)

            if new_activity == "start":
                await self._activity.start()
            elif new_activity == "resume":
                await self._activity.resume()

        # move it outside the lock to allow calling _update_activity in on_enter of a new agent
        if wait_on_enter:
            assert self._activity._on_enter_task is not None
            await asyncio.shield(self._activity._on_enter_task)

    @utils.log_exceptions(logger=logger)
    async def _update_activity_task(
        self, old_task: asyncio.Task[None] | None, agent: Agent
    ) -> None:
        if old_task is not None:
            await old_task

        await self._update_activity(agent)

    def _on_error(
        self,
        error: llm.LLMError | stt.STTError | tts.TTSError | llm.RealtimeModelError,
    ) -> None:
        if self._closing_task or error.recoverable:
            return

        if error.type == "llm_error":
            self._llm_error_counts += 1
            if self._llm_error_counts <= self.conn_options.max_unrecoverable_errors:
                return
        elif error.type == "tts_error":
            self._tts_error_counts += 1
            if self._tts_error_counts <= self.conn_options.max_unrecoverable_errors:
                return

        logger.error("AgentSession is closing due to unrecoverable error", exc_info=error.error)

        def on_close_done(_: asyncio.Task[None]) -> None:
            self._closing_task = None

        self._closing_task = asyncio.create_task(
            self._aclose_impl(error=error, reason=CloseReason.ERROR)
        )
        self._closing_task.add_done_callback(on_close_done)

    @utils.log_exceptions(logger=logger)
    async def _forward_audio_task(self) -> None:
        audio_input = self.input.audio
        if audio_input is None:
            return

        async for frame in audio_input:
            if self._activity is not None:
                self._activity.push_audio(frame)

    @utils.log_exceptions(logger=logger)
    async def _forward_video_task(self) -> None:
        video_input = self.input.video
        if video_input is None:
            return

        async for frame in video_input:
            if self._activity is not None:
                if self._video_sampler is not None and not self._video_sampler(frame, self):
                    continue  # ignore this frame

                self._activity.push_video(frame)

    def _set_user_away_timer(self) -> None:
        self._cancel_user_away_timer()
        if self._opts.user_away_timeout is None:
            return

        if (
            (room_io := self._room_io)
            and room_io.subscribed_fut
            and not room_io.subscribed_fut.done()
        ):
            # skip the timer before user join the room
            return

        self._user_away_timer = self._loop.call_later(
            self._opts.user_away_timeout, self._update_user_state, "away"
        )

    def _cancel_user_away_timer(self) -> None:
        if self._user_away_timer is not None:
            self._user_away_timer.cancel()
            self._user_away_timer = None

    def _update_agent_state(
        self, state: AgentState, *, otel_context: otel_context.Context | None = None
    ) -> None:
        if self._agent_state == state:
            return

        if state == "speaking":
            self._llm_error_counts = 0
            self._tts_error_counts = 0

            if self._agent_speaking_span is None:
                self._agent_speaking_span = tracer.start_span(
                    "agent_speaking", context=otel_context
                )

                if self._room_io:
                    _set_participant_attributes(
                        self._agent_speaking_span, self._room_io.room.local_participant
                    )
                # self._agent_speaking_span.set_attribute(trace_types.ATTR_START_TIME, time.time())
        elif self._agent_speaking_span is not None:
            # self._agent_speaking_span.set_attribute(trace_types.ATTR_END_TIME, time.time())
            self._agent_speaking_span.end()
            self._agent_speaking_span = None

        if state == "listening" and self._user_state == "listening":
            self._set_user_away_timer()
        else:
            self._cancel_user_away_timer()

        old_state = self._agent_state
        self._agent_state = state
        self.emit(
            "agent_state_changed",
            AgentStateChangedEvent(old_state=old_state, new_state=state),
        )

    def _update_user_state(
        self, state: UserState, *, last_speaking_time: float | None = None
    ) -> None:
        if self._user_state == state:
            return

        if state == "speaking" and self._user_speaking_span is None:
            self._user_speaking_span = tracer.start_span("user_speaking")

            if self._room_io and self._room_io.linked_participant:
                _set_participant_attributes(
                    self._user_speaking_span, self._room_io.linked_participant
                )

            # self._user_speaking_span.set_attribute(trace_types.ATTR_START_TIME, time.time())
        elif self._user_speaking_span is not None:
            # end_time = last_speaking_time or time.time()
            # self._user_speaking_span.set_attribute(trace_types.ATTR_END_TIME, end_time)
            self._user_speaking_span.end()
            self._user_speaking_span = None

        if state == "listening" and self._agent_state == "listening":
            self._set_user_away_timer()
        else:
            self._cancel_user_away_timer()

        old_state = self._user_state
        self._user_state = state
        self.emit("user_state_changed", UserStateChangedEvent(old_state=old_state, new_state=state))

    def _user_input_transcribed(self, ev: UserInputTranscribedEvent) -> None:
        if self.user_state == "away" and ev.is_final:
            # reset user state from away to listening in case VAD has a miss detection
            self._update_user_state("listening")

        self.emit("user_input_transcribed", ev)

    def _conversation_item_added(self, message: llm.ChatMessage) -> None:
        self._chat_ctx.insert(message)
        self.emit("conversation_item_added", ConversationItemAddedEvent(item=message))

    def _tool_items_added(self, items: Sequence[llm.FunctionCall | llm.FunctionCallOutput]) -> None:
        self._chat_ctx.insert(items)

    # move them to the end to avoid shadowing the same named modules for mypy
    @property
    def stt(self) -> stt.STT | None:
        return self._stt

    @property
    def llm(self) -> llm.LLM | llm.RealtimeModel | None:
        return self._llm

    @property
    def tts(self) -> tts.TTS | None:
        return self._tts

    @property
    def vad(self) -> vad.VAD | None:
        return self._vad

    # -- User changed input/output streams/sinks --

    def _on_video_input_changed(self) -> None:
        if not self._started:
            return

        if self._forward_video_atask is not None:
            self._forward_video_atask.cancel()

        self._forward_video_atask = asyncio.create_task(
            self._forward_video_task(), name="_forward_video_task"
        )

    def _on_audio_input_changed(self) -> None:
        if not self._started:
            return

        if self._forward_audio_atask is not None:
            self._forward_audio_atask.cancel()

        self._forward_audio_atask = asyncio.create_task(
            self._forward_audio_task(), name="_forward_audio_task"
        )

    def _on_video_output_changed(self) -> None:
        pass

    def _on_audio_output_changed(self) -> None:
        if (
            self._started
            and self._opts.resume_false_interruption
            and (audio_output := self.output.audio)
            and not audio_output.can_pause
        ):
            logger.warning(
                "resume_false_interruption is enabled, but the audio output does not support pause, ignored",
                extra={"audio_output": audio_output.label},
            )

    def _on_text_output_changed(self) -> None:
        pass

    # ---

    async def __aenter__(self) -> AgentSession:
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        await self.aclose()

Abstract base class for generic types.

On Python 3.12 and newer, generic classes implicitly inherit from Generic when they declare a parameter list after the class's name::

class Mapping[KT, VT]:
    def __getitem__(self, key: KT) -> VT:
        ...
    # Etc.

On older versions of Python, however, generic classes have to explicitly inherit from Generic.

After a class has been declared to be generic, it can then be used as follows::

def lookup_name[KT, VT](mapping: Mapping[KT, VT], key: KT, default: VT) -> VT:
    try:
        return mapping[key]
    except KeyError:
        return default

AgentSession is the LiveKit Agents runtime that glues together media streams, speech/LLM components, and tool orchestration into a single real-time voice agent.

It links audio, video, and text I/O with STT, VAD, TTS, and the LLM; handles turn detection, endpointing, interruptions, and multi-step tool calls; and exposes everything through event callbacks so you can focus on writing function tools and simple hand-offs rather than low-level streaming logic.

Args

turn_detection : TurnDetectionMode, optional

Strategy for deciding when the user has finifshed speaking.

"stt" – rely on speech-to-text end-of-utterance cues
"vad" – rely on Voice Activity Detection start/stop cues
"realtime_llm" – use server-side detection from a realtime LLM
"manual" – caller controls turn boundaries explicitly
_TurnDetector instance – plug-in custom detector

If NOT_GIVEN, the session chooses the best available mode in priority order realtime_llm → vad → stt → manual; it automatically falls back if the necessary model is missing.

stt : stt.STT | str, optional

Speech-to-text backend.

vad : vad.VAD, optional

Voice-activity detector

llm : llm.LLM | llm.RealtimeModel | str, optional

LLM or RealtimeModel

tts : tts.TTS | str, optional

Text-to-speech engine.

tools : list[llm.FunctionTool | llm.RawFunctionTool], optional

List of tools shared by every agent in the agent session.

mcp_servers : list[mcp.MCPServer], optional

List of MCP servers providing external tools for the agent to use.

userdata : Userdata_T, optional

Arbitrary per-session user data.

allow_interruptions : bool

Whether the user can interrupt the agent mid-utterance. Default True.

discard_audio_if_uninterruptible : bool

When True, buffered audio is dropped while the agent is speaking and cannot be interrupted. Default True.

min_interruption_duration : float

Minimum speech length (s) to register as an interruption. Default 0.5 s.

min_interruption_words : int

Minimum number of words to consider an interruption, only used if stt enabled. Default 0.

min_endpointing_delay : float

Minimum time-in-seconds the agent must wait after a potential end-of-utterance signal (from VAD or an EOU model) before it declares the user’s turn complete. Default 0.5 s.

max_endpointing_delay : float

Maximum time-in-seconds the agent will wait before terminating the turn. Default 3.0 s.

max_tool_steps : int

Maximum consecutive tool calls per LLM turn. Default 3.

video_sampler : _VideoSampler, optional

Uses :class:VoiceActivityVideoSampler when NOT_GIVEN; that sampler captures video at ~1 fps while the user is speaking and ~0.3 fps when silent by default.

user_away_timeout : float, optional

If set, set the user state as "away" after this amount of time after user and agent are silent. Default 15.0 s, set to None to disable.

false_interruption_timeout : float, optional

If set, emit an agent_false_interruption event after this amount of time if the user is silent and no user transcript is detected after the interruption. Set to None to disable. Default 2.0 s.

resume_false_interruption : bool

Whether to resume the false interruption after the false_interruption_timeout. Default True.

min_consecutive_speech_delay : float, optional

The minimum delay between consecutive speech. Default 0.0 s.

use_tts_aligned_transcript : bool, optional

Whether to use TTS-aligned transcript as the input of the transcription_node. Only applies if TTS.capabilities.aligned_transcript is True or streaming is False. When NOT_GIVEN, it's disabled.

tts_text_transforms : Sequence[TextTransforms], optional

The transforms to apply to the tts input text, available built-in transforms: "filter_markdown", "filter_emoji". Set to None to disable. When NOT_GIVEN, all filters will be applied.

preemptive_generation (bool):

Whether to speculatively begin LLM and TTS requests before an end-of-turn is

detected. When True, the agent sends inference calls as soon as a user

transcript is received rather than waiting for a definitive turn boundary. This

can reduce response latency by overlapping model inference with user audio,

but may incur extra compute if the user interrupts or revises mid-utterance.

Defaults to False.

ivr_detection : bool

Whether to detect if the agent is interacting with an IVR system. Default False.

conn_options : SessionConnectOptions, optional

Connection options for stt, llm, and tts.

loop : asyncio.AbstractEventLoop, optional

Event loop to bind the session to. Falls back to :pyfunc:asyncio.get_event_loop().

Ancestors

EventEmitter
typing.Generic

Instance variables

prop agent_state : AgentState

Expand source code

@property
def agent_state(self) -> AgentState:
    return self._agent_state

prop conn_options : SessionConnectOptions

Expand source code

@property
def conn_options(self) -> SessionConnectOptions:
    return self._conn_options

prop current_agent : Agent

Expand source code

@property
def current_agent(self) -> Agent:
    if self._agent is None:
        raise RuntimeError("VoiceAgent isn't running")

    return self._agent

prop current_speech : SpeechHandle | None

Expand source code

@property
def current_speech(self) -> SpeechHandle | None:
    return self._activity.current_speech if self._activity is not None else None

prop history : llm.ChatContext

Expand source code

@property
def history(self) -> llm.ChatContext:
    return self._chat_ctx

prop input : AgentInput

Expand source code

@property
def input(self) -> io.AgentInput:
    return self._input

prop llm : llm.LLM | llm.RealtimeModel | None

Expand source code

@property
def llm(self) -> llm.LLM | llm.RealtimeModel | None:
    return self._llm

prop mcp_servers : list[mcp.MCPServer] | None

Expand source code

@property
def mcp_servers(self) -> list[mcp.MCPServer] | None:
    return self._mcp_servers

prop options : AgentSessionOptions

Expand source code

@property
def options(self) -> AgentSessionOptions:
    return self._opts

prop output : AgentOutput

Expand source code

@property
def output(self) -> io.AgentOutput:
    return self._output

prop room_io : RoomIO

Expand source code

@property
def room_io(self) -> room_io.RoomIO:
    if not self._room_io:
        raise RuntimeError(
            "Cannot access room_io: the AgentSession was not started with a room."
        )

    return self._room_io

prop stt : stt.STT | None

Expand source code

@property
def stt(self) -> stt.STT | None:
    return self._stt

prop tools : list[llm.FunctionTool | llm.RawFunctionTool]

Expand source code

@property
def tools(self) -> list[llm.FunctionTool | llm.RawFunctionTool]:
    return self._tools

prop tts : tts.TTS | None

Expand source code

@property
def tts(self) -> tts.TTS | None:
    return self._tts

prop turn_detection : TurnDetectionMode | None

Expand source code

@property
def turn_detection(self) -> TurnDetectionMode | None:
    return self._turn_detection

prop user_state : UserState

Expand source code

@property
def user_state(self) -> UserState:
    return self._user_state

prop userdata : Userdata_T

Expand source code

@property
def userdata(self) -> Userdata_T:
    if self._userdata is None:
        raise ValueError("AgentSession userdata is not set")

    return self._userdata

prop vad : vad.VAD | None

Expand source code

@property
def vad(self) -> vad.VAD | None:
    return self._vad

Methods

async def aclose(self) ‑> None

Expand source code

async def aclose(self) -> None:
    await self._aclose_impl(reason=CloseReason.USER_INITIATED)

def clear_user_turn(self) ‑> None

Expand source code

def clear_user_turn(self) -> None:
    # clear the transcription or input audio buffer of the user turn
    if self._activity is None:
        raise RuntimeError("AgentSession isn't running")

    self._activity.clear_user_turn()

def commit_user_turn(self, *, transcript_timeout: float = 2.0, stt_flush_duration: float = 2.0) ‑> None

Expand source code

def commit_user_turn(
    self, *, transcript_timeout: float = 2.0, stt_flush_duration: float = 2.0
) -> None:
    """Commit the user turn and generate a reply.

    Args:
        transcript_timeout (float, optional): The timeout for the final transcript
            to be received after committing the user turn.
            Increase this value if the STT is slow to respond.
        stt_flush_duration (float, optional): The duration of the silence to be appended to the STT
            to flush the buffer and generate the final transcript.

    Raises:
        RuntimeError: If the AgentSession isn't running.
    """
    if self._activity is None:
        raise RuntimeError("AgentSession isn't running")

    self._activity.commit_user_turn(
        transcript_timeout=transcript_timeout, stt_flush_duration=stt_flush_duration
    )

Commit the user turn and generate a reply.

Args

transcript_timeout : float, optional: The timeout for the final transcript to be received after committing the user turn. Increase this value if the STT is slow to respond.
stt_flush_duration : float, optional: The duration of the silence to be appended to the STT to flush the buffer and generate the final transcript.

Raises

RuntimeError: If the AgentSession isn't running.

async def drain(self) ‑> None

Expand source code

async def drain(self) -> None:
    if self._activity is None:
        raise RuntimeError("AgentSession isn't running")

    await self._activity.drain()

def generate_reply(self, *, user_input: NotGivenOr[str] = NOT_GIVEN, instructions: NotGivenOr[str] = NOT_GIVEN, tool_choice: NotGivenOr[llm.ToolChoice] = NOT_GIVEN, allow_interruptions: NotGivenOr[bool] = NOT_GIVEN, chat_ctx: NotGivenOr[ChatContext] = NOT_GIVEN) ‑> livekit.agents.voice.speech_handle.SpeechHandle

Expand source code

def generate_reply(
    self,
    *,
    user_input: NotGivenOr[str] = NOT_GIVEN,
    instructions: NotGivenOr[str] = NOT_GIVEN,
    tool_choice: NotGivenOr[llm.ToolChoice] = NOT_GIVEN,
    allow_interruptions: NotGivenOr[bool] = NOT_GIVEN,
    chat_ctx: NotGivenOr[ChatContext] = NOT_GIVEN,
) -> SpeechHandle:
    """Generate a reply for the agent to speak to the user.

    Args:
        user_input (NotGivenOr[str], optional): The user's input that may influence the reply,
            such as answering a question.
        instructions (NotGivenOr[str], optional): Additional instructions for generating the reply.
        tool_choice (NotGivenOr[llm.ToolChoice], optional): Specifies the external tool to use when
            generating the reply. If generate_reply is invoked within a function_tool, defaults to "none".
        allow_interruptions (NotGivenOr[bool], optional): Indicates whether the user can interrupt this speech.

    Returns:
        SpeechHandle: A handle to the generated reply.
    """  # noqa: E501
    if self._activity is None:
        raise RuntimeError("AgentSession isn't running")

    user_message = (
        llm.ChatMessage(role="user", content=[user_input])
        if is_given(user_input)
        else NOT_GIVEN
    )

    run_state = self._global_run_state
    activity = self._next_activity if self._activity.scheduling_paused else self._activity

    if activity is None:
        raise RuntimeError("AgentSession is closing, cannot use generate_reply()")

    # attach to the session span if called outside of the AgentSession
    use_span: AbstractContextManager[trace.Span | None] = nullcontext()
    if trace.get_current_span() is trace.INVALID_SPAN and self._session_span is not None:
        use_span = trace.use_span(self._session_span, end_on_exit=False)

    with use_span:
        handle = activity._generate_reply(
            user_message=user_message if user_message else None,
            instructions=instructions,
            tool_choice=tool_choice,
            allow_interruptions=allow_interruptions,
            chat_ctx=chat_ctx,
        )
        if run_state:
            run_state._watch_handle(handle)

    return handle

Generate a reply for the agent to speak to the user.

Args

user_input : NotGivenOr[str], optional: The user's input that may influence the reply, such as answering a question.
instructions : NotGivenOr[str], optional: Additional instructions for generating the reply.
tool_choice : NotGivenOr[llm.ToolChoice], optional: Specifies the external tool to use when generating the reply. If generate_reply is invoked within a function_tool, defaults to "none".
allow_interruptions : NotGivenOr[bool], optional: Indicates whether the user can interrupt this speech.

Returns

SpeechHandle: A handle to the generated reply.

def interrupt(self, *, force: bool = False) ‑> _asyncio.Future[None]

Expand source code

def interrupt(self, *, force: bool = False) -> asyncio.Future[None]:
    """Interrupt the current speech generation.

    Returns:
        An asyncio.Future that completes when the interruption is fully processed
        and chat context has been updated.
    """
    if self._activity is None:
        raise RuntimeError("AgentSession isn't running")

    return self._activity.interrupt(force=force)

Interrupt the current speech generation.

Returns

An asyncio.Future that completes when the interruption is fully processed and chat context has been updated.

def run(self, *, user_input: str, output_type: type[Run_T] | None = None) ‑> RunResult[~Run_T]

Expand source code

def run(self, *, user_input: str, output_type: type[Run_T] | None = None) -> RunResult[Run_T]:
    if self._global_run_state is not None and not self._global_run_state.done():
        raise RuntimeError("nested runs are not supported")

    run_state = RunResult(user_input=user_input, output_type=output_type)
    self._global_run_state = run_state
    self.generate_reply(user_input=user_input)
    return run_state

def say(self, text: str | AsyncIterable[str], *, audio: NotGivenOr[AsyncIterable[rtc.AudioFrame]] = NOT_GIVEN, allow_interruptions: NotGivenOr[bool] = NOT_GIVEN, add_to_chat_ctx: bool = True) ‑> livekit.agents.voice.speech_handle.SpeechHandle

Expand source code

def say(
    self,
    text: str | AsyncIterable[str],
    *,
    audio: NotGivenOr[AsyncIterable[rtc.AudioFrame]] = NOT_GIVEN,
    allow_interruptions: NotGivenOr[bool] = NOT_GIVEN,
    add_to_chat_ctx: bool = True,
) -> SpeechHandle:
    if self._activity is None:
        raise RuntimeError("AgentSession isn't running")

    run_state = self._global_run_state
    activity = self._next_activity if self._activity.scheduling_paused else self._activity

    if activity is None:
        raise RuntimeError("AgentSession is closing, cannot use say()")

    # attach to the session span if called outside of the AgentSession
    use_span: AbstractContextManager[trace.Span | None] = nullcontext()
    if trace.get_current_span() is trace.INVALID_SPAN and self._session_span is not None:
        use_span = trace.use_span(self._session_span, end_on_exit=False)

    with use_span:
        handle = activity.say(
            text,
            audio=audio,
            allow_interruptions=allow_interruptions,
            add_to_chat_ctx=add_to_chat_ctx,
        )
        if run_state:
            run_state._watch_handle(handle)

    return handle

def shutdown(self, *, drain: bool = True) ‑> None

Expand source code

def shutdown(self, *, drain: bool = True) -> None:
    self._close_soon(error=None, drain=drain, reason=CloseReason.USER_INITIATED)

async def start(self, agent: Agent, *, capture_run: bool = False, room: NotGivenOr[rtc.Room] = NOT_GIVEN, room_options: NotGivenOr[RoomOptions] = NOT_GIVEN, room_input_options: NotGivenOr[RoomInputOptions] = NOT_GIVEN, room_output_options: NotGivenOr[RoomOutputOptions] = NOT_GIVEN, record: NotGivenOr[bool] = NOT_GIVEN) ‑> RunResult | None

Expand source code

async def start(
    self,
    agent: Agent,
    *,
    capture_run: bool = False,
    room: NotGivenOr[rtc.Room] = NOT_GIVEN,
    room_options: NotGivenOr[room_io.RoomOptions] = NOT_GIVEN,
    # deprecated
    room_input_options: NotGivenOr[room_io.RoomInputOptions] = NOT_GIVEN,
    room_output_options: NotGivenOr[room_io.RoomOutputOptions] = NOT_GIVEN,
    record: NotGivenOr[bool] = NOT_GIVEN,
) -> RunResult | None:
    """Start the voice agent.

    Create a default RoomIO if the input or output audio is not already set.
    If the console flag is provided, start a ChatCLI.

    Args:
        capture_run: Whether to return a RunResult and capture the run result during session start.
        room: The room to use for input and output
        room_input_options: Options for the room input
        room_output_options: Options for the room output
        record: Whether to record the audio
    """
    async with self._lock:
        if self._started:
            return None

        self._started_at = time.time()

        # configure observability first
        job_ctx: JobContext | None = None
        try:
            job_ctx = get_job_context()
            if not is_given(record):
                record = job_ctx.job.enable_recording

            self._enable_recording = record

            if self._enable_recording:
                job_ctx.init_recording()

        except RuntimeError:
            # JobContext is not available in evals
            pass

        self._session_span = current_span = tracer.start_span("agent_session")
        # we detach here to avoid context issues since tokens need to be detached
        # in the same context as it was created
        if self._session_ctx_token is not None:
            otel_context.detach(self._session_ctx_token)
            self._session_ctx_token = None
        ctx = trace.set_span_in_context(current_span)
        self._session_ctx_token = otel_context.attach(ctx)

        self._recorded_events = []
        self._room_io = None
        self._recorder_io = None

        self._closing = False
        self._root_span_context = otel_context.get_current()
        current_span = trace.get_current_span()
        current_span.set_attribute(trace_types.ATTR_AGENT_LABEL, agent.label)

        self._agent = agent
        self._update_agent_state("initializing")

        tasks: list[asyncio.Task[None]] = []

        c = cli.AgentsConsole.get_instance()
        if c.enabled and not c.io_acquired:
            if self.input.audio is not None or self.output.audio is not None:
                logger.warning(
                    "agent started with the console subcommand, but input.audio/output.audio "
                    "is already set, overriding..."
                )

            c.acquire_io(loop=self._loop, session=self)
        elif is_given(room) and not self._room_io:
            room_options = room_io.RoomOptions._ensure_options(
                room_options,
                room_input_options=room_input_options,
                room_output_options=room_output_options,
            )
            room_options = copy.copy(room_options)  # shadow copy is enough

            if self.input.audio is not None:
                if room_options.audio_input:
                    logger.warning(
                        "RoomIO audio input is enabled but input.audio is already set, ignoring.."  # noqa: E501
                    )
                room_options.audio_input = False

            if self.output.audio is not None:
                if room_options.audio_output:
                    logger.warning(
                        "RoomIO audio output is enabled but output.audio is already set, ignoring.."  # noqa: E501
                    )
                room_options.audio_output = False

            if self.output.transcription is not None:
                if room_options.text_output:
                    logger.warning(
                        "RoomIO transcription output is enabled but output.transcription is already set, ignoring.."  # noqa: E501
                    )
                room_options.text_output = False

            self._room_io = room_io.RoomIO(room=room, agent_session=self, options=room_options)
            await self._room_io.start()

        if job_ctx:
            # these aren't relevant during eval mode, as they require job context and/or room_io
            if self.input.audio and self.output.audio:
                if self._enable_recording:
                    self._recorder_io = RecorderIO(agent_session=self)
                    self.input.audio = self._recorder_io.record_input(self.input.audio)
                    self.output.audio = self._recorder_io.record_output(self.output.audio)

                    if (c.enabled and c.record) or not c.enabled:
                        task = asyncio.create_task(
                            self._recorder_io.start(
                                output_path=job_ctx.session_directory / "audio.ogg"
                            )
                        )
                        tasks.append(task)

            if job_ctx._primary_agent_session is None:
                job_ctx._primary_agent_session = self
            elif self._enable_recording:
                raise RuntimeError(
                    "Only one `AgentSession` can be the primary at a time. "
                    "If you want to ignore primary designation, use session.start(record=False)."
                )

            if self.options.ivr_detection:
                self._ivr_activity = IVRActivity(self)

                # inject the IVR activity tools into the session tools
                self._tools.extend(self._ivr_activity.tools)

                tasks.append(
                    asyncio.create_task(self._ivr_activity.start(), name="_ivr_activity_start")
                )

            current_span.set_attribute(trace_types.ATTR_ROOM_NAME, job_ctx.room.name)
            current_span.set_attribute(trace_types.ATTR_JOB_ID, job_ctx.job.id)
            current_span.set_attribute(trace_types.ATTR_AGENT_NAME, job_ctx.job.agent_name)
            if self._room_io:
                # automatically connect to the room when room io is used
                tasks.append(asyncio.create_task(job_ctx.connect(), name="_job_ctx_connect"))

            # session can be restarted, register the callbacks only once
            if not self._job_context_cb_registered:
                job_ctx.add_shutdown_callback(
                    lambda: self._aclose_impl(reason=CloseReason.JOB_SHUTDOWN)
                )
                self._job_context_cb_registered = True

        run_state: RunResult | None = None
        if capture_run:
            if self._global_run_state is not None and not self._global_run_state.done():
                raise RuntimeError("nested runs are not supported")

            run_state = RunResult(output_type=None)
            self._global_run_state = run_state

        # it is ok to await it directly, there is no previous task to drain
        tasks.append(
            asyncio.create_task(self._update_activity(self._agent, wait_on_enter=False))
        )

        try:
            await asyncio.gather(*tasks)
        finally:
            await utils.aio.cancel_and_wait(*tasks)

        # important: no await should be done after this!

        if self.input.audio is not None:
            self._forward_audio_atask = asyncio.create_task(
                self._forward_audio_task(), name="_forward_audio_task"
            )

        if self.input.video is not None:
            self._forward_video_atask = asyncio.create_task(
                self._forward_video_task(), name="_forward_video_task"
            )

        self._started = True
        self._update_agent_state("listening")
        if self._room_io and self._room_io.subscribed_fut:

            def on_room_io_subscribed(_: asyncio.Future[None]) -> None:
                if self._user_state == "listening" and self._agent_state == "listening":
                    self._set_user_away_timer()

            self._room_io.subscribed_fut.add_done_callback(on_room_io_subscribed)

        # log used IO
        def _collect_source(
            inp: io.AudioInput | io.VideoInput | None,
        ) -> list[io.AudioInput | io.VideoInput]:
            return [] if inp is None else [inp] + _collect_source(inp.source)

        def _collect_chain(
            out: io.TextOutput | io.VideoOutput | io.AudioOutput | None,
        ) -> list[io.VideoOutput | io.AudioOutput | io.TextOutput]:
            return [] if out is None else [out] + _collect_chain(out.next_in_chain)

        audio_input = _collect_source(self.input.audio)[::-1]
        video_input = _collect_source(self.input.video)[::-1]

        audio_output = _collect_chain(self.output.audio)
        video_output = _collect_chain(self.output.video)
        transcript_output = _collect_chain(self.output.transcription)

        logger.debug(
            "using audio io: %s -> `AgentSession` -> %s",
            " -> ".join([f"`{out.label}`" for out in audio_input]) or "(none)",
            " -> ".join([f"`{out.label}`" for out in audio_output]) or "(none)",
        )
        if (
            self._opts.resume_false_interruption
            and self.output.audio
            and not self.output.audio.can_pause
        ):
            logger.warning(
                "resume_false_interruption is enabled but audio output does not support pause, it will be ignored",
                extra={"audio_output": self.output.audio.label},
            )

        logger.debug(
            "using transcript io: `AgentSession` -> %s",
            " -> ".join([f"`{out.label}`" for out in transcript_output]) or "(none)",
        )

        if video_input or video_output:
            logger.debug(
                "using video io: %s > `AgentSession` > %s",
                " -> ".join([f"`{out.label}`" for out in video_input]) or "(none)",
                " -> ".join([f"`{out.label}`" for out in video_output]) or "(none)",
            )

        if run_state:
            await run_state

        return run_state

Start the voice agent.

Create a default RoomIO if the input or output audio is not already set. If the console flag is provided, start a ChatCLI.

Args

capture_run: Whether to return a RunResult and capture the run result during session start.
room: The room to use for input and output
room_input_options: Options for the room input
room_output_options: Options for the room output
record: Whether to record the audio

def update_agent(self, agent: Agent) ‑> None

Expand source code

def update_agent(self, agent: Agent) -> None:
    self._agent = agent

    if self._started:
        self._update_activity_atask = task = asyncio.create_task(
            self._update_activity_task(self._update_activity_atask, self._agent),
            name="_update_activity_task",
        )
        run_state = self._global_run_state
        if run_state:
            # don't mark the RunResult as done, if there is currently an agent transition happening.  # noqa: E501
            # (used to make sure we're correctly adding the AgentHandoffResult before completion)  # noqa: E501
            run_state._watch_handle(task)

def update_options(self, *, min_endpointing_delay: NotGivenOr[float] = NOT_GIVEN, max_endpointing_delay: NotGivenOr[float] = NOT_GIVEN, turn_detection: NotGivenOr[TurnDetectionMode | None] = NOT_GIVEN) ‑> None

Expand source code

def update_options(
    self,
    *,
    min_endpointing_delay: NotGivenOr[float] = NOT_GIVEN,
    max_endpointing_delay: NotGivenOr[float] = NOT_GIVEN,
    turn_detection: NotGivenOr[TurnDetectionMode | None] = NOT_GIVEN,
) -> None:
    """
    Update the options for the agent session.

    Args:
        min_endpointing_delay (NotGivenOr[float], optional): The minimum endpointing delay.
        max_endpointing_delay (NotGivenOr[float], optional): The maximum endpointing delay.
        turn_detection (NotGivenOr[TurnDetectionMode | None], optional): Strategy for deciding
            when the user has finished speaking. ``None`` reverts to automatic selection.
    """
    if is_given(min_endpointing_delay):
        self._opts.min_endpointing_delay = min_endpointing_delay
    if is_given(max_endpointing_delay):
        self._opts.max_endpointing_delay = max_endpointing_delay

    if is_given(turn_detection):
        self._turn_detection = cast(Optional[TurnDetectionMode], turn_detection)

    if self._activity is not None:
        self._activity.update_options(
            min_endpointing_delay=min_endpointing_delay,
            max_endpointing_delay=max_endpointing_delay,
            turn_detection=turn_detection,
        )

Update the options for the agent session.

Args

min_endpointing_delay : NotGivenOr[float], optional: The minimum endpointing delay.
max_endpointing_delay : NotGivenOr[float], optional: The maximum endpointing delay.
turn_detection : NotGivenOr[TurnDetectionMode | None], optional: Strategy for deciding when the user has finished speaking. None reverts to automatic selection.

Inherited members

EventEmitter:
- emit
- off
- on
- once

class AgentStateChangedEvent (**data: Any)

Expand source code

class AgentStateChangedEvent(BaseModel):
    type: Literal["agent_state_changed"] = "agent_state_changed"
    old_state: AgentState
    new_state: AgentState
    created_at: float = Field(default_factory=time.time)

Usage docs: https://docs.pydantic.dev/2.10/concepts/models/

A base class for creating Pydantic models.

Attributes

__class_vars__: The names of the class variables defined on the model.
__private_attributes__: Metadata about the private attributes of the model.
__signature__: The synthesized __init__ [Signature][inspect.Signature] of the model.
__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom __init__ function.
__pydantic_decorators__: Metadata containing the decorators defined on the model. This replaces Model.__validators__ and Model.__root_validators__ from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to args, origin, parameters in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [RootModel][pydantic.root_model.RootModel].
__pydantic_serializer__: The pydantic-core SchemaSerializer used to dump instances of the model.
__pydantic_validator__: The pydantic-core SchemaValidator used to validate instances of the model.
__pydantic_fields__: A dictionary of field names and their corresponding [FieldInfo][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [ComputedFieldInfo][pydantic.fields.ComputedFieldInfo] objects.
__pydantic_extra__: A dictionary containing extra values, if [extra][pydantic.config.ConfigDict.extra] is set to 'allow'.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

Ancestors

pydantic.main.BaseModel

Class variables

var created_at : float
var model_config
var new_state : Literal['initializing', 'idle', 'listening', 'thinking', 'speaking']
var old_state : Literal['initializing', 'idle', 'listening', 'thinking', 'speaking']
var type : Literal['agent_state_changed']

class AgentTask (*, instructions: str, chat_ctx: NotGivenOr[llm.ChatContext] = NOT_GIVEN, tools: list[llm.FunctionTool | llm.RawFunctionTool] | None = None, turn_detection: NotGivenOr[TurnDetectionMode | None] = NOT_GIVEN, stt: NotGivenOr[stt.STT | None] = NOT_GIVEN, vad: NotGivenOr[vad.VAD | None] = NOT_GIVEN, llm: NotGivenOr[llm.LLM | llm.RealtimeModel | None] = NOT_GIVEN, tts: NotGivenOr[tts.TTS | None] = NOT_GIVEN, mcp_servers: NotGivenOr[list[mcp.MCPServer] | None] = NOT_GIVEN, allow_interruptions: NotGivenOr[bool] = NOT_GIVEN, min_endpointing_delay: NotGivenOr[float] = NOT_GIVEN, max_endpointing_delay: NotGivenOr[float] = NOT_GIVEN)

Expand source code

class AgentTask(Agent, Generic[TaskResult_T]):
    def __init__(
        self,
        *,
        instructions: str,
        chat_ctx: NotGivenOr[llm.ChatContext] = NOT_GIVEN,
        tools: list[llm.FunctionTool | llm.RawFunctionTool] | None = None,
        turn_detection: NotGivenOr[TurnDetectionMode | None] = NOT_GIVEN,
        stt: NotGivenOr[stt.STT | None] = NOT_GIVEN,
        vad: NotGivenOr[vad.VAD | None] = NOT_GIVEN,
        llm: NotGivenOr[llm.LLM | llm.RealtimeModel | None] = NOT_GIVEN,
        tts: NotGivenOr[tts.TTS | None] = NOT_GIVEN,
        mcp_servers: NotGivenOr[list[mcp.MCPServer] | None] = NOT_GIVEN,
        allow_interruptions: NotGivenOr[bool] = NOT_GIVEN,
        min_endpointing_delay: NotGivenOr[float] = NOT_GIVEN,
        max_endpointing_delay: NotGivenOr[float] = NOT_GIVEN,
    ) -> None:
        tools = tools or []
        super().__init__(
            instructions=instructions,
            chat_ctx=chat_ctx,
            tools=tools,
            turn_detection=turn_detection,
            stt=stt,
            vad=vad,
            llm=llm,
            tts=tts,
            mcp_servers=mcp_servers,
            allow_interruptions=allow_interruptions,
            min_endpointing_delay=min_endpointing_delay,
            max_endpointing_delay=max_endpointing_delay,
        )

        self.__started = False
        self.__fut = asyncio.Future[TaskResult_T]()

    def done(self) -> bool:
        return self.__fut.done()

    def complete(self, result: TaskResult_T | Exception) -> None:
        if self.__fut.done():
            raise RuntimeError(f"{self.__class__.__name__} is already done")

        if isinstance(result, Exception):
            self.__fut.set_exception(result)
        else:
            self.__fut.set_result(result)

        self.__fut.exception()  # silence exc not retrieved warnings

        from .agent_activity import _SpeechHandleContextVar

        speech_handle = _SpeechHandleContextVar.get(None)

        if speech_handle:
            speech_handle._maybe_run_final_output = result

        # if not self.__inline_mode:
        #    session._close_soon(reason=CloseReason.TASK_COMPLETED, drain=True)

    async def __await_impl(self) -> TaskResult_T:
        if self.__started:
            raise RuntimeError(f"{self.__class__.__name__} is not re-entrant, await only once")

        self.__started = True

        current_task = asyncio.current_task()
        if current_task is None:
            raise RuntimeError(
                f"{self.__class__.__name__} must be executed inside an async context"
            )

        task_info = _get_activity_task_info(current_task)
        if not task_info or not task_info.inline_task:
            raise RuntimeError(
                f"{self.__class__.__name__} should only be awaited inside tool_functions or the on_enter/on_exit methods of an Agent"  # noqa: E501
            )

        def _handle_task_done(_: asyncio.Task[Any]) -> None:
            if self.__fut.done():
                return

            # if the asyncio.Task running the InlineTask completes before the InlineTask itself, log
            # an error and attempt to recover by terminating the InlineTask.
            logger.error(
                f"The asyncio.Task finished before {self.__class__.__name__} was completed."
            )

            self.complete(
                RuntimeError(
                    f"The asyncio.Task finished before {self.__class__.__name__} was completed."
                )
            )

        current_task.add_done_callback(_handle_task_done)

        from .agent_activity import _AgentActivityContextVar, _SpeechHandleContextVar

        # TODO(theomonnom): add a global lock for inline tasks
        # This may currently break in the case we use parallel tool calls.

        speech_handle = _SpeechHandleContextVar.get(None)
        old_activity = _AgentActivityContextVar.get()
        old_agent = old_activity.agent
        session = old_activity.session

        blocked_tasks = [current_task]
        if (
            old_activity._on_enter_task
            and not old_activity._on_enter_task.done()
            and current_task is not old_activity._on_enter_task
        ):
            blocked_tasks.append(old_activity._on_enter_task)

        if (
            task_info.function_call
            and isinstance(old_activity.llm, RealtimeModel)
            and not old_activity.llm.capabilities.manual_function_calls
        ):
            logger.error(
                f"Realtime model '{old_activity.llm.label}' does not support resuming function calls from chat context, "
                "using AgentTask inside a function tool may have unexpected behavior."
            )

        # TODO(theomonnom): could the RunResult watcher & the blocked_tasks share the same logic?
        await session._update_activity(self, previous_activity="pause", blocked_tasks=blocked_tasks)

        # NOTE: _update_activity is calling the on_enter method, so the RunResult can capture all speeches
        run_state = session._global_run_state
        if speech_handle and run_state and not run_state.done():
            # make sure to not deadlock on the current speech handle
            run_state._unwatch_handle(speech_handle)
            # it is OK to call _mark_done_if_needed here, the above _update_activity will call on_enter
            # so handles added inside the on_enter will make sure we're not completing the run_state too early.
            run_state._mark_done_if_needed(None)

        try:
            return await asyncio.shield(self.__fut)

        finally:
            # run_state could have changed after self.__fut
            run_state = session._global_run_state

            if session.current_agent != self:
                logger.warning(
                    f"{self.__class__.__name__} completed, but the agent has changed in the meantime. "
                    "Ignoring handoff to the previous agent, likely due to `AgentSession.update_agent` being invoked."
                )
                await old_activity.aclose()
            else:
                if speech_handle and run_state and not run_state.done():
                    run_state._watch_handle(speech_handle)

                merged_chat_ctx = old_agent.chat_ctx.merge(
                    self.chat_ctx, exclude_function_call=True, exclude_instructions=True
                )
                # set the chat_ctx directly, `session._update_activity` will sync it to the rt_session if needed
                old_agent._chat_ctx.items[:] = merged_chat_ctx.items
                # await old_agent.update_chat_ctx(merged_chat_ctx)

                await session._update_activity(
                    old_agent, new_activity="resume", wait_on_enter=False
                )

    def __await__(self) -> Generator[None, None, TaskResult_T]:
        return self.__await_impl().__await__()

Abstract base class for generic types.

On Python 3.12 and newer, generic classes implicitly inherit from Generic when they declare a parameter list after the class's name::

class Mapping[KT, VT]:
    def __getitem__(self, key: KT) -> VT:
        ...
    # Etc.

On older versions of Python, however, generic classes have to explicitly inherit from Generic.

After a class has been declared to be generic, it can then be used as follows::

def lookup_name[KT, VT](mapping: Mapping[KT, VT], key: KT, default: VT) -> VT:
    try:
        return mapping[key]
    except KeyError:
        return default

Ancestors

livekit.agents.voice.agent.Agent
typing.Generic

Subclasses

Methods

def complete(self, result: TaskResult_T | Exception) ‑> None

Expand source code

def complete(self, result: TaskResult_T | Exception) -> None:
    if self.__fut.done():
        raise RuntimeError(f"{self.__class__.__name__} is already done")

    if isinstance(result, Exception):
        self.__fut.set_exception(result)
    else:
        self.__fut.set_result(result)

    self.__fut.exception()  # silence exc not retrieved warnings

    from .agent_activity import _SpeechHandleContextVar

    speech_handle = _SpeechHandleContextVar.get(None)

    if speech_handle:
        speech_handle._maybe_run_final_output = result

    # if not self.__inline_mode:
    #    session._close_soon(reason=CloseReason.TASK_COMPLETED, drain=True)

def done(self) ‑> bool

Expand source code

def done(self) -> bool:
    return self.__fut.done()

class CloseEvent (**data: Any)

Expand source code

class CloseEvent(BaseModel):
    type: Literal["close"] = "close"
    error: LLMError | STTError | TTSError | RealtimeModelError | None = None
    reason: CloseReason
    created_at: float = Field(default_factory=time.time)

Usage docs: https://docs.pydantic.dev/2.10/concepts/models/

A base class for creating Pydantic models.

Attributes

__class_vars__: The names of the class variables defined on the model.
__private_attributes__: Metadata about the private attributes of the model.
__signature__: The synthesized __init__ [Signature][inspect.Signature] of the model.
__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom __init__ function.
__pydantic_decorators__: Metadata containing the decorators defined on the model. This replaces Model.__validators__ and Model.__root_validators__ from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to args, origin, parameters in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [RootModel][pydantic.root_model.RootModel].
__pydantic_serializer__: The pydantic-core SchemaSerializer used to dump instances of the model.
__pydantic_validator__: The pydantic-core SchemaValidator used to validate instances of the model.
__pydantic_fields__: A dictionary of field names and their corresponding [FieldInfo][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [ComputedFieldInfo][pydantic.fields.ComputedFieldInfo] objects.
__pydantic_extra__: A dictionary containing extra values, if [extra][pydantic.config.ConfigDict.extra] is set to 'allow'.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

Ancestors

pydantic.main.BaseModel

Class variables

var created_at : float
var error : livekit.agents.llm.llm.LLMError | livekit.agents.stt.stt.STTError | livekit.agents.tts.tts.TTSError | livekit.agents.llm.realtime.RealtimeModelError | None
var model_config
var reason : livekit.agents.voice.events.CloseReason
var type : Literal['close']

class CloseReason (*args, **kwds)

Expand source code

@unique
class CloseReason(str, Enum):
    ERROR = "error"
    JOB_SHUTDOWN = "job_shutdown"
    PARTICIPANT_DISCONNECTED = "participant_disconnected"
    USER_INITIATED = "user_initiated"
    TASK_COMPLETED = "task_completed"

str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str

Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.str() (if defined) or repr(object). encoding defaults to sys.getdefaultencoding(). errors defaults to 'strict'.

Ancestors

builtins.str
enum.Enum

Class variables

var ERROR
var JOB_SHUTDOWN
var PARTICIPANT_DISCONNECTED
var TASK_COMPLETED
var USER_INITIATED

class ConversationItemAddedEvent (**data: Any)

Expand source code

class ConversationItemAddedEvent(BaseModel):
    type: Literal["conversation_item_added"] = "conversation_item_added"
    item: ChatMessage | _TypeDiscriminator
    created_at: float = Field(default_factory=time.time)

Usage docs: https://docs.pydantic.dev/2.10/concepts/models/

A base class for creating Pydantic models.

Attributes

__class_vars__: The names of the class variables defined on the model.
__private_attributes__: Metadata about the private attributes of the model.
__signature__: The synthesized __init__ [Signature][inspect.Signature] of the model.
__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom __init__ function.
__pydantic_decorators__: Metadata containing the decorators defined on the model. This replaces Model.__validators__ and Model.__root_validators__ from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to args, origin, parameters in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [RootModel][pydantic.root_model.RootModel].
__pydantic_serializer__: The pydantic-core SchemaSerializer used to dump instances of the model.
__pydantic_validator__: The pydantic-core SchemaValidator used to validate instances of the model.
__pydantic_fields__: A dictionary of field names and their corresponding [FieldInfo][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [ComputedFieldInfo][pydantic.fields.ComputedFieldInfo] objects.
__pydantic_extra__: A dictionary containing extra values, if [extra][pydantic.config.ConfigDict.extra] is set to 'allow'.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

Ancestors

pydantic.main.BaseModel

Class variables

var created_at : float
var item : livekit.agents.llm.chat_context.ChatMessage | livekit.agents.voice.events._TypeDiscriminator
var model_config
var type : Literal['conversation_item_added']

class ErrorEvent (**data: Any)

Expand source code

class ErrorEvent(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    type: Literal["error"] = "error"
    error: LLMError | STTError | TTSError | RealtimeModelError | Any
    source: LLM | STT | TTS | RealtimeModel | Any
    created_at: float = Field(default_factory=time.time)

Usage docs: https://docs.pydantic.dev/2.10/concepts/models/

A base class for creating Pydantic models.

Attributes

__class_vars__: The names of the class variables defined on the model.
__private_attributes__: Metadata about the private attributes of the model.
__signature__: The synthesized __init__ [Signature][inspect.Signature] of the model.
__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom __init__ function.
__pydantic_decorators__: Metadata containing the decorators defined on the model. This replaces Model.__validators__ and Model.__root_validators__ from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to args, origin, parameters in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [RootModel][pydantic.root_model.RootModel].
__pydantic_serializer__: The pydantic-core SchemaSerializer used to dump instances of the model.
__pydantic_validator__: The pydantic-core SchemaValidator used to validate instances of the model.
__pydantic_fields__: A dictionary of field names and their corresponding [FieldInfo][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [ComputedFieldInfo][pydantic.fields.ComputedFieldInfo] objects.
__pydantic_extra__: A dictionary containing extra values, if [extra][pydantic.config.ConfigDict.extra] is set to 'allow'.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

Ancestors

pydantic.main.BaseModel

Class variables

var created_at : float
var error : livekit.agents.llm.llm.LLMError | livekit.agents.stt.stt.STTError | livekit.agents.tts.tts.TTSError | livekit.agents.llm.realtime.RealtimeModelError | typing.Any
var model_config
var source : livekit.agents.llm.llm.LLM | livekit.agents.stt.stt.STT | livekit.agents.tts.tts.TTS | livekit.agents.llm.realtime.RealtimeModel | typing.Any
var type : Literal['error']

class FunctionToolsExecutedEvent (**data: Any)

Expand source code

class FunctionToolsExecutedEvent(BaseModel):
    type: Literal["function_tools_executed"] = "function_tools_executed"
    function_calls: list[FunctionCall]
    function_call_outputs: list[FunctionCallOutput | None]
    created_at: float = Field(default_factory=time.time)
    _reply_required: bool = PrivateAttr(default=False)
    _handoff_required: bool = PrivateAttr(default=False)

    def zipped(self) -> list[tuple[FunctionCall, FunctionCallOutput | None]]:
        return list(zip(self.function_calls, self.function_call_outputs))

    def cancel_tool_reply(self) -> None:
        self._reply_required = False

    def cancel_agent_handoff(self) -> None:
        self._handoff_required = False

    @property
    def has_tool_reply(self) -> bool:
        return self._reply_required

    @property
    def has_agent_handoff(self) -> bool:
        return self._handoff_required

    @model_validator(mode="after")
    def verify_lists_length(self) -> Self:
        if len(self.function_calls) != len(self.function_call_outputs):
            raise ValueError("The number of function_calls and function_call_outputs must match.")

        return self

Usage docs: https://docs.pydantic.dev/2.10/concepts/models/

A base class for creating Pydantic models.

Attributes

__class_vars__: The names of the class variables defined on the model.
__private_attributes__: Metadata about the private attributes of the model.
__signature__: The synthesized __init__ [Signature][inspect.Signature] of the model.
__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom __init__ function.
__pydantic_decorators__: Metadata containing the decorators defined on the model. This replaces Model.__validators__ and Model.__root_validators__ from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to args, origin, parameters in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [RootModel][pydantic.root_model.RootModel].
__pydantic_serializer__: The pydantic-core SchemaSerializer used to dump instances of the model.
__pydantic_validator__: The pydantic-core SchemaValidator used to validate instances of the model.
__pydantic_fields__: A dictionary of field names and their corresponding [FieldInfo][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [ComputedFieldInfo][pydantic.fields.ComputedFieldInfo] objects.
__pydantic_extra__: A dictionary containing extra values, if [extra][pydantic.config.ConfigDict.extra] is set to 'allow'.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

Ancestors

pydantic.main.BaseModel

Class variables

var created_at : float
var function_call_outputs : list[livekit.agents.llm.chat_context.FunctionCallOutput | None]
var function_calls : list[livekit.agents.llm.chat_context.FunctionCall]
var model_config
var type : Literal['function_tools_executed']

Instance variables

prop has_agent_handoff : bool

Expand source code

@property
def has_agent_handoff(self) -> bool:
    return self._handoff_required

prop has_tool_reply : bool

Expand source code

@property
def has_tool_reply(self) -> bool:
    return self._reply_required

Methods

def cancel_agent_handoff(self) ‑> None

Expand source code

def cancel_agent_handoff(self) -> None:
    self._handoff_required = False

def cancel_tool_reply(self) ‑> None

Expand source code

def cancel_tool_reply(self) -> None:
    self._reply_required = False

def model_post_init(self: BaseModel, context: Any, /) ‑> None

Expand source code

def init_private_attributes(self: BaseModel, context: Any, /) -> None:
    """This function is meant to behave like a BaseModel method to initialise private attributes.

    It takes context as an argument since that's what pydantic-core passes when calling it.

    Args:
        self: The BaseModel instance.
        context: The context.
    """
    if getattr(self, '__pydantic_private__', None) is None:
        pydantic_private = {}
        for name, private_attr in self.__private_attributes__.items():
            default = private_attr.get_default()
            if default is not PydanticUndefined:
                pydantic_private[name] = default
        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Args

self: The BaseModel instance.
context: The context.

def verify_lists_length(self) ‑> Self

Expand source code

@model_validator(mode="after")
def verify_lists_length(self) -> Self:
    if len(self.function_calls) != len(self.function_call_outputs):
        raise ValueError("The number of function_calls and function_call_outputs must match.")

    return self

def zipped(self) ‑> list[tuple[livekit.agents.llm.chat_context.FunctionCall, livekit.agents.llm.chat_context.FunctionCallOutput | None]]

Expand source code

def zipped(self) -> list[tuple[FunctionCall, FunctionCallOutput | None]]:
    return list(zip(self.function_calls, self.function_call_outputs))

class MetricsCollectedEvent (**data: Any)

Expand source code

class MetricsCollectedEvent(BaseModel):
    type: Literal["metrics_collected"] = "metrics_collected"
    metrics: AgentMetrics
    created_at: float = Field(default_factory=time.time)

Usage docs: https://docs.pydantic.dev/2.10/concepts/models/

A base class for creating Pydantic models.

Attributes

__class_vars__: The names of the class variables defined on the model.
__private_attributes__: Metadata about the private attributes of the model.
__signature__: The synthesized __init__ [Signature][inspect.Signature] of the model.
__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom __init__ function.
__pydantic_decorators__: Metadata containing the decorators defined on the model. This replaces Model.__validators__ and Model.__root_validators__ from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to args, origin, parameters in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [RootModel][pydantic.root_model.RootModel].
__pydantic_serializer__: The pydantic-core SchemaSerializer used to dump instances of the model.
__pydantic_validator__: The pydantic-core SchemaValidator used to validate instances of the model.
__pydantic_fields__: A dictionary of field names and their corresponding [FieldInfo][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [ComputedFieldInfo][pydantic.fields.ComputedFieldInfo] objects.
__pydantic_extra__: A dictionary containing extra values, if [extra][pydantic.config.ConfigDict.extra] is set to 'allow'.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

Ancestors

pydantic.main.BaseModel

Class variables

var created_at : float
var metrics : livekit.agents.metrics.base.STTMetrics | livekit.agents.metrics.base.LLMMetrics | livekit.agents.metrics.base.TTSMetrics | livekit.agents.metrics.base.VADMetrics | livekit.agents.metrics.base.EOUMetrics | livekit.agents.metrics.base.RealtimeModelMetrics
var model_config
var type : Literal['metrics_collected']

class ModelSettings (tool_choice: NotGivenOr[llm.ToolChoice] = NOT_GIVEN)

Expand source code

@dataclass
class ModelSettings:
    tool_choice: NotGivenOr[llm.ToolChoice] = NOT_GIVEN
    """The tool choice to use when calling the LLM."""

ModelSettings(tool_choice: 'NotGivenOr[llm.ToolChoice]' = NOT_GIVEN)

Instance variables

var tool_choice : livekit.agents.llm.tool_context.NamedToolChoice | Literal['auto', 'required', 'none'] | livekit.agents.types.NotGiven: The tool choice to use when calling the LLM.

class RunContext (*, session: AgentSession[Userdata_T], speech_handle: SpeechHandle, function_call: FunctionCall)

Expand source code

class RunContext(Generic[Userdata_T]):
    # private ctor
    def __init__(
        self,
        *,
        session: AgentSession[Userdata_T],
        speech_handle: SpeechHandle,
        function_call: FunctionCall,
    ) -> None:
        self._session = session
        self._speech_handle = speech_handle
        self._function_call = function_call

        self._initial_step_idx = speech_handle.num_steps - 1

    @property
    def session(self) -> AgentSession[Userdata_T]:
        return self._session

    @property
    def speech_handle(self) -> SpeechHandle:
        return self._speech_handle

    @property
    def function_call(self) -> FunctionCall:
        return self._function_call

    @property
    def userdata(self) -> Userdata_T:
        return self.session.userdata

    def disallow_interruptions(self) -> None:
        """Disable interruptions for this FunctionCall.

        Delegates to the SpeechHandle.allow_interruptions setter,
        which will raise a RuntimeError if the handle is already interrupted.

        Raises:
            RuntimeError: If the SpeechHandle is already interrupted.
        """
        self.speech_handle.allow_interruptions = False

    async def wait_for_playout(self) -> None:
        """Waits for the speech playout corresponding to this function call step.

        Unlike `SpeechHandle.wait_for_playout`, which waits for the full
        assistant turn to complete (including all function tools),
        this method only waits for the assistant's spoken response prior running
        this tool to finish playing."""
        await self.speech_handle._wait_for_generation(step_idx=self._initial_step_idx)

Abstract base class for generic types.

On Python 3.12 and newer, generic classes implicitly inherit from Generic when they declare a parameter list after the class's name::

class Mapping[KT, VT]:
    def __getitem__(self, key: KT) -> VT:
        ...
    # Etc.

On older versions of Python, however, generic classes have to explicitly inherit from Generic.

After a class has been declared to be generic, it can then be used as follows::

def lookup_name[KT, VT](mapping: Mapping[KT, VT], key: KT, default: VT) -> VT:
    try:
        return mapping[key]
    except KeyError:
        return default

Ancestors

typing.Generic

Instance variables

prop function_call : FunctionCall

Expand source code

@property
def function_call(self) -> FunctionCall:
    return self._function_call

prop session : AgentSession[Userdata_T]

Expand source code

@property
def session(self) -> AgentSession[Userdata_T]:
    return self._session

prop speech_handle : SpeechHandle

Expand source code

@property
def speech_handle(self) -> SpeechHandle:
    return self._speech_handle

prop userdata : Userdata_T

Expand source code

@property
def userdata(self) -> Userdata_T:
    return self.session.userdata

Methods

def disallow_interruptions(self) ‑> None

Expand source code

def disallow_interruptions(self) -> None:
    """Disable interruptions for this FunctionCall.

    Delegates to the SpeechHandle.allow_interruptions setter,
    which will raise a RuntimeError if the handle is already interrupted.

    Raises:
        RuntimeError: If the SpeechHandle is already interrupted.
    """
    self.speech_handle.allow_interruptions = False

Disable interruptions for this FunctionCall.

Delegates to the SpeechHandle.allow_interruptions setter, which will raise a RuntimeError if the handle is already interrupted.

Raises

RuntimeError: If the SpeechHandle is already interrupted.

async def wait_for_playout(self) ‑> None

Expand source code

async def wait_for_playout(self) -> None:
    """Waits for the speech playout corresponding to this function call step.

    Unlike `SpeechHandle.wait_for_playout`, which waits for the full
    assistant turn to complete (including all function tools),
    this method only waits for the assistant's spoken response prior running
    this tool to finish playing."""
    await self.speech_handle._wait_for_generation(step_idx=self._initial_step_idx)

Waits for the speech playout corresponding to this function call step.

Unlike SpeechHandle.wait_for_playout(), which waits for the full assistant turn to complete (including all function tools), this method only waits for the assistant's spoken response prior running this tool to finish playing.

class SpeechCreatedEvent (**data: Any)

Expand source code

class SpeechCreatedEvent(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    type: Literal["speech_created"] = "speech_created"
    user_initiated: bool
    """True if the speech was created using public methods like `say` or `generate_reply`"""
    source: Literal["say", "generate_reply"]
    """Source indicating how the speech handle was created"""
    speech_handle: SpeechHandle = Field(..., exclude=True)
    """The speech handle that was created"""
    created_at: float = Field(default_factory=time.time)

Usage docs: https://docs.pydantic.dev/2.10/concepts/models/

A base class for creating Pydantic models.

Attributes

__class_vars__: The names of the class variables defined on the model.
__private_attributes__: Metadata about the private attributes of the model.
__signature__: The synthesized __init__ [Signature][inspect.Signature] of the model.
__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom __init__ function.
__pydantic_decorators__: Metadata containing the decorators defined on the model. This replaces Model.__validators__ and Model.__root_validators__ from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to args, origin, parameters in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [RootModel][pydantic.root_model.RootModel].
__pydantic_serializer__: The pydantic-core SchemaSerializer used to dump instances of the model.
__pydantic_validator__: The pydantic-core SchemaValidator used to validate instances of the model.
__pydantic_fields__: A dictionary of field names and their corresponding [FieldInfo][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [ComputedFieldInfo][pydantic.fields.ComputedFieldInfo] objects.
__pydantic_extra__: A dictionary containing extra values, if [extra][pydantic.config.ConfigDict.extra] is set to 'allow'.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

Ancestors

pydantic.main.BaseModel

Class variables

var created_at : float
var model_config
var source : Literal['say', 'generate_reply']: Source indicating how the speech handle was created
var speech_handle : livekit.agents.voice.speech_handle.SpeechHandle: The speech handle that was created
var type : Literal['speech_created']
var user_initiated : bool: True if the speech was created using public methods like say or generate_reply

class SpeechHandle (*, speech_id: str, allow_interruptions: bool)

Expand source code

class SpeechHandle:
    SPEECH_PRIORITY_LOW = 0
    """Priority for messages that should be played after all other messages in the queue"""
    SPEECH_PRIORITY_NORMAL = 5
    """Every speech generates by the VoiceAgent defaults to this priority."""
    SPEECH_PRIORITY_HIGH = 10
    """Priority for important messages that should be played before others."""

    def __init__(self, *, speech_id: str, allow_interruptions: bool) -> None:
        self._id = speech_id
        self._allow_interruptions = allow_interruptions

        self._interrupt_fut = asyncio.Future[None]()
        self._done_fut = asyncio.Future[None]()
        self._scheduled_fut = asyncio.Future[None]()
        self._authorize_event = asyncio.Event()

        self._generations: list[asyncio.Future[None]] = []

        # internal tasks used by this generation
        self._tasks: list[asyncio.Task] = []
        self._chat_items: list[llm.ChatItem] = []
        self._num_steps = 1
        self._agent_turn_context: otel_context.Context | None = None

        self._interrupt_timeout_handle: asyncio.TimerHandle | None = None

        self._item_added_callbacks: set[Callable[[llm.ChatItem], None]] = set()
        self._done_callbacks: set[Callable[[SpeechHandle], None]] = set()

        def _on_done(_: asyncio.Future[None]) -> None:
            for cb in self._done_callbacks:
                cb(self)

        self._done_fut.add_done_callback(_on_done)
        self._maybe_run_final_output: Any = None  # kept private

    @staticmethod
    def create(allow_interruptions: bool = True) -> SpeechHandle:
        return SpeechHandle(
            speech_id=utils.shortuuid("speech_"),
            allow_interruptions=allow_interruptions,
        )

    @property
    def num_steps(self) -> int:
        return self._num_steps

    @property
    def id(self) -> str:
        return self._id

    @property
    def _generation_id(self) -> str:
        return f"{self._id}_{self._num_steps}"

    @property
    def _parent_generation_id(self) -> str | None:
        if self._num_steps <= 1:
            return None
        return f"{self._id}_{self._num_steps - 1}"

    @property
    def scheduled(self) -> bool:
        return self._scheduled_fut.done()

    @property
    def interrupted(self) -> bool:
        return self._interrupt_fut.done()

    @property
    def allow_interruptions(self) -> bool:
        return self._allow_interruptions

    @allow_interruptions.setter
    def allow_interruptions(self, value: bool) -> None:
        """Allow or disallow interruptions on this SpeechHandle.

        When set to False, the SpeechHandle will no longer accept any incoming
        interruption requests until re-enabled. If the handle is already
        interrupted, clearing interruptions is not allowed.

        Args:
            value (bool): True to allow interruptions, False to disallow.

        Raises:
            RuntimeError: If attempting to disable interruptions when already interrupted.
        """
        if self.interrupted and not value:
            raise RuntimeError(
                "Cannot set allow_interruptions to False, the SpeechHandle is already interrupted"
            )

        self._allow_interruptions = value

    @property
    def chat_items(self) -> list[llm.ChatItem]:
        return self._chat_items

    def done(self) -> bool:
        return self._done_fut.done()

    def interrupt(self, *, force: bool = False) -> SpeechHandle:
        """Interrupt the current speech generation.

        Raises:
            RuntimeError: If this speech handle does not allow interruptions.

        Returns:
            SpeechHandle: The same speech handle that was interrupted.
        """
        if not force and not self._allow_interruptions:
            raise RuntimeError("This generation handle does not allow interruptions")

        self._cancel()
        return self

    async def wait_for_playout(self) -> None:
        """Waits for the entire assistant turn to complete playback.

        This method waits until the assistant has fully finished speaking,
        including any finalization steps beyond initial response generation.
        This is appropriate to call when you want to ensure the speech output
        has entirely played out, including any tool calls and response follow-ups."""

        # raise an error to avoid developer mistakes
        from .agent import _get_activity_task_info

        if task := asyncio.current_task():
            info = _get_activity_task_info(task)
            if info and info.function_call and info.speech_handle == self:
                raise RuntimeError(
                    f"cannot call `SpeechHandle.wait_for_playout()` from inside the function tool `{info.function_call.name}` that owns this SpeechHandle. "
                    "This creates a circular wait: the speech handle is waiting for the function tool to complete, "
                    "while the function tool is simultaneously waiting for the speech handle.\n"
                    "To wait for the assistant’s spoken response prior to running this tool, use `RunContext.wait_for_playout()` instead."
                )

        await asyncio.shield(self._done_fut)

    def __await__(self) -> Generator[None, None, SpeechHandle]:
        async def _await_impl() -> SpeechHandle:
            await self.wait_for_playout()
            return self

        return _await_impl().__await__()

    def add_done_callback(self, callback: Callable[[SpeechHandle], None]) -> None:
        self._done_callbacks.add(callback)

    def remove_done_callback(self, callback: Callable[[SpeechHandle], None]) -> None:
        self._done_callbacks.discard(callback)

    async def wait_if_not_interrupted(self, aw: list[asyncio.futures.Future[Any]]) -> None:
        fs: list[asyncio.Future[Any]] = [
            asyncio.gather(*aw, return_exceptions=True),
            self._interrupt_fut,
        ]
        await asyncio.wait(fs, return_when=asyncio.FIRST_COMPLETED)

    def _cancel(self) -> SpeechHandle:
        if self.done():
            return self

        if not self._interrupt_fut.done():
            self._interrupt_fut.set_result(None)

            def _on_timeout() -> None:
                logger.error(
                    "speech not done in time after interruption, cancelling the speech arbitrarily.",
                    extra={"speech_id": self._id, "timeout": INTERRUPTION_TIMEOUT},
                )
                for task in self._tasks:
                    task.cancel()
                self._mark_done()

            self._interrupt_timeout_handle = asyncio.get_event_loop().call_later(
                INTERRUPTION_TIMEOUT, _on_timeout
            )

        return self

    def _add_item_added_callback(self, callback: Callable[[llm.ChatItem], Any]) -> None:
        self._item_added_callbacks.add(callback)

    def _remove_item_added_callback(self, callback: Callable[[llm.ChatItem], Any]) -> None:
        self._item_added_callbacks.discard(callback)

    def _item_added(self, items: Sequence[llm.ChatItem]) -> None:
        for item in items:
            for cb in self._item_added_callbacks:
                cb(item)

            self._chat_items.append(item)

    def _authorize_generation(self) -> None:
        fut = asyncio.Future[None]()
        self._generations.append(fut)
        self._authorize_event.set()

    def _clear_authorization(self) -> None:
        self._authorize_event.clear()

    async def _wait_for_authorization(self) -> None:
        await self._authorize_event.wait()

    async def _wait_for_generation(self, step_idx: int = -1) -> None:
        if not self._generations:
            raise RuntimeError("cannot use wait_for_generation: no active generation is running.")

        await asyncio.shield(self._generations[step_idx])

    async def _wait_for_scheduled(self) -> None:
        await asyncio.shield(self._scheduled_fut)

    def _mark_generation_done(self) -> None:
        if not self._generations:
            raise RuntimeError("cannot use mark_generation_done: no active generation is running.")

        with contextlib.suppress(asyncio.InvalidStateError):
            self._generations[-1].set_result(None)

    def _mark_done(self) -> None:
        with contextlib.suppress(asyncio.InvalidStateError):
            # will raise InvalidStateError if the future is already done (interrupted)
            self._done_fut.set_result(None)
            if self._generations:
                self._mark_generation_done()  # preemptive generation could be cancelled before being scheduled

        if self._interrupt_timeout_handle is not None:
            self._interrupt_timeout_handle.cancel()
            self._interrupt_timeout_handle = None

    def _mark_scheduled(self) -> None:
        with contextlib.suppress(asyncio.InvalidStateError):
            self._scheduled_fut.set_result(None)

Class variables

var SPEECH_PRIORITY_HIGH: Priority for important messages that should be played before others.
var SPEECH_PRIORITY_LOW: Priority for messages that should be played after all other messages in the queue
var SPEECH_PRIORITY_NORMAL: Every speech generates by the VoiceAgent defaults to this priority.

Static methods

def create(allow_interruptions: bool = True) ‑> livekit.agents.voice.speech_handle.SpeechHandle

Expand source code

@staticmethod
def create(allow_interruptions: bool = True) -> SpeechHandle:
    return SpeechHandle(
        speech_id=utils.shortuuid("speech_"),
        allow_interruptions=allow_interruptions,
    )

Instance variables

prop allow_interruptions : bool

Expand source code

@property
def allow_interruptions(self) -> bool:
    return self._allow_interruptions

prop chat_items : list[llm.ChatItem]

Expand source code

@property
def chat_items(self) -> list[llm.ChatItem]:
    return self._chat_items

prop id : str

Expand source code

@property
def id(self) -> str:
    return self._id

prop interrupted : bool

Expand source code

@property
def interrupted(self) -> bool:
    return self._interrupt_fut.done()

prop num_steps : int

Expand source code

@property
def num_steps(self) -> int:
    return self._num_steps

prop scheduled : bool

Expand source code

@property
def scheduled(self) -> bool:
    return self._scheduled_fut.done()

Methods

def add_done_callback(self, callback: Callable[[SpeechHandle], None]) ‑> None

Expand source code

def add_done_callback(self, callback: Callable[[SpeechHandle], None]) -> None:
    self._done_callbacks.add(callback)

def done(self) ‑> bool

Expand source code

def done(self) -> bool:
    return self._done_fut.done()

def interrupt(self, *, force: bool = False) ‑> livekit.agents.voice.speech_handle.SpeechHandle

Expand source code

def interrupt(self, *, force: bool = False) -> SpeechHandle:
    """Interrupt the current speech generation.

    Raises:
        RuntimeError: If this speech handle does not allow interruptions.

    Returns:
        SpeechHandle: The same speech handle that was interrupted.
    """
    if not force and not self._allow_interruptions:
        raise RuntimeError("This generation handle does not allow interruptions")

    self._cancel()
    return self

Interrupt the current speech generation.

Raises

RuntimeError: If this speech handle does not allow interruptions.

Returns

SpeechHandle: The same speech handle that was interrupted.

def remove_done_callback(self, callback: Callable[[SpeechHandle], None]) ‑> None

Expand source code

def remove_done_callback(self, callback: Callable[[SpeechHandle], None]) -> None:
    self._done_callbacks.discard(callback)

async def wait_for_playout(self) ‑> None

Expand source code

async def wait_for_playout(self) -> None:
    """Waits for the entire assistant turn to complete playback.

    This method waits until the assistant has fully finished speaking,
    including any finalization steps beyond initial response generation.
    This is appropriate to call when you want to ensure the speech output
    has entirely played out, including any tool calls and response follow-ups."""

    # raise an error to avoid developer mistakes
    from .agent import _get_activity_task_info

    if task := asyncio.current_task():
        info = _get_activity_task_info(task)
        if info and info.function_call and info.speech_handle == self:
            raise RuntimeError(
                f"cannot call `SpeechHandle.wait_for_playout()` from inside the function tool `{info.function_call.name}` that owns this SpeechHandle. "
                "This creates a circular wait: the speech handle is waiting for the function tool to complete, "
                "while the function tool is simultaneously waiting for the speech handle.\n"
                "To wait for the assistant’s spoken response prior to running this tool, use `RunContext.wait_for_playout()` instead."
            )

    await asyncio.shield(self._done_fut)

Waits for the entire assistant turn to complete playback.

This method waits until the assistant has fully finished speaking, including any finalization steps beyond initial response generation. This is appropriate to call when you want to ensure the speech output has entirely played out, including any tool calls and response follow-ups.

async def wait_if_not_interrupted(self, aw: list[asyncio.futures.Future[Any]]) ‑> None

Expand source code

async def wait_if_not_interrupted(self, aw: list[asyncio.futures.Future[Any]]) -> None:
    fs: list[asyncio.Future[Any]] = [
        asyncio.gather(*aw, return_exceptions=True),
        self._interrupt_fut,
    ]
    await asyncio.wait(fs, return_when=asyncio.FIRST_COMPLETED)

class TranscriptSynchronizer (*, next_in_chain_audio: AudioOutput, next_in_chain_text: TextOutput, speed: float = 1.0, hyphenate_word: Callable[[str], list[str]] = <function hyphenate_word>, word_tokenizer: NotGivenOr[tokenize.WordTokenizer] = NOT_GIVEN)

Expand source code

class TranscriptSynchronizer:
    """
    Synchronizes text with audio playback timing.

    This class is responsible for synchronizing text with audio playback timing.
    It currently assumes that the first push_audio is starting the audio playback of a segment.
    """

    def __init__(
        self,
        *,
        next_in_chain_audio: io.AudioOutput,
        next_in_chain_text: io.TextOutput,
        speed: float = 1.0,
        hyphenate_word: Callable[[str], list[str]] = tokenize.basic.hyphenate_word,
        word_tokenizer: NotGivenOr[tokenize.WordTokenizer] = NOT_GIVEN,
    ) -> None:
        super().__init__()

        self._text_output = _SyncedTextOutput(self, next_in_chain=next_in_chain_text)
        self._audio_output = _SyncedAudioOutput(self, next_in_chain=next_in_chain_audio)
        self._text_attached, self._audio_attached = True, True
        self._opts = _TextSyncOptions(
            speed=speed,
            hyphenate_word=hyphenate_word,
            word_tokenizer=(
                word_tokenizer
                or tokenize.basic.WordTokenizer(
                    retain_format=True, ignore_punctuation=False, split_character=True
                )
            ),
            speaking_rate_detector=SpeakingRateDetector(),
        )
        self._enabled = True
        self._closed = False

        # initial segment/first segment, recreated for each new segment
        self._impl = _SegmentSynchronizerImpl(options=self._opts, next_in_chain=next_in_chain_text)
        self._rotate_segment_atask: asyncio.Task[None] | None = None

    @property
    def audio_output(self) -> _SyncedAudioOutput:
        return self._audio_output

    @property
    def text_output(self) -> _SyncedTextOutput:
        return self._text_output

    @property
    def enabled(self) -> bool:
        return self._enabled

    async def aclose(self) -> None:
        self._closed = True
        await self.barrier()
        await self._impl.aclose()

    def set_enabled(self, enabled: bool) -> None:
        if self._enabled == enabled:
            return

        self._enabled = enabled
        if not self._rotate_segment_atask or self._rotate_segment_atask.done():
            # avoid calling rotate_segment twice when closing the session during agent speaking
            # first time when speech interrupted, second time here when output detached
            self.rotate_segment()

    def _on_attachment_changed(
        self,
        *,
        audio_attached: NotGivenOr[bool] = NOT_GIVEN,
        text_attached: NotGivenOr[bool] = NOT_GIVEN,
    ) -> None:
        if is_given(audio_attached):
            self._audio_attached = audio_attached

        if is_given(text_attached):
            self._text_attached = text_attached

        self.set_enabled(self._audio_attached and self._text_attached)

    async def _rotate_segment_task(self, old_task: asyncio.Task[None] | None) -> None:
        if old_task:
            await old_task

        await self._impl.aclose()
        self._impl = _SegmentSynchronizerImpl(
            options=self._opts, next_in_chain=self._text_output._next_in_chain
        )

    def rotate_segment(self) -> None:
        if self._closed:
            return

        if self._rotate_segment_atask and not self._rotate_segment_atask.done():
            logger.warning("rotate_segment called while previous segment is still being rotated")

        self._rotate_segment_atask = asyncio.create_task(
            self._rotate_segment_task(self._rotate_segment_atask)
        )

    async def barrier(self) -> None:
        if self._rotate_segment_atask is None:
            return

        # using a while loop in case rotate_segment is called twice (this should not happen, but
        # just in case, we do log a warning if it does)
        while not self._rotate_segment_atask.done():
            await self._rotate_segment_atask

Synchronizes text with audio playback timing.

This class is responsible for synchronizing text with audio playback timing. It currently assumes that the first push_audio is starting the audio playback of a segment.

Instance variables

prop audio_output : _SyncedAudioOutput

Expand source code

@property
def audio_output(self) -> _SyncedAudioOutput:
    return self._audio_output

prop enabled : bool

Expand source code

@property
def enabled(self) -> bool:
    return self._enabled

prop text_output : _SyncedTextOutput

Expand source code

@property
def text_output(self) -> _SyncedTextOutput:
    return self._text_output

Methods

async def aclose(self) ‑> None

Expand source code

async def aclose(self) -> None:
    self._closed = True
    await self.barrier()
    await self._impl.aclose()

async def barrier(self) ‑> None

Expand source code

async def barrier(self) -> None:
    if self._rotate_segment_atask is None:
        return

    # using a while loop in case rotate_segment is called twice (this should not happen, but
    # just in case, we do log a warning if it does)
    while not self._rotate_segment_atask.done():
        await self._rotate_segment_atask

def rotate_segment(self) ‑> None

Expand source code

def rotate_segment(self) -> None:
    if self._closed:
        return

    if self._rotate_segment_atask and not self._rotate_segment_atask.done():
        logger.warning("rotate_segment called while previous segment is still being rotated")

    self._rotate_segment_atask = asyncio.create_task(
        self._rotate_segment_task(self._rotate_segment_atask)
    )

def set_enabled(self, enabled: bool) ‑> None

Expand source code

def set_enabled(self, enabled: bool) -> None:
    if self._enabled == enabled:
        return

    self._enabled = enabled
    if not self._rotate_segment_atask or self._rotate_segment_atask.done():
        # avoid calling rotate_segment twice when closing the session during agent speaking
        # first time when speech interrupted, second time here when output detached
        self.rotate_segment()

class UserInputTranscribedEvent (**data: Any)

Expand source code

class UserInputTranscribedEvent(BaseModel):
    type: Literal["user_input_transcribed"] = "user_input_transcribed"
    transcript: str
    is_final: bool
    speaker_id: str | None = None
    language: str | None = None
    created_at: float = Field(default_factory=time.time)

Usage docs: https://docs.pydantic.dev/2.10/concepts/models/

A base class for creating Pydantic models.

Attributes

__class_vars__: The names of the class variables defined on the model.
__private_attributes__: Metadata about the private attributes of the model.
__signature__: The synthesized __init__ [Signature][inspect.Signature] of the model.
__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom __init__ function.
__pydantic_decorators__: Metadata containing the decorators defined on the model. This replaces Model.__validators__ and Model.__root_validators__ from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to args, origin, parameters in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [RootModel][pydantic.root_model.RootModel].
__pydantic_serializer__: The pydantic-core SchemaSerializer used to dump instances of the model.
__pydantic_validator__: The pydantic-core SchemaValidator used to validate instances of the model.
__pydantic_fields__: A dictionary of field names and their corresponding [FieldInfo][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [ComputedFieldInfo][pydantic.fields.ComputedFieldInfo] objects.
__pydantic_extra__: A dictionary containing extra values, if [extra][pydantic.config.ConfigDict.extra] is set to 'allow'.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

Ancestors

pydantic.main.BaseModel

Class variables

var created_at : float
var is_final : bool
var language : str | None
var model_config
var speaker_id : str | None
var transcript : str
var type : Literal['user_input_transcribed']

class UserStateChangedEvent (**data: Any)

Expand source code

class UserStateChangedEvent(BaseModel):
    type: Literal["user_state_changed"] = "user_state_changed"
    old_state: UserState
    new_state: UserState
    created_at: float = Field(default_factory=time.time)

Usage docs: https://docs.pydantic.dev/2.10/concepts/models/

A base class for creating Pydantic models.

Attributes

__class_vars__: The names of the class variables defined on the model.
__private_attributes__: Metadata about the private attributes of the model.
__signature__: The synthesized __init__ [Signature][inspect.Signature] of the model.
__pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
__pydantic_core_schema__: The core schema of the model.
__pydantic_custom_init__: Whether the model has a custom __init__ function.
__pydantic_decorators__: Metadata containing the decorators defined on the model. This replaces Model.__validators__ and Model.__root_validators__ from Pydantic V1.
__pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to args, origin, parameters in typing-module generics. May eventually be replaced by these.
__pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models.
__pydantic_post_init__: The name of the post-init method for the model, if defined.
__pydantic_root_model__: Whether the model is a [RootModel][pydantic.root_model.RootModel].
__pydantic_serializer__: The pydantic-core SchemaSerializer used to dump instances of the model.
__pydantic_validator__: The pydantic-core SchemaValidator used to validate instances of the model.
__pydantic_fields__: A dictionary of field names and their corresponding [FieldInfo][pydantic.fields.FieldInfo] objects.
__pydantic_computed_fields__: A dictionary of computed field names and their corresponding [ComputedFieldInfo][pydantic.fields.ComputedFieldInfo] objects.
__pydantic_extra__: A dictionary containing extra values, if [extra][pydantic.config.ConfigDict.extra] is set to 'allow'.
__pydantic_fields_set__: The names of fields explicitly set during instantiation.
__pydantic_private__: Values of private attributes set on the model instance.

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

Ancestors

pydantic.main.BaseModel

Class variables

var created_at : float
var model_config
var new_state : Literal['speaking', 'listening', 'away']
var old_state : Literal['speaking', 'listening', 'away']
var type : Literal['user_state_changed']

class VoiceActivityVideoSampler (*, speaking_fps: float = 1.0, silent_fps: float = 0.3)

Expand source code

class VoiceActivityVideoSampler:
    def __init__(self, *, speaking_fps: float = 1.0, silent_fps: float = 0.3):
        self.speaking_fps = speaking_fps
        self.silent_fps = silent_fps
        self._last_sampled_time: float | None = None

    def __call__(self, frame: rtc.VideoFrame, session: AgentSession) -> bool:
        now = time.time()
        is_speaking = session.user_state == "speaking"
        target_fps = self.speaking_fps if is_speaking else self.silent_fps
        if target_fps == 0:
            return False
        min_frame_interval = 1.0 / target_fps

        if self._last_sampled_time is None:
            self._last_sampled_time = now
            return True

        if (now - self._last_sampled_time) >= min_frame_interval:
            self._last_sampled_time = now
            return True

        return False

class _ParticipantAudioOutput (room: rtc.Room, *, sample_rate: int, num_channels: int, track_publish_options: rtc.TrackPublishOptions, track_name: str = 'roomio_audio')

Expand source code

class _ParticipantAudioOutput(io.AudioOutput):
    def __init__(
        self,
        room: rtc.Room,
        *,
        sample_rate: int,
        num_channels: int,
        track_publish_options: rtc.TrackPublishOptions,
        track_name: str = "roomio_audio",
    ) -> None:
        super().__init__(
            label="RoomIO",
            next_in_chain=None,
            sample_rate=sample_rate,
            capabilities=io.AudioOutputCapabilities(pause=True),
        )
        self._room = room
        self._track_name = track_name
        self._lock = asyncio.Lock()
        self._audio_source = rtc.AudioSource(sample_rate, num_channels, queue_size_ms=200)
        self._publish_options = track_publish_options
        self._publication: rtc.LocalTrackPublication | None = None
        self._subscribed_fut = asyncio.Future[None]()

        self._audio_buf = utils.aio.Chan[rtc.AudioFrame]()
        self._audio_bstream = utils.audio.AudioByteStream(
            sample_rate, num_channels, samples_per_channel=sample_rate // 20
        )  # chunk the frame into a small, fixed size

        # used to republish track on reconnection
        self._republish_task: asyncio.Task[None] | None = None
        self._flush_task: asyncio.Task[None] | None = None
        self._interrupted_event = asyncio.Event()
        self._forwarding_task: asyncio.Task[None] | None = None

        self._pushed_duration: float = 0.0

        self._playback_enabled = asyncio.Event()
        self._playback_enabled.set()

    async def _publish_track(self) -> None:
        async with self._lock:
            track = rtc.LocalAudioTrack.create_audio_track(self._track_name, self._audio_source)
            self._publication = await self._room.local_participant.publish_track(
                track, self._publish_options
            )
            await self._publication.wait_for_subscription()
            if not self._subscribed_fut.done():
                self._subscribed_fut.set_result(None)

    @property
    def subscribed(self) -> asyncio.Future[None]:
        return self._subscribed_fut

    async def start(self) -> None:
        self._forwarding_task = asyncio.create_task(self._forward_audio())
        await self._publish_track()
        self._room.on("reconnected", self._on_reconnected)

    async def aclose(self) -> None:
        self._room.off("reconnected", self._on_reconnected)
        if self._republish_task:
            await utils.aio.cancel_and_wait(self._republish_task)
        if self._flush_task:
            await utils.aio.cancel_and_wait(self._flush_task)
        if self._forwarding_task:
            await utils.aio.cancel_and_wait(self._forwarding_task)

        await self._audio_source.aclose()

    async def capture_frame(self, frame: rtc.AudioFrame) -> None:
        await self._subscribed_fut

        await super().capture_frame(frame)

        if self._flush_task and not self._flush_task.done():
            logger.error("capture_frame called while flush is in progress")
            await self._flush_task

        for f in self._audio_bstream.push(frame.data):
            await self._audio_buf.send(f)
            self._pushed_duration += f.duration

    def flush(self) -> None:
        super().flush()

        for f in self._audio_bstream.flush():
            self._audio_buf.send_nowait(f)
            self._pushed_duration += f.duration

        if not self._pushed_duration:
            return

        if self._flush_task and not self._flush_task.done():
            # shouldn't happen if only one active speech handle at a time
            logger.error("flush called while playback is in progress")
            self._flush_task.cancel()

        self._flush_task = asyncio.create_task(self._wait_for_playout())

    def clear_buffer(self) -> None:
        self._audio_bstream.clear()

        if not self._pushed_duration:
            return
        self._interrupted_event.set()

    def pause(self) -> None:
        super().pause()
        self._playback_enabled.clear()
        # self._audio_source.clear_queue()

    def resume(self) -> None:
        super().resume()
        self._playback_enabled.set()

    async def _wait_for_playout(self) -> None:
        wait_for_interruption = asyncio.create_task(self._interrupted_event.wait())

        async def _wait_buffered_audio() -> None:
            while not self._audio_buf.empty():
                if not self._playback_enabled.is_set():
                    await self._playback_enabled.wait()

                await self._audio_source.wait_for_playout()
                # avoid deadlock when clear_buffer called before capture_frame
                await asyncio.sleep(0)

        wait_for_playout = asyncio.create_task(_wait_buffered_audio())
        await asyncio.wait(
            [wait_for_playout, wait_for_interruption],
            return_when=asyncio.FIRST_COMPLETED,
        )

        interrupted = wait_for_interruption.done()
        pushed_duration = self._pushed_duration

        if interrupted:
            queued_duration = self._audio_source.queued_duration
            while not self._audio_buf.empty():
                queued_duration += self._audio_buf.recv_nowait().duration

            pushed_duration = max(pushed_duration - queued_duration, 0)
            self._audio_source.clear_queue()
            wait_for_playout.cancel()
        else:
            wait_for_interruption.cancel()

        self._pushed_duration = 0
        self._interrupted_event.clear()
        self.on_playback_finished(playback_position=pushed_duration, interrupted=interrupted)

    async def _forward_audio(self) -> None:
        async for frame in self._audio_buf:
            if not self._playback_enabled.is_set():
                self._audio_source.clear_queue()
                await self._playback_enabled.wait()
                # TODO(long): save the frames in the queue and play them later
                # TODO(long): ignore frames from previous syllable

            if self._interrupted_event.is_set() or self._pushed_duration == 0:
                if self._interrupted_event.is_set() and self._flush_task:
                    await self._flush_task

                # ignore frames if interrupted
                continue

            await self._audio_source.capture_frame(frame)

    def _on_reconnected(self) -> None:
        if self._republish_task:
            self._republish_task.cancel()
        self._republish_task = asyncio.create_task(self._publish_track())

Helper class that provides a standard way to create an ABC using inheritance.

Args

sample_rate: The sample rate required by the audio sink, if None, any sample rate is accepted

Ancestors

AudioOutput
abc.ABC
EventEmitter
typing.Generic

Instance variables

prop subscribed : asyncio.Future[None]

Expand source code

@property
def subscribed(self) -> asyncio.Future[None]:
    return self._subscribed_fut

Methods

async def aclose(self) ‑> None

Expand source code

async def aclose(self) -> None:
    self._room.off("reconnected", self._on_reconnected)
    if self._republish_task:
        await utils.aio.cancel_and_wait(self._republish_task)
    if self._flush_task:
        await utils.aio.cancel_and_wait(self._flush_task)
    if self._forwarding_task:
        await utils.aio.cancel_and_wait(self._forwarding_task)

    await self._audio_source.aclose()

async def start(self) ‑> None

Expand source code

async def start(self) -> None:
    self._forwarding_task = asyncio.create_task(self._forward_audio())
    await self._publish_track()
    self._room.on("reconnected", self._on_reconnected)

Inherited members

AudioOutput:
- capture_frame
- clear_buffer
- emit
- flush
- off
- on
- on_playback_finished
- once
- pause
- resume
- sample_rate
- wait_for_playout

class _ParticipantStreamTranscriptionOutput (room: rtc.Room, *, is_delta_stream: bool = True, participant: rtc.Participant | str | None = None, attributes: dict[str, str] | None = None)

Expand source code

class _ParticipantStreamTranscriptionOutput:
    def __init__(
        self,
        room: rtc.Room,
        *,
        is_delta_stream: bool = True,
        participant: rtc.Participant | str | None = None,
        attributes: dict[str, str] | None = None,
    ):
        self._room, self._is_delta_stream = room, is_delta_stream
        self._track_id: str | None = None
        self._participant_identity: str | None = None
        self._additional_attributes = attributes or {}

        self._writer: rtc.TextStreamWriter | None = None

        self._room.on("track_published", self._on_track_published)
        self._room.on("local_track_published", self._on_local_track_published)
        self._flush_atask: asyncio.Task[None] | None = None

        self._reset_state()
        self.set_participant(participant)

    def set_participant(
        self,
        participant: rtc.Participant | str | None,
    ) -> None:
        self._participant_identity = (
            participant.identity if isinstance(participant, rtc.Participant) else participant
        )
        if self._participant_identity is None:
            return

        try:
            self._track_id = find_micro_track_id(self._room, self._participant_identity)
        except ValueError:
            # track id is optional for TextStream when audio is not published
            self._track_id = None

        self.flush()
        self._reset_state()

    def _reset_state(self) -> None:
        self._current_id = utils.shortuuid("SG_")
        self._capturing = False
        self._latest_text = ""

    async def _create_text_writer(
        self, attributes: dict[str, str] | None = None
    ) -> rtc.TextStreamWriter:
        assert self._participant_identity is not None, "participant_identity is not set"

        if not attributes:
            attributes = {
                ATTRIBUTE_TRANSCRIPTION_FINAL: "false",
            }
            if self._track_id:
                attributes[ATTRIBUTE_TRANSCRIPTION_TRACK_ID] = self._track_id
        attributes[ATTRIBUTE_TRANSCRIPTION_SEGMENT_ID] = self._current_id

        for key, val in self._additional_attributes.items():
            if key not in attributes:
                attributes[key] = val

        return await self._room.local_participant.stream_text(
            topic=TOPIC_TRANSCRIPTION,
            sender_identity=self._participant_identity,
            attributes=attributes,
        )

    @utils.log_exceptions(logger=logger)
    async def capture_text(self, text: str) -> None:
        if self._participant_identity is None:
            return

        if self._flush_atask and not self._flush_atask.done():
            await self._flush_atask

        if not self._capturing:
            self._reset_state()
            self._capturing = True

        self._latest_text = text

        try:
            if self._room.isconnected():
                if self._is_delta_stream:  # reuse the existing writer
                    if self._writer is None:
                        self._writer = await self._create_text_writer()

                    await self._writer.write(text)
                else:  # always create a new writer
                    tmp_writer = await self._create_text_writer()
                    await tmp_writer.write(text)
                    await tmp_writer.aclose()
        except Exception as e:
            logger.warning("failed to publish transcription", exc_info=e)

    async def _flush_task(self, writer: rtc.TextStreamWriter | None) -> None:
        attributes = {ATTRIBUTE_TRANSCRIPTION_FINAL: "true"}
        if self._track_id:
            attributes[ATTRIBUTE_TRANSCRIPTION_TRACK_ID] = self._track_id

        try:
            if self._room.isconnected():
                if self._is_delta_stream:
                    if writer:
                        await writer.aclose(attributes=attributes)
                else:
                    tmp_writer = await self._create_text_writer(attributes=attributes)
                    await tmp_writer.write(self._latest_text)
                    await tmp_writer.aclose()
        except Exception as e:
            logger.warning("failed to publish transcription", exc_info=e)

    def flush(self) -> None:
        if self._participant_identity is None or not self._capturing:
            return

        self._capturing = False
        curr_writer = self._writer
        self._writer = None
        self._flush_atask = asyncio.create_task(self._flush_task(curr_writer))

    def _on_track_published(
        self, track: rtc.RemoteTrackPublication, participant: rtc.RemoteParticipant
    ) -> None:
        if (
            self._participant_identity is None
            or participant.identity != self._participant_identity
            or track.source != rtc.TrackSource.SOURCE_MICROPHONE
        ):
            return

        self._track_id = track.sid

    def _on_local_track_published(self, track: rtc.LocalTrackPublication, _: rtc.Track) -> None:
        if (
            self._participant_identity is None
            or self._participant_identity != self._room.local_participant.identity
            or track.source != rtc.TrackSource.SOURCE_MICROPHONE
        ):
            return

        self._track_id = track.sid

Methods

async def capture_text(self, text: str) ‑> None

Expand source code

@utils.log_exceptions(logger=logger)
async def capture_text(self, text: str) -> None:
    if self._participant_identity is None:
        return

    if self._flush_atask and not self._flush_atask.done():
        await self._flush_atask

    if not self._capturing:
        self._reset_state()
        self._capturing = True

    self._latest_text = text

    try:
        if self._room.isconnected():
            if self._is_delta_stream:  # reuse the existing writer
                if self._writer is None:
                    self._writer = await self._create_text_writer()

                await self._writer.write(text)
            else:  # always create a new writer
                tmp_writer = await self._create_text_writer()
                await tmp_writer.write(text)
                await tmp_writer.aclose()
    except Exception as e:
        logger.warning("failed to publish transcription", exc_info=e)

def flush(self) ‑> None

Expand source code

def flush(self) -> None:
    if self._participant_identity is None or not self._capturing:
        return

    self._capturing = False
    curr_writer = self._writer
    self._writer = None
    self._flush_atask = asyncio.create_task(self._flush_task(curr_writer))

def set_participant(self, participant: rtc.Participant | str | None) ‑> None

Expand source code

def set_participant(
    self,
    participant: rtc.Participant | str | None,
) -> None:
    self._participant_identity = (
        participant.identity if isinstance(participant, rtc.Participant) else participant
    )
    if self._participant_identity is None:
        return

    try:
        self._track_id = find_micro_track_id(self._room, self._participant_identity)
    except ValueError:
        # track id is optional for TextStream when audio is not published
        self._track_id = None

    self.flush()
    self._reset_state()

class _ParticipantTranscriptionOutput (*, room: rtc.Room, is_delta_stream: bool = True, participant: rtc.Participant | str | None = None, next_in_chain: TextOutput | None = None)

Expand source code

class _ParticipantTranscriptionOutput(io.TextOutput):
    def __init__(
        self,
        *,
        room: rtc.Room,
        is_delta_stream: bool = True,
        participant: rtc.Participant | str | None = None,
        next_in_chain: io.TextOutput | None = None,
    ) -> None:
        super().__init__(label="RoomIO", next_in_chain=next_in_chain)

        self.__outputs: list[
            _ParticipantLegacyTranscriptionOutput | _ParticipantStreamTranscriptionOutput
        ] = [
            _ParticipantLegacyTranscriptionOutput(
                room=room,
                is_delta_stream=is_delta_stream,
                participant=participant,
            ),
            _ParticipantStreamTranscriptionOutput(
                room=room,
                is_delta_stream=is_delta_stream,
                participant=participant,
            ),
        ]

    def set_participant(self, participant: rtc.Participant | str | None) -> None:
        for source in self.__outputs:
            source.set_participant(participant)

    async def capture_text(self, text: str) -> None:
        await asyncio.gather(*[sink.capture_text(text) for sink in self.__outputs])

        if self.next_in_chain:
            await self.next_in_chain.capture_text(text)

    def flush(self) -> None:
        for source in self.__outputs:
            source.flush()

        if self.next_in_chain:
            self.next_in_chain.flush()

Helper class that provides a standard way to create an ABC using inheritance.

Ancestors

TextOutput
abc.ABC

Methods

def set_participant(self, participant: rtc.Participant | str | None) ‑> None

Expand source code

def set_participant(self, participant: rtc.Participant | str | None) -> None:
    for source in self.__outputs:
        source.set_participant(participant)

Inherited members

TextOutput:
- capture_text
- flush