Module `livekit.agents.beta`

Sub-modules

livekit.agents.beta.tools
livekit.agents.beta.toolsets
livekit.agents.beta.workflows

Classes

class EndCallTool (*, extra_description: str = '', delete_room: bool = True, end_instructions: str | None = 'say goodbye to the user', on_tool_called: collections.abc.Callable[[livekit.agents.llm.tool_context.Toolset.ToolCalledEvent], collections.abc.Awaitable[None]] | None = None, on_tool_completed: collections.abc.Callable[[livekit.agents.llm.tool_context.Toolset.ToolCompletedEvent], collections.abc.Awaitable[None]] | None = None)

Expand source code

class EndCallTool(Toolset):
    def __init__(
        self,
        *,
        extra_description: str = "",
        delete_room: bool = True,
        end_instructions: str | None = "say goodbye to the user",
        on_tool_called: Callable[[Toolset.ToolCalledEvent], Awaitable[None]] | None = None,
        on_tool_completed: Callable[[Toolset.ToolCompletedEvent], Awaitable[None]] | None = None,
    ):
        """
        Toolset exposing a single ``end_call`` tool that lets the agent end the
        call and disconnect from the room.

        Args:
            extra_description: Text appended (on a new line) to the built-in
                description of the end call tool.
            delete_room: Whether to delete the room once the session has closed.
                Deleting the room disconnects all remote users, including SIP callers.
            end_instructions: Tool output handed back to the LLM so it can
                generate the tool response.
            on_tool_called: Awaited when the tool is invoked, before the tool
                output is produced.
            on_tool_completed: Awaited with the completed event before its
                ``output`` is returned to the LLM.
        """
        tool_description = f"{END_CALL_DESCRIPTION}\n{extra_description}"
        super().__init__(
            id="end_call",
            tools=[
                function_tool(
                    self._end_call,
                    name="end_call",
                    description=tool_description,
                )
            ],
        )

        self._extra_description = extra_description
        self._delete_room = delete_room
        self._end_instructions = end_instructions

        self._on_tool_called = on_tool_called
        self._on_tool_completed = on_tool_completed

        # Only populated on the delayed-shutdown path (see _end_call).
        self._shutdown_session_task: asyncio.Task[None] | None = None

    async def _end_call(self, ctx: RunContext) -> Any | None:
        # Implementation of the ``end_call`` function tool: arranges for the
        # session to shut down once the current speech is done and returns the
        # (possibly callback-adjusted) tool output.
        # NOTE(review): documented with comments instead of a docstring in case
        # function_tool inspects __doc__ - confirm before converting.
        logger.debug("end_call tool called")
        activity = ctx.session.current_agent._get_activity_or_raise()
        llm_v = activity.llm

        def _on_speech_done(_: SpeechHandle) -> None:
            replies_on_its_own = (
                isinstance(llm_v, RealtimeModel)
                and llm_v.capabilities.auto_tool_reply_generation
            )
            if replies_on_its_own:
                # the tool reply is expected on a separate speech handle, so wait
                # for it (with a timeout) before shutting the session down
                self._shutdown_session_task = asyncio.create_task(
                    self._delayed_session_shutdown(ctx)
                )
                return

            # tool reply will reuse the same speech handle, so we can shutdown the session
            # directly after this speech handle is done
            ctx.session.shutdown()

        # Register both hooks before awaiting any user callback.
        ctx.speech_handle.add_done_callback(_on_speech_done)
        ctx.session.once("close", self._on_session_close)

        if self._on_tool_called:
            called_ev = Toolset.ToolCalledEvent(ctx=ctx, arguments={})
            await self._on_tool_called(called_ev)

        completed_ev = Toolset.ToolCompletedEvent(ctx=ctx, output=self._end_instructions)
        if self._on_tool_completed:
            await self._on_tool_completed(completed_ev)

        # Read the output off the event so the value returned is whatever the
        # event carries after the completion callback has run.
        return completed_ev.output

    async def _delayed_session_shutdown(self, ctx: RunContext) -> None:
        """Shut the session down once the tool reply has been played out.

        Waits up to five seconds for the next ``speech_created`` event, then
        awaits that speech handle. The session is shut down in every case,
        including timeout.
        """
        reply_handle_fut = asyncio.Future[SpeechHandle]()

        @ctx.session.once("speech_created")
        def _on_speech_created(ev: SpeechCreatedEvent) -> None:
            if reply_handle_fut.done():
                return
            reply_handle_fut.set_result(ev.speech_handle)

        try:
            reply_handle = await asyncio.wait_for(reply_handle_fut, timeout=5.0)
            await reply_handle
        except asyncio.TimeoutError:
            logger.warning("tool reply timed out, shutting down session")
        finally:
            ctx.session.off("speech_created", _on_speech_created)
            ctx.session.shutdown()

    def _on_session_close(self, ev: CloseEvent) -> None:
        """Shut down the job process once the AgentSession has closed.

        Cancels a still-pending delayed-shutdown task, optionally registers a
        shutdown callback that deletes the room, and then shuts the job down
        with the session's close reason.
        """
        pending_task = self._shutdown_session_task
        if pending_task:
            # cleanup
            pending_task.cancel()
            self._shutdown_session_task = None

        job_ctx = get_job_context()

        if self._delete_room:

            async def _delete_room_on_shutdown() -> None:
                logger.info("deleting the room because the user ended the call")
                await job_ctx.delete_room()

            job_ctx.add_shutdown_callback(_delete_room_on_shutdown)

        # shutdown the job process
        job_ctx.shutdown(reason=ev.reason.value)

This tool allows the agent to end the call and disconnect from the room.

Args

extra_description: Additional description to add to the end call tool.
delete_room: Whether to delete the room when the user ends the call. Deleting the room disconnects all remote users, including SIP callers.
end_instructions: Tool output to the LLM for generating the tool response.
on_tool_called: Callback to call when the tool is called.
on_tool_completed: Callback to call when the tool is completed.

Ancestors

livekit.agents.llm.tool_context.Toolset

class Instructions (audio: str, *, text: str | None = None)

Expand source code

class Instructions(str):
    """Instructions that adapt based on the user's input modality (audio vs. text).

    ``str(self)`` is what providers see when treating this as a plain string.
    By default it equals the ``audio`` variant; after :meth:`as_modality` it
    equals the chosen variant.

    ``_audio_variant`` and ``_text_variant`` are always preserved so
    :meth:`as_modality` can be called again for a different modality (e.g.,
    when the same ``ChatContext`` is reused across tool-call turns).
    """

    _audio_variant: str
    _text_variant: str | None

    def __new__(
        cls, audio: str, *, text: str | None = None, _represent: str | None = None
    ) -> Instructions:
        """Create an Instructions object.

        Args:
            audio: The audio (voice) variant.
            text: The text variant.  Falls back to ``audio`` when omitted.
            _represent: Internal. The value ``str(self)`` evaluates to; the
                ``audio`` variant is used when this is ``None``.
        """
        instance = super().__new__(cls, _represent if _represent is not None else audio)
        instance._audio_variant = audio
        instance._text_variant = text
        return instance

    @property
    def audio(self) -> str:
        """The audio (voice) variant of the instructions.

        This is the stored variant, independent of which variant ``str(self)``
        currently evaluates to.
        """
        return self._audio_variant

    @property
    def text(self) -> str:
        """The text variant of the instructions.

        Falls back to the audio variant when no text variant was provided.
        """
        if self._text_variant is None:
            return self._audio_variant
        return self._text_variant

    def format(self, *args: object, **kwargs: object) -> Instructions:
        """Format every variant with the given positional and keyword arguments.

        An argument that is itself an ``Instructions`` contributes its audio
        variant to the audio result and its text variant to the text result.
        The ``str`` value of the result is the current ``str`` value formatted
        with the arguments exactly as passed.

        The result carries no explicit text variant only when this object has
        none and no argument is an ``Instructions``.
        """
        has_variant_args = any(
            isinstance(value, Instructions) for value in (*args, *kwargs.values())
        )

        def resolve(value: object, modality: str) -> object:
            # Plain values pass through; Instructions yield the requested variant.
            return getattr(value, modality) if isinstance(value, Instructions) else value

        audio_result = self.audio.format(
            *(resolve(arg, "audio") for arg in args),
            **{key: resolve(value, "audio") for key, value in kwargs.items()},
        )

        text_result: str | None = None
        if has_variant_args or self._text_variant is not None:
            text_result = self.text.format(
                *(resolve(arg, "text") for arg in args),
                **{key: resolve(value, "text") for key, value in kwargs.items()},
            )

        return Instructions(
            audio=audio_result,
            text=text_result,
            _represent=str(self).format(*args, **kwargs),
        )

    def as_modality(self, modality: Literal["audio", "text"]) -> Instructions:
        """Return a copy whose ``str`` value is the correct variant for *modality*.

        Both ``_audio_variant`` and ``_text_variant`` are carried over unchanged,
        so the copy can be switched to the other modality later (e.g. across
        tool-call turns).
        """
        shown = self.audio if modality == "audio" else self.text
        return Instructions(
            audio=self._audio_variant,
            text=self._text_variant,
            _represent=shown,
        )

    def __add__(self, other: object) -> Instructions:
        """Concatenate, propagating both variants and the current str value.

        Returns ``NotImplemented`` for operands that are not strings so Python
        can try the other operand's ``__radd__``; if nothing handles the
        operation the interpreter raises ``TypeError``.
        """
        # Instructions is checked first: it is a str subclass and must not be
        # treated as a plain string.
        if isinstance(other, Instructions):
            has_text = self._text_variant is not None or other._text_variant is not None
            return Instructions(
                audio=self.audio + other.audio,
                text=(self.text + other.text) if has_text else None,
                _represent=str(self) + str(other),
            )
        if isinstance(other, str):
            return Instructions(
                audio=self.audio + other,
                text=(self._text_variant + other) if self._text_variant is not None else None,
                _represent=str(self) + other,
            )
        return NotImplemented

    def __radd__(self, other: object) -> Instructions:
        """Support ``plain_str + Instructions``, propagating both variants.

        Returns ``NotImplemented`` for non-string left operands, which makes
        the interpreter raise ``TypeError``.
        """
        if isinstance(other, str):
            return Instructions(
                audio=other + self.audio,
                text=(other + self._text_variant) if self._text_variant is not None else None,
                _represent=other + str(self),
            )
        return NotImplemented

    def __repr__(self) -> str:
        return f"Instructions({str(self)!r})"

    @classmethod
    def __get_pydantic_core_schema__(cls, source_type: Any, handler: Any) -> Any:
        """Build the pydantic-core schema used to validate and serialize instances.

        Instances serialize to ``{"type": "instructions", "audio": ...}`` plus a
        ``"text"`` key when a text variant was set. Validation accepts that
        dict shape, and in Python mode also an existing ``Instructions``.
        The current ``str`` value is not part of the serialized form.
        """
        # Imported lazily so pydantic_core is only needed when a schema is requested.
        from pydantic_core import core_schema

        def validate_python(v: Any) -> Instructions:
            if isinstance(v, Instructions):
                return v
            if isinstance(v, dict) and v.get("type") == "instructions":
                return cls(v["audio"], text=v.get("text"))
            raise ValueError(f"Cannot convert {type(v)!r} to Instructions")

        def validate_json(v: Any) -> Instructions:
            if isinstance(v, dict) and v.get("type") == "instructions":
                return cls(v["audio"], text=v.get("text"))
            raise ValueError(f"Cannot convert {type(v)!r} to Instructions")

        def serialize(v: Instructions) -> dict[str, Any]:
            d: dict[str, Any] = {"type": "instructions", "audio": v.audio}
            if v._text_variant is not None:
                d["text"] = v._text_variant
            return d

        return core_schema.json_or_python_schema(
            python_schema=core_schema.no_info_plain_validator_function(validate_python),
            json_schema=core_schema.no_info_plain_validator_function(validate_json),
            serialization=core_schema.plain_serializer_function_ser_schema(
                serialize, info_arg=False
            ),
        )

Instructions that adapt based on the user's input modality (audio vs. text).

str(self) is what providers see when treating this as a plain string. By default it equals the audio variant; after as_modality() it equals the chosen variant.

_audio_variant and _text_variant are always preserved so as_modality() can be called again for a different modality (e.g., when the same ChatContext is reused across tool-call turns).

Ancestors

builtins.str

Instance variables

prop audio : str

Expand source code

@property
def audio(self) -> str:
    """The audio (voice) variant of the instructions.

    Returns the stored ``_audio_variant`` attribute unchanged.
    """
    return self._audio_variant

The audio (voice) variant of the instructions.

prop text : str

Expand source code

@property
def text(self) -> str:
    """The text variant of the instructions.

    Falls back to the audio variant when no text variant was provided.
    """
    if self._text_variant is None:
        return self._audio_variant
    return self._text_variant

The text variant of the instructions.

Falls back to the audio variant when no text variant was provided.

Methods

def as_modality(self, modality: "Literal['audio', 'text']") ‑> livekit.agents.llm.chat_context.Instructions

Expand source code

def as_modality(self, modality: Literal["audio", "text"]) -> Instructions:
    """Return a copy whose ``str`` value is the correct variant for *modality*.

    Both ``_audio_variant`` and ``_text_variant`` are carried over unchanged,
    so the copy can be switched to the other modality later (e.g. across
    tool-call turns).
    """
    shown = self.audio if modality == "audio" else self.text
    return Instructions(
        audio=self._audio_variant,
        text=self._text_variant,
        _represent=shown,
    )

Return a copy whose str value is the correct variant for modality.

Both _audio_variant and _text_variant are preserved so this can be called again for a different modality (e.g. across tool-call turns).

def format(self, *args: object, **kwargs: object) ‑> livekit.agents.llm.chat_context.Instructions

Expand source code

def format(self, *args: object, **kwargs: object) -> Instructions:
    """Format every variant with the given positional and keyword arguments.

    An argument that is itself an ``Instructions`` contributes its audio
    variant to the audio result and its text variant to the text result.
    The ``str`` value of the result is the current ``str`` value formatted
    with the arguments exactly as passed.

    The result carries no explicit text variant only when this object has
    none and no argument is an ``Instructions``.
    """
    has_variant_args = any(
        isinstance(value, Instructions) for value in (*args, *kwargs.values())
    )

    def resolve(value: object, modality: str) -> object:
        # Plain values pass through; Instructions yield the requested variant.
        return getattr(value, modality) if isinstance(value, Instructions) else value

    audio_result = self.audio.format(
        *(resolve(arg, "audio") for arg in args),
        **{key: resolve(value, "audio") for key, value in kwargs.items()},
    )

    text_result: str | None = None
    if has_variant_args or self._text_variant is not None:
        text_result = self.text.format(
            *(resolve(arg, "text") for arg in args),
            **{key: resolve(value, "text") for key, value in kwargs.items()},
        )

    return Instructions(
        audio=audio_result,
        text=text_result,
        _represent=str(self).format(*args, **kwargs),
    )

Format the instructions with the given positional and keyword arguments.