This example shows how to capture token and latency metrics emitted by the LLM pipeline and print them as a Rich table whenever the agent responds. It's a quick way to see prompt/response token counts and time-to-first-token during a live call.
Prerequisites
- Add a `.env` in this directory with your LiveKit and OpenAI credentials:

  ```
  LIVEKIT_URL=your_livekit_url
  LIVEKIT_API_KEY=your_api_key
  LIVEKIT_API_SECRET=your_api_secret
  OPENAI_API_KEY=your_openai_key
  ```

- Install dependencies:

  ```bash
  pip install python-dotenv rich "livekit-agents[silero]"
  ```
Load configuration and logging
Set up dotenv, a logger, a Rich console for the metrics table, and the `AgentServer` instance.
```python
import logging
import asyncio
from dotenv import load_dotenv
from livekit.agents import JobContext, JobProcess, Agent, AgentSession, inference, AgentServer, cli
from livekit.agents.metrics import LLMMetrics
from livekit.plugins import silero
from rich.console import Console
from rich.table import Table
from rich import box
from datetime import datetime

load_dotenv()

logger = logging.getLogger("metrics-llm")
logger.setLevel(logging.INFO)

console = Console()
server = AgentServer()
```
Prewarm VAD for faster connections
Preload the Silero VAD model once per process and register the loader as the server's setup function to reduce connection latency.
```python
def prewarm(proc: JobProcess):
    proc.userdata["vad"] = silero.VAD.load()


server.setup_fnc = prewarm
```
Create the metrics-enabled agent
Keep the agent lightweight with just instructions. In `on_enter`, attach an `on("metrics_collected")` listener to the session's LLM so every response triggers your metrics handler.
```python
class LLMMetricsAgent(Agent):
    def __init__(self) -> None:
        super().__init__(instructions="""You are a helpful agent.""")

    async def on_enter(self):
        # The event listener is synchronous, so schedule the async handler as a task
        def sync_wrapper(metrics: LLMMetrics):
            asyncio.create_task(self.on_metrics_collected(metrics))

        self.session.llm.on("metrics_collected", sync_wrapper)
        self.session.generate_reply()
```
Render metrics with Rich
When metrics arrive, format them into a table with timestamps, TTFT, durations, and token counts.
```python
async def on_metrics_collected(self, metrics: LLMMetrics) -> None:
    table = Table(
        title="[bold blue]LLM Metrics Report[/bold blue]",
        box=box.ROUNDED,
        highlight=True,
        show_header=True,
        header_style="bold cyan",
    )
    table.add_column("Metric", style="bold green")
    table.add_column("Value", style="yellow")

    timestamp = datetime.fromtimestamp(metrics.timestamp).strftime('%Y-%m-%d %H:%M:%S')
    table.add_row("Type", str(metrics.type))
    table.add_row("Label", str(metrics.label))
    table.add_row("Request ID", str(metrics.request_id))
    table.add_row("Timestamp", timestamp)
    table.add_row("Duration", f"[white]{metrics.duration:.4f}[/white]s")
    table.add_row("Time to First Token", f"[white]{metrics.ttft:.4f}[/white]s")
    table.add_row("Cancelled", "✓" if metrics.cancelled else "✗")
    table.add_row("Completion Tokens", str(metrics.completion_tokens))
    table.add_row("Prompt Tokens", str(metrics.prompt_tokens))
    table.add_row("Total Tokens", str(metrics.total_tokens))
    table.add_row("Tokens/Second", f"{metrics.tokens_per_second:.2f}")

    console.print("\n")
    console.print(table)
    console.print("\n")
```
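If you would rather keep the numbers in your regular logs than render a table, the same handler can emit a single structured log line instead. This is a minimal sketch of an alternative handler (not part of the example above), using only the logger and `LLMMetrics` fields already shown:

```python
async def on_metrics_collected(self, metrics: LLMMetrics) -> None:
    # Plain-logging variant of the Rich table handler above
    logger.info(
        "LLM metrics: request_id=%s ttft=%.4fs duration=%.4fs "
        "prompt_tokens=%s completion_tokens=%s tokens/s=%.2f",
        metrics.request_id,
        metrics.ttft,
        metrics.duration,
        metrics.prompt_tokens,
        metrics.completion_tokens,
        metrics.tokens_per_second,
    )
```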
Set up the session
Configure the `AgentSession` with STT, LLM, TTS, and the prewarmed VAD. The LLM's metrics events will be captured by the listener attached in `on_enter`.
```python
@server.rtc_session()
async def entrypoint(ctx: JobContext):
    ctx.log_context_fields = {"room": ctx.room.name}

    session = AgentSession(
        stt=inference.STT(model="deepgram/nova-3-general"),
        llm=inference.LLM(model="openai/gpt-4.1-mini"),
        tts=inference.TTS(model="cartesia/sonic-3", voice="9626c31c-bec5-4cca-baa8-f8ba9e84c8bc"),
        vad=ctx.proc.userdata["vad"],
        preemptive_generation=True,
    )

    agent = LLMMetricsAgent()

    await session.start(agent=agent, room=ctx.room)
    await ctx.connect()
```
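The listener attached in `on_enter` only sees LLM metrics. If you also want STT/TTS metrics or an end-of-session usage summary, livekit-agents exposes a session-level `metrics_collected` event along with a `UsageCollector` helper. The sketch below assumes that session-level API (`MetricsCollectedEvent`, `metrics.UsageCollector`, `metrics.log_metrics`, `ctx.add_shutdown_callback`) and would sit inside `entrypoint` after the session is created:

```python
from livekit.agents import metrics, MetricsCollectedEvent

# Sketch: session-wide metrics, assuming the session-level event API
usage_collector = metrics.UsageCollector()

@session.on("metrics_collected")
def _on_metrics_collected(ev: MetricsCollectedEvent):
    metrics.log_metrics(ev.metrics)      # structured log output for every component
    usage_collector.collect(ev.metrics)  # accumulate token usage across the session

async def log_usage():
    logger.info("Session usage: %s", usage_collector.get_summary())

ctx.add_shutdown_callback(log_usage)
```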
Run the server
Start the agent server with the CLI.
```python
if __name__ == "__main__":
    cli.run_app(server)
```
Run it
```bash
python metrics_llm.py console
```
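Console mode runs the agent in a local terminal session. To connect the worker to your LiveKit project instead, the agents CLI also accepts other modes (assuming the standard `dev`/`start` subcommands):

```bash
python metrics_llm.py dev
```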
How it works
- The agent runs with standard STT/LLM/TTS and Silero VAD.
- The LLM emits `metrics_collected` after each generation.
- A sync wrapper in `on_enter` schedules `on_metrics_collected` as a task so you can await inside it.
- Rich renders the metrics in a readable table showing latency and token stats.
Full example
```python
import logging
import asyncio
from dotenv import load_dotenv
from livekit.agents import JobContext, JobProcess, Agent, AgentSession, inference, AgentServer, cli
from livekit.agents.metrics import LLMMetrics
from livekit.plugins import silero
from rich.console import Console
from rich.table import Table
from rich import box
from datetime import datetime

load_dotenv()

logger = logging.getLogger("metrics-llm")
logger.setLevel(logging.INFO)

console = Console()


class LLMMetricsAgent(Agent):
    def __init__(self) -> None:
        super().__init__(instructions="""You are a helpful agent.""")

    async def on_enter(self):
        # The event listener is synchronous, so schedule the async handler as a task
        def sync_wrapper(metrics: LLMMetrics):
            asyncio.create_task(self.on_metrics_collected(metrics))

        self.session.llm.on("metrics_collected", sync_wrapper)
        self.session.generate_reply()

    async def on_metrics_collected(self, metrics: LLMMetrics) -> None:
        table = Table(
            title="[bold blue]LLM Metrics Report[/bold blue]",
            box=box.ROUNDED,
            highlight=True,
            show_header=True,
            header_style="bold cyan",
        )
        table.add_column("Metric", style="bold green")
        table.add_column("Value", style="yellow")

        timestamp = datetime.fromtimestamp(metrics.timestamp).strftime('%Y-%m-%d %H:%M:%S')
        table.add_row("Type", str(metrics.type))
        table.add_row("Label", str(metrics.label))
        table.add_row("Request ID", str(metrics.request_id))
        table.add_row("Timestamp", timestamp)
        table.add_row("Duration", f"[white]{metrics.duration:.4f}[/white]s")
        table.add_row("Time to First Token", f"[white]{metrics.ttft:.4f}[/white]s")
        table.add_row("Cancelled", "✓" if metrics.cancelled else "✗")
        table.add_row("Completion Tokens", str(metrics.completion_tokens))
        table.add_row("Prompt Tokens", str(metrics.prompt_tokens))
        table.add_row("Total Tokens", str(metrics.total_tokens))
        table.add_row("Tokens/Second", f"{metrics.tokens_per_second:.2f}")

        console.print("\n")
        console.print(table)
        console.print("\n")


server = AgentServer()


def prewarm(proc: JobProcess):
    proc.userdata["vad"] = silero.VAD.load()


server.setup_fnc = prewarm


@server.rtc_session()
async def entrypoint(ctx: JobContext):
    ctx.log_context_fields = {"room": ctx.room.name}

    session = AgentSession(
        stt=inference.STT(model="deepgram/nova-3-general"),
        llm=inference.LLM(model="openai/gpt-4.1-mini"),
        tts=inference.TTS(model="cartesia/sonic-3", voice="9626c31c-bec5-4cca-baa8-f8ba9e84c8bc"),
        vad=ctx.proc.userdata["vad"],
        preemptive_generation=True,
    )

    agent = LLMMetricsAgent()

    await session.start(agent=agent, room=ctx.room)
    await ctx.connect()


if __name__ == "__main__":
    cli.run_app(server)
```