Overview
LiveKit Agents supports images as both input and output. On the input side, you can add images to your agent's chat context, receive images from the frontend, or sample video frames. On the output side, you can send images to the frontend using byte streams.
Image input
The agent's chat context supports images as well as text. You can add as many images as you want to the chat context, but keep in mind that a larger context window increases response times.
To add an image to the chat context, create an ImageContent object and include it in a chat message. The image content can be a base64 data URL, an external URL, or a frame from a video track.
Load into initial context
The following example shows an agent initialized with an image at startup. This example uses an external URL, but you can modify it to load a local file using a base64 data URL instead:
from livekit.agents.llm import ImageContent
from livekit.agents import Agent, AgentSession, ChatContext, JobContext


# Entrypoint must be async: the body awaits session.start(). The original
# snippet declared a plain `def`, which is a SyntaxError with `await` inside.
async def entrypoint(ctx: JobContext):
    # ctx.connect, etc.
    session = AgentSession(
        # ... stt, tts, llm, etc.
    )

    # Seed the chat context with a user message containing both text and an
    # image (here an external URL; a base64 data URL works as well).
    initial_ctx = ChatContext()
    initial_ctx.add_message(
        role="user",
        content=[
            "Here is a picture of me",
            ImageContent(image="https://example.com/image.jpg"),
        ],
    )

    await session.start(
        room=ctx.room,
        agent=Agent(
            chat_ctx=initial_ctx,
        ),
        # ... room_options, etc.
    )
export default defineAgent({entry: async (ctx: JobContext) => {// await ctx.connect(), etcconst initialCtx = llm.ChatContext.empty();initialCtx.addMessage({role: 'user',content: ['Here is a picture of me',llm.createImageContent({image: 'https://example.com/image.jpg',}),],});const agent = new voice.Agent({instructions: 'You are a helpful voice AI assistant.',chatCtx: initialCtx,});const session = new voice.AgentSession({// ... stt, tts, llm, etc.});await session.start({room: ctx.room,agent,// ... inputOptions, etc.});},});
import { type JobContext, defineAgent, llm, voice } from '@livekit/agents';
Not every provider supports external image URLs. Consult their documentation for details.
Upload from frontend
To upload an image from your frontend app, use the sendFile method of the LiveKit SDK. Add a byte stream handler to your agent to receive the image data and add it to the chat context. Here is a simple agent capable of receiving images from the user on the byte stream topic "images":
import asyncio
import base64

from livekit.agents import Agent, get_job_context
from livekit.agents.llm import ImageContent


class Assistant(Agent):
    """Agent that receives images from the frontend over the "images" byte
    stream topic and appends them to its chat context."""

    def __init__(self) -> None:
        self._tasks = []  # Prevent garbage collection of running tasks
        super().__init__(instructions="You are a helpful voice AI assistant.")

    async def on_enter(self):
        def _image_received_handler(reader, participant_identity):
            # Run the async receive in its own task; keep a strong reference
            # until it finishes so it is not garbage-collected mid-flight.
            task = asyncio.create_task(
                self._image_received(reader, participant_identity)
            )
            self._tasks.append(task)
            task.add_done_callback(lambda t: self._tasks.remove(t))

        # Add the handler when the agent joins
        get_job_context().room.register_byte_stream_handler(
            "images", _image_received_handler
        )

    async def _image_received(self, reader, participant_identity):
        # Accumulate the full image payload from the stream chunks.
        image_bytes = bytes()
        async for chunk in reader:
            image_bytes += chunk

        chat_ctx = self.chat_ctx.copy()
        # Encode the image to base64 and add it to the chat context
        chat_ctx.add_message(
            role="user",
            content=[
                ImageContent(
                    image=f"data:image/png;base64,{base64.b64encode(image_bytes).decode('utf-8')}"
                )
            ],
        )
        await self.update_chat_ctx(chat_ctx)
class Assistant extends voice.Agent {private tasks: Set<Task<void>> = new Set(); // Prevent garbage collection of running tasksconstructor() {super({instructions: 'You are a helpful voice AI assistant.',});}async onEnter(): Promise<void> {// Register byte stream handler for receiving imagesgetJobContext().room.registerByteStreamHandler('images', async (stream: ByteStreamReader) => {const task = Task.from((controller) => this.imageReceived(stream, controller));this.tasks.add(task);task.result.finally(() => {this.tasks.delete(task);});});}private async imageReceived(stream: ByteStreamReader,controller: AbortController,): Promise<void> {const chunks: Uint8Array[] = [];// Read all chunks from the streamfor await (const chunk of stream) {if (controller.signal.aborted) return;chunks.push(chunk);}// Combine all chunks into a single bufferconst totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0);const imageBytes = new Uint8Array(totalLength);let offset = 0;for (const chunk of chunks) {imageBytes.set(chunk, offset);offset += chunk.length;}const chatCtx = this.chatCtx.copy();// Encode the image to base64 and add it to the chat contextconst imageContent = llm.createImageContent({image: `data:image/png;base64,${Buffer.from(imageBytes).toString('base64')}`,inferenceDetail: 'auto',});chatCtx.addMessage({role: 'user',content: [imageContent],});if (controller.signal.aborted) return;await this.updateChatCtx(chatCtx);}}
import { Task, getJobContext, llm, voice } from '@livekit/agents';import type { ByteStreamReader } from '@livekit/rtc-node';
Inference detail
If your LLM provider supports it, you can set the inference_detail parameter to "high" or "low" to trade off inference quality against token usage. The default is "auto", which uses the provider's default.
Image output
Your agent can send images to the frontend using byte streams. Use this to share generated images, diagrams, screenshots, or any other visual content from your agent to the user.
To send an image, use the send_file method on the room's local participant. The frontend receives the image by registering a byte stream handler for the same topic.
Send an image from your agent
from livekit.agents import Agent, get_job_context


class Assistant(Agent):
    """Agent that pushes an image file to the frontend on join."""

    def __init__(self) -> None:
        super().__init__(instructions="You are a helpful voice AI assistant.")

    async def on_enter(self):
        room = get_job_context().room
        # Send an image file to the frontend
        await room.local_participant.send_file(
            file_path="path/to/image.png",
            topic="agent-images",
        )
class Assistant extends voice.Agent {constructor() {super({instructions: 'You are a helpful voice AI assistant.',});}async onEnter(): Promise<void> {const room = getJobContext().room;// Send an image file to the frontendawait room.localParticipant!.sendFile('path/to/image.png', {topic: 'agent-images',});}}
import { getJobContext, voice } from '@livekit/agents';
Receive images in your frontend
Register a byte stream handler in your frontend to receive images from the agent:
room.registerByteStreamHandler('agent-images', async (reader, participantInfo) => {
  // Collect every chunk of the incoming image and wrap it in a Blob.
  const data = await reader.readAll();
  const blob = new Blob(data, { type: reader.info.mimeType });
  const url = URL.createObjectURL(blob);

  // Display the image in your UI
  const img = document.createElement('img');
  img.src = url;
  document.body.appendChild(img);
});
try await room.registerByteStreamHandler(for: "agent-images") { reader, participantIdentity in
    let data = try await reader.readAll()

    // Display the image in your UI
    DispatchQueue.main.async {
        let image = UIImage(data: data)
        let imageView = UIImageView(image: image)
        self.view.addSubview(imageView)
    }
}
room.registerByteStreamHandler("agent-images") { reader, participantIdentity ->
    myCoroutineScope.launch {
        // Read every chunk and concatenate into a single byte array.
        val chunks = reader.readAll()
        val bytes = chunks.fold(ByteArray(0)) { acc, chunk -> acc + chunk }

        // Display the image in your UI
        withContext(Dispatchers.Main) {
            val bitmap = BitmapFactory.decodeByteArray(bytes, 0, bytes.size)
            imageView.setImageBitmap(bitmap)
        }
    }
}
room.registerByteStreamHandler('agent-images',
    (ByteStreamReader reader, String participantIdentity) async {
  // Read every chunk and flatten into a single byte list.
  final chunks = await reader.readAll();
  final bytes = chunks.expand((chunk) => chunk).toList();

  // Display the image in your UI
  setState(() {
    imageBytes = Uint8List.fromList(bytes);
  });
});
For full details on byte streams, see Sending files & bytes.