Skip to main content

Images

Add images to your agent's context, receive images from the frontend, and send images back to users.

Overview

LiveKit Agents supports images as both input and output. On the input side, you can add images to your agent's chat context, receive images from the frontend, or sample video frames. On the output side, you can send images to the frontend using byte streams.

Image input

The agent's chat context supports images as well as text. You can add as many images as you want to the chat context, but keep in mind that larger context windows increase LLM response times.

To add an image to the chat context, create an ImageContent object and include it in a chat message. The image content can be a base64 data URL, an external URL, or a frame from a video track.

Load into initial context

The following example shows an agent initialized with an image at startup. This example uses an external URL, but you can modify it to load a local file using a base64 data URL instead:

from livekit.agents import Agent, AgentSession, ChatContext, JobContext
from livekit.agents.llm import ImageContent


# Entrypoint must be async: it awaits session.start (and typically ctx.connect).
async def entrypoint(ctx: JobContext):
    # ctx.connect, etc.
    session = AgentSession(
        # ... stt, tts, llm, etc.
    )

    # Seed the chat context with one user message that mixes text and an image.
    initial_ctx = ChatContext()
    initial_ctx.add_message(
        role="user",
        content=[
            "Here is a picture of me",
            ImageContent(image="https://example.com/image.jpg"),
        ],
    )

    await session.start(
        room=ctx.room,
        agent=Agent(chat_ctx=initial_ctx),
        # ... room_options, etc.
    )
import { type JobContext, defineAgent, llm, voice } from '@livekit/agents';

export default defineAgent({
  entry: async (ctx: JobContext) => {
    // await ctx.connect(), etc.

    // Seed the chat context with one user message that mixes text and an image.
    const initialCtx = llm.ChatContext.empty();
    initialCtx.addMessage({
      role: 'user',
      content: [
        'Here is a picture of me',
        llm.createImageContent({
          image: 'https://example.com/image.jpg',
        }),
      ],
    });

    const agent = new voice.Agent({
      instructions: 'You are a helpful voice AI assistant.',
      chatCtx: initialCtx,
    });
    const session = new voice.AgentSession({
      // ... stt, tts, llm, etc.
    });

    await session.start({
      room: ctx.room,
      agent,
      // ... inputOptions, etc.
    });
  },
});
LLM provider support for external URLs

Not every LLM provider supports external image URLs. Consult your provider's documentation for details.

Upload from frontend

To upload an image from your frontend app, use the sendFile method of the LiveKit SDK. Add a byte stream handler to your agent to receive the image data and add it to the chat context. Here is a simple agent capable of receiving images from the user on the byte stream topic "images":

import asyncio
import base64

from livekit.agents import Agent, get_job_context
from livekit.agents.llm import ImageContent


class Assistant(Agent):
    """Agent that receives images from the frontend on the "images" byte stream topic."""

    def __init__(self) -> None:
        # Hold strong references so in-flight tasks aren't garbage collected.
        self._tasks = []
        super().__init__(instructions="You are a helpful voice AI assistant.")

    async def on_enter(self):
        def _image_received_handler(reader, participant_identity):
            task = asyncio.create_task(
                self._image_received(reader, participant_identity)
            )
            self._tasks.append(task)
            task.add_done_callback(lambda t: self._tasks.remove(t))

        # Add the handler when the agent joins
        get_job_context().room.register_byte_stream_handler(
            "images", _image_received_handler
        )

    async def _image_received(self, reader, participant_identity):
        # Accumulate the full image payload from the stream.
        image_bytes = bytes()
        async for chunk in reader:
            image_bytes += chunk

        chat_ctx = self.chat_ctx.copy()
        # Encode the image to base64 and add it to the chat context
        chat_ctx.add_message(
            role="user",
            content=[
                ImageContent(
                    image=f"data:image/png;base64,{base64.b64encode(image_bytes).decode('utf-8')}"
                )
            ],
        )
        await self.update_chat_ctx(chat_ctx)
import { Task, getJobContext, llm, voice } from '@livekit/agents';
import type { ByteStreamReader } from '@livekit/rtc-node';

class Assistant extends voice.Agent {
  // Hold strong references so in-flight tasks aren't garbage collected.
  private tasks: Set<Task<void>> = new Set();

  constructor() {
    super({
      instructions: 'You are a helpful voice AI assistant.',
    });
  }

  async onEnter(): Promise<void> {
    // Register byte stream handler for receiving images
    getJobContext().room.registerByteStreamHandler('images', async (stream: ByteStreamReader) => {
      const task = Task.from((controller) => this.imageReceived(stream, controller));
      this.tasks.add(task);
      task.result.finally(() => {
        this.tasks.delete(task);
      });
    });
  }

  private async imageReceived(
    stream: ByteStreamReader,
    controller: AbortController,
  ): Promise<void> {
    const chunks: Uint8Array[] = [];
    // Read all chunks from the stream
    for await (const chunk of stream) {
      if (controller.signal.aborted) return;
      chunks.push(chunk);
    }

    // Combine all chunks into a single buffer
    const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
    const imageBytes = new Uint8Array(totalLength);
    let offset = 0;
    for (const chunk of chunks) {
      imageBytes.set(chunk, offset);
      offset += chunk.length;
    }

    const chatCtx = this.chatCtx.copy();
    // Encode the image to base64 and add it to the chat context
    const imageContent = llm.createImageContent({
      image: `data:image/png;base64,${Buffer.from(imageBytes).toString('base64')}`,
      inferenceDetail: 'auto',
    });
    chatCtx.addMessage({
      role: 'user',
      content: [imageContent],
    });

    // Skip the context update if the task was cancelled mid-stream.
    if (controller.signal.aborted) return;
    await this.updateChatCtx(chatCtx);
  }
}

Inference detail

If your LLM provider supports it, you can set the inference_detail parameter to "high" or "low" to control the tradeoff between inference quality and token usage. The default is "auto", which uses the provider's default.

Image output

Your agent can send images to the frontend using byte streams. Use this to share generated images, diagrams, screenshots, or any other visual content from your agent to the user.

To send an image, use the send_file method on the room's local participant. The frontend receives the image by registering a byte stream handler for the same topic.

Send an image from your agent

from livekit.agents import Agent, get_job_context


class Assistant(Agent):
    """Agent that sends an image to the frontend when it joins the room."""

    def __init__(self) -> None:
        super().__init__(instructions="You are a helpful voice AI assistant.")

    async def on_enter(self):
        room = get_job_context().room
        # Send an image file to the frontend
        await room.local_participant.send_file(
            file_path="path/to/image.png",
            topic="agent-images",
        )
import { getJobContext, voice } from '@livekit/agents';

class Assistant extends voice.Agent {
  constructor() {
    super({
      instructions: 'You are a helpful voice AI assistant.',
    });
  }

  async onEnter(): Promise<void> {
    const room = getJobContext().room;
    // Send an image file to the frontend
    await room.localParticipant!.sendFile('path/to/image.png', {
      topic: 'agent-images',
    });
  }
}

Receive images in your frontend

Register a byte stream handler in your frontend to receive images from the agent:

room.registerByteStreamHandler('agent-images', async (reader, participantInfo) => {
  // Collect the full payload and wrap it in a Blob of the original MIME type.
  const chunks = await reader.readAll();
  const blob = new Blob(chunks, { type: reader.info.mimeType });
  const objectUrl = URL.createObjectURL(blob);
  // Display the image in your UI
  const imageEl = document.createElement('img');
  imageEl.src = objectUrl;
  document.body.appendChild(imageEl);
});
try await room.registerByteStreamHandler(for: "agent-images") { reader, participantIdentity in
    // Read the complete image payload, then hop to the main queue for UI work.
    let data = try await reader.readAll()
    // Display the image in your UI
    DispatchQueue.main.async {
        let receivedImage = UIImage(data: data)
        let receivedImageView = UIImageView(image: receivedImage)
        self.view.addSubview(receivedImageView)
    }
}
room.registerByteStreamHandler("agent-images") { reader, participantIdentity ->
    myCoroutineScope.launch {
        // Concatenate every received chunk into a single byte array.
        val allBytes = reader.readAll().fold(ByteArray(0)) { acc, part -> acc + part }
        // Display the image in your UI
        withContext(Dispatchers.Main) {
            val decoded = BitmapFactory.decodeByteArray(allBytes, 0, allBytes.size)
            imageView.setImageBitmap(decoded)
        }
    }
}
room.registerByteStreamHandler('agent-images',
    (ByteStreamReader reader, String participantIdentity) async {
  // Flatten the received chunks into a single byte list.
  final received = await reader.readAll();
  final flattened = received.expand((part) => part).toList();
  // Display the image in your UI
  setState(() {
    imageBytes = Uint8List.fromList(flattened);
  });
});

For full details on byte streams, see Sending files & bytes.