Skip to main content

Images

Add images to your agent's context, receive images from the frontend, and send images back to users.

Overview

LiveKit Agents supports images as both input and output. On the input side, you can add images to your agent's chat context, receive images from the frontend, or sample video frames. On the output side, you can send images to the frontend using byte streams.

Image input

The agent's chat context supports images as well as text. You can add as many images as you want to the chat context, but keep in mind that larger context windows increase LLM response times.

To add an image to the chat context, create an ImageContent object and include it in a chat message. The image content can be a base64 data URL, an external URL, or a frame from a video track.

Load into initial context

The following example shows an agent initialized with an image at startup. This example uses an external URL, but you can modify it to load a local file using a base64 data URL instead:

from livekit.agents import Agent, AgentSession, ChatContext, JobContext
from livekit.agents.llm import ImageContent


# Entrypoint must be async: it awaits session.start (and typically ctx.connect).
async def entrypoint(ctx: JobContext):
    # ctx.connect, etc.
    session = AgentSession(
        # ... stt, tts, llm, etc.
    )

    # Seed the chat context with one user message that mixes text and an image.
    initial_ctx = ChatContext()
    initial_ctx.add_message(
        role="user",
        content=[
            "Here is a picture of me",
            ImageContent(image="https://example.com/image.jpg"),
        ],
    )

    await session.start(
        room=ctx.room,
        agent=Agent(chat_ctx=initial_ctx),
        # ... room_options, etc.
    )
import { type JobContext, defineAgent, llm, voice } from '@livekit/agents';

export default defineAgent({
  entry: async (ctx: JobContext) => {
    // await ctx.connect(), etc.

    // Seed the chat context with one user message that mixes text and an image.
    const initialCtx = llm.ChatContext.empty();
    initialCtx.addMessage({
      role: 'user',
      content: [
        'Here is a picture of me',
        llm.createImageContent({
          image: 'https://example.com/image.jpg',
        }),
      ],
    });

    const agent = new voice.Agent({
      instructions: 'You are a helpful voice AI assistant.',
      chatCtx: initialCtx,
    });
    const session = new voice.AgentSession({
      // ... stt, tts, llm, etc.
    });

    await session.start({
      room: ctx.room,
      agent,
      // ... inputOptions, etc.
    });
  },
});
LLM provider support for external URLs

Not every LLM provider supports external image URLs. Consult your provider's documentation for details.

Upload from frontend

To upload an image from your frontend app, use the sendFile method of the LiveKit SDK. Add a byte stream handler to your agent to receive the image data and add it to the chat context. Here is a simple agent capable of receiving images from the user on the byte stream topic "images":

import asyncio
import base64

from livekit.agents import Agent, get_job_context
from livekit.agents.llm import ImageContent


class Assistant(Agent):
    """Agent that receives images from the frontend on the "images" byte stream topic."""

    def __init__(self) -> None:
        # Hold strong references so in-flight tasks aren't garbage collected.
        self._tasks = []
        super().__init__(instructions="You are a helpful voice AI assistant.")

    async def on_enter(self):
        def _image_received_handler(reader, participant_identity):
            task = asyncio.create_task(
                self._image_received(reader, participant_identity)
            )
            self._tasks.append(task)
            task.add_done_callback(lambda t: self._tasks.remove(t))

        # Add the handler when the agent joins
        get_job_context().room.register_byte_stream_handler(
            "images", _image_received_handler
        )

    async def _image_received(self, reader, participant_identity):
        # Accumulate the full image payload from the stream.
        image_bytes = bytes()
        async for chunk in reader:
            image_bytes += chunk

        chat_ctx = self.chat_ctx.copy()
        # Encode the image to base64 and add it to the chat context
        chat_ctx.add_message(
            role="user",
            content=[
                ImageContent(
                    image=f"data:image/png;base64,{base64.b64encode(image_bytes).decode('utf-8')}"
                )
            ],
        )
        await self.update_chat_ctx(chat_ctx)
import { Task, getJobContext, llm, voice } from '@livekit/agents';
import type { ByteStreamReader } from '@livekit/rtc-node';

class Assistant extends voice.Agent {
  // Hold strong references so in-flight tasks aren't garbage collected.
  private tasks: Set<Task<void>> = new Set();

  constructor() {
    super({
      instructions: 'You are a helpful voice AI assistant.',
    });
  }

  async onEnter(): Promise<void> {
    // Register byte stream handler for receiving images
    getJobContext().room.registerByteStreamHandler('images', async (stream: ByteStreamReader) => {
      const task = Task.from((controller) => this.imageReceived(stream, controller));
      this.tasks.add(task);
      task.result.finally(() => {
        this.tasks.delete(task);
      });
    });
  }

  private async imageReceived(
    stream: ByteStreamReader,
    controller: AbortController,
  ): Promise<void> {
    const chunks: Uint8Array[] = [];
    // Read all chunks from the stream
    for await (const chunk of stream) {
      if (controller.signal.aborted) return;
      chunks.push(chunk);
    }

    // Combine all chunks into a single buffer
    const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
    const imageBytes = new Uint8Array(totalLength);
    let offset = 0;
    for (const chunk of chunks) {
      imageBytes.set(chunk, offset);
      offset += chunk.length;
    }

    const chatCtx = this.chatCtx.copy();
    // Encode the image to base64 and add it to the chat context
    const imageContent = llm.createImageContent({
      image: `data:image/png;base64,${Buffer.from(imageBytes).toString('base64')}`,
      inferenceDetail: 'auto',
    });
    chatCtx.addMessage({
      role: 'user',
      content: [imageContent],
    });

    // Skip the context update if the task was cancelled mid-stream.
    if (controller.signal.aborted) return;
    await this.updateChatCtx(chatCtx);
  }
}

Inference detail

If your LLM provider supports it, you can set the inference_detail parameter to "high" or "low" to control the tradeoff between inference quality and token usage. The default is "auto", which uses the provider's default.

Image output

Your agent can send images to the frontend using byte streams. Use this to share generated images, diagrams, screenshots, or any other visual content from your agent to the user.

To send an image, use the send_file method on the room's local participant. The frontend receives the image by registering a byte stream handler for the same topic.

Send an image from your agent

from livekit.agents import Agent, get_job_context


class Assistant(Agent):
    """Agent that sends an image to the frontend when it joins the room."""

    def __init__(self) -> None:
        super().__init__(instructions="You are a helpful voice AI assistant.")

    async def on_enter(self):
        room = get_job_context().room
        # Send an image file to the frontend
        await room.local_participant.send_file(
            file_path="path/to/image.png",
            topic="agent-images",
        )
import { getJobContext, voice } from '@livekit/agents';

class Assistant extends voice.Agent {
  constructor() {
    super({
      instructions: 'You are a helpful voice AI assistant.',
    });
  }

  async onEnter(): Promise<void> {
    const room = getJobContext().room;
    // Send an image file to the frontend
    await room.localParticipant!.sendFile('path/to/image.png', {
      topic: 'agent-images',
    });
  }
}

Receive images in your frontend

Register a byte stream handler in your frontend to receive images from the agent:

room.registerByteStreamHandler('agent-images', async (reader, participantInfo) => {
  // Collect the full payload and wrap it in a Blob of the original MIME type.
  const chunks = await reader.readAll();
  const blob = new Blob(chunks, { type: reader.info.mimeType });
  const objectUrl = URL.createObjectURL(blob);
  // Display the image in your UI
  const imageEl = document.createElement('img');
  imageEl.src = objectUrl;
  document.body.appendChild(imageEl);
});
try await room.registerByteStreamHandler(for: "agent-images") { reader, participantIdentity in
    // Read the complete image payload, then hop to the main queue for UI work.
    let data = try await reader.readAll()
    // Display the image in your UI
    DispatchQueue.main.async {
        let receivedImage = UIImage(data: data)
        let receivedImageView = UIImageView(image: receivedImage)
        self.view.addSubview(receivedImageView)
    }
}
room.registerByteStreamHandler("agent-images") { reader, participantIdentity ->
    myCoroutineScope.launch {
        // Concatenate every received chunk into a single byte array.
        val allBytes = reader.readAll().fold(ByteArray(0)) { acc, part -> acc + part }
        // Display the image in your UI
        withContext(Dispatchers.Main) {
            val decoded = BitmapFactory.decodeByteArray(allBytes, 0, allBytes.size)
            imageView.setImageBitmap(decoded)
        }
    }
}
room.registerByteStreamHandler('agent-images',
    (ByteStreamReader reader, String participantIdentity) async {
  // Flatten the received chunks into a single byte list.
  final received = await reader.readAll();
  final flattened = received.expand((part) => part).toList();
  // Display the image in your UI
  setState(() {
    imageBytes = Uint8List.fromList(flattened);
  });
});

For full details on byte streams, see Sending files & bytes.