Skip to main content

Transcriptions

Generate realtime transcriptions of agent sessions using client SDKs.

Agents 1.0 available

This documentation is for v0.x of the LiveKit Agents framework.

For the latest documentation, see Text and transcriptions.

Overview

The Agents framework includes the ability to capture and deliver realtime transcriptions of a user's speech and LLM-generated speech or text.

Both VoicePipelineAgent and MultimodalAgent can forward transcriptions to clients automatically if you implement support for receiving them in your frontend. If you're not using either of these agent classes, you can add transcription forwarding to your agent code.

To learn more about creating transcriptions in the agent process, see Recording agent sessions.

Frontend integration

You can use a LiveKit SDK to receive transcription events in your frontend.

Transcriptions are delivered in segments, each associated with a particular Participant and Track. Each segment has a unique id. Segments might be sent as fragments as they're generated. You can monitor the final property to determine when a segment is complete.

This example uses React with TypeScript, but the principles are the same for other frameworks.

Collect TranscriptionSegment by listening to RoomEvent.TranscriptionReceived:

import { useEffect, useState } from "react";
import {
  TranscriptionSegment,
  Participant,
  TrackPublication,
  RoomEvent,
} from "livekit-client";
import { useMaybeRoomContext } from "@livekit/components-react";

/**
 * Displays transcription segments received from the room, ordered by the
 * time their first fragment arrived. Segments are keyed by id so later
 * fragments of the same segment replace earlier ones.
 */
export default function Transcriptions() {
  const room = useMaybeRoomContext();
  const [segments, setSegments] = useState<{ [id: string]: TranscriptionSegment }>({});

  useEffect(() => {
    if (!room) {
      return;
    }
    // Merge each incoming batch into the map keyed by segment id.
    const onTranscription = (
      received: TranscriptionSegment[],
      participant?: Participant,
      publication?: TrackPublication
    ) => {
      setSegments((prev) => {
        const next = { ...prev };
        received.forEach((seg) => {
          next[seg.id] = seg;
        });
        return next;
      });
    };
    room.on(RoomEvent.TranscriptionReceived, onTranscription);
    // Detach the handler when the room changes or the component unmounts.
    return () => {
      room.off(RoomEvent.TranscriptionReceived, onTranscription);
    };
  }, [room]);

  return (
    <ul>
      {Object.values(segments)
        .sort((a, b) => a.firstReceivedTime - b.firstReceivedTime)
        .map((seg) => (
          <li key={seg.id}>{seg.text}</li>
        ))}
    </ul>
  )
}

Collect TranscriptionSegment in your RoomDelegate implementation:

// Latest segment per id; published so SwiftUI views update as fragments arrive.
@Published var receivedSegments: [String: TranscriptionSegment] = [:]

// RoomDelegate callback: upsert each received segment on the main actor so
// the published dictionary is only mutated from the UI thread.
func room(_ room: Room, participant: Participant, trackPublication: TrackPublication, didReceiveTranscriptionSegments segments: [TranscriptionSegment]) {
    Task { @MainActor in
        segments.forEach { self.receivedSegments[$0.id] = $0 }
    }
}

Then present them in your view:

// Present segments oldest-first; an ellipsis marks segments still in progress.
List {
    ForEach(
        roomDelegate.receivedSegments.values
            .sorted(by: { $0.firstReceivedTime < $1.firstReceivedTime }),
        id: \.id
    ) { segment in
        Text(segment.isFinal ? segment.text : "\(segment.text) …")
    }
}

Collect TranscriptionSegment by listening to TranscriptionReceived on the Room:

// Segment-id -> latest segment; mutableStateMapOf makes changes observable by Compose.
var transcriptions = remember { mutableStateMapOf<String, TranscriptionSegment>() }
// Restarted whenever `room` changes; collects for the life of the composition.
LaunchedEffect(room) {
    room.events.collect { event ->
        when (event) {
            is RoomEvent.TranscriptionReceived -> {
                // NOTE(review): assumes mergeNewSegments upserts the received segments
                // and returns the resulting map — confirm against its definition, and
                // that reassigning the local var (vs. mutating the state map in place)
                // still triggers recomposition.
                transcriptions = transcriptions.mergeNewSegments(event.transcriptionSegments)
            }
            else -> {}
        }
    }
}

Then present them in your view:

// Render segments oldest-first; firstReceivedTime gives chronological order.
LazyColumn(modifier = Modifier.fillMaxSize()) {
    val sortedTranscriptions = transcriptions.values.toList().sortedBy { it.firstReceivedTime }
    items(sortedTranscriptions.size) { index ->
        Text(
            text = sortedTranscriptions[index].text,
            modifier = Modifier.padding(8.dp)
        )
    }
}

Collect TranscriptionSegment by listening to TranscriptionEvent on the Room:

// Latest segment per id; fragments with the same id overwrite earlier ones.
Map<String, TranscriptionSegment> _transcriptions = {};
// Event listener bound to this room; dispose it with the widget's lifecycle.
late final EventsListener<RoomEvent> _listener = _room.createListener();
_listener.on<TranscriptionEvent>((event) {
  // setState so the widget rebuilds with the newly merged segments.
  setState(() {
    for (final segment in event.segments) {
      _transcriptions[segment.id] = segment;
    }
  });
});

Sort them for presentation:

// Order segments chronologically by when their first fragment arrived.
final sortedTranscriptions = _transcriptions.values.toList()
  ..sort((a, b) => a.firstReceivedTime.compareTo(b.firstReceivedTime));

Then render them in a list:

// One tile per segment, in the chronological order computed above.
ListView(
  children: [
    for (final segment in sortedTranscriptions)
      ListTile(title: Text(segment.text)),
  ],
)

Agent integration

The STTSegmentsForwarder class provides an interface for delivering transcriptions from your custom agent to your frontend in realtime. Here's a sample implementation:

import asyncio

from livekit import rtc
from livekit.agents import JobContext, stt, transcription
from livekit.plugins.deepgram import STT
async def _forward_transcription(
    stt_stream: stt.SpeechStream,
    stt_forwarder: transcription.STTSegmentsForwarder,
):
    """Relay STT events to the frontend forwarder and echo them to the console.

    Interim transcripts are printed inline (no newline) so the line updates in
    place; final transcripts are printed on their own line with an arrow marker.
    """
    async for event in stt_stream:
        # Push every event to the forwarder so clients receive it in realtime.
        stt_forwarder.update(event)
        if event.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
            print("\n")
            print(" -> ", event.alternatives[0].text)
        elif event.type == stt.SpeechEventType.INTERIM_TRANSCRIPT:
            print(event.alternatives[0].text, end="")
async def entrypoint(job: JobContext):
    """Job entrypoint: transcribe every subscribed audio track in the room.

    For each audio track that gets subscribed, frames are pushed into a
    dedicated STT stream while a companion task forwards the resulting
    transcription events to the frontend via STTSegmentsForwarder.
    """
    # Named `stt_provider` (not `stt`) so it does not shadow the `stt` module
    # imported at the top of the file.
    stt_provider = STT()
    # Keep task references alive so they are not garbage-collected mid-flight.
    tasks = []

    async def transcribe_track(participant: rtc.RemoteParticipant, track: rtc.Track):
        # One STT stream + forwarder pair per subscribed audio track.
        audio_stream = rtc.AudioStream(track)
        stt_forwarder = transcription.STTSegmentsForwarder(
            room=job.room, participant=participant, track=track
        )
        stt_stream = stt_provider.stream()
        stt_task = asyncio.create_task(
            _forward_transcription(stt_stream, stt_forwarder)
        )
        tasks.append(stt_task)
        # Feed raw audio frames into the STT stream as they arrive.
        async for ev in audio_stream:
            stt_stream.push_frame(ev.frame)

    @job.room.on("track_subscribed")
    def on_track_subscribed(
        track: rtc.Track,
        publication: rtc.TrackPublication,
        participant: rtc.RemoteParticipant,
    ):
        # Only audio tracks are transcribed; other kinds are ignored.
        if track.kind == rtc.TrackKind.KIND_AUDIO:
            tasks.append(asyncio.create_task(transcribe_track(participant, track)))