Module `livekit.rtc.audio_frame`

Classes

class AudioFrame (data: bytes | bytearray | memoryview, sample_rate: int, num_channels: int, samples_per_channel: int)

Expand source code

class AudioFrame:
    """
    A class that represents a frame of audio data with specific properties such as sample rate,
    number of channels, and samples per channel.

    The format of the audio data is 16-bit signed integers (int16) interleaved by channel.
    """

    def __init__(
        self,
        data: Union[bytes, bytearray, memoryview],
        sample_rate: int,
        num_channels: int,
        samples_per_channel: int,
    ) -> None:
        """
        Initialize an AudioFrame instance.

        Args:
            data (Union[bytes, bytearray, memoryview]): The raw audio data, which must be at least
                `num_channels * samples_per_channel * sizeof(int16)` bytes long.
            sample_rate (int): The sample rate of the audio in Hz.
            num_channels (int): The number of audio channels (e.g., 1 for mono, 2 for stereo).
            samples_per_channel (int): The number of samples per channel.

        Raises:
            ValueError: If the length of `data` is smaller than the required size.
        """
        data = memoryview(data).cast("B")

        if len(data) < num_channels * samples_per_channel * ctypes.sizeof(ctypes.c_int16):
            raise ValueError(
                "data length must be >= num_channels * samples_per_channel * sizeof(int16)"
            )

        if len(data) % ctypes.sizeof(ctypes.c_int16) != 0:
            # can happen if data is bigger than needed
            raise ValueError("data length must be a multiple of sizeof(int16)")

        n = len(data) // ctypes.sizeof(ctypes.c_int16)
        self._data = (ctypes.c_int16 * n).from_buffer_copy(data)

        self._sample_rate = sample_rate
        self._num_channels = num_channels
        self._samples_per_channel = samples_per_channel

    @staticmethod
    def create(sample_rate: int, num_channels: int, samples_per_channel: int) -> "AudioFrame":
        """
        Create a new empty AudioFrame instance with specified sample rate, number of channels,
        and samples per channel.

        Args:
            sample_rate (int): The sample rate of the audio in Hz.
            num_channels (int): The number of audio channels (e.g., 1 for mono, 2 for stereo).
            samples_per_channel (int): The number of samples per channel.

        Returns:
            AudioFrame: A new AudioFrame instance with uninitialized (zeroed) data.
        """
        size = num_channels * samples_per_channel * ctypes.sizeof(ctypes.c_int16)
        data = bytearray(size)
        return AudioFrame(data, sample_rate, num_channels, samples_per_channel)

    @staticmethod
    def _from_owned_info(owned_info: proto_audio.OwnedAudioFrameBuffer) -> "AudioFrame":
        info = owned_info.info
        size = info.num_channels * info.samples_per_channel
        cdata = (ctypes.c_int16 * size).from_address(info.data_ptr)
        data = bytearray(cdata)
        FfiHandle(owned_info.handle.id)
        return AudioFrame(data, info.sample_rate, info.num_channels, info.samples_per_channel)

    def _proto_info(self) -> proto_audio.AudioFrameBufferInfo:
        audio_info = proto_audio.AudioFrameBufferInfo()
        audio_info.data_ptr = get_address(memoryview(self._data))
        audio_info.sample_rate = self.sample_rate
        audio_info.num_channels = self.num_channels
        audio_info.samples_per_channel = self.samples_per_channel
        return audio_info

    @property
    def data(self) -> memoryview:
        """
        Returns a memory view of the audio data as 16-bit signed integers.

        Returns:
            memoryview: A memory view of the audio data.
        """
        return memoryview(self._data).cast("B").cast("h")

    @property
    def sample_rate(self) -> int:
        """
        Returns the sample rate of the audio frame.

        Returns:
            int: The sample rate in Hz.
        """
        return self._sample_rate

    @property
    def num_channels(self) -> int:
        """
        Returns the number of channels in the audio frame.

        Returns:
            int: The number of audio channels (e.g., 1 for mono, 2 for stereo).
        """
        return self._num_channels

    @property
    def samples_per_channel(self) -> int:
        """
        Returns the number of samples per channel.

        Returns:
            int: The number of samples per channel.
        """
        return self._samples_per_channel

    @property
    def duration(self) -> float:
        """
        Returns the duration of the audio frame in seconds.

        Returns:
            float: The duration in seconds.
        """
        return self.samples_per_channel / self.sample_rate

    def to_wav_bytes(self) -> bytes:
        """
        Convert the audio frame data to a WAV-formatted byte stream.

        Returns:
            bytes: The audio data encoded in WAV format.
        """
        import wave
        import io

        with io.BytesIO() as wav_file:
            with wave.open(wav_file, "wb") as wav:
                wav.setnchannels(self.num_channels)
                wav.setsampwidth(2)
                wav.setframerate(self.sample_rate)
                wav.writeframes(self._data)

            return wav_file.getvalue()

    def __repr__(self) -> str:
        return (
            f"rtc.AudioFrame(sample_rate={self.sample_rate}, "
            f"num_channels={self.num_channels}, "
            f"samples_per_channel={self.samples_per_channel}, "
            f"duration={self.duration:.3f})"
        )

    @classmethod
    def __get_pydantic_core_schema__(cls, *_: Any):
        from pydantic_core import core_schema
        import base64

        def validate_audio_frame(value: Any) -> "AudioFrame":
            if isinstance(value, AudioFrame):
                return value

            if isinstance(value, tuple):
                value = value[0]

            if isinstance(value, dict):
                return AudioFrame(
                    data=base64.b64decode(value["data"]),
                    sample_rate=value["sample_rate"],
                    num_channels=value["num_channels"],
                    samples_per_channel=value["samples_per_channel"],
                )

            raise TypeError("Invalid type for AudioFrame")

        return core_schema.json_or_python_schema(
            json_schema=core_schema.chain_schema(
                [
                    core_schema.model_fields_schema(
                        {
                            "data": core_schema.model_field(core_schema.str_schema()),
                            "sample_rate": core_schema.model_field(core_schema.int_schema()),
                            "num_channels": core_schema.model_field(core_schema.int_schema()),
                            "samples_per_channel": core_schema.model_field(
                                core_schema.int_schema()
                            ),
                        },
                    ),
                    core_schema.no_info_plain_validator_function(validate_audio_frame),
                ]
            ),
            python_schema=core_schema.no_info_plain_validator_function(validate_audio_frame),
            serialization=core_schema.plain_serializer_function_ser_schema(
                lambda instance: {
                    "data": base64.b64encode(instance.data).decode("utf-8"),
                    "sample_rate": instance.sample_rate,
                    "num_channels": instance.num_channels,
                    "samples_per_channel": instance.samples_per_channel,
                }
            ),
        )

A class that represents a frame of audio data with specific properties such as sample rate, number of channels, and samples per channel.

The format of the audio data is 16-bit signed integers (int16) interleaved by channel.

Initialize an AudioFrame instance.

Args

data : Union[bytes, bytearray, memoryview]: The raw audio data, which must be at least num_channels * samples_per_channel * sizeof(int16) bytes long.
sample_rate : int: The sample rate of the audio in Hz.
num_channels : int: The number of audio channels (e.g., 1 for mono, 2 for stereo).
samples_per_channel : int: The number of samples per channel.

Raises

ValueError: If the length of data is smaller than the required size.

Static methods

def create(sample_rate: int, num_channels: int, samples_per_channel: int) ‑> AudioFrame

Expand source code

@staticmethod
def create(sample_rate: int, num_channels: int, samples_per_channel: int) -> "AudioFrame":
    """
    Create a new empty AudioFrame instance with specified sample rate, number of channels,
    and samples per channel.

    Args:
        sample_rate (int): The sample rate of the audio in Hz.
        num_channels (int): The number of audio channels (e.g., 1 for mono, 2 for stereo).
        samples_per_channel (int): The number of samples per channel.

    Returns:
        AudioFrame: A new AudioFrame instance with uninitialized (zeroed) data.
    """
    size = num_channels * samples_per_channel * ctypes.sizeof(ctypes.c_int16)
    data = bytearray(size)
    return AudioFrame(data, sample_rate, num_channels, samples_per_channel)

Create a new empty AudioFrame instance with specified sample rate, number of channels, and samples per channel.

Args

sample_rate : int: The sample rate of the audio in Hz.
num_channels : int: The number of audio channels (e.g., 1 for mono, 2 for stereo).
samples_per_channel : int: The number of samples per channel.

Returns

AudioFrame: A new AudioFrame instance with uninitialized (zeroed) data.

Instance variables

prop data : memoryview

Expand source code

@property
def data(self) -> memoryview:
    """
    Returns a memory view of the audio data as 16-bit signed integers.

    Returns:
        memoryview: A memory view of the audio data.
    """
    return memoryview(self._data).cast("B").cast("h")

Returns a memory view of the audio data as 16-bit signed integers.

Returns

memoryview: A memory view of the audio data.

prop duration : float

Expand source code

@property
def duration(self) -> float:
    """
    Returns the duration of the audio frame in seconds.

    Returns:
        float: The duration in seconds.
    """
    return self.samples_per_channel / self.sample_rate

Returns the duration of the audio frame in seconds.

Returns

float: The duration in seconds.

prop num_channels : int

Expand source code

@property
def num_channels(self) -> int:
    """
    Returns the number of channels in the audio frame.

    Returns:
        int: The number of audio channels (e.g., 1 for mono, 2 for stereo).
    """
    return self._num_channels

Returns the number of channels in the audio frame.

Returns

int: The number of audio channels (e.g., 1 for mono, 2 for stereo).

prop sample_rate : int

Expand source code

@property
def sample_rate(self) -> int:
    """
    Returns the sample rate of the audio frame.

    Returns:
        int: The sample rate in Hz.
    """
    return self._sample_rate

Returns the sample rate of the audio frame.

Returns

int: The sample rate in Hz.

prop samples_per_channel : int

Expand source code

@property
def samples_per_channel(self) -> int:
    """
    Returns the number of samples per channel.

    Returns:
        int: The number of samples per channel.
    """
    return self._samples_per_channel

Returns the number of samples per channel.

Returns

int: The number of samples per channel.

Methods

def to_wav_bytes(self) ‑> bytes

Expand source code

def to_wav_bytes(self) -> bytes:
    """
    Convert the audio frame data to a WAV-formatted byte stream.

    Returns:
        bytes: The audio data encoded in WAV format.
    """
    import wave
    import io

    with io.BytesIO() as wav_file:
        with wave.open(wav_file, "wb") as wav:
            wav.setnchannels(self.num_channels)
            wav.setsampwidth(2)
            wav.setframerate(self.sample_rate)
            wav.writeframes(self._data)

        return wav_file.getvalue()

Convert the audio frame data to a WAV-formatted byte stream.

Returns

bytes: The audio data encoded in WAV format.