Module `livekit.plugins.hume`

Hume AI TTS plugin for LiveKit Agents

See https://docs.livekit.io/agents/integrations/tts/hume/ for more information.

Sub-modules

livekit.plugins.hume.log

Classes

class AudioFormat (*args, **kwds)

Expand source code

class AudioFormat(str, Enum):
    """Audio format for the synthesized speech.

    Members are also ``str`` instances, so each member compares equal to its
    string value (for example ``AudioFormat.mp3 == "mp3"``).
    """

    mp3 = "mp3"
    wav = "wav"
    pcm = "pcm"

Audio format for the synthesized speech.

Ancestors

builtins.str
enum.Enum

Class variables

var mp3
var pcm
var wav

class TTS (*, api_key: str | None = None, voice: VoiceById | VoiceByName | None = {'name': 'Male English Actor', 'provider': <VoiceProvider.hume: 'HUME_AI'>}, model_version: ModelVersion | None = '1', description: str | None = None, speed: float | None = None, trailing_silence: float | None = None, context: str | list[Utterance] | None = None, instant_mode: NotGivenOr[bool] = NOT_GIVEN, audio_format: AudioFormat = AudioFormat.mp3, base_url: str = 'https://api.hume.ai', http_session: aiohttp.ClientSession | None = None)

Expand source code

class TTS(tts.TTS):
    """Hume AI text-to-speech for LiveKit Agents.

    The instance is registered as non-streaming (``streaming=False``); ``synthesize``
    wraps a complete input text in a ``ChunkedStream``. Options given at construction
    apply to every synthesis until they are changed with ``update_options``.
    """

    def __init__(
        self,
        *,
        api_key: str | None = None,
        voice: VoiceById | VoiceByName | None = DEFAULT_VOICE,
        model_version: ModelVersion | None = "1",
        description: str | None = None,
        speed: float | None = None,
        trailing_silence: float | None = None,
        context: str | list[Utterance] | None = None,
        instant_mode: NotGivenOr[bool] = NOT_GIVEN,
        audio_format: AudioFormat = AudioFormat.mp3,
        base_url: str = DEFAULT_BASE_URL,
        http_session: aiohttp.ClientSession | None = None,
    ):
        """Initialize the Hume AI TTS client. Options will be used for all future synthesis
        (until updated with update_options).

        Args:
            api_key: Hume AI API key. If not provided, will look for HUME_API_KEY environment
                variable.
            voice: A voice from the voice library specified by name or id.
            model_version: Specifies which version of Octave to use. See Hume's documentation for
                details on model version differences: https://dev.hume.ai/docs/text-to-speech-tts/overview.
            description: Natural language instructions describing how the synthesized speech
                should sound (≤1000 characters).
            speed: Speed multiplier for the synthesized speech (≥0.5, ≤2.0, default: 1.0).
            trailing_silence: Duration of trailing silence (in seconds) to add to each utterance
                (≥0, ≤5.0, default: 0.35).
            context: Optional context for synthesis, either as text or list of utterances.
            instant_mode: Whether to use instant mode. Defaults to True if voice specified,
                False otherwise. Requires a voice to be specified when enabled.
            audio_format: Output audio format (mp3, wav, or pcm). Defaults to mp3.
            base_url: Base URL for Hume AI API. Defaults to https://api.hume.ai
            http_session: Optional aiohttp ClientSession to use for requests.

        Raises:
            ValueError: If no API key is available from ``api_key`` or the ``HUME_API_KEY``
                environment variable, or if ``instant_mode`` is enabled while ``voice`` is
                ``None``.
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=False),
            sample_rate=SUPPORTED_SAMPLE_RATE,
            num_channels=1,
        )
        key = api_key or os.environ.get("HUME_API_KEY")
        if not key:
            raise ValueError("Hume API key is required via api_key or HUME_API_KEY env var")

        has_voice = voice is not None

        # Default instant_mode is True if a voice is specified, otherwise False
        # (Hume API requires a voice for instant mode)
        if not is_given(instant_mode):
            resolved_instant_mode = has_voice
        elif instant_mode and not has_voice:
            raise ValueError("Hume TTS: instant_mode cannot be enabled without specifying a voice")
        else:
            resolved_instant_mode = instant_mode

        self._opts = _TTSOptions(
            api_key=key,
            voice=voice,
            model_version=model_version,
            description=description,
            speed=speed,
            trailing_silence=trailing_silence,
            context=context,
            instant_mode=resolved_instant_mode,
            audio_format=audio_format,
            base_url=base_url,
        )
        # May be None; a session is then obtained on first use by _ensure_session().
        self._session = http_session

    @property
    def model(self) -> str:
        """Model name reported for this TTS instance (always ``"Octave"``)."""
        return "Octave"

    @property
    def provider(self) -> str:
        """Provider name reported for this TTS instance (always ``"Hume"``)."""
        return "Hume"

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the HTTP session, obtaining one from ``utils.http_context`` if none is set."""
        if not self._session:
            self._session = utils.http_context.http_session()

        return self._session

    def update_options(
        self,
        *,
        description: NotGivenOr[str | None] = NOT_GIVEN,
        speed: NotGivenOr[float | None] = NOT_GIVEN,
        voice: NotGivenOr[VoiceById | VoiceByName | None] = NOT_GIVEN,
        trailing_silence: NotGivenOr[float | None] = NOT_GIVEN,
        context: NotGivenOr[str | list[Utterance] | None] = NOT_GIVEN,
        instant_mode: NotGivenOr[bool] = NOT_GIVEN,
        audio_format: NotGivenOr[AudioFormat] = NOT_GIVEN,
    ) -> None:
        """Update TTS options used for all future synthesis (until updated again)

        Only the arguments that are explicitly passed are changed; the rest keep their
        current values.

        Args:
            voice: A voice from the voice library specified by name or id.
            description: Natural language instructions describing how the synthesized speech
                should sound (≤1000 characters).
            speed: Speed multiplier for the synthesized speech (≥0.5, ≤2.0, default: 1.0).
            trailing_silence: Duration of trailing silence (in seconds) to add to each utterance.
            context: Optional context for synthesis, either as text or list of utterances.
            instant_mode: Whether to use instant mode. Requires a voice to be specified when
                enabled.
            audio_format: Output audio format (mp3, wav, or pcm).

        Raises:
            ValueError: If the update would leave instant mode enabled without a voice
                (for example clearing ``voice`` without also passing
                ``instant_mode=False``). No option is modified in that case.
        """
        # Enforce the same invariant as __init__ (Hume API requires a voice for instant
        # mode). Validate the resulting combination before mutating anything so that a
        # rejected update leaves the current options untouched.
        next_voice = voice if is_given(voice) else self._opts.voice
        next_instant_mode = instant_mode if is_given(instant_mode) else self._opts.instant_mode
        if next_instant_mode and next_voice is None:
            raise ValueError("Hume TTS: instant_mode cannot be enabled without specifying a voice")

        if is_given(description):
            self._opts.description = description
        if is_given(speed):
            self._opts.speed = speed
        if is_given(voice):
            self._opts.voice = voice  # type: ignore
        if is_given(trailing_silence):
            self._opts.trailing_silence = trailing_silence
        if is_given(context):
            self._opts.context = context  # type: ignore
        if is_given(instant_mode):
            self._opts.instant_mode = instant_mode
        if is_given(audio_format):
            self._opts.audio_format = audio_format

    def synthesize(
        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> tts.ChunkedStream:
        """Create a ``ChunkedStream`` that synthesizes ``text`` with this instance.

        Args:
            text: The input text handed to the stream.
            conn_options: Connection options forwarded to the stream.
        """
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

Hume AI TTS implementation for LiveKit Agents. It derives from `abc.ABC`, the helper class that provides a standard way to create an ABC using inheritance.

Initialize the Hume AI TTS client. Options will be used for all future synthesis (until updated with update_options).

Args

api_key: Hume AI API key. If not provided, will look for HUME_API_KEY environment variable.
voice: A voice from the voice library specified by name or id.
model_version: Specifies which version of Octave to use. See Hume's documentation for details on model version differences: https://dev.hume.ai/docs/text-to-speech-tts/overview.
description: Natural language instructions describing how the synthesized speech should sound (≤1000 characters).
speed: Speed multiplier for the synthesized speech (≥0.5, ≤2.0, default: 1.0).
trailing_silence: Duration of trailing silence (in seconds) to add to each utterance (≥0, ≤5.0, default: 0.35).
context: Optional context for synthesis, either as text or list of utterances.
instant_mode: Whether to use instant mode. Defaults to True if voice specified, False otherwise. Requires a voice to be specified when enabled.
audio_format: Output audio format (mp3, wav, or pcm). Defaults to mp3.
base_url: Base URL for Hume AI API. Defaults to https://api.hume.ai
http_session: Optional aiohttp ClientSession to use for requests.

Ancestors

livekit.agents.tts.tts.TTS
abc.ABC
EventEmitter
typing.Generic

Instance variables

prop model : str

Expand source code

@property
def model(self) -> str:
    """Model name reported for this TTS instance (always ``"Octave"``)."""
    return "Octave"

Get the model name/identifier for this TTS instance.

Returns

The model name if available, "unknown" otherwise.

Note

Plugins should override this property to provide their model information.

prop provider : str

Expand source code

@property
def provider(self) -> str:
    """Provider name reported for this TTS instance (always ``"Hume"``)."""
    return "Hume"

Get the provider name/identifier for this TTS instance.

Returns

The provider name if available, "unknown" otherwise.

Note

Plugins should override this property to provide their provider information.

Methods

def synthesize(self, text: str, *, conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.agents.tts.tts.ChunkedStream

Expand source code

def synthesize(
    self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
) -> tts.ChunkedStream:
    """Create a ``ChunkedStream`` that synthesizes ``text`` with this instance.

    Args:
        text: The input text handed to the stream.
        conn_options: Connection options forwarded to the stream.
    """
    return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

def update_options(self, *, description: NotGivenOr[str | None] = NOT_GIVEN, speed: NotGivenOr[float | None] = NOT_GIVEN, voice: NotGivenOr[VoiceById | VoiceByName | None] = NOT_GIVEN, trailing_silence: NotGivenOr[float | None] = NOT_GIVEN, context: NotGivenOr[str | list[Utterance] | None] = NOT_GIVEN, instant_mode: NotGivenOr[bool] = NOT_GIVEN, audio_format: NotGivenOr[AudioFormat] = NOT_GIVEN) ‑> None

Expand source code

def update_options(
    self,
    *,
    description: NotGivenOr[str | None] = NOT_GIVEN,
    speed: NotGivenOr[float | None] = NOT_GIVEN,
    voice: NotGivenOr[VoiceById | VoiceByName | None] = NOT_GIVEN,
    trailing_silence: NotGivenOr[float | None] = NOT_GIVEN,
    context: NotGivenOr[str | list[Utterance] | None] = NOT_GIVEN,
    instant_mode: NotGivenOr[bool] = NOT_GIVEN,
    audio_format: NotGivenOr[AudioFormat] = NOT_GIVEN,
) -> None:
    """Update TTS options used for all future synthesis (until updated again)

    Only the arguments that are explicitly passed are changed; the rest keep their
    current values.

    Args:
        voice: A voice from the voice library specified by name or id.
        description: Natural language instructions describing how the synthesized speech
            should sound (≤1000 characters).
        speed: Speed multiplier for the synthesized speech (≥0.5, ≤2.0, default: 1.0).
        trailing_silence: Duration of trailing silence (in seconds) to add to each utterance.
        context: Optional context for synthesis, either as text or list of utterances.
        instant_mode: Whether to use instant mode. Requires a voice to be specified when
            enabled.
        audio_format: Output audio format (mp3, wav, or pcm).

    Raises:
        ValueError: If the update would leave instant mode enabled without a voice
            (for example clearing ``voice`` without also passing
            ``instant_mode=False``). No option is modified in that case.
    """
    # Instant mode needs a voice. Validate the combination that would result from this
    # update before mutating anything, so a rejected update leaves the current options
    # untouched.
    next_voice = voice if is_given(voice) else self._opts.voice
    next_instant_mode = instant_mode if is_given(instant_mode) else self._opts.instant_mode
    if next_instant_mode and next_voice is None:
        raise ValueError("Hume TTS: instant_mode cannot be enabled without specifying a voice")

    if is_given(description):
        self._opts.description = description
    if is_given(speed):
        self._opts.speed = speed
    if is_given(voice):
        self._opts.voice = voice  # type: ignore
    if is_given(trailing_silence):
        self._opts.trailing_silence = trailing_silence
    if is_given(context):
        self._opts.context = context  # type: ignore
    if is_given(instant_mode):
        self._opts.instant_mode = instant_mode
    if is_given(audio_format):
        self._opts.audio_format = audio_format

Update TTS options used for all future synthesis (until updated again)

Args

voice: A voice from the voice library specified by name or id.
description: Natural language instructions describing how the synthesized speech should sound (≤1000 characters).
speed: Speed multiplier for the synthesized speech (≥0.5, ≤2.0, default: 1.0).
trailing_silence: Duration of trailing silence (in seconds) to add to each utterance.
context: Optional context for synthesis, either as text or list of utterances.
instant_mode: Whether to use instant mode.
audio_format: Output audio format (mp3, wav, or pcm).

Inherited members

EventEmitter:
- emit
- off
- on
- once

class Utterance (*args, **kwargs)

Expand source code

class Utterance(TypedDict, total=False):
    """Utterance for TTS synthesis.

    Declared with ``total=False``, so every key is optional for type checking.
    """

    # NOTE(review): presumably the text of the utterance — confirm against the Hume API.
    text: str
    # NOTE(review): the keys below share names with the TTS constructor options and
    # presumably carry the same meaning for a single utterance — confirm.
    description: str | None
    speed: float | None
    voice: VoiceById | VoiceByName | None
    trailing_silence: float | None

Utterance for TTS synthesis.

Ancestors

builtins.dict

Class variables

var description : str | None
var speed : float | None
var text : str
var trailing_silence : float | None
var voice : livekit.plugins.hume.tts.VoiceById | livekit.plugins.hume.tts.VoiceByName | None

class VoiceById (*args, **kwargs)

Expand source code

class VoiceById(TypedDict, total=False):
    """A voice from the voice library, specified by id.

    Declared with ``total=False``, so every key is optional for type checking.
    """

    id: str
    provider: VoiceProvider | None

A voice from the voice library, specified by id. `VoiceById` is a `TypedDict`, so it is constructed like a plain `dict`:

dict() -> new empty dictionary
dict(mapping) -> new dictionary initialized from a mapping object's (key, value) pairs
dict(iterable) -> new dictionary initialized as if via:
    d = {}
    for k, v in iterable:
        d[k] = v
dict(**kwargs) -> new dictionary initialized with the name=value pairs in the keyword argument list. For example: dict(one=1, two=2)

Ancestors

builtins.dict

Class variables

var id : str
var provider : livekit.plugins.hume.tts.VoiceProvider | None

class VoiceByName (*args, **kwargs)

Expand source code

class VoiceByName(TypedDict, total=False):
    """A voice from the voice library, specified by name.

    Declared with ``total=False``, so every key is optional for type checking.
    """

    name: str
    provider: VoiceProvider | None

Ancestors

builtins.dict

Class variables

var name : str
var provider : livekit.plugins.hume.tts.VoiceProvider | None

class VoiceProvider (*args, **kwds)

Expand source code

class VoiceProvider(str, Enum):
    """Voice provider for the voice library.

    Members are also ``str`` instances, so each member compares equal to its
    string value (for example ``VoiceProvider.hume == "HUME_AI"``).
    """

    # NOTE(review): the values look like the provider identifiers expected by the
    # Hume API — confirm against the API reference.
    hume = "HUME_AI"
    custom = "CUSTOM_VOICE"

Voice provider for the voice library.

Ancestors

builtins.str
enum.Enum

Class variables

var custom
var hume