Module livekit.plugins.fishaudio

Fish Audio plugin for LiveKit Agents

See https://docs.fish.audio for more information.

Environment variables used:

  • FISH_API_KEY for authentication (required unless an API key is passed via the api_key argument)

Classes

class TTS (*,
api_key: NotGivenOr[str] = NOT_GIVEN,
model: Backends = 's1',
reference_id: NotGivenOr[str] = '8ef4a238714b45718ce04243307c57a7',
output_format: OutputFormat = 'pcm',
sample_rate: int = 24000,
num_channels: int = 1,
base_url: NotGivenOr[str] = NOT_GIVEN,
latency_mode: LatencyMode = 'balanced')
Expand source code
class TTS(tts.TTS):
    """
    Fish Audio TTS implementation for LiveKit Agents.

    This plugin provides text-to-speech synthesis using Fish Audio's API.
    It supports both chunked (non-streaming) and real-time WebSocket streaming modes,
    as well as reference ID-based and custom reference audio-based synthesis.

    Args:
        api_key (NotGivenOr[str]): Fish Audio API key. Can be set via argument or `FISH_API_KEY` environment variable.
        model (Backends): TTS model/backend to use. Defaults to "s1".
        reference_id (NotGivenOr[str]): Reference voice model ID. Defaults to a general-purpose voice.
        output_format (OutputFormat): Audio output format. Defaults to "pcm" for streaming.
        sample_rate (int): Audio sample rate in Hz. Defaults to 24000.
        num_channels (int): Number of audio channels. Defaults to 1 (mono).
        base_url (NotGivenOr[str]): Custom base URL for the Fish Audio API. Optional.
        latency_mode (LatencyMode): Streaming latency mode for WebSocket. "normal" (~500ms) or "balanced" (~300ms). Defaults to "balanced".

    Raises:
        ValueError: If no API key is passed and `FISH_API_KEY` is unset or empty.
    """

    def __init__(
        self,
        *,
        api_key: NotGivenOr[str] = NOT_GIVEN,
        model: Backends = DEFAULT_MODEL,
        reference_id: NotGivenOr[str] = DEFAULT_REFERENCE_ID,
        output_format: OutputFormat = "pcm",
        sample_rate: int = 24000,
        num_channels: int = 1,
        base_url: NotGivenOr[str] = NOT_GIVEN,
        latency_mode: LatencyMode = "balanced",
    ) -> None:
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=True),
            sample_rate=sample_rate,
            num_channels=num_channels,
        )

        # An explicit argument wins; the environment variable is only a fallback.
        fish_api_key = api_key if is_given(api_key) else os.getenv("FISH_API_KEY")
        if not fish_api_key:
            raise ValueError(
                "Fish Audio API key is required, either as argument or set FISH_API_KEY environment variable"
            )

        self._opts = _TTSOptions(
            model=model,
            output_format=output_format,
            sample_rate=sample_rate,
            num_channels=num_channels,
            reference_id=reference_id,
            base_url=base_url if is_given(base_url) else "https://api.fish.audio",
            api_key=fish_api_key,
            latency_mode=latency_mode,
        )

        # Initialize Fish Audio sessions
        self._session = FishAudioSession(self._opts.api_key, base_url=self._opts.base_url)

        # WebSocket session for streaming (lazy initialized)
        self._ws_session: AsyncWebSocketSession | None = None
        self._ws_session_lock = asyncio.Lock()

        # Track active streams; weak references, so this set does not keep streams alive
        self._streams = weakref.WeakSet[SynthesizeStream]()

        logger.info(
            "FishAudioTTS initialized",
            extra={
                "model": self._opts.model,
                "format": self._opts.output_format,
                "sample_rate": self._opts.sample_rate,
                "latency_mode": self._opts.latency_mode,
            },
        )

    @property
    def model(self) -> Backends:
        """TTS model/backend configured for this instance."""
        return self._opts.model

    @property
    def output_format(self) -> OutputFormat:
        """Audio output format configured for this instance."""
        return self._opts.output_format

    @property
    def reference_id(self) -> NotGivenOr[str]:
        """Reference voice model ID configured for this instance."""
        return self._opts.reference_id

    @property
    def session(self) -> FishAudioSession:
        """The `FishAudioSession` created in `__init__`."""
        return self._session

    @property
    def latency_mode(self) -> LatencyMode:
        """Streaming latency mode configured for this instance."""
        return self._opts.latency_mode

    async def _ensure_ws_session(self) -> AsyncWebSocketSession:
        """Get the current WebSocket session, creating one if needed"""
        # The lock keeps concurrent callers from each creating their own session.
        async with self._ws_session_lock:
            if self._ws_session is None:
                self._ws_session = AsyncWebSocketSession(
                    apikey=self._opts.api_key, base_url=self._opts.base_url
                )
            return self._ws_session

    def synthesize(
        self,
        text: str,
        *,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> ChunkedStream:
        """
        Synthesize speech from text using chunked (non-streaming) mode.

        Args:
            text (str): The text to synthesize.
            conn_options (APIConnectOptions): Connection options for the API call.

        Returns:
            ChunkedStream: A stream object that will produce synthesized audio.
        """
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

    def stream(
        self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> SynthesizeStream:
        """
        Create a real-time streaming TTS session using WebSocket.

        The new stream is added to the set of tracked streams so that
        `aclose` can close it later.

        Args:
            conn_options (APIConnectOptions): Connection options for the WebSocket.

        Returns:
            SynthesizeStream: A streaming object for real-time text-to-speech.
        """
        stream = SynthesizeStream(tts=self, conn_options=conn_options)
        self._streams.add(stream)
        return stream

    async def aclose(self) -> None:
        """
        Close TTS resources and WebSocket sessions.

        Tracked streams are closed first, one after another. The stream set is
        cleared and the WebSocket session is closed in a ``finally`` block, so
        both still happen when closing a stream raises; that exception then
        propagates to the caller.

        NOTE(review): the `FishAudioSession` held in `self._session` is not
        closed here - confirm whether it needs explicit cleanup.
        """
        try:
            # Iterate over a snapshot: the WeakSet may change while we await.
            for stream in list(self._streams):
                await stream.aclose()
        finally:
            self._streams.clear()

            # Detach the session before closing it so that a failing close()
            # cannot leave a half-closed session cached for _ensure_ws_session().
            ws_session, self._ws_session = self._ws_session, None
            if ws_session is not None:
                await ws_session.close()

Fish Audio TTS implementation for LiveKit Agents.

This plugin provides text-to-speech synthesis using Fish Audio's API. It supports both chunked (non-streaming) and real-time WebSocket streaming modes, as well as reference ID-based and custom reference audio-based synthesis.

Args

api_key : NotGivenOr[str]
Fish Audio API key. Can be set via argument or FISH_API_KEY environment variable.
model : Backends
TTS model/backend to use. Defaults to "s1".
reference_id : NotGivenOr[str]
Reference voice model ID. Defaults to a general-purpose voice.
output_format : OutputFormat
Audio output format. Defaults to "pcm" for streaming.
sample_rate : int
Audio sample rate in Hz. Defaults to 24000.
num_channels : int
Number of audio channels. Defaults to 1 (mono).
base_url : NotGivenOr[str]
Custom base URL for the Fish Audio API. Optional.
latency_mode : LatencyMode
Streaming latency mode for WebSocket. "normal" (~500ms) or "balanced" (~300ms). Defaults to "balanced".

Ancestors

  • livekit.agents.tts.tts.TTS
  • abc.ABC
  • EventEmitter
  • typing.Generic

Instance variables

prop latency_mode : LatencyMode
Expand source code
@property
def latency_mode(self) -> LatencyMode:
    """Return the latency mode stored in this instance's options."""
    options = self._opts
    return options.latency_mode
prop model : Backends
Expand source code
@property
def model(self) -> Backends:
    """Return the model value stored in this instance's options."""
    options = self._opts
    return options.model

Get the model name/identifier for this TTS instance.

Returns

The model name if available, "unknown" otherwise.

Note

Plugins should override this property to provide their model information.

prop output_format : OutputFormat
Expand source code
@property
def output_format(self) -> OutputFormat:
    """Return the output format stored in this instance's options."""
    options = self._opts
    return options.output_format
prop reference_id : NotGivenOr[str]
Expand source code
@property
def reference_id(self) -> NotGivenOr[str]:
    """Return the reference ID stored in this instance's options."""
    options = self._opts
    return options.reference_id
prop session : FishAudioSession
Expand source code
@property
def session(self) -> FishAudioSession:
    """Return the session object held in ``self._session``."""
    current_session = self._session
    return current_session

Methods

async def aclose(self) ‑> None
Expand source code
async def aclose(self) -> None:
    """
    Close TTS resources and WebSocket sessions.

    Tracked streams are closed first, one after another. The stream set is
    cleared and the WebSocket session is closed in a ``finally`` block, so
    both still happen when closing a stream raises; that exception then
    propagates to the caller.
    """
    try:
        # Iterate over a snapshot: the collection may change while we await.
        for stream in list(self._streams):
            await stream.aclose()
    finally:
        self._streams.clear()

        # Detach the session before closing it so that a failing close()
        # cannot leave a half-closed session cached on the instance.
        ws_session, self._ws_session = self._ws_session, None
        if ws_session is not None:
            await ws_session.close()

Close TTS resources and WebSocket sessions.

def stream(self,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.fishaudio.tts.SynthesizeStream
Expand source code
def stream(
    self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
) -> SynthesizeStream:
    """
    Open a new real-time (WebSocket) text-to-speech stream.

    The stream is registered in ``self._streams`` before it is returned.

    Args:
        conn_options (APIConnectOptions): Connection options handed to the new stream.

    Returns:
        SynthesizeStream: The newly created streaming object.
    """
    new_stream = SynthesizeStream(tts=self, conn_options=conn_options)
    self._streams.add(new_stream)
    return new_stream

Create a real-time streaming TTS session using WebSocket.

Args

conn_options : APIConnectOptions
Connection options for the WebSocket.

Returns

SynthesizeStream
A streaming object for real-time text-to-speech.
def synthesize(self,
text: str,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.fishaudio.tts.ChunkedStream
Expand source code
def synthesize(
    self,
    text: str,
    *,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> ChunkedStream:
    """
    Build a chunked (non-streaming) synthesis request for the given text.

    Args:
        text (str): Text to turn into speech.
        conn_options (APIConnectOptions): Connection options handed to the chunked stream.

    Returns:
        ChunkedStream: The stream object created for this text.
    """
    chunked = ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
    return chunked

Synthesize speech from text using chunked (non-streaming) mode.

Args

text : str
The text to synthesize.
conn_options : APIConnectOptions
Connection options for the API call.

Returns

ChunkedStream
A stream object that will produce synthesized audio.

Inherited members