Module livekit.plugins.fishaudio

Fish Audio plugin for LiveKit Agents

See https://docs.fish.audio for more information.

Environment variables used: ``FISH_API_KEY`` — Fish Audio API key used for authentication (required unless an API key is passed explicitly).

Classes

class TTS (*,
api_key: NotGivenOr[str] = NOT_GIVEN,
model: TTSModels | str = 's2-pro',
voice_id: NotGivenOr[str] = '933563129e564b19a115bedd57b7406a',
output_format: OutputFormat = 'wav',
sample_rate: NotGivenOr[int] = NOT_GIVEN,
base_url: NotGivenOr[str] = NOT_GIVEN,
latency_mode: LatencyMode = 'balanced',
chunk_length: int = 100,
tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
http_session: aiohttp.ClientSession | None = None)
Expand source code
class TTS(tts.TTS):
    def __init__(
        self,
        *,
        api_key: NotGivenOr[str] = NOT_GIVEN,
        model: TTSModels | str = DEFAULT_MODEL,
        voice_id: NotGivenOr[str] = DEFAULT_VOICE_ID,
        output_format: OutputFormat = "wav",
        sample_rate: NotGivenOr[int] = NOT_GIVEN,
        base_url: NotGivenOr[str] = NOT_GIVEN,
        latency_mode: LatencyMode = "balanced",
        chunk_length: int = 100,
        tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        """
        Create a new instance of Fish Audio TTS.

        See https://docs.fish.audio/api-reference/endpoint/websocket/tts-live for more details
        on the Fish Audio Live TTS WebSocket API.

        Args:
            api_key (NotGivenOr[str]): Fish Audio API key. Reads ``FISH_API_KEY`` if unset.
            model (TTSModels | str): TTS model to use. Defaults to ``"s2-pro"``.
            voice_id (NotGivenOr[str]): Voice model ID. Fish Audio's API refers to this
                as ``reference_id``; it's the same value either way.
            output_format (OutputFormat): Audio output format. Defaults to ``"wav"``.
            sample_rate (NotGivenOr[int]): Audio sample rate in Hz. Defaults to a
                format-specific rate; opus output only supports 48000 Hz.
            base_url (NotGivenOr[str]): Custom base URL. Defaults to ``https://api.fish.audio``.
            latency_mode (LatencyMode): Streaming latency mode. ``"normal"``, ``"balanced"``,
                or ``"low"``. Defaults to ``"balanced"``.
            chunk_length (int): Upper bound on text Fish buffers before auto-synthesizing
                (100–300). With sentence-level flushing this is only hit by sentences longer
                than ``chunk_length``; otherwise audio is produced when each sentence is
                flushed. Defaults to 100.
            tokenizer (tokenize.SentenceTokenizer): Sentence tokenizer used to detect
                sentence boundaries. Defaults to
                ``tokenize.blingfire.SentenceTokenizer(min_sentence_len=1)``.
            http_session (aiohttp.ClientSession | None): Optional aiohttp session.

        Raises:
            ValueError: If ``chunk_length`` is outside 100–300, if ``sample_rate`` is
                incompatible with ``output_format``, or if no API key is available.
        """
        # Validate plain-value arguments up front, before any base-class setup.
        self._validate_chunk_length(chunk_length)

        if is_given(sample_rate):
            # Fish Audio's opus stream is fixed at 48 kHz; reject mismatches early
            # rather than letting the server produce audio at an unexpected rate.
            if output_format == "opus" and sample_rate != 48000:
                raise ValueError(
                    "Fish Audio only supports 48000 Hz for opus output; "
                    f"got sample_rate={sample_rate}"
                )
            resolved_sample_rate = sample_rate
        else:
            resolved_sample_rate = _DEFAULT_SAMPLE_RATE[output_format]

        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=True),
            sample_rate=resolved_sample_rate,
            num_channels=NUM_CHANNELS,
        )

        fish_api_key = api_key if is_given(api_key) else os.getenv("FISH_API_KEY")
        if not fish_api_key:
            raise ValueError(
                "Fish Audio API key is required, either as argument or set "
                "FISH_API_KEY environment variable"
            )

        self._opts = _TTSOptions(
            model=model,
            output_format=output_format,
            sample_rate=resolved_sample_rate,
            voice_id=voice_id,
            base_url=base_url if is_given(base_url) else DEFAULT_BASE_URL,
            api_key=fish_api_key,
            latency_mode=latency_mode,
            chunk_length=chunk_length,
        )

        self._session = http_session
        # min_sentence_len=1 emits each sentence as soon as the next one starts,
        # rather than batching short sentences together — minimizes TTFB on the
        # first sentence and keeps Fish synthesizing continuously.
        self._sentence_tokenizer = (
            tokenizer
            if is_given(tokenizer)
            else tokenize.blingfire.SentenceTokenizer(min_sentence_len=1)
        )
        # Weak set so closed/garbage-collected streams don't accumulate here.
        self._streams = weakref.WeakSet[SynthesizeStream]()

    @staticmethod
    def _validate_chunk_length(chunk_length: int) -> None:
        """Raise ``ValueError`` unless *chunk_length* is within Fish Audio's 100–300 range."""
        if not 100 <= chunk_length <= 300:
            raise ValueError("chunk_length must be between 100 and 300")

    @property
    def model(self) -> TTSModels | str:
        return self._opts.model

    @property
    def provider(self) -> str:
        return "FishAudio"

    @property
    def output_format(self) -> OutputFormat:
        return self._opts.output_format

    @property
    def voice_id(self) -> NotGivenOr[str]:
        return self._opts.voice_id

    @property
    def latency_mode(self) -> LatencyMode:
        return self._opts.latency_mode

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the configured aiohttp session, creating one lazily from the
        shared http context if none was supplied at construction time."""
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    def update_options(
        self,
        *,
        model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
        voice_id: NotGivenOr[str] = NOT_GIVEN,
        latency_mode: NotGivenOr[LatencyMode] = NOT_GIVEN,
        chunk_length: NotGivenOr[int] = NOT_GIVEN,
    ) -> None:
        """Update TTS options in place; only explicitly given values are changed.

        Raises:
            ValueError: If ``chunk_length`` is given but outside 100–300.
        """
        if is_given(model):
            self._opts.model = model
        if is_given(voice_id):
            self._opts.voice_id = voice_id
        if is_given(latency_mode):
            self._opts.latency_mode = latency_mode
        if is_given(chunk_length):
            self._validate_chunk_length(chunk_length)
            self._opts.chunk_length = chunk_length

    def synthesize(
        self,
        text: str,
        *,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> ChunkedStream:
        """Synthesize *text* in one shot, returning a chunked audio stream."""
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

    def stream(
        self,
        *,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> SynthesizeStream:
        """Open an incremental text-to-speech stream; tracked so aclose() can clean it up."""
        stream = SynthesizeStream(tts=self, conn_options=conn_options)
        self._streams.add(stream)
        return stream

    async def aclose(self) -> None:
        """Close all live streams created by this instance."""
        for stream in list(self._streams):
            await stream.aclose()
        self._streams.clear()

Fish Audio text-to-speech implementation for LiveKit Agents, streaming synthesized audio via the Fish Audio WebSocket API.

Create a new instance of Fish Audio TTS.

See https://docs.fish.audio/api-reference/endpoint/websocket/tts-live for more details on the Fish Audio Live TTS WebSocket API.

Args

api_key : NotGivenOr[str]
Fish Audio API key. Reads FISH_API_KEY if unset.
model : TTSModels | str
TTS model to use. Defaults to "s2-pro".
voice_id : NotGivenOr[str]
Voice model ID. Fish Audio's API refers to this as reference_id; it's the same value either way.
output_format : OutputFormat
Audio output format. Defaults to "wav".
sample_rate : NotGivenOr[int]
Audio sample rate in Hz. Defaults to a format-specific rate; opus output only supports 48000 Hz.
base_url : NotGivenOr[str]
Custom base URL. Defaults to https://api.fish.audio.
latency_mode : LatencyMode
Streaming latency mode. "normal", "balanced", or "low". Defaults to "balanced".
chunk_length : int
Upper bound on text Fish buffers before auto-synthesizing (100–300). With sentence-level flushing this is only hit by sentences longer than chunk_length; otherwise audio is produced when each sentence is flushed. Defaults to 100.
tokenizer : tokenize.SentenceTokenizer
Sentence tokenizer used to detect sentence boundaries. Defaults to tokenize.blingfire.SentenceTokenizer(min_sentence_len=1).
http_session : aiohttp.ClientSession | None
Optional aiohttp session.

Ancestors

  • livekit.agents.tts.tts.TTS
  • abc.ABC
  • EventEmitter
  • typing.Generic

Instance variables

prop latency_mode : LatencyMode
Expand source code
@property
def latency_mode(self) -> LatencyMode:
    return self._opts.latency_mode
prop model : TTSModels | str
Expand source code
@property
def model(self) -> TTSModels | str:
    return self._opts.model

Get the model name/identifier for this TTS instance.

Returns

The model name if available, "unknown" otherwise.

Note

Plugins should override this property to provide their model information.

prop output_format : OutputFormat
Expand source code
@property
def output_format(self) -> OutputFormat:
    return self._opts.output_format
prop provider : str
Expand source code
@property
def provider(self) -> str:
    return "FishAudio"

Get the provider name/identifier for this TTS instance.

Returns

The provider name if available, "unknown" otherwise.

Note

Plugins should override this property to provide their provider information.

prop voice_id : NotGivenOr[str]
Expand source code
@property
def voice_id(self) -> NotGivenOr[str]:
    return self._opts.voice_id

Methods

async def aclose(self) ‑> None
Expand source code
async def aclose(self) -> None:
    for stream in list(self._streams):
        await stream.aclose()
    self._streams.clear()
def stream(self,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.fishaudio.tts.SynthesizeStream
Expand source code
def stream(
    self,
    *,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> SynthesizeStream:
    stream = SynthesizeStream(tts=self, conn_options=conn_options)
    self._streams.add(stream)
    return stream
def synthesize(self,
text: str,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.fishaudio.tts.ChunkedStream
Expand source code
def synthesize(
    self,
    text: str,
    *,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> ChunkedStream:
    return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
def update_options(self,
*,
model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
voice_id: NotGivenOr[str] = NOT_GIVEN,
latency_mode: NotGivenOr[LatencyMode] = NOT_GIVEN,
chunk_length: NotGivenOr[int] = NOT_GIVEN) ‑> None
Expand source code
def update_options(
    self,
    *,
    model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
    voice_id: NotGivenOr[str] = NOT_GIVEN,
    latency_mode: NotGivenOr[LatencyMode] = NOT_GIVEN,
    chunk_length: NotGivenOr[int] = NOT_GIVEN,
) -> None:
    if is_given(model):
        self._opts.model = model
    if is_given(voice_id):
        self._opts.voice_id = voice_id
    if is_given(latency_mode):
        self._opts.latency_mode = latency_mode
    if is_given(chunk_length):
        if not 100 <= chunk_length <= 300:
            raise ValueError("chunk_length must be between 100 and 300")
        self._opts.chunk_length = chunk_length

Inherited members