Module livekit.plugins.fishaudio
Fish Audio plugin for LiveKit Agents
See https://docs.fish.audio for more information.
Environment variables used:
- FISH_API_KEY for authentication (required)
Classes
class TTS (*,
api_key: NotGivenOr[str] = NOT_GIVEN,
model: TTSModels | str = 's2-pro',
voice_id: NotGivenOr[str] = '933563129e564b19a115bedd57b7406a',
output_format: OutputFormat = 'wav',
sample_rate: NotGivenOr[int] = NOT_GIVEN,
base_url: NotGivenOr[str] = NOT_GIVEN,
latency_mode: LatencyMode = 'balanced',
chunk_length: int = 100,
tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
http_session: aiohttp.ClientSession | None = None)
Expand source code
class TTS(tts.TTS):
    """Fish Audio text-to-speech for LiveKit Agents.

    Advertises streaming capability and hands actual synthesis off to
    ``ChunkedStream`` (one-shot) and ``SynthesizeStream`` (incremental).
    Open streams are tracked in a ``weakref.WeakSet`` so ``aclose`` can
    shut them down without keeping them alive.
    """

    def __init__(
        self,
        *,
        api_key: NotGivenOr[str] = NOT_GIVEN,
        model: TTSModels | str = DEFAULT_MODEL,
        voice_id: NotGivenOr[str] = DEFAULT_VOICE_ID,
        output_format: OutputFormat = "wav",
        sample_rate: NotGivenOr[int] = NOT_GIVEN,
        base_url: NotGivenOr[str] = NOT_GIVEN,
        latency_mode: LatencyMode = "balanced",
        chunk_length: int = 100,
        tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        """
        Create a new instance of Fish Audio TTS.

        See https://docs.fish.audio/api-reference/endpoint/websocket/tts-live
        for more details on the Fish Audio Live TTS WebSocket API.

        Args:
            api_key (NotGivenOr[str]): Fish Audio API key. Reads ``FISH_API_KEY`` if unset.
            model (TTSModels | str): TTS model to use. Defaults to ``"s2-pro"``.
            voice_id (NotGivenOr[str]): Voice model ID. Fish Audio's API refers to this
                as ``reference_id``; it's the same value either way.
            output_format (OutputFormat): Audio output format. Defaults to ``"wav"``.
            sample_rate (NotGivenOr[int]): Audio sample rate in Hz. When not given, a
                per-format default from ``_DEFAULT_SAMPLE_RATE`` is used.
            base_url (NotGivenOr[str]): Custom base URL. Defaults to
                ``https://api.fish.audio``.
            latency_mode (LatencyMode): Streaming latency mode. ``"normal"``,
                ``"balanced"``, or ``"low"``. Defaults to ``"balanced"``.
            chunk_length (int): Upper bound on text Fish buffers before auto-synthesizing
                (100-300). With sentence-level flushing this is only hit by sentences
                longer than ``chunk_length``; otherwise audio is produced when each
                sentence is flushed. Defaults to 100.
            tokenizer (tokenize.SentenceTokenizer): Sentence tokenizer used to detect
                sentence boundaries. Defaults to
                ``tokenize.blingfire.SentenceTokenizer()``.
            http_session (aiohttp.ClientSession | None): Optional aiohttp session.

        Raises:
            ValueError: If ``output_format == "opus"`` with a sample rate other than
                48000 Hz, if no API key is available, or if ``chunk_length`` is
                outside 100-300.
        """
        # Resolve the effective sample rate before calling super().__init__,
        # which needs it. Opus output is pinned to 48 kHz by the API.
        if is_given(sample_rate):
            if output_format == "opus" and sample_rate != 48000:
                raise ValueError(
                    "Fish Audio only supports 48000 Hz for opus output; "
                    f"got sample_rate={sample_rate}"
                )
            resolved_sample_rate = sample_rate
        else:
            resolved_sample_rate = _DEFAULT_SAMPLE_RATE[output_format]

        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=True),
            sample_rate=resolved_sample_rate,
            num_channels=NUM_CHANNELS,
        )

        # Explicit argument wins over the FISH_API_KEY environment variable.
        fish_api_key = api_key if is_given(api_key) else os.getenv("FISH_API_KEY")
        if not fish_api_key:
            raise ValueError(
                "Fish Audio API key is required, either as argument or set "
                "FISH_API_KEY environment variable"
            )

        if not 100 <= chunk_length <= 300:
            raise ValueError("chunk_length must be between 100 and 300")

        self._opts = _TTSOptions(
            model=model,
            output_format=output_format,
            sample_rate=resolved_sample_rate,
            voice_id=voice_id,
            base_url=base_url if is_given(base_url) else DEFAULT_BASE_URL,
            api_key=fish_api_key,
            latency_mode=latency_mode,
            chunk_length=chunk_length,
        )
        # May be None; _ensure_session lazily falls back to the shared
        # http_context session on first use.
        self._session = http_session

        # min_sentence_len=1 emits each sentence as soon as the next one starts,
        # rather than batching short sentences together — minimizes TTFB on the
        # first sentence and keeps Fish synthesizing continuously.
        self._sentence_tokenizer = (
            tokenizer
            if is_given(tokenizer)
            else tokenize.blingfire.SentenceTokenizer(min_sentence_len=1)
        )
        # Weak refs only: streams that callers drop can be garbage collected.
        self._streams = weakref.WeakSet[SynthesizeStream]()

    @property
    def model(self) -> TTSModels | str:
        return self._opts.model

    @property
    def provider(self) -> str:
        return "FishAudio"

    @property
    def output_format(self) -> OutputFormat:
        return self._opts.output_format

    @property
    def voice_id(self) -> NotGivenOr[str]:
        return self._opts.voice_id

    @property
    def latency_mode(self) -> LatencyMode:
        return self._opts.latency_mode

    def _ensure_session(self) -> aiohttp.ClientSession:
        # Lazily borrow the process-wide session if the caller didn't supply one.
        if not self._session:
            self._session = utils.http_context.http_session()
        return self._session

    def update_options(
        self,
        *,
        model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
        voice_id: NotGivenOr[str] = NOT_GIVEN,
        latency_mode: NotGivenOr[LatencyMode] = NOT_GIVEN,
        chunk_length: NotGivenOr[int] = NOT_GIVEN,
    ) -> None:
        """Update TTS options in place; only explicitly given values change.

        Raises:
            ValueError: If ``chunk_length`` is given and outside 100-300
                (same bound enforced by ``__init__``).
        """
        if is_given(model):
            self._opts.model = model
        if is_given(voice_id):
            self._opts.voice_id = voice_id
        if is_given(latency_mode):
            self._opts.latency_mode = latency_mode
        if is_given(chunk_length):
            if not 100 <= chunk_length <= 300:
                raise ValueError("chunk_length must be between 100 and 300")
            self._opts.chunk_length = chunk_length

    def synthesize(
        self,
        text: str,
        *,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> ChunkedStream:
        """One-shot synthesis of ``text``; returns a ChunkedStream of audio."""
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

    def stream(
        self,
        *,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> SynthesizeStream:
        """Open an incremental synthesis stream and register it for aclose()."""
        stream = SynthesizeStream(tts=self, conn_options=conn_options)
        self._streams.add(stream)
        return stream

    async def aclose(self) -> None:
        """Close all live streams created by this instance."""
        # Snapshot the WeakSet — aclose() mutates it as streams die.
        for stream in list(self._streams):
            await stream.aclose()
        self._streams.clear()
Create a new instance of Fish Audio TTS.
See https://docs.fish.audio/api-reference/endpoint/websocket/tts-live for more details on the Fish Audio Live TTS WebSocket API.
Args
api_key (NotGivenOr[str]) — Fish Audio API key. Reads FISH_API_KEY if unset.
model (TTSModels | str) — TTS model to use. Defaults to "s2-pro".
voice_id (NotGivenOr[str]) — Voice model ID. Fish Audio's API refers to this as reference_id; it's the same value either way.
output_format (OutputFormat) — Audio output format. Defaults to "wav".
sample_rate (NotGivenOr[int]) — Audio sample rate in Hz.
base_url (NotGivenOr[str]) — Custom base URL. Defaults to https://api.fish.audio.
latency_mode (LatencyMode) — Streaming latency mode: "normal", "balanced", or "low". Defaults to "balanced".
chunk_length (int) — Upper bound on text Fish buffers before auto-synthesizing (100–300). With sentence-level flushing this is only hit by sentences longer than chunk_length; otherwise audio is produced when each sentence is flushed. Defaults to 100.
tokenizer (tokenize.SentenceTokenizer) — Sentence tokenizer used to detect sentence boundaries. Defaults to tokenize.blingfire.SentenceTokenizer().
http_session (aiohttp.ClientSession | None) — Optional aiohttp session.
Ancestors
- livekit.agents.tts.tts.TTS
- abc.ABC
- EventEmitter
- typing.Generic
Instance variables
prop latency_mode : LatencyMode-
Expand source code
@property def latency_mode(self) -> LatencyMode: return self._opts.latency_mode prop model : TTSModels | str-
Expand source code
@property def model(self) -> TTSModels | str: return self._opts.modelGet the model name/identifier for this TTS instance.
Returns
The model name if available, "unknown" otherwise.
Note
Plugins should override this property to provide their model information.
prop output_format : OutputFormat-
Expand source code
@property def output_format(self) -> OutputFormat: return self._opts.output_format prop provider : str-
Expand source code
@property def provider(self) -> str: return "FishAudio"Get the provider name/identifier for this TTS instance.
Returns
The provider name if available, "unknown" otherwise.
Note
Plugins should override this property to provide their provider information.
prop voice_id : NotGivenOr[str]-
Expand source code
@property def voice_id(self) -> NotGivenOr[str]: return self._opts.voice_id
Methods
async def aclose(self) ‑> None-
Expand source code
async def aclose(self) -> None: for stream in list(self._streams): await stream.aclose() self._streams.clear() def stream(self,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.fishaudio.tts.SynthesizeStream-
Expand source code
def stream( self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS, ) -> SynthesizeStream: stream = SynthesizeStream(tts=self, conn_options=conn_options) self._streams.add(stream) return stream def synthesize(self,
text: str,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.plugins.fishaudio.tts.ChunkedStream-
Expand source code
def synthesize( self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS, ) -> ChunkedStream: return ChunkedStream(tts=self, input_text=text, conn_options=conn_options) def update_options(self,
*,
model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
voice_id: NotGivenOr[str] = NOT_GIVEN,
latency_mode: NotGivenOr[LatencyMode] = NOT_GIVEN,
chunk_length: NotGivenOr[int] = NOT_GIVEN) ‑> None-
Expand source code
def update_options( self, *, model: NotGivenOr[TTSModels | str] = NOT_GIVEN, voice_id: NotGivenOr[str] = NOT_GIVEN, latency_mode: NotGivenOr[LatencyMode] = NOT_GIVEN, chunk_length: NotGivenOr[int] = NOT_GIVEN, ) -> None: if is_given(model): self._opts.model = model if is_given(voice_id): self._opts.voice_id = voice_id if is_given(latency_mode): self._opts.latency_mode = latency_mode if is_given(chunk_length): if not 100 <= chunk_length <= 300: raise ValueError("chunk_length must be between 100 and 300") self._opts.chunk_length = chunk_length
Inherited members