Module livekit.plugins.fishaudio
Fish Audio plugin for LiveKit Agents
See https://docs.fish.audio for more information.
Environment variables used:
- FISH_API_KEY for authentication (required)
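For example, a minimal setup only needs the key in the environment; everything else falls back to the documented defaults (a sketch, with a placeholder key value):

    # Minimal usage sketch: authenticate via FISH_API_KEY rather than passing
    # api_key explicitly. The key value below is a placeholder.
    import os

    from livekit.plugins import fishaudio

    os.environ.setdefault("FISH_API_KEY", "<your-fish-audio-api-key>")

    tts = fishaudio.TTS()  # defaults: model "s1", 24000 Hz, mono, PCM output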
Classes
class TTS (*,
api_key: NotGivenOr[str] = NOT_GIVEN,
model: Backends = 's1',
reference_id: NotGivenOr[str] = '8ef4a238714b45718ce04243307c57a7',
output_format: OutputFormat = 'pcm',
sample_rate: int = 24000,
num_channels: int = 1,
base_url: NotGivenOr[str] = NOT_GIVEN,
latency_mode: LatencyMode = 'balanced')
    class TTS(tts.TTS):
        """
        Fish Audio TTS implementation for LiveKit Agents.

        This plugin provides text-to-speech synthesis using Fish Audio's API.
        It supports both chunked (non-streaming) and real-time WebSocket streaming modes,
        as well as reference ID-based and custom reference audio-based synthesis.

        Args:
            api_key (NotGivenOr[str]): Fish Audio API key. Can be set via argument
                or `FISH_API_KEY` environment variable.
            model (Backends): TTS model/backend to use. Defaults to "s1".
            reference_id (NotGivenOr[str]): Reference voice model ID. Defaults to a
                general-purpose voice.
            output_format (OutputFormat): Audio output format. Defaults to "pcm" for streaming.
            sample_rate (int): Audio sample rate in Hz. Defaults to 24000.
            num_channels (int): Number of audio channels. Defaults to 1 (mono).
            base_url (NotGivenOr[str]): Custom base URL for the Fish Audio API. Optional.
            latency_mode (LatencyMode): Streaming latency mode for WebSocket.
                "normal" (~500ms) or "balanced" (~300ms). Defaults to "balanced".
        """

        def __init__(
            self,
            *,
            api_key: NotGivenOr[str] = NOT_GIVEN,
            model: Backends = DEFAULT_MODEL,
            reference_id: NotGivenOr[str] = DEFAULT_REFERENCE_ID,
            output_format: OutputFormat = "pcm",
            sample_rate: int = 24000,
            num_channels: int = 1,
            base_url: NotGivenOr[str] = NOT_GIVEN,
            latency_mode: LatencyMode = "balanced",
        ) -> None:
            super().__init__(
                capabilities=tts.TTSCapabilities(streaming=True),
                sample_rate=sample_rate,
                num_channels=num_channels,
            )

            fish_api_key = api_key if is_given(api_key) else os.getenv("FISH_API_KEY")
            if not fish_api_key:
                raise ValueError(
                    "Fish Audio API key is required, either as argument or set FISH_API_KEY environment variable"
                )

            self._opts = _TTSOptions(
                model=model,
                output_format=output_format,
                sample_rate=sample_rate,
                num_channels=num_channels,
                reference_id=reference_id,
                base_url=base_url if is_given(base_url) else "https://api.fish.audio",
                api_key=fish_api_key,
                latency_mode=latency_mode,
            )

            # Initialize Fish Audio sessions
            self._session = FishAudioSession(self._opts.api_key, base_url=self._opts.base_url)

            # WebSocket session for streaming (lazy initialized)
            self._ws_session: AsyncWebSocketSession | None = None
            self._ws_session_lock = asyncio.Lock()

            # Track active streams
            self._streams = weakref.WeakSet[SynthesizeStream]()

            logger.info(
                "FishAudioTTS initialized",
                extra={
                    "model": self._opts.model,
                    "format": self._opts.output_format,
                    "sample_rate": self._opts.sample_rate,
                    "latency_mode": self._opts.latency_mode,
                },
            )

        @property
        def model(self) -> Backends:
            return self._opts.model

        @property
        def output_format(self) -> OutputFormat:
            return self._opts.output_format

        @property
        def reference_id(self) -> NotGivenOr[str]:
            return self._opts.reference_id

        @property
        def session(self) -> FishAudioSession:
            return self._session

        @property
        def latency_mode(self) -> LatencyMode:
            return self._opts.latency_mode

        async def _ensure_ws_session(self) -> AsyncWebSocketSession:
            """Get the current WebSocket session, creating one if needed"""
            async with self._ws_session_lock:
                if self._ws_session is None:
                    self._ws_session = AsyncWebSocketSession(
                        apikey=self._opts.api_key, base_url=self._opts.base_url
                    )
                return self._ws_session

        def synthesize(
            self,
            text: str,
            *,
            conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
        ) -> ChunkedStream:
            """
            Synthesize speech from text using chunked (non-streaming) mode.

            Args:
                text (str): The text to synthesize.
                conn_options (APIConnectOptions): Connection options for the API call.

            Returns:
                ChunkedStream: A stream object that will produce synthesized audio.
            """
            return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

        def stream(
            self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
        ) -> SynthesizeStream:
            """
            Create a real-time streaming TTS session using WebSocket.

            Args:
                conn_options (APIConnectOptions): Connection options for the WebSocket.

            Returns:
                SynthesizeStream: A streaming object for real-time text-to-speech.
            """
            stream = SynthesizeStream(tts=self, conn_options=conn_options)
            self._streams.add(stream)
            return stream

        async def aclose(self) -> None:
            """
            Close TTS resources and WebSocket sessions.
            """
            for stream in list(self._streams):
                await stream.aclose()
            self._streams.clear()

            if self._ws_session is not None:
                await self._ws_session.close()
                self._ws_session = None

Fish Audio TTS implementation for LiveKit Agents.
This plugin provides text-to-speech synthesis using Fish Audio's API. It supports both chunked (non-streaming) and real-time WebSocket streaming modes, as well as reference ID-based and custom reference audio-based synthesis.
Args
- api_key : NotGivenOr[str] - Fish Audio API key. Can be set via argument or the FISH_API_KEY environment variable.
- model : Backends - TTS model/backend to use. Defaults to "s1".
- reference_id : NotGivenOr[str] - Reference voice model ID. Defaults to a general-purpose voice.
- output_format : OutputFormat - Audio output format. Defaults to "pcm" for streaming.
- sample_rate : int - Audio sample rate in Hz. Defaults to 24000.
- num_channels : int - Number of audio channels. Defaults to 1 (mono).
- base_url : NotGivenOr[str] - Custom base URL for the Fish Audio API. Optional.
- latency_mode : LatencyMode - Streaming latency mode for WebSocket. "normal" (~500ms) or "balanced" (~300ms). Defaults to "balanced".
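As a construction sketch, the same options can be set explicitly; the API key and reference ID below are placeholders rather than real values:

    from livekit.plugins import fishaudio

    tts = fishaudio.TTS(
        api_key="<your-fish-audio-api-key>",   # or omit and rely on FISH_API_KEY
        model="s1",
        reference_id="<your-voice-model-id>",  # placeholder; omit to use the default voice
        output_format="pcm",
        sample_rate=24000,
        num_channels=1,
        latency_mode="normal",                 # ~500ms; the default "balanced" is ~300ms
    )

If api_key is not given, the constructor falls back to the FISH_API_KEY environment variable and raises ValueError when neither is set.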
Ancestors
- livekit.agents.tts.tts.TTS
- abc.ABC
- EventEmitter
- typing.Generic
Instance variables
prop latency_mode : LatencyMode
    @property
    def latency_mode(self) -> LatencyMode:
        return self._opts.latency_mode

prop model : Backends
    @property
    def model(self) -> Backends:
        return self._opts.model

Get the model name/identifier for this TTS instance.
Returns
The model name if available, "unknown" otherwise.
Note
Plugins should override this property to provide their model information.
prop output_format : OutputFormat
    @property
    def output_format(self) -> OutputFormat:
        return self._opts.output_format

prop reference_id : NotGivenOr[str]
    @property
    def reference_id(self) -> NotGivenOr[str]:
        return self._opts.reference_id

prop session : FishAudioSession
    @property
    def session(self) -> FishAudioSession:
        return self._session
Methods
async def aclose(self) -> None
    async def aclose(self) -> None:
        """
        Close TTS resources and WebSocket sessions.
        """
        for stream in list(self._streams):
            await stream.aclose()
        self._streams.clear()

        if self._ws_session is not None:
            await self._ws_session.close()
            self._ws_session = None

Close TTS resources and WebSocket sessions.
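A cleanup sketch, assuming the TTS instance is owned by the caller and torn down when the agent shuts down:

    import asyncio

    from livekit.plugins import fishaudio

    async def main() -> None:
        tts = fishaudio.TTS()
        try:
            ...  # call tts.synthesize(...) / tts.stream() here
        finally:
            await tts.aclose()  # closes active streams and the shared WebSocket session

    asyncio.run(main())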
def stream(self,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) -> livekit.plugins.fishaudio.tts.SynthesizeStream
    def stream(
        self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> SynthesizeStream:
        """
        Create a real-time streaming TTS session using WebSocket.

        Args:
            conn_options (APIConnectOptions): Connection options for the WebSocket.

        Returns:
            SynthesizeStream: A streaming object for real-time text-to-speech.
        """
        stream = SynthesizeStream(tts=self, conn_options=conn_options)
        self._streams.add(stream)
        return stream

Create a real-time streaming TTS session using WebSocket.
Args
- conn_options : APIConnectOptions - Connection options for the WebSocket.
Returns
SynthesizeStream - A streaming object for real-time text-to-speech.
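A streaming sketch; the push_text()/end_input()/async-iteration interface is assumed from the generic livekit-agents SynthesizeStream base class rather than documented above:

    from livekit.plugins import fishaudio

    async def speak_streamed(tts: fishaudio.TTS) -> None:
        stream = tts.stream()
        stream.push_text("Hello from Fish Audio, ")
        stream.push_text("synthesized over a WebSocket.")
        stream.end_input()  # signal that no more text will be pushed

        async for audio in stream:
            ...  # audio.frame is a 24 kHz mono PCM frame; forward it to an audio source

        await stream.aclose()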
def synthesize(self,
text: str,
*,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) -> livekit.plugins.fishaudio.tts.ChunkedStream
    def synthesize(
        self,
        text: str,
        *,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> ChunkedStream:
        """
        Synthesize speech from text using chunked (non-streaming) mode.

        Args:
            text (str): The text to synthesize.
            conn_options (APIConnectOptions): Connection options for the API call.

        Returns:
            ChunkedStream: A stream object that will produce synthesized audio.
        """
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

Synthesize speech from text using chunked (non-streaming) mode.
Args
- text : str - The text to synthesize.
- conn_options : APIConnectOptions - Connection options for the API call.
Returns
ChunkedStream - A stream object that will produce synthesized audio.
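A chunked-mode sketch; iterating the returned ChunkedStream for audio frames is assumed from the generic livekit-agents TTS interface rather than documented above:

    from livekit.plugins import fishaudio

    async def speak_once(tts: fishaudio.TTS, text: str) -> None:
        async for audio in tts.synthesize(text):
            ...  # each audio.frame is a chunk of 24 kHz mono PCM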
Inherited members