Module livekit.plugins.speechmatics

Speechmatics STT plugin for LiveKit Agents

See https://docs.livekit.io/agents/integrations/stt/speechmatics/ for more information.
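The sketch below shows one way to wire the plugin into an agent pipeline. It is a minimal, illustrative example: it assumes a recent LiveKit Agents release with the AgentSession API, that SPEECHMATICS_API_KEY is set in the environment, and it omits the other pipeline components (LLM, TTS, VAD).

from livekit.agents import AgentSession
from livekit.plugins import speechmatics

# Minimal wiring sketch: pass the Speechmatics STT plugin into an AgentSession.
# Assumes SPEECHMATICS_API_KEY is set; other components (llm, tts, vad) omitted.
session = AgentSession(
    stt=speechmatics.STT(
        enable_diarization=True,               # attribute words to unique speakers
        end_of_utterance_silence_trigger=0.5,  # end-of-turn delay in seconds
    ),
)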

Classes

class AdditionalVocabEntry (content: str, sounds_like: list[str] = <factory>)
@dataclass
class AdditionalVocabEntry:
    """Additional vocabulary entry.

    Attributes:
        content: The word to add to the dictionary.
        sounds_like: Similar words to the word.
    """

    content: str
    sounds_like: list[str] = field(default_factory=list)

Additional vocabulary entry.

Attributes

content
The word to add to the dictionary.
sounds_like
Similar words to the word.

Instance variables

var content : str
var sounds_like : list[str]
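
A brief, illustrative sketch of supplying custom vocabulary to the plugin; the words and the STT construction below are examples only and assume SPEECHMATICS_API_KEY is set.

from livekit.plugins.speechmatics import STT, AdditionalVocabEntry

vocab = [
    AdditionalVocabEntry(content="Speechmatics", sounds_like=["speech matics"]),
    AdditionalVocabEntry(content="LiveKit"),  # sounds_like defaults to []
]

stt = STT(additional_vocab=vocab)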
class AudioSettings (encoding: str = 'pcm_s16le', sample_rate: int = 16000)
@dataclass
class AudioSettings:
    """Real-time: Defines audio parameters."""

    encoding: str = "pcm_s16le"
    """Encoding format when raw audio is used. Allowed values are
    `pcm_f32le`, `pcm_s16le` and `mulaw`."""

    sample_rate: int = 16000
    """Sampling rate in hertz."""

Real-time: Defines audio parameters.

Instance variables

var encoding : str

Encoding format when raw audio is used. Allowed values are pcm_f32le, pcm_s16le and mulaw.

var sample_rate : int

Sampling rate in hertz.
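
A minimal construction sketch with the default values shown explicitly. Note that passing this object to STT via the deprecated audio_settings argument is discouraged in favour of the sample_rate and audio_encoding arguments.

from livekit.plugins.speechmatics import AudioSettings

# Defaults shown explicitly; prefer STT(sample_rate=..., audio_encoding=...)
# over the deprecated audio_settings argument.
settings = AudioSettings(encoding="pcm_s16le", sample_rate=16000)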

class DiarizationFocusMode (*args, **kwds)
class DiarizationFocusMode(str, Enum):
    """Speaker focus mode for diarization."""

    RETAIN = "retain"
    IGNORE = "ignore"

Speaker focus mode for diarization.

Ancestors

  • builtins.str
  • enum.Enum

Class variables

var IGNORE
var RETAIN
class DiarizationKnownSpeaker (label: str, speaker_identifiers: list[str])
@dataclass
class DiarizationKnownSpeaker:
    """Known speakers for speaker diarization.

    Attributes:
        label: The label of the speaker.
        speaker_identifiers: One or more data strings for the speaker.
    """

    label: str
    speaker_identifiers: list[str]

Known speakers for speaker diarization.

Attributes

label
The label of the speaker.
speaker_identifiers
One or more data strings for the speaker.

Instance variables

var label : str
var speaker_identifiers : list[str]
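
An illustrative sketch of enrolling known speakers. The label and identifier strings are placeholders; identifiers would come from a previous STT session on the same Speechmatics account.

from livekit.plugins.speechmatics import STT, DiarizationKnownSpeaker

known = [
    DiarizationKnownSpeaker(
        label="customer",
        speaker_identifiers=["<identifier captured from a previous session>"],
    ),
]

stt = STT(enable_diarization=True, known_speakers=known)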
class EndOfUtteranceMode (*args, **kwds)
class EndOfUtteranceMode(str, Enum):
    """End of turn delay options for transcription."""

    NONE = "none"
    FIXED = "fixed"
    ADAPTIVE = "adaptive"

End of turn delay options for transcription.

Ancestors

  • builtins.str
  • enum.Enum

Class variables

var ADAPTIVE
var FIXED
var NONE
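
For example, a short sketch of selecting adaptive end-of-turn handling on the STT; the trigger value is illustrative.

from livekit.plugins.speechmatics import STT, EndOfUtteranceMode

stt = STT(
    end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE,
    end_of_utterance_silence_trigger=0.4,  # must remain below max_delay
)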
class STT (*,
api_key: NotGivenOr[str] = NOT_GIVEN,
base_url: NotGivenOr[str] = NOT_GIVEN,
operating_point: OperatingPoint = OperatingPoint.ENHANCED,
domain: NotGivenOr[str] = NOT_GIVEN,
language: NotGivenOr[str] = NOT_GIVEN,
output_locale: NotGivenOr[str] = NOT_GIVEN,
enable_partials: bool = True,
enable_diarization: bool = False,
max_delay: float = 1.0,
end_of_utterance_silence_trigger: float = 0.5,
end_of_utterance_mode: EndOfUtteranceMode = EndOfUtteranceMode.FIXED,
additional_vocab: NotGivenOr[list[AdditionalVocabEntry]] = NOT_GIVEN,
punctuation_overrides: NotGivenOr[dict] = NOT_GIVEN,
diarization_sensitivity: float = 0.5,
speaker_active_format: str = '{text}',
speaker_passive_format: str = '{text}',
prefer_current_speaker: bool = False,
focus_speakers: NotGivenOr[list[str]] = NOT_GIVEN,
ignore_speakers: NotGivenOr[list[str]] = NOT_GIVEN,
focus_mode: DiarizationFocusMode = DiarizationFocusMode.RETAIN,
known_speakers: NotGivenOr[list[DiarizationKnownSpeaker]] = NOT_GIVEN,
sample_rate: int = 16000,
chunk_size: int = 160,
audio_encoding: AudioEncoding = AudioEncoding.PCM_S16LE,
transcription_config: NotGivenOr[TranscriptionConfig] = NOT_GIVEN,
audio_settings: NotGivenOr[AudioSettings] = NOT_GIVEN,
http_session: NotGivenOr[aiohttp.ClientSession] = NOT_GIVEN)
class STT(stt.STT):
    def __init__(
        self,
        *,
        api_key: NotGivenOr[str] = NOT_GIVEN,
        base_url: NotGivenOr[str] = NOT_GIVEN,
        operating_point: OperatingPoint = OperatingPoint.ENHANCED,
        domain: NotGivenOr[str] = NOT_GIVEN,
        language: NotGivenOr[str] = NOT_GIVEN,
        output_locale: NotGivenOr[str] = NOT_GIVEN,
        enable_partials: bool = True,
        enable_diarization: bool = False,
        max_delay: float = 1.0,
        end_of_utterance_silence_trigger: float = 0.5,
        end_of_utterance_mode: EndOfUtteranceMode = EndOfUtteranceMode.FIXED,
        additional_vocab: NotGivenOr[list[AdditionalVocabEntry]] = NOT_GIVEN,
        punctuation_overrides: NotGivenOr[dict] = NOT_GIVEN,
        diarization_sensitivity: float = 0.5,
        speaker_active_format: str = "{text}",
        speaker_passive_format: str = "{text}",
        prefer_current_speaker: bool = False,
        focus_speakers: NotGivenOr[list[str]] = NOT_GIVEN,
        ignore_speakers: NotGivenOr[list[str]] = NOT_GIVEN,
        focus_mode: DiarizationFocusMode = DiarizationFocusMode.RETAIN,
        known_speakers: NotGivenOr[list[DiarizationKnownSpeaker]] = NOT_GIVEN,
        sample_rate: int = 16000,
        chunk_size: int = 160,
        audio_encoding: AudioEncoding = AudioEncoding.PCM_S16LE,
        transcription_config: NotGivenOr[TranscriptionConfig] = NOT_GIVEN,  # Deprecated
        audio_settings: NotGivenOr[AudioSettings] = NOT_GIVEN,  # Deprecated
        http_session: NotGivenOr[aiohttp.ClientSession] = NOT_GIVEN,
    ):
        """
        Create a new instance of Speechmatics STT.

        Args:
            api_key (str): Speechmatics API key. Can be set via `api_key` argument
                or `SPEECHMATICS_API_KEY` environment variable

            base_url (str): Custom base URL for the API. Can be set via `base_url`
                argument or `SPEECHMATICS_RT_URL` environment variable. Optional.

            operating_point (OperatingPoint): Operating point for transcription accuracy
                vs. latency tradeoff. It is recommended to use OperatingPoint.ENHANCED
                for most use cases. Defaults to OperatingPoint.ENHANCED.

            domain (str): Domain to use. Optional.

            language (str): Language code for the STT model. Defaults to `en`. Optional.

            output_locale (str): Output locale for the STT model, e.g. `en-GB`. Optional.

            enable_partials (bool): Enable partial transcriptions. When enabled, the STT
                engine will emit `INTERIM_TRANSCRIPT` events - useful for the visualisation
                of real-time transcription. Defaults to True.

            enable_diarization (bool): Enable speaker diarization. When enabled, the STT
                engine will determine and attribute words to unique speakers. The
                speaker_sensitivity parameter can be used to adjust the sensitivity of
                diarization. Defaults to False.

            max_delay (float): Maximum delay in seconds for transcription. This forces the
                STT engine to speed up the processing of transcribed words and reduces the
                interval between partial and final results. Lower values can have an impact on
                accuracy. Defaults to 1.0.

            end_of_utterance_silence_trigger (float): Maximum delay in seconds for end of
                utterance trigger. The delay is used to wait for any further transcribed
                words before emitting the `FINAL_TRANSCRIPT` events. The value must be
                lower than `max_delay`. Defaults to 0.5.

            end_of_utterance_mode (EndOfUtteranceMode): End of utterance delay mode. When
                ADAPTIVE is used, the delay is adjusted based on the content of what the most
                recent speaker has said, such as their rate of speech and whether they have any
                pauses or disfluencies. When FIXED is used, the delay is fixed to the value of
                `end_of_utterance_silence_trigger`. Use of NONE disables end of utterance detection and
                uses a fallback timer. Defaults to `EndOfUtteranceMode.FIXED`.

            additional_vocab (list[AdditionalVocabEntry]): List of additional vocabulary entries.
                If you supply a list of additional vocabulary entries, this will increase the
                weight of those words in the vocabulary and help the STT engine to better
                transcribe them. Defaults to [].

            punctuation_overrides (dict): Punctuation overrides. This allows you to override
                the punctuation in the STT engine. This is useful for languages that use different
                punctuation than English. See documentation for more information.
                Defaults to None.

            diarization_sensitivity (float): Diarization sensitivity. A higher value increases
                the sensitivity of diarization and helps when two or more speakers have similar voices.
                Defaults to 0.5.

            speaker_active_format (str): Formatter for active speaker ID. This formatter is used
                to format the text output for individual speakers and ensures that the context is
                clear for language models further down the pipeline. The attributes `text` and
                `speaker_id` are available. The system instructions for the language model may need
                to include any necessary instructions to handle the formatting.
                Example: `@{speaker_id}: {text}`.
                Defaults to transcription output.

            speaker_passive_format (str): Formatter for passive speaker ID. As with the
                speaker_active_format, the attributes `text` and `speaker_id` are available.
                Example: `@{speaker_id} [background]: {text}`.
                Defaults to transcription output.

            prefer_current_speaker (bool): Prefer current speaker ID. When set to true, groups of
                words close together are given extra weight to be identified as the same speaker.
                Defaults to False.

            focus_speakers (list[str]): List of speaker IDs to focus on. When enabled, only these
                speakers are emitted as `FINAL_TRANSCRIPT` events and other speakers are considered
                passive. Words from other speakers are still processed, but only emitted when a
                focussed speaker has also said new words. A list of labels (e.g. `S1`, `S2`) or
                identifiers of known speakers (e.g. `speaker_1`, `speaker_2`) can be used.
                Defaults to [].

            ignore_speakers (list[str]): List of speaker IDs to ignore. When enabled, these speakers
                are excluded from the transcription and their words are not processed. Their speech
                will not trigger any VAD or end of utterance detection. By default, any speaker
                with a label starting and ending with double underscores will be excluded (e.g.
                `__ASSISTANT__`).
                Defaults to [].

            focus_mode (DiarizationFocusMode): Speaker focus mode for diarization. When set to
                `DiarizationFocusMode.RETAIN`, the STT engine will retain words spoken by other speakers
                (not listed in `ignore_speakers`) and process them as passive speaker frames. When set to
                `DiarizationFocusMode.IGNORE`, the STT engine will ignore words spoken by other speakers
                and they will not be processed. Defaults to `DiarizationFocusMode.RETAIN`.

            known_speakers (list[DiarizationKnownSpeaker]): List of known speaker labels and identifiers.
                If you supply a list of labels and identifiers for speakers, then the STT engine will
                use them to attribute any spoken words to that speaker. This is useful when you want to
                attribute words to a specific speaker, such as the assistant or a specific user. Labels
                and identifiers can be obtained from a running STT session and then used in subsequent
                sessions. Identifiers are unique to each Speechmatics account and cannot be used across
                accounts. Refer to our examples on the format of the known_speakers parameter.
                Defaults to [].

            sample_rate (int): Sample rate for the audio. Optional. Defaults to 16000.

            chunk_size (int): Chunk size for the audio. Optional. Defaults to 160.

            audio_encoding (AudioEncoding): Audio encoding for the audio. Optional.
                Defaults to `AudioEncoding.PCM_S16LE`.

            transcription_config (TranscriptionConfig): Transcription configuration (Deprecated). Optional.

            audio_settings (AudioSettings): Audio settings (Deprecated). Optional.

            http_session (aiohttp.ClientSession | None): Custom HTTP session for API requests. Optional.
        """

        super().__init__(
            capabilities=stt.STTCapabilities(
                streaming=True,
                interim_results=True,
            ),
        )

        if is_given(transcription_config):
            logger.warning(
                "`transcription_config` is deprecated. Use individual arguments instead (which override this argument)."
            )

            config: TranscriptionConfig = transcription_config
            language = language if is_given(language) else config.language
            output_locale = output_locale if is_given(output_locale) else config.output_locale
            domain = domain if is_given(domain) else config.domain
            operating_point = operating_point or config.operating_point
            enable_diarization = enable_diarization or config.diarization == "speaker"
            enable_partials = enable_partials or config.enable_partials
            max_delay = max_delay or config.max_delay
            additional_vocab = (
                additional_vocab if is_given(additional_vocab) else config.additional_vocab
            )
            punctuation_overrides = (
                punctuation_overrides
                if is_given(punctuation_overrides)
                else config.punctuation_overrides
            )

        if is_given(audio_settings):
            logger.warning(
                "`audio_settings` is deprecated. Use individual arguments instead (which override this argument)."
            )

            audio: AudioSettings = audio_settings
            sample_rate = sample_rate or audio.sample_rate
            audio_encoding = audio_encoding or audio.encoding

        self._stt_options = STTOptions(
            operating_point=operating_point,
            domain=domain if is_given(domain) else None,
            language=language if is_given(language) else "en",
            output_locale=output_locale if is_given(output_locale) else None,
            enable_partials=enable_partials,
            enable_diarization=enable_diarization,
            max_delay=max_delay,
            end_of_utterance_silence_trigger=end_of_utterance_silence_trigger,
            end_of_utterance_mode=end_of_utterance_mode,
            additional_vocab=additional_vocab if is_given(additional_vocab) else [],
            punctuation_overrides=punctuation_overrides if is_given(punctuation_overrides) else {},
            diarization_sensitivity=diarization_sensitivity,
            speaker_active_format=speaker_active_format,
            speaker_passive_format=speaker_passive_format,
            prefer_current_speaker=prefer_current_speaker,
            focus_speakers=focus_speakers if is_given(focus_speakers) else [],
            ignore_speakers=ignore_speakers if is_given(ignore_speakers) else [],
            focus_mode=focus_mode,
            known_speakers=known_speakers if is_given(known_speakers) else [],
        )

        self._api_key: str = api_key if is_given(api_key) else os.getenv("SPEECHMATICS_API_KEY", "")
        self._base_url: str = (
            base_url
            if is_given(base_url)
            else os.getenv("SPEECHMATICS_RT_URL", "wss://eu2.rt.speechmatics.com/v2")
        )

        if not self._api_key:
            raise ValueError("Missing Speechmatics API key")
        if not self._base_url:
            raise ValueError("Missing Speechmatics base URL")

        self._transcription_config: TranscriptionConfig | None = None
        self._process_config()
        self._audio_format = AudioFormat(
            sample_rate=sample_rate,
            chunk_size=chunk_size,
            encoding=audio_encoding,
        )

        self._stream: stt.RecognizeStream | None = None
        self._http_session: aiohttp.ClientSession | None = None

        # Lower logging of the SMX module
        logging.getLogger("speechmatics.rt.transport").setLevel(logging.WARNING)

    async def _recognize_impl(
        self,
        buffer: AudioBuffer,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> stt.SpeechEvent:
        raise NotImplementedError("Not implemented")

    def stream(
        self,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> stt.RecognizeStream:
        """Create a new SpeechStream."""
        if self._transcription_config is None:
            raise RuntimeError("Transcription config not initialized")
        transcription_config = dataclasses.replace(self._transcription_config)
        if is_given(language):
            transcription_config.language = language
        self._stream = SpeechStream(
            stt=self,
            conn_options=conn_options,
        )

        return self._stream

    def _process_config(self) -> None:
        """Create a formatted STT transcription config.

        Creates a transcription config object based on the service parameters. Aligns
        with the Speechmatics RT API transcription config.
        """
        transcription_config = TranscriptionConfig(
            language=self._stt_options.language,
            domain=self._stt_options.domain,
            output_locale=self._stt_options.output_locale,
            operating_point=self._stt_options.operating_point,
            diarization="speaker" if self._stt_options.enable_diarization else None,
            enable_partials=self._stt_options.enable_partials,
            max_delay=self._stt_options.max_delay,
        )

        if self._stt_options.additional_vocab:
            transcription_config.additional_vocab = [
                {
                    "content": e.content,
                    "sounds_like": e.sounds_like,
                }
                for e in self._stt_options.additional_vocab
            ]

        if self._stt_options.enable_diarization:
            dz_cfg: dict[str, Any] = {}
            if self._stt_options.diarization_sensitivity is not None:
                dz_cfg["speaker_sensitivity"] = self._stt_options.diarization_sensitivity
            if self._stt_options.prefer_current_speaker is not None:
                dz_cfg["prefer_current_speaker"] = self._stt_options.prefer_current_speaker
            if self._stt_options.known_speakers:
                dz_cfg["speakers"] = {
                    s.label: s.speaker_identifiers for s in self._stt_options.known_speakers
                }
            if dz_cfg:
                transcription_config.speaker_diarization_config = dz_cfg
        if (
            self._stt_options.end_of_utterance_silence_trigger
            and self._stt_options.end_of_utterance_mode == EndOfUtteranceMode.FIXED
        ):
            transcription_config.conversation_config = ConversationConfig(
                end_of_utterance_silence_trigger=self._stt_options.end_of_utterance_silence_trigger,
            )

        if self._stt_options.punctuation_overrides:
            transcription_config.punctuation_overrides = self._stt_options.punctuation_overrides

        self._transcription_config = transcription_config

    def update_speakers(
        self,
        focus_speakers: NotGivenOr[list[str]] = NOT_GIVEN,
        ignore_speakers: NotGivenOr[list[str]] = NOT_GIVEN,
        focus_mode: NotGivenOr[DiarizationFocusMode] = NOT_GIVEN,
    ) -> None:
        """Updates the speaker configuration.

        This can update the speakers to listen to or ignore during an in-flight
        transcription. Only available if diarization is enabled.

        Args:
            focus_speakers: List of speakers to focus on.
            ignore_speakers: List of speakers to ignore.
            focus_mode: Focus mode to use.
        """
        # Only valid when diarization is enabled
        if not self._stt_options.enable_diarization:
            raise ValueError("Diarization is not enabled")

        # Update the diarization configuration
        if is_given(focus_speakers):
            self._stt_options.focus_speakers = focus_speakers
        if is_given(ignore_speakers):
            self._stt_options.ignore_speakers = ignore_speakers
        if is_given(focus_mode):
            self._stt_options.focus_mode = focus_mode

Helper class that provides a standard way to create an ABC using inheritance.

Create a new instance of Speechmatics STT.

Args

api_key : str
Speechmatics API key. Can be set via api_key argument or SPEECHMATICS_API_KEY environment variable
base_url : str
Custom base URL for the API. Can be set via base_url argument or SPEECHMATICS_RT_URL environment variable. Optional.
operating_point : OperatingPoint
Operating point for transcription accuracy vs. latency tradeoff. It is recommended to use OperatingPoint.ENHANCED for most use cases. Defaults to OperatingPoint.ENHANCED.
domain : str
Domain to use. Optional.
language : str
Language code for the STT model. Defaults to en. Optional.
output_locale : str
Output locale for the STT model, e.g. en-GB. Optional.
enable_partials : bool
Enable partial transcriptions. When enabled, the STT engine will emit INTERIM_TRANSCRIPT events - useful for the visualisation of real-time transcription. Defaults to True.
enable_diarization : bool
Enable speaker diarization. When enabled, the STT engine will determine and attribute words to unique speakers. The speaker_sensitivity parameter can be used to adjust the sensitivity of diarization. Defaults to False.
max_delay : float
Maximum delay in seconds for transcription. This forces the STT engine to speed up the processing of transcribed words and reduces the interval between partial and final results. Lower values can have an impact on accuracy. Defaults to 1.0.
end_of_utterance_silence_trigger : float
Maximum delay in seconds for end of utterance trigger. The delay is used to wait for any further transcribed words before emitting the FINAL_TRANSCRIPT events. The value must be lower than max_delay. Defaults to 0.5.
end_of_utterance_mode : EndOfUtteranceMode
End of utterance delay mode. When ADAPTIVE is used, the delay is adjusted based on the content of what the most recent speaker has said, such as their rate of speech and whether they have any pauses or disfluencies. When FIXED is used, the delay is fixed to the value of end_of_utterance_silence_trigger. Use of NONE disables end of utterance detection and uses a fallback timer. Defaults to EndOfUtteranceMode.FIXED.
additional_vocab : list[AdditionalVocabEntry]
List of additional vocabulary entries. If you supply a list of additional vocabulary entries, this will increase the weight of those words in the vocabulary and help the STT engine to better transcribe them. Defaults to [].
punctuation_overrides : dict
Punctuation overrides. This allows you to override the punctuation in the STT engine. This is useful for languages that use different punctuation than English. See documentation for more information. Defaults to None.
diarization_sensitivity : float
Diarization sensitivity. A higher value increases the sensitivity of diarization and helps when two or more speakers have similar voices. Defaults to 0.5.
speaker_active_format : str
Formatter for active speaker ID. This formatter is used to format the text output for individual speakers and ensures that the context is clear for language models further down the pipeline. The attributes text and speaker_id are available. The system instructions for the language model may need to include any necessary instructions to handle the formatting. Example: @{speaker_id}: {text}. Defaults to transcription output.
speaker_passive_format : str
Formatter for passive speaker ID. As with the speaker_active_format, the attributes text and speaker_id are available. Example: @{speaker_id} [background]: {text}. Defaults to transcription output.
prefer_current_speaker : bool
Prefer current speaker ID. When set to true, groups of words close together are given extra weight to be identified as the same speaker. Defaults to False.
focus_speakers : list[str]
List of speaker IDs to focus on. When enabled, only these speakers are emitted as FINAL_TRANSCRIPT events and other speakers are considered passive. Words from other speakers are still processed, but only emitted when a focussed speaker has also said new words. A list of labels (e.g. S1, S2) or identifiers of known speakers (e.g. speaker_1, speaker_2) can be used. Defaults to [].
ignore_speakers : list[str]
List of speaker IDs to ignore. When enabled, these speakers are excluded from the transcription and their words are not processed. Their speech will not trigger any VAD or end of utterance detection. By default, any speaker with a label starting and ending with double underscores will be excluded (e.g. __ASSISTANT__). Defaults to [].
focus_mode : DiarizationFocusMode
Speaker focus mode for diarization. When set to DiarizationFocusMode.RETAIN, the STT engine will retain words spoken by other speakers (not listed in ignore_speakers) and process them as passive speaker frames. When set to DiarizationFocusMode.IGNORE, the STT engine will ignore words spoken by other speakers and they will not be processed. Defaults to DiarizationFocusMode.RETAIN.
known_speakers : list[DiarizationKnownSpeaker]
List of known speaker labels and identifiers. If you supply a list of labels and identifiers for speakers, then the STT engine will use them to attribute any spoken words to that speaker. This is useful when you want to attribute words to a specific speaker, such as the assistant or a specific user. Labels and identifiers can be obtained from a running STT session and then used in subsequent sessions. Identifiers are unique to each Speechmatics account and cannot be used across accounts. Refer to our examples on the format of the known_speakers parameter. Defaults to [].
sample_rate : int
Sample rate for the audio. Optional. Defaults to 16000.
chunk_size : int
Chunk size for the audio. Optional. Defaults to 160.
audio_encoding : AudioEncoding
Audio encoding for the audio. Optional. Defaults to AudioEncoding.PCM_S16LE.
transcription_config : TranscriptionConfig
Transcription configuration (Deprecated). Optional.
audio_settings : AudioSettings
Audio settings (Deprecated). Optional.
http_session : aiohttp.ClientSession | None
Custom HTTP session for API requests. Optional.
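
Putting several of these arguments together, an illustrative configuration sketch; the values are examples rather than recommendations, and api_key is read from the SPEECHMATICS_API_KEY environment variable if not passed.

from livekit.plugins.speechmatics import STT

stt = STT(
    # api_key falls back to the SPEECHMATICS_API_KEY environment variable
    language="en",
    enable_diarization=True,
    diarization_sensitivity=0.6,
    speaker_active_format="@{speaker_id}: {text}",
    speaker_passive_format="@{speaker_id} [background]: {text}",
    focus_speakers=["S1"],
    max_delay=1.0,
    end_of_utterance_silence_trigger=0.5,
)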

Ancestors

  • livekit.agents.stt.stt.STT
  • abc.ABC
  • EventEmitter
  • typing.Generic

Methods

def stream(self,
*,
language: NotGivenOr[str] = NOT_GIVEN,
conn_options: APIConnectOptions = APIConnectOptions(max_retry=3, retry_interval=2.0, timeout=10.0)) ‑> livekit.agents.stt.stt.RecognizeStream
def stream(
    self,
    *,
    language: NotGivenOr[str] = NOT_GIVEN,
    conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
) -> stt.RecognizeStream:
    """Create a new SpeechStream."""
    if self._transcription_config is None:
        raise RuntimeError("Transcription config not initialized")
    transcription_config = dataclasses.replace(self._transcription_config)
    if is_given(language):
        transcription_config.language = language
    self._stream = SpeechStream(
        stt=self,
        conn_options=conn_options,
    )

    return self._stream

Create a new SpeechStream.
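
A hedged sketch of driving the stream outside of an agent pipeline. It assumes the standard RecognizeStream interface from livekit.agents (push_frame, end_input, and async iteration over SpeechEvent objects), and that `stt` is an STT instance created as above.

from livekit.agents.stt import SpeechEventType

speech_stream = stt.stream(language="en")

async def consume() -> None:
    # Iterate transcription events as they arrive from the service.
    async for event in speech_stream:
        if event.type == SpeechEventType.FINAL_TRANSCRIPT:
            print(event.alternatives[0].text)

# Audio is supplied separately, e.g.:
# speech_stream.push_frame(frame)   # frame: rtc.AudioFrame
# speech_stream.end_input()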

def update_speakers(self,
focus_speakers: NotGivenOr[list[str]] = NOT_GIVEN,
ignore_speakers: NotGivenOr[list[str]] = NOT_GIVEN,
focus_mode: NotGivenOr[DiarizationFocusMode] = NOT_GIVEN) ‑> None
def update_speakers(
    self,
    focus_speakers: NotGivenOr[list[str]] = NOT_GIVEN,
    ignore_speakers: NotGivenOr[list[str]] = NOT_GIVEN,
    focus_mode: NotGivenOr[DiarizationFocusMode] = NOT_GIVEN,
) -> None:
    """Updates the speaker configuration.

    This can update the speakers to listen to or ignore during an in-flight
    transcription. Only available if diarization is enabled.

    Args:
        focus_speakers: List of speakers to focus on.
        ignore_speakers: List of speakers to ignore.
        focus_mode: Focus mode to use.
    """
    # Only valid when diarization is enabled
    if not self._stt_options.enable_diarization:
        raise ValueError("Diarization is not enabled")

    # Update the diarization configuration
    if is_given(focus_speakers):
        self._stt_options.focus_speakers = focus_speakers
    if is_given(ignore_speakers):
        self._stt_options.ignore_speakers = ignore_speakers
    if is_given(focus_mode):
        self._stt_options.focus_mode = focus_mode

Updates the speaker configuration.

This can update the speakers to listen to or ignore during an in-flight transcription. Only available if diarization is enabled.

Args

focus_speakers
List of speakers to focus on.
ignore_speakers
List of speakers to ignore.
focus_mode
Focus mode to use.
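
An illustrative sketch of adjusting speaker focus mid-session; diarization must be enabled on the STT instance, and the speaker labels below are placeholders.

from livekit.plugins.speechmatics import DiarizationFocusMode

# stt: an STT instance created with enable_diarization=True
stt.update_speakers(
    focus_speakers=["S1"],                   # listen primarily to S1
    ignore_speakers=["__ASSISTANT__"],       # drop the assistant's own speech
    focus_mode=DiarizationFocusMode.RETAIN,  # keep other speakers as passive frames
)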

Inherited members

class SpeechStream (stt: STT,
conn_options: APIConnectOptions)
class SpeechStream(stt.RecognizeStream):
    def __init__(self, stt: STT, conn_options: APIConnectOptions) -> None:
        super().__init__(
            stt=stt,
            conn_options=conn_options,
            sample_rate=stt._audio_format.sample_rate,
        )

        # redefine types
        self._stt: STT = stt
        self._speech_duration: float = 0
        # fill in with default value, it'll be reset when `RECOGNITION_STARTED` is received
        self._client: AsyncClient | None = None
        self._speech_fragments: list[SpeechFragment] = []

        # EndOfUtterance fallback timer
        self._end_of_utterance_timer: asyncio.TimerHandle | None = None

    async def _run(self) -> None:
        """Run the STT stream."""
        self._client = AsyncClient(
            api_key=self._stt._api_key,
            url=get_endpoint_url(self._stt._base_url),
        )

        logger.debug("Connected to Speechmatics STT service")

        opts = self._stt._stt_options

        @self._client.on(ServerMessageType.RECOGNITION_STARTED)  # type: ignore
        def _evt_on_recognition_started(message: dict[str, Any]) -> None:
            logger.debug("Recognition started", extra={"data": message})

        if opts.enable_partials:

            @self._client.on(ServerMessageType.ADD_PARTIAL_TRANSCRIPT)  # type: ignore
            def _evt_on_partial_transcript(message: dict[str, Any]) -> None:
                self._handle_transcript(message, is_final=False)

        @self._client.on(ServerMessageType.ADD_TRANSCRIPT)  # type: ignore
        def _evt_on_final_transcript(message: dict[str, Any]) -> None:
            self._handle_transcript(message, is_final=True)

        if opts.end_of_utterance_mode == EndOfUtteranceMode.FIXED:

            @self._client.on(ServerMessageType.END_OF_UTTERANCE)  # type: ignore
            def _evt_on_end_of_utterance(message: dict[str, Any]) -> None:
                self._handle_end_of_utterance()

        await self._client.start_session(
            transcription_config=self._stt._transcription_config,
            audio_format=self._stt._audio_format,
        )

        audio_bstream = utils.audio.AudioByteStream(
            sample_rate=self._stt._audio_format.sample_rate,
            num_channels=1,
        )

        async for data in self._input_ch:
            # If the data is a flush sentinel, flush the audio byte stream
            if isinstance(data, self._FlushSentinel):
                frames = audio_bstream.flush()
            else:
                frames = audio_bstream.write(data.data.tobytes())

            for frame in frames:
                self._speech_duration += frame.duration
                await self._client.send_audio(frame.data.tobytes())

        # TODO - handle the closing of the stream?

    def _handle_transcript(self, message: dict[str, Any], is_final: bool) -> None:
        """Handle the partial and final transcript events.

        Args:
            message: The new Partial or Final from the STT engine.
            is_final: Whether the data is final or partial.
        """
        has_changed = self._add_speech_fragments(
            message=message,
            is_final=is_final,
        )

        if not has_changed:
            return

        self._end_of_utterance_timer_start()
        self._send_frames()

    def _end_of_utterance_timer_start(self) -> None:
        """Start the timer for the end of utterance.

        This will use the STT's `end_of_utterance_silence_trigger` value and set
        a timer to send the latest transcript to the pipeline. It is used as a
        fallback for the EndOfUtterance messages from the STT. The majority of the time,
        the server should send the end of utterance messages. In the rare case
        that it doesn't, we'll still time it out so that the pipeline doesn't hang.

        Note that the `end_of_utterance_silence_trigger` is measured from when the
        last updated speech was received, so the real-world delay will likely be
        longer than the delay inside the STT engine.
        """
        if self._end_of_utterance_timer is not None:
            self._end_of_utterance_timer.cancel()

        def send_after_delay() -> None:
            logger.debug("Fallback EndOfUtterance triggered.")
            self._handle_end_of_utterance()

        delay = self._stt._stt_options.end_of_utterance_silence_trigger * 4
        self._end_of_utterance_timer = asyncio.get_event_loop().call_later(delay, send_after_delay)

    def _handle_end_of_utterance(self) -> None:
        """Handle the end of utterance event.

        This will check for any running timers for end of utterance, reset them,
        and then send a finalized frame to the pipeline.
        """
        self._send_frames(finalized=True)
        if self._end_of_utterance_timer is not None:
            self._end_of_utterance_timer.cancel()
            self._end_of_utterance_timer = None

    def _send_frames(self, finalized: bool = False) -> None:
        """Send frames to the pipeline.

        Send speech frames to the pipeline. If VAD is enabled, then this will
        also send an interruption and user started speaking frames. When the
        final transcript is received, then this will send a user stopped speaking
        and stop interruption frames.

        Args:
            finalized: Whether the data is final or partial.
        """
        speech_frames = self._get_frames_from_fragments()
        if not speech_frames:
            return

        if not any(frame.is_active for frame in speech_frames):
            return

        if not finalized:
            event_type = stt.SpeechEventType.INTERIM_TRANSCRIPT
        else:
            event_type = stt.SpeechEventType.FINAL_TRANSCRIPT

        for item in speech_frames:
            final_event = stt.SpeechEvent(
                type=event_type,
                alternatives=[
                    item._as_speech_data(
                        self._stt._stt_options.speaker_active_format,
                        self._stt._stt_options.speaker_passive_format,
                    ),
                ],
            )
            self._event_ch.send_nowait(final_event)

        if finalized:
            self._event_ch.send_nowait(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))
            self._speech_fragments.clear()

            if self._speech_duration > 0:
                usage_event = stt.SpeechEvent(
                    type=stt.SpeechEventType.RECOGNITION_USAGE,
                    alternatives=[],
                    recognition_usage=stt.RecognitionUsage(audio_duration=self._speech_duration),
                )
                self._event_ch.send_nowait(usage_event)
                self._speech_duration = 0

    def _add_speech_fragments(self, message: dict[str, Any], is_final: bool = False) -> bool:
        """Takes a new Partial or Final from the STT engine.

        Accumulates it into the _speech_fragments list. As new final data is added, all
        partials are removed from the list.

        Note: If a known speaker is `__[A-Z0-9_]{2,}__`, then the words are skipped,
        as this is used to protect against self-interruption by the assistant or to
        block out specific known voices.

        Args:
            message: The new Partial or Final from the STT engine.
            is_final: Whether the data is final or partial.

        Returns:
            bool: True if the speech data was updated, False otherwise.
        """
        opts = self._stt._stt_options
        fragments: list[SpeechFragment] = []
        current_length = len(self._speech_fragments)

        for result in message.get("results", []):
            alt = result.get("alternatives", [{}])[0]
            if alt.get("content", None):
                fragment = SpeechFragment(
                    start_time=result.get("start_time", 0),
                    end_time=result.get("end_time", 0),
                    language=alt.get("language", "en"),
                    is_eos=alt.get("is_eos", False),
                    is_final=is_final,
                    attaches_to=result.get("attaches_to", ""),
                    content=alt.get("content", ""),
                    speaker=alt.get("speaker", None),
                    confidence=alt.get("confidence", 1.0),
                    result=result,
                )

                # Speaker filtering
                if fragment.speaker:
                    # Drop `__XX__` speakers
                    if re.match(r"^__[A-Z0-9_]{2,}__$", fragment.speaker):
                        continue

                    # Drop speakers not focussed on
                    if (
                        opts.focus_mode == DiarizationFocusMode.IGNORE
                        and opts.focus_speakers
                        and fragment.speaker not in opts.focus_speakers
                    ):
                        continue

                    # Drop ignored speakers
                    if opts.ignore_speakers and fragment.speaker in opts.ignore_speakers:
                        continue

                fragments.append(fragment)

        self._speech_fragments = [frag for frag in self._speech_fragments if frag.is_final]
        if not fragments and len(self._speech_fragments) == current_length:
            return False

        self._speech_fragments.extend(fragments)
        return True

    def _get_frames_from_fragments(self) -> list[SpeakerFragments]:
        """Get speech data objects for the current fragment list.

        The speech fragments are grouped by contiguous speaker and then
        returned as internal SpeakerFragments objects with the `speaker_id` field
        set to the current speaker (string). An utterance may contain speech from
        more than one speaker (e.g. S1, S2, S1, S3, ...), so they are kept
        in strict order for the context of the conversation.

        Returns:
            List[SpeakerFragments]: The list of objects.
        """
        current_speaker: str | None = None
        speaker_groups: list[list[SpeechFragment]] = [[]]
        for frag in self._speech_fragments:
            if frag.speaker != current_speaker:
                current_speaker = frag.speaker
                if speaker_groups[-1]:
                    speaker_groups.append([])
            speaker_groups[-1].append(frag)

        speaker_fragments: list[SpeakerFragments] = []
        for group in speaker_groups:
            sd = self._get_speaker_fragments_from_fragment_group(group)
            if sd:
                speaker_fragments.append(sd)

        return speaker_fragments

    def _get_speaker_fragments_from_fragment_group(
        self,
        group: list[SpeechFragment],
    ) -> SpeakerFragments | None:
        """Take a group of fragments and piece together into SpeakerFragments.

        Each fragment for a given speaker is assembled into a string,
        taking into consideration whether words are attached to the
        previous or next word (notably punctuation). This ensures that
        the text does not have extra spaces. This will also check for
        any straggling punctuation from earlier utterances that should
        be removed.

        Args:
            group: List of SpeechFragment objects.

        Returns:
            SpeakerFragments: The object for the group.
        """
        opts = self._stt._stt_options

        # Check for starting fragments that are attached to previous
        if group and group[0].attaches_to == "previous":
            group = group[1:]

        # Check for trailing fragments that are attached to next
        if group and group[-1].attaches_to == "next":
            group = group[:-1]

        if not group:
            return None

        start_time = min(frag.start_time for frag in group)
        end_time = max(frag.end_time for frag in group)

        # Determine if the speaker is considered active
        is_active = True
        if opts.enable_diarization and opts.focus_speakers:
            is_active = group[0].speaker in opts.focus_speakers

        return SpeakerFragments(
            speaker_id=group[0].speaker,
            start_time=start_time,
            end_time=end_time,
            language=group[0].language,
            fragments=group,
            is_active=is_active,
        )

    async def aclose(self) -> None:
        """
        End input to the STT engine.

        This will close the STT engine and the WebSocket connection, if established, and
        release any resources.
        """
        await super().aclose()

        # Close the STT session cleanly
        if self._client:
            await self._client.close()
            self._client = None

Helper class that provides a standard way to create an ABC using inheritance.

Args

sample_rate : int or None, optional
The desired sample rate for the audio input. If specified, the audio input will be automatically resampled to match the given sample rate before being processed for Speech-to-Text. If not provided (None), the input will retain its original sample rate.

Ancestors

  • livekit.agents.stt.stt.RecognizeStream
  • abc.ABC

Methods

async def aclose(self) ‑> None
async def aclose(self) -> None:
    """
    End input to the STT engine.

    This will close the STT engine and the WebSocket connection, if established, and
    release any resources.
    """
    await super().aclose()

    # Close the STT session cleanly
    if self._client:
        await self._client.close()
        self._client = None

End input to the STT engine.

This will close the STT engine and the WebSocket connection, if established, and release any resources.

class TranscriptionConfig (language: str = 'en',
operating_point: OperatingPoint = OperatingPoint.ENHANCED,
output_locale: Optional[str] = None,
diarization: Optional[str] = None,
additional_vocab: Optional[dict] = None,
punctuation_overrides: Optional[dict] = None,
domain: Optional[str] = None,
enable_entities: Optional[bool] = None,
audio_filtering_config: Optional[dict] = None,
transcript_filtering_config: Optional[dict] = None,
max_delay: Optional[float] = None,
max_delay_mode: Optional[str] = None,
enable_partials: Optional[bool] = None,
speaker_diarization_config: Optional[SpeakerDiarizationConfig] = None,
streaming_mode: Optional[bool] = None,
conversation_config: Optional[ConversationConfig] = None,
ctrl: Optional[dict] = None,
channel_diarization_labels: Optional[list[str]] = None)
@dataclass
class TranscriptionConfig:
    """
    Configuration for transcription behavior and features.

    Attributes:
        language: (Optional) ISO 639-1 language code (e.g., "en", "es", "fr").
            Defaults to "en".
        operating_point: (Optional) Which acoustic model to use.
            Defaults to "enhanced".
        output_locale: (Optional) RFC-5646 language code for transcript output (e.g. "en-US").
            Defaults to None.
        diarization: Type of diarization to use. Options: "none", "channel", "speaker", "channel_and_speaker".
            Defaults to None.
        additional_vocab: (Optional) Additional vocabulary that is not part of the standard language.
            Defaults to None.
        punctuation_overrides: (Optional) Permitted punctuation marks for advanced punctuation.
            Defaults to None.
        domain: (Optional) Optionally request a language pack optimized for a specific domain (e.g. 'finance').
            Defaults to None.
        enable_entities: (Optional) Whether to enable entity detection/recognition.
            Defaults to None.
        enable_partials: (Optional) Whether to receive partial transcription results.
            Defaults to None.
        max_delay: (Optional) Maximum delay in seconds for transcript delivery.
            Defaults to None.
        max_delay_mode: (Optional) Determines whether the threshold specified in max_delay can be exceeded
            if a potential entity is detected. Flexible means if a potential entity
            is detected, then the max_delay can be overridden until the end of that
            entity. Fixed means that max_delay specified ignores any potential
            entity that would not be completed within that threshold.
        speaker_diarization_config: (Optional) Configuration for speaker diarization.
            Defaults to None.
        streaming_mode: (Optional) Indicates if we run the engine in streaming mode, or regular RT mode.
        audio_filtering_config: (Optional) Configuration for limiting the transcription of quiet audio.
            Defaults to None.
        transcript_filtering_config: (Optional) Configuration for applying filtering to the transcription.
            Defaults to None.
        conversation_config: (Optional) Configuration for end-of-utterance detection.
            Defaults to None.
        ctrl: (Optional) Configuration for controlling the engine.
            Defaults to None.
        channel_diarization_labels: (Optional) Configuration for channel diarization.
            Defaults to None.


    Examples:
        Basic English transcription:
            >>> config = TranscriptionConfig(language="en")

        Spanish with partials enabled:
            >>> config = TranscriptionConfig(
            ...     language="es",
            ...     operating_point="enhanced",
            ...     enable_partials=True
            ... )

        Advanced configuration with speaker diarization:
            >>> config = TranscriptionConfig(
            ...     language="en",
            ...     enable_partials=True,
            ...     max_delay=5.0,
            ...     speaker_diarization_config={
            ...         "speaker_sensitivity": 0.7,
            ...         "max_speakers": 4
            ...     }
            ... )
    """

    language: str = "en"
    operating_point: OperatingPoint = OperatingPoint.ENHANCED
    output_locale: Optional[str] = None
    diarization: Optional[str] = None
    additional_vocab: Optional[dict] = None
    punctuation_overrides: Optional[dict] = None
    domain: Optional[str] = None
    enable_entities: Optional[bool] = None
    audio_filtering_config: Optional[dict] = None
    transcript_filtering_config: Optional[dict] = None
    max_delay: Optional[float] = None
    max_delay_mode: Optional[str] = None
    enable_partials: Optional[bool] = None
    speaker_diarization_config: Optional[SpeakerDiarizationConfig] = None
    streaming_mode: Optional[bool] = None
    conversation_config: Optional[ConversationConfig] = None
    ctrl: Optional[dict] = None
    channel_diarization_labels: Optional[list[str]] = None

    def to_dict(self) -> dict[str, Any]:
        """
        Convert transcription parameters to dictionary.

        Returns:
        Transcription configuration as a dict while excluding None values.

        Examples:
            >>> config = TranscriptionConfig(
            ...     language="en",
            ...     enable_partials=True,
            ...     max_delay=5.0,
            ...     enable_entities=True
            ... )
            >>> api_dict = config.to_dict()
            >>> # Returns: {
            >>> #     "language": "en",
            >>> #     "enable_partials": True,
            >>> #     "max_delay_mode": "flexible",
            >>> #     "enable_entities": True,
            >>> #     "max_delay": 5.0
            >>> # }
        """
        return asdict(self, dict_factory=lambda x: {k: v for (k, v) in x if v is not None})

Configuration for transcription behavior and features.

Attributes

language
(Optional) ISO 639-1 language code (e.g., "en", "es", "fr"). Defaults to "en".
operating_point
(Optional) Which acoustic model to use. Defaults to "enhanced".
output_locale
(Optional) RFC-5646 language code for transcript output (e.g. "en-US"). Defaults to None.
diarization
Type of diarization to use. Options: "none", "channel", "speaker", "channel_and_speaker". Defaults to None.
additional_vocab
(Optional) Additional vocabulary that is not part of the standard language. Defaults to None.
punctuation_overrides
(Optional) Permitted punctuation marks for advanced punctuation. Defaults to None.
domain
(Optional) Optionally request a language pack optimized for a specific domain (e.g. 'finance'). Defaults to None.
enable_entities
(Optional) Whether to enable entity detection/recognition. Defaults to None.
enable_partials
(Optional) Whether to receive partial transcription results. Defaults to None.
max_delay
(Optional) Maximum delay in seconds for transcript delivery. Defaults to None.
max_delay_mode
(Optional) Determines whether the threshold specified in max_delay can be exceeded if a potential entity is detected. Flexible means if a potential entity is detected, then the max_delay can be overridden until the end of that entity. Fixed means that max_delay specified ignores any potential entity that would not be completed within that threshold.
speaker_diarization_config
(Optional) Configuration for speaker diarization. Defaults to None.
streaming_mode
(Optional) Indicates if we run the engine in streaming mode, or regular RT mode.
audio_filtering_config
(Optional) Configuration for limiting the transcription of quiet audio. Defaults to None.
transcript_filtering_config
(Optional) Configuration for applying filtering to the transcription. Defaults to None.
conversation_config
(Optional) Configuration for end-of-utterance detection. Defaults to None.
ctrl
(Optional) Configuration for controlling the engine. Defaults to None.
channel_diarization_labels
(Optional) Configuration for channel diarization. Defaults to None.

Examples

Basic English transcription:

>>> config = TranscriptionConfig(language="en")

Spanish with partials enabled:

>>> config = TranscriptionConfig(
...     language="es",
...     operating_point="enhanced",
...     enable_partials=True
... )

Advanced configuration with speaker diarization:

>>> config = TranscriptionConfig(
...     language="en",
...     enable_partials=True,
...     max_delay=5.0,
...     speaker_diarization_config={
...         "speaker_sensitivity": 0.7,
...         "max_speakers": 4
...     }
... )

Instance variables

var additional_vocab : dict | None
var audio_filtering_config : dict | None
var channel_diarization_labels : list[str] | None
var conversation_config : speechmatics.rt._models.ConversationConfig | None
var ctrl : dict | None
var diarization : str | None
var domain : str | None
var enable_entities : bool | None
var enable_partials : bool | None
var language : str
var max_delay : float | None
var max_delay_mode : str | None
var operating_point : speechmatics.rt._models.OperatingPoint
var output_locale : str | None
var punctuation_overrides : dict | None
var speaker_diarization_config : speechmatics.rt._models.SpeakerDiarizationConfig | None
var streaming_mode : bool | None
var transcript_filtering_config : dict | None

Methods

def to_dict(self) ‑> dict[str, typing.Any]
def to_dict(self) -> dict[str, Any]:
    """
    Convert transcription parameters to dictionary.

    Returns:
    Transcription configuration as a dict while excluding None values.

    Examples:
        >>> config = TranscriptionConfig(
        ...     language="en",
        ...     enable_partials=True,
        ...     max_delay=5.0,
        ...     enable_entities=True
        ... )
        >>> api_dict = config.to_dict()
        >>> # Returns: {
        >>> #     "language": "en",
        >>> #     "enable_partials": True,
        >>> #     "max_delay_mode": "flexible",
        >>> #     "enable_entities": True,
        >>> #     "max_delay": 5.0
        >>> # }
    """
    return asdict(self, dict_factory=lambda x: {k: v for (k, v) in x if v is not None})

Convert transcription parameters to dictionary.

Returns

Transcription configuration as a dict while excluding None values.

Examples

>>> config = TranscriptionConfig(
...     language="en",
...     enable_partials=True,
...     max_delay=5.0,
...     enable_entities=True
... )
>>> api_dict = config.to_dict()
>>> # Returns: {
>>> #     "language": "en",
>>> #     "enable_partials": True,
>>> #     "max_delay_mode": "flexible",
>>> #     "enable_entities": True,
>>> #     "max_delay": 5.0
>>> # }