Module livekit.agents.tokenize.basic

Functions

def hyphenate_word(word: str) -> list[str]
def tokenize_paragraphs(text: str) -> list[str]
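
A minimal usage sketch of the module-level helpers. hyphenate_word splits a single word into hyphenation chunks (useful for TTS pacing), and tokenize_paragraphs splits text into paragraphs (typically on blank lines); the exact chunks returned depend on the underlying rules, so the commented outputs below are illustrative only.

from livekit.agents.tokenize import basic

# Split a word into hyphenation chunks.
parts = basic.hyphenate_word("tokenizer")
print(parts)  # e.g. ["to", "ken", "iz", "er"] (illustrative)

# Split text into paragraphs.
paragraphs = basic.tokenize_paragraphs("First paragraph.\n\nSecond paragraph.")
print(paragraphs)  # e.g. ["First paragraph.", "Second paragraph."] (illustrative)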

Classes

class SentenceTokenizer (*, language: str = 'english', min_sentence_len: int = 20, stream_context_len: int = 10)

Sentence tokenizer built on the basic rule-based sentence splitter. It splits text into sentences, enforcing a minimum sentence length, and can also produce a buffered stream for incremental input.

Source code
class SentenceTokenizer(tokenizer.SentenceTokenizer):
    def __init__(
        self,
        *,
        language: str = "english",
        min_sentence_len: int = 20,
        stream_context_len: int = 10,
    ) -> None:
        self._config = _TokenizerOptions(
            language=language,
            min_sentence_len=min_sentence_len,
            stream_context_len=stream_context_len,
        )

    def tokenize(self, text: str, *, language: str | None = None) -> list[str]:
        return [
            tok[0]
            for tok in _basic_sent.split_sentences(
                text, min_sentence_len=self._config.min_sentence_len
            )
        ]

    def stream(self, *, language: str | None = None) -> tokenizer.SentenceStream:
        return token_stream.BufferedSentenceStream(
            tokenizer=functools.partial(
                _basic_sent.split_sentences,
                min_sentence_len=self._config.min_sentence_len,
            ),
            min_token_len=self._config.min_sentence_len,
            min_ctx_len=self._config.stream_context_len,
        )

Ancestors

livekit.agents.tokenize.tokenizer.SentenceTokenizer
abc.ABC

Methods

def stream(self, *, language: str | None = None) -> SentenceStream
def tokenize(self, text: str, *, language: str | None = None) -> list[str]
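
A short usage sketch for both interfaces. The streaming part assumes the returned SentenceStream supports push_text() and end_input() and can be iterated asynchronously, yielding events with a .token attribute; treat those names as assumptions taken from the base tokenizer interface rather than guarantees.

import asyncio
from livekit.agents.tokenize import basic

tok = basic.SentenceTokenizer(min_sentence_len=20)

# One-shot tokenization of a complete string.
print(tok.tokenize("Hello there. This is a longer second sentence for the example."))

async def main() -> None:
    # Incremental tokenization: push text as it arrives, then close the input.
    stream = tok.stream()
    stream.push_text("Hello there. This is a longer ")
    stream.push_text("second sentence for the example.")
    stream.end_input()
    async for ev in stream:  # assumed: yields items with a .token attribute
        print(ev.token)

asyncio.run(main())
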
class WordTokenizer (*, ignore_punctuation: bool = True)

Word tokenizer built on the basic rule-based word splitter. It splits text into words, optionally dropping punctuation, and can also produce a buffered stream for incremental input.

Source code
class WordTokenizer(tokenizer.WordTokenizer):
    def __init__(self, *, ignore_punctuation: bool = True) -> None:
        self._ignore_punctuation = ignore_punctuation

    def tokenize(self, text: str, *, language: str | None = None) -> list[str]:
        return [
            tok[0]
            for tok in _basic_word.split_words(
                text, ignore_punctuation=self._ignore_punctuation
            )
        ]

    def stream(self, *, language: str | None = None) -> tokenizer.WordStream:
        return token_stream.BufferedWordStream(
            tokenizer=functools.partial(
                _basic_word.split_words, ignore_punctuation=self._ignore_punctuation
            ),
            min_token_len=1,
            min_ctx_len=1,  # context length is not used for word streaming
        )

Ancestors

livekit.agents.tokenize.tokenizer.WordTokenizer
abc.ABC

Methods

def stream(self, *, language: str | None = None) -> WordStream
def tokenize(self, text: str, *, language: str | None = None) -> list[str]
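
A minimal usage sketch. The exact token list depends on the underlying split_words rules, so the commented output is illustrative only.

from livekit.agents.tokenize import basic

wt = basic.WordTokenizer(ignore_punctuation=True)
print(wt.tokenize("Hello, world! How are you?"))
# e.g. ["Hello", "world", "How", "are", "you"] (punctuation dropped; illustrative)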