Module livekit.agents.tokenize.basic
Functions
def hyphenate_word(word: str) ‑> list[str]
def tokenize_paragraphs(text: str) ‑> list[str]
Classes
class SentenceTokenizer (*, language: str = 'english', min_sentence_len: int = 20, stream_context_len: int = 10)
-
Sentence tokenizer that splits text into sentences using the module's basic sentence splitter, configurable via language, minimum sentence length, and streaming context length.
Expand source code
class SentenceTokenizer(tokenizer.SentenceTokenizer):
    """Sentence tokenizer backed by ``_basic_sent.split_sentences``.

    Splitting is configured once at construction time via
    ``min_sentence_len`` (minimum length of an emitted sentence) and
    ``stream_context_len`` (context window used by the buffered stream).
    The ``language`` argument is stored in the options but the per-call
    ``language`` override is not forwarded to the splitter.
    """

    def __init__(
        self,
        *,
        language: str = "english",
        min_sentence_len: int = 20,
        stream_context_len: int = 10,
    ) -> None:
        self._config = _TokenizerOptions(
            language=language,
            min_sentence_len=min_sentence_len,
            stream_context_len=stream_context_len,
        )

    def tokenize(self, text: str, *, language: str | None = None) -> list[str]:
        """Split *text* into sentences and return only the sentence strings."""
        split = _basic_sent.split_sentences(
            text, min_sentence_len=self._config.min_sentence_len
        )
        # each entry from the splitter is indexable; element 0 is the sentence text
        return [entry[0] for entry in split]

    def stream(self, *, language: str | None = None) -> tokenizer.SentenceStream:
        """Return a buffered stream that emits sentences incrementally."""
        split_fn = functools.partial(
            _basic_sent.split_sentences,
            min_sentence_len=self._config.min_sentence_len,
        )
        return token_stream.BufferedSentenceStream(
            tokenizer=split_fn,
            min_token_len=self._config.min_sentence_len,
            min_ctx_len=self._config.stream_context_len,
        )
Ancestors
- SentenceTokenizer
- abc.ABC
Methods
def stream(self, *, language: str | None = None) ‑> SentenceStream
def tokenize(self, text: str, *, language: str | None = None) ‑> list[str]
class WordTokenizer (*, ignore_punctuation: bool = True)
-
Word tokenizer that splits text into individual words using the module's basic word splitter, optionally ignoring punctuation.
Expand source code
class WordTokenizer(tokenizer.WordTokenizer):
    """Word tokenizer backed by ``_basic_word.split_words``.

    ``ignore_punctuation`` (default ``True``) is captured at construction
    time and forwarded to every split call. The per-call ``language``
    override is accepted for interface compatibility but not used.
    """

    def __init__(self, *, ignore_punctuation: bool = True) -> None:
        self._ignore_punctuation = ignore_punctuation

    def tokenize(self, text: str, *, language: str | None = None) -> list[str]:
        """Split *text* into words and return only the word strings."""
        split = _basic_word.split_words(
            text, ignore_punctuation=self._ignore_punctuation
        )
        # each entry from the splitter is indexable; element 0 is the word text
        return [entry[0] for entry in split]

    def stream(self, *, language: str | None = None) -> tokenizer.WordStream:
        """Return a buffered stream that emits words as they become available."""
        split_fn = functools.partial(
            _basic_word.split_words,
            ignore_punctuation=self._ignore_punctuation,
        )
        return token_stream.BufferedWordStream(
            tokenizer=split_fn,
            min_token_len=1,
            # original marks this "ignore" — context length appears not
            # meaningful for word streaming; kept at 1 to preserve behavior
            min_ctx_len=1,
        )
Ancestors
- WordTokenizer
- abc.ABC
Methods
def stream(self, *, language: str | None = None) ‑> WordStream
def tokenize(self, text: str, *, language: str | None = None) ‑> list[str]