Module livekit.agents.tokenize.basic
Functions
def hyphenate_word(word: str) -> list[str]
def hyphenate_word(word: str) -> list[str]:
    return _basic_hyphenator.hyphenate_word(word)
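A minimal usage sketch; the exact syllable breaks depend on the rule-based hyphenator, so the output in the comment is illustrative:

from livekit.agents.tokenize import basic

parts = basic.hyphenate_word("telephone")
print(parts)  # syllable chunks, e.g. something like ["tel", "e", "phone"]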
def tokenize_paragraphs(text: str) -> list[str]
def tokenize_paragraphs(text: str) -> list[str]:
    return [tok[0] for tok in _basic_paragraph.split_paragraphs(text)]
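A minimal sketch, assuming the usual convention that blank lines separate paragraphs (the text below contains two of them):

from livekit.agents.tokenize import basic

text = "First paragraph, possibly\nspanning several lines.\n\nSecond paragraph."
for i, paragraph in enumerate(basic.tokenize_paragraphs(text)):
    print(i, repr(paragraph))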
Classes
class SentenceTokenizer (*,
language: str = 'english',
min_sentence_len: int = 20,
stream_context_len: int = 10,
retain_format: bool = False)
class SentenceTokenizer(tokenizer.SentenceTokenizer):
    def __init__(
        self,
        *,
        language: str = "english",
        min_sentence_len: int = 20,
        stream_context_len: int = 10,
        retain_format: bool = False,
    ) -> None:
        self._config = _TokenizerOptions(
            language=language,
            min_sentence_len=min_sentence_len,
            stream_context_len=stream_context_len,
            retain_format=retain_format,
        )

    def tokenize(self, text: str, *, language: str | None = None) -> list[str]:
        return [
            tok[0]
            for tok in _basic_sent.split_sentences(
                text,
                min_sentence_len=self._config.min_sentence_len,
                retain_format=self._config.retain_format,
            )
        ]

    def stream(self, *, language: str | None = None) -> tokenizer.SentenceStream:
        return token_stream.BufferedSentenceStream(
            tokenizer=functools.partial(
                _basic_sent.split_sentences,
                min_sentence_len=self._config.min_sentence_len,
                retain_format=self._config.retain_format,
            ),
            min_token_len=self._config.min_sentence_len,
            min_ctx_len=self._config.stream_context_len,
        )
Rule-based sentence tokenizer. The constructor options are forwarded to this module's split_sentences helper and to the buffered sentence stream, as the source above shows.
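A minimal construction sketch; the option values here are arbitrary, and the exact sentence splits depend on the rule-based splitter:

from livekit.agents.tokenize import basic

tok = basic.SentenceTokenizer(min_sentence_len=10, retain_format=True)
print(tok.tokenize("Short one. A somewhat longer second sentence follows it."))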
Ancestors
- livekit.agents.tokenize.tokenizer.SentenceTokenizer
- abc.ABC
Methods
def stream(self, *, language: str | None = None) -> livekit.agents.tokenize.tokenizer.SentenceStream
def stream(self, *, language: str | None = None) -> tokenizer.SentenceStream:
    return token_stream.BufferedSentenceStream(
        tokenizer=functools.partial(
            _basic_sent.split_sentences,
            min_sentence_len=self._config.min_sentence_len,
            retain_format=self._config.retain_format,
        ),
        min_token_len=self._config.min_sentence_len,
        min_ctx_len=self._config.stream_context_len,
    )
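A sketch of incremental use, e.g. feeding text chunks as they arrive from an LLM. It assumes the push-based interface of the SentenceStream base class (push_text, end_input, and async iteration over events carrying a .token field); check livekit.agents.tokenize.tokenizer for the exact surface:

import asyncio

from livekit.agents.tokenize import basic

async def main() -> None:
    stream = basic.SentenceTokenizer().stream()
    for chunk in ["Hello the", "re. This is a stre", "amed sentence."]:
        stream.push_text(chunk)  # feed partial text as it arrives
    stream.end_input()  # signal end of input so the buffer is flushed
    async for ev in stream:
        # sentences as they become available; short ones may be grouped,
        # depending on min_sentence_len
        print(ev.token)

asyncio.run(main())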
def tokenize(self, text: str, *, language: str | None = None) -> list[str]
def tokenize(self, text: str, *, language: str | None = None) -> list[str]:
    return [
        tok[0]
        for tok in _basic_sent.split_sentences(
            text,
            min_sentence_len=self._config.min_sentence_len,
            retain_format=self._config.retain_format,
        )
    ]
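As the source shows, the language keyword is accepted for interface compatibility but unused by this rule-based implementation. A minimal sketch:

from livekit.agents.tokenize import basic

tok = basic.SentenceTokenizer()
print(tok.tokenize("One sentence here. And another, somewhat longer one, follows it."))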
class WordTokenizer (*, ignore_punctuation: bool = True, split_character: bool = False)
class WordTokenizer(tokenizer.WordTokenizer):
    def __init__(self, *, ignore_punctuation: bool = True, split_character: bool = False) -> None:
        self._ignore_punctuation = ignore_punctuation
        self._split_character = split_character

    def tokenize(self, text: str, *, language: str | None = None) -> list[str]:
        return [
            tok[0]
            for tok in _basic_word.split_words(
                text,
                ignore_punctuation=self._ignore_punctuation,
                split_character=self._split_character,
            )
        ]

    def stream(self, *, language: str | None = None) -> tokenizer.WordStream:
        return token_stream.BufferedWordStream(
            tokenizer=functools.partial(
                _basic_word.split_words,
                ignore_punctuation=self._ignore_punctuation,
                split_character=self._split_character,
            ),
            min_token_len=1,
            min_ctx_len=1,  # ignore
        )
Rule-based word tokenizer. ignore_punctuation and split_character are forwarded to this module's split_words helper, both for one-shot tokenization and for the buffered word stream, as the source above shows.
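A minimal sketch; with the default ignore_punctuation=True, punctuation is expected to be stripped from the returned words (the output in the comment is illustrative):

from livekit.agents.tokenize import basic

words = basic.WordTokenizer().tokenize("Hello, world! How are you?")
print(words)  # e.g. ["Hello", "world", "How", "are", "you"]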
Ancestors
- livekit.agents.tokenize.tokenizer.WordTokenizer
- abc.ABC
Methods
def stream(self, *, language: str | None = None) -> livekit.agents.tokenize.tokenizer.WordStream
def stream(self, *, language: str | None = None) -> tokenizer.WordStream:
    return token_stream.BufferedWordStream(
        tokenizer=functools.partial(
            _basic_word.split_words,
            ignore_punctuation=self._ignore_punctuation,
            split_character=self._split_character,
        ),
        min_token_len=1,
        min_ctx_len=1,  # ignore
    )
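The same push-based pattern as the sentence stream, yielding words instead; this again assumes the base class exposes push_text, end_input, and async iteration over events with a .token field:

import asyncio

from livekit.agents.tokenize import basic

async def main() -> None:
    stream = basic.WordTokenizer().stream()
    stream.push_text("streaming words arrive ")
    stream.push_text("in arbitrary chunks")
    stream.end_input()
    async for ev in stream:
        print(ev.token)  # one word per event

asyncio.run(main())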
def tokenize(self, text: str, *, language: str | None = None) -> list[str]
def tokenize(self, text: str, *, language: str | None = None) -> list[str]:
    return [
        tok[0]
        for tok in _basic_word.split_words(
            text,
            ignore_punctuation=self._ignore_punctuation,
            split_character=self._split_character,
        )
    ]
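A sketch contrasting the ignore_punctuation option; exact tokens depend on the rule-based splitter, so the outputs in the comments are illustrative:

from livekit.agents.tokenize import basic

text = "Hello, world!"
print(basic.WordTokenizer().tokenize(text))                          # e.g. ["Hello", "world"]
print(basic.WordTokenizer(ignore_punctuation=False).tokenize(text))  # e.g. ["Hello,", "world!"]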