Module livekit.plugins.nltk
Classes
class SentenceTokenizer (*, language: str = 'english', min_sentence_len: int = 20, stream_context_len: int = 10)
Sentence tokenizer built on NLTK's sent_tokenize. Consecutive sentences are merged until each returned chunk is at least min_sentence_len characters long; stream_context_len sets the minimum buffered context the streaming tokenizer keeps before emitting a sentence.
Source code
class SentenceTokenizer(agents.tokenize.SentenceTokenizer):
    def __init__(
        self,
        *,
        language: str = "english",
        min_sentence_len: int = 20,
        stream_context_len: int = 10,
    ) -> None:
        super().__init__()
        self._config = _TokenizerOptions(
            language=language,
            min_sentence_len=min_sentence_len,
            stream_context_len=stream_context_len,
        )

    def _sanitize_options(self, language: str | None = None) -> _TokenizerOptions:
        config = dataclasses.replace(self._config)
        if language:
            config.language = language
        return config

    def tokenize(self, text: str, *, language: str | None = None) -> list[str]:
        config = self._sanitize_options(language=language)
        sentences = nltk.tokenize.sent_tokenize(text, config.language)
        new_sentences = []
        buff = ""
        for sentence in sentences:
            buff += sentence + " "
            if len(buff) - 1 >= config.min_sentence_len:
                new_sentences.append(buff.rstrip())
                buff = ""

        if buff:
            new_sentences.append(buff.rstrip())

        return new_sentences

    def stream(self, *, language: str | None = None) -> agents.tokenize.SentenceStream:
        config = self._sanitize_options(language=language)
        return agents.tokenize.BufferedSentenceStream(
            tokenizer=functools.partial(
                nltk.tokenize.sent_tokenize, language=config.language
            ),
            min_token_len=self._config.min_sentence_len,
            min_ctx_len=self._config.stream_context_len,
        )
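A minimal usage sketch, assuming the livekit-plugins-nltk package is installed and the NLTK punkt sentence model is available locally (it can be fetched with nltk.download("punkt"); whether the plugin downloads it automatically is not shown above):

import nltk

from livekit.plugins.nltk import SentenceTokenizer

# The punkt model backs sent_tokenize; download it once if it is missing.
nltk.download("punkt", quiet=True)

tokenizer = SentenceTokenizer(language="english", min_sentence_len=20)

# The two short sentences are merged into one chunk because only together do they
# cross the 20-character threshold; the last sentence is long enough on its own.
chunks = tokenizer.tokenize(
    "Hi there. Nice to meet you. This is a longer sentence that stands on its own."
)
for chunk in chunks:
    print(chunk)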
Ancestors
- SentenceTokenizer
- abc.ABC
Methods
def stream(self, *, language: str | None = None) -> SentenceStream

Returns a BufferedSentenceStream that splits incrementally pushed text with nltk.tokenize.sent_tokenize, using the min_sentence_len and stream_context_len values from the constructor; a usage sketch follows the method list below.
def tokenize(self, text: str, *, language: str | None = None) -> list[str]

Splits text into sentences with nltk.tokenize.sent_tokenize for the given (or configured) language, merging consecutive sentences until each returned chunk is at least min_sentence_len characters long.
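For incremental input (for example, streaming LLM output), stream() returns an agents.tokenize.SentenceStream. The sketch below shows one way it might be driven; push_text, end_input, and async iteration yielding objects with a .token attribute are assumptions about the livekit-agents tokenizer stream interface, not something documented in this module:

import asyncio

from livekit.plugins.nltk import SentenceTokenizer

async def main() -> None:
    stream = SentenceTokenizer(min_sentence_len=20).stream()

    # Text may arrive in arbitrary fragments; sentences are emitted once enough
    # buffered context (stream_context_len characters) has accumulated.
    stream.push_text("Hello there. How are ")
    stream.push_text("you doing today? I am doing well, thanks for asking.")
    stream.end_input()  # assumed: signals that no more text will be pushed

    async for ev in stream:  # assumed: yields token events as sentences complete
        print(ev.token)

asyncio.run(main())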