Module livekit.plugins.rag

Sub-modules

livekit.plugins.rag.annoy

Classes

class SentenceChunker (*, max_chunk_size: int = 120, chunk_overlap: int = 30, paragraph_tokenizer: Callable[[str], list[str]] = tokenize.basic.tokenize_paragraphs, sentence_tokenizer: SentenceTokenizer = tokenize.basic.SentenceTokenizer(), word_tokenizer: WordTokenizer = tokenize.basic.WordTokenizer(ignore_punctuation=False))
Source code
class SentenceChunker:
    def __init__(
        self,
        *,
        max_chunk_size: int = 120,
        chunk_overlap: int = 30,
        paragraph_tokenizer: Callable[
            [str], list[str]
        ] = tokenize.basic.tokenize_paragraphs,
        sentence_tokenizer: tokenize.SentenceTokenizer = tokenize.basic.SentenceTokenizer(),
        word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer(
            ignore_punctuation=False
        ),
    ) -> None:
        self._max_chunk_size = max_chunk_size
        self._chunk_overlap = chunk_overlap
        self._paragraph_tokenizer = paragraph_tokenizer
        self._sentence_tokenizer = sentence_tokenizer
        self._word_tokenizer = word_tokenizer

    def chunk(self, *, text: str) -> list[str]:
        chunks = []

        buf_words: list[str] = []
        for paragraph in self._paragraph_tokenizer(text):
            # words carried over from the previously emitted chunk, used as overlap
            last_buf_words: list[str] = []

            for sentence in self._sentence_tokenizer.tokenize(text=paragraph):
                for word in self._word_tokenizer.tokenize(text=sentence):
                    reconstructed = self._word_tokenizer.format_words(
                        buf_words + [word]
                    )

                    # appending this word would push the chunk past
                    # max_chunk_size (in characters), so emit the buffer first
                    if len(reconstructed) > self._max_chunk_size:
                        # trim the carried-over words from the front until the
                        # overlap is at most chunk_overlap characters
                        while (
                            len(self._word_tokenizer.format_words(last_buf_words))
                            > self._chunk_overlap
                        ):
                            last_buf_words = last_buf_words[1:]

                        # the new chunk is the overlap from the previous chunk
                        # followed by the buffered words
                        new_chunk = self._word_tokenizer.format_words(
                            last_buf_words + buf_words
                        )
                        chunks.append(new_chunk)
                        last_buf_words = buf_words
                        buf_words = []

                    buf_words.append(word)

            # flush whatever is left at the end of the paragraph; chunks never
            # span paragraph boundaries
            if buf_words:
                while (
                    len(self._word_tokenizer.format_words(last_buf_words))
                    > self._chunk_overlap
                ):
                    last_buf_words = last_buf_words[1:]

                new_chunk = self._word_tokenizer.format_words(
                    last_buf_words + buf_words
                )
                chunks.append(new_chunk)
                buf_words = []

        return chunks
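
A minimal usage sketch (not part of the generated reference): it assumes SentenceChunker is imported from the package root as listed in this module's Classes section and uses the default tokenizers shown above. The sample text and printed output format are illustrative only; exact chunk boundaries depend on the configured tokenizers.

from livekit.plugins.rag import SentenceChunker

# max_chunk_size and chunk_overlap are measured in characters of the
# reconstructed chunk text, as in the source above.
chunker = SentenceChunker(max_chunk_size=120, chunk_overlap=30)

text = (
    "LiveKit agents can augment LLM answers with retrieved context. "
    "Long documents are split into overlapping chunks so that each chunk "
    "fits comfortably into an embedding request.\n\n"
    "At query time the closest chunks are looked up and appended to the prompt."
)

for i, chunk in enumerate(chunker.chunk(text=text)):
    print(f"chunk {i} ({len(chunk)} chars): {chunk!r}")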

Methods

def chunk(self, *, text: str) ‑> list[str]
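
Split text into a list of chunks. As the source above shows, each paragraph is tokenized into sentences and words; whenever appending the next word would push the reconstructed text past max_chunk_size characters, the buffered words are emitted as a chunk, prefixed with up to chunk_overlap characters carried over from the end of the previous chunk. Words remaining at the end of a paragraph are flushed as a final chunk, so chunks never span paragraph boundaries. Because the overlap is prepended after the size check, an emitted chunk can exceed max_chunk_size by roughly the overlap length.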