diff --git a/pkg/rag/knowledge/services/chunker.py b/pkg/rag/knowledge/services/chunker.py index f169d5f1..19b1f296 100644 --- a/pkg/rag/knowledge/services/chunker.py +++ b/pkg/rag/knowledge/services/chunker.py @@ -4,6 +4,7 @@ import json from typing import List from pkg.rag.knowledge.services import base_service from pkg.core import app +from langchain_text_splitters import RecursiveCharacterTextSplitter class Chunker(base_service.BaseService): @@ -27,21 +28,6 @@ class Chunker(base_service.BaseService): """ if not text: return [] - # words = text.split() - # chunks = [] - # current_chunk = [] - - # for word in words: - # current_chunk.append(word) - # if len(current_chunk) > self.chunk_size: - # chunks.append(" ".join(current_chunk[:self.chunk_size])) - # current_chunk = current_chunk[self.chunk_size - self.chunk_overlap:] - - # if current_chunk: - # chunks.append(" ".join(current_chunk)) - - # A more robust chunking strategy (e.g., using recursive character text splitter) - from langchain.text_splitter import RecursiveCharacterTextSplitter text_splitter = RecursiveCharacterTextSplitter( chunk_size=self.chunk_size, diff --git a/pyproject.toml b/pyproject.toml index c0200bd0..1384b22c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ dependencies = [ "ebooklib>=0.18", "html2text>=2024.2.26", "langchain>=0.2.0", + "langchain-text-splitters>=0.0.1", "chromadb>=0.4.24", "qdrant-client (>=1.15.1,<2.0.0)", "langbot-plugin==0.1.4",