Files
WeKnora/docreader/models/read_config.py

28 lines
803 B
Python
Raw Normal View History

from dataclasses import dataclass, field
@dataclass
class ChunkingConfig:
    """Configuration for the text chunking process.

    Controls how documents are split into smaller pieces for processing.

    Attributes:
        chunk_size: Maximum size of each chunk, in tokens/chars.
        chunk_overlap: Number of tokens/chars shared between adjacent chunks.
        separators: Text separators, listed in order of priority.
        enable_multimodal: Whether multimodal (text + images) processing
            is enabled.
        storage_config: String-to-string storage settings; the preferred
            field name going forward.
        vlm_config: String-to-string VLM settings used for image captioning.
    """

    # Maximum size of each chunk in tokens/chars
    chunk_size: int = 512
    # Number of tokens/chars to overlap between chunks
    chunk_overlap: int = 50
    # Text separators in order of priority. A factory is used so every
    # instance gets its own list instead of sharing one mutable default.
    # NOTE(review): the trailing empty string presumably acts as a last-resort
    # fallback for the splitter - confirm against the code that consumes it.
    separators: list[str] = field(default_factory=lambda: ["\n\n", "\n", ""])
    # Whether to enable multimodal processing (text + images)
    enable_multimodal: bool = False
    # Preferred field name going forward
    # NOTE(review): presumably supersedes an older storage-related field that
    # is not visible in this file - confirm with callers.
    storage_config: dict[str, str] = field(default_factory=dict)
    # VLM configuration for image captioning
    vlm_config: dict[str, str] = field(default_factory=dict)