mirror of
https://github.com/Tencent/WeKnora.git
synced 2025-11-25 03:15:00 +08:00
28 lines
803 B
Python
28 lines
803 B
Python
|
|
from dataclasses import dataclass, field
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class ChunkingConfig:
    """Configuration for the text chunking process.

    Controls how documents are split into smaller pieces for processing.
    Every field has a default, so ``ChunkingConfig()`` is a valid
    configuration. The list/dict fields are built through
    ``default_factory``, so each instance owns its own containers.
    """

    # Maximum size of each chunk, in tokens/chars.
    chunk_size: int = 512

    # Number of tokens/chars shared between adjacent chunks.
    chunk_overlap: int = 50

    # Text separators, listed in order of priority.
    separators: list[str] = field(default_factory=lambda: ["\n\n", "\n", "。"])

    # Whether to enable multimodal processing (text + images).
    enable_multimodal: bool = False

    # Storage settings as a str -> str mapping. This is the preferred field
    # name going forward.
    # NOTE(review): the older name it supersedes is not visible here - confirm.
    storage_config: dict[str, str] = field(default_factory=dict)

    # VLM configuration for image captioning, as a str -> str mapping.
    vlm_config: dict[str, str] = field(default_factory=dict)
|