mirror of
https://github.com/Tencent/WeKnora.git
synced 2025-11-25 03:15:00 +08:00
移除日志设置与冗余代码,优化导入、类型提示及OCR后端管理 统一调整各文件模块导入路径为绝对导入 调整导入路径,移除部分导入,优化日志及注释 升级文档解析器为 Docx2Parser,优化超时与图片处理逻辑
28 lines
803 B
Python
28 lines
803 B
Python
from dataclasses import dataclass, field
|
|
|
|
|
|
@dataclass
class ChunkingConfig:
    """
    Configuration for the text chunking process.

    Controls how documents are split into smaller pieces for processing.
    Every field has a default, so ``ChunkingConfig()`` is a valid
    configuration. Mutable defaults (the separator list and the two config
    dicts) are built through ``default_factory`` so each instance gets its
    own copy instead of sharing one object.
    """

    # Maximum size of each chunk in tokens/chars
    chunk_size: int = 512

    # Number of tokens/chars to overlap between chunks
    chunk_overlap: int = 50

    # Text separators in order of priority
    separators: list[str] = field(default_factory=lambda: ["\n\n", "\n", "。"])

    # Whether to enable multimodal processing (text + images)
    enable_multimodal: bool = False

    # Storage settings as string key/value pairs.
    # Preferred field name going forward.
    storage_config: dict[str, str] = field(default_factory=dict)

    # VLM configuration for image captioning
    vlm_config: dict[str, str] = field(default_factory=dict)
|