feat: add document model classes, adjust config and parsing logic, improve logging and imports

Remove logging setup and redundant code; improve imports, type hints, and OCR backend management
Convert module import paths across all files to absolute imports
Adjust import paths, remove some imports, tidy logging and comments
Upgrade the document parser to Docx2Parser; improve timeout and image handling logic
begoniezhao
2025-11-07 10:30:02 +08:00
committed by lyingbug
parent af620806e0
commit 2d66abedf0
39 changed files with 2676 additions and 1570 deletions

7
.gitignore vendored
View File

@@ -24,17 +24,14 @@ node_modules/
tmp/
temp/
# Docker compose файл (локальные настройки)
# docker-compose.yml
WeKnora
/models/
**/__pycache__
test/data/mswag.txt
data/files/
.python-version
.venv/
**/__pycache__
.python-version
### macOS
# General

View File

@@ -127,6 +127,7 @@ services:
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME:-}
- MINIO_USE_SSL=${MINIO_USE_SSL:-}
- WEB_PROXY=${WEB_PROXY:-}
- MINERU_ENDPOINT=${MINERU_ENDPOINT:-}
healthcheck:
test: ["CMD", "grpc_health_probe", "-addr=:50051"]
interval: 30s

View File

@@ -80,10 +80,11 @@ RUN pip install uv --break-system-packages && \
python -m uv sync --locked --no-dev
# 复制源代码和生成脚本
COPY docreader .
COPY docreader docreader
# 生成 protobuf 代码
RUN chmod +x scripts/generate_proto.sh && bash scripts/generate_proto.sh
RUN chmod +x docreader/scripts/generate_proto.sh && \
bash docreader/scripts/generate_proto.sh
# 确保模型目录存在
RUN ls -la /root/.paddleocr/whl/
@@ -150,10 +151,11 @@ RUN python -m playwright install-deps webkit
# COPY docreader/scripts/download_deps.py download_deps.py
# RUN python -m download_deps
COPY --from=builder /app/ ./
COPY docreader/pyproject.toml docreader/uv.lock ./
COPY --from=builder /app/docreader docreader
# 暴露 gRPC 端口
EXPOSE 50051
# 直接运行 Python 服务(日志输出到 stdout/stderr
CMD ["uv", "run", "main.py"]
CMD ["uv", "run", "-m", "docreader.main"]

5
docreader/.pylintrc Normal file
View File

@@ -0,0 +1,5 @@
[LOGGING]
logging-format-style=fstr
[MESSAGES CONTROL]
; disable=W1203
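The new .pylintrc is aimed at the f-string logging style used throughout the service; W1203 (logging-fstring-interpolation) is the pylint message that would otherwise flag such calls. A minimal illustration (logger name and message are placeholders):

import logging

logger = logging.getLogger(__name__)
file_name = "report.docx"

# Lazy %-formatting, the style pylint recommends by default.
logger.info("Parsing file: %s", file_name)

# f-string interpolation, the style used across docreader and the one
# this .pylintrc is written to accommodate.
logger.info(f"Parsing file: {file_name}")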

View File

@@ -1,37 +1,25 @@
import os
import sys
import logging
from concurrent import futures
import os
import re
import sys
import traceback
import grpc
import uuid
import atexit
from concurrent import futures
from typing import Optional
import grpc
from grpc_health.v1 import health_pb2_grpc
from grpc_health.v1.health import HealthServicer
# Add parent directory to Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
from docreader.models.read_config import ChunkingConfig
from docreader.parser import Parser
from docreader.parser.ocr_engine import OCREngine
from docreader.proto import docreader_pb2_grpc
from docreader.proto.docreader_pb2 import Chunk, Image, ReadResponse
from docreader.utils.request import init_logging_request_id, request_id_context
from proto.docreader_pb2 import ReadResponse, Chunk, Image
from proto import docreader_pb2_grpc
from parser import Parser, OCREngine
from parser.config import ChunkingConfig
from utils.request import request_id_context, init_logging_request_id
# --- Encoding utilities: sanitize strings to valid UTF-8 and (optionally) multi-encoding read ---
import re
from typing import Optional
try:
# Optional dependency for charset detection; install via `pip install charset-normalizer`
from charset_normalizer import from_bytes as _cn_from_bytes # type: ignore
except Exception: # pragma: no cover
_cn_from_bytes = None # type: ignore
# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values and cannot be encoded to UTF-8
# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values
# cannot be encoded to UTF-8
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]")
@@ -47,29 +35,6 @@ def to_valid_utf8_text(s: Optional[str]) -> str:
return s.encode("utf-8", errors="replace").decode("utf-8")
def read_text_with_fallback(file_path: str) -> str:
"""Read text from file supporting multiple encodings with graceful fallback.
This server currently receives bytes over gRPC and delegates decoding to the parser.
This helper is provided for future local-file reads if needed.
"""
with open(file_path, "rb") as f:
raw = f.read()
if _cn_from_bytes is not None:
try:
result = _cn_from_bytes(raw).best()
if result:
return str(result)
except Exception:
pass
for enc in ("utf-8", "gb18030", "latin-1"):
try:
return raw.decode(enc, errors="replace")
except UnicodeDecodeError:
continue
return raw.decode("utf-8", errors="replace")
# Ensure no existing handlers
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)
@@ -113,7 +78,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
request.file_type or os.path.splitext(request.file_name)[1][1:]
)
logger.info(
f"Received ReadFromFile request for file: {request.file_name}, type: {file_type}"
f"ReadFromFile for file: {request.file_name}, type: {file_type}"
)
logger.info(f"File content size: {len(request.file_content)} bytes")
@@ -124,8 +89,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
enable_multimodal = request.read_config.enable_multimodal or False
logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
f"multimodal={enable_multimodal}"
f"Using chunking config: size={chunk_size}, "
f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
)
# Get Storage and VLM config from request
@@ -144,7 +109,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
"path_prefix": sc.path_prefix,
}
logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
f"Using Storage config: provider={storage_config.get('provider')}, "
f"bucket={storage_config['bucket_name']}"
)
vlm_config = {
@@ -170,7 +136,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
)
# Parse file
logger.info(f"Starting file parsing process")
logger.info("Starting file parsing process")
result = self.parser.parse_file(
request.file_name, file_type, request.file_content, chunking_config
)
@@ -184,7 +150,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
# Convert to protobuf message
logger.info(
f"Successfully parsed file {request.file_name}, returning {len(result.chunks)} chunks"
f"Parsed file {request.file_name}, with {len(result.chunks)} chunks"
)
# Build response, including image info
@@ -224,8 +190,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
enable_multimodal = request.read_config.enable_multimodal or False
logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
f"multimodal={enable_multimodal}"
f"Using chunking config: size={chunk_size}, "
f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
)
# Get Storage and VLM config from request
@@ -243,7 +209,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
"path_prefix": sc.path_prefix,
}
logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
f"Using Storage config: provider={storage_config.get('provider')}, "
f"bucket={storage_config['bucket_name']}"
)
vlm_config = {
@@ -269,7 +236,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
)
# Parse URL
logger.info(f"Starting URL parsing process")
logger.info("Starting URL parsing process")
result = self.parser.parse_url(
request.url, request.title, chunking_config
)
@@ -282,7 +249,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
# Convert to protobuf message, including image info
logger.info(
f"Successfully parsed URL {request.url}, returning {len(result.chunks)} chunks"
f"Parsed URL {request.url}, returning {len(result.chunks)} chunks"
)
response = ReadResponse(
@@ -335,29 +302,15 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
return proto_chunk
def init_ocr_engine(ocr_backend, ocr_config):
def init_ocr_engine(ocr_backend: Optional[str] = None, **kwargs):
"""Initialize OCR engine"""
try:
logger.info(f"Initializing OCR engine with backend: {ocr_backend}")
ocr_engine = OCREngine.get_instance(backend_type=ocr_backend, **ocr_config)
if ocr_engine:
logger.info("OCR engine initialized successfully")
return True
else:
logger.error("OCR engine initialization failed")
return False
except Exception as e:
logger.error(f"Error initializing OCR engine: {str(e)}")
return False
backend_type = ocr_backend or os.getenv("OCR_BACKEND", "paddle")
logger.info(f"Initializing OCR engine with backend: {backend_type}")
OCREngine.get_instance(backend_type=backend_type, **kwargs)
def main():
init_ocr_engine(
os.getenv("OCR_BACKEND", "paddle"),
{
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
},
)
init_ocr_engine()
# Set max number of worker threads
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
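A sketch of driving the simplified initializer, assuming the server module is importable as docreader.main (as the Dockerfile CMD suggests) and that OCREngine.get_instance forwards extra keyword config to the selected backend:

import os

from docreader.main import init_ocr_engine

# Rely on the environment default: OCR_BACKEND, falling back to "paddle".
init_ocr_engine()

# Or pick a backend explicitly and pass backend-specific settings; the
# OCR_API_BASE_URL keyword mirrors the old dict-based call and is illustrative.
init_ocr_engine("paddle", OCR_API_BASE_URL=os.getenv("OCR_API_BASE_URL", ""))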

View File

View File

@@ -0,0 +1,87 @@
"""Chunk document schema."""
import json
from typing import Any, Dict, List
from pydantic import BaseModel, Field
class Chunk(BaseModel):
"""Document Chunk including chunk content, chunk metadata."""
content: str = Field(default="", description="chunk text content")
seq: int = Field(default=0, description="Chunk sequence number")
start: int = Field(default=0, description="Chunk start position")
end: int = Field(description="Chunk end position")
images: List[Dict[str, Any]] = Field(
default_factory=list, description="Images in the chunk"
)
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="metadata fields",
)
def to_dict(self, **kwargs: Any) -> Dict[str, Any]:
"""Convert Chunk to dict."""
data = self.model_dump()
data.update(kwargs)
data["class_name"] = self.__class__.__name__
return data
def to_json(self, **kwargs: Any) -> str:
"""Convert Chunk to json."""
data = self.to_dict(**kwargs)
return json.dumps(data)
def __hash__(self):
"""Hash function."""
return hash((self.content,))
def __eq__(self, other):
"""Equal function."""
return self.content == other.content
@classmethod
def from_dict(cls, data: Dict[str, Any], **kwargs: Any): # type: ignore
"""Create Chunk from dict."""
if isinstance(kwargs, dict):
data.update(kwargs)
data.pop("class_name", None)
return cls(**data)
@classmethod
def from_json(cls, data_str: str, **kwargs: Any): # type: ignore
"""Create Chunk from json."""
data = json.loads(data_str)
return cls.from_dict(data, **kwargs)
class Document(BaseModel):
"""Document including document content, document metadata."""
model_config = {"arbitrary_types_allowed": True}
content: str = Field(default="", description="document text content")
images: Dict[str, str] = Field(
default_factory=dict, description="Images in the document"
)
chunks: List[Chunk] = Field(default_factory=list, description="document chunks")
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="metadata fields",
)
def set_content(self, content: str) -> None:
"""Set document content."""
self.content = content
def get_content(self) -> str:
"""Get document content."""
return self.content
def is_valid(self) -> bool:
return self.content != ""
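A minimal usage sketch of the new models with the fields defined above; note that Chunk.end has no default and must be supplied (the offsets here are illustrative):

from docreader.models.document import Chunk, Document

doc = Document(content="# Title\n\nHello world")
doc.chunks = [
    Chunk(content="# Title", seq=0, start=0, end=7),
    Chunk(content="Hello world", seq=1, start=9, end=20),
]

assert doc.is_valid()                    # non-empty content
payload = doc.chunks[0].to_json()        # adds class_name="Chunk"
restored = Chunk.from_json(payload)      # from_dict drops class_name again
assert restored == doc.chunks[0]         # equality compares content only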

View File

@@ -0,0 +1,27 @@
from dataclasses import dataclass, field
@dataclass
class ChunkingConfig:
"""
Configuration for text chunking process.
Controls how documents are split into smaller pieces for processing.
"""
# Maximum size of each chunk in tokens/chars
chunk_size: int = 512
# Number of tokens/chars to overlap between chunks
chunk_overlap: int = 50
# Text separators in order of priority
separators: list = field(default_factory=lambda: ["\n\n", "\n", ""])
# Whether to enable multimodal processing (text + images)
enable_multimodal: bool = False
# Preferred field name going forward
storage_config: dict[str, str] = field(default_factory=dict)
# VLM configuration for image captioning
vlm_config: dict[str, str] = field(default_factory=dict)
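A sketch of building the config the way the gRPC servicer does, using only the fields defined here (the dict values are placeholders):

from docreader.models.read_config import ChunkingConfig

config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=50,
    enable_multimodal=True,
    # Plain str->str mappings handed through to the storage backend
    # and to the VLM caption service respectively.
    storage_config={"provider": "minio", "bucket_name": "weknora-docs"},
    vlm_config={"base_url": "http://localhost:8000/v1", "model_name": "qwen-vl"},
)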

View File

@@ -13,22 +13,18 @@ The parsers extract content from documents and can split them into
meaningful chunks for further processing and indexing.
"""
from .base_parser import BaseParser, ParseResult
from .docx_parser import DocxParser
from .doc_parser import DocParser
from .pdf_parser import PDFParser
from .markdown_parser import MarkdownParser
from .text_parser import TextParser
from .docx2_parser import Docx2Parser
from .image_parser import ImageParser
from .web_parser import WebParser
from .markdown_parser import MarkdownParser
from .parser import Parser
from .config import ChunkingConfig
from .ocr_engine import OCREngine
from .pdf_parser import PDFParser
from .text_parser import TextParser
from .web_parser import WebParser
# Export public classes and modules
__all__ = [
"BaseParser", # Base parser class that all format parsers inherit from
"DocxParser", # Parser for .docx files (modern Word documents)
"Docx2Parser", # Parser for .docx files (modern Word documents)
"DocParser", # Parser for .doc files (legacy Word documents)
"PDFParser", # Parser for PDF documents
"MarkdownParser", # Parser for Markdown text files
@@ -36,7 +32,4 @@ __all__ = [
"ImageParser", # Parser for images with text content
"WebParser", # Parser for web pages
"Parser", # Main parser factory that selects the appropriate parser
"ChunkingConfig", # Configuration for text chunking behavior
"ParseResult", # Standard result format returned by all parsers
"OCREngine", # OCR engine for extracting text from images
]

View File

@@ -1,65 +1,28 @@
# -*- coding: utf-8 -*-
import re
import os
import asyncio
from typing import List, Dict, Any, Optional, Tuple, Union
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
import logging
import sys
import traceback
import numpy as np
import time
import io
import json
from .ocr_engine import OCREngine
from .image_utils import image_to_base64
from .config import ChunkingConfig
from .storage import create_storage
import logging
import os
import re
import time
from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Tuple
import requests
from PIL import Image
# Add parent directory to Python path for src imports
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
try:
from services.docreader.src.parser.caption import Caption
except ImportError:
# Fallback: try relative import
try:
from .caption import Caption
except ImportError:
# If both imports fail, set to None
Caption = None
logging.warning(
"Failed to import Caption, image captioning will be unavailable"
)
from docreader.models.document import Chunk, Document
from docreader.models.read_config import ChunkingConfig
from docreader.parser.caption import Caption
from docreader.parser.ocr_engine import OCREngine
from docreader.parser.storage import create_storage
from docreader.splitter.splitter import TextSplitter
from docreader.utils import endecode
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
@dataclass
class Chunk:
"""Chunk result"""
content: str # Chunk content
seq: int # Chunk sequence number
start: int # Chunk start position
end: int # Chunk end position
images: List[Dict[str, Any]] = field(default_factory=list) # Images in the chunk
@dataclass
class ParseResult:
"""Parse result"""
text: str # Extracted text content
chunks: Optional[List[Chunk]] = None # Chunk results
class BaseParser(ABC):
"""Base parser interface"""
@@ -97,17 +60,17 @@ class BaseParser(ABC):
def __init__(
self,
file_name: str = "",
file_type: str = None,
file_type: Optional[str] = None,
enable_multimodal: bool = True,
chunk_size: int = 1000,
chunk_overlap: int = 200,
separators: list = ["\n\n", "\n", ""],
separators: list[str] = ["\n\n", "\n", ""],
ocr_backend: str = "paddle",
ocr_config: dict = None,
ocr_config: dict = {},
max_image_size: int = 1920, # Maximum image size
max_concurrent_tasks: int = 5, # Max concurrent tasks
max_chunks: int = 1000, # Max number of returned chunks
chunking_config: ChunkingConfig = None, # Chunking configuration object
chunking_config: Optional[ChunkingConfig] = None,
):
"""Initialize parser
@@ -125,7 +88,6 @@ class BaseParser(ABC):
max_chunks: Max number of returned chunks
"""
# Storage client instance
self._storage = None
self.file_name = file_name
self.file_type = file_type or os.path.splitext(file_name)[1]
self.enable_multimodal = enable_multimodal
@@ -133,15 +95,16 @@ class BaseParser(ABC):
self.chunk_overlap = chunk_overlap
self.separators = separators
self.ocr_backend = os.getenv("OCR_BACKEND", ocr_backend)
self.ocr_config = ocr_config or {}
self.ocr_config = ocr_config
self.max_image_size = max_image_size
self.max_concurrent_tasks = max_concurrent_tasks
self.max_chunks = max_chunks
self.chunking_config = chunking_config
logger.info(
f"Initializing {self.__class__.__name__} for file: {file_name}, type: {self.file_type}"
self.storage = create_storage(
self.chunking_config.storage_config if self.chunking_config else None
)
logger.info(f"Initializing parser for file: {file_name}, type: {file_type}")
logger.info(
f"Parser config: chunk_size={chunk_size}, "
f"overlap={chunk_overlap}, "
@@ -150,16 +113,24 @@ class BaseParser(ABC):
f"max_chunks={max_chunks}"
)
# Only initialize Caption service if multimodal is enabled
if self.enable_multimodal:
try:
self.caption_parser = Caption(self.chunking_config.vlm_config)
except Exception as e:
logger.warning(f"Failed to initialize Caption service: {str(e)}")
self.caption_parser = None
else:
self.caption_parser = None
vlm_config = self.chunking_config.vlm_config if self.chunking_config else None
self.caption_parser = (
Caption(vlm_config=vlm_config) if self.enable_multimodal else None
)
def perform_ocr(self, image):
@abstractmethod
def parse_into_text(self, content: bytes) -> Document:
"""Parse document content
Args:
content: Document content
Returns:
Either a string containing the parsed text, or a tuple of (text, image_map)
where image_map is a dict mapping image URLs to Image objects
"""
def perform_ocr(self, image: Image.Image):
"""Execute OCR recognition on the image
Args:
@@ -170,53 +141,23 @@ class BaseParser(ABC):
"""
start_time = time.time()
logger.info("Starting OCR recognition")
resized_image = None
try:
# Resize image to avoid processing large images
resized_image = self._resize_image_if_needed(image)
# Resize image to avoid processing large images
resized_image = self._resize_image_if_needed(image)
# Get OCR engine
ocr_engine = self.get_ocr_engine(
backend_type=self.ocr_backend, **self.ocr_config
)
if ocr_engine is None:
logger.error(
f"OCR engine ({self.ocr_backend}) initialization failed or unavailable, "
"skipping OCR recognition"
)
return ""
# Get OCR engine
ocr_engine = OCREngine.get_instance(self.ocr_backend)
# Execute OCR prediction
logger.info(f"Executing OCR prediction (using {self.ocr_backend} engine)")
# Add extra exception handling
try:
ocr_result = ocr_engine.predict(resized_image)
except RuntimeError as e:
# Handle common CUDA memory issues or other runtime errors
logger.error(f"OCR prediction runtime error: {str(e)}")
return ""
except Exception as e:
# Handle other prediction errors
logger.error(f"Unexpected OCR prediction error: {str(e)}")
return ""
# Execute OCR prediction
logger.info(f"Executing OCR prediction (using {self.ocr_backend} engine)")
ocr_result = ocr_engine.predict(resized_image)
process_time = time.time() - start_time
logger.info(f"OCR recognition completed, time: {process_time:.2f} seconds")
return ocr_result
except Exception as e:
process_time = time.time() - start_time
logger.error(
f"OCR recognition error: {str(e)}, time: {process_time:.2f} seconds"
)
return ""
finally:
# Release image resources
if resized_image is not image and hasattr(resized_image, "close"):
# Only close the new image we created, not the original image
resized_image.close()
process_time = time.time() - start_time
logger.info(f"OCR recognition completed, time: {process_time:.2f} seconds")
def _resize_image_if_needed(self, image):
return ocr_result
def _resize_image_if_needed(self, image: Image.Image) -> Image.Image:
"""Resize image if it exceeds maximum size limit
Args:
@@ -225,102 +166,21 @@ class BaseParser(ABC):
Returns:
Resized image object
"""
try:
# If it's a PIL Image
if hasattr(image, "size"):
width, height = image.size
if width > self.max_image_size or height > self.max_image_size:
logger.info(f"Resizing PIL image, original size: {width}x{height}")
scale = min(
self.max_image_size / width, self.max_image_size / height
)
new_width = int(width * scale)
new_height = int(height * scale)
resized_image = image.resize((new_width, new_height))
logger.info(f"Resized to: {new_width}x{new_height}")
return resized_image
else:
logger.info(
f"PIL image size {width}x{height} is within limits, no resizing needed"
)
return image
# If it's a numpy array
elif hasattr(image, "shape"):
height, width = image.shape[:2]
if width > self.max_image_size or height > self.max_image_size:
logger.info(
f"Resizing numpy image, original size: {width}x{height}"
)
scale = min(
self.max_image_size / width, self.max_image_size / height
)
new_width = int(width * scale)
new_height = int(height * scale)
# Use PIL for resizing numpy arrays
pil_image = Image.fromarray(image)
resized_pil = pil_image.resize((new_width, new_height))
resized_image = np.array(resized_pil)
logger.info(f"Resized to: {new_width}x{new_height}")
return resized_image
else:
logger.info(
f"Numpy image size {width}x{height} is within limits, no resizing needed"
)
return image
else:
logger.warning(f"Unknown image type: {type(image)}, cannot resize")
return image
except Exception as e:
logger.error(f"Error resizing image: {str(e)}")
return image
width, height = image.size
if width > self.max_image_size or height > self.max_image_size:
logger.info(f"Resizing PIL image, original size: {width}x{height}")
scale = min(self.max_image_size / width, self.max_image_size / height)
new_width = int(width * scale)
new_height = int(height * scale)
resized_image = image.resize((new_width, new_height))
logger.info(f"Resized to: {new_width}x{new_height}")
return resized_image
def process_image(self, image, image_url=None):
"""Process image: first perform OCR, then get caption if text is available
logger.info(f"PIL image size is {width}x{height}, no resizing needed")
return image
Args:
image: Image object (PIL.Image or numpy array)
image_url: Image URL (if uploaded)
Returns:
tuple: (ocr_text, caption, image_url)
- ocr_text: OCR extracted text
- caption: Image description (if OCR has text) or empty string
- image_url: Image URL (if provided)
"""
logger.info("Starting image processing (OCR + optional caption)")
# Resize image
image = self._resize_image_if_needed(image)
# Perform OCR recognition
ocr_text = self.perform_ocr(image)
caption = ""
if self.caption_parser:
logger.info(
f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption"
)
# Convert image to base64 for caption generation
img_base64 = image_to_base64(image)
if img_base64:
caption = self.get_image_caption(img_base64)
if caption:
logger.info(f"Successfully obtained image caption: {caption}")
else:
logger.warning("Failed to get caption")
else:
logger.warning("Failed to convert image to base64")
caption = ""
else:
logger.info("Caption service not initialized, skipping caption retrieval")
# Release image resources
del image
return ocr_text, caption, image_url
async def process_image_async(self, image, image_url=None):
"""Asynchronously process image: first perform OCR, then get caption if text is available
async def process_image_async(self, image: Image.Image, image_url: str):
"""Asynchronously process image: first perform OCR, then get caption
Args:
image: Image object (PIL.Image or numpy array)
@@ -333,84 +193,47 @@ class BaseParser(ABC):
- image_url: Image URL (if provided)
"""
logger.info("Starting asynchronous image processing (OCR + optional caption)")
resized_image = None
# Resize image
resized_image = self._resize_image_if_needed(image)
try:
# Resize image
resized_image = self._resize_image_if_needed(image)
# Perform OCR recognition (using run_in_executor to execute synchronous operations in the event loop)
# Perform OCR recognition
loop = asyncio.get_event_loop()
try:
# Add timeout mechanism to avoid infinite blocking (30 seconds timeout)
ocr_task = loop.run_in_executor(None, self.perform_ocr, resized_image)
ocr_text = await asyncio.wait_for(ocr_task, timeout=30.0)
except asyncio.TimeoutError:
logger.error(
"OCR processing timed out (30 seconds), skipping this image"
)
ocr_text = ""
except Exception as e:
logger.error(f"OCR processing error: {str(e)}")
logger.error(f"OCR processing error, skipping this image: {str(e)}")
ocr_text = ""
logger.info(
f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption"
)
caption = ""
if self.caption_parser:
try:
# Convert image to base64 for caption generation
img_base64 = image_to_base64(resized_image)
if img_base64:
# Add timeout to avoid blocking caption retrieval (30 seconds timeout)
caption_task = self.get_image_caption_async(img_base64)
image_data, caption = await asyncio.wait_for(
caption_task, timeout=30.0
)
if caption:
logger.info(
f"Successfully obtained image caption: {caption}"
)
else:
logger.warning("Failed to get caption")
else:
logger.warning("Failed to convert image to base64")
caption = ""
except asyncio.TimeoutError:
logger.warning("Caption retrieval timed out, skipping")
except Exception as e:
logger.error(f"Failed to get caption: {str(e)}")
else:
logger.info(
"Caption service not initialized, skipping caption retrieval"
)
logger.info(f"Successfully obtained image ocr: {ocr_text}")
img_base64 = endecode.decode_image(resized_image)
caption = self.get_image_caption(img_base64)
logger.info(f"Successfully obtained image caption: {caption}")
return ocr_text, caption, image_url
finally:
# Release image resources
if resized_image is not image and hasattr(resized_image, "close"):
# Only close the new image we created, not the original image
resized_image.close()
resized_image.close()
async def process_with_limit(self, idx, image, url, semaphore):
async def process_with_limit(
self, idx: int, image: Image.Image, url: str, semaphore: asyncio.Semaphore
):
"""Function to process a single image using a semaphore"""
try:
logger.info(f"Waiting to process image {idx+1}")
logger.info(f"Waiting to process image {idx + 1}")
async with semaphore: # Use semaphore to control concurrency
logger.info(f"Starting to process image {idx+1}")
logger.info(f"Starting to process image {idx + 1}")
result = await self.process_image_async(image, url)
logger.info(f"Completed processing image {idx+1}")
logger.info(f"Completed processing image {idx + 1}")
return result
except Exception as e:
logger.error(f"Error processing image {idx+1}: {str(e)}")
logger.error(f"Error processing image {idx + 1}: {str(e)}")
return ("", "", url) # Return empty result to avoid overall failure
finally:
# Manually release image resources
if hasattr(image, "close"):
image.close()
image.close()
async def process_multiple_images(self, images_data):
async def process_multiple_images(self, images_data: List[Tuple[Image.Image, str]]):
"""Process multiple images concurrently
Args:
@@ -450,7 +273,7 @@ class BaseParser(ABC):
for i, result in enumerate(completed_results):
if isinstance(result, Exception):
logger.error(
f"Image {i+1} processing returned an exception: {str(result)}"
f"Image {i + 1} processing returned an exception: {str(result)}"
)
# For exceptions, add empty results
if i < len(images_data):
@@ -467,47 +290,10 @@ class BaseParser(ABC):
logger.info("Image processing resource cleanup complete")
logger.info(
f"Completed concurrent processing of {len(results)}/{len(images_data)} images"
f"Concurrent processing of {len(results)}/{len(images_data)} images"
)
return results
def decode_bytes(self, content: bytes) -> str:
"""Intelligently decode byte stream, supports multiple encodings
Tries to decode in common encodings, if all fail, uses latin-1 as fallback
Args:
content: Byte stream to decode
Returns:
Decoded string
"""
logger.info(f"Attempting to decode bytes of length: {len(content)}")
# Common encodings, sorted by priority
encodings = ["utf-8", "gb18030", "gb2312", "gbk", "big5", "ascii", "latin-1"]
text = None
# Try decoding with each encoding format
for encoding in encodings:
try:
text = content.decode(encoding)
logger.info(f"Successfully decoded content using {encoding} encoding")
break
except UnicodeDecodeError:
logger.info(f"Failed to decode using {encoding} encoding")
continue
# If all encodings fail, use latin-1 as fallback
if text is None:
text = content.decode("latin-1")
logger.warning(
f"Unable to determine correct encoding, using latin-1 as fallback. "
f"This may cause character issues."
)
logger.info(f"Decoded text length: {len(text)} characters")
return text
def get_image_caption(self, image_data: str) -> str:
"""Get image description
@@ -517,6 +303,9 @@ class BaseParser(ABC):
Returns:
Image description
"""
if not self.caption_parser:
logger.warning("Caption parser not initialized")
return ""
start_time = time.time()
logger.info(
f"Getting caption for image: {image_data[:250]}..."
@@ -533,80 +322,7 @@ class BaseParser(ABC):
logger.warning("Failed to get caption for image")
return caption
async def get_image_caption_async(self, image_data: str) -> Tuple[str, str]:
"""Asynchronously get image description
Args:
image_data: Image data (base64 encoded string or URL)
Returns:
Tuple[str, str]: Image data and corresponding description
"""
caption = self.get_image_caption(image_data)
return image_data, caption
def __init_storage(self):
"""Initialize storage client based on configuration"""
if self._storage is None:
storage_config = (
self.chunking_config.storage_config if self.chunking_config else None
)
self._storage = create_storage(storage_config)
logger.info(
f"Initialized storage client: {self._storage.__class__.__name__}"
)
return self._storage
def upload_file(self, file_path: str) -> str:
"""Upload file to object storage
Args:
file_path: File path
Returns:
File URL
"""
logger.info(f"Uploading file: {file_path}")
try:
storage = self.__init_storage()
return storage.upload_file(file_path)
except Exception as e:
logger.error(f"Failed to upload file: {str(e)}")
return ""
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to object storage
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
logger.info(f"Uploading bytes content, size: {len(content)} bytes")
try:
storage = self.__init_storage()
return storage.upload_bytes(content, file_ext)
except Exception as e:
logger.error(f"Failed to upload bytes to storage: {str(e)}")
traceback.print_exc()
return ""
@abstractmethod
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""Parse document content
Args:
content: Document content
Returns:
Either a string containing the parsed text, or a tuple of (text, image_map)
where image_map is a dict mapping image URLs to Image objects
"""
pass
def parse(self, content: bytes) -> ParseResult:
def parse(self, content: bytes) -> Document:
"""Parse document content
Args:
@@ -616,17 +332,19 @@ class BaseParser(ABC):
Parse result
"""
logger.info(
f"Parsing document with {self.__class__.__name__}, content size: {len(content)} bytes"
f"Parsing document with {self.__class__.__name__}, bytes: {len(content)}"
)
parse_result = self.parse_into_text(content)
if isinstance(parse_result, tuple):
text, image_map = parse_result
else:
text = parse_result
image_map = {}
logger.info(f"Extracted {len(text)} characters of text from {self.file_name}")
logger.info(f"Beginning chunking process for text")
chunks = self.chunk_text(text)
document = self.parse_into_text(content)
logger.info(
f"Extracted {len(document.content)} characters from {self.file_name}"
)
splitter = TextSplitter(
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
separators=self.separators,
)
chunk_str = splitter.split_text(document.content)
chunks = self._str_to_chunk(chunk_str)
logger.info(f"Created {len(chunks)} chunks from document")
# Limit the number of returned chunks
@@ -636,7 +354,7 @@ class BaseParser(ABC):
)
chunks = chunks[: self.max_chunks]
# If multimodal is enabled and file type is supported, process images in each chunk
# If multimodal is enabled and file type is supported, process images
if self.enable_multimodal:
# Get file extension and convert to lowercase
file_ext = (
@@ -647,11 +365,12 @@ class BaseParser(ABC):
# Define allowed file types for image processing
allowed_types = [
".pdf", # PDF files
# Text files
".pdf",
".md",
".markdown", # Markdown files
".markdown",
".doc",
".docx", # Word documents
".docx",
# Image files
".jpg",
".jpeg",
@@ -666,13 +385,21 @@ class BaseParser(ABC):
logger.info(
f"Processing images in each chunk for file type: {file_ext}"
)
chunks = self.process_chunks_images(chunks, image_map)
chunks = self.process_chunks_images(chunks, document.images)
else:
logger.info(
f"Skipping image processing for unsupported file type: {file_ext}"
)
return ParseResult(text=text, chunks=chunks)
document.chunks = chunks
return document
def _str_to_chunk(self, text: List[Tuple[int, int, str]]) -> List[Chunk]:
"""Convert string to Chunk object"""
return [
Chunk(seq=i, content=t, start=start, end=end)
for i, (start, end, t) in enumerate(text)
]
def _split_into_units(self, text: str) -> List[str]:
"""
@@ -682,9 +409,7 @@ class BaseParser(ABC):
Returns:
基本单元的列表
"""
logger.info(
f"Splitting text into basic units with robust structure protection, text length: {len(text)}"
)
logger.info(f"Splitting text into basic units, text length: {len(text)}")
# 定义所有需要作为整体保护的结构模式 ---
table_pattern = r"(?m)(^\|.*\|[ \t]*\r?\n(?:[ \t]*\r?\n)?^\|\s*:?--+.*\r?\n(?:^\|.*\|\r?\n?)*)"
@@ -710,7 +435,8 @@ class BaseParser(ABC):
# 按起始位置排序
protected_ranges.sort(key=lambda x: x[0])
logger.info(
f"Found {len(protected_ranges)} protected structures (tables, code, formulas, images, links)."
f"Found {len(protected_ranges)} protected structures "
"(tables, code, formulas, images, links)."
)
# 合并可能重叠的保护范围 ---
@@ -731,7 +457,7 @@ class BaseParser(ABC):
merged_ranges.append((current_start, current_end))
protected_ranges = merged_ranges
logger.info(
f"After merging overlaps, {len(protected_ranges)} protected ranges remain."
f"After overlaps, {len(protected_ranges)} protected ranges remain."
)
# 根据保护范围和分隔符来分割文本 ---
@@ -749,7 +475,7 @@ class BaseParser(ABC):
segments = re.split(separator_pattern, pre_text)
units.extend([s for s in segments if s]) # 添加所有非空部分
# b. 将整个受保护的块(例如,一个完整的表格)作为一个单独的、不可分割的单元添加
# b. 将整个受保护的块(例如,一个完整的表格)作为一个不可分割的单元添加
protected_text = text[start:end]
units.append(protected_text)
@@ -764,38 +490,6 @@ class BaseParser(ABC):
logger.info(f"Text splitting complete, created {len(units)} final basic units.")
return units
def _find_complete_units(self, units: List[str], target_size: int) -> List[str]:
"""Find a list of complete units that do not exceed the target size
Args:
units: List of units
target_size: Target size
Returns:
List of complete units
"""
logger.info(f"Finding complete units with target size: {target_size}")
result = []
current_size = 0
for unit in units:
unit_size = len(unit)
if current_size + unit_size > target_size and result:
logger.info(
f"Reached target size limit at {current_size} characters, stopping"
)
break
result.append(unit)
current_size += unit_size
logger.info(
f"Added unit of size {unit_size}, current total: {current_size}/{target_size}"
)
logger.info(
f"Found {len(result)} complete units totaling {current_size} characters"
)
return result
def chunk_text(self, text: str) -> List[Chunk]:
"""Chunk text, preserving Markdown structure
@@ -825,7 +519,7 @@ class BaseParser(ABC):
for i, unit in enumerate(units):
unit_size = len(unit)
logger.info(f"Processing unit {i+1}/{len(units)}, size: {unit_size}")
logger.info(f"Processing unit {i + 1}/{len(units)}, size: {unit_size}")
# If current chunk plus new unit exceeds size limit, create new chunk
if current_size + unit_size > self.chunk_size and current_chunk:
@@ -855,14 +549,12 @@ class BaseParser(ABC):
for u in reversed(current_chunk):
if overlap_size + len(u) > overlap_target:
logger.info(
f"Reached overlap target ({overlap_size}/{overlap_target})"
f"Overlap target ({overlap_size}/{overlap_target})"
)
break
overlap_units.insert(0, u)
overlap_size += len(u)
logger.info(
f"Added unit to overlap, current overlap size: {overlap_size}"
)
logger.info(f"Added unit to overlap, size: {overlap_size}")
# Remove elements from overlap that are included in separators
start_index = 0
@@ -883,7 +575,7 @@ class BaseParser(ABC):
overlap_units = overlap_units[start_index:]
logger.info(
f"Final overlap: {len(overlap_units)} units, {overlap_size} characters"
f"Overlap: {len(overlap_units)} units, {overlap_size} size"
)
current_chunk = overlap_units
@@ -899,7 +591,7 @@ class BaseParser(ABC):
current_chunk.append(unit)
current_size += unit_size
logger.info(
f"Added unit to current chunk, now at {current_size}/{self.chunk_size} characters"
f"Added unit to current chunk, at {current_size}/{self.chunk_size}"
)
# Add the last chunk
@@ -925,12 +617,13 @@ class BaseParser(ABC):
chunk: Document chunk
Returns:
List of image information, each element contains image URL and match position
List of image information
"""
logger.info(f"Extracting image information from Chunk #{chunk.seq}")
text = chunk.content
# Regex to extract image information from text, supporting Markdown images and HTML images
# Regex to extract image information from text,
# support: Markdown images, HTML images
img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)|<img [^>]*src="([^"]+)" [^>]*>'
# Extract image information
@@ -954,28 +647,28 @@ class BaseParser(ABC):
images_info.append(image_info)
logger.info(
f"Image in Chunk #{chunk.seq} {match_idx+1}: " f"URL={img_url[:50]}..."
f"Image in Chunk #{chunk.seq} {match_idx + 1}: URL={img_url[:50]}..."
if len(img_url) > 50
else f"Image in Chunk #{chunk.seq} {match_idx+1}: URL={img_url}"
else f"Image in Chunk #{chunk.seq} {match_idx + 1}: URL={img_url}"
)
return images_info
async def download_and_upload_image(self, img_url: str):
"""Download image and upload to object storage, if it's already an object storage path or local path, use directly
async def download_and_upload_image(
self, img_url: str
) -> Tuple[str, str, Image.Image | None]:
"""Download image and upload to object storage,
if it's already an object storage path or local path, use directly
Args:
img_url: Image URL or local path
Returns:
tuple: (original URL, storage URL, image object), if failed returns (original URL, None, None)
tuple: (original URL, storage URL, image object),
if failed returns (original URL, None, None)
"""
try:
import requests
from PIL import Image
import io
# Check if it's already a storage URL (COS or MinIO)
is_storage_url = any(
pattern in img_url
@@ -997,12 +690,7 @@ class BaseParser(ABC):
response = requests.get(img_url, timeout=5, proxies=proxies)
if response.status_code == 200:
image = Image.open(io.BytesIO(response.content))
try:
return img_url, img_url, image
finally:
# Ensure image resources are also released after the function returns
# Image will be closed by the caller
pass
return img_url, img_url, image
else:
logger.warning(
f"Failed to get storage image: {response.status_code}"
@@ -1022,7 +710,7 @@ class BaseParser(ABC):
# Upload to storage
with open(img_url, "rb") as f:
content = f.read()
storage_url = self.upload_bytes(content)
storage_url = self.storage.upload_bytes(content)
logger.info(
f"Successfully uploaded local image to storage: {storage_url}"
)
@@ -1031,7 +719,7 @@ class BaseParser(ABC):
logger.error(f"Error processing local image: {str(e)}")
if image and hasattr(image, "close"):
image.close()
return img_url, None, None
return img_url, img_url, None
# Normal remote URL download handling
else:
@@ -1044,9 +732,7 @@ class BaseParser(ABC):
if https_proxy:
proxies["https"] = https_proxy
logger.info(
f"Downloading image {img_url}, using proxy: {proxies if proxies else 'None'}"
)
logger.info(f"Downloading image {img_url}, using proxy: {proxies}")
response = requests.get(img_url, timeout=5, proxies=proxies)
if response.status_code == 200:
@@ -1054,7 +740,7 @@ class BaseParser(ABC):
image = Image.open(io.BytesIO(response.content))
try:
# Upload to storage using the method in BaseParser
storage_url = self.upload_bytes(response.content)
storage_url = self.storage.upload_bytes(response.content)
logger.info(
f"Successfully uploaded image to storage: {storage_url}"
)
@@ -1064,11 +750,11 @@ class BaseParser(ABC):
pass
else:
logger.warning(f"Failed to download image: {response.status_code}")
return img_url, None, None
return img_url, img_url, None
except Exception as e:
logger.error(f"Error downloading or processing image: {str(e)}")
return img_url, None, None
return img_url, img_url, None
async def process_chunk_images_async(
self, chunk, chunk_idx, total_chunks, image_map=None
@@ -1086,18 +772,19 @@ class BaseParser(ABC):
"""
logger.info(
f"Starting to process images in Chunk #{chunk_idx+1}/{total_chunks}"
f"Starting to process images in Chunk #{chunk_idx + 1}/{total_chunks}"
)
# Extract image information from the Chunk
images_info = self.extract_images_from_chunk(chunk)
if not images_info:
logger.info(f"Chunk #{chunk_idx+1} found no images")
logger.info(f"Chunk #{chunk_idx + 1} found no images")
return chunk
# Prepare images that need to be downloaded and processed
images_to_process = []
url_to_info_map = {} # Map URL to image information
# Map URL to image information
url_to_info_map = {}
# Record all image URLs that need to be processed
for img_info in images_info:
@@ -1106,14 +793,21 @@ class BaseParser(ABC):
results = []
download_tasks = []
for img_url in url_to_info_map.keys(): # Check if image is already in the image_map
# Check if image is already in the image_map
for img_url in url_to_info_map.keys():
if image_map and img_url in image_map:
logger.info(f"Image already in image_map: {img_url}, using cached object")
results.append((img_url, img_url, image_map[img_url]))
logger.info(
f"Image already in image_map: {img_url}, using cached object"
)
image = Image.open(
io.BytesIO(endecode.encode_image(image_map[img_url]))
)
results.append((img_url, img_url, image))
else:
download_task = self.download_and_upload_image(img_url)
download_tasks.append(download_task)
# Concurrent download and upload of images, ignore images that are already in the image_map
# Concurrent download and upload of images,
# ignore images that are already in the image_map
results.extend(await asyncio.gather(*download_tasks))
# Process download results, prepare for OCR processing
@@ -1123,16 +817,17 @@ class BaseParser(ABC):
img_info["cos_url"] = cos_url
images_to_process.append((image, cos_url))
# If no images were successfully downloaded and uploaded, return the original Chunk
# If no images were successfully downloaded and uploaded,
# return the original Chunk
if not images_to_process:
logger.info(
f"Chunk #{chunk_idx+1} found no successfully downloaded and uploaded images"
f"Chunk #{chunk_idx + 1} not found downloaded and uploaded images"
)
return chunk
# Concurrent processing of all images (OCR + caption)
logger.info(
f"Processing {len(images_to_process)} images in Chunk #{chunk_idx+1}"
f"Processing {len(images_to_process)} images in Chunk #{chunk_idx + 1}"
)
# Concurrent processing of all images
@@ -1163,10 +858,12 @@ class BaseParser(ABC):
# Update image information in the Chunk
chunk.images = processed_images
logger.info(f"Completed image processing in Chunk #{chunk_idx+1}")
logger.info(f"Completed image processing in Chunk #{chunk_idx + 1}")
return chunk
def process_chunks_images(self, chunks: List[Chunk], image_map=None) -> List[Chunk]:
def process_chunks_images(
self, chunks: List[Chunk], image_map: Dict[str, str] = {}
) -> List[Chunk]:
"""Concurrent processing of images in all Chunks
Args:
@@ -1210,7 +907,7 @@ class BaseParser(ABC):
processed_chunks = []
for i, result in enumerate(results):
if isinstance(result, Exception):
logger.error(f"Error processing Chunk {i+1}: {str(result)}")
logger.error(f"Error processing Chunk {i + 1}: {str(result)}")
# Keep original Chunk
if i < len(chunks):
processed_chunks.append(chunks[i])
@@ -1235,7 +932,7 @@ class BaseParser(ABC):
# Execute processing for all Chunks
processed_chunks = loop.run_until_complete(process_all_chunks())
logger.info(
f"Successfully completed concurrent processing of {len(processed_chunks)}/{len(chunks)} chunks"
f"Completed processing of {len(processed_chunks)}/{len(chunks)} chunks"
)
return processed_chunks
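The reworked parse() now delegates chunking to TextSplitter and converts its (start, end, text) spans through _str_to_chunk; a condensed, standalone sketch of that step under the signatures shown above:

from docreader.models.document import Chunk, Document
from docreader.splitter.splitter import TextSplitter


def chunk_document(document: Document, chunk_size: int = 512,
                   chunk_overlap: int = 50) -> Document:
    """The chunking step that parse() performs, isolated for illustration."""
    splitter = TextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ""],
    )
    # split_text is expected to yield (start, end, text) tuples, which is
    # exactly what _str_to_chunk destructures when numbering chunks.
    spans = splitter.split_text(document.content)
    document.chunks = [
        Chunk(seq=i, content=text, start=start, end=end)
        for i, (start, end, text) in enumerate(spans)
    ]
    return document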

View File

@@ -3,11 +3,10 @@ import logging
import os
import time
from dataclasses import dataclass, field
from typing import List, Optional, Union
from typing import Dict, List, Optional, Union
import requests
import ollama
import requests
logger = logging.getLogger(__name__)
@@ -158,11 +157,16 @@ class CaptionChatResp:
Returns:
The content string from the first choice, or empty string if no choices
"""
if self.choices:
logger.info("Retrieving content from first choice")
return self.choices[0].message.content
logger.warning("No choices available in response")
return ""
if (
not self.choices
or not self.choices[0]
or not self.choices[0].message
or not self.choices[0].message.content
):
logger.warning("No choices available in response")
return ""
logger.info("Retrieving content from first choice")
return self.choices[0].message.content
class Caption:
@@ -171,33 +175,43 @@ class Caption:
Uses an external API to process images and return textual descriptions.
"""
def __init__(self, vlm_config=None):
"""Initialize the Caption service with configuration from parameters or environment variables."""
def __init__(self, vlm_config: Optional[Dict[str, str]] = None):
"""
Initialize the Caption service with configuration
from parameters or environment variables.
"""
logger.info("Initializing Caption service")
self.prompt = """简单凝炼的描述图片的主要内容"""
# Use provided VLM config if available, otherwise fall back to environment variables
self.timeout = 30
# Use provided VLM config if available,
# otherwise fall back to environment variables
if vlm_config and vlm_config.get("base_url") and vlm_config.get("model_name"):
self.completion_url = vlm_config.get("base_url", "") + "/chat/completions"
self.model = vlm_config.get("model_name", "")
self.api_key = vlm_config.get("api_key", "")
self.interface_type = vlm_config.get("interface_type", "openai").lower()
else:
if os.getenv("VLM_MODEL_BASE_URL") == "" or os.getenv("VLM_MODEL_NAME") == "":
base_url = os.getenv("VLM_MODEL_BASE_URL")
model_name = os.getenv("VLM_MODEL_NAME")
if not base_url or not model_name:
logger.error("VLM_MODEL_BASE_URL or VLM_MODEL_NAME is not set")
return
self.completion_url = os.getenv("VLM_MODEL_BASE_URL") + "/chat/completions"
self.model = os.getenv("VLM_MODEL_NAME")
self.api_key = os.getenv("VLM_MODEL_API_KEY")
self.completion_url = base_url + "/chat/completions"
self.model = model_name
self.api_key = os.getenv("VLM_MODEL_API_KEY", "")
self.interface_type = os.getenv("VLM_INTERFACE_TYPE", "openai").lower()
# 验证接口类型
if self.interface_type not in ["ollama", "openai"]:
logger.warning(f"Unknown interface type: {self.interface_type}, defaulting to openai")
logger.warning(
f"Unknown interface type: {self.interface_type}, defaulting to openai"
)
self.interface_type = "openai"
logger.info(
f"Service configured with model: {self.model}, endpoint: {self.completion_url}, interface: {self.interface_type}"
f"Configured with model: {self.model}, "
f"endpoint: {self.completion_url}, interface: {self.interface_type}"
)
def _call_caption_api(self, image_data: str) -> Optional[CaptionChatResp]:
@@ -210,8 +224,8 @@ class Caption:
Returns:
CaptionChatResp object if successful, None otherwise
"""
logger.info(f"Calling Caption API for image captioning")
logger.info(f"Processing image data: {image_data[:50] if len(image_data) > 50 else image_data}")
logger.info("Calling Caption API for image captioning")
logger.info(f"Processing image data: {image_data[:50]}...")
# 根据接口类型选择调用方式
if self.interface_type == "ollama":
@@ -226,39 +240,35 @@ class Caption:
client = ollama.Client(
host=host,
timeout=self.timeout,
)
try:
logger.info(f"Calling Ollama API with model: {self.model}")
# 调用Ollama API使用images参数传递base64编码的图片
response = client.generate(
model=self.model,
prompt="简单凝炼的描述图片的主要内容",
images=[image_base64], # image_base64是base64编码的图片数据
images=[image_base64], # image_base64是base64编码的图片数据
options={"temperature": 0.1},
stream=False,
)
# 构造响应对象
caption_resp = CaptionChatResp(
id="ollama_response",
created=int(time.time()),
model=self.model,
model=Model(id=self.model),
object="chat.completion",
choices=[
Choice(
message=Message(
role="assistant",
content=response.response
)
)
]
Choice(message=Message(role="assistant", content=response.response))
],
)
logger.info("Successfully received response from Ollama API")
return caption_resp
except Exception as e:
logger.error(f"Error calling Ollama API: {e}")
return None
@@ -266,13 +276,16 @@ class Caption:
def _call_openai_api(self, image_base64: str) -> Optional[CaptionChatResp]:
"""Call OpenAI-compatible API for image captioning."""
logger.info(f"Calling OpenAI-compatible API with model: {self.model}")
user_msg = UserMessage(
role="user",
content=[
Content(type="text", text=self.prompt),
Content(
type="image_url", image_url=ImageUrl(url="data:image/png;base64," + image_base64, detail="auto")
type="image_url",
image_url=ImageUrl(
url="data:image/png;base64," + image_base64, detail="auto"
),
),
],
)
@@ -295,23 +308,23 @@ class Caption:
headers["Authorization"] = f"Bearer {self.api_key}"
try:
logger.info(f"Sending request to OpenAI-compatible API with model: {self.model}")
logger.info(
f"Sending request to OpenAI-compatible API with model: {self.model}"
)
response = requests.post(
self.completion_url,
data=json.dumps(gpt_req, default=lambda o: o.__dict__, indent=4),
headers=headers,
timeout=30,
timeout=self.timeout,
)
if response.status_code != 200:
logger.error(
f"OpenAI-compatible API returned non-200 status code: {response.status_code}"
f"OpenAI API returned non-200 status code: {response.status_code}"
)
response.raise_for_status()
logger.info(
f"Successfully received response from OpenAI-compatible API with status: {response.status_code}"
)
logger.info(f"Converting response to CaptionChatResp object")
logger.info(f"Received from OpenAI with status: {response.status_code}")
logger.info("Converting response to CaptionChatResp object")
caption_resp = CaptionChatResp.from_json(response.json())
if caption_resp.usage:
@@ -322,7 +335,7 @@ class Caption:
return caption_resp
except requests.exceptions.Timeout:
logger.error(f"Timeout while calling OpenAI-compatible API after 30 seconds")
logger.error("Timeout while calling OpenAI-compatible API after 30 seconds")
return None
except requests.exceptions.RequestException as e:
logger.error(f"Request error calling OpenAI-compatible API: {e}")

View File

@@ -0,0 +1,70 @@
import logging
from typing import List, Tuple, Type
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
logger = logging.getLogger(__name__)
class FirstParser(BaseParser):
_parser_cls: Tuple[Type["BaseParser"], ...] = ()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._parsers: List[BaseParser] = []
for parser_cls in self._parser_cls:
try:
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
except Exception as e:
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
def parse_into_text(self, content: bytes) -> Document:
for p in self._parsers:
document = p.parse_into_text(content)
if document.is_valid():
return document
return Document()
@classmethod
def create(cls, *parser_classes: Type["BaseParser"]) -> Type["FirstParser"]:
names = "_".join([p.__name__ for p in parser_classes])
return type(f"FirstParser_{names}", (cls,), {"_parser_cls": parser_classes})
class PipelineParser(BaseParser):
_parser_cls: Tuple[Type["BaseParser"], ...] = ()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._parsers: List[BaseParser] = []
for parser_cls in self._parser_cls:
try:
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
except Exception as e:
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
def parse_into_text(self, content: bytes) -> Document:
document = Document()
for p in self._parsers:
document = p.parse_into_text(content)
content = endecode.encode_bytes(document.content)
return document
@classmethod
def create(cls, *parser_classes: Type["BaseParser"]) -> Type["PipelineParser"]:
names = "_".join([p.__name__ for p in parser_classes])
return type(f"PipelineParser_{names}", (cls,), {"_parser_cls": parser_classes})
if __name__ == "__main__":
from docreader.parser.markdown_parser import MarkdownParser
cls = FirstParser.create(MarkdownParser)
parser = cls()
print(parser.parse_into_text(b"aaa"))
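The __main__ block above exercises FirstParser; a companion sketch for PipelineParser, assuming it is run from the same module and composed from two parsers exported by docreader.parser (each stage re-parses the previous stage's text):

from docreader.parser.markdown_parser import MarkdownParser
from docreader.parser.text_parser import TextParser

PipelineCls = PipelineParser.create(TextParser, MarkdownParser)
pipeline = PipelineCls()
print(pipeline.parse_into_text(b"# heading\n\nbody text").content)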

View File

@@ -1,21 +0,0 @@
from dataclasses import dataclass, field
@dataclass
class ChunkingConfig:
"""
Configuration for text chunking process.
Controls how documents are split into smaller pieces for processing.
"""
chunk_size: int = 512 # Maximum size of each chunk in tokens/chars
chunk_overlap: int = 50 # Number of tokens/chars to overlap between chunks
separators: list = field(
default_factory=lambda: ["\n\n", "\n", ""]
) # Text separators in order of priority
enable_multimodal: bool = (
False # Whether to enable multimodal processing (text + images)
)
storage_config: dict = None # Preferred field name going forward
vlm_config: dict = None # VLM configuration for image captioning

View File

@@ -1,134 +1,88 @@
import asyncio
import logging
import re
import tempfile
import os
import subprocess
import shutil
from io import BytesIO
from typing import Optional, List, Tuple
import textract
from PIL import Image
import zipfile
import xml.etree.ElementTree as ET
from typing import List, Optional
from .base_parser import BaseParser
from .docx_parser import DocxParser, Docx
import textract
from docreader.models.document import Document
from docreader.parser.docx2_parser import Docx2Parser
from docreader.utils.tempfile import TempDirContext, TempFileContext
logger = logging.getLogger(__name__)
class DocParser(BaseParser):
class DocParser(Docx2Parser):
"""DOC document parser"""
def parse_into_text(self, content: bytes) -> str:
"""Parse DOC document
Args:
content: DOC document content
Returns:
Parse result
"""
def parse_into_text(self, content: bytes) -> Document:
logger.info(f"Parsing DOC document, content size: {len(content)} bytes")
handle_chain = [
# 1. Try to convert to docx format to extract images
self._parse_with_docx,
# 2. If image extraction is not needed or conversion failed,
# try using antiword to extract text
self._parse_with_antiword,
# 3. If antiword extraction fails, use textract
self._parse_with_textract,
]
# Save byte content as a temporary file
with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as temp_file:
temp_file_path = temp_file.name
temp_file.write(content)
temp_file.flush()
logger.info(f"Saved DOC content to temporary file: {temp_file_path}")
with TempFileContext(content, ".doc") as temp_file_path:
for handle in handle_chain:
try:
document = handle(temp_file_path)
if document:
return document
except Exception as e:
logger.warning(f"Failed to parse DOC with {handle.__name__} {e}")
try:
# First try to convert to docx format to extract images
if self.enable_multimodal:
logger.info("Multimodal enabled, attempting to extract images from DOC")
docx_content = self._convert_doc_to_docx(temp_file_path)
return Document(content="")
if docx_content:
logger.info("Successfully converted DOC to DOCX, using DocxParser")
# Use existing DocxParser to parse the converted docx
docx_parser = DocxParser(
file_name=self.file_name,
file_type="docx",
enable_multimodal=self.enable_multimodal,
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
chunking_config=self.chunking_config,
separators=self.separators,
)
text = docx_parser.parse_into_text(docx_content)
logger.info(f"Extracted {len(text)} characters using DocxParser")
def _parse_with_docx(self, temp_file_path: str) -> Document:
logger.info("Multimodal enabled, attempting to extract images from DOC")
# Clean up temporary file
os.unlink(temp_file_path)
logger.info(f"Deleted temporary file: {temp_file_path}")
docx_content = self._try_convert_doc_to_docx(temp_file_path)
if not docx_content:
raise RuntimeError("Failed to convert DOC to DOCX")
return text
else:
logger.warning(
"Failed to convert DOC to DOCX, falling back to text-only extraction"
)
logger.info("Successfully converted DOC to DOCX, using DocxParser")
# Use existing DocxParser to parse the converted docx
document = super(Docx2Parser, self).parse_into_text(docx_content)
logger.info(f"Extracted {len(document.content)} characters using DocxParser")
return document
# If image extraction is not needed or conversion failed, try using antiword to extract text
try:
logger.info("Attempting to parse DOC file with antiword")
# Check if antiword is installed
antiword_path = self._find_antiword_path()
def _parse_with_antiword(self, temp_file_path: str) -> Document:
logger.info("Attempting to parse DOC file with antiword")
if antiword_path:
# Use antiword to extract text directly
logger.info(f"Using antiword at {antiword_path} to extract text")
process = subprocess.Popen(
[antiword_path, temp_file_path],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout, stderr = process.communicate()
# Check if antiword is installed
antiword_path = self._try_find_antiword()
if not antiword_path:
raise RuntimeError("antiword not found in PATH")
if process.returncode == 0:
text = stdout.decode("utf-8", errors="ignore")
logger.info(
f"Successfully extracted {len(text)} characters using antiword"
)
# Clean up temporary file
os.unlink(temp_file_path)
logger.info(f"Deleted temporary file: {temp_file_path}")
return text
else:
logger.warning(
f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
)
else:
logger.warning("antiword not found, falling back to textract")
except Exception as e:
logger.warning(
f"Error using antiword: {str(e)}, falling back to textract"
)
# If antiword fails, try using textract
logger.info("Parsing DOC file with textract")
text = textract.process(temp_file_path, method="antiword").decode("utf-8")
logger.info(
f"Successfully extracted {len(text)} characters of text from DOC document using textract"
# Use antiword to extract text directly
process = subprocess.Popen(
[antiword_path, temp_file_path],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout, stderr = process.communicate()
if process.returncode != 0:
raise RuntimeError(
f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
)
text = stdout.decode("utf-8", errors="ignore")
logger.info(f"Successfully extracted {len(text)} characters using antiword")
return Document(content=text)
# Clean up temporary file
os.unlink(temp_file_path)
logger.info(f"Deleted temporary file: {temp_file_path}")
def _parse_with_textract(self, temp_file_path: str) -> Document:
logger.info(f"Parsing DOC file with textract: {temp_file_path}")
text = textract.process(temp_file_path, method="antiword").decode("utf-8")
logger.info(f"Successfully extracted {len(text)} bytes of DOC using textract")
return Document(content=str(text))
return text
except Exception as e:
logger.error(f"Error parsing DOC document: {str(e)}")
# Ensure temporary file is cleaned up
if os.path.exists(temp_file_path):
os.unlink(temp_file_path)
logger.info(f"Deleted temporary file after error: {temp_file_path}")
return ""
def _convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
def _try_convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
"""Convert DOC file to DOCX format
Uses LibreOffice/OpenOffice for conversion
@@ -141,21 +95,16 @@ class DocParser(BaseParser):
"""
logger.info(f"Converting DOC to DOCX: {doc_path}")
# Check if LibreOffice or OpenOffice is installed
soffice_path = self._try_find_soffice()
if not soffice_path:
return None
# Execute conversion command
logger.info(f"Using {soffice_path} to convert DOC to DOCX")
# Create a temporary directory to store the converted file
temp_dir = tempfile.mkdtemp()
docx_path = os.path.join(temp_dir, "converted.docx")
try:
# Check if LibreOffice or OpenOffice is installed
soffice_path = self._find_soffice_path()
if not soffice_path:
logger.error(
"LibreOffice/OpenOffice not found, cannot convert DOC to DOCX"
)
return None
# Execute conversion command
logger.info(f"Using {soffice_path} to convert DOC to DOCX")
with TempDirContext() as temp_dir:
cmd = [
soffice_path,
"--headless",
@@ -165,7 +114,6 @@ class DocParser(BaseParser):
temp_dir,
doc_path,
]
logger.info(f"Running command: {' '.join(cmd)}")
process = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
@@ -173,41 +121,68 @@ class DocParser(BaseParser):
stdout, stderr = process.communicate()
if process.returncode != 0:
logger.error(
f"Error converting DOC to DOCX: {stderr.decode('utf-8', errors='ignore')}"
logger.warning(
f"Error converting DOC to DOCX: {stderr.decode('utf-8')}"
)
return None
# Find the converted file
for file in os.listdir(temp_dir):
if file.endswith(".docx"):
converted_file = os.path.join(temp_dir, file)
logger.info(f"Found converted file: {converted_file}")
# Read the converted file content
with open(converted_file, "rb") as f:
docx_content = f.read()
docx_file = [
file for file in os.listdir(temp_dir) if file.endswith(".docx")
]
logger.info(f"Found {len(docx_file)} DOCX file(s) in temporary directory")
for file in docx_file:
converted_file = os.path.join(temp_dir, file)
logger.info(f"Found converted file: {converted_file}")
# Read the converted file content
with open(converted_file, "rb") as f:
docx_content = f.read()
logger.info(
f"Successfully read converted DOCX file, size: {len(docx_content)} bytes"
f"Successfully read DOCX file, size: {len(docx_content)}"
)
return docx_content
return None
logger.error("No DOCX file found after conversion")
return None
def _try_find_executable_path(
self,
executable_name: str,
possible_path: List[str] = [],
environment_variable: List[str] = [],
) -> Optional[str]:
"""Find executable path
Args:
executable_name: Executable name
possible_path: List of possible paths
environment_variable: List of environment variables to check
Returns:
Executable path, or None if not found
"""
# Common executable paths
paths: List[str] = []
paths.extend(possible_path)
paths.extend(os.environ.get(env_var, "") for env_var in environment_variable)
paths = list(set(paths))
except Exception as e:
logger.error(f"Error during DOC to DOCX conversion: {str(e)}")
return None
finally:
# Clean up temporary directory
try:
shutil.rmtree(temp_dir)
logger.info(f"Cleaned up temporary directory: {temp_dir}")
except Exception as e:
logger.warning(f"Failed to clean up temporary directory: {str(e)}")
# Check each candidate path (explicit paths plus values from environment variables)
for path in paths:
if os.path.exists(path):
logger.info(f"Found {executable_name} at {path}")
return path
def _find_soffice_path(self) -> Optional[str]:
# Try to find in PATH
result = subprocess.run(
["which", executable_name], capture_output=True, text=True
)
if result.returncode == 0 and result.stdout.strip():
path = result.stdout.strip()
logger.info(f"Found {executable_name} at {path}")
return path
logger.warning(f"Failed to find {executable_name}")
return None
def _try_find_soffice(self) -> Optional[str]:
"""Find LibreOffice/OpenOffice executable path
Returns:
@@ -225,32 +200,13 @@ class DocParser(BaseParser):
"C:\\Program Files\\LibreOffice\\program\\soffice.exe",
"C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe",
]
return self._try_find_executable_path(
executable_name="soffice",
possible_path=possible_paths,
environment_variable=["LIBREOFFICE_PATH"],
)
# Check if path is set in environment variable
if os.environ.get("LIBREOFFICE_PATH"):
possible_paths.insert(0, os.environ.get("LIBREOFFICE_PATH"))
for path in possible_paths:
if os.path.exists(path):
logger.info(f"Found LibreOffice/OpenOffice at: {path}")
return path
# Try to find in PATH
try:
result = subprocess.run(
["which", "soffice"], capture_output=True, text=True
)
if result.returncode == 0 and result.stdout.strip():
path = result.stdout.strip()
logger.info(f"Found LibreOffice/OpenOffice in PATH: {path}")
return path
except Exception:
pass
logger.warning("LibreOffice/OpenOffice not found")
return None
def _find_antiword_path(self) -> Optional[str]:
def _try_find_antiword(self) -> Optional[str]:
"""Find antiword executable path
Returns:
@@ -265,51 +221,27 @@ class DocParser(BaseParser):
"C:\\Program Files\\Antiword\\antiword.exe",
"C:\\Program Files (x86)\\Antiword\\antiword.exe",
]
# Check if path is set in environment variable
if os.environ.get("ANTIWORD_PATH"):
possible_paths.insert(0, os.environ.get("ANTIWORD_PATH"))
for path in possible_paths:
if os.path.exists(path):
logger.info(f"Found antiword at: {path}")
return path
# Try to find in PATH
try:
result = subprocess.run(
["which", "antiword"], capture_output=True, text=True
)
if result.returncode == 0 and result.stdout.strip():
path = result.stdout.strip()
logger.info(f"Found antiword in PATH: {path}")
return path
except Exception:
pass
logger.warning("antiword not found")
return None
return self._try_find_executable_path(
executable_name="antiword",
possible_path=possible_paths,
environment_variable=["ANTIWORD_PATH"],
)
if __name__ == "__main__":
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger.info("Running DocParser in standalone mode")
logging.basicConfig(level=logging.DEBUG)
file_name = "/path/to/your/test.doc"
logger.info(f"Processing file: {file_name}")
doc_parser = DocParser(
file_name, enable_multimodal=True, chunk_size=512, chunk_overlap=60
file_name=file_name,
enable_multimodal=True,
chunk_size=512,
chunk_overlap=60,
)
logger.info("Parser initialized, starting processing")
with open(file_name, "rb") as f:
content = f.read()
text = doc_parser.parse_into_text(content)
logger.info(f"Processing complete, extracted text length: {len(text)}")
logger.info(f"Sample text: {text[:200]}...")
document = doc_parser.parse_into_text(content)
logger.info(f"Processing complete, extracted text length: {len(document.content)}")
logger.info(f"Sample text: {document.content[:200]}...")

View File

@@ -0,0 +1,28 @@
import logging
from docreader.parser.chain_parser import FirstParser
from docreader.parser.docx_parser import DocxParser
from docreader.parser.markitdown_parser import MarkitdownParser
logger = logging.getLogger(__name__)
class Docx2Parser(FirstParser):
_parser_cls = (MarkitdownParser, DocxParser)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
your_file = "/path/to/your/file.docx"
parser = Docx2Parser(separators=[".", "?", "!", "。", "？", "！"])
with open(your_file, "rb") as f:
content = f.read()
document = parser.parse(content)
for cc in document.chunks:
logger.info(f"chunk: {cc}")
# document = parser.parse_into_text(content)
# logger.info(f"docx content: {document.content}")
# logger.info(f"find images {document.images.keys()}")

View File

@@ -1,37 +1,36 @@
import logging
import tempfile
import os
import sys
import time
from io import BytesIO
from typing import Optional, Dict, Any, Tuple, List, Union
from dataclasses import dataclass, field
from PIL import Image
from docx import Document
from docx.image.exceptions import (
UnrecognizedImageError,
UnexpectedEndOfFileError,
InvalidImageStreamError,
)
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
import re
import tempfile
import threading
import time
import traceback
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass, field
from io import BytesIO
from multiprocessing import Manager
import re
from typing import Any, Dict, List, Optional, Tuple
from .base_parser import BaseParser
from docx import Document
from docx.image.exceptions import (
InvalidImageStreamError,
UnexpectedEndOfFileError,
UnrecognizedImageError,
)
from PIL import Image
from docreader.models.document import Document as DocumentModel
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Add thread local storage to track the processing status of each thread
thread_local = threading.local()
class ImageData:
"""Represents a processed image of document content"""
local_path: str = ""
object: Image.Image = None
object: Optional[Image.Image] = None
url: str = ""
@@ -40,7 +39,9 @@ class LineData:
"""Represents a processed line of document content with associated images"""
text: str = "" # Extracted text content
images: List[ImageData] = field(default_factory=list) # List of images or image paths
images: List[ImageData] = field(
default_factory=list
) # List of images or image paths
extra_info: str = "" # Placeholder for additional info (currently unused)
page_num: int = 0 # Page number
content_sequence: List[Tuple[str, Any]] = field(
@@ -53,18 +54,8 @@ class DocxParser(BaseParser):
def __init__(
self,
file_name: str = "",
file_type: str = None,
enable_multimodal: bool = True,
chunk_size: int = 1000,
chunk_overlap: int = 200,
separators: list = ["\n\n", "\n", ""],
ocr_backend: str = "paddle",
ocr_config: dict = None,
max_image_size: int = 1920,
max_concurrent_tasks: int = 5,
max_pages: int = 100, # Maximum number of pages to process, default to 50 pages
chunking_config=None,
max_pages: int = 100, # Maximum number of pages to process
**kwargs,
):
"""Initialize DOCX document parser
@@ -79,37 +70,16 @@ class DocxParser(BaseParser):
ocr_config: OCR engine configuration
max_image_size: Maximum image size limit
max_concurrent_tasks: Maximum number of concurrent tasks
max_pages: Maximum number of pages to process, if more than this, only process the first max_pages pages
max_pages: Maximum number of pages to process
"""
super().__init__(
file_name=file_name,
file_type=file_type,
enable_multimodal=enable_multimodal,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=separators,
ocr_backend=ocr_backend,
ocr_config=ocr_config,
max_image_size=max_image_size,
max_concurrent_tasks=max_concurrent_tasks,
chunking_config=chunking_config,
)
super().__init__(**kwargs)
self.max_pages = max_pages
logger.info(f"DocxParser initialized with max_pages={max_pages}")
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""Parse DOCX document, extract text content and image Markdown links
Args:
content: DOCX document content
Returns:
Tuple of (parsed_text, image_map) where image_map maps image URLs to Image objects
All LineData objects are used internally but not returned directly through this interface
"""
def parse_into_text(self, content: bytes) -> DocumentModel:
"""Parse DOCX document, extract text content and image Markdown links"""
logger.info(f"Parsing DOCX document, content size: {len(content)} bytes")
logger.info(f"Max pages limit set to: {self.max_pages}")
logger.info("Converting DOCX content to sections and tables")
start_time = time.time()
# Use concurrent processing to handle the document
@@ -123,7 +93,7 @@ class DocxParser(BaseParser):
docx_processor = Docx(
max_image_size=self.max_image_size,
enable_multimodal=self.enable_multimodal,
upload_file=self.upload_file,
upload_file=self.storage.upload_file,
)
all_lines, tables = docx_processor(
binary=content,
@@ -140,7 +110,7 @@ class DocxParser(BaseParser):
section_start_time = time.time()
text_parts = []
image_parts = {}
image_parts: Dict[str, str] = {}
for sec_idx, line in enumerate(all_lines):
try:
@@ -148,16 +118,19 @@ class DocxParser(BaseParser):
text_parts.append(line.text)
if sec_idx < 3 or sec_idx % 50 == 0:
logger.info(
f"Added section {sec_idx+1} text: {line.text[:50]}..."
f"Added section {sec_idx + 1} text: {line.text[:50]}..."
if len(line.text) > 50
else f"Added section {sec_idx+1} text: {line.text}"
else f"Added section {sec_idx + 1} text: {line.text}"
)
if line.images:
for image_data in line.images:
if image_data.url:
image_parts[image_data.url] = image_data.object
if image_data.url and image_data.object:
image_parts[image_data.url] = endecode.decode_image(
image_data.object
)
image_data.object.close()
except Exception as e:
logger.error(f"Error processing section {sec_idx+1}: {str(e)}")
logger.error(f"Error processing section {sec_idx + 1}: {str(e)}")
logger.error(f"Detailed stack trace: {traceback.format_exc()}")
continue
@@ -176,17 +149,17 @@ class DocxParser(BaseParser):
total_processing_time = time.time() - start_time
logger.info(
f"Parsing complete in {total_processing_time:.2f}s, generated {len(text)} characters of text "
f"Parsing complete in {total_processing_time:.2f}s, "
f"generated {len(text)} characters of text"
)
return text, image_parts
return DocumentModel(content=text, images=image_parts)
except Exception as e:
logger.error(f"Error parsing DOCX document: {str(e)}")
logger.error(f"Detailed stack trace: {traceback.format_exc()}")
fallback_text = self._parse_using_simple_method(content)
return fallback_text, {}
return self._parse_using_simple_method(content)
def _parse_using_simple_method(self, content: bytes) -> str:
def _parse_using_simple_method(self, content: bytes) -> DocumentModel:
"""Parse document using a simplified method, as a fallback
Args:
@@ -201,7 +174,8 @@ class DocxParser(BaseParser):
doc = Document(BytesIO(content))
logger.info(
f"Successfully loaded document in simplified method, "
f"contains {len(doc.paragraphs)} paragraphs and {len(doc.tables)} tables"
f"contains {len(doc.paragraphs)} paragraphs "
f"and {len(doc.tables)} tables"
)
text_parts = []
@@ -211,7 +185,7 @@ class DocxParser(BaseParser):
para_with_text = 0
for i, para in enumerate(doc.paragraphs):
if i % 100 == 0:
logger.info(f"Processing paragraph {i+1}/{para_count}")
logger.info(f"Processing paragraph {i + 1}/{para_count}")
if para.text.strip():
text_parts.append(para.text.strip())
para_with_text += 1
@@ -225,7 +199,7 @@ class DocxParser(BaseParser):
rows_processed = 0
for i, table in enumerate(doc.tables):
if i % 10 == 0:
logger.info(f"Processing table {i+1}/{table_count}")
logger.info(f"Processing table {i + 1}/{table_count}")
table_has_content = False
for row in table.rows:
@@ -256,25 +230,24 @@ class DocxParser(BaseParser):
# If the result is still empty, return an error message
if not result_text:
logger.warning("No text extracted using simplified method")
return "", {}
return DocumentModel()
return result_text, {}
return DocumentModel(content=result_text)
except Exception as backup_error:
processing_time = time.time() - start_time
logger.error(
f"Simplified parsing failed after {processing_time:.2f}s: {str(backup_error)}"
f"Simplified parsing failed {processing_time:.2f}s: {backup_error}"
)
logger.error(f"Detailed traceback: {traceback.format_exc()}")
return "", {}
return DocumentModel()
class Docx:
def __init__(self, max_image_size=1920, enable_multimodal=False, upload_file=None):
logger.info("Initializing DOCX processor")
self.max_image_size = max_image_size # Maximum image size limit
self.picture_cache = (
{}
) # Image cache to avoid processing the same image repeatedly
# Image cache to avoid processing the same image repeatedly
self.picture_cache = {}
self.enable_multimodal = enable_multimodal
self.upload_file = upload_file
@@ -454,7 +427,6 @@ class Docx:
return page_to_paragraphs
def __call__(
self,
binary: Optional[bytes] = None,
@@ -611,7 +583,6 @@ class Docx:
return pages_to_process
def _process_document(
self,
binary,
@@ -806,7 +777,9 @@ class Docx:
# Collect temporary image paths for later cleanup
for line in page_lines:
for image_data in line.images:
if image_data.local_path and image_data.local_path.startswith("/tmp/docx_img_"):
if image_data.local_path and image_data.local_path.startswith(
"/tmp/docx_img_"
):
temp_img_paths.add(image_data.local_path)
results.extend(page_lines)
@@ -876,7 +849,11 @@ class Docx:
# Process all image data objects
for image_data in image_paths:
if image_data.local_path and os.path.exists(image_data.local_path) and image_data.local_path not in image_url_map:
if (
image_data.local_path
and os.path.exists(image_data.local_path)
and image_data.local_path not in image_url_map
):
try:
# Upload the image if it doesn't have a URL yet
if not image_data.url:
@@ -886,12 +863,16 @@ class Docx:
image_data.url = image_url
# Add image URL as Markdown format
markdown_image = f"![]({image_url})"
image_url_map[image_data.local_path] = markdown_image
image_url_map[image_data.local_path] = (
markdown_image
)
logger.info(
f"Added image URL for {image_data.local_path}: {image_url}"
)
else:
logger.warning(f"Failed to upload image: {image_data.local_path}")
logger.warning(
f"Failed to upload image: {image_data.local_path}"
)
else:
# Already has a URL, use it
markdown_image = f"![]({image_data.url})"
@@ -925,12 +906,19 @@ class Docx:
# For ImageData objects, use the URL
if isinstance(content, str) and content in image_url_map:
combined_parts.append(image_url_map[content])
elif hasattr(content, 'local_path') and content.local_path in image_url_map:
elif (
hasattr(content, "local_path")
and content.local_path in image_url_map
):
combined_parts.append(image_url_map[content.local_path])
# Create the final text with proper ordering
final_text = "\n\n".join(part for part in combined_parts if part)
processed_lines.append(LineData(text=final_text, page_num=page_num, images=line_data.images))
processed_lines.append(
LineData(
text=final_text, page_num=page_num, images=line_data.images
)
)
else:
processed_lines = lines
@@ -1003,11 +991,11 @@ class Docx:
logger.info(f"Processing {table_count} tables")
for tb_idx, tb in enumerate(self.doc.tables):
if tb_idx % 10 == 0: # Log only every 10 tables to reduce log volume
logger.info(f"Processing table {tb_idx+1}/{table_count}")
logger.info(f"Processing table {tb_idx + 1}/{table_count}")
# Optimize: Check if table is empty
if len(tb.rows) == 0 or all(len(r.cells) == 0 for r in tb.rows):
logger.info(f"Skipping empty table {tb_idx+1}")
logger.info(f"Skipping empty table {tb_idx + 1}")
continue
table_html = self._convert_table_to_html(tb)
@@ -1111,8 +1099,8 @@ def _save_image_to_temp(logger, image, page_num, img_idx):
if not image:
return None
import tempfile
import os
import tempfile
try:
# Create a temporary file
@@ -1187,8 +1175,15 @@ def process_page_multiprocess(
return []
# Extract page content
combined_text, image_objects, content_sequence = _extract_page_content_in_process(
process_logger, doc, page_num, paragraphs, enable_multimodal, max_image_size
combined_text, image_objects, content_sequence = (
_extract_page_content_in_process(
process_logger,
doc,
page_num,
paragraphs,
enable_multimodal,
max_image_size,
)
)
# Process content sequence to maintain order between processes
@@ -1199,7 +1194,9 @@ def process_page_multiprocess(
if enable_multimodal:
# First pass: save all images to temporary files
for i, image_object in enumerate(image_objects):
img_path = _save_image_to_temp(process_logger, image_object, page_num, i)
img_path = _save_image_to_temp(
process_logger, image_object, page_num, i
)
if img_path:
# Create ImageData object
image_data = ImageData()

View File

@@ -1,15 +1,13 @@
import base64
import logging
import os
import asyncio
from PIL import Image
import io
from typing import Dict, Any, Tuple, Union
from .base_parser import BaseParser, ParseResult
import numpy as np
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
# Set up logger for this module
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class ImageParser(BaseParser):
"""
@@ -23,46 +21,24 @@ class ImageParser(BaseParser):
4. Returning a combined result with both text and image reference
"""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
def parse_into_text(self, content: bytes) -> Document:
"""
Parse image content, upload the image and return Markdown reference along with image map.
Args:
content: Raw image data (bytes)
Returns:
Tuple of (markdown_text, image_map) where image_map maps image URLs to PIL Image objects
Parse image content into markdown text
:param content: bytes content of the image
:return: Document object
"""
logger.info(f"Parsing image content, size: {len(content)} bytes")
image_map = {}
try:
# Upload image to storage service
logger.info("Uploading image to storage")
_, ext = os.path.splitext(self.file_name)
image_url = self.upload_bytes(content, file_ext=ext)
if not image_url:
logger.error("Failed to upload image to storage")
return "", {}
logger.info(
f"Successfully uploaded image, URL: {image_url[:50]}..."
if len(image_url) > 50
else f"Successfully uploaded image, URL: {image_url}"
)
# Create image object and add to map
try:
from PIL import Image
import io
image = Image.open(io.BytesIO(content))
image_map[image_url] = image
logger.info(f"Added image to image_map for URL: {image_url}")
except Exception as img_err:
logger.error(f"Error creating image object: {str(img_err)}")
# Get file extension
ext = os.path.splitext(self.file_name)[1].lower()
markdown_text = f"![{self.file_name}]({image_url})"
return markdown_text, image_map
# Upload image to storage
image_url = self.storage.upload_bytes(content, file_ext=ext)
logger.info(f"Successfully uploaded image, URL: {image_url[:50]}...")
except Exception as e:
logger.error(f"Error parsing image: {str(e)}")
return "", {}
# Generate markdown text
text = f"![{self.file_name}]({image_url})"
images = {image_url: base64.b64encode(content).decode()}
# Create image object and add to map
return Document(content=text, images=images)

View File

@@ -1,43 +0,0 @@
import base64
import io
import logging
from typing import Union
from PIL import Image
import numpy as np
logger = logging.getLogger(__name__)
def image_to_base64(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
"""Convert image to base64 encoded string
Args:
image: Image file path, bytes, PIL Image object, or numpy array
Returns:
Base64 encoded image string, or empty string if conversion fails
"""
try:
if isinstance(image, str):
# It's a file path
with open(image, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
elif isinstance(image, bytes):
# It's bytes data
return base64.b64encode(image).decode("utf-8")
elif isinstance(image, Image.Image):
# It's a PIL Image
buffer = io.BytesIO()
image.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("utf-8")
elif isinstance(image, np.ndarray):
# It's a numpy array
pil_image = Image.fromarray(image)
buffer = io.BytesIO()
pil_image.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("utf-8")
else:
logger.error(f"Unsupported image type: {type(image)}")
return ""
except Exception as e:
logger.error(f"Error converting image to base64: {str(e)}")
return ""

View File

@@ -0,0 +1,111 @@
import logging
import re
import uuid
from typing import Dict, List, Match, Optional, Tuple
from docreader.utils import endecode
# Get logger object
logger = logging.getLogger(__name__)
class MarkdownImageUtil:
def __init__(self):
self.b64_pattern = re.compile(
r"!\[([^\]]*)\]\(data:image/(\w+);base64,([^\)]+)\)"
)
self.image_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
self.replace_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
def extract_image(
self,
content: str,
path_prefix: Optional[str] = None,
replace: bool = True,
) -> Tuple[str, List[str]]:
"""Extract base64 encoded images from Markdown content"""
# image_path => base64 bytes
images: List[str] = []
def repl(match: Match[str]) -> str:
title = match.group(1)
image_path = match.group(2)
if path_prefix:
image_path = f"{path_prefix}/{image_path}"
images.append(image_path)
if not replace:
return match.group(0)
# Rewrite the image reference with the prefixed path
return f"![{title}]({image_path})"
text = self.image_pattern.sub(repl, content)
logger.debug(f"Extracted {len(images)} images from markdown")
return text, images
def extract_base64(
self,
content: str,
path_prefix: Optional[str] = None,
replace: bool = True,
) -> Tuple[str, Dict[str, bytes]]:
"""Extract base64 encoded images from Markdown content"""
# image_path => base64 bytes
images: Dict[str, bytes] = {}
def repl(match: Match[str]) -> str:
title = match.group(1)
img_ext = match.group(2)
img_b64 = match.group(3)
image_byte = endecode.encode_image(img_b64, errors="ignore")
if not image_byte:
logger.error(f"Failed to decode base64 image skip it: {img_b64}")
return title
image_path = f"{uuid.uuid4()}.{img_ext}"
if path_prefix:
image_path = f"{path_prefix}/{image_path}"
images[image_path] = image_byte
if not replace:
return match.group(0)
# Replace image path with URL
return f"![{title}]({image_path})"
text = self.b64_pattern.sub(repl, content)
logger.debug(f"Extracted {len(images)} base64 images from markdown")
return text, images
def replace_path(self, content: str, images: Dict[str, str]) -> str:
content_replace: set = set()
def repl(match: Match[str]) -> str:
title = match.group(1)
image_path = match.group(2)
if image_path not in images:
return match.group(0)
content_replace.add(image_path)
image_path = images[image_path]
return f"![{title}]({image_path})"
text = self.replace_pattern.sub(repl, content)
logger.debug(f"Replaced {len(content_replace)} images in markdown")
return text
if __name__ == "__main__":
your_content = "test![](data:image/png;base64,iVBORw0KGgoAAAA)test"
image_handle = MarkdownImageUtil()
text, images = image_handle.extract_base64(your_content)
print(text)
for image_url, image_byte in images.items():
with open(image_url, "wb") as f:
f.write(image_byte)

View File

@@ -1,33 +1,53 @@
import asyncio
import re
import base64
import logging
import numpy as np
import os # Import os module to get environment variables
from typing import Dict, List, Optional, Tuple, Union, Any
from .base_parser import BaseParser
import os
from typing import Dict
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_image_util import MarkdownImageUtil
from docreader.utils import endecode
# Get logger object
logger = logging.getLogger(__name__)
class MarkdownParser(BaseParser):
"""Markdown document parser"""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""Parse Markdown document, only extract text content, do not process images
Args:
content: Markdown document content
Returns:
Parsed text result
"""
logger.info(f"Parsing Markdown document, content size: {len(content)} bytes")
class MarkdownImageBase64(BaseParser):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.image_helper = MarkdownImageUtil()
def parse_into_text(self, content: bytes) -> Document:
# Convert byte content to string using universal decoding method
text = self.decode_bytes(content)
logger.info(f"Decoded Markdown content, text length: {len(text)} characters")
text = endecode.decode_bytes(content)
text, img_b64 = self.image_helper.extract_base64(text, path_prefix="images")
logger.info(f"Markdown parsing complete, extracted {len(text)} characters of text")
return text
images: Dict[str, str] = {}
image_replace: Dict[str, str] = {}
logger.debug(f"Uploading {len(img_b64)} images from markdown")
for ipath, b64_bytes in img_b64.items():
ext = os.path.splitext(ipath)[1].lower()
image_url = self.storage.upload_bytes(b64_bytes, ext)
image_replace[ipath] = image_url
images[image_url] = base64.b64encode(b64_bytes).decode()
text = self.image_helper.replace_path(text, image_replace)
return Document(content=text, images=images)
class MarkdownParser(PipelineParser):
_parser_cls = (MarkdownImageBase64,)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
your_content = "test![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMgA)test"
parser = MarkdownParser()
document = parser.parse_into_text(your_content.encode())
logger.info(document.content)
logger.info(f"Images: {len(document.images)}, name: {document.images.keys()}")

View File

@@ -0,0 +1,31 @@
import io
import logging
from markitdown import MarkItDown
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownParser
logger = logging.getLogger(__name__)
class StdMarkitdownParser(BaseParser):
"""
MarkItDown-based document parser
This parser converts supported document formats to Markdown text.
It uses the markitdown library and keeps image data URIs for downstream processing.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.markitdown = MarkItDown()
def parse_into_text(self, content: bytes) -> Document:
result = self.markitdown.convert(io.BytesIO(content), keep_data_uris=True)
return Document(content=result.text_content)
class MarkitdownParser(PipelineParser):
_parser_cls = (StdMarkitdownParser, MarkdownParser)
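PipelineParser is defined in the same chain_parser.py module that is not shown in this diff; the composition suggests the stages run in sequence: StdMarkitdownParser converts the source to Markdown while keeping images as data URIs (keep_data_uris=True), and MarkdownParser then extracts those base64 images, uploads them through storage, and rewrites the links. A hedged usage sketch (file name and multimodal flag are illustrative):

# Hypothetical usage of the composed parser.
parser = MarkitdownParser(file_name="report.docx", enable_multimodal=True)
with open("report.docx", "rb") as f:
    document = parser.parse_into_text(f.read())
print(document.content[:200])        # Markdown with uploaded image URLs
print(list(document.images.keys()))  # image URL -> base64 payload map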

View File

@@ -0,0 +1,124 @@
import logging
import os
import re
from typing import Dict
import markdownify
import requests
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.markdown_parser import MarkdownImageUtil
from docreader.utils import endecode
logger = logging.getLogger(__name__)
class MinerUParser(BaseParser):
def __init__(
self,
enable_markdownify: bool = True,
mineru_endpoint: str = "",
**kwargs,
):
super().__init__(**kwargs)
self.minerU = os.getenv("MINERU_ENDPOINT", mineru_endpoint)
self.enable_markdownify = enable_markdownify
self.image_helper = MarkdownImageUtil()
self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
self.enable = self.ping()
assert self.enable, "MinerU API is not reachable"
def ping(self, timeout: int = 5) -> bool:
try:
response = requests.get(
self.minerU + "/docs", timeout=timeout, allow_redirects=True
)
response.raise_for_status()
return True
except Exception:
return False
def parse_into_text(self, content: bytes) -> Document:
logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)")
md_content: str = ""
images_b64: Dict[str, str] = {}
try:
response = requests.post(
url=self.minerU + "/file_parse",
data={
"return_md": True,
"return_images": True,
"lang_list": ["ch", "en"],
"table_enable": True,
"formula_enable": True,
"parse_method": "auto",
"start_page_id": 0,
"end_page_id": 99999,
"backend": "pipeline",
"response_format_zip": False,
"return_middle_json": False,
"return_model_output": False,
"return_content_list": False,
},
files={"files": content},
timeout=1000,
)
response.raise_for_status()
result = response.json()["results"]["files"]
md_content = result["md_content"]
images_b64 = result.get("images", {})
except Exception as e:
logger.error(f"MinerU parsing failed: {e}", exc_info=True)
return Document()
# convert table(HTML) in markdown to markdown table
if self.enable_markdownify:
logger.debug("Converting HTML to Markdown")
md_content = markdownify.markdownify(md_content)
images = {}
image_replace = {}
# image in images_bs64 may not be used in md_content
# such as: table ...
# so we need to filter them
for ipath, b64_str in images_b64.items():
if f"images/{ipath}" not in md_content:
logger.debug(f"Image {ipath} not used in markdown")
continue
# Derive a default extension so file_ext is always defined before upload
file_ext = os.path.splitext(ipath)[1].lstrip(".") or "png"
match = self.base64_pattern.match(b64_str)
if match:
file_ext = match.group(1)
b64_str = match.group(2)
image_bytes = endecode.encode_image(b64_str, errors="ignore")
if not image_bytes:
logger.error("Failed to decode base64 image skip it")
continue
image_url = self.storage.upload_bytes(
image_bytes, file_ext=f".{file_ext}"
)
images[image_url] = b64_str
image_replace[f"images/{ipath}"] = image_url
logger.info(f"Replaced {len(image_replace)} images in markdown")
text = self.image_helper.replace_path(md_content, image_replace)
logger.info(
f"Successfully parsed PDF, text: {len(text)}, images: {len(images)}"
)
return Document(content=text, images=images)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
your_file = "/path/to/your/file.pdf"
your_mineru = "http://host.docker.internal:9987"
parser = MinerUParser(mineru_endpoint=your_mineru)
with open(your_file, "rb") as f:
content = f.read()
document = parser.parse_into_text(content)
logger.error(document.content)
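The parsing logic above reads only results.files.md_content and results.files.images from the /file_parse response. A stub of that shape can be useful for testing without a MinerU deployment (field names are taken from the code above; everything else about the real MinerU API is an assumption):

# Hypothetical stub consistent with the fields consumed in parse_into_text.
stub_response = {
    "results": {
        "files": {
            "md_content": "# Title\n\n![](images/0001.png)",
            "images": {"0001.png": "data:image/png;base64,iVBORw0KGgo..."},
        }
    }
}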

View File

@@ -1,71 +1,96 @@
import os
import logging
import base64
from typing import Optional, Union, Dict, Any
from abc import ABC, abstractmethod
from PIL import Image
import io
import logging
import os
import platform
import subprocess
from abc import ABC, abstractmethod
from typing import Dict, Union
import numpy as np
from .image_utils import image_to_base64
from openai import OpenAI
from PIL import Image
from docreader.utils import endecode
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class OCRBackend(ABC):
"""Base class for OCR backends"""
@abstractmethod
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image
Args:
image: Image file path, bytes, or PIL Image object
Returns:
Extracted text
"""
pass
class DummyOCRBackend(OCRBackend):
"""Dummy OCR backend implementation"""
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
logger.warning("Dummy OCR backend is used")
return ""
class PaddleOCRBackend(OCRBackend):
"""PaddleOCR backend implementation"""
def __init__(self, **kwargs):
def __init__(self):
"""Initialize PaddleOCR backend"""
self.ocr = None
try:
import os
import paddle
# Set PaddlePaddle to use CPU and disable GPU
os.environ['CUDA_VISIBLE_DEVICES'] = ''
paddle.set_device('cpu')
os.environ["CUDA_VISIBLE_DEVICES"] = ""
paddle.device.set_device("cpu")
# Try to detect whether the CPU supports the AVX instruction set
try:
import subprocess
import platform
# Detect whether the CPU supports AVX
if platform.system() == "Linux":
try:
result = subprocess.run(['grep', '-o', 'avx', '/proc/cpuinfo'],
capture_output=True, text=True, timeout=5)
has_avx = 'avx' in result.stdout.lower()
result = subprocess.run(
["grep", "-o", "avx", "/proc/cpuinfo"],
capture_output=True,
text=True,
timeout=5,
)
has_avx = "avx" in result.stdout.lower()
if not has_avx:
logger.warning("CPU does not support AVX instructions, using compatibility mode")
logger.warning(
"CPU does not support AVX instructions, "
"using compatibility mode"
)
# Further restrict the instruction sets used
os.environ['FLAGS_use_avx2'] = '0'
os.environ['FLAGS_use_avx'] = '1'
except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError):
logger.warning("Could not detect AVX support, using compatibility mode")
os.environ['FLAGS_use_avx2'] = '0'
os.environ['FLAGS_use_avx'] = '1'
os.environ["FLAGS_use_avx2"] = "0"
os.environ["FLAGS_use_avx"] = "1"
except (
subprocess.TimeoutExpired,
FileNotFoundError,
subprocess.SubprocessError,
):
logger.warning(
"Could not detect AVX support, using compatibility mode"
)
os.environ["FLAGS_use_avx2"] = "0"
os.environ["FLAGS_use_avx"] = "1"
except Exception as e:
logger.warning(f"Error detecting CPU capabilities: {e}, using compatibility mode")
os.environ['FLAGS_use_avx2'] = '0'
os.environ['FLAGS_use_avx'] = '1'
logger.warning(
f"Error detecting CPU capabilities: {e}, using compatibility mode"
)
os.environ["FLAGS_use_avx2"] = "0"
os.environ["FLAGS_use_avx"] = "1"
from paddleocr import PaddleOCR
# OCR configuration with text orientation classification enabled
ocr_config = {
"use_gpu": False,
@@ -86,23 +111,53 @@ class PaddleOCRBackend(OCRBackend):
"use_dilation": True, # improves accuracy
"det_db_score_mode": "slow", # improves accuracy
}
self.ocr = PaddleOCR(**ocr_config)
logger.info("PaddleOCR engine initialized successfully")
except ImportError as e:
logger.error(f"Failed to import paddleocr: {str(e)}. Please install it with 'pip install paddleocr'")
logger.error(
f"Failed to import paddleocr: {str(e)}. "
"Please install it with 'pip install paddleocr'"
)
except OSError as e:
if "Illegal instruction" in str(e) or "core dumped" in str(e):
logger.error(f"PaddlePaddle crashed due to CPU instruction set incompatibility: {str(e)}")
logger.error("This usually happens when the CPU doesn't support AVX instructions.")
logger.error("Please try installing a CPU-only version of PaddlePaddle or use a different OCR backend.")
logger.error(
f"PaddlePaddle crashed due to CPU instruction set incompatibility:"
f"{e}"
)
logger.error(
"This happens when the CPU doesn't support AVX instructions. "
"Try install CPU-only version of PaddlePaddle, "
"or use a different OCR backend."
)
else:
logger.error(f"Failed to initialize PaddleOCR due to OS error: {str(e)}")
logger.error(
f"Failed to initialize PaddleOCR due to OS error: {str(e)}"
)
except Exception as e:
logger.error(f"Failed to initialize PaddleOCR: {str(e)}")
def predict(self, image):
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image
Args:
image: Image file path, bytes, or PIL Image object
Returns:
Extracted text
"""
if isinstance(image, str):
image = Image.open(image)
elif isinstance(image, bytes):
image = Image.open(io.BytesIO(image))
if not isinstance(image, Image.Image):
raise TypeError("image must be a string, bytes, or PIL Image object")
return self._predict(image)
def _predict(self, image: Image.Image) -> str:
"""Perform OCR recognition on the image
Args:
@@ -111,63 +166,59 @@ class PaddleOCRBackend(OCRBackend):
Returns:
Extracted text string
"""
if self.ocr is None:
logger.error("PaddleOCR engine not initialized")
return ""
try:
# Ensure image is in RGB format
if hasattr(image, "convert") and image.mode != "RGB":
if image.mode != "RGB":
image = image.convert("RGB")
# Convert to numpy array if needed
if hasattr(image, "convert"):
image_array = np.array(image)
else:
image_array = image
image_array = np.array(image)
# Perform OCR
ocr_result = self.ocr.ocr(image_array, cls=False)
# Extract text
ocr_text = ""
if ocr_result and ocr_result[0]:
for line in ocr_result[0]:
if line and len(line) >= 2:
text = line[1][0] if line[1] else ""
if text:
ocr_text += text + " "
text_length = len(ocr_text.strip())
if text_length > 0:
logger.info(f"OCR extracted {text_length} characters")
return ocr_text.strip()
else:
logger.warning("OCR returned empty result")
return ""
text = [
line[1][0] if line and len(line) >= 2 and line[1] else ""
for line in ocr_result[0]
]
text = [t.strip() for t in text if t]
ocr_text = " ".join(text)
logger.info(f"OCR extracted {len(ocr_text)} characters")
return ocr_text
except Exception as e:
logger.error(f"OCR recognition error: {str(e)}")
return ""
class NanonetsOCRBackend(OCRBackend):
"""Nanonets OCR backend implementation using OpenAI API format"""
def __init__(self, **kwargs):
def __init__(self):
"""Initialize Nanonets OCR backend
Args:
api_key: API key for OpenAI API
base_url: Base URL for OpenAI API
model: Model name
"""
try:
from openai import OpenAI
self.api_key = kwargs.get("api_key", "123")
self.base_url = kwargs.get("base_url", "http://localhost:8000/v1")
self.model = kwargs.get("model", "nanonets/Nanonets-OCR-s")
self.temperature = kwargs.get("temperature", 0.0)
self.max_tokens = kwargs.get("max_tokens", 15000)
self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
self.prompt = """
## 任务说明
base_url = os.getenv("OCR_API_BASE_URL", "http://localhost:8000/v1")
api_key = os.getenv("OCR_API_KEY", "123")
timeout = 30
self.client = OpenAI(api_key=api_key, base_url=base_url, timeout=timeout)
self.model = os.getenv("OCR_MODEL", "nanonets/Nanonets-OCR-s")
logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
self.temperature = 0.0
self.max_tokens = 15000
self.prompt = """## 任务说明
请从上传的文档中提取文字内容,严格按自然阅读顺序(从上到下,从左到右)输出,并遵循以下格式规范。
@@ -192,33 +243,26 @@ class NanonetsOCRBackend(OCRBackend):
* 不要猜测或补全不确定的链接地址。
"""
logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
except ImportError:
logger.error("Failed to import openai. Please install it with 'pip install openai'")
self.client = None
except Exception as e:
logger.error(f"Failed to initialize Nanonets OCR: {str(e)}")
self.client = None
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image using Nanonets OCR
Args:
image: Image file path, bytes, or PIL Image object
Returns:
Extracted text
"""
if self.client is None:
logger.error("Nanonets OCR client not initialized")
return ""
try:
# Encode image to base64
img_base64 = image_to_base64(image)
img_base64 = endecode.decode_image(image)
if not img_base64:
return ""
# Call Nanonets OCR API
logger.info(f"Calling Nanonets OCR API with model: {self.model}")
response = self.client.chat.completions.create(
@@ -229,7 +273,9 @@ class NanonetsOCRBackend(OCRBackend):
"content": [
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{img_base64}"},
"image_url": {
"url": f"data:image/png;base64,{img_base64}"
},
},
{
"type": "text",
@@ -239,40 +285,43 @@ class NanonetsOCRBackend(OCRBackend):
}
],
temperature=self.temperature,
max_tokens=self.max_tokens
max_tokens=self.max_tokens,
)
return response.choices[0].message.content
return response.choices[0].message.content or ""
except Exception as e:
logger.error(f"Nanonets OCR prediction error: {str(e)}")
return ""
class OCREngine:
"""OCR Engine factory class"""
_instance = None
_instance: Dict[str, OCRBackend] = {}
@classmethod
def get_instance(cls, backend_type="paddle", **kwargs) -> Optional[OCRBackend]:
def get_instance(cls, backend_type: str) -> OCRBackend:
"""Get OCR engine instance
Args:
backend_type: OCR backend type, one of: "paddle", "nanonets"
Returns:
Cached OCR engine instance; unknown backend types fall back to DummyOCRBackend
"""
if cls._instance is None:
logger.info(f"Initializing OCR engine with backend: {backend_type}")
if backend_type.lower() == "paddle":
cls._instance = PaddleOCRBackend(**kwargs)
elif backend_type.lower() == "nanonets":
cls._instance = NanonetsOCRBackend(**kwargs)
else:
logger.error(f"Unknown OCR backend type: {backend_type}")
return None
return cls._instance
backend_type = backend_type.lower()
if cls._instance.get(backend_type):
return cls._instance[backend_type]
logger.info(f"Initializing OCR engine with backend: {backend_type}")
if backend_type == "paddle":
cls._instance[backend_type] = PaddleOCRBackend()
elif backend_type == "nanonets":
cls._instance[backend_type] = NanonetsOCRBackend()
else:
cls._instance[backend_type] = DummyOCRBackend()
return cls._instance[backend_type]
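Backends are now cached per backend_type, so repeated lookups reuse one engine instance. A short usage sketch (the image path is a placeholder; which backend actually works depends on the installed dependencies and the OCR_API_* environment variables above):

# Hypothetical usage of the factory above.
engine = OCREngine.get_instance("paddle")   # unknown types fall back to DummyOCRBackend
with open("/path/to/scan.png", "rb") as f:
    text = engine.predict(f.read())         # accepts a file path, bytes, or a PIL Image
print(text)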

View File

@@ -1,30 +1,19 @@
import logging
from dataclasses import dataclass, field
from typing import Dict, Any, Optional, Type
from typing import Dict, Type
from .base_parser import BaseParser, ParseResult
from .docx_parser import DocxParser
from .doc_parser import DocParser
from .pdf_parser import PDFParser
from .markdown_parser import MarkdownParser
from .text_parser import TextParser
from .image_parser import ImageParser
from .web_parser import WebParser
from .config import ChunkingConfig
import traceback
from docreader.models.document import Document
from docreader.models.read_config import ChunkingConfig
from docreader.parser.base_parser import BaseParser
from docreader.parser.doc_parser import DocParser
from docreader.parser.docx2_parser import Docx2Parser
from docreader.parser.image_parser import ImageParser
from docreader.parser.markdown_parser import MarkdownParser
from docreader.parser.pdf_parser import PDFParser
from docreader.parser.text_parser import TextParser
from docreader.parser.web_parser import WebParser
logger = logging.getLogger(__name__)
@dataclass
class Chunk:
"""
Represents a single text chunk with associated metadata.
Basic unit for document processing and embedding.
"""
content: str # Text content of the chunk
metadata: Dict[str, Any] = None # Associated metadata (source, page number, etc.)
class Parser:
"""
@@ -33,10 +22,9 @@ class Parser:
"""
def __init__(self):
logger.info("Initializing document parser")
# Initialize all parser types
self.parsers: Dict[str, Type[BaseParser]] = {
"docx": DocxParser,
"docx": Docx2Parser,
"doc": DocParser,
"pdf": PDFParser,
"md": MarkdownParser,
@@ -56,8 +44,7 @@ class Parser:
", ".join(self.parsers.keys()),
)
def get_parser(self, file_type: str) -> Optional[Type[BaseParser]]:
def get_parser(self, file_type: str) -> Type[BaseParser]:
"""
Get parser class for the specified file type.
@@ -67,12 +54,9 @@ class Parser:
Returns:
Parser class for the file type, or None if unsupported
"""
file_type = file_type.lower()
parser = self.parsers.get(file_type)
if parser:
logger.info(f"Found parser for file type: {file_type}")
else:
logger.warning(f"No parser found for file type: {file_type}")
parser = self.parsers.get(file_type.lower())
if not parser:
raise ValueError(f"Unsupported file type: {file_type}")
return parser
def parse_file(
@@ -81,7 +65,7 @@ class Parser:
file_type: str,
content: bytes,
config: ChunkingConfig,
) -> Optional[ParseResult]:
) -> Document:
"""
Parse file content using appropriate parser based on file type.
@@ -96,60 +80,41 @@ class Parser:
"""
logger.info(f"Parsing file: {file_name} with type: {file_type}")
logger.info(
f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, "
f"Chunking config: size={config.chunk_size}, "
f"overlap={config.chunk_overlap}, "
f"multimodal={config.enable_multimodal}"
)
parser_instance = None
try:
# Get appropriate parser for file type
cls = self.get_parser(file_type)
if cls is None:
logger.error(f"Unsupported file type: {file_type}")
return None
# Parse file content
logger.info(f"Creating parser instance for {file_type} file")
parser_instance = cls(
file_name=file_name,
file_type=file_type,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
separators=config.separators,
enable_multimodal=config.enable_multimodal,
max_image_size=1920, # Limit image size to 1920px
max_concurrent_tasks=5, # Limit concurrent tasks to 5
chunking_config=config, # Pass the entire chunking config
)
# Get appropriate parser for file type
cls = self.get_parser(file_type)
logger.info(f"Starting to parse file content, size: {len(content)} bytes")
result = parser_instance.parse(content)
# Parse file content
logger.info(f"Creating parser instance for {file_type} file")
parser = cls(
file_name=file_name,
file_type=file_type,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
separators=config.separators,
enable_multimodal=config.enable_multimodal,
max_image_size=1920, # Limit image size to 1920px
max_concurrent_tasks=5, # Limit concurrent tasks to 5
chunking_config=config, # Pass the entire chunking config
)
if result:
logger.info(
f"Successfully parsed file {file_name}, generated {len(result.chunks)} chunks"
)
if result.chunks and len(result.chunks) > 0:
logger.info(
f"First chunk content length: {len(result.chunks[0].content)}"
)
else:
logger.warning(f"Parser returned empty chunks for file: {file_name}")
else:
logger.warning(f"Parser returned None result for file: {file_name}")
logger.info(f"Starting to parse file content, size: {len(content)} bytes")
result = parser.parse(content)
# Return parse results
return result
if not result.content:
logger.warning(f"Parser returned empty content for file: {file_name}")
elif not result.chunks:
logger.warning(f"Parser returned empty chunks for file: {file_name}")
elif result.chunks[0]:
logger.info(f"First chunk content length: {len(result.chunks[0].content)}")
logger.info(f"Parsed file {file_name}, with {len(result.chunks)} chunks")
return result
except Exception as e:
logger.error(f"Error parsing file {file_name}: {str(e)}")
logger.info(f"Detailed traceback: {traceback.format_exc()}")
return None
def parse_url(
self, url: str, title: str, config: ChunkingConfig
) -> Optional[ParseResult]:
def parse_url(self, url: str, title: str, config: ChunkingConfig) -> Document:
"""
Parse content from a URL using the WebParser.
@@ -163,44 +128,31 @@ class Parser:
"""
logger.info(f"Parsing URL: {url}, title: {title}")
logger.info(
f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, "
f"multimodal={config.enable_multimodal}"
f"Chunking config: size={config.chunk_size}, "
f"overlap={config.chunk_overlap}, multimodal={config.enable_multimodal}"
)
parser_instance = None
try:
# Create web parser instance
logger.info("Creating WebParser instance")
parser_instance = WebParser(
title=title,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
separators=config.separators,
enable_multimodal=config.enable_multimodal,
max_image_size=1920, # Limit image size
max_concurrent_tasks=5, # Limit concurrent tasks
chunking_config=config,
)
# Create web parser instance
logger.info("Creating WebParser instance")
parser = WebParser(
title=title,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
separators=config.separators,
enable_multimodal=config.enable_multimodal,
max_image_size=1920, # Limit image size
max_concurrent_tasks=5, # Limit concurrent tasks
chunking_config=config,
)
logger.info(f"Starting to parse URL content")
result = parser_instance.parse(url)
if result:
logger.info(
f"Successfully parsed URL, generated {len(result.chunks)} chunks"
)
logger.info(
f"First chunk content length: {len(result.chunks[0].content) if result.chunks else 0}"
)
else:
logger.warning(f"Parser returned empty result for URL: {url}")
# Return parse results
return result
except Exception as e:
logger.error(f"Error parsing URL {url}: {str(e)}")
logger.info(f"Detailed traceback: {traceback.format_exc()}")
return None
logger.info("Starting to parse URL content")
result = parser.parse(url.encode())
if not result.content:
logger.warning(f"Parser returned empty content for url: {url}")
elif not result.chunks:
logger.warning(f"Parser returned empty chunks for url: {url}")
elif result.chunks[0]:
logger.info(f"First chunk content length: {len(result.chunks[0].content)}")
logger.info(f"Parsed url {url}, with {len(result.chunks)} chunks")
return result
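End to end, parse_file picks the parser for the file type, applies the ChunkingConfig, and returns a Document with chunks. A hedged driver sketch (field values are illustrative and ChunkingConfig may carry more fields than appear in this diff):

# Hypothetical driver; only fields visible in this diff are used.
config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=60,
    separators=["\n\n", "\n", "。"],
    enable_multimodal=False,
)
parser = Parser()
with open("manual.pdf", "rb") as f:
    document = parser.parse_file("manual.pdf", "pdf", f.read(), config)
for chunk in document.chunks:
    print(len(chunk.content))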

View File

@@ -1,113 +1,7 @@
import logging
import os
import io
from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union
from docreader.parser.chain_parser import FirstParser
from docreader.parser.markitdown_parser import MarkitdownParser
from docreader.parser.mineru_parser import MinerUParser
import pdfplumber
import tempfile
from .base_parser import BaseParser
logger = logging.getLogger(__name__)
class PDFParser(BaseParser):
"""
PDF Document Parser
This parser handles PDF documents by extracting text content.
It uses the pypdf library for simple text extraction.
"""
def _convert_table_to_markdown(self, table_data: list) -> str:
if not table_data or not table_data[0]: return ""
def clean_cell(cell):
if cell is None: return ""
return str(cell).replace("\n", " <br> ")
try:
markdown = ""
header = [clean_cell(cell) for cell in table_data[0]]
markdown += "| " + " | ".join(header) + " |\n"
markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
for row in table_data[1:]:
if not row: continue
body_row = [clean_cell(cell) for cell in row]
if len(body_row) != len(header):
logger.warning(f"Skipping malformed table row: {body_row}")
continue
markdown += "| " + " | ".join(body_row) + " |\n"
return markdown
except Exception as e:
logger.error(f"Error converting table to markdown: {e}")
return ""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
logger.info(f"Parsing PDF with pdfplumber, content size: {len(content)} bytes")
all_page_content = []
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
temp_pdf_path = temp_pdf.name
try:
temp_pdf.write(content)
temp_pdf.close()
logger.info(f"PDF content written to temporary file: {temp_pdf_path}")
with pdfplumber.open(temp_pdf_path) as pdf:
logger.info(f"PDF has {len(pdf.pages)} pages")
for page_num, page in enumerate(pdf.pages):
page_content_parts = []
# Try-fallback strategy for table detection
default_settings = { "vertical_strategy": "lines", "horizontal_strategy": "lines" }
found_tables = page.find_tables(default_settings)
if not found_tables:
logger.info(f"Page {page_num+1}: Default strategy found no tables. Trying fallback strategy.")
fallback_settings = { "vertical_strategy": "text", "horizontal_strategy": "lines" }
found_tables = page.find_tables(fallback_settings)
table_bboxes = [table.bbox for table in found_tables]
# Define a filter function that keeps objects NOT inside any table bbox.
def not_within_bboxes(obj):
"""Check if an object is outside all table bounding boxes."""
for bbox in table_bboxes:
# Check if the object's vertical center is within a bbox
if bbox[1] <= (obj["top"] + obj["bottom"]) / 2 <= bbox[3]:
return False # It's inside a table, so we DON'T keep it
return True # It's outside all tables, so we DO keep it
# that contains only the non-table text.
non_table_page = page.filter(not_within_bboxes)
# Now, extract text from this filtered page view.
text = non_table_page.extract_text(x_tolerance=2)
if text:
page_content_parts.append(text)
# Process and append the structured Markdown tables
if found_tables:
logger.info(f"Found {len(found_tables)} tables on page {page_num + 1}")
for table in found_tables:
markdown_table = self._convert_table_to_markdown(table.extract())
page_content_parts.append(f"\n\n{markdown_table}\n\n")
all_page_content.append("".join(page_content_parts))
final_text = "\n\n--- Page Break ---\n\n".join(all_page_content)
logger.info(f"PDF parsing complete. Extracted {len(final_text)} text chars.")
return final_text
except Exception as e:
logger.error(f"Failed to parse PDF document: {str(e)}")
return ""
finally:
# This block is GUARANTEED to execute, preventing resource leaks.
if os.path.exists(temp_pdf_path):
try:
os.remove(temp_pdf_path)
logger.info(f"Temporary file cleaned up: {temp_pdf_path}")
except OSError as e:
logger.error(f"Error removing temporary file {temp_pdf_path}: {e}")
class PDFParser(FirstParser):
_parser_cls = (MinerUParser, MarkitdownParser)

View File

@@ -1,64 +1,68 @@
# -*- coding: utf-8 -*-
import os
import uuid
import logging
import io
import logging
import os
import traceback
import uuid
from abc import ABC, abstractmethod
from typing import Tuple, Optional
from typing import Dict
from qcloud_cos import CosConfig, CosS3Client
from minio import Minio
from qcloud_cos import CosConfig, CosS3Client
from docreader.utils import endecode
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class Storage(ABC):
"""Abstract base class for object storage operations"""
@abstractmethod
def upload_file(self, file_path: str) -> str:
"""Upload file to object storage
Args:
file_path: File path
Returns:
File URL
"""
pass
@abstractmethod
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to object storage
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
pass
class CosStorage(Storage):
"""Tencent Cloud COS storage implementation"""
def __init__(self, storage_config=None):
"""Initialize COS storage
Args:
storage_config: Storage configuration
"""
self.storage_config = storage_config
self.client, self.bucket_name, self.region, self.prefix = self._init_cos_client()
self.client, self.bucket_name, self.region, self.prefix = (
self._init_cos_client()
)
def _init_cos_client(self):
"""Initialize Tencent Cloud COS client"""
try:
# Use provided COS config if available, otherwise fall back to environment variables
# Use provided COS config if available,
# otherwise fall back to environment variables
if self.storage_config and self.storage_config.get("access_key_id") != "":
cos_config = self.storage_config
secret_id = cos_config.get("access_key_id")
@@ -75,15 +79,16 @@ class CosStorage(Storage):
bucket_name = os.getenv("COS_BUCKET_NAME")
appid = os.getenv("COS_APP_ID")
prefix = os.getenv("COS_PATH_PREFIX")
enable_old_domain = (
os.getenv("COS_ENABLE_OLD_DOMAIN", "true").lower() == "true"
)
if not all([secret_id, secret_key, region, bucket_name, appid]):
logger.error(
"Incomplete COS configuration, missing required environment variables"
f"secret_id: {secret_id}, secret_key: {secret_key}, region: {region}, bucket_name: {bucket_name}, appid: {appid}"
"Incomplete COS configuration, missing environment variables"
f"secret_id: {secret_id}, secret_key: {secret_key}, "
f"region: {region}, bucket_name: {bucket_name}, appid: {appid}"
)
return None, None, None, None
@@ -105,27 +110,26 @@ class CosStorage(Storage):
except Exception as e:
logger.error(f"Failed to initialize COS client: {str(e)}")
return None, None, None, None
def _get_download_url(self, bucket_name, region, object_key):
"""Generate COS object URL
Args:
bucket_name: Bucket name
region: Region
object_key: Object key
Returns:
File URL
"""
return f"https://{bucket_name}.cos.{region}.myqcloud.com/{object_key}"
def upload_file(self, file_path: str) -> str:
"""Upload file to Tencent Cloud COS
Args:
file_path: File path
Returns:
File URL
"""
@@ -135,16 +139,16 @@ class CosStorage(Storage):
return ""
# Generate object key, use UUID to avoid conflicts
file_name = os.path.basename(file_path)
object_key = (
f"{self.prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
)
file_ext = os.path.splitext(file_path)[1]
object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
logger.info(f"Generated object key: {object_key}")
# Upload file
logger.info("Attempting to upload file to COS")
response = self.client.upload_file(
Bucket=self.bucket_name, LocalFilePath=file_path, Key=object_key
self.client.upload_file(
Bucket=self.bucket_name,
LocalFilePath=file_path,
Key=object_key,
)
# Get file URL
@@ -156,14 +160,14 @@ class CosStorage(Storage):
except Exception as e:
logger.error(f"Failed to upload file to COS: {str(e)}")
return ""
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to Tencent Cloud COS
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
@@ -171,10 +175,16 @@ class CosStorage(Storage):
logger.info(f"Uploading bytes content to COS, size: {len(content)} bytes")
if not self.client:
return ""
object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.prefix else f"images/{uuid.uuid4().hex}{file_ext}"
object_key = (
f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
if self.prefix
else f"images/{uuid.uuid4().hex}{file_ext}"
)
logger.info(f"Generated object key: {object_key}")
self.client.put_object(Bucket=self.bucket_name, Body=content, Key=object_key)
self.client.put_object(
Bucket=self.bucket_name, Body=content, Key=object_key
)
file_url = self._get_download_url(self.bucket_name, self.region, object_key)
logger.info(f"Successfully uploaded bytes to COS: {file_url}")
return file_url
@@ -186,16 +196,18 @@ class CosStorage(Storage):
class MinioStorage(Storage):
"""MinIO storage implementation"""
def __init__(self, storage_config=None):
"""Initialize MinIO storage
Args:
storage_config: Storage configuration
"""
self.storage_config = storage_config
self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = self._init_minio_client()
self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = (
self._init_minio_client()
)
def _init_minio_client(self):
"""Initialize MinIO client from environment variables or injected config.
@@ -203,58 +215,69 @@ class MinioStorage(Storage):
prefer those values to override envs.
"""
try:
endpoint = os.getenv("MINIO_ENDPOINT")
endpoint = os.getenv("MINIO_ENDPOINT", "")
use_ssl = os.getenv("MINIO_USE_SSL", "false").lower() == "true"
if self.storage_config and self.storage_config.get("bucket_name"):
storage_config = self.storage_config
bucket_name = storage_config.get("bucket_name")
bucket_name = storage_config.get("bucket_name", "")
path_prefix = storage_config.get("path_prefix").strip().strip("/")
access_key = storage_config.get("access_key_id")
secret_key = storage_config.get("secret_access_key")
else:
access_key = os.getenv("MINIO_ACCESS_KEY_ID")
secret_key = os.getenv("MINIO_SECRET_ACCESS_KEY")
bucket_name = os.getenv("MINIO_BUCKET_NAME")
bucket_name = os.getenv("MINIO_BUCKET_NAME", "")
path_prefix = os.getenv("MINIO_PATH_PREFIX", "").strip().strip("/")
if not all([endpoint, access_key, secret_key, bucket_name]):
logger.error("Incomplete MinIO configuration, missing required environment variables")
logger.error(
"Incomplete MinIO configuration, missing environment variables"
)
return None, None, None, None, None
# Initialize client
client = Minio(endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl)
client = Minio(
endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl
)
# Ensure bucket exists
found = client.bucket_exists(bucket_name)
if not found:
client.make_bucket(bucket_name)
policy = '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}' % (bucket_name, bucket_name)
policy = (
'{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}'
% (bucket_name, bucket_name)
)
client.set_bucket_policy(bucket_name, policy)
return client, bucket_name, use_ssl, endpoint, path_prefix
except Exception as e:
logger.error(f"Failed to initialize MinIO client: {str(e)}")
return None, None, None, None, None
def _get_download_url(self, bucket_name: str, object_key: str, use_ssl: bool, endpoint: str, public_endpoint: str = None):
def _get_download_url(self, object_key: str):
"""Construct a public URL for MinIO object.
If MINIO_PUBLIC_ENDPOINT is provided, use it; otherwise fallback to endpoint.
"""
if public_endpoint:
base = public_endpoint
else:
scheme = "https" if use_ssl else "http"
base = f"{scheme}://{endpoint}"
# Path-style URL for MinIO
return f"{base}/{bucket_name}/{object_key}"
# 1. Use public endpoint if provided
endpoint = os.getenv("MINIO_PUBLIC_ENDPOINT")
if endpoint:
return f"{endpoint}/{self.bucket_name}/{object_key}"
# 2. Use SSL if enabled
if self.use_ssl:
return f"https://{self.endpoint}/{self.bucket_name}/{object_key}"
# 3. Use HTTP default
return f"http://{self.endpoint}/{self.bucket_name}/{object_key}"
def upload_file(self, file_path: str) -> str:
"""Upload file to MinIO
Args:
file_path: File path
Returns:
File URL
"""
@@ -265,29 +288,27 @@ class MinioStorage(Storage):
# Generate object key, use UUID to avoid conflicts
file_name = os.path.basename(file_path)
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" if self.path_prefix else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
object_key = (
f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
if self.path_prefix
else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
)
logger.info(f"Generated MinIO object key: {object_key}")
# Upload file
logger.info("Attempting to upload file to MinIO")
with open(file_path, 'rb') as file_data:
with open(file_path, "rb") as file_data:
file_size = os.path.getsize(file_path)
self.client.put_object(
bucket_name=self.bucket_name,
bucket_name=self.bucket_name or "",
object_name=object_key,
data=file_data,
length=file_size,
content_type='application/octet-stream'
content_type="application/octet-stream",
)
# Get file URL
file_url = self._get_download_url(
self.bucket_name,
object_key,
self.use_ssl,
self.endpoint,
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
)
file_url = self._get_download_url(object_key)
logger.info(f"Successfully uploaded file to MinIO: {file_url}")
return file_url
@@ -295,14 +316,14 @@ class MinioStorage(Storage):
except Exception as e:
logger.error(f"Failed to upload file to MinIO: {str(e)}")
return ""
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to MinIO
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
@@ -310,23 +331,21 @@ class MinioStorage(Storage):
logger.info(f"Uploading bytes content to MinIO, size: {len(content)} bytes")
if not self.client:
return ""
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.path_prefix else f"images/{uuid.uuid4().hex}{file_ext}"
object_key = (
f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}"
if self.path_prefix
else f"images/{uuid.uuid4().hex}{file_ext}"
)
logger.info(f"Generated MinIO object key: {object_key}")
self.client.put_object(
self.bucket_name,
object_key,
data=io.BytesIO(content),
length=len(content),
content_type="application/octet-stream"
)
file_url = self._get_download_url(
self.bucket_name,
object_key,
self.use_ssl,
self.endpoint,
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
self.bucket_name or "",
object_key,
data=io.BytesIO(content),
length=len(content),
content_type="application/octet-stream",
)
file_url = self._get_download_url(object_key)
logger.info(f"Successfully uploaded bytes to MinIO: {file_url}")
return file_url
except Exception as e:
@@ -335,26 +354,61 @@ class MinioStorage(Storage):
return ""
def create_storage(storage_config=None) -> Storage:
class LocalStorage(Storage):
"""Local file system storage implementation"""
def __init__(self, storage_config: Dict[str, str] = {}):
self.storage_config = storage_config
base_dir = storage_config.get(
"base_dir", os.getenv("LOCAL_STORAGE_BASE_DIR", "")
)
self.image_dir = os.path.join(base_dir, "images")
os.makedirs(self.image_dir, exist_ok=True)
def upload_file(self, file_path: str) -> str:
logger.info(f"Uploading file to local storage: {file_path}")
return file_path
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
logger.info(f"Uploading file to local storage: {len(content)} bytes")
fname = os.path.join(self.image_dir, f"{uuid.uuid4()}{file_ext}")
with open(fname, "wb") as f:
f.write(content)
return fname
class Base64Storage(Storage):
def upload_file(self, file_path: str) -> str:
logger.info(f"Uploading file to base64 storage: {file_path}")
return file_path
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
logger.info(f"Uploading file to base64 storage: {len(content)} bytes")
file_ext = file_ext.lstrip(".")
return f"data:image/{file_ext};base64,{endecode.decode_image(content)}"
def create_storage(storage_config: Dict[str, str] | None = None) -> Storage:
"""Create a storage instance based on configuration or environment variables
Args:
storage_config: Storage configuration dictionary
Returns:
Storage instance
"""
storage_type = os.getenv("STORAGE_TYPE", "cos").lower()
if storage_config:
storage_type = str(storage_config.get("provider", storage_type)).lower()
logger.info(f"Creating {storage_type} storage instance")
if storage_type == "minio":
return MinioStorage(storage_config)
elif storage_type == "cos":
# Default to COS
return CosStorage(storage_config)
else:
return None
elif storage_type == "local":
return LocalStorage(storage_config or {})
elif storage_type == "base64":
return Base64Storage()
raise ValueError(f"Invalid storage type: {storage_type}")

View File

@@ -1,6 +1,8 @@
import logging
from .base_parser import BaseParser
from typing import Dict, Any, Tuple, Union
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
logger = logging.getLogger(__name__)
@@ -11,7 +13,7 @@ class TextParser(BaseParser):
This parser handles text extraction and chunking from plain text documents.
"""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
def parse_into_text(self, content: bytes) -> Document:
"""
Parse text document content by decoding bytes to string.
@@ -25,20 +27,15 @@ class TextParser(BaseParser):
Parsed text content as string
"""
logger.info(f"Parsing text document, content size: {len(content)} bytes")
text = self.decode_bytes(content)
text = endecode.decode_bytes(content)
logger.info(
f"Successfully parsed text document, extracted {len(text)} characters"
)
return text
return Document(content=text)
if __name__ == "__main__":
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger.info("Running TextParser in standalone mode")
logger = logging.getLogger(__name__)
# Sample text for testing
text = """## 标题1

View File

@@ -1,11 +1,14 @@
from typing import Any, Optional, Tuple, Dict, Union
import os
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from .base_parser import BaseParser, ParseResult
import logging
import asyncio
import logging
import os
from typing import Any
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
logger = logging.getLogger(__name__)
@@ -59,7 +62,7 @@ class WebParser(BaseParser):
# Return empty BeautifulSoup object on error
return BeautifulSoup("", "html.parser")
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
def parse_into_text(self, content: bytes) -> Document:
"""Parse web page
Args:
@@ -78,10 +81,10 @@ class WebParser(BaseParser):
# Run async method
# Handle content possibly being a string
if isinstance(content, bytes):
url = self.decode_bytes(content)
url = endecode.decode_bytes(content)
logger.info(f"Decoded URL from bytes: {url}")
else:
url = content
url = str(content)
logger.info(f"Using content as URL directly: {url}")
logger.info(f"Scraping web page: {url}")
@@ -118,11 +121,11 @@ class WebParser(BaseParser):
logger.info(
f"Web page parsing complete, total content: {len(result)} characters"
)
return result
return Document(content=result)
except Exception as e:
logger.error(f"Error parsing web page: {str(e)}")
return f"Error parsing web page: {str(e)}"
return Document(content=f"Error parsing web page: {str(e)}")
finally:
# Close event loop

View File

@@ -0,0 +1,127 @@
from google.protobuf.internal import containers as _containers
from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from collections.abc import Iterable as _Iterable, Mapping as _Mapping
from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
DESCRIPTOR: _descriptor.FileDescriptor
class StorageProvider(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
__slots__ = ()
STORAGE_PROVIDER_UNSPECIFIED: _ClassVar[StorageProvider]
COS: _ClassVar[StorageProvider]
MINIO: _ClassVar[StorageProvider]
STORAGE_PROVIDER_UNSPECIFIED: StorageProvider
COS: StorageProvider
MINIO: StorageProvider
class StorageConfig(_message.Message):
__slots__ = ("provider", "region", "bucket_name", "access_key_id", "secret_access_key", "app_id", "path_prefix")
PROVIDER_FIELD_NUMBER: _ClassVar[int]
REGION_FIELD_NUMBER: _ClassVar[int]
BUCKET_NAME_FIELD_NUMBER: _ClassVar[int]
ACCESS_KEY_ID_FIELD_NUMBER: _ClassVar[int]
SECRET_ACCESS_KEY_FIELD_NUMBER: _ClassVar[int]
APP_ID_FIELD_NUMBER: _ClassVar[int]
PATH_PREFIX_FIELD_NUMBER: _ClassVar[int]
provider: StorageProvider
region: str
bucket_name: str
access_key_id: str
secret_access_key: str
app_id: str
path_prefix: str
def __init__(self, provider: _Optional[_Union[StorageProvider, str]] = ..., region: _Optional[str] = ..., bucket_name: _Optional[str] = ..., access_key_id: _Optional[str] = ..., secret_access_key: _Optional[str] = ..., app_id: _Optional[str] = ..., path_prefix: _Optional[str] = ...) -> None: ...
class VLMConfig(_message.Message):
__slots__ = ("model_name", "base_url", "api_key", "interface_type")
MODEL_NAME_FIELD_NUMBER: _ClassVar[int]
BASE_URL_FIELD_NUMBER: _ClassVar[int]
API_KEY_FIELD_NUMBER: _ClassVar[int]
INTERFACE_TYPE_FIELD_NUMBER: _ClassVar[int]
model_name: str
base_url: str
api_key: str
interface_type: str
def __init__(self, model_name: _Optional[str] = ..., base_url: _Optional[str] = ..., api_key: _Optional[str] = ..., interface_type: _Optional[str] = ...) -> None: ...
class ReadConfig(_message.Message):
__slots__ = ("chunk_size", "chunk_overlap", "separators", "enable_multimodal", "storage_config", "vlm_config")
CHUNK_SIZE_FIELD_NUMBER: _ClassVar[int]
CHUNK_OVERLAP_FIELD_NUMBER: _ClassVar[int]
SEPARATORS_FIELD_NUMBER: _ClassVar[int]
ENABLE_MULTIMODAL_FIELD_NUMBER: _ClassVar[int]
STORAGE_CONFIG_FIELD_NUMBER: _ClassVar[int]
VLM_CONFIG_FIELD_NUMBER: _ClassVar[int]
chunk_size: int
chunk_overlap: int
separators: _containers.RepeatedScalarFieldContainer[str]
enable_multimodal: bool
storage_config: StorageConfig
vlm_config: VLMConfig
def __init__(self, chunk_size: _Optional[int] = ..., chunk_overlap: _Optional[int] = ..., separators: _Optional[_Iterable[str]] = ..., enable_multimodal: bool = ..., storage_config: _Optional[_Union[StorageConfig, _Mapping]] = ..., vlm_config: _Optional[_Union[VLMConfig, _Mapping]] = ...) -> None: ...
class ReadFromFileRequest(_message.Message):
__slots__ = ("file_content", "file_name", "file_type", "read_config", "request_id")
FILE_CONTENT_FIELD_NUMBER: _ClassVar[int]
FILE_NAME_FIELD_NUMBER: _ClassVar[int]
FILE_TYPE_FIELD_NUMBER: _ClassVar[int]
READ_CONFIG_FIELD_NUMBER: _ClassVar[int]
REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
file_content: bytes
file_name: str
file_type: str
read_config: ReadConfig
request_id: str
def __init__(self, file_content: _Optional[bytes] = ..., file_name: _Optional[str] = ..., file_type: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ...
class ReadFromURLRequest(_message.Message):
__slots__ = ("url", "title", "read_config", "request_id")
URL_FIELD_NUMBER: _ClassVar[int]
TITLE_FIELD_NUMBER: _ClassVar[int]
READ_CONFIG_FIELD_NUMBER: _ClassVar[int]
REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
url: str
title: str
read_config: ReadConfig
request_id: str
def __init__(self, url: _Optional[str] = ..., title: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ...
class Image(_message.Message):
__slots__ = ("url", "caption", "ocr_text", "original_url", "start", "end")
URL_FIELD_NUMBER: _ClassVar[int]
CAPTION_FIELD_NUMBER: _ClassVar[int]
OCR_TEXT_FIELD_NUMBER: _ClassVar[int]
ORIGINAL_URL_FIELD_NUMBER: _ClassVar[int]
START_FIELD_NUMBER: _ClassVar[int]
END_FIELD_NUMBER: _ClassVar[int]
url: str
caption: str
ocr_text: str
original_url: str
start: int
end: int
def __init__(self, url: _Optional[str] = ..., caption: _Optional[str] = ..., ocr_text: _Optional[str] = ..., original_url: _Optional[str] = ..., start: _Optional[int] = ..., end: _Optional[int] = ...) -> None: ...
class Chunk(_message.Message):
__slots__ = ("content", "seq", "start", "end", "images")
CONTENT_FIELD_NUMBER: _ClassVar[int]
SEQ_FIELD_NUMBER: _ClassVar[int]
START_FIELD_NUMBER: _ClassVar[int]
END_FIELD_NUMBER: _ClassVar[int]
IMAGES_FIELD_NUMBER: _ClassVar[int]
content: str
seq: int
start: int
end: int
images: _containers.RepeatedCompositeFieldContainer[Image]
def __init__(self, content: _Optional[str] = ..., seq: _Optional[int] = ..., start: _Optional[int] = ..., end: _Optional[int] = ..., images: _Optional[_Iterable[_Union[Image, _Mapping]]] = ...) -> None: ...
class ReadResponse(_message.Message):
__slots__ = ("chunks", "error")
CHUNKS_FIELD_NUMBER: _ClassVar[int]
ERROR_FIELD_NUMBER: _ClassVar[int]
chunks: _containers.RepeatedCompositeFieldContainer[Chunk]
error: str
def __init__(self, chunks: _Optional[_Iterable[_Union[Chunk, _Mapping]]] = ..., error: _Optional[str] = ...) -> None: ...

View File

@@ -3,7 +3,7 @@
import grpc
import warnings
from . import docreader_pb2 as docreader__pb2
import docreader_pb2 as docreader__pb2
GRPC_GENERATED_VERSION = '1.76.0'
GRPC_VERSION = grpc.__version__

View File

@@ -16,6 +16,7 @@ dependencies = [
"lxml>=6.0.2",
"markdown>=3.10",
"markdownify>=1.2.0",
"markitdown[docx,pdf,xls,xlsx]>=0.1.3",
"minio>=7.2.18",
"mistletoe>=1.5.0",
"ollama>=0.6.0",
@@ -26,6 +27,7 @@ dependencies = [
"pillow>=12.0.0",
"playwright>=1.55.0",
"protobuf>=6.33.0",
"pydantic>=2.12.3",
"pypdf>=6.1.3",
"pypdf2>=3.0.1",
"python-docx>=1.2.0",

View File

@@ -2,13 +2,14 @@
set -x
# Set directories
PROTO_DIR="proto"
PYTHON_OUT="proto"
GO_OUT="proto"
PROTO_DIR="docreader/proto"
PYTHON_OUT="docreader/proto"
GO_OUT="docreader/proto"
# Generate Python code
python3 -m grpc_tools.protoc -I${PROTO_DIR} \
--python_out=${PYTHON_OUT} \
--pyi_out=${PYTHON_OUT} \
--grpc_python_out=${PYTHON_OUT} \
${PROTO_DIR}/docreader.proto
@@ -22,10 +23,10 @@ protoc -I${PROTO_DIR} --go_out=${GO_OUT} \
# Fix Python import paths (macOS-compatible version)
if [ "$(uname)" == "Darwin" ]; then
# macOS version
sed -i '' 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
sed -i '' 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
else
# Linux version
sed -i 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
sed -i 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
fi
echo "Proto files generated successfully!"

View File

@@ -0,0 +1,112 @@
import re
from typing import Callable, Dict, List, Match, Pattern, Union
from pydantic import BaseModel, Field
class HeaderTrackerHook(BaseModel):
"""表头追踪Hook的配置类支持多种场景的表头识别"""
start_pattern: Pattern[str] = Field(
description="表头开始匹配(正则表达式或字符串)"
)
end_pattern: Pattern[str] = Field(description="表头结束匹配(正则表达式或字符串)")
extract_header_fn: Callable[[Match[str]], str] = Field(
default=lambda m: m.group(0),
description="从开始匹配结果中提取表头内容的函数(默认取匹配到的整个内容)",
)
priority: int = Field(default=0, description="优先级(多个配置时,高优先级先匹配)")
case_sensitive: bool = Field(
default=True, description="是否大小写敏感仅当传入字符串pattern时生效"
)
def __init__(
self,
start_pattern: Union[str, Pattern[str]],
end_pattern: Union[str, Pattern[str]],
**kwargs,
):
flags = 0 if kwargs.get("case_sensitive", True) else re.IGNORECASE
if isinstance(start_pattern, str):
start_pattern = re.compile(start_pattern, flags | re.DOTALL)
if isinstance(end_pattern, str):
end_pattern = re.compile(end_pattern, flags | re.DOTALL)
super().__init__(
start_pattern=start_pattern,
end_pattern=end_pattern,
**kwargs,
)
# Default header-hook configs: Markdown tables are enabled; code-block support is kept commented out
DEFAULT_CONFIGS = [
# Code block config (opens with ```, closes with ```)
# HeaderTrackerHook(
# # Code block start (language tag supported)
# start_pattern=r"^\s*```(\w+).*(?!```$)",
# # Code block end
# end_pattern=r"^\s*```.*$",
# extract_header_fn=lambda m: f"```{m.group(1)}" if m.group(1) else "```",
# priority=20, # code blocks take precedence over tables
# case_sensitive=True,
# ),
# Markdown table config (header row followed by a separator row)
HeaderTrackerHook(
# Header row + separator row
start_pattern=r"^\s*(?:\|[^|\n]*)+[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|?[\r\n]+$",
# Blank line or non-table content
end_pattern=r"^\s*$|^\s*[^|\s].*$",
priority=15,
case_sensitive=False,
),
]
DEFAULT_CONFIGS.sort(key=lambda x: -x.priority)
# Hook state data structure
class HeaderTracker(BaseModel):
"""表头追踪 Hook 的状态类"""
header_hook_configs: List[HeaderTrackerHook] = Field(default=DEFAULT_CONFIGS)
active_headers: Dict[int, str] = Field(default_factory=dict)
ended_headers: set[int] = Field(default_factory=set)
def update(self, split: str) -> Dict[int, str]:
"""检测当前split中的表头开始/结束更新Hook状态"""
new_headers: Dict[int, str] = {}
# 1. Check for header end markers
for config in self.header_hook_configs:
if config.priority in self.active_headers and config.end_pattern.search(
split
):
self.ended_headers.add(config.priority)
del self.active_headers[config.priority]
# 2. Check for new header start markers (only configs that are neither active nor already ended)
for config in self.header_hook_configs:
if (
config.priority not in self.active_headers
and config.priority not in self.ended_headers
):
match = config.start_pattern.search(split)
if match:
header = config.extract_header_fn(match)
self.active_headers[config.priority] = header
new_headers[config.priority] = header
# 3. If no headers remain active, clear the ended markers
if not self.active_headers:
self.ended_headers.clear()
return new_headers
def get_headers(self) -> str:
"""获取当前所有活跃表头的拼接文本(按优先级排序)"""
# 按优先级降序排列表头
sorted_headers = sorted(self.active_headers.items(), key=lambda x: -x[0])
return (
"\n".join([header for _, header in sorted_headers])
if sorted_headers
else ""
)
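A small sketch of the intended flow, assuming the default Markdown-table config (exact matches depend on how the splitter cuts the text):

from docreader.splitter.header_hook import HeaderTracker

tracker = HeaderTracker()

# A split containing only a Markdown header row plus its separator row
tracker.update("| Name | Age |\n| --- | --- |\n")
print(tracker.get_headers())  # the table header is now active and will be re-injected

# A split that starts with plain prose ends the active table header
tracker.update("Regular paragraph after the table.")
print(tracker.get_headers())  # -> "" (no active headers)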

View File

@@ -0,0 +1,313 @@
"""Token splitter."""
import itertools
import logging
import re
from typing import Callable, Generic, List, Pattern, Tuple, TypeVar
from pydantic import BaseModel, Field, PrivateAttr
from docreader.splitter.header_hook import (
HeaderTracker,
)
from docreader.utils.split import split_by_char, split_by_sep
DEFAULT_CHUNK_OVERLAP = 100
DEFAULT_CHUNK_SIZE = 512
T = TypeVar("T")
logger = logging.getLogger(__name__)
class TextSplitter(BaseModel, Generic[T]):
chunk_size: int = Field(description="The token chunk size for each chunk.")
chunk_overlap: int = Field(
description="The token overlap of each chunk when splitting."
)
separators: List[str] = Field(
description="Default separators for splitting into words"
)
# Try to keep the matched characters as a whole.
# If it's too long, the content will be further segmented.
protected_regex: List[str] = Field(
description="Protected regex for splitting into words"
)
len_function: Callable[[str], int] = Field(description="The length function.")
# Header tracking Hook related attributes
header_hook: HeaderTracker = Field(default_factory=HeaderTracker, exclude=True)
_protected_fns: List[Pattern] = PrivateAttr()
_split_fns: List[Callable] = PrivateAttr()
def __init__(
self,
chunk_size: int = DEFAULT_CHUNK_SIZE,
chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
separators: List[str] = ["\n", "", " "],
protected_regex: List[str] = [
# math formula
r"\$\$[\s\S]*?\$\$",
# image
r"!\[.*?\]\(.*?\)",
# link
r"\[.*?\]\(.*?\)",
# table header
r"(?:\|[^|\n]*)+\|[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|[\r\n]+",
# table body
r"(?:\|[^|\n]*)+\|[\r\n]+",
# code header
r"```(?:\w+)[\r\n]+[^\r\n]*",
],
length_function: Callable[[str], int] = lambda x: len(x),
):
"""Initialize with parameters."""
if chunk_overlap > chunk_size:
raise ValueError(
f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
f"({chunk_size}), should be smaller."
)
super().__init__(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=separators,
protected_regex=protected_regex,
len_function=length_function,
)
self._protected_fns = [re.compile(reg) for reg in protected_regex]
self._split_fns = [split_by_sep(sep) for sep in separators] + [split_by_char()]
def split_text(self, text: str) -> List[Tuple[int, int, str]]:
"""Split text into chunks."""
if text == "":
return []
splits = self._split(text)
protect = self._split_protected(text)
splits = self._join(splits, protect)
assert "".join(splits) == text
chunks = self._merge(splits)
return chunks
def _split(self, text: str) -> List[str]:
"""Break text into splits that are smaller than chunk size.
NOTE: the splits contain the separators.
"""
if self.len_function(text) <= self.chunk_size:
return [text]
splits = []
for split_fn in self._split_fns:
splits = split_fn(text)
if len(splits) > 1:
break
new_splits = []
for split in splits:
split_len = self.len_function(split)
if split_len <= self.chunk_size:
new_splits.append(split)
else:
# recursively split
new_splits.extend(self._split(split))
return new_splits
def _merge(self, splits: List[str]) -> List[Tuple[int, int, str]]:
"""Merge splits into chunks.
The high-level idea is to keep adding splits to a chunk until we
exceed the chunk size, then we start a new chunk with overlap.
When we start a new chunk, we pop off the first element of the previous
chunk until the total length is less than the chunk size.
"""
chunks: List[Tuple[int, int, str]] = []
cur_chunk: List[Tuple[int, int, str]] = []
cur_headers, cur_len = "", 0
cur_start, cur_end = 0, 0
for split in splits:
cur_end = cur_start + len(split)
split_len = self.len_function(split)
if split_len > self.chunk_size:
logger.error(
f"Got a split of size {split_len}, "
f"larger than chunk size {self.chunk_size}."
)
self.header_hook.update(split)
cur_headers = self.header_hook.get_headers()
cur_headers_len = self.len_function(cur_headers)
if cur_headers_len > self.chunk_size:
logger.error(
f"Got headers of size {cur_headers_len}, "
f"larger than chunk size {self.chunk_size}."
)
cur_headers, cur_headers_len = "", 0
# if we exceed the chunk size after adding the new split, then
# we need to end the current chunk and start a new one
if cur_len + split_len + cur_headers_len > self.chunk_size:
# end the previous chunk
if len(cur_chunk) > 0:
chunks.append(
(
cur_chunk[0][0],
cur_chunk[-1][1],
"".join([c[2] for c in cur_chunk]),
)
)
# start a new chunk with overlap
# keep popping off the first element of the previous chunk until:
# 1. the current chunk length is less than chunk overlap
# 2. the total length is less than chunk size
while cur_chunk and (
cur_len > self.chunk_overlap
or cur_len + split_len + cur_headers_len > self.chunk_size
):
# pop off the first element
first_chunk = cur_chunk.pop(0)
cur_len -= self.len_function(first_chunk[2])
if (
cur_headers
and split_len + cur_headers_len < self.chunk_size
and cur_headers not in split
):
cur_chunk.insert(
0,
(
cur_chunk[0][0] if cur_chunk else cur_start,
cur_chunk[0][1] if cur_chunk else cur_end,
cur_headers,
),
)
cur_len += cur_headers_len
cur_chunk.append((cur_start, cur_end, split))
cur_len += split_len
cur_start = cur_end
# handle the last chunk
assert cur_chunk
if cur_headers and cur_len < self.chunk_size:
cur_chunk.insert(0, (cur_chunk[0][0], cur_chunk[0][1], cur_headers))
chunks.append(
(
cur_chunk[0][0],
cur_chunk[-1][1],
"".join([c[2] for c in cur_chunk]),
)
)
return chunks
def _split_protected(self, text: str) -> List[Tuple[int, str]]:
matches = [
(match.start(), match.end())
for pattern in self._protected_fns
for match in pattern.finditer(text)
]
matches.sort(key=lambda x: (x[0], -x[1]))
res = []
def fold(initial: int, current: Tuple[int, int]) -> int:
if current[0] >= initial:
if current[1] - current[0] < self.chunk_size:
res.append((current[0], text[current[0] : current[1]]))
else:
logger.warning(f"Protected text ignore: {current}")
return max(initial, current[1])
# filter overlapping matches
list(itertools.accumulate(matches, fold, initial=-1))
return res
def _join(self, splits: List[str], protect: List[Tuple[int, str]]) -> List[str]:
"""
Merges and splits elements in splits array based on protected substrings.
The function processes the input splits to ensure all protected substrings
remain as single items. If a protected substring is concatenated with preceding
or following content in any split element, it will be separated from
the adjacent content. The final result maintains the original order of content
while enforcing the integrity of protected substrings.
Key behaviors:
1. Preserves the complete structure of each protected substring
2. Separates protected substrings from any adjacent non-protected content
3. Maintains the original sequence of all content except for necessary
4. Handles cases where protected substrings are partially concatenated
"""
j = 0
point, start = 0, 0
res = []
for split in splits:
end = start + len(split)
cur = split[point - start :]
while j < len(protect):
p_start, p_content = protect[j]
p_end = p_start + len(p_content)
if end <= p_start:
break
if point < p_start:
local_end = p_start - point
res.append(cur[:local_end])
cur = cur[local_end:]
point = p_start
res.append(p_content)
j += 1
if point < p_end:
local_start = p_end - point
cur = cur[local_start:]
point = p_end
if not cur:
break
if cur:
res.append(cur)
point = end
start = end
return res
if __name__ == "__main__":
s = """
这是一些普通文本。
| 姓名 | 年龄 | 城市 |
|------|------|------|
| 张三 | 25 | 北京 |
| 李四 | 30 | 上海 |
| 王五 | 28 | 广州 |
| 张三 | 25 | 北京 |
| 李四 | 30 | 上海 |
| 王五 | 28 | 广州 |
这是文本结束。
"""
sp = TextSplitter(chunk_size=200, chunk_overlap=2)
ck = sp.split_text(s)
for c in ck:
print("------", len(c))
print(c)
pass
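Continuing the demo above, the protected_regex behaviour can be seen with a tighter chunk size (a sketch; chunk boundaries depend on the chosen sizes and length function):

splitter = TextSplitter(chunk_size=40, chunk_overlap=0)
sample = "intro paragraph ![](http://example.com/figure-1.png) closing remark"
for start, end, chunk in splitter.split_text(sample):
    print(start, end, repr(chunk))
# The image link matches a protected regex, so it is never cut in half.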

103
docreader/utils/endecode.py Normal file
View File

@@ -0,0 +1,103 @@
import base64
import binascii
import io
import logging
from typing import List, Union
import numpy as np
from PIL import Image
logger = logging.getLogger(__name__)
def decode_image(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
"""Convert image to base64 encoded string
Args:
image: Image file path, bytes, PIL Image object, or numpy array
Returns:
Base64 encoded image string, or empty string if conversion fails
"""
if isinstance(image, str):
# It's a file path
with open(image, "rb") as image_file:
return base64.b64encode(image_file.read()).decode()
elif isinstance(image, bytes):
# It's bytes data
return base64.b64encode(image).decode()
elif isinstance(image, Image.Image):
# It's a PIL Image
buffer = io.BytesIO()
image.save(buffer, format=image.format)
return base64.b64encode(buffer.getvalue()).decode()
elif isinstance(image, np.ndarray):
# It's a numpy array
pil_image = Image.fromarray(image)
buffer = io.BytesIO()
pil_image.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode()
raise ValueError(f"Unsupported image type: {type(image)}")
def encode_image(image: str, errors="strict") -> bytes:
"""
Decode a base64-encoded image string into raw bytes.
errors
The error handling scheme to use for decoding errors.
The default is 'strict', meaning invalid base64 input raises a
binascii.Error; 'ignore' returns empty bytes instead.
"""
try:
image_bytes = base64.b64decode(image)
except binascii.Error as e:
if errors == "ignore":
return b""
else:
raise e
return image_bytes
def encode_bytes(content: str) -> bytes:
return content.encode()
def decode_bytes(
content: bytes,
encodings: List[str] = [
"utf-8",
"gb18030",
"gb2312",
"gbk",
"big5",
"ascii",
"latin-1",
],
) -> str:
# Try decoding with each encoding format
for encoding in encodings:
try:
text = content.decode(encoding)
logger.debug(f"Decode content with {encoding}: {len(text)} characters")
return text
except UnicodeDecodeError:
continue
text = content.decode(encoding="latin-1", errors="replace")
logger.warning(
"Unable to determine correct encoding, using latin-1 as fallback. "
"This may cause character issues."
)
return text
if __name__ == "__main__":
img = "test![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMgA)test"
encode_image(img, errors="ignore")
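A quick sanity check for the multi-encoding fallback in decode_bytes (the GBK sample is only an illustration):

from docreader.utils.endecode import decode_bytes

# UTF-8 decoding fails on these bytes, so the gb18030 fallback is used
gbk_bytes = "文档解析".encode("gbk")
assert decode_bytes(gbk_bytes) == "文档解析"

# Plain ASCII succeeds on the first (utf-8) attempt
assert decode_bytes(b"hello") == "hello"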

View File

@@ -1,10 +1,10 @@
from contextvars import ContextVar
import logging
import uuid
import contextlib
import logging
import time
from typing import Optional
import uuid
from contextvars import ContextVar
from logging import LogRecord
from typing import Optional
# Configure logging
logger = logging.getLogger(__name__)
@@ -26,21 +26,21 @@ def get_request_id() -> Optional[str]:
class MillisecondFormatter(logging.Formatter):
"""自定义日志格式化器,只显示毫秒级时间戳(3位数字)而不是微秒(6位)"""
def formatTime(self, record, datefmt=None):
"""重写formatTime方法将微秒格式化为毫秒"""
# Get the standard formatted time first
result = super().formatTime(record, datefmt)
# If the format contains .%f, truncate the 6-digit microseconds to 3-digit milliseconds
if datefmt and ".%f" in datefmt:
# The formatted time string should end with a 6-digit microsecond value
parts = result.split('.')
parts = result.split(".")
if len(parts) > 1 and len(parts[1]) >= 6:
# Keep only the first 3 digits as milliseconds
millis = parts[1][:3]
result = f"{parts[0]}.{millis}"
return result

34
docreader/utils/split.py Normal file
View File

@@ -0,0 +1,34 @@
import re
from typing import Callable, List
def split_text_keep_separator(text: str, separator: str) -> List[str]:
"""Split text with separator and keep the separator at the end of each split."""
parts = text.split(separator)
result = [separator + s if i > 0 else s for i, s in enumerate(parts)]
return [s for s in result if s]
def split_by_sep(sep: str, keep_sep: bool = True) -> Callable[[str], List[str]]:
"""Split text by separator."""
if keep_sep:
return lambda text: split_text_keep_separator(text, sep)
else:
return lambda text: text.split(sep)
def split_by_char() -> Callable[[str], List[str]]:
"""Split text by character."""
return lambda text: list(text)
def split_by_regex(regex: str) -> Callable[[str], List[str]]:
"""Split text by regex."""
pattern = re.compile(f"({regex})")
return lambda text: list(filter(None, pattern.split(text)))
def match_by_regex(regex: str) -> Callable[[str], bool]:
"""Split text by regex."""
pattern = re.compile(regex)
return lambda text: bool(pattern.match(text))
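For reference, a short sketch of how these helpers behave:

from docreader.utils.split import match_by_regex, split_by_char, split_by_regex, split_by_sep

split = split_by_sep("\n")            # keeps the separator on the following piece
print(split("a\nb\nc"))               # ['a', '\nb', '\nc']
print(split_by_char()("abc"))         # ['a', 'b', 'c']
by_digits = split_by_regex(r"\d+")    # the matched run stays as its own element
print(by_digits("ab12cd"))            # ['ab', '12', 'cd']
print(match_by_regex(r"^\s*\|")("| a | b |"))  # True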

View File

@@ -0,0 +1,77 @@
import logging
import os
import tempfile
logger = logging.getLogger(__name__)
class TempFileContext:
def __init__(self, file_content: bytes, suffix: str):
"""
Initialize the context
:param file_content: Byte data to write to file
:param suffix: File suffix
"""
self.file_content = file_content
self.suffix = suffix
self.temp_file = None
def __enter__(self):
"""
Create file when entering context
"""
self.temp_file = tempfile.NamedTemporaryFile(suffix=self.suffix, delete=False)
self.temp_file.write(self.file_content)
self.temp_file.flush()
logger.info(
f"Saved {self.suffix} content to temporary file: {self.temp_file.name}"
)
return self.temp_file.name
def __exit__(self, exc_type, exc_val, exc_tb):
"""
Delete file when exiting context
"""
if self.temp_file:
self.temp_file.close()
if os.path.exists(self.temp_file.name):
os.remove(self.temp_file.name)
logger.info(f"File {self.temp_file.name} has been deleted.")
# Return False to propagate exception (if any exception occurred)
return False
class TempDirContext:
def __init__(self):
"""
Initialize the context
"""
self.temp_dir = None
def __enter__(self):
"""
Create directory when entering context
"""
self.temp_dir = tempfile.TemporaryDirectory()
logger.info(f"Created temporary directory: {self.temp_dir.name}")
return self.temp_dir.name
def __exit__(self, exc_type, exc_val, exc_tb):
"""
Delete directory when exiting context
"""
if self.temp_dir and os.path.exists(self.temp_dir.name):
self.temp_dir.cleanup()
logger.info(f"Directory {self.temp_dir.name} has been deleted.")
# Return False to propagate exception (if any exception occurred)
return False
if __name__ == "__main__":
example_bytes = b"Hello, this is a test file."
file_name = "test_file.txt"
# Using with statement
with TempFileContext(example_bytes, file_name) as temp_file:
# File operations can be performed within the context
print(f"Does file {file_name} exist: {os.path.exists(file_name)}")

438
docreader/uv.lock generated
View File

@@ -6,17 +6,22 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
[[package]]
@@ -423,6 +428,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/db/d3/9dcc0f5797f070ec8edf30fbadfb200e71d9db6b84d211e3b2085a7589a0/click-8.3.0-py3-none-any.whl", hash = "sha256:9b9f285302c6e3064f4330c05f05b81945b2a39544279343e6e7c5f27a9baddc", size = 107295, upload-time = "2025-09-18T17:32:22.42Z" },
]
[[package]]
name = "cobble"
version = "0.1.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/54/7a/a507c709be2c96e1bb6102eb7b7f4026c5e5e223ef7d745a17d239e9d844/cobble-0.1.4.tar.gz", hash = "sha256:de38be1539992c8a06e569630717c485a5f91be2192c461ea2b220607dfa78aa", size = 3805, upload-time = "2024-06-01T18:11:09.528Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d5/e1/3714a2f371985215c219c2a70953d38e3eed81ef165aed061d21de0e998b/cobble-0.1.4-py3-none-any.whl", hash = "sha256:36c91b1655e599fd428e2b95fdd5f0da1ca2e9f1abb0bc871dec21a0e78a2b44", size = 3984, upload-time = "2024-06-01T18:11:07.911Z" },
]
[[package]]
name = "colorama"
version = "0.4.6"
@@ -432,6 +446,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
]
[[package]]
name = "coloredlogs"
version = "15.0.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "humanfriendly" },
]
sdist = { url = "https://files.pythonhosted.org/packages/cc/c7/eed8f27100517e8c0e6b923d5f0845d0cb99763da6fdee00478f91db7325/coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0", size = 278520, upload-time = "2021-06-11T10:22:45.202Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018, upload-time = "2021-06-11T10:22:42.561Z" },
]
[[package]]
name = "cos-python-sdk-v5"
version = "1.9.38"
@@ -587,6 +613,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/18/d5/7a04640bf559bb890455ffb28978daf7d44f667c3f04a4d422c655c1ba92/cython-3.1.6-py3-none-any.whl", hash = "sha256:91dcf7eb9b6a089ce4e9e1140e571d84c3bca834afb77ec269be7aa9d31a8157", size = 1223550, upload-time = "2025-10-23T12:38:16.732Z" },
]
[[package]]
name = "defusedxml"
version = "0.7.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/0f/d5/c66da9b79e5bdb124974bfe172b4daf3c984ebd9c2a06e2b8a4dc7331c72/defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69", size = 75520, upload-time = "2021-03-08T10:59:26.269Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" },
]
[[package]]
name = "distro"
version = "1.9.0"
@@ -612,6 +647,7 @@ dependencies = [
{ name = "lxml" },
{ name = "markdown" },
{ name = "markdownify" },
{ name = "markitdown", extra = ["docx", "pdf", "xls", "xlsx"] },
{ name = "minio" },
{ name = "mistletoe" },
{ name = "ollama" },
@@ -622,6 +658,7 @@ dependencies = [
{ name = "pillow" },
{ name = "playwright" },
{ name = "protobuf" },
{ name = "pydantic" },
{ name = "pypdf" },
{ name = "pypdf2" },
{ name = "python-docx" },
@@ -643,6 +680,7 @@ requires-dist = [
{ name = "lxml", specifier = ">=6.0.2" },
{ name = "markdown", specifier = ">=3.10" },
{ name = "markdownify", specifier = ">=1.2.0" },
{ name = "markitdown", extras = ["docx", "pdf", "xls", "xlsx"], specifier = ">=0.1.3" },
{ name = "minio", specifier = ">=7.2.18" },
{ name = "mistletoe", specifier = ">=1.5.0" },
{ name = "ollama", specifier = ">=0.6.0" },
@@ -653,6 +691,7 @@ requires-dist = [
{ name = "pillow", specifier = ">=12.0.0" },
{ name = "playwright", specifier = ">=1.55.0" },
{ name = "protobuf", specifier = ">=6.33.0" },
{ name = "pydantic", specifier = ">=2.12.3" },
{ name = "pypdf", specifier = ">=6.1.3" },
{ name = "pypdf2", specifier = ">=3.0.1" },
{ name = "python-docx", specifier = ">=1.2.0" },
@@ -683,6 +722,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/bf/ee/aa015c5de8b0dc42a8e507eae8c2de5d1c0e068c896858fec6d502402ed6/ebooklib-0.20-py3-none-any.whl", hash = "sha256:fff5322517a37e31c972d27be7d982cc3928c16b3dcc5fd7e8f7c0f5d7bcf42b", size = 40995, upload-time = "2025-10-26T20:56:19.104Z" },
]
[[package]]
name = "et-xmlfile"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" },
]
[[package]]
name = "exceptiongroup"
version = "1.3.0"
@@ -707,6 +755,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e5/4c/93d0f85318da65923e4b91c1c2ff03d8a458cbefebe3bc612a6693c7906d/fire-0.7.1-py3-none-any.whl", hash = "sha256:e43fd8a5033a9001e7e2973bab96070694b9f12f2e0ecf96d4683971b5ab1882", size = 115945, upload-time = "2025-08-16T20:20:22.87Z" },
]
[[package]]
name = "flatbuffers"
version = "25.9.23"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/9d/1f/3ee70b0a55137442038f2a33469cc5fddd7e0ad2abf83d7497c18a2b6923/flatbuffers-25.9.23.tar.gz", hash = "sha256:676f9fa62750bb50cf531b42a0a2a118ad8f7f797a511eda12881c016f093b12", size = 22067, upload-time = "2025-09-24T05:25:30.106Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ee/1b/00a78aa2e8fbd63f9af08c9c19e6deb3d5d66b4dda677a0f61654680ee89/flatbuffers-25.9.23-py2.py3-none-any.whl", hash = "sha256:255538574d6cb6d0a79a17ec8bc0d30985913b87513a01cce8bcdb6b4c44d0e2", size = 30869, upload-time = "2025-09-24T05:25:28.912Z" },
]
[[package]]
name = "fonttools"
version = "4.60.1"
@@ -850,6 +907,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/7f/91/ae2eb6b7979e2f9b035a9f612cf70f1bf54aad4e1d125129bef1eae96f19/greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d", size = 584358, upload-time = "2025-08-07T13:18:23.708Z" },
{ url = "https://files.pythonhosted.org/packages/f7/85/433de0c9c0252b22b16d413c9407e6cb3b41df7389afc366ca204dbc1393/greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5", size = 1113550, upload-time = "2025-08-07T13:42:37.467Z" },
{ url = "https://files.pythonhosted.org/packages/a1/8d/88f3ebd2bc96bf7747093696f4335a0a8a4c5acfcf1b757717c0d2474ba3/greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f", size = 1137126, upload-time = "2025-08-07T13:18:20.239Z" },
{ url = "https://files.pythonhosted.org/packages/f1/29/74242b7d72385e29bcc5563fba67dad94943d7cd03552bac320d597f29b2/greenlet-3.2.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f47617f698838ba98f4ff4189aef02e7343952df3a615f847bb575c3feb177a7", size = 1544904, upload-time = "2025-11-04T12:42:04.763Z" },
{ url = "https://files.pythonhosted.org/packages/c8/e2/1572b8eeab0f77df5f6729d6ab6b141e4a84ee8eb9bc8c1e7918f94eda6d/greenlet-3.2.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af41be48a4f60429d5cad9d22175217805098a9ef7c40bfef44f7669fb9d74d8", size = 1611228, upload-time = "2025-11-04T12:42:08.423Z" },
{ url = "https://files.pythonhosted.org/packages/d6/6f/b60b0291d9623c496638c582297ead61f43c4b72eef5e9c926ef4565ec13/greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c", size = 298654, upload-time = "2025-08-07T13:50:00.469Z" },
{ url = "https://files.pythonhosted.org/packages/a4/de/f28ced0a67749cac23fecb02b694f6473f47686dff6afaa211d186e2ef9c/greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2", size = 272305, upload-time = "2025-08-07T13:15:41.288Z" },
{ url = "https://files.pythonhosted.org/packages/09/16/2c3792cba130000bf2a31c5272999113f4764fd9d874fb257ff588ac779a/greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246", size = 632472, upload-time = "2025-08-07T13:42:55.044Z" },
@@ -859,6 +918,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684, upload-time = "2025-08-07T13:18:25.164Z" },
{ url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647, upload-time = "2025-08-07T13:42:38.655Z" },
{ url = "https://files.pythonhosted.org/packages/3f/cc/b07000438a29ac5cfb2194bfc128151d52f333cee74dd7dfe3fb733fc16c/greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa", size = 1142073, upload-time = "2025-08-07T13:18:21.737Z" },
{ url = "https://files.pythonhosted.org/packages/67/24/28a5b2fa42d12b3d7e5614145f0bd89714c34c08be6aabe39c14dd52db34/greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c", size = 1548385, upload-time = "2025-11-04T12:42:11.067Z" },
{ url = "https://files.pythonhosted.org/packages/6a/05/03f2f0bdd0b0ff9a4f7b99333d57b53a7709c27723ec8123056b084e69cd/greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5", size = 1613329, upload-time = "2025-11-04T12:42:12.928Z" },
{ url = "https://files.pythonhosted.org/packages/d8/0f/30aef242fcab550b0b3520b8e3561156857c94288f0332a79928c31a52cf/greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9", size = 299100, upload-time = "2025-08-07T13:44:12.287Z" },
{ url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" },
{ url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" },
@@ -868,6 +929,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" },
{ url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" },
{ url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" },
{ url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" },
{ url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" },
{ url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" },
{ url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" },
{ url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" },
@@ -877,6 +940,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" },
{ url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" },
{ url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" },
{ url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" },
{ url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" },
{ url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" },
{ url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" },
{ url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" },
@@ -884,6 +949,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" },
{ url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" },
{ url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" },
{ url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" },
{ url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760, upload-time = "2025-11-04T12:42:25.341Z" },
{ url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" },
]
@@ -1061,6 +1128,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
]
[[package]]
name = "humanfriendly"
version = "10.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyreadline3", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/cc/3f/2c29224acb2e2df4d2046e4c73ee2662023c58ff5b113c4c1adac0886c43/humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc", size = 360702, upload-time = "2021-09-17T21:40:43.31Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", size = 86794, upload-time = "2021-09-17T21:40:39.897Z" },
]
[[package]]
name = "idna"
version = "3.11"
@@ -1386,6 +1465,38 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6c/77/d7f491cbc05303ac6801651aabeb262d43f319288c1ea96c66b1d2692ff3/lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e", size = 3518768, upload-time = "2025-09-22T04:04:57.097Z" },
]
[[package]]
name = "magika"
version = "0.6.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "onnxruntime", version = "1.20.1", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "onnxruntime", version = "1.23.2", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "python-dotenv" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a3/f3/3d1dcdd7b9c41d589f5cff252d32ed91cdf86ba84391cfc81d9d8773571d/magika-0.6.3.tar.gz", hash = "sha256:7cc52aa7359af861957043e2bf7265ed4741067251c104532765cd668c0c0cb1", size = 3042784, upload-time = "2025-10-30T15:22:34.499Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a2/e4/35c323beb3280482c94299d61626116856ac2d4ec16ecef50afc4fdd4291/magika-0.6.3-py3-none-any.whl", hash = "sha256:eda443d08006ee495e02083b32e51b98cb3696ab595a7d13900d8e2ef506ec9d", size = 2969474, upload-time = "2025-10-30T15:22:25.298Z" },
{ url = "https://files.pythonhosted.org/packages/25/8f/132b0d7cd51c02c39fd52658a5896276c30c8cc2fd453270b19db8c40f7e/magika-0.6.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:86901e64b05dde5faff408c9b8245495b2e1fd4c226e3393d3d2a3fee65c504b", size = 13358841, upload-time = "2025-10-30T15:22:27.413Z" },
{ url = "https://files.pythonhosted.org/packages/c4/03/5ed859be502903a68b7b393b17ae0283bf34195cfcca79ce2dc25b9290e7/magika-0.6.3-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:3d9661eedbdf445ac9567e97e7ceefb93545d77a6a32858139ea966b5806fb64", size = 15367335, upload-time = "2025-10-30T15:22:29.907Z" },
{ url = "https://files.pythonhosted.org/packages/7b/9e/f8ee7d644affa3b80efdd623a3d75865c8f058f3950cb87fb0c48e3559bc/magika-0.6.3-py3-none-win_amd64.whl", hash = "sha256:e57f75674447b20cab4db928ae58ab264d7d8582b55183a0b876711c2b2787f3", size = 12692831, upload-time = "2025-10-30T15:22:32.063Z" },
]
[[package]]
name = "mammoth"
version = "1.11.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "cobble" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ed/3c/a58418d2af00f2da60d4a51e18cd0311307b72d48d2fffec36a97b4a5e44/mammoth-1.11.0.tar.gz", hash = "sha256:a0f59e442f34d5b6447f4b0999306cbf3e67aaabfa8cb516f878fb1456744637", size = 53142, upload-time = "2025-09-19T10:35:20.373Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ca/54/2e39566a131b13f6d8d193f974cb6a34e81bb7cc2fa6f7e03de067b36588/mammoth-1.11.0-py2.py3-none-any.whl", hash = "sha256:c077ab0d450bd7c0c6ecd529a23bf7e0fa8190c929e28998308ff4eada3f063b", size = 54752, upload-time = "2025-09-19T10:35:18.699Z" },
]
[[package]]
name = "markdown"
version = "3.10"
@@ -1408,6 +1519,41 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6a/e2/7af643acb4cae0741dffffaa7f3f7c9e7ab4046724543ba1777c401d821c/markdownify-1.2.0-py3-none-any.whl", hash = "sha256:48e150a1c4993d4d50f282f725c0111bd9eb25645d41fa2f543708fd44161351", size = 15561, upload-time = "2025-08-09T17:44:14.074Z" },
]
[[package]]
name = "markitdown"
version = "0.1.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "charset-normalizer" },
{ name = "defusedxml" },
{ name = "magika" },
{ name = "markdownify" },
{ name = "onnxruntime", version = "1.20.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'win32'" },
{ name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/87/31/90cef2bc8ecd85c200ed3b3d1e20fc7a724213502685c4b05b5431e02668/markitdown-0.1.3.tar.gz", hash = "sha256:b0d9127c3373a68274dede6af6c9bb0684b78ce364c727c4c304da97a20d6fd9", size = 40039, upload-time = "2025-08-26T22:37:04.4Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/97/83/7b47d2ecbf58650a03aeeb21ba2d59175f202bf4fb81d44f40f1deb82bc0/markitdown-0.1.3-py3-none-any.whl", hash = "sha256:08d9a25770979d78f60dcc0afcb868de6799608e4db65342b2e03304fb091251", size = 58391, upload-time = "2025-08-26T22:37:02.924Z" },
]
[package.optional-dependencies]
docx = [
{ name = "lxml" },
{ name = "mammoth" },
]
pdf = [
{ name = "pdfminer-six" },
]
xls = [
{ name = "pandas" },
{ name = "xlrd" },
]
xlsx = [
{ name = "openpyxl" },
{ name = "pandas" },
]
[[package]]
name = "minio"
version = "7.2.18"
@@ -1433,6 +1579,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3f/c2/bd0fc48dc323035cd65d21c45a3bb5c7b054001bf4ea75e461beb7656e07/mistletoe-1.5.0-py3-none-any.whl", hash = "sha256:d4e77b991b998c5efe3c4eab4e1b472263b5349688acd50d79bd9a6c317a9df1", size = 55262, upload-time = "2025-10-18T16:37:08.376Z" },
]
[[package]]
name = "mpmath"
version = "1.3.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" },
]
[[package]]
name = "networkx"
version = "3.4.2"
@@ -1440,7 +1595,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" }
wheels = [
@@ -1456,14 +1612,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" }
wheels = [
@@ -1492,7 +1652,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" }
wheels = [
@@ -1561,14 +1722,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
sdist = { url = "https://files.pythonhosted.org/packages/b5/f4/098d2270d52b41f1bd7db9fc288aaa0400cb48c2a3e2af6fa365d9720947/numpy-2.3.4.tar.gz", hash = "sha256:a7d018bfedb375a8d979ac758b120ba846a7fe764911a64465fd87b8729f4a6a", size = 20582187, upload-time = "2025-10-15T16:18:11.77Z" }
wheels = [
@@ -1660,6 +1825,97 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b5/c1/edc9f41b425ca40b26b7c104c5f6841a4537bb2552bfa6ca66e81405bb95/ollama-0.6.0-py3-none-any.whl", hash = "sha256:534511b3ccea2dff419ae06c3b58d7f217c55be7897c8ce5868dfb6b219cf7a0", size = 14130, upload-time = "2025-09-24T22:46:01.19Z" },
]
[[package]]
name = "onnxruntime"
version = "1.20.1"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "coloredlogs", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "flatbuffers", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "packaging", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "protobuf", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "sympy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/6d/c6/c4c0860bee2fde6037bdd9dcd12d323f6e38cf00fcc9a5065b394337fc55/onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7b2908b50101a19e99c4d4e97ebb9905561daf61829403061c1adc1b588bc0de", size = 11954010, upload-time = "2024-11-21T00:48:35.254Z" },
{ url = "https://files.pythonhosted.org/packages/63/47/3dc0b075ab539f16b3d8b09df6b504f51836086ee709690a6278d791737d/onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d82daaec24045a2e87598b8ac2b417b1cce623244e80e663882e9fe1aae86410", size = 13330452, upload-time = "2024-11-21T00:48:40.02Z" },
{ url = "https://files.pythonhosted.org/packages/27/ef/80fab86289ecc01a734b7ddf115dfb93d8b2e004bd1e1977e12881c72b12/onnxruntime-1.20.1-cp310-cp310-win32.whl", hash = "sha256:4c4b251a725a3b8cf2aab284f7d940c26094ecd9d442f07dd81ab5470e99b83f", size = 9813849, upload-time = "2024-11-21T00:48:43.569Z" },
{ url = "https://files.pythonhosted.org/packages/a9/e6/33ab10066c9875a29d55e66ae97c3bf91b9b9b987179455d67c32261a49c/onnxruntime-1.20.1-cp310-cp310-win_amd64.whl", hash = "sha256:d3b616bb53a77a9463707bb313637223380fc327f5064c9a782e8ec69c22e6a2", size = 11329702, upload-time = "2024-11-21T00:48:46.599Z" },
{ url = "https://files.pythonhosted.org/packages/a5/da/c44bf9bd66cd6d9018a921f053f28d819445c4d84b4dd4777271b0fe52a2/onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f6243e34d74423bdd1edf0ae9596dd61023b260f546ee17d701723915f06a9f7", size = 11955227, upload-time = "2024-11-21T00:48:54.556Z" },
{ url = "https://files.pythonhosted.org/packages/11/ac/4120dfb74c8e45cce1c664fc7f7ce010edd587ba67ac41489f7432eb9381/onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5eec64c0269dcdb8d9a9a53dc4d64f87b9e0c19801d9321246a53b7eb5a7d1bc", size = 13331703, upload-time = "2024-11-21T00:48:57.97Z" },
{ url = "https://files.pythonhosted.org/packages/12/f1/cefacac137f7bb7bfba57c50c478150fcd3c54aca72762ac2c05ce0532c1/onnxruntime-1.20.1-cp311-cp311-win32.whl", hash = "sha256:a19bc6e8c70e2485a1725b3d517a2319603acc14c1f1a017dda0afe6d4665b41", size = 9813977, upload-time = "2024-11-21T00:49:00.519Z" },
{ url = "https://files.pythonhosted.org/packages/2c/2d/2d4d202c0bcfb3a4cc2b171abb9328672d7f91d7af9ea52572722c6d8d96/onnxruntime-1.20.1-cp311-cp311-win_amd64.whl", hash = "sha256:8508887eb1c5f9537a4071768723ec7c30c28eb2518a00d0adcd32c89dea3221", size = 11329895, upload-time = "2024-11-21T00:49:03.845Z" },
{ url = "https://files.pythonhosted.org/packages/c5/9d/a42a84e10f1744dd27c6f2f9280cc3fb98f869dd19b7cd042e391ee2ab61/onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1f56e898815963d6dc4ee1c35fc6c36506466eff6d16f3cb9848cea4e8c8172", size = 11952833, upload-time = "2024-11-21T00:49:10.563Z" },
{ url = "https://files.pythonhosted.org/packages/47/42/2f71f5680834688a9c81becbe5c5bb996fd33eaed5c66ae0606c3b1d6a02/onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bb71a814f66517a65628c9e4a2bb530a6edd2cd5d87ffa0af0f6f773a027d99e", size = 13333903, upload-time = "2024-11-21T00:49:12.984Z" },
{ url = "https://files.pythonhosted.org/packages/c8/f1/aabfdf91d013320aa2fc46cf43c88ca0182860ff15df872b4552254a9680/onnxruntime-1.20.1-cp312-cp312-win32.whl", hash = "sha256:bd386cc9ee5f686ee8a75ba74037750aca55183085bf1941da8efcfe12d5b120", size = 9814562, upload-time = "2024-11-21T00:49:15.453Z" },
{ url = "https://files.pythonhosted.org/packages/dd/80/76979e0b744307d488c79e41051117634b956612cc731f1028eb17ee7294/onnxruntime-1.20.1-cp312-cp312-win_amd64.whl", hash = "sha256:19c2d843eb074f385e8bbb753a40df780511061a63f9def1b216bf53860223fb", size = 11331482, upload-time = "2024-11-21T00:49:19.412Z" },
{ url = "https://files.pythonhosted.org/packages/81/0d/13bbd9489be2a6944f4a940084bfe388f1100472f38c07080a46fbd4ab96/onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fb44b08e017a648924dbe91b82d89b0c105b1adcfe31e90d1dc06b8677ad37be", size = 11951459, upload-time = "2024-11-21T00:49:26.269Z" },
{ url = "https://files.pythonhosted.org/packages/c0/ea/4454ae122874fd52bbb8a961262de81c5f932edeb1b72217f594c700d6ef/onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bda6aebdf7917c1d811f21d41633df00c58aff2bef2f598f69289c1f1dabc4b3", size = 13331620, upload-time = "2024-11-21T00:49:28.875Z" },
{ url = "https://files.pythonhosted.org/packages/d8/e0/50db43188ca1c945decaa8fc2a024c33446d31afed40149897d4f9de505f/onnxruntime-1.20.1-cp313-cp313-win_amd64.whl", hash = "sha256:d30367df7e70f1d9fc5a6a68106f5961686d39b54d3221f760085524e8d38e16", size = 11331758, upload-time = "2024-11-21T00:49:31.417Z" },
{ url = "https://files.pythonhosted.org/packages/d8/55/3821c5fd60b52a6c82a00bba18531793c93c4addfe64fbf061e235c5617a/onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9158465745423b2b5d97ed25aa7740c7d38d2993ee2e5c3bfacb0c4145c49d8", size = 11950342, upload-time = "2024-11-21T00:49:34.164Z" },
{ url = "https://files.pythonhosted.org/packages/14/56/fd990ca222cef4f9f4a9400567b9a15b220dee2eafffb16b2adbc55c8281/onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0df6f2df83d61f46e842dbcde610ede27218947c33e994545a22333491e72a3b", size = 13337040, upload-time = "2024-11-21T00:49:37.271Z" },
]
[[package]]
name = "onnxruntime"
version = "1.23.2"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version >= '3.14' and sys_platform == 'darwin'",
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
]
dependencies = [
{ name = "coloredlogs", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "flatbuffers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform == 'darwin')" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.11' and sys_platform == 'darwin')" },
{ name = "packaging", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "protobuf", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "sympy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/35/d6/311b1afea060015b56c742f3531168c1644650767f27ef40062569960587/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:a7730122afe186a784660f6ec5807138bf9d792fa1df76556b27307ea9ebcbe3", size = 17195934, upload-time = "2025-10-27T23:06:14.143Z" },
{ url = "https://files.pythonhosted.org/packages/db/db/81bf3d7cecfbfed9092b6b4052e857a769d62ed90561b410014e0aae18db/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:b28740f4ecef1738ea8f807461dd541b8287d5650b5be33bca7b474e3cbd1f36", size = 19153079, upload-time = "2025-10-27T23:05:57.686Z" },
{ url = "https://files.pythonhosted.org/packages/2e/4d/a382452b17cf70a2313153c520ea4c96ab670c996cb3a95cc5d5ac7bfdac/onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8f7d1fe034090a1e371b7f3ca9d3ccae2fabae8c1d8844fb7371d1ea38e8e8d2", size = 15219883, upload-time = "2025-10-22T03:46:21.66Z" },
{ url = "https://files.pythonhosted.org/packages/fb/56/179bf90679984c85b417664c26aae4f427cba7514bd2d65c43b181b7b08b/onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ca88747e708e5c67337b0f65eed4b7d0dd70d22ac332038c9fc4635760018f7", size = 17370357, upload-time = "2025-10-22T03:46:57.968Z" },
{ url = "https://files.pythonhosted.org/packages/44/be/467b00f09061572f022ffd17e49e49e5a7a789056bad95b54dfd3bee73ff/onnxruntime-1.23.2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:6f91d2c9b0965e86827a5ba01531d5b669770b01775b23199565d6c1f136616c", size = 17196113, upload-time = "2025-10-22T03:47:33.526Z" },
{ url = "https://files.pythonhosted.org/packages/9f/a8/3c23a8f75f93122d2b3410bfb74d06d0f8da4ac663185f91866b03f7da1b/onnxruntime-1.23.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:87d8b6eaf0fbeb6835a60a4265fde7a3b60157cf1b2764773ac47237b4d48612", size = 19153857, upload-time = "2025-10-22T03:46:37.578Z" },
{ url = "https://files.pythonhosted.org/packages/3f/d8/506eed9af03d86f8db4880a4c47cd0dffee973ef7e4f4cff9f1d4bcf7d22/onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bbfd2fca76c855317568c1b36a885ddea2272c13cb0e395002c402f2360429a6", size = 15220095, upload-time = "2025-10-22T03:46:24.769Z" },
{ url = "https://files.pythonhosted.org/packages/e9/80/113381ba832d5e777accedc6cb41d10f9eca82321ae31ebb6bcede530cea/onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da44b99206e77734c5819aa2142c69e64f3b46edc3bd314f6a45a932defc0b3e", size = 17372080, upload-time = "2025-10-22T03:47:00.265Z" },
{ url = "https://files.pythonhosted.org/packages/1b/9e/f748cd64161213adeef83d0cb16cb8ace1e62fa501033acdd9f9341fff57/onnxruntime-1.23.2-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:b8f029a6b98d3cf5be564d52802bb50a8489ab73409fa9db0bf583eabb7c2321", size = 17195929, upload-time = "2025-10-22T03:47:36.24Z" },
{ url = "https://files.pythonhosted.org/packages/91/9d/a81aafd899b900101988ead7fb14974c8a58695338ab6a0f3d6b0100f30b/onnxruntime-1.23.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:218295a8acae83905f6f1aed8cacb8e3eb3bd7513a13fe4ba3b2664a19fc4a6b", size = 19157705, upload-time = "2025-10-22T03:46:40.415Z" },
{ url = "https://files.pythonhosted.org/packages/3c/35/4e40f2fba272a6698d62be2cd21ddc3675edfc1a4b9ddefcc4648f115315/onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76ff670550dc23e58ea9bc53b5149b99a44e63b34b524f7b8547469aaa0dcb8c", size = 15226915, upload-time = "2025-10-22T03:46:27.773Z" },
{ url = "https://files.pythonhosted.org/packages/ef/88/9cc25d2bafe6bc0d4d3c1db3ade98196d5b355c0b273e6a5dc09c5d5d0d5/onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f9b4ae77f8e3c9bee50c27bc1beede83f786fe1d52e99ac85aa8d65a01e9b77", size = 17382649, upload-time = "2025-10-22T03:47:02.782Z" },
{ url = "https://files.pythonhosted.org/packages/3d/41/fba0cabccecefe4a1b5fc8020c44febb334637f133acefc7ec492029dd2c/onnxruntime-1.23.2-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:2ff531ad8496281b4297f32b83b01cdd719617e2351ffe0dba5684fb283afa1f", size = 17196337, upload-time = "2025-10-22T03:46:35.168Z" },
{ url = "https://files.pythonhosted.org/packages/fe/f9/2d49ca491c6a986acce9f1d1d5fc2099108958cc1710c28e89a032c9cfe9/onnxruntime-1.23.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:162f4ca894ec3de1a6fd53589e511e06ecdc3ff646849b62a9da7489dee9ce95", size = 19157691, upload-time = "2025-10-22T03:46:43.518Z" },
{ url = "https://files.pythonhosted.org/packages/1c/a1/428ee29c6eaf09a6f6be56f836213f104618fb35ac6cc586ff0f477263eb/onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45d127d6e1e9b99d1ebeae9bcd8f98617a812f53f46699eafeb976275744826b", size = 15226898, upload-time = "2025-10-22T03:46:30.039Z" },
{ url = "https://files.pythonhosted.org/packages/f2/2b/b57c8a2466a3126dbe0a792f56ad7290949b02f47b86216cd47d857e4b77/onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8bace4e0d46480fbeeb7bbe1ffe1f080e6663a42d1086ff95c1551f2d39e7872", size = 17382518, upload-time = "2025-10-22T03:47:05.407Z" },
{ url = "https://files.pythonhosted.org/packages/7c/3d/6830fa61c69ca8e905f237001dbfc01689a4e4ab06147020a4518318881f/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9d2385e774f46ac38f02b3a91a91e30263d41b2f1f4f26ae34805b2a9ddef466", size = 15229610, upload-time = "2025-10-22T03:46:32.239Z" },
{ url = "https://files.pythonhosted.org/packages/b6/ca/862b1e7a639460f0ca25fd5b6135fb42cf9deea86d398a92e44dfda2279d/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e2b9233c4947907fd1818d0e581c049c41ccc39b2856cc942ff6d26317cee145", size = 17394184, upload-time = "2025-10-22T03:47:08.127Z" },
]
[[package]]
name = "openai"
version = "2.7.1"
@@ -1733,6 +1989,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/86/8a/69176a64335aed183529207ba8bc3d329c2999d852b4f3818027203f50e6/opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca", size = 39402386, upload-time = "2025-01-16T13:52:56.418Z" },
]
[[package]]
name = "openpyxl"
version = "3.1.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "et-xmlfile" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" },
]
[[package]]
name = "opt-einsum"
version = "3.3.0"
@@ -1821,6 +2089,68 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/a3/6f/042d73453ad01a2e3dd4adeae4870e25679d9eaa340d70bd54cd409cb982/paddlepaddle-3.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:d16fd322246c4a580ca512971840ac8c16126a77d59a7bceee165d8f2196a6b2", size = 101708504, upload-time = "2025-10-30T13:21:18.125Z" },
]
[[package]]
name = "pandas"
version = "2.3.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "python-dateutil" },
{ name = "pytz" },
{ name = "tzdata" },
]
sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/3d/f7/f425a00df4fcc22b292c6895c6831c0c8ae1d9fac1e024d16f98a9ce8749/pandas-2.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c", size = 11555763, upload-time = "2025-09-29T23:16:53.287Z" },
{ url = "https://files.pythonhosted.org/packages/13/4f/66d99628ff8ce7857aca52fed8f0066ce209f96be2fede6cef9f84e8d04f/pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a", size = 10801217, upload-time = "2025-09-29T23:17:04.522Z" },
{ url = "https://files.pythonhosted.org/packages/1d/03/3fc4a529a7710f890a239cc496fc6d50ad4a0995657dccc1d64695adb9f4/pandas-2.3.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5caf26f64126b6c7aec964f74266f435afef1c1b13da3b0636c7518a1fa3e2b1", size = 12148791, upload-time = "2025-09-29T23:17:18.444Z" },
{ url = "https://files.pythonhosted.org/packages/40/a8/4dac1f8f8235e5d25b9955d02ff6f29396191d4e665d71122c3722ca83c5/pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd7478f1463441ae4ca7308a70e90b33470fa593429f9d4c578dd00d1fa78838", size = 12769373, upload-time = "2025-09-29T23:17:35.846Z" },
{ url = "https://files.pythonhosted.org/packages/df/91/82cc5169b6b25440a7fc0ef3a694582418d875c8e3ebf796a6d6470aa578/pandas-2.3.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4793891684806ae50d1288c9bae9330293ab4e083ccd1c5e383c34549c6e4250", size = 13200444, upload-time = "2025-09-29T23:17:49.341Z" },
{ url = "https://files.pythonhosted.org/packages/10/ae/89b3283800ab58f7af2952704078555fa60c807fff764395bb57ea0b0dbd/pandas-2.3.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:28083c648d9a99a5dd035ec125d42439c6c1c525098c58af0fc38dd1a7a1b3d4", size = 13858459, upload-time = "2025-09-29T23:18:03.722Z" },
{ url = "https://files.pythonhosted.org/packages/85/72/530900610650f54a35a19476eca5104f38555afccda1aa11a92ee14cb21d/pandas-2.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:503cf027cf9940d2ceaa1a93cfb5f8c8c7e6e90720a2850378f0b3f3b1e06826", size = 11346086, upload-time = "2025-09-29T23:18:18.505Z" },
{ url = "https://files.pythonhosted.org/packages/c1/fa/7ac648108144a095b4fb6aa3de1954689f7af60a14cf25583f4960ecb878/pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523", size = 11578790, upload-time = "2025-09-29T23:18:30.065Z" },
{ url = "https://files.pythonhosted.org/packages/9b/35/74442388c6cf008882d4d4bdfc4109be87e9b8b7ccd097ad1e7f006e2e95/pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45", size = 10833831, upload-time = "2025-09-29T23:38:56.071Z" },
{ url = "https://files.pythonhosted.org/packages/fe/e4/de154cbfeee13383ad58d23017da99390b91d73f8c11856f2095e813201b/pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66", size = 12199267, upload-time = "2025-09-29T23:18:41.627Z" },
{ url = "https://files.pythonhosted.org/packages/bf/c9/63f8d545568d9ab91476b1818b4741f521646cbdd151c6efebf40d6de6f7/pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b", size = 12789281, upload-time = "2025-09-29T23:18:56.834Z" },
{ url = "https://files.pythonhosted.org/packages/f2/00/a5ac8c7a0e67fd1a6059e40aa08fa1c52cc00709077d2300e210c3ce0322/pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791", size = 13240453, upload-time = "2025-09-29T23:19:09.247Z" },
{ url = "https://files.pythonhosted.org/packages/27/4d/5c23a5bc7bd209231618dd9e606ce076272c9bc4f12023a70e03a86b4067/pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151", size = 13890361, upload-time = "2025-09-29T23:19:25.342Z" },
{ url = "https://files.pythonhosted.org/packages/8e/59/712db1d7040520de7a4965df15b774348980e6df45c129b8c64d0dbe74ef/pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c", size = 11348702, upload-time = "2025-09-29T23:19:38.296Z" },
{ url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" },
{ url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" },
{ url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" },
{ url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" },
{ url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" },
{ url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" },
{ url = "https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" },
{ url = "https://files.pythonhosted.org/packages/cd/4b/18b035ee18f97c1040d94debd8f2e737000ad70ccc8f5513f4eefad75f4b/pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713", size = 11544671, upload-time = "2025-09-29T23:21:05.024Z" },
{ url = "https://files.pythonhosted.org/packages/31/94/72fac03573102779920099bcac1c3b05975c2cb5f01eac609faf34bed1ca/pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8", size = 10680807, upload-time = "2025-09-29T23:21:15.979Z" },
{ url = "https://files.pythonhosted.org/packages/16/87/9472cf4a487d848476865321de18cc8c920b8cab98453ab79dbbc98db63a/pandas-2.3.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d", size = 11709872, upload-time = "2025-09-29T23:21:27.165Z" },
{ url = "https://files.pythonhosted.org/packages/15/07/284f757f63f8a8d69ed4472bfd85122bd086e637bf4ed09de572d575a693/pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac", size = 12306371, upload-time = "2025-09-29T23:21:40.532Z" },
{ url = "https://files.pythonhosted.org/packages/33/81/a3afc88fca4aa925804a27d2676d22dcd2031c2ebe08aabd0ae55b9ff282/pandas-2.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c", size = 12765333, upload-time = "2025-09-29T23:21:55.77Z" },
{ url = "https://files.pythonhosted.org/packages/8d/0f/b4d4ae743a83742f1153464cf1a8ecfafc3ac59722a0b5c8602310cb7158/pandas-2.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493", size = 13418120, upload-time = "2025-09-29T23:22:10.109Z" },
{ url = "https://files.pythonhosted.org/packages/4f/c7/e54682c96a895d0c808453269e0b5928a07a127a15704fedb643e9b0a4c8/pandas-2.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee", size = 10993991, upload-time = "2025-09-29T23:25:04.889Z" },
{ url = "https://files.pythonhosted.org/packages/f9/ca/3f8d4f49740799189e1395812f3bf23b5e8fc7c190827d55a610da72ce55/pandas-2.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5", size = 12048227, upload-time = "2025-09-29T23:22:24.343Z" },
{ url = "https://files.pythonhosted.org/packages/0e/5a/f43efec3e8c0cc92c4663ccad372dbdff72b60bdb56b2749f04aa1d07d7e/pandas-2.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21", size = 11411056, upload-time = "2025-09-29T23:22:37.762Z" },
{ url = "https://files.pythonhosted.org/packages/46/b1/85331edfc591208c9d1a63a06baa67b21d332e63b7a591a5ba42a10bb507/pandas-2.3.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78", size = 11645189, upload-time = "2025-09-29T23:22:51.688Z" },
{ url = "https://files.pythonhosted.org/packages/44/23/78d645adc35d94d1ac4f2a3c4112ab6f5b8999f4898b8cdf01252f8df4a9/pandas-2.3.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110", size = 12121912, upload-time = "2025-09-29T23:23:05.042Z" },
{ url = "https://files.pythonhosted.org/packages/53/da/d10013df5e6aaef6b425aa0c32e1fc1f3e431e4bcabd420517dceadce354/pandas-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86", size = 12712160, upload-time = "2025-09-29T23:23:28.57Z" },
{ url = "https://files.pythonhosted.org/packages/bd/17/e756653095a083d8a37cbd816cb87148debcfcd920129b25f99dd8d04271/pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc", size = 13199233, upload-time = "2025-09-29T23:24:24.876Z" },
{ url = "https://files.pythonhosted.org/packages/04/fd/74903979833db8390b73b3a8a7d30d146d710bd32703724dd9083950386f/pandas-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0", size = 11540635, upload-time = "2025-09-29T23:25:52.486Z" },
{ url = "https://files.pythonhosted.org/packages/21/00/266d6b357ad5e6d3ad55093a7e8efc7dd245f5a842b584db9f30b0f0a287/pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593", size = 10759079, upload-time = "2025-09-29T23:26:33.204Z" },
{ url = "https://files.pythonhosted.org/packages/ca/05/d01ef80a7a3a12b2f8bbf16daba1e17c98a2f039cbc8e2f77a2c5a63d382/pandas-2.3.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c", size = 11814049, upload-time = "2025-09-29T23:27:15.384Z" },
{ url = "https://files.pythonhosted.org/packages/15/b2/0e62f78c0c5ba7e3d2c5945a82456f4fac76c480940f805e0b97fcbc2f65/pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b", size = 12332638, upload-time = "2025-09-29T23:27:51.625Z" },
{ url = "https://files.pythonhosted.org/packages/c5/33/dd70400631b62b9b29c3c93d2feee1d0964dc2bae2e5ad7a6c73a7f25325/pandas-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6", size = 12886834, upload-time = "2025-09-29T23:28:21.289Z" },
{ url = "https://files.pythonhosted.org/packages/d3/18/b5d48f55821228d0d2692b34fd5034bb185e854bdb592e9c640f6290e012/pandas-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3", size = 13409925, upload-time = "2025-09-29T23:28:58.261Z" },
{ url = "https://files.pythonhosted.org/packages/a6/3d/124ac75fcd0ecc09b8fdccb0246ef65e35b012030defb0e0eba2cbbbe948/pandas-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5", size = 11109071, upload-time = "2025-09-29T23:32:27.484Z" },
{ url = "https://files.pythonhosted.org/packages/89/9c/0e21c895c38a157e0faa1fb64587a9226d6dd46452cac4532d80c3c4a244/pandas-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec", size = 12048504, upload-time = "2025-09-29T23:29:31.47Z" },
{ url = "https://files.pythonhosted.org/packages/d7/82/b69a1c95df796858777b68fbe6a81d37443a33319761d7c652ce77797475/pandas-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7", size = 11410702, upload-time = "2025-09-29T23:29:54.591Z" },
{ url = "https://files.pythonhosted.org/packages/f9/88/702bde3ba0a94b8c73a0181e05144b10f13f29ebfc2150c3a79062a8195d/pandas-2.3.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450", size = 11634535, upload-time = "2025-09-29T23:30:21.003Z" },
{ url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" },
{ url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" },
{ url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" },
]
[[package]]
name = "pdfminer-six"
version = "20250506"
@@ -2266,6 +2596,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ae/43/2b0607ef7f16d63fbe00de728151a090397ef5b3b9147b4aefe975d17106/pypdfium2-5.0.0-py3-none-win_arm64.whl", hash = "sha256:0a2a473fe95802e7a5f4140f25e5cd036cf17f060f27ee2d28c3977206add763", size = 2939015, upload-time = "2025-10-26T13:31:40.531Z" },
]
[[package]]
name = "pyreadline3"
version = "3.5.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/0f/49/4cea918a08f02817aabae639e3d0ac046fef9f9180518a3ad394e22da148/pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7", size = 99839, upload-time = "2024-09-19T02:40:10.062Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5a/dc/491b7661614ab97483abf2056be1deee4dc2490ecbf7bff9ab5cdbac86e1/pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6", size = 83178, upload-time = "2024-09-19T02:40:08.598Z" },
]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@@ -2291,6 +2630,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7", size = 252987, upload-time = "2025-06-16T20:46:22.506Z" },
]
[[package]]
name = "python-dotenv"
version = "1.2.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" },
]
[[package]]
name = "python-pptx"
version = "1.0.2"
@@ -2306,6 +2654,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" },
]
[[package]]
name = "pytz"
version = "2025.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" },
]

[[package]]
name = "pyyaml"
version = "6.0.3"
@@ -2654,7 +3011,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -2717,14 +3075,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -3083,6 +3445,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/97/45/1097a0eb3ad2a1cb5a29d493aefffe4170e454b2726cfe2a76f6652e91af/stringzilla-4.2.3-cp313-cp313-win_arm64.whl", hash = "sha256:1000a8df547fb3b194def48cc3ed6494715fe1c8ac25c6444ed3cfb0b52942c7", size = 99815, upload-time = "2025-10-27T18:35:56.555Z" },
]

[[package]]
name = "sympy"
version = "1.14.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "mpmath" },
]
sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" },
]

[[package]]
name = "termcolor"
version = "3.2.0"
@@ -3116,7 +3490,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -3135,14 +3510,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -3185,6 +3564,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
]

[[package]]
name = "tzdata"
version = "2025.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
]

[[package]]
name = "unidic-lite"
version = "1.0.8"