feat: add document model classes, adjust config and parsing logic, improve logging and imports

Remove logging setup and redundant code; improve imports, type hints, and OCR backend management
Switch module imports across all files to absolute import paths
Adjust import paths, drop some imports, improve logging and comments
Upgrade the document parser to Docx2Parser; improve timeout and image handling logic
begoniezhao
2025-11-07 10:30:02 +08:00
committed by lyingbug
parent af620806e0
commit 2d66abedf0
39 changed files with 2676 additions and 1570 deletions

7
.gitignore vendored
View File

@@ -24,17 +24,14 @@ node_modules/
tmp/ tmp/
temp/ temp/
# Docker compose файл (локальные настройки)
# docker-compose.yml
WeKnora WeKnora
/models/ /models/
**/__pycache__
test/data/mswag.txt test/data/mswag.txt
data/files/ data/files/
.python-version
.venv/ .venv/
**/__pycache__
.python-version
### macOS ### macOS
# General # General

View File

@@ -127,6 +127,7 @@ services:
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME:-} - MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME:-}
- MINIO_USE_SSL=${MINIO_USE_SSL:-} - MINIO_USE_SSL=${MINIO_USE_SSL:-}
- WEB_PROXY=${WEB_PROXY:-} - WEB_PROXY=${WEB_PROXY:-}
- MINERU_ENDPOINT=${MINERU_ENDPOINT:-}
healthcheck: healthcheck:
test: ["CMD", "grpc_health_probe", "-addr=:50051"] test: ["CMD", "grpc_health_probe", "-addr=:50051"]
interval: 30s interval: 30s

View File

@@ -80,10 +80,11 @@ RUN pip install uv --break-system-packages && \
python -m uv sync --locked --no-dev python -m uv sync --locked --no-dev
# 复制源代码和生成脚本 # 复制源代码和生成脚本
COPY docreader . COPY docreader docreader
# 生成 protobuf 代码 # 生成 protobuf 代码
RUN chmod +x scripts/generate_proto.sh && bash scripts/generate_proto.sh RUN chmod +x docreader/scripts/generate_proto.sh && \
bash docreader/scripts/generate_proto.sh
# 确保模型目录存在 # 确保模型目录存在
RUN ls -la /root/.paddleocr/whl/ RUN ls -la /root/.paddleocr/whl/
@@ -150,10 +151,11 @@ RUN python -m playwright install-deps webkit
# COPY docreader/scripts/download_deps.py download_deps.py # COPY docreader/scripts/download_deps.py download_deps.py
# RUN python -m download_deps # RUN python -m download_deps
COPY --from=builder /app/ ./ COPY docreader/pyproject.toml docreader/uv.lock ./
COPY --from=builder /app/docreader docreader
# 暴露 gRPC 端口 # 暴露 gRPC 端口
EXPOSE 50051 EXPOSE 50051
# 直接运行 Python 服务(日志输出到 stdout/stderr # 直接运行 Python 服务(日志输出到 stdout/stderr
CMD ["uv", "run", "main.py"] CMD ["uv", "run", "-m", "docreader.main"]

5
docreader/.pylintrc Normal file
View File

@@ -0,0 +1,5 @@
[LOGGING]
logging-format-style=fstr
[MESSAGES CONTROL]
; disable=W1203
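The new .pylintrc switches pylint's logging checker to f-string style, so the f-string logging calls used throughout the codebase are no longer flagged as W1203 (logging-fstring-interpolation). The two styles side by side:

```python
import logging

logger = logging.getLogger(__name__)
file_name = "report.docx"

# Accepted under logging-format-style=fstr (pylint's default flags it as W1203):
logger.info(f"Parsing file: {file_name}")

# The lazy %-style call that pylint prefers out of the box:
logger.info("Parsing file: %s", file_name)
```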

View File

@@ -1,37 +1,25 @@
import os
import sys
import logging import logging
from concurrent import futures import os
import re
import sys
import traceback import traceback
import grpc
import uuid import uuid
import atexit from concurrent import futures
from typing import Optional
import grpc
from grpc_health.v1 import health_pb2_grpc from grpc_health.v1 import health_pb2_grpc
from grpc_health.v1.health import HealthServicer from grpc_health.v1.health import HealthServicer
# Add parent directory to Python path from docreader.models.read_config import ChunkingConfig
current_dir = os.path.dirname(os.path.abspath(__file__)) from docreader.parser import Parser
parent_dir = os.path.dirname(current_dir) from docreader.parser.ocr_engine import OCREngine
if parent_dir not in sys.path: from docreader.proto import docreader_pb2_grpc
sys.path.insert(0, parent_dir) from docreader.proto.docreader_pb2 import Chunk, Image, ReadResponse
from docreader.utils.request import init_logging_request_id, request_id_context
from proto.docreader_pb2 import ReadResponse, Chunk, Image # Surrogate range U+D800..U+DFFF are invalid Unicode scalar values
from proto import docreader_pb2_grpc # cannot be encoded to UTF-8
from parser import Parser, OCREngine
from parser.config import ChunkingConfig
from utils.request import request_id_context, init_logging_request_id
# --- Encoding utilities: sanitize strings to valid UTF-8 and (optionally) multi-encoding read ---
import re
from typing import Optional
try:
# Optional dependency for charset detection; install via `pip install charset-normalizer`
from charset_normalizer import from_bytes as _cn_from_bytes # type: ignore
except Exception: # pragma: no cover
_cn_from_bytes = None # type: ignore
# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values and cannot be encoded to UTF-8
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]") _SURROGATE_RE = re.compile(r"[\ud800-\udfff]")
@@ -47,29 +35,6 @@ def to_valid_utf8_text(s: Optional[str]) -> str:
return s.encode("utf-8", errors="replace").decode("utf-8") return s.encode("utf-8", errors="replace").decode("utf-8")
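A quick sketch of what the sanitizer guards against: lone surrogates in the U+D800..U+DFFF range cannot be encoded to UTF-8, so to_valid_utf8_text replaces them before text is sent over gRPC.

```python
# Lone surrogates do not survive a round trip through UTF-8:
bad = "ok\ud800ok"
clean = bad.encode("utf-8", errors="replace").decode("utf-8")
print(clean)  # "ok?ok" -- the unencodable surrogate is replaced
```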
def read_text_with_fallback(file_path: str) -> str:
"""Read text from file supporting multiple encodings with graceful fallback.
This server currently receives bytes over gRPC and delegates decoding to the parser.
This helper is provided for future local-file reads if needed.
"""
with open(file_path, "rb") as f:
raw = f.read()
if _cn_from_bytes is not None:
try:
result = _cn_from_bytes(raw).best()
if result:
return str(result)
except Exception:
pass
for enc in ("utf-8", "gb18030", "latin-1"):
try:
return raw.decode(enc, errors="replace")
except UnicodeDecodeError:
continue
return raw.decode("utf-8", errors="replace")
# Ensure no existing handlers # Ensure no existing handlers
for handler in logging.root.handlers[:]: for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler) logging.root.removeHandler(handler)
@@ -113,7 +78,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
request.file_type or os.path.splitext(request.file_name)[1][1:] request.file_type or os.path.splitext(request.file_name)[1][1:]
) )
logger.info( logger.info(
f"Received ReadFromFile request for file: {request.file_name}, type: {file_type}" f"ReadFromFile for file: {request.file_name}, type: {file_type}"
) )
logger.info(f"File content size: {len(request.file_content)} bytes") logger.info(f"File content size: {len(request.file_content)} bytes")
@@ -124,8 +89,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
enable_multimodal = request.read_config.enable_multimodal or False enable_multimodal = request.read_config.enable_multimodal or False
logger.info( logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, " f"Using chunking config: size={chunk_size}, "
f"multimodal={enable_multimodal}" f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
) )
# Get Storage and VLM config from request # Get Storage and VLM config from request
@@ -144,7 +109,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
"path_prefix": sc.path_prefix, "path_prefix": sc.path_prefix,
} }
logger.info( logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}" f"Using Storage config: provider={storage_config.get('provider')}, "
f"bucket={storage_config['bucket_name']}"
) )
vlm_config = { vlm_config = {
@@ -170,7 +136,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
) )
# Parse file # Parse file
logger.info(f"Starting file parsing process") logger.info("Starting file parsing process")
result = self.parser.parse_file( result = self.parser.parse_file(
request.file_name, file_type, request.file_content, chunking_config request.file_name, file_type, request.file_content, chunking_config
) )
@@ -184,7 +150,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
# Convert to protobuf message # Convert to protobuf message
logger.info( logger.info(
f"Successfully parsed file {request.file_name}, returning {len(result.chunks)} chunks" f"Parsed file {request.file_name}, with {len(result.chunks)} chunks"
) )
# Build response, including image info # Build response, including image info
@@ -224,8 +190,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
enable_multimodal = request.read_config.enable_multimodal or False enable_multimodal = request.read_config.enable_multimodal or False
logger.info( logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, " f"Using chunking config: size={chunk_size}, "
f"multimodal={enable_multimodal}" f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
) )
# Get Storage and VLM config from request # Get Storage and VLM config from request
@@ -243,7 +209,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
"path_prefix": sc.path_prefix, "path_prefix": sc.path_prefix,
} }
logger.info( logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}" f"Using Storage config: provider={storage_config.get('provider')}, "
f"bucket={storage_config['bucket_name']}"
) )
vlm_config = { vlm_config = {
@@ -269,7 +236,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
) )
# Parse URL # Parse URL
logger.info(f"Starting URL parsing process") logger.info("Starting URL parsing process")
result = self.parser.parse_url( result = self.parser.parse_url(
request.url, request.title, chunking_config request.url, request.title, chunking_config
) )
@@ -282,7 +249,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
# Convert to protobuf message, including image info # Convert to protobuf message, including image info
logger.info( logger.info(
f"Successfully parsed URL {request.url}, returning {len(result.chunks)} chunks" f"Parsed URL {request.url}, returning {len(result.chunks)} chunks"
) )
response = ReadResponse( response = ReadResponse(
@@ -335,29 +302,15 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
return proto_chunk return proto_chunk
def init_ocr_engine(ocr_backend, ocr_config): def init_ocr_engine(ocr_backend: Optional[str] = None, **kwargs):
"""Initialize OCR engine""" """Initialize OCR engine"""
try: backend_type = ocr_backend or os.getenv("OCR_BACKEND", "paddle")
logger.info(f"Initializing OCR engine with backend: {ocr_backend}") logger.info(f"Initializing OCR engine with backend: {backend_type}")
ocr_engine = OCREngine.get_instance(backend_type=ocr_backend, **ocr_config) OCREngine.get_instance(backend_type=backend_type, **kwargs)
if ocr_engine:
logger.info("OCR engine initialized successfully")
return True
else:
logger.error("OCR engine initialization failed")
return False
except Exception as e:
logger.error(f"Error initializing OCR engine: {str(e)}")
return False
def main(): def main():
init_ocr_engine( init_ocr_engine()
os.getenv("OCR_BACKEND", "paddle"),
{
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
},
)
# Set max number of worker threads # Set max number of worker threads
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4")) max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
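The simplified initializer resolves the backend from its argument, then the OCR_BACKEND environment variable (defaulting to paddle), and forwards any keyword arguments straight to OCREngine.get_instance() instead of returning a success flag. A hedged usage sketch:

```python
import os

# The path main() takes: backend resolved from the environment.
os.environ.setdefault("OCR_BACKEND", "paddle")
init_ocr_engine()

# Or name the backend explicitly; extra kwargs go to OCREngine.get_instance().
init_ocr_engine("paddle")
```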

View File

View File

@@ -0,0 +1,87 @@
"""Chunk document schema."""
import json
from typing import Any, Dict, List
from pydantic import BaseModel, Field
class Chunk(BaseModel):
"""Document Chunk including chunk content, chunk metadata."""
content: str = Field(default="", description="chunk text content")
seq: int = Field(default=0, description="Chunk sequence number")
start: int = Field(default=0, description="Chunk start position")
end: int = Field(description="Chunk end position")
images: List[Dict[str, Any]] = Field(
default_factory=list, description="Images in the chunk"
)
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="metadata fields",
)
def to_dict(self, **kwargs: Any) -> Dict[str, Any]:
"""Convert Chunk to dict."""
data = self.model_dump()
data.update(kwargs)
data["class_name"] = self.__class__.__name__
return data
def to_json(self, **kwargs: Any) -> str:
"""Convert Chunk to json."""
data = self.to_dict(**kwargs)
return json.dumps(data)
def __hash__(self):
"""Hash function."""
return hash((self.content,))
def __eq__(self, other):
"""Equal function."""
return self.content == other.content
@classmethod
def from_dict(cls, data: Dict[str, Any], **kwargs: Any): # type: ignore
"""Create Chunk from dict."""
if isinstance(kwargs, dict):
data.update(kwargs)
data.pop("class_name", None)
return cls(**data)
@classmethod
def from_json(cls, data_str: str, **kwargs: Any): # type: ignore
"""Create Chunk from json."""
data = json.loads(data_str)
return cls.from_dict(data, **kwargs)
class Document(BaseModel):
"""Document including document content, document metadata."""
model_config = {"arbitrary_types_allowed": True}
content: str = Field(default="", description="document text content")
images: Dict[str, str] = Field(
default_factory=dict, description="Images in the document"
)
chunks: List[Chunk] = Field(default_factory=list, description="document chunks")
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="metadata fields",
)
def set_content(self, content: str) -> None:
"""Set document content."""
self.content = content
def get_content(self) -> str:
"""Get document content."""
return self.content
def is_valid(self) -> bool:
return self.content != ""
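A short round-trip sketch for the new pydantic models, assuming the docreader package is importable (e.g. when the service runs via `uv run -m docreader.main`):

```python
from docreader.models.document import Chunk, Document

chunk = Chunk(content="Hello", seq=0, start=0, end=5)
payload = chunk.to_json()            # serializes with "class_name": "Chunk"
restored = Chunk.from_json(payload)  # from_dict() pops "class_name" again
assert restored == chunk             # equality and hashing use content only

doc = Document(content="Hello world", chunks=[chunk])
assert doc.is_valid()                # valid iff content is non-empty
```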

View File

@@ -0,0 +1,27 @@
from dataclasses import dataclass, field
@dataclass
class ChunkingConfig:
"""
Configuration for text chunking process.
Controls how documents are split into smaller pieces for processing.
"""
# Maximum size of each chunk in tokens/chars
chunk_size: int = 512
# Number of tokens/chars to overlap between chunks
chunk_overlap: int = 50
# Text separators in order of priority
separators: list = field(default_factory=lambda: ["\n\n", "\n", ""])
# Whether to enable multimodal processing (text + images)
enable_multimodal: bool = False
# Preferred field name going forward
storage_config: dict[str, str] = field(default_factory=dict)
# VLM configuration for image captioning
vlm_config: dict[str, str] = field(default_factory=dict)
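Building a ChunkingConfig mirrors what the gRPC handlers assemble from each request; the storage_config and vlm_config keys below are those read in main.py, with placeholder values:

```python
from docreader.models.read_config import ChunkingConfig

config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=50,
    enable_multimodal=True,
    storage_config={
        "provider": "minio",       # placeholder values; keys match main.py
        "bucket_name": "docs",
        "path_prefix": "images",
    },
    vlm_config={
        "base_url": "http://vlm:8000/v1",  # assumed endpoint
        "model_name": "qwen2-vl",          # assumed model name
    },
)
```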

View File

@@ -13,22 +13,18 @@ The parsers extract content from documents and can split them into
meaningful chunks for further processing and indexing. meaningful chunks for further processing and indexing.
""" """
from .base_parser import BaseParser, ParseResult
from .docx_parser import DocxParser
from .doc_parser import DocParser from .doc_parser import DocParser
from .pdf_parser import PDFParser from .docx2_parser import Docx2Parser
from .markdown_parser import MarkdownParser
from .text_parser import TextParser
from .image_parser import ImageParser from .image_parser import ImageParser
from .web_parser import WebParser from .markdown_parser import MarkdownParser
from .parser import Parser from .parser import Parser
from .config import ChunkingConfig from .pdf_parser import PDFParser
from .ocr_engine import OCREngine from .text_parser import TextParser
from .web_parser import WebParser
# Export public classes and modules # Export public classes and modules
__all__ = [ __all__ = [
"BaseParser", # Base parser class that all format parsers inherit from "Docx2Parser", # Parser for .docx files (modern Word documents)
"DocxParser", # Parser for .docx files (modern Word documents)
"DocParser", # Parser for .doc files (legacy Word documents) "DocParser", # Parser for .doc files (legacy Word documents)
"PDFParser", # Parser for PDF documents "PDFParser", # Parser for PDF documents
"MarkdownParser", # Parser for Markdown text files "MarkdownParser", # Parser for Markdown text files
@@ -36,7 +32,4 @@ __all__ = [
"ImageParser", # Parser for images with text content "ImageParser", # Parser for images with text content
"WebParser", # Parser for web pages "WebParser", # Parser for web pages
"Parser", # Main parser factory that selects the appropriate parser "Parser", # Main parser factory that selects the appropriate parser
"ChunkingConfig", # Configuration for text chunking behavior
"ParseResult", # Standard result format returned by all parsers
"OCREngine", # OCR engine for extracting text from images
] ]
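With the path hacks gone, consumers rely on the package's absolute imports. A sketch of the call the gRPC servicer makes (the no-argument Parser() construction is an assumption):

```python
from docreader.models.read_config import ChunkingConfig
from docreader.parser import Parser

parser = Parser()  # assumed no-arg construction, as in DocReaderServicer
result = parser.parse_file(
    "report.docx", "docx", b"...file bytes...", ChunkingConfig()
)
```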

View File

@@ -1,65 +1,28 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import re
import os
import asyncio import asyncio
from typing import List, Dict, Any, Optional, Tuple, Union
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
import logging
import sys
import traceback
import numpy as np
import time
import io import io
import json import logging
from .ocr_engine import OCREngine import os
from .image_utils import image_to_base64 import re
from .config import ChunkingConfig import time
from .storage import create_storage from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Tuple
import requests
from PIL import Image from PIL import Image
# Add parent directory to Python path for src imports from docreader.models.document import Chunk, Document
current_dir = os.path.dirname(os.path.abspath(__file__)) from docreader.models.read_config import ChunkingConfig
parent_dir = os.path.dirname(current_dir) from docreader.parser.caption import Caption
if parent_dir not in sys.path: from docreader.parser.ocr_engine import OCREngine
sys.path.insert(0, parent_dir) from docreader.parser.storage import create_storage
from docreader.splitter.splitter import TextSplitter
try: from docreader.utils import endecode
from services.docreader.src.parser.caption import Caption
except ImportError:
# Fallback: try relative import
try:
from .caption import Caption
except ImportError:
# If both imports fail, set to None
Caption = None
logging.warning(
"Failed to import Caption, image captioning will be unavailable"
)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
@dataclass
class Chunk:
"""Chunk result"""
content: str # Chunk content
seq: int # Chunk sequence number
start: int # Chunk start position
end: int # Chunk end position
images: List[Dict[str, Any]] = field(default_factory=list) # Images in the chunk
@dataclass
class ParseResult:
"""Parse result"""
text: str # Extracted text content
chunks: Optional[List[Chunk]] = None # Chunk results
class BaseParser(ABC): class BaseParser(ABC):
"""Base parser interface""" """Base parser interface"""
@@ -97,17 +60,17 @@ class BaseParser(ABC):
def __init__( def __init__(
self, self,
file_name: str = "", file_name: str = "",
file_type: str = None, file_type: Optional[str] = None,
enable_multimodal: bool = True, enable_multimodal: bool = True,
chunk_size: int = 1000, chunk_size: int = 1000,
chunk_overlap: int = 200, chunk_overlap: int = 200,
separators: list = ["\n\n", "\n", ""], separators: list[str] = ["\n\n", "\n", ""],
ocr_backend: str = "paddle", ocr_backend: str = "paddle",
ocr_config: dict = None, ocr_config: dict = {},
max_image_size: int = 1920, # Maximum image size max_image_size: int = 1920, # Maximum image size
max_concurrent_tasks: int = 5, # Max concurrent tasks max_concurrent_tasks: int = 5, # Max concurrent tasks
max_chunks: int = 1000, # Max number of returned chunks max_chunks: int = 1000, # Max number of returned chunks
chunking_config: ChunkingConfig = None, # Chunking configuration object chunking_config: Optional[ChunkingConfig] = None,
): ):
"""Initialize parser """Initialize parser
@@ -125,7 +88,6 @@ class BaseParser(ABC):
max_chunks: Max number of returned chunks max_chunks: Max number of returned chunks
""" """
# Storage client instance # Storage client instance
self._storage = None
self.file_name = file_name self.file_name = file_name
self.file_type = file_type or os.path.splitext(file_name)[1] self.file_type = file_type or os.path.splitext(file_name)[1]
self.enable_multimodal = enable_multimodal self.enable_multimodal = enable_multimodal
@@ -133,15 +95,16 @@ class BaseParser(ABC):
self.chunk_overlap = chunk_overlap self.chunk_overlap = chunk_overlap
self.separators = separators self.separators = separators
self.ocr_backend = os.getenv("OCR_BACKEND", ocr_backend) self.ocr_backend = os.getenv("OCR_BACKEND", ocr_backend)
self.ocr_config = ocr_config or {} self.ocr_config = ocr_config
self.max_image_size = max_image_size self.max_image_size = max_image_size
self.max_concurrent_tasks = max_concurrent_tasks self.max_concurrent_tasks = max_concurrent_tasks
self.max_chunks = max_chunks self.max_chunks = max_chunks
self.chunking_config = chunking_config self.chunking_config = chunking_config
self.storage = create_storage(
logger.info( self.chunking_config.storage_config if self.chunking_config else None
f"Initializing {self.__class__.__name__} for file: {file_name}, type: {self.file_type}"
) )
logger.info(f"Initializing parser for file: {file_name}, type: {file_type}")
logger.info( logger.info(
f"Parser config: chunk_size={chunk_size}, " f"Parser config: chunk_size={chunk_size}, "
f"overlap={chunk_overlap}, " f"overlap={chunk_overlap}, "
@@ -150,16 +113,24 @@ class BaseParser(ABC):
f"max_chunks={max_chunks}" f"max_chunks={max_chunks}"
) )
# Only initialize Caption service if multimodal is enabled # Only initialize Caption service if multimodal is enabled
if self.enable_multimodal: vlm_config = self.chunking_config.vlm_config if self.chunking_config else None
try: self.caption_parser = (
self.caption_parser = Caption(self.chunking_config.vlm_config) Caption(vlm_config=vlm_config) if self.enable_multimodal else None
except Exception as e: )
logger.warning(f"Failed to initialize Caption service: {str(e)}")
self.caption_parser = None
else:
self.caption_parser = None
def perform_ocr(self, image): @abstractmethod
def parse_into_text(self, content: bytes) -> Document:
"""Parse document content
Args:
content: Document content
Returns:
Either a string containing the parsed text, or a tuple of (text, image_map)
where image_map is a dict mapping image URLs to Image objects
"""
def perform_ocr(self, image: Image.Image):
"""Execute OCR recognition on the image """Execute OCR recognition on the image
Args: Args:
@@ -170,53 +141,23 @@ class BaseParser(ABC):
""" """
start_time = time.time() start_time = time.time()
logger.info("Starting OCR recognition") logger.info("Starting OCR recognition")
resized_image = None
try: # Resize image to avoid processing large images
# Resize image to avoid processing large images resized_image = self._resize_image_if_needed(image)
resized_image = self._resize_image_if_needed(image)
# Get OCR engine # Get OCR engine
ocr_engine = self.get_ocr_engine( ocr_engine = OCREngine.get_instance(self.ocr_backend)
backend_type=self.ocr_backend, **self.ocr_config
)
if ocr_engine is None:
logger.error(
f"OCR engine ({self.ocr_backend}) initialization failed or unavailable, "
"skipping OCR recognition"
)
return ""
# Execute OCR prediction # Execute OCR prediction
logger.info(f"Executing OCR prediction (using {self.ocr_backend} engine)") logger.info(f"Executing OCR prediction (using {self.ocr_backend} engine)")
# Add extra exception handling ocr_result = ocr_engine.predict(resized_image)
try:
ocr_result = ocr_engine.predict(resized_image)
except RuntimeError as e:
# Handle common CUDA memory issues or other runtime errors
logger.error(f"OCR prediction runtime error: {str(e)}")
return ""
except Exception as e:
# Handle other prediction errors
logger.error(f"Unexpected OCR prediction error: {str(e)}")
return ""
process_time = time.time() - start_time process_time = time.time() - start_time
logger.info(f"OCR recognition completed, time: {process_time:.2f} seconds") logger.info(f"OCR recognition completed, time: {process_time:.2f} seconds")
return ocr_result
except Exception as e:
process_time = time.time() - start_time
logger.error(
f"OCR recognition error: {str(e)}, time: {process_time:.2f} seconds"
)
return ""
finally:
# Release image resources
if resized_image is not image and hasattr(resized_image, "close"):
# Only close the new image we created, not the original image
resized_image.close()
def _resize_image_if_needed(self, image): return ocr_result
def _resize_image_if_needed(self, image: Image.Image) -> Image.Image:
"""Resize image if it exceeds maximum size limit """Resize image if it exceeds maximum size limit
Args: Args:
@@ -225,102 +166,21 @@ class BaseParser(ABC):
Returns: Returns:
Resized image object Resized image object
""" """
try: width, height = image.size
# If it's a PIL Image if width > self.max_image_size or height > self.max_image_size:
if hasattr(image, "size"): logger.info(f"Resizing PIL image, original size: {width}x{height}")
width, height = image.size scale = min(self.max_image_size / width, self.max_image_size / height)
if width > self.max_image_size or height > self.max_image_size: new_width = int(width * scale)
logger.info(f"Resizing PIL image, original size: {width}x{height}") new_height = int(height * scale)
scale = min( resized_image = image.resize((new_width, new_height))
self.max_image_size / width, self.max_image_size / height logger.info(f"Resized to: {new_width}x{new_height}")
) return resized_image
new_width = int(width * scale)
new_height = int(height * scale)
resized_image = image.resize((new_width, new_height))
logger.info(f"Resized to: {new_width}x{new_height}")
return resized_image
else:
logger.info(
f"PIL image size {width}x{height} is within limits, no resizing needed"
)
return image
# If it's a numpy array
elif hasattr(image, "shape"):
height, width = image.shape[:2]
if width > self.max_image_size or height > self.max_image_size:
logger.info(
f"Resizing numpy image, original size: {width}x{height}"
)
scale = min(
self.max_image_size / width, self.max_image_size / height
)
new_width = int(width * scale)
new_height = int(height * scale)
# Use PIL for resizing numpy arrays
pil_image = Image.fromarray(image)
resized_pil = pil_image.resize((new_width, new_height))
resized_image = np.array(resized_pil)
logger.info(f"Resized to: {new_width}x{new_height}")
return resized_image
else:
logger.info(
f"Numpy image size {width}x{height} is within limits, no resizing needed"
)
return image
else:
logger.warning(f"Unknown image type: {type(image)}, cannot resize")
return image
except Exception as e:
logger.error(f"Error resizing image: {str(e)}")
return image
def process_image(self, image, image_url=None): logger.info(f"PIL image size is {width}x{height}, no resizing needed")
"""Process image: first perform OCR, then get caption if text is available return image
Args: async def process_image_async(self, image: Image.Image, image_url: str):
image: Image object (PIL.Image or numpy array) """Asynchronously process image: first perform OCR, then get caption
image_url: Image URL (if uploaded)
Returns:
tuple: (ocr_text, caption, image_url)
- ocr_text: OCR extracted text
- caption: Image description (if OCR has text) or empty string
- image_url: Image URL (if provided)
"""
logger.info("Starting image processing (OCR + optional caption)")
# Resize image
image = self._resize_image_if_needed(image)
# Perform OCR recognition
ocr_text = self.perform_ocr(image)
caption = ""
if self.caption_parser:
logger.info(
f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption"
)
# Convert image to base64 for caption generation
img_base64 = image_to_base64(image)
if img_base64:
caption = self.get_image_caption(img_base64)
if caption:
logger.info(f"Successfully obtained image caption: {caption}")
else:
logger.warning("Failed to get caption")
else:
logger.warning("Failed to convert image to base64")
caption = ""
else:
logger.info("Caption service not initialized, skipping caption retrieval")
# Release image resources
del image
return ocr_text, caption, image_url
async def process_image_async(self, image, image_url=None):
"""Asynchronously process image: first perform OCR, then get caption if text is available
Args: Args:
image: Image object (PIL.Image or numpy array) image: Image object (PIL.Image or numpy array)
@@ -333,84 +193,47 @@ class BaseParser(ABC):
- image_url: Image URL (if provided) - image_url: Image URL (if provided)
""" """
logger.info("Starting asynchronous image processing (OCR + optional caption)") logger.info("Starting asynchronous image processing (OCR + optional caption)")
resized_image = None
# Resize image
resized_image = self._resize_image_if_needed(image)
try: try:
# Resize image # Perform OCR recognition
resized_image = self._resize_image_if_needed(image)
# Perform OCR recognition (using run_in_executor to execute synchronous operations in the event loop)
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
try: try:
# Add timeout mechanism to avoid infinite blocking (30 seconds timeout) # Add timeout mechanism to avoid infinite blocking (30 seconds timeout)
ocr_task = loop.run_in_executor(None, self.perform_ocr, resized_image) ocr_task = loop.run_in_executor(None, self.perform_ocr, resized_image)
ocr_text = await asyncio.wait_for(ocr_task, timeout=30.0) ocr_text = await asyncio.wait_for(ocr_task, timeout=30.0)
except asyncio.TimeoutError:
logger.error(
"OCR processing timed out (30 seconds), skipping this image"
)
ocr_text = ""
except Exception as e: except Exception as e:
logger.error(f"OCR processing error: {str(e)}") logger.error(f"OCR processing error, skipping this image: {str(e)}")
ocr_text = "" ocr_text = ""
logger.info( logger.info(f"Successfully obtained image ocr: {ocr_text}")
f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption" img_base64 = endecode.decode_image(resized_image)
) caption = self.get_image_caption(img_base64)
caption = "" logger.info(f"Successfully obtained image caption: {caption}")
if self.caption_parser:
try:
# Convert image to base64 for caption generation
img_base64 = image_to_base64(resized_image)
if img_base64:
# Add timeout to avoid blocking caption retrieval (30 seconds timeout)
caption_task = self.get_image_caption_async(img_base64)
image_data, caption = await asyncio.wait_for(
caption_task, timeout=30.0
)
if caption:
logger.info(
f"Successfully obtained image caption: {caption}"
)
else:
logger.warning("Failed to get caption")
else:
logger.warning("Failed to convert image to base64")
caption = ""
except asyncio.TimeoutError:
logger.warning("Caption retrieval timed out, skipping")
except Exception as e:
logger.error(f"Failed to get caption: {str(e)}")
else:
logger.info(
"Caption service not initialized, skipping caption retrieval"
)
return ocr_text, caption, image_url return ocr_text, caption, image_url
finally: finally:
# Release image resources resized_image.close()
if resized_image is not image and hasattr(resized_image, "close"):
# Only close the new image we created, not the original image
resized_image.close()
async def process_with_limit(self, idx, image, url, semaphore): async def process_with_limit(
self, idx: int, image: Image.Image, url: str, semaphore: asyncio.Semaphore
):
"""Function to process a single image using a semaphore""" """Function to process a single image using a semaphore"""
try: try:
logger.info(f"Waiting to process image {idx+1}") logger.info(f"Waiting to process image {idx + 1}")
async with semaphore: # Use semaphore to control concurrency async with semaphore: # Use semaphore to control concurrency
logger.info(f"Starting to process image {idx+1}") logger.info(f"Starting to process image {idx + 1}")
result = await self.process_image_async(image, url) result = await self.process_image_async(image, url)
logger.info(f"Completed processing image {idx+1}") logger.info(f"Completed processing image {idx + 1}")
return result return result
except Exception as e: except Exception as e:
logger.error(f"Error processing image {idx+1}: {str(e)}") logger.error(f"Error processing image {idx + 1}: {str(e)}")
return ("", "", url) # Return empty result to avoid overall failure return ("", "", url) # Return empty result to avoid overall failure
finally: finally:
# Manually release image resources # Manually release image resources
if hasattr(image, "close"): image.close()
image.close()
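process_with_limit pairs an asyncio.Semaphore with gather so that at most max_concurrent_tasks images are in flight at once. The pattern in isolation, with a sleep standing in for OCR and captioning:

```python
import asyncio

async def process_all(images, max_concurrent_tasks=5):
    semaphore = asyncio.Semaphore(max_concurrent_tasks)

    async def worker(idx, image):
        async with semaphore:       # at most N workers enter this block
            await asyncio.sleep(0)  # stand-in for OCR + caption work
            return idx

    tasks = [worker(i, img) for i, img in enumerate(images)]
    # return_exceptions=True keeps one failure from sinking the whole batch
    return await asyncio.gather(*tasks, return_exceptions=True)

print(asyncio.run(process_all(["a", "b", "c"])))  # [0, 1, 2]
```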
async def process_multiple_images(self, images_data): async def process_multiple_images(self, images_data: List[Tuple[Image.Image, str]]):
"""Process multiple images concurrently """Process multiple images concurrently
Args: Args:
@@ -450,7 +273,7 @@ class BaseParser(ABC):
for i, result in enumerate(completed_results): for i, result in enumerate(completed_results):
if isinstance(result, Exception): if isinstance(result, Exception):
logger.error( logger.error(
f"Image {i+1} processing returned an exception: {str(result)}" f"Image {i + 1} processing returned an exception: {str(result)}"
) )
# For exceptions, add empty results # For exceptions, add empty results
if i < len(images_data): if i < len(images_data):
@@ -467,47 +290,10 @@ class BaseParser(ABC):
logger.info("Image processing resource cleanup complete") logger.info("Image processing resource cleanup complete")
logger.info( logger.info(
f"Completed concurrent processing of {len(results)}/{len(images_data)} images" f"Concurrent processing of {len(results)}/{len(images_data)} images"
) )
return results return results
def decode_bytes(self, content: bytes) -> str:
"""Intelligently decode byte stream, supports multiple encodings
Tries to decode in common encodings, if all fail, uses latin-1 as fallback
Args:
content: Byte stream to decode
Returns:
Decoded string
"""
logger.info(f"Attempting to decode bytes of length: {len(content)}")
# Common encodings, sorted by priority
encodings = ["utf-8", "gb18030", "gb2312", "gbk", "big5", "ascii", "latin-1"]
text = None
# Try decoding with each encoding format
for encoding in encodings:
try:
text = content.decode(encoding)
logger.info(f"Successfully decoded content using {encoding} encoding")
break
except UnicodeDecodeError:
logger.info(f"Failed to decode using {encoding} encoding")
continue
# If all encodings fail, use latin-1 as fallback
if text is None:
text = content.decode("latin-1")
logger.warning(
f"Unable to determine correct encoding, using latin-1 as fallback. "
f"This may cause character issues."
)
logger.info(f"Decoded text length: {len(text)} characters")
return text
def get_image_caption(self, image_data: str) -> str: def get_image_caption(self, image_data: str) -> str:
"""Get image description """Get image description
@@ -517,6 +303,9 @@ class BaseParser(ABC):
Returns: Returns:
Image description Image description
""" """
if not self.caption_parser:
logger.warning("Caption parser not initialized")
return ""
start_time = time.time() start_time = time.time()
logger.info( logger.info(
f"Getting caption for image: {image_data[:250]}..." f"Getting caption for image: {image_data[:250]}..."
@@ -533,80 +322,7 @@ class BaseParser(ABC):
logger.warning("Failed to get caption for image") logger.warning("Failed to get caption for image")
return caption return caption
async def get_image_caption_async(self, image_data: str) -> Tuple[str, str]: def parse(self, content: bytes) -> Document:
"""Asynchronously get image description
Args:
image_data: Image data (base64 encoded string or URL)
Returns:
Tuple[str, str]: Image data and corresponding description
"""
caption = self.get_image_caption(image_data)
return image_data, caption
def __init_storage(self):
"""Initialize storage client based on configuration"""
if self._storage is None:
storage_config = (
self.chunking_config.storage_config if self.chunking_config else None
)
self._storage = create_storage(storage_config)
logger.info(
f"Initialized storage client: {self._storage.__class__.__name__}"
)
return self._storage
def upload_file(self, file_path: str) -> str:
"""Upload file to object storage
Args:
file_path: File path
Returns:
File URL
"""
logger.info(f"Uploading file: {file_path}")
try:
storage = self.__init_storage()
return storage.upload_file(file_path)
except Exception as e:
logger.error(f"Failed to upload file: {str(e)}")
return ""
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to object storage
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
logger.info(f"Uploading bytes content, size: {len(content)} bytes")
try:
storage = self.__init_storage()
return storage.upload_bytes(content, file_ext)
except Exception as e:
logger.error(f"Failed to upload bytes to storage: {str(e)}")
traceback.print_exc()
return ""
@abstractmethod
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""Parse document content
Args:
content: Document content
Returns:
Either a string containing the parsed text, or a tuple of (text, image_map)
where image_map is a dict mapping image URLs to Image objects
"""
pass
def parse(self, content: bytes) -> ParseResult:
"""Parse document content """Parse document content
Args: Args:
@@ -616,17 +332,19 @@ class BaseParser(ABC):
Parse result Parse result
""" """
logger.info( logger.info(
f"Parsing document with {self.__class__.__name__}, content size: {len(content)} bytes" f"Parsing document with {self.__class__.__name__}, bytes: {len(content)}"
) )
parse_result = self.parse_into_text(content) document = self.parse_into_text(content)
if isinstance(parse_result, tuple): logger.info(
text, image_map = parse_result f"Extracted {len(document.content)} characters from {self.file_name}"
else: )
text = parse_result splitter = TextSplitter(
image_map = {} chunk_size=self.chunk_size,
logger.info(f"Extracted {len(text)} characters of text from {self.file_name}") chunk_overlap=self.chunk_overlap,
logger.info(f"Beginning chunking process for text") separators=self.separators,
chunks = self.chunk_text(text) )
chunk_str = splitter.split_text(document.content)
chunks = self._str_to_chunk(chunk_str)
logger.info(f"Created {len(chunks)} chunks from document") logger.info(f"Created {len(chunks)} chunks from document")
# Limit the number of returned chunks # Limit the number of returned chunks
@@ -636,7 +354,7 @@ class BaseParser(ABC):
) )
chunks = chunks[: self.max_chunks] chunks = chunks[: self.max_chunks]
# If multimodal is enabled and file type is supported, process images in each chunk # If multimodal is enabled and file type is supported, process images
if self.enable_multimodal: if self.enable_multimodal:
# Get file extension and convert to lowercase # Get file extension and convert to lowercase
file_ext = ( file_ext = (
@@ -647,11 +365,12 @@ class BaseParser(ABC):
# Define allowed file types for image processing # Define allowed file types for image processing
allowed_types = [ allowed_types = [
".pdf", # PDF files # Text files
".pdf",
".md", ".md",
".markdown", # Markdown files ".markdown",
".doc", ".doc",
".docx", # Word documents ".docx",
# Image files # Image files
".jpg", ".jpg",
".jpeg", ".jpeg",
@@ -666,13 +385,21 @@ class BaseParser(ABC):
logger.info( logger.info(
f"Processing images in each chunk for file type: {file_ext}" f"Processing images in each chunk for file type: {file_ext}"
) )
chunks = self.process_chunks_images(chunks, image_map) chunks = self.process_chunks_images(chunks, document.images)
else: else:
logger.info( logger.info(
f"Skipping image processing for unsupported file type: {file_ext}" f"Skipping image processing for unsupported file type: {file_ext}"
) )
return ParseResult(text=text, chunks=chunks) document.chunks = chunks
return document
def _str_to_chunk(self, text: List[Tuple[int, int, str]]) -> List[Chunk]:
"""Convert string to Chunk object"""
return [
Chunk(seq=i, content=t, start=start, end=end)
for i, (start, end, t) in enumerate(text)
]
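parse() now delegates splitting to TextSplitter and converts the result with _str_to_chunk; judging from that helper, split_text() is assumed to yield (start, end, text) tuples:

```python
from docreader.models.document import Chunk
from docreader.splitter.splitter import TextSplitter

splitter = TextSplitter(
    chunk_size=1000, chunk_overlap=200, separators=["\n\n", "\n", ""]
)
spans = splitter.split_text("First paragraph.\n\nSecond paragraph.")
chunks = [
    Chunk(seq=i, content=t, start=start, end=end)
    for i, (start, end, t) in enumerate(spans)
]
```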
def _split_into_units(self, text: str) -> List[str]: def _split_into_units(self, text: str) -> List[str]:
""" """
@@ -682,9 +409,7 @@ class BaseParser(ABC):
Returns: Returns:
基本单元的列表 基本单元的列表
""" """
logger.info( logger.info(f"Splitting text into basic units, text length: {len(text)}")
f"Splitting text into basic units with robust structure protection, text length: {len(text)}"
)
# 定义所有需要作为整体保护的结构模式 --- # 定义所有需要作为整体保护的结构模式 ---
table_pattern = r"(?m)(^\|.*\|[ \t]*\r?\n(?:[ \t]*\r?\n)?^\|\s*:?--+.*\r?\n(?:^\|.*\|\r?\n?)*)" table_pattern = r"(?m)(^\|.*\|[ \t]*\r?\n(?:[ \t]*\r?\n)?^\|\s*:?--+.*\r?\n(?:^\|.*\|\r?\n?)*)"
@@ -710,7 +435,8 @@ class BaseParser(ABC):
# 按起始位置排序 # 按起始位置排序
protected_ranges.sort(key=lambda x: x[0]) protected_ranges.sort(key=lambda x: x[0])
logger.info( logger.info(
f"Found {len(protected_ranges)} protected structures (tables, code, formulas, images, links)." f"Found {len(protected_ranges)} protected structures "
"(tables, code, formulas, images, links)."
) )
# 合并可能重叠的保护范围 --- # 合并可能重叠的保护范围 ---
@@ -731,7 +457,7 @@ class BaseParser(ABC):
merged_ranges.append((current_start, current_end)) merged_ranges.append((current_start, current_end))
protected_ranges = merged_ranges protected_ranges = merged_ranges
logger.info( logger.info(
f"After merging overlaps, {len(protected_ranges)} protected ranges remain." f"After overlaps, {len(protected_ranges)} protected ranges remain."
) )
# 根据保护范围和分隔符来分割文本 --- # 根据保护范围和分隔符来分割文本 ---
@@ -749,7 +475,7 @@ class BaseParser(ABC):
segments = re.split(separator_pattern, pre_text) segments = re.split(separator_pattern, pre_text)
units.extend([s for s in segments if s]) # 添加所有非空部分 units.extend([s for s in segments if s]) # 添加所有非空部分
# b. 将整个受保护的块(例如,一个完整的表格)作为一个单独的、不可分割的单元添加 # b. 将整个受保护的块(例如,一个完整的表格)作为一个不可分割的单元添加
protected_text = text[start:end] protected_text = text[start:end]
units.append(protected_text) units.append(protected_text)
@@ -764,38 +490,6 @@ class BaseParser(ABC):
logger.info(f"Text splitting complete, created {len(units)} final basic units.") logger.info(f"Text splitting complete, created {len(units)} final basic units.")
return units return units
def _find_complete_units(self, units: List[str], target_size: int) -> List[str]:
"""Find a list of complete units that do not exceed the target size
Args:
units: List of units
target_size: Target size
Returns:
List of complete units
"""
logger.info(f"Finding complete units with target size: {target_size}")
result = []
current_size = 0
for unit in units:
unit_size = len(unit)
if current_size + unit_size > target_size and result:
logger.info(
f"Reached target size limit at {current_size} characters, stopping"
)
break
result.append(unit)
current_size += unit_size
logger.info(
f"Added unit of size {unit_size}, current total: {current_size}/{target_size}"
)
logger.info(
f"Found {len(result)} complete units totaling {current_size} characters"
)
return result
def chunk_text(self, text: str) -> List[Chunk]: def chunk_text(self, text: str) -> List[Chunk]:
"""Chunk text, preserving Markdown structure """Chunk text, preserving Markdown structure
@@ -825,7 +519,7 @@ class BaseParser(ABC):
for i, unit in enumerate(units): for i, unit in enumerate(units):
unit_size = len(unit) unit_size = len(unit)
logger.info(f"Processing unit {i+1}/{len(units)}, size: {unit_size}") logger.info(f"Processing unit {i + 1}/{len(units)}, size: {unit_size}")
# If current chunk plus new unit exceeds size limit, create new chunk # If current chunk plus new unit exceeds size limit, create new chunk
if current_size + unit_size > self.chunk_size and current_chunk: if current_size + unit_size > self.chunk_size and current_chunk:
@@ -855,14 +549,12 @@ class BaseParser(ABC):
for u in reversed(current_chunk): for u in reversed(current_chunk):
if overlap_size + len(u) > overlap_target: if overlap_size + len(u) > overlap_target:
logger.info( logger.info(
f"Reached overlap target ({overlap_size}/{overlap_target})" f"Overlap target ({overlap_size}/{overlap_target})"
) )
break break
overlap_units.insert(0, u) overlap_units.insert(0, u)
overlap_size += len(u) overlap_size += len(u)
logger.info( logger.info(f"Added unit to overlap, size: {overlap_size}")
f"Added unit to overlap, current overlap size: {overlap_size}"
)
# Remove elements from overlap that are included in separators # Remove elements from overlap that are included in separators
start_index = 0 start_index = 0
@@ -883,7 +575,7 @@ class BaseParser(ABC):
overlap_units = overlap_units[start_index:] overlap_units = overlap_units[start_index:]
logger.info( logger.info(
f"Final overlap: {len(overlap_units)} units, {overlap_size} characters" f"Overlap: {len(overlap_units)} units, {overlap_size} size"
) )
current_chunk = overlap_units current_chunk = overlap_units
@@ -899,7 +591,7 @@ class BaseParser(ABC):
current_chunk.append(unit) current_chunk.append(unit)
current_size += unit_size current_size += unit_size
logger.info( logger.info(
f"Added unit to current chunk, now at {current_size}/{self.chunk_size} characters" f"Added unit to current chunk, at {current_size}/{self.chunk_size}"
) )
# Add the last chunk # Add the last chunk
@@ -925,12 +617,13 @@ class BaseParser(ABC):
chunk: Document chunk chunk: Document chunk
Returns: Returns:
List of image information, each element contains image URL and match position List of image information
""" """
logger.info(f"Extracting image information from Chunk #{chunk.seq}") logger.info(f"Extracting image information from Chunk #{chunk.seq}")
text = chunk.content text = chunk.content
# Regex to extract image information from text, supporting Markdown images and HTML images # Regex to extract image information from text,
# support: Markdown images, HTML images
img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)|<img [^>]*src="([^"]+)" [^>]*>' img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)|<img [^>]*src="([^"]+)" [^>]*>'
# Extract image information # Extract image information
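The pattern matches both Markdown and HTML image references: group 2 carries the Markdown URL, group 3 the HTML src. A quick check:

```python
import re

img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)|<img [^>]*src="([^"]+)" [^>]*>'
text = '![logo](https://example.com/a.png) and <img class="c" src="b.png" alt="x">'
for match in re.finditer(img_pattern, text):
    print(match.group(2) or match.group(3))
# https://example.com/a.png
# b.png
```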
@@ -954,28 +647,28 @@ class BaseParser(ABC):
images_info.append(image_info) images_info.append(image_info)
logger.info( logger.info(
f"Image in Chunk #{chunk.seq} {match_idx+1}: " f"URL={img_url[:50]}..." f"Image in Chunk #{chunk.seq} {match_idx + 1}: URL={img_url[:50]}..."
if len(img_url) > 50 if len(img_url) > 50
else f"Image in Chunk #{chunk.seq} {match_idx+1}: URL={img_url}" else f"Image in Chunk #{chunk.seq} {match_idx + 1}: URL={img_url}"
) )
return images_info return images_info
async def download_and_upload_image(self, img_url: str): async def download_and_upload_image(
"""Download image and upload to object storage, if it's already an object storage path or local path, use directly self, img_url: str
) -> Tuple[str, str, Image.Image | None]:
"""Download image and upload to object storage,
if it's already an object storage path or local path, use directly
Args: Args:
img_url: Image URL or local path img_url: Image URL or local path
Returns: Returns:
tuple: (original URL, storage URL, image object), if failed returns (original URL, None, None) tuple: (original URL, storage URL, image object),
if failed returns (original URL, None, None)
""" """
try: try:
import requests
from PIL import Image
import io
# Check if it's already a storage URL (COS or MinIO) # Check if it's already a storage URL (COS or MinIO)
is_storage_url = any( is_storage_url = any(
pattern in img_url pattern in img_url
@@ -997,12 +690,7 @@ class BaseParser(ABC):
response = requests.get(img_url, timeout=5, proxies=proxies) response = requests.get(img_url, timeout=5, proxies=proxies)
if response.status_code == 200: if response.status_code == 200:
image = Image.open(io.BytesIO(response.content)) image = Image.open(io.BytesIO(response.content))
try: return img_url, img_url, image
return img_url, img_url, image
finally:
# Ensure image resources are also released after the function returns
# Image will be closed by the caller
pass
else: else:
logger.warning( logger.warning(
f"Failed to get storage image: {response.status_code}" f"Failed to get storage image: {response.status_code}"
@@ -1022,7 +710,7 @@ class BaseParser(ABC):
# Upload to storage # Upload to storage
with open(img_url, "rb") as f: with open(img_url, "rb") as f:
content = f.read() content = f.read()
storage_url = self.upload_bytes(content) storage_url = self.storage.upload_bytes(content)
logger.info( logger.info(
f"Successfully uploaded local image to storage: {storage_url}" f"Successfully uploaded local image to storage: {storage_url}"
) )
@@ -1031,7 +719,7 @@ class BaseParser(ABC):
logger.error(f"Error processing local image: {str(e)}") logger.error(f"Error processing local image: {str(e)}")
if image and hasattr(image, "close"): if image and hasattr(image, "close"):
image.close() image.close()
return img_url, None, None return img_url, img_url, None
# Normal remote URL download handling # Normal remote URL download handling
else: else:
@@ -1044,9 +732,7 @@ class BaseParser(ABC):
if https_proxy: if https_proxy:
proxies["https"] = https_proxy proxies["https"] = https_proxy
logger.info( logger.info(f"Downloading image {img_url}, using proxy: {proxies}")
f"Downloading image {img_url}, using proxy: {proxies if proxies else 'None'}"
)
response = requests.get(img_url, timeout=5, proxies=proxies) response = requests.get(img_url, timeout=5, proxies=proxies)
if response.status_code == 200: if response.status_code == 200:
@@ -1054,7 +740,7 @@ class BaseParser(ABC):
image = Image.open(io.BytesIO(response.content)) image = Image.open(io.BytesIO(response.content))
try: try:
# Upload to storage using the method in BaseParser # Upload to storage using the method in BaseParser
storage_url = self.upload_bytes(response.content) storage_url = self.storage.upload_bytes(response.content)
logger.info( logger.info(
f"Successfully uploaded image to storage: {storage_url}" f"Successfully uploaded image to storage: {storage_url}"
) )
@@ -1064,11 +750,11 @@ class BaseParser(ABC):
pass pass
else: else:
logger.warning(f"Failed to download image: {response.status_code}") logger.warning(f"Failed to download image: {response.status_code}")
return img_url, None, None return img_url, img_url, None
except Exception as e: except Exception as e:
logger.error(f"Error downloading or processing image: {str(e)}") logger.error(f"Error downloading or processing image: {str(e)}")
return img_url, None, None return img_url, img_url, None
async def process_chunk_images_async( async def process_chunk_images_async(
self, chunk, chunk_idx, total_chunks, image_map=None self, chunk, chunk_idx, total_chunks, image_map=None
@@ -1086,18 +772,19 @@ class BaseParser(ABC):
""" """
logger.info( logger.info(
f"Starting to process images in Chunk #{chunk_idx+1}/{total_chunks}" f"Starting to process images in Chunk #{chunk_idx + 1}/{total_chunks}"
) )
# Extract image information from the Chunk # Extract image information from the Chunk
images_info = self.extract_images_from_chunk(chunk) images_info = self.extract_images_from_chunk(chunk)
if not images_info: if not images_info:
logger.info(f"Chunk #{chunk_idx+1} found no images") logger.info(f"Chunk #{chunk_idx + 1} found no images")
return chunk return chunk
# Prepare images that need to be downloaded and processed # Prepare images that need to be downloaded and processed
images_to_process = [] images_to_process = []
url_to_info_map = {} # Map URL to image information # Map URL to image information
url_to_info_map = {}
# Record all image URLs that need to be processed # Record all image URLs that need to be processed
for img_info in images_info: for img_info in images_info:
@@ -1106,14 +793,21 @@ class BaseParser(ABC):
results = [] results = []
download_tasks = [] download_tasks = []
for img_url in url_to_info_map.keys(): # Check if image is already in the image_map # Check if image is already in the image_map
for img_url in url_to_info_map.keys():
if image_map and img_url in image_map: if image_map and img_url in image_map:
logger.info(f"Image already in image_map: {img_url}, using cached object") logger.info(
results.append((img_url, img_url, image_map[img_url])) f"Image already in image_map: {img_url}, using cached object"
)
image = Image.open(
io.BytesIO(endecode.encode_image(image_map[img_url]))
)
results.append((img_url, img_url, image))
else: else:
download_task = self.download_and_upload_image(img_url) download_task = self.download_and_upload_image(img_url)
download_tasks.append(download_task) download_tasks.append(download_task)
# Concurrent download and upload of images, ignore images that are already in the image_map # Concurrent download and upload of images,
# ignore images that are already in the image_map
results.extend(await asyncio.gather(*download_tasks)) results.extend(await asyncio.gather(*download_tasks))
# Process download results, prepare for OCR processing # Process download results, prepare for OCR processing
@@ -1123,16 +817,17 @@ class BaseParser(ABC):
img_info["cos_url"] = cos_url img_info["cos_url"] = cos_url
images_to_process.append((image, cos_url)) images_to_process.append((image, cos_url))
# If no images were successfully downloaded and uploaded, return the original Chunk # If no images were successfully downloaded and uploaded,
# return the original Chunk
if not images_to_process: if not images_to_process:
logger.info( logger.info(
f"Chunk #{chunk_idx+1} found no successfully downloaded and uploaded images" f"Chunk #{chunk_idx + 1} not found downloaded and uploaded images"
) )
return chunk return chunk
# Concurrent processing of all images (OCR + caption) # Concurrent processing of all images (OCR + caption)
logger.info( logger.info(
f"Processing {len(images_to_process)} images in Chunk #{chunk_idx+1}" f"Processing {len(images_to_process)} images in Chunk #{chunk_idx + 1}"
) )
# Concurrent processing of all images # Concurrent processing of all images
@@ -1163,10 +858,12 @@ class BaseParser(ABC):
# Update image information in the Chunk # Update image information in the Chunk
chunk.images = processed_images chunk.images = processed_images
logger.info(f"Completed image processing in Chunk #{chunk_idx+1}") logger.info(f"Completed image processing in Chunk #{chunk_idx + 1}")
return chunk return chunk
def process_chunks_images(self, chunks: List[Chunk], image_map=None) -> List[Chunk]: def process_chunks_images(
self, chunks: List[Chunk], image_map: Dict[str, str] = {}
) -> List[Chunk]:
"""Concurrent processing of images in all Chunks """Concurrent processing of images in all Chunks
Args: Args:
@@ -1210,7 +907,7 @@ class BaseParser(ABC):
processed_chunks = [] processed_chunks = []
for i, result in enumerate(results): for i, result in enumerate(results):
if isinstance(result, Exception): if isinstance(result, Exception):
logger.error(f"Error processing Chunk {i+1}: {str(result)}") logger.error(f"Error processing Chunk {i + 1}: {str(result)}")
# Keep original Chunk # Keep original Chunk
if i < len(chunks): if i < len(chunks):
processed_chunks.append(chunks[i]) processed_chunks.append(chunks[i])
@@ -1235,7 +932,7 @@ class BaseParser(ABC):
# Execute processing for all Chunks # Execute processing for all Chunks
processed_chunks = loop.run_until_complete(process_all_chunks()) processed_chunks = loop.run_until_complete(process_all_chunks())
logger.info( logger.info(
f"Successfully completed concurrent processing of {len(processed_chunks)}/{len(chunks)} chunks" f"Completed processing of {len(processed_chunks)}/{len(chunks)} chunks"
) )
return processed_chunks return processed_chunks

View File

@@ -3,11 +3,10 @@ import logging
 import os
 import time
 from dataclasses import dataclass, field
-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union

-import requests
 import ollama
+import requests

 logger = logging.getLogger(__name__)

@@ -158,11 +157,16 @@ class CaptionChatResp:
        Returns:
            The content string from the first choice, or empty string if no choices
        """
-        if self.choices:
-            logger.info("Retrieving content from first choice")
-            return self.choices[0].message.content
-        logger.warning("No choices available in response")
-        return ""
+        if (
+            not self.choices
+            or not self.choices[0]
+            or not self.choices[0].message
+            or not self.choices[0].message.content
+        ):
+            logger.warning("No choices available in response")
+            return ""
+        logger.info("Retrieving content from first choice")
+        return self.choices[0].message.content

 class Caption:
@@ -171,33 +175,43 @@ class Caption:
    Uses an external API to process images and return textual descriptions.
    """

-    def __init__(self, vlm_config=None):
-        """Initialize the Caption service with configuration from parameters or environment variables."""
+    def __init__(self, vlm_config: Optional[Dict[str, str]] = None):
+        """
+        Initialize the Caption service with configuration
+        from parameters or environment variables.
+        """
        logger.info("Initializing Caption service")
        self.prompt = """简单凝炼的描述图片的主要内容"""
+        self.timeout = 30

-        # Use provided VLM config if available, otherwise fall back to environment variables
+        # Use provided VLM config if available,
+        # otherwise fall back to environment variables
        if vlm_config and vlm_config.get("base_url") and vlm_config.get("model_name"):
            self.completion_url = vlm_config.get("base_url", "") + "/chat/completions"
            self.model = vlm_config.get("model_name", "")
            self.api_key = vlm_config.get("api_key", "")
            self.interface_type = vlm_config.get("interface_type", "openai").lower()
        else:
-            if os.getenv("VLM_MODEL_BASE_URL") == "" or os.getenv("VLM_MODEL_NAME") == "":
+            base_url = os.getenv("VLM_MODEL_BASE_URL")
+            model_name = os.getenv("VLM_MODEL_NAME")
+            if not base_url or not model_name:
                logger.error("VLM_MODEL_BASE_URL or VLM_MODEL_NAME is not set")
                return
-            self.completion_url = os.getenv("VLM_MODEL_BASE_URL") + "/chat/completions"
-            self.model = os.getenv("VLM_MODEL_NAME")
-            self.api_key = os.getenv("VLM_MODEL_API_KEY")
+            self.completion_url = base_url + "/chat/completions"
+            self.model = model_name
+            self.api_key = os.getenv("VLM_MODEL_API_KEY", "")
            self.interface_type = os.getenv("VLM_INTERFACE_TYPE", "openai").lower()

        # Validate the interface type
        if self.interface_type not in ["ollama", "openai"]:
-            logger.warning(f"Unknown interface type: {self.interface_type}, defaulting to openai")
+            logger.warning(
+                f"Unknown interface type: {self.interface_type}, defaulting to openai"
+            )
            self.interface_type = "openai"

        logger.info(
-            f"Service configured with model: {self.model}, endpoint: {self.completion_url}, interface: {self.interface_type}"
+            f"Configured with model: {self.model}, "
+            f"endpoint: {self.completion_url}, interface: {self.interface_type}"
        )

    def _call_caption_api(self, image_data: str) -> Optional[CaptionChatResp]:
@@ -210,8 +224,8 @@ class Caption:
        Returns:
            CaptionChatResp object if successful, None otherwise
        """
-        logger.info(f"Calling Caption API for image captioning")
-        logger.info(f"Processing image data: {image_data[:50] if len(image_data) > 50 else image_data}")
+        logger.info("Calling Caption API for image captioning")
+        logger.info(f"Processing image data: {image_data[:50]}...")

        # Choose the call path based on the interface type
        if self.interface_type == "ollama":
@@ -226,39 +240,35 @@ class Caption:
        client = ollama.Client(
            host=host,
+            timeout=self.timeout,
        )

        try:
            logger.info(f"Calling Ollama API with model: {self.model}")
            # Call the Ollama API, passing the base64-encoded image via `images`
            response = client.generate(
                model=self.model,
                prompt="简单凝炼的描述图片的主要内容",
                images=[image_base64],  # image_base64 is base64-encoded image data
                options={"temperature": 0.1},
                stream=False,
            )

            # Build the response object
            caption_resp = CaptionChatResp(
                id="ollama_response",
                created=int(time.time()),
-                model=self.model,
+                model=Model(id=self.model),
                object="chat.completion",
                choices=[
-                    Choice(
-                        message=Message(
-                            role="assistant",
-                            content=response.response
-                        )
-                    )
-                ]
+                    Choice(message=Message(role="assistant", content=response.response))
+                ],
            )
            logger.info("Successfully received response from Ollama API")
            return caption_resp
        except Exception as e:
            logger.error(f"Error calling Ollama API: {e}")
            return None
@@ -266,13 +276,16 @@ class Caption:
    def _call_openai_api(self, image_base64: str) -> Optional[CaptionChatResp]:
        """Call OpenAI-compatible API for image captioning."""
        logger.info(f"Calling OpenAI-compatible API with model: {self.model}")

        user_msg = UserMessage(
            role="user",
            content=[
                Content(type="text", text=self.prompt),
                Content(
-                    type="image_url", image_url=ImageUrl(url="data:image/png;base64," + image_base64, detail="auto")
+                    type="image_url",
+                    image_url=ImageUrl(
+                        url="data:image/png;base64," + image_base64, detail="auto"
+                    ),
                ),
            ],
        )
@@ -295,23 +308,23 @@ class Caption:
            headers["Authorization"] = f"Bearer {self.api_key}"

        try:
-            logger.info(f"Sending request to OpenAI-compatible API with model: {self.model}")
+            logger.info(
+                f"Sending request to OpenAI-compatible API with model: {self.model}"
+            )
            response = requests.post(
                self.completion_url,
                data=json.dumps(gpt_req, default=lambda o: o.__dict__, indent=4),
                headers=headers,
-                timeout=30,
+                timeout=self.timeout,
            )
            if response.status_code != 200:
                logger.error(
-                    f"OpenAI-compatible API returned non-200 status code: {response.status_code}"
+                    f"OpenAI API returned non-200 status code: {response.status_code}"
                )
                response.raise_for_status()

-            logger.info(
-                f"Successfully received response from OpenAI-compatible API with status: {response.status_code}"
-            )
-            logger.info(f"Converting response to CaptionChatResp object")
+            logger.info(f"Received from OpenAI with status: {response.status_code}")
+            logger.info("Converting response to CaptionChatResp object")
            caption_resp = CaptionChatResp.from_json(response.json())

            if caption_resp.usage:
@@ -322,7 +335,7 @@ class Caption:
            return caption_resp
        except requests.exceptions.Timeout:
-            logger.error(f"Timeout while calling OpenAI-compatible API after 30 seconds")
+            logger.error("Timeout while calling OpenAI-compatible API after 30 seconds")
            return None
        except requests.exceptions.RequestException as e:
            logger.error(f"Request error calling OpenAI-compatible API: {e}")


@@ -0,0 +1,70 @@
import logging
from typing import List, Tuple, Type
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
logger = logging.getLogger(__name__)
class FirstParser(BaseParser):
_parser_cls: Tuple[Type["BaseParser"], ...] = ()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._parsers: List[BaseParser] = []
for parser_cls in self._parser_cls:
try:
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
except Exception as e:
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
def parse_into_text(self, content: bytes) -> Document:
for p in self._parsers:
document = p.parse_into_text(content)
if document.is_valid():
return document
return Document()
@classmethod
def create(cls, *parser_classes: Type["BaseParser"]) -> Type["FirstParser"]:
names = "_".join([p.__name__ for p in parser_classes])
return type(f"FirstParser_{names}", (cls,), {"_parser_cls": parser_classes})
class PipelineParser(BaseParser):
_parser_cls: Tuple[Type["BaseParser"], ...] = ()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._parsers: List[BaseParser] = []
for parser_cls in self._parser_cls:
try:
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
except Exception as e:
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
def parse_into_text(self, content: bytes) -> Document:
document = Document()
for p in self._parsers:
document = p.parse_into_text(content)
content = endecode.encode_bytes(document.content)
return document
@classmethod
def create(cls, *parser_classes: Type["BaseParser"]) -> Type["PipelineParser"]:
names = "_".join([p.__name__ for p in parser_classes])
return type(f"PipelineParser_{names}", (cls,), {"_parser_cls": parser_classes})
if __name__ == "__main__":
from docreader.parser.markdown_parser import MarkdownParser
cls = FirstParser.create(MarkdownParser)
parser = cls()
print(parser.parse_into_text(b"aaa"))
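
Note: FirstParser's fallback hinges on Document.is_valid(); the model class lives outside this diff, but a minimal sketch of the assumed contract would be:

from dataclasses import dataclass, field
from typing import Dict

@dataclass
class Document:
    # sketch of the model assumed in docreader/models/document.py
    content: str = ""
    images: Dict[str, str] = field(default_factory=dict)

    def is_valid(self) -> bool:
        # a parse attempt counts as successful once it produced any text
        return bool(self.content.strip())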


@@ -1,21 +0,0 @@
from dataclasses import dataclass, field
@dataclass
class ChunkingConfig:
"""
Configuration for text chunking process.
Controls how documents are split into smaller pieces for processing.
"""
chunk_size: int = 512 # Maximum size of each chunk in tokens/chars
chunk_overlap: int = 50 # Number of tokens/chars to overlap between chunks
separators: list = field(
default_factory=lambda: ["\n\n", "\n", ""]
) # Text separators in order of priority
enable_multimodal: bool = (
False # Whether to enable multimodal processing (text + images)
)
storage_config: dict = None # Preferred field name going forward
vlm_config: dict = None # VLM configuration for image captioning
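
Note: the dataclass is deleted here rather than fixed in place. Wherever its replacement lives, an equivalent definition with explicit Optional defaults (the old storage_config: dict = None relied on an untyped None default) might look like this sketch:

from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class ChunkingConfig:
    chunk_size: int = 512
    chunk_overlap: int = 50
    separators: List[str] = field(default_factory=lambda: ["\n\n", "\n", ""])
    enable_multimodal: bool = False
    storage_config: Optional[dict] = None
    vlm_config: Optional[dict] = None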


@@ -1,134 +1,88 @@
-import asyncio
 import logging
-import re
-import tempfile
 import os
 import subprocess
-import shutil
-from io import BytesIO
-from typing import Optional, List, Tuple
-
-import textract
-from PIL import Image
-import zipfile
-import xml.etree.ElementTree as ET
-
-from .base_parser import BaseParser
-from .docx_parser import DocxParser, Docx
+from typing import List, Optional
+
+import textract
+
+from docreader.models.document import Document
+from docreader.parser.docx2_parser import Docx2Parser
+from docreader.utils.tempfile import TempDirContext, TempFileContext

 logger = logging.getLogger(__name__)

-class DocParser(BaseParser):
+class DocParser(Docx2Parser):
    """DOC document parser"""

-    def parse_into_text(self, content: bytes) -> str:
-        """Parse DOC document
-
-        Args:
-            content: DOC document content
-
-        Returns:
-            Parse result
-        """
+    def parse_into_text(self, content: bytes) -> Document:
        logger.info(f"Parsing DOC document, content size: {len(content)} bytes")
+        handle_chain = [
+            # 1. Try to convert to docx format to extract images
+            self._parse_with_docx,
+            # 2. If image extraction is not needed or conversion failed,
+            # try using antiword to extract text
+            self._parse_with_antiword,
+            # 3. If antiword extraction fails, use textract
+            self._parse_with_textract,
+        ]

        # Save byte content as a temporary file
-        with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as temp_file:
-            temp_file_path = temp_file.name
-            temp_file.write(content)
-            temp_file.flush()
-            logger.info(f"Saved DOC content to temporary file: {temp_file_path}")
-
-        try:
-            # First try to convert to docx format to extract images
-            if self.enable_multimodal:
-                logger.info("Multimodal enabled, attempting to extract images from DOC")
-                docx_content = self._convert_doc_to_docx(temp_file_path)
-                if docx_content:
-                    logger.info("Successfully converted DOC to DOCX, using DocxParser")
-                    # Use existing DocxParser to parse the converted docx
-                    docx_parser = DocxParser(
-                        file_name=self.file_name,
-                        file_type="docx",
-                        enable_multimodal=self.enable_multimodal,
-                        chunk_size=self.chunk_size,
-                        chunk_overlap=self.chunk_overlap,
-                        chunking_config=self.chunking_config,
-                        separators=self.separators,
-                    )
-                    text = docx_parser.parse_into_text(docx_content)
-                    logger.info(f"Extracted {len(text)} characters using DocxParser")
-
-                    # Clean up temporary file
-                    os.unlink(temp_file_path)
-                    logger.info(f"Deleted temporary file: {temp_file_path}")
-
-                    return text
-                else:
-                    logger.warning(
-                        "Failed to convert DOC to DOCX, falling back to text-only extraction"
-                    )
-
-            # If image extraction is not needed or conversion failed, try using antiword to extract text
-            try:
-                logger.info("Attempting to parse DOC file with antiword")
-                # Check if antiword is installed
-                antiword_path = self._find_antiword_path()
-
-                if antiword_path:
-                    # Use antiword to extract text directly
-                    logger.info(f"Using antiword at {antiword_path} to extract text")
-                    process = subprocess.Popen(
-                        [antiword_path, temp_file_path],
-                        stdout=subprocess.PIPE,
-                        stderr=subprocess.PIPE,
-                    )
-                    stdout, stderr = process.communicate()
-
-                    if process.returncode == 0:
-                        text = stdout.decode("utf-8", errors="ignore")
-                        logger.info(
-                            f"Successfully extracted {len(text)} characters using antiword"
-                        )
-
-                        # Clean up temporary file
-                        os.unlink(temp_file_path)
-                        logger.info(f"Deleted temporary file: {temp_file_path}")
-
-                        return text
-                    else:
-                        logger.warning(
-                            f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
-                        )
-                else:
-                    logger.warning("antiword not found, falling back to textract")
-            except Exception as e:
-                logger.warning(
-                    f"Error using antiword: {str(e)}, falling back to textract"
-                )
-
-            # If antiword fails, try using textract
-            logger.info("Parsing DOC file with textract")
-            text = textract.process(temp_file_path, method="antiword").decode("utf-8")
-            logger.info(
-                f"Successfully extracted {len(text)} characters of text from DOC document using textract"
-            )
-
-            # Clean up temporary file
-            os.unlink(temp_file_path)
-            logger.info(f"Deleted temporary file: {temp_file_path}")
-
-            return text
-        except Exception as e:
-            logger.error(f"Error parsing DOC document: {str(e)}")
-            # Ensure temporary file is cleaned up
-            if os.path.exists(temp_file_path):
-                os.unlink(temp_file_path)
-                logger.info(f"Deleted temporary file after error: {temp_file_path}")
-            return ""
-
-    def _convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
+        with TempFileContext(content, ".doc") as temp_file_path:
+            for handle in handle_chain:
+                try:
+                    document = handle(temp_file_path)
+                    if document:
+                        return document
+                except Exception as e:
+                    logger.warning(f"Failed to parse DOC with {handle.__name__} {e}")
+        return Document(content="")
+
+    def _parse_with_docx(self, temp_file_path: str) -> Document:
+        logger.info("Multimodal enabled, attempting to extract images from DOC")
+        docx_content = self._try_convert_doc_to_docx(temp_file_path)
+        if not docx_content:
+            raise RuntimeError("Failed to convert DOC to DOCX")
+        logger.info("Successfully converted DOC to DOCX, using DocxParser")
+        # Use existing DocxParser to parse the converted docx
+        document = super(Docx2Parser, self).parse_into_text(docx_content)
+        logger.info(f"Extracted {len(document.content)} characters using DocxParser")
+        return document
+
+    def _parse_with_antiword(self, temp_file_path: str) -> Document:
+        logger.info("Attempting to parse DOC file with antiword")
+        # Check if antiword is installed
+        antiword_path = self._try_find_antiword()
+        if not antiword_path:
+            raise RuntimeError("antiword not found in PATH")
+        # Use antiword to extract text directly
+        process = subprocess.Popen(
+            [antiword_path, temp_file_path],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        stdout, stderr = process.communicate()
+        if process.returncode != 0:
+            raise RuntimeError(
+                f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
+            )
+        text = stdout.decode("utf-8", errors="ignore")
+        logger.info(f"Successfully extracted {len(text)} characters using antiword")
+        return Document(content=text)
+
+    def _parse_with_textract(self, temp_file_path: str) -> Document:
+        logger.info(f"Parsing DOC file with textract: {temp_file_path}")
+        text = textract.process(temp_file_path, method="antiword").decode("utf-8")
+        logger.info(f"Successfully extracted {len(text)} bytes of DOC using textract")
+        return Document(content=str(text))
+
+    def _try_convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
        """Convert DOC file to DOCX format

        Uses LibreOffice/OpenOffice for conversion
@@ -141,21 +95,16 @@ class DocParser(BaseParser):
        """
        logger.info(f"Converting DOC to DOCX: {doc_path}")

+        # Check if LibreOffice or OpenOffice is installed
+        soffice_path = self._try_find_soffice()
+        if not soffice_path:
+            return None
+
+        # Execute conversion command
+        logger.info(f"Using {soffice_path} to convert DOC to DOCX")
+
        # Create a temporary directory to store the converted file
-        temp_dir = tempfile.mkdtemp()
-        docx_path = os.path.join(temp_dir, "converted.docx")
-
-        try:
-            # Check if LibreOffice or OpenOffice is installed
-            soffice_path = self._find_soffice_path()
-            if not soffice_path:
-                logger.error(
-                    "LibreOffice/OpenOffice not found, cannot convert DOC to DOCX"
-                )
-                return None
-
-            # Execute conversion command
-            logger.info(f"Using {soffice_path} to convert DOC to DOCX")
+        with TempDirContext() as temp_dir:
            cmd = [
                soffice_path,
                "--headless",
@@ -165,7 +114,6 @@ class DocParser(BaseParser):
                temp_dir,
                doc_path,
            ]
-
            logger.info(f"Running command: {' '.join(cmd)}")
            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
@@ -173,41 +121,68 @@ class DocParser(BaseParser):
            stdout, stderr = process.communicate()

            if process.returncode != 0:
-                logger.error(
-                    f"Error converting DOC to DOCX: {stderr.decode('utf-8', errors='ignore')}"
+                logger.warning(
+                    f"Error converting DOC to DOCX: {stderr.decode('utf-8')}"
                )
                return None

            # Find the converted file
-            for file in os.listdir(temp_dir):
-                if file.endswith(".docx"):
-                    converted_file = os.path.join(temp_dir, file)
-                    logger.info(f"Found converted file: {converted_file}")
-
-                    # Read the converted file content
-                    with open(converted_file, "rb") as f:
-                        docx_content = f.read()
-
-                    logger.info(
-                        f"Successfully read converted DOCX file, size: {len(docx_content)} bytes"
-                    )
-                    return docx_content
-
-            logger.error("No DOCX file found after conversion")
-            return None
-
-        except Exception as e:
-            logger.error(f"Error during DOC to DOCX conversion: {str(e)}")
-            return None
-        finally:
-            # Clean up temporary directory
-            try:
-                shutil.rmtree(temp_dir)
-                logger.info(f"Cleaned up temporary directory: {temp_dir}")
-            except Exception as e:
-                logger.warning(f"Failed to clean up temporary directory: {str(e)}")
-
-    def _find_soffice_path(self) -> Optional[str]:
+            docx_file = [
+                file for file in os.listdir(temp_dir) if file.endswith(".docx")
+            ]
+            logger.info(f"Found {len(docx_file)} DOCX file(s) in temporary directory")
+            for file in docx_file:
+                converted_file = os.path.join(temp_dir, file)
+                logger.info(f"Found converted file: {converted_file}")
+
+                # Read the converted file content
+                with open(converted_file, "rb") as f:
+                    docx_content = f.read()
+
+                logger.info(
+                    f"Successfully read DOCX file, size: {len(docx_content)}"
+                )
+                return docx_content
+        return None
+
+    def _try_find_executable_path(
+        self,
+        executable_name: str,
+        possible_path: List[str] = [],
+        environment_variable: List[str] = [],
+    ) -> Optional[str]:
+        """Find executable path
+
+        Args:
+            executable_name: Executable name
+            possible_path: List of possible paths
+            environment_variable: List of environment variables to check
+
+        Returns:
+            Executable path, or None if not found
+        """
+        # Common executable paths
+        paths: List[str] = []
+        paths.extend(possible_path)
+        paths.extend(os.environ.get(env_var, "") for env_var in environment_variable)
+        paths = list(set(paths))
+
+        # Check if path is set in environment variable
+        for path in paths:
+            if os.path.exists(path):
+                logger.info(f"Found {executable_name} at {path}")
+                return path
+
+        # Try to find in PATH
+        result = subprocess.run(
+            ["which", executable_name], capture_output=True, text=True
+        )
+        if result.returncode == 0 and result.stdout.strip():
+            path = result.stdout.strip()
+            logger.info(f"Found {executable_name} at {path}")
+            return path
+        logger.warning(f"Failed to find {executable_name}")
+        return None
+
+    def _try_find_soffice(self) -> Optional[str]:
        """Find LibreOffice/OpenOffice executable path

        Returns:
@@ -225,32 +200,13 @@ class DocParser(BaseParser):
            "C:\\Program Files\\LibreOffice\\program\\soffice.exe",
            "C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe",
        ]
+        return self._try_find_executable_path(
+            executable_name="soffice",
+            possible_path=possible_paths,
+            environment_variable=["LIBREOFFICE_PATH"],
+        )

-        # Check if path is set in environment variable
-        if os.environ.get("LIBREOFFICE_PATH"):
-            possible_paths.insert(0, os.environ.get("LIBREOFFICE_PATH"))
-
-        for path in possible_paths:
-            if os.path.exists(path):
-                logger.info(f"Found LibreOffice/OpenOffice at: {path}")
-                return path
-
-        # Try to find in PATH
-        try:
-            result = subprocess.run(
-                ["which", "soffice"], capture_output=True, text=True
-            )
-            if result.returncode == 0 and result.stdout.strip():
-                path = result.stdout.strip()
-                logger.info(f"Found LibreOffice/OpenOffice in PATH: {path}")
-                return path
-        except Exception:
-            pass
-
-        logger.warning("LibreOffice/OpenOffice not found")
-        return None
-
-    def _find_antiword_path(self) -> Optional[str]:
+    def _try_find_antiword(self) -> Optional[str]:
        """Find antiword executable path

        Returns:
@@ -265,51 +221,27 @@ class DocParser(BaseParser):
            "C:\\Program Files\\Antiword\\antiword.exe",
            "C:\\Program Files (x86)\\Antiword\\antiword.exe",
        ]
-
-        # Check if path is set in environment variable
-        if os.environ.get("ANTIWORD_PATH"):
-            possible_paths.insert(0, os.environ.get("ANTIWORD_PATH"))
-
-        for path in possible_paths:
-            if os.path.exists(path):
-                logger.info(f"Found antiword at: {path}")
-                return path
-
-        # Try to find in PATH
-        try:
-            result = subprocess.run(
-                ["which", "antiword"], capture_output=True, text=True
-            )
-            if result.returncode == 0 and result.stdout.strip():
-                path = result.stdout.strip()
-                logger.info(f"Found antiword in PATH: {path}")
-                return path
-        except Exception:
-            pass
-
-        logger.warning("antiword not found")
-        return None
+        return self._try_find_executable_path(
+            executable_name="antiword",
+            possible_path=possible_paths,
+            environment_variable=["ANTIWORD_PATH"],
+        )

 if __name__ == "__main__":
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-        datefmt="%Y-%m-%d %H:%M:%S",
-    )
-    logger.info("Running DocParser in standalone mode")
+    logging.basicConfig(level=logging.DEBUG)

    file_name = "/path/to/your/test.doc"
    logger.info(f"Processing file: {file_name}")
    doc_parser = DocParser(
-        file_name, enable_multimodal=True, chunk_size=512, chunk_overlap=60
+        file_name=file_name,
+        enable_multimodal=True,
+        chunk_size=512,
+        chunk_overlap=60,
    )
-    logger.info("Parser initialized, starting processing")
    with open(file_name, "rb") as f:
        content = f.read()
-    text = doc_parser.parse_into_text(content)
-    logger.info(f"Processing complete, extracted text length: {len(text)}")
-    logger.info(f"Sample text: {text[:200]}...")
+    document = doc_parser.parse_into_text(content)
+    logger.info(f"Processing complete, extracted text length: {len(document.content)}")
+    logger.info(f"Sample text: {document.content[:200]}...")


@@ -0,0 +1,28 @@
import logging
from docreader.parser.chain_parser import FirstParser
from docreader.parser.docx_parser import DocxParser
from docreader.parser.markitdown_parser import MarkitdownParser
logger = logging.getLogger(__name__)
class Docx2Parser(FirstParser):
_parser_cls = (MarkitdownParser, DocxParser)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
your_file = "/path/to/your/file.docx"
parser = Docx2Parser(separators=[".", "?", "!", "。", "?", "!"])
with open(your_file, "rb") as f:
content = f.read()
document = parser.parse(content)
for cc in document.chunks:
logger.info(f"chunk: {cc}")
# document = parser.parse_into_text(content)
# logger.info(f"docx content: {document.content}")
# logger.info(f"find images {document.images.keys()}")


@@ -1,37 +1,36 @@
 import logging
-import tempfile
 import os
-import sys
-import time
-from io import BytesIO
-from typing import Optional, Dict, Any, Tuple, List, Union
-from dataclasses import dataclass, field
-from PIL import Image
-from docx import Document
-from docx.image.exceptions import (
-    UnrecognizedImageError,
-    UnexpectedEndOfFileError,
-    InvalidImageStreamError,
-)
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
+import re
 import tempfile
 import threading
+import time
 import traceback
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from dataclasses import dataclass, field
+from io import BytesIO
 from multiprocessing import Manager
-import re
-
-from .base_parser import BaseParser
+from typing import Any, Dict, List, Optional, Tuple
+
+from docx import Document
+from docx.image.exceptions import (
+    InvalidImageStreamError,
+    UnexpectedEndOfFileError,
+    UnrecognizedImageError,
+)
+from PIL import Image
+
+from docreader.models.document import Document as DocumentModel
+from docreader.parser.base_parser import BaseParser
+from docreader.utils import endecode

 logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)

 # Add thread local storage to track the processing status of each thread
 thread_local = threading.local()

 class ImageData:
    """Represents a processed image of document content"""

    local_path: str = ""
-    object: Image.Image = None
+    object: Optional[Image.Image] = None
    url: str = ""

@@ -40,7 +39,9 @@ class LineData:
    """Represents a processed line of document content with associated images"""

    text: str = ""  # Extracted text content
-    images: List[ImageData] = field(default_factory=list)  # List of images or image paths
+    images: List[ImageData] = field(
+        default_factory=list
+    )  # List of images or image paths
    extra_info: str = ""  # Placeholder for additional info (currently unused)
    page_num: int = 0  # Page number
    content_sequence: List[Tuple[str, Any]] = field(
@@ -53,18 +54,8 @@ class DocxParser(BaseParser):

    def __init__(
        self,
-        file_name: str = "",
-        file_type: str = None,
-        enable_multimodal: bool = True,
-        chunk_size: int = 1000,
-        chunk_overlap: int = 200,
-        separators: list = ["\n\n", "\n", ""],
-        ocr_backend: str = "paddle",
-        ocr_config: dict = None,
-        max_image_size: int = 1920,
-        max_concurrent_tasks: int = 5,
-        max_pages: int = 100,  # Maximum number of pages to process, default to 50 pages
-        chunking_config=None,
+        max_pages: int = 100,  # Maximum number of pages to process
+        **kwargs,
    ):
        """Initialize DOCX document parser

@@ -79,37 +70,16 @@ class DocxParser(BaseParser):
            ocr_config: OCR engine configuration
            max_image_size: Maximum image size limit
            max_concurrent_tasks: Maximum number of concurrent tasks
-            max_pages: Maximum number of pages to process, if more than this, only process the first max_pages pages
+            max_pages: Maximum number of pages to process
        """
-        super().__init__(
-            file_name=file_name,
-            file_type=file_type,
-            enable_multimodal=enable_multimodal,
-            chunk_size=chunk_size,
-            chunk_overlap=chunk_overlap,
-            separators=separators,
-            ocr_backend=ocr_backend,
-            ocr_config=ocr_config,
-            max_image_size=max_image_size,
-            max_concurrent_tasks=max_concurrent_tasks,
-            chunking_config=chunking_config,
-        )
+        super().__init__(**kwargs)
        self.max_pages = max_pages
        logger.info(f"DocxParser initialized with max_pages={max_pages}")

-    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
-        """Parse DOCX document, extract text content and image Markdown links
-
-        Args:
-            content: DOCX document content
-
-        Returns:
-            Tuple of (parsed_text, image_map) where image_map maps image URLs to Image objects
-            All LineData objects are used internally but not returned directly through this interface
-        """
+    def parse_into_text(self, content: bytes) -> DocumentModel:
+        """Parse DOCX document, extract text content and image Markdown links"""
        logger.info(f"Parsing DOCX document, content size: {len(content)} bytes")
        logger.info(f"Max pages limit set to: {self.max_pages}")
-        logger.info("Converting DOCX content to sections and tables")

        start_time = time.time()

        # Use concurrent processing to handle the document
@@ -123,7 +93,7 @@ class DocxParser(BaseParser):
            docx_processor = Docx(
                max_image_size=self.max_image_size,
                enable_multimodal=self.enable_multimodal,
-                upload_file=self.upload_file,
+                upload_file=self.storage.upload_file,
            )
            all_lines, tables = docx_processor(
                binary=content,
@@ -140,7 +110,7 @@ class DocxParser(BaseParser):
            section_start_time = time.time()

            text_parts = []
-            image_parts = {}
+            image_parts: Dict[str, str] = {}

            for sec_idx, line in enumerate(all_lines):
                try:
@@ -148,16 +118,19 @@ class DocxParser(BaseParser):
                        text_parts.append(line.text)
                        if sec_idx < 3 or sec_idx % 50 == 0:
                            logger.info(
-                                f"Added section {sec_idx+1} text: {line.text[:50]}..."
+                                f"Added section {sec_idx + 1} text: {line.text[:50]}..."
                                if len(line.text) > 50
-                                else f"Added section {sec_idx+1} text: {line.text}"
+                                else f"Added section {sec_idx + 1} text: {line.text}"
                            )
                    if line.images:
                        for image_data in line.images:
-                            if image_data.url:
-                                image_parts[image_data.url] = image_data.object
+                            if image_data.url and image_data.object:
+                                image_parts[image_data.url] = endecode.decode_image(
+                                    image_data.object
+                                )
+                                image_data.object.close()
                except Exception as e:
-                    logger.error(f"Error processing section {sec_idx+1}: {str(e)}")
+                    logger.error(f"Error processing section {sec_idx + 1}: {str(e)}")
                    logger.error(f"Detailed stack trace: {traceback.format_exc()}")
                    continue
@@ -176,17 +149,17 @@ class DocxParser(BaseParser):

            total_processing_time = time.time() - start_time
            logger.info(
-                f"Parsing complete in {total_processing_time:.2f}s, generated {len(text)} characters of text "
+                f"Parsing complete in {total_processing_time:.2f}s, "
+                f"generated {len(text)} characters of text"
            )

-            return text, image_parts
+            return DocumentModel(content=text, images=image_parts)
        except Exception as e:
            logger.error(f"Error parsing DOCX document: {str(e)}")
            logger.error(f"Detailed stack trace: {traceback.format_exc()}")
-            fallback_text = self._parse_using_simple_method(content)
-            return fallback_text, {}
+            return self._parse_using_simple_method(content)

-    def _parse_using_simple_method(self, content: bytes) -> str:
+    def _parse_using_simple_method(self, content: bytes) -> DocumentModel:
        """Parse document using a simplified method, as a fallback

        Args:
@@ -201,7 +174,8 @@ class DocxParser(BaseParser):
            doc = Document(BytesIO(content))
            logger.info(
                f"Successfully loaded document in simplified method, "
-                f"contains {len(doc.paragraphs)} paragraphs and {len(doc.tables)} tables"
+                f"contains {len(doc.paragraphs)} paragraphs "
+                f"and {len(doc.tables)} tables"
            )

            text_parts = []
@@ -211,7 +185,7 @@ class DocxParser(BaseParser):
            para_with_text = 0
            for i, para in enumerate(doc.paragraphs):
                if i % 100 == 0:
-                    logger.info(f"Processing paragraph {i+1}/{para_count}")
+                    logger.info(f"Processing paragraph {i + 1}/{para_count}")
                if para.text.strip():
                    text_parts.append(para.text.strip())
                    para_with_text += 1
@@ -225,7 +199,7 @@ class DocxParser(BaseParser):
            rows_processed = 0
            for i, table in enumerate(doc.tables):
                if i % 10 == 0:
-                    logger.info(f"Processing table {i+1}/{table_count}")
+                    logger.info(f"Processing table {i + 1}/{table_count}")

                table_has_content = False
                for row in table.rows:
@@ -256,25 +230,24 @@ class DocxParser(BaseParser):
            # If the result is still empty, return an error message
            if not result_text:
                logger.warning("No text extracted using simplified method")
-                return "", {}
+                return DocumentModel()

-            return result_text, {}
+            return DocumentModel(content=result_text)
        except Exception as backup_error:
            processing_time = time.time() - start_time
            logger.error(
-                f"Simplified parsing failed after {processing_time:.2f}s: {str(backup_error)}"
+                f"Simplified parsing failed {processing_time:.2f}s: {backup_error}"
            )
            logger.error(f"Detailed traceback: {traceback.format_exc()}")
-            return "", {}
+            return DocumentModel()

 class Docx:
    def __init__(self, max_image_size=1920, enable_multimodal=False, upload_file=None):
        logger.info("Initializing DOCX processor")
        self.max_image_size = max_image_size  # Maximum image size limit
-        self.picture_cache = (
-            {}
-        )  # Image cache to avoid processing the same image repeatedly
+        # Image cache to avoid processing the same image repeatedly
+        self.picture_cache = {}
        self.enable_multimodal = enable_multimodal
        self.upload_file = upload_file

@@ -454,7 +427,6 @@ class Docx:
        return page_to_paragraphs

-
    def __call__(
        self,
        binary: Optional[bytes] = None,
@@ -611,7 +583,6 @@ class Docx:
        return pages_to_process

-
    def _process_document(
        self,
        binary,
@@ -806,7 +777,9 @@ class Docx:
                    # Collect temporary image paths for later cleanup
                    for line in page_lines:
                        for image_data in line.images:
-                            if image_data.local_path and image_data.local_path.startswith("/tmp/docx_img_"):
+                            if image_data.local_path and image_data.local_path.startswith(
+                                "/tmp/docx_img_"
+                            ):
                                temp_img_paths.add(image_data.local_path)

                    results.extend(page_lines)
@@ -876,7 +849,11 @@ class Docx:
            # Process all image data objects
            for image_data in image_paths:
-                if image_data.local_path and os.path.exists(image_data.local_path) and image_data.local_path not in image_url_map:
+                if (
+                    image_data.local_path
+                    and os.path.exists(image_data.local_path)
+                    and image_data.local_path not in image_url_map
+                ):
                    try:
                        # Upload the image if it doesn't have a URL yet
                        if not image_data.url:
@@ -886,12 +863,16 @@ class Docx:
                                image_data.url = image_url
                                # Add image URL as Markdown format
                                markdown_image = f"![]({image_url})"
-                                image_url_map[image_data.local_path] = markdown_image
+                                image_url_map[image_data.local_path] = (
+                                    markdown_image
+                                )
                                logger.info(
                                    f"Added image URL for {image_data.local_path}: {image_url}"
                                )
                            else:
-                                logger.warning(f"Failed to upload image: {image_data.local_path}")
+                                logger.warning(
+                                    f"Failed to upload image: {image_data.local_path}"
+                                )
                        else:
                            # Already has a URL, use it
                            markdown_image = f"![]({image_data.url})"
@@ -925,12 +906,19 @@ class Docx:
                        # For ImageData objects, use the URL
                        if isinstance(content, str) and content in image_url_map:
                            combined_parts.append(image_url_map[content])
-                        elif hasattr(content, 'local_path') and content.local_path in image_url_map:
+                        elif (
+                            hasattr(content, "local_path")
+                            and content.local_path in image_url_map
+                        ):
                            combined_parts.append(image_url_map[content.local_path])

                    # Create the final text with proper ordering
                    final_text = "\n\n".join(part for part in combined_parts if part)
-                    processed_lines.append(LineData(text=final_text, page_num=page_num, images=line_data.images))
+                    processed_lines.append(
+                        LineData(
+                            text=final_text, page_num=page_num, images=line_data.images
+                        )
+                    )
            else:
                processed_lines = lines

@@ -1003,11 +991,11 @@ class Docx:
        logger.info(f"Processing {table_count} tables")
        for tb_idx, tb in enumerate(self.doc.tables):
            if tb_idx % 10 == 0:  # Log only every 10 tables to reduce log volume
-                logger.info(f"Processing table {tb_idx+1}/{table_count}")
+                logger.info(f"Processing table {tb_idx + 1}/{table_count}")

            # Optimize: Check if table is empty
            if len(tb.rows) == 0 or all(len(r.cells) == 0 for r in tb.rows):
-                logger.info(f"Skipping empty table {tb_idx+1}")
+                logger.info(f"Skipping empty table {tb_idx + 1}")
                continue

            table_html = self._convert_table_to_html(tb)
@@ -1111,8 +1099,8 @@ def _save_image_to_temp(logger, image, page_num, img_idx):
    if not image:
        return None

-    import tempfile
    import os
+    import tempfile

    try:
        # Create a temporary file
@@ -1187,8 +1175,15 @@ def process_page_multiprocess(
        return []

    # Extract page content
-    combined_text, image_objects, content_sequence = _extract_page_content_in_process(
-        process_logger, doc, page_num, paragraphs, enable_multimodal, max_image_size
+    combined_text, image_objects, content_sequence = (
+        _extract_page_content_in_process(
+            process_logger,
+            doc,
+            page_num,
+            paragraphs,
+            enable_multimodal,
+            max_image_size,
+        )
    )

    # Process content sequence to maintain order between processes
@@ -1199,7 +1194,9 @@ def process_page_multiprocess(
    if enable_multimodal:
        # First pass: save all images to temporary files
        for i, image_object in enumerate(image_objects):
-            img_path = _save_image_to_temp(process_logger, image_object, page_num, i)
+            img_path = _save_image_to_temp(
+                process_logger, image_object, page_num, i
+            )
            if img_path:
                # Create ImageData object
                image_data = ImageData()
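
Note: the hunks above fan pages out to worker processes and collect them as they finish. A condensed sketch of that dispatch pattern (process_page_multiprocess's real parameter list is longer than shown here):

import logging
from concurrent.futures import ProcessPoolExecutor, as_completed

logger = logging.getLogger(__name__)

def dispatch_pages(pages, worker, max_workers=4):
    # `worker` stands in for process_page_multiprocess; each future yields
    # the list of LineData objects produced for one page
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(worker, page): page for page in pages}
        for future in as_completed(futures):
            page_num = futures[future]
            try:
                results.extend(future.result())
            except Exception as e:
                logger.error(f"Page {page_num} failed: {e}")
    return results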


@@ -1,15 +1,13 @@
+import base64
 import logging
 import os
-import asyncio
-from PIL import Image
-import io
-from typing import Dict, Any, Tuple, Union
-from .base_parser import BaseParser, ParseResult
-import numpy as np
+
+from docreader.models.document import Document
+from docreader.parser.base_parser import BaseParser

 # Set up logger for this module
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)

 class ImageParser(BaseParser):
    """
@@ -23,46 +21,24 @@ class ImageParser(BaseParser):
    4. Returning a combined result with both text and image reference
    """

-    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
+    def parse_into_text(self, content: bytes) -> Document:
        """
-        Parse image content, upload the image and return Markdown reference along with image map.
-
-        Args:
-            content: Raw image data (bytes)
-
-        Returns:
-            Tuple of (markdown_text, image_map) where image_map maps image URLs to PIL Image objects
+        Parse image content into markdown text
+        :param content: bytes content of the image
+        :return: Document object
        """
        logger.info(f"Parsing image content, size: {len(content)} bytes")
-        image_map = {}
-        try:
-            # Upload image to storage service
-            logger.info("Uploading image to storage")
-            _, ext = os.path.splitext(self.file_name)
-            image_url = self.upload_bytes(content, file_ext=ext)
-            if not image_url:
-                logger.error("Failed to upload image to storage")
-                return "", {}
-            logger.info(
-                f"Successfully uploaded image, URL: {image_url[:50]}..."
-                if len(image_url) > 50
-                else f"Successfully uploaded image, URL: {image_url}"
-            )
-
-            # Create image object and add to map
-            try:
-                from PIL import Image
-                import io
-
-                image = Image.open(io.BytesIO(content))
-                image_map[image_url] = image
-                logger.info(f"Added image to image_map for URL: {image_url}")
-            except Exception as img_err:
-                logger.error(f"Error creating image object: {str(img_err)}")
-
-            markdown_text = f"![{self.file_name}]({image_url})"
-            return markdown_text, image_map
-
-        except Exception as e:
-            logger.error(f"Error parsing image: {str(e)}")
-            return "", {}
+
+        # Get file extension
+        ext = os.path.splitext(self.file_name)[1].lower()
+
+        # Upload image to storage
+        image_url = self.storage.upload_bytes(content, file_ext=ext)
+        logger.info(f"Successfully uploaded image, URL: {image_url[:50]}...")
+
+        # Generate markdown text
+        text = f"![{self.file_name}]({image_url})"
+        images = {image_url: base64.b64encode(content).decode()}
+
+        # Create image object and add to map
+        return Document(content=text, images=images)
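
A short usage sketch (the file name is a placeholder, and the storage backend behind self.storage is assumed to be configured):

parser = ImageParser(file_name="diagram.png")
with open("diagram.png", "rb") as f:
    doc = parser.parse_into_text(f.read())
# doc.content is a Markdown reference such as "![diagram.png](<uploaded-url>)"
# doc.images maps that URL to the base64-encoded original bytes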


@@ -1,43 +0,0 @@
import base64
import io
import logging
from typing import Union
from PIL import Image
import numpy as np
logger = logging.getLogger(__name__)
def image_to_base64(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
"""Convert image to base64 encoded string
Args:
image: Image file path, bytes, PIL Image object, or numpy array
Returns:
Base64 encoded image string, or empty string if conversion fails
"""
try:
if isinstance(image, str):
# It's a file path
with open(image, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
elif isinstance(image, bytes):
# It's bytes data
return base64.b64encode(image).decode("utf-8")
elif isinstance(image, Image.Image):
# It's a PIL Image
buffer = io.BytesIO()
image.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("utf-8")
elif isinstance(image, np.ndarray):
# It's a numpy array
pil_image = Image.fromarray(image)
buffer = io.BytesIO()
pil_image.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("utf-8")
else:
logger.error(f"Unsupported image type: {type(image)}")
return ""
except Exception as e:
logger.error(f"Error converting image to base64: {str(e)}")
return ""


@@ -0,0 +1,111 @@
import logging
import re
import uuid
from typing import Dict, List, Match, Optional, Tuple
from docreader.utils import endecode
# Get logger object
logger = logging.getLogger(__name__)
class MarkdownImageUtil:
def __init__(self):
self.b64_pattern = re.compile(
r"!\[([^\]]*)\]\(data:image/(\w+);base64,([^\)]+)\)"
)
self.image_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
self.replace_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
def extract_image(
self,
content: str,
path_prefix: Optional[str] = None,
replace: bool = True,
) -> Tuple[str, List[str]]:
"""Extract base64 encoded images from Markdown content"""
# image_path => base64 bytes
images: List[str] = []
def repl(match: Match[str]) -> str:
title = match.group(1)
image_path = match.group(2)
if path_prefix:
image_path = f"{path_prefix}/{image_path}"
images.append(image_path)
if not replace:
return match.group(0)
# Replace image path with URL
return f"![{title}]({image_path})"
text = self.image_pattern.sub(repl, content)
logger.debug(f"Extracted {len(images)} images from markdown")
return text, images
def extract_base64(
self,
content: str,
path_prefix: Optional[str] = None,
replace: bool = True,
) -> Tuple[str, Dict[str, bytes]]:
"""Extract base64 encoded images from Markdown content"""
# image_path => base64 bytes
images: Dict[str, bytes] = {}
def repl(match: Match[str]) -> str:
title = match.group(1)
img_ext = match.group(2)
img_b64 = match.group(3)
image_byte = endecode.encode_image(img_b64, errors="ignore")
if not image_byte:
logger.error(f"Failed to decode base64 image skip it: {img_b64}")
return title
image_path = f"{uuid.uuid4()}.{img_ext}"
if path_prefix:
image_path = f"{path_prefix}/{image_path}"
images[image_path] = image_byte
if not replace:
return match.group(0)
# Replace image path with URL
return f"![{title}]({image_path})"
text = self.b64_pattern.sub(repl, content)
logger.debug(f"Extracted {len(images)} base64 images from markdown")
return text, images
def replace_path(self, content: str, images: Dict[str, str]) -> str:
content_replace: set = set()
def repl(match: Match[str]) -> str:
title = match.group(1)
image_path = match.group(2)
if image_path not in images:
return match.group(0)
content_replace.add(image_path)
image_path = images[image_path]
return f"![{title}]({image_path})"
text = self.replace_pattern.sub(repl, content)
logger.debug(f"Replaced {len(content_replace)} images in markdown")
return text
if __name__ == "__main__":
your_content = "test![](data:image/png;base64,iVBORw0KGgoAAAA)test"
image_handle = MarkdownImageUtil()
text, images = image_handle.extract_base64(your_content)
print(text)
for image_url, image_byte in images.items():
with open(image_url, "wb") as f:
f.write(image_byte)
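
A short round-trip sketch of how the parsers below use extract_base64 together with replace_path (the remote URL is a placeholder):

util = MarkdownImageUtil()
text, images = util.extract_base64(
    "![fig](data:image/png;base64,iVBORw0KGgo=)", path_prefix="images"
)
# `text` now references a generated local path such as images/<uuid>.png;
# after uploading, point the same reference at the hosted URL
local_path = next(iter(images))
text = util.replace_path(text, {local_path: "https://example.com/fig.png"})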


@@ -1,33 +1,53 @@
-import asyncio
-import re
+import base64
 import logging
-import numpy as np
-import os  # Import os module to get environment variables
-from typing import Dict, List, Optional, Tuple, Union, Any
-
-from .base_parser import BaseParser
+import os
+from typing import Dict
+
+from docreader.models.document import Document
+from docreader.parser.base_parser import BaseParser
+from docreader.parser.chain_parser import PipelineParser
+from docreader.parser.markdown_image_util import MarkdownImageUtil
+from docreader.utils import endecode

 # Get logger object
 logger = logging.getLogger(__name__)

-class MarkdownParser(BaseParser):
-    """Markdown document parser"""
-
-    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
-        """Parse Markdown document, only extract text content, do not process images
-
-        Args:
-            content: Markdown document content
-
-        Returns:
-            Parsed text result
-        """
-        logger.info(f"Parsing Markdown document, content size: {len(content)} bytes")
+class MarkdownImageBase64(BaseParser):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.image_helper = MarkdownImageUtil()
+
+    def parse_into_text(self, content: bytes) -> Document:
        # Convert byte content to string using universal decoding method
-        text = self.decode_bytes(content)
-        logger.info(f"Decoded Markdown content, text length: {len(text)} characters")
-        logger.info(f"Markdown parsing complete, extracted {len(text)} characters of text")
-        return text
+        text = endecode.decode_bytes(content)
+        text, img_b64 = self.image_helper.extract_base64(text, path_prefix="images")
+        images: Dict[str, str] = {}
+        image_replace: Dict[str, str] = {}
+        logger.debug(f"Uploading {len(img_b64)} images from markdown")
+        for ipath, b64_bytes in img_b64.items():
+            ext = os.path.splitext(ipath)[1].lower()
+            image_url = self.storage.upload_bytes(b64_bytes, ext)
+            image_replace[ipath] = image_url
+            images[image_url] = base64.b64encode(b64_bytes).decode()
+        text = self.image_helper.replace_path(text, image_replace)
+        return Document(content=text, images=images)
+
+class MarkdownParser(PipelineParser):
+    _parser_cls = (MarkdownImageBase64,)
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+
+    your_content = "test![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMgA)test"
+    parser = MarkdownParser()
+    document = parser.parse_into_text(your_content.encode())
+    logger.info(document.content)
+    logger.info(f"Images: {len(document.images)}, name: {document.images.keys()}")


@@ -0,0 +1,31 @@
import io
import logging
from markitdown import MarkItDown
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownParser
logger = logging.getLogger(__name__)
class StdMarkitdownParser(BaseParser):
"""
PDF Document Parser
This parser handles PDF documents by extracting text content.
It uses the markitdown library for simple text extraction.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.markitdown = MarkItDown()
def parse_into_text(self, content: bytes) -> Document:
result = self.markitdown.convert(io.BytesIO(content), keep_data_uris=True)
return Document(content=result.text_content)
class MarkitdownParser(PipelineParser):
_parser_cls = (StdMarkitdownParser, MarkdownParser)
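
Note: the pipeline order matters here. StdMarkitdownParser keeps embedded images as data URIs (keep_data_uris=True), and the downstream MarkdownParser stage is what extracts those base64 payloads, uploads them, and rewrites the references, so the final Document carries hosted URLs instead of inline data.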


@@ -0,0 +1,124 @@
import logging
import os
import re
from typing import Dict
import markdownify
import requests
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.markdown_parser import MarkdownImageUtil
from docreader.utils import endecode
logger = logging.getLogger(__name__)
class MinerUParser(BaseParser):
def __init__(
self,
enable_markdownify: bool = True,
mineru_endpoint: str = "",
**kwargs,
):
super().__init__(**kwargs)
self.minerU = os.getenv("MINERU_ENDPOINT", mineru_endpoint)
self.enable_markdownify = enable_markdownify
self.image_helper = MarkdownImageUtil()
self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
self.enable = self.ping()
assert self.enable, "MinerU API is not reachable"
def ping(self, timeout: int = 5) -> bool:
try:
response = requests.get(
self.minerU + "/docs", timeout=timeout, allow_redirects=True
)
response.raise_for_status()
return True
except Exception:
return False
def parse_into_text(self, content: bytes) -> Document:
logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)")
md_content: str = ""
images_b64: Dict[str, str] = {}
try:
response = requests.post(
url=self.minerU + "/file_parse",
data={
"return_md": True,
"return_images": True,
"lang_list": ["ch", "en"],
"table_enable": True,
"formula_enable": True,
"parse_method": "auto",
"start_page_id": 0,
"end_page_id": 99999,
"backend": "pipeline",
"response_format_zip": False,
"return_middle_json": False,
"return_model_output": False,
"return_content_list": False,
},
files={"files": content},
timeout=1000,
)
response.raise_for_status()
result = response.json()["results"]["files"]
md_content = result["md_content"]
images_b64 = result.get("images", {})
except Exception as e:
logger.error(f"MinerU parsing failed: {e}", exc_info=True)
return Document()
# convert table(HTML) in markdown to markdown table
if self.enable_markdownify:
logger.debug("Converting HTML to Markdown")
md_content = markdownify.markdownify(md_content)
images = {}
image_replace = {}
# images in images_b64 may not be used in md_content
# such as: table ...
# so we need to filter them
for ipath, b64_str in images_b64.items():
if f"images/{ipath}" not in md_content:
logger.debug(f"Image {ipath} not used in markdown")
continue
match = self.base64_pattern.match(b64_str)
if match:
file_ext = match.group(1)
b64_str = match.group(2)
image_bytes = endecode.encode_image(b64_str, errors="ignore")
if not image_bytes:
logger.error("Failed to decode base64 image skip it")
continue
image_url = self.storage.upload_bytes(
image_bytes, file_ext=f".{file_ext}"
)
images[image_url] = b64_str
image_replace[f"images/{ipath}"] = image_url
logger.info(f"Replaced {len(image_replace)} images in markdown")
text = self.image_helper.replace_path(md_content, image_replace)
logger.info(
f"Successfully parsed PDF, text: {len(text)}, images: {len(images)}"
)
return Document(content=text, images=images)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
your_file = "/path/to/your/file.pdf"
your_mineru = "http://host.docker.internal:9987"
parser = MinerUParser(mineru_endpoint=your_mineru)
with open(your_file, "rb") as f:
content = f.read()
document = parser.parse_into_text(content)
logger.error(document.content)
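
Note: the endpoint can also come from the MINERU_ENDPOINT environment variable, which takes precedence over the constructor argument when set (see __init__ above); the URL below is a placeholder:

import os

os.environ["MINERU_ENDPOINT"] = "http://localhost:9987"  # placeholder
parser = MinerUParser()  # the assert in __init__ fails fast if unreachable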


@@ -1,71 +1,96 @@
import os
import logging
import base64
from typing import Optional, Union, Dict, Any
from abc import ABC, abstractmethod
from PIL import Image
import io import io
import logging
import os
import platform
import subprocess
from abc import ABC, abstractmethod
from typing import Dict, Union
import numpy as np import numpy as np
from .image_utils import image_to_base64 from openai import OpenAI
from PIL import Image
from docreader.utils import endecode
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class OCRBackend(ABC): class OCRBackend(ABC):
"""Base class for OCR backends""" """Base class for OCR backends"""
@abstractmethod @abstractmethod
def predict(self, image: Union[str, bytes, Image.Image]) -> str: def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image """Extract text from an image
Args: Args:
image: Image file path, bytes, or PIL Image object image: Image file path, bytes, or PIL Image object
Returns: Returns:
Extracted text Extracted text
""" """
pass pass
class DummyOCRBackend(OCRBackend):
"""Dummy OCR backend implementation"""
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
logger.warning("Dummy OCR backend is used")
return ""
class PaddleOCRBackend(OCRBackend): class PaddleOCRBackend(OCRBackend):
"""PaddleOCR backend implementation""" """PaddleOCR backend implementation"""
def __init__(self, **kwargs): def __init__(self):
"""Initialize PaddleOCR backend""" """Initialize PaddleOCR backend"""
self.ocr = None self.ocr = None
try: try:
import os
import paddle import paddle
# Set PaddlePaddle to use CPU and disable GPU # Set PaddlePaddle to use CPU and disable GPU
os.environ['CUDA_VISIBLE_DEVICES'] = '' os.environ["CUDA_VISIBLE_DEVICES"] = ""
paddle.set_device('cpu') paddle.device.set_device("cpu")
# 尝试检测CPU是否支持AVX指令集 # 尝试检测CPU是否支持AVX指令集
try: try:
import subprocess
import platform
# 检测CPU是否支持AVX # 检测CPU是否支持AVX
if platform.system() == "Linux": if platform.system() == "Linux":
try: try:
result = subprocess.run(['grep', '-o', 'avx', '/proc/cpuinfo'], result = subprocess.run(
capture_output=True, text=True, timeout=5) ["grep", "-o", "avx", "/proc/cpuinfo"],
has_avx = 'avx' in result.stdout.lower() capture_output=True,
text=True,
timeout=5,
)
has_avx = "avx" in result.stdout.lower()
if not has_avx: if not has_avx:
logger.warning("CPU does not support AVX instructions, using compatibility mode") logger.warning(
"CPU does not support AVX instructions, "
"using compatibility mode"
)
# 进一步限制指令集使用 # 进一步限制指令集使用
os.environ['FLAGS_use_avx2'] = '0' os.environ["FLAGS_use_avx2"] = "0"
os.environ['FLAGS_use_avx'] = '1' os.environ["FLAGS_use_avx"] = "1"
except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError): except (
logger.warning("Could not detect AVX support, using compatibility mode") subprocess.TimeoutExpired,
os.environ['FLAGS_use_avx2'] = '0' FileNotFoundError,
os.environ['FLAGS_use_avx'] = '1' subprocess.SubprocessError,
):
logger.warning(
"Could not detect AVX support, using compatibility mode"
)
os.environ["FLAGS_use_avx2"] = "0"
os.environ["FLAGS_use_avx"] = "1"
except Exception as e: except Exception as e:
logger.warning(f"Error detecting CPU capabilities: {e}, using compatibility mode") logger.warning(
os.environ['FLAGS_use_avx2'] = '0' f"Error detecting CPU capabilities: {e}, using compatibility mode"
os.environ['FLAGS_use_avx'] = '1' )
os.environ["FLAGS_use_avx2"] = "0"
os.environ["FLAGS_use_avx"] = "1"
from paddleocr import PaddleOCR from paddleocr import PaddleOCR
# OCR configuration with text orientation classification enabled # OCR configuration with text orientation classification enabled
ocr_config = { ocr_config = {
"use_gpu": False, "use_gpu": False,
@@ -86,23 +111,53 @@ class PaddleOCRBackend(OCRBackend):
"use_dilation": True, # improves accuracy "use_dilation": True, # improves accuracy
"det_db_score_mode": "slow", # improves accuracy "det_db_score_mode": "slow", # improves accuracy
} }
self.ocr = PaddleOCR(**ocr_config) self.ocr = PaddleOCR(**ocr_config)
logger.info("PaddleOCR engine initialized successfully") logger.info("PaddleOCR engine initialized successfully")
except ImportError as e: except ImportError as e:
logger.error(f"Failed to import paddleocr: {str(e)}. Please install it with 'pip install paddleocr'") logger.error(
f"Failed to import paddleocr: {str(e)}. "
"Please install it with 'pip install paddleocr'"
)
except OSError as e: except OSError as e:
if "Illegal instruction" in str(e) or "core dumped" in str(e): if "Illegal instruction" in str(e) or "core dumped" in str(e):
logger.error(f"PaddlePaddle crashed due to CPU instruction set incompatibility: {str(e)}") logger.error(
logger.error("This usually happens when the CPU doesn't support AVX instructions.") f"PaddlePaddle crashed due to CPU instruction set incompatibility: "
logger.error("Please try installing a CPU-only version of PaddlePaddle or use a different OCR backend.") f"{e}"
)
logger.error(
"This happens when the CPU doesn't support AVX instructions. "
"Try install CPU-only version of PaddlePaddle, "
"or use a different OCR backend."
)
else: else:
logger.error(f"Failed to initialize PaddleOCR due to OS error: {str(e)}") logger.error(
f"Failed to initialize PaddleOCR due to OS error: {str(e)}"
)
except Exception as e: except Exception as e:
logger.error(f"Failed to initialize PaddleOCR: {str(e)}") logger.error(f"Failed to initialize PaddleOCR: {str(e)}")
def predict(self, image): def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image
Args:
image: Image file path, bytes, or PIL Image object
Returns:
Extracted text
"""
if isinstance(image, str):
image = Image.open(image)
elif isinstance(image, bytes):
image = Image.open(io.BytesIO(image))
if not isinstance(image, Image.Image):
raise TypeError("image must be a string, bytes, or PIL Image object")
return self._predict(image)
def _predict(self, image: Image.Image) -> str:
"""Perform OCR recognition on the image """Perform OCR recognition on the image
Args: Args:
@@ -111,63 +166,59 @@ class PaddleOCRBackend(OCRBackend):
Returns: Returns:
Extracted text string Extracted text string
""" """
if self.ocr is None:
logger.error("PaddleOCR engine not initialized")
return ""
try: try:
# Ensure image is in RGB format # Ensure image is in RGB format
if hasattr(image, "convert") and image.mode != "RGB": if image.mode != "RGB":
image = image.convert("RGB") image = image.convert("RGB")
# Convert to numpy array if needed # Convert to numpy array if needed
if hasattr(image, "convert"): image_array = np.array(image)
image_array = np.array(image)
else:
image_array = image
# Perform OCR # Perform OCR
ocr_result = self.ocr.ocr(image_array, cls=False) ocr_result = self.ocr.ocr(image_array, cls=False)
# Extract text # Extract text
ocr_text = "" ocr_text = ""
if ocr_result and ocr_result[0]: if ocr_result and ocr_result[0]:
for line in ocr_result[0]: text = [
if line and len(line) >= 2: line[1][0] if line and len(line) >= 2 and line[1] else ""
text = line[1][0] if line[1] else "" for line in ocr_result[0]
if text: ]
ocr_text += text + " " text = [t.strip() for t in text if t]
ocr_text = " ".join(text)
text_length = len(ocr_text.strip())
if text_length > 0: logger.info(f"OCR extracted {len(ocr_text)} characters")
logger.info(f"OCR extracted {text_length} characters") return ocr_text
return ocr_text.strip()
else:
logger.warning("OCR returned empty result")
return ""
except Exception as e: except Exception as e:
logger.error(f"OCR recognition error: {str(e)}") logger.error(f"OCR recognition error: {str(e)}")
return "" return ""
class NanonetsOCRBackend(OCRBackend): class NanonetsOCRBackend(OCRBackend):
"""Nanonets OCR backend implementation using OpenAI API format""" """Nanonets OCR backend implementation using OpenAI API format"""
def __init__(self, **kwargs): def __init__(self):
"""Initialize Nanonets OCR backend """Initialize Nanonets OCR backend
Args: Args:
api_key: API key for OpenAI API api_key: API key for OpenAI API
base_url: Base URL for OpenAI API base_url: Base URL for OpenAI API
model: Model name model: Model name
""" """
try: base_url = os.getenv("OCR_API_BASE_URL", "http://localhost:8000/v1")
from openai import OpenAI api_key = os.getenv("OCR_API_KEY", "123")
self.api_key = kwargs.get("api_key", "123") timeout = 30
self.base_url = kwargs.get("base_url", "http://localhost:8000/v1") self.client = OpenAI(api_key=api_key, base_url=base_url, timeout=timeout)
self.model = kwargs.get("model", "nanonets/Nanonets-OCR-s")
self.temperature = kwargs.get("temperature", 0.0) self.model = os.getenv("OCR_MODEL", "nanonets/Nanonets-OCR-s")
self.max_tokens = kwargs.get("max_tokens", 15000) logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
self.temperature = 0.0
self.client = OpenAI(api_key=self.api_key, base_url=self.base_url) self.max_tokens = 15000
self.prompt = """ self.prompt = """## 任务说明
## 任务说明
请从上传的文档中提取文字内容,严格按自然阅读顺序(从上到下,从左到右)输出,并遵循以下格式规范。 请从上传的文档中提取文字内容,严格按自然阅读顺序(从上到下,从左到右)输出,并遵循以下格式规范。
@@ -192,33 +243,26 @@ class NanonetsOCRBackend(OCRBackend):
* 不要猜测或补全不确定的链接地址。 * 不要猜测或补全不确定的链接地址。
""" """
logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
except ImportError:
logger.error("Failed to import openai. Please install it with 'pip install openai'")
self.client = None
except Exception as e:
logger.error(f"Failed to initialize Nanonets OCR: {str(e)}")
self.client = None
def predict(self, image: Union[str, bytes, Image.Image]) -> str: def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image using Nanonets OCR """Extract text from an image using Nanonets OCR
Args: Args:
image: Image file path, bytes, or PIL Image object image: Image file path, bytes, or PIL Image object
Returns: Returns:
Extracted text Extracted text
""" """
if self.client is None: if self.client is None:
logger.error("Nanonets OCR client not initialized") logger.error("Nanonets OCR client not initialized")
return "" return ""
try: try:
# Encode image to base64 # Encode image to base64
img_base64 = image_to_base64(image) img_base64 = endecode.decode_image(image)
if not img_base64: if not img_base64:
return "" return ""
# Call Nanonets OCR API # Call Nanonets OCR API
logger.info(f"Calling Nanonets OCR API with model: {self.model}") logger.info(f"Calling Nanonets OCR API with model: {self.model}")
response = self.client.chat.completions.create( response = self.client.chat.completions.create(
@@ -229,7 +273,9 @@ class NanonetsOCRBackend(OCRBackend):
"content": [ "content": [
{ {
"type": "image_url", "type": "image_url",
"image_url": {"url": f"data:image/png;base64,{img_base64}"}, "image_url": {
"url": f"data:image/png;base64,{img_base64}"
},
}, },
{ {
"type": "text", "type": "text",
@@ -239,40 +285,43 @@ class NanonetsOCRBackend(OCRBackend):
} }
], ],
temperature=self.temperature, temperature=self.temperature,
max_tokens=self.max_tokens max_tokens=self.max_tokens,
) )
return response.choices[0].message.content or ""
return response.choices[0].message.content
except Exception as e: except Exception as e:
logger.error(f"Nanonets OCR prediction error: {str(e)}") logger.error(f"Nanonets OCR prediction error: {str(e)}")
return "" return ""
class OCREngine: class OCREngine:
"""OCR Engine factory class""" """OCR Engine factory class"""
_instance = None _instance: Dict[str, OCRBackend] = {}
@classmethod @classmethod
def get_instance(cls, backend_type="paddle", **kwargs) -> Optional[OCRBackend]: def get_instance(cls, backend_type: str) -> OCRBackend:
"""Get OCR engine instance """Get OCR engine instance
Args: Args:
backend_type: OCR backend type, one of: "paddle", "nanonets" backend_type: OCR backend type, one of: "paddle", "nanonets"
**kwargs: Additional arguments for the backend **kwargs: Additional arguments for the backend
Returns: Returns:
OCR engine instance or None if initialization fails OCR engine instance or None if initialization fails
""" """
if cls._instance is None: backend_type = backend_type.lower()
logger.info(f"Initializing OCR engine with backend: {backend_type}") if cls._instance.get(backend_type):
return cls._instance[backend_type]
if backend_type.lower() == "paddle":
cls._instance = PaddleOCRBackend(**kwargs) logger.info(f"Initializing OCR engine with backend: {backend_type}")
elif backend_type.lower() == "nanonets":
cls._instance = NanonetsOCRBackend(**kwargs) if backend_type == "paddle":
else: cls._instance[backend_type] = PaddleOCRBackend()
logger.error(f"Unknown OCR backend type: {backend_type}")
return None elif backend_type == "nanonets":
cls._instance[backend_type] = NanonetsOCRBackend()
return cls._instance
else:
cls._instance[backend_type] = DummyOCRBackend()
return cls._instance[backend_type]
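With the factory refactor, callers fetch a backend by name and the instance is cached per type. A minimal usage sketch, assuming an image on disk (the path is a placeholder):

from docreader.parser.ocr_engine import OCREngine

ocr = OCREngine.get_instance("paddle")  # cached: repeated calls return the same backend
text = ocr.predict("/tmp/sample.png")   # accepts a file path, raw bytes, or a PIL Image
print(text)

Unknown backend names no longer yield None; they fall through to DummyOCRBackend, so callers always receive an object with a working predict().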


@@ -1,30 +1,19 @@
import logging import logging
from dataclasses import dataclass, field from typing import Dict, Type
from typing import Dict, Any, Optional, Type
from .base_parser import BaseParser, ParseResult from docreader.models.document import Document
from .docx_parser import DocxParser from docreader.models.read_config import ChunkingConfig
from .doc_parser import DocParser from docreader.parser.base_parser import BaseParser
from .pdf_parser import PDFParser from docreader.parser.doc_parser import DocParser
from .markdown_parser import MarkdownParser from docreader.parser.docx2_parser import Docx2Parser
from .text_parser import TextParser from docreader.parser.image_parser import ImageParser
from .image_parser import ImageParser from docreader.parser.markdown_parser import MarkdownParser
from .web_parser import WebParser from docreader.parser.pdf_parser import PDFParser
from .config import ChunkingConfig from docreader.parser.text_parser import TextParser
import traceback from docreader.parser.web_parser import WebParser
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@dataclass
class Chunk:
"""
Represents a single text chunk with associated metadata.
Basic unit for document processing and embedding.
"""
content: str # Text content of the chunk
metadata: Dict[str, Any] = None # Associated metadata (source, page number, etc.)
class Parser: class Parser:
""" """
@@ -33,10 +22,9 @@ class Parser:
""" """
def __init__(self): def __init__(self):
logger.info("Initializing document parser")
# Initialize all parser types # Initialize all parser types
self.parsers: Dict[str, Type[BaseParser]] = { self.parsers: Dict[str, Type[BaseParser]] = {
"docx": DocxParser, "docx": Docx2Parser,
"doc": DocParser, "doc": DocParser,
"pdf": PDFParser, "pdf": PDFParser,
"md": MarkdownParser, "md": MarkdownParser,
@@ -56,8 +44,7 @@ class Parser:
", ".join(self.parsers.keys()), ", ".join(self.parsers.keys()),
) )
def get_parser(self, file_type: str) -> Type[BaseParser]:
def get_parser(self, file_type: str) -> Optional[Type[BaseParser]]:
""" """
Get parser class for the specified file type. Get parser class for the specified file type.
@@ -67,12 +54,9 @@ class Parser:
Returns: Returns:
Parser class for the file type, or None if unsupported Parser class for the file type, or None if unsupported
""" """
file_type = file_type.lower() parser = self.parsers.get(file_type.lower())
parser = self.parsers.get(file_type) if not parser:
if parser: raise ValueError(f"Unsupported file type: {file_type}")
logger.info(f"Found parser for file type: {file_type}")
else:
logger.warning(f"No parser found for file type: {file_type}")
return parser return parser
def parse_file( def parse_file(
@@ -81,7 +65,7 @@ class Parser:
file_type: str, file_type: str,
content: bytes, content: bytes,
config: ChunkingConfig, config: ChunkingConfig,
) -> Optional[ParseResult]: ) -> Document:
""" """
Parse file content using appropriate parser based on file type. Parse file content using appropriate parser based on file type.
@@ -96,60 +80,41 @@ class Parser:
""" """
logger.info(f"Parsing file: {file_name} with type: {file_type}") logger.info(f"Parsing file: {file_name} with type: {file_type}")
logger.info( logger.info(
f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, " f"Chunking config: size={config.chunk_size}, "
f"overlap={config.chunk_overlap}, "
f"multimodal={config.enable_multimodal}" f"multimodal={config.enable_multimodal}"
) )
parser_instance = None
try:
# Get appropriate parser for file type
cls = self.get_parser(file_type)
if cls is None:
logger.error(f"Unsupported file type: {file_type}")
return None
# Parse file content # Get appropriate parser for file type
logger.info(f"Creating parser instance for {file_type} file") cls = self.get_parser(file_type)
parser_instance = cls(
file_name=file_name,
file_type=file_type,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
separators=config.separators,
enable_multimodal=config.enable_multimodal,
max_image_size=1920, # Limit image size to 1920px
max_concurrent_tasks=5, # Limit concurrent tasks to 5
chunking_config=config, # Pass the entire chunking config
)
logger.info(f"Starting to parse file content, size: {len(content)} bytes") # Parse file content
result = parser_instance.parse(content) logger.info(f"Creating parser instance for {file_type} file")
parser = cls(
file_name=file_name,
file_type=file_type,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
separators=config.separators,
enable_multimodal=config.enable_multimodal,
max_image_size=1920, # Limit image size to 1920px
max_concurrent_tasks=5, # Limit concurrent tasks to 5
chunking_config=config, # Pass the entire chunking config
)
if result: logger.info(f"Starting to parse file content, size: {len(content)} bytes")
logger.info( result = parser.parse(content)
f"Successfully parsed file {file_name}, generated {len(result.chunks)} chunks"
)
if result.chunks and len(result.chunks) > 0:
logger.info(
f"First chunk content length: {len(result.chunks[0].content)}"
)
else:
logger.warning(f"Parser returned empty chunks for file: {file_name}")
else:
logger.warning(f"Parser returned None result for file: {file_name}")
# Return parse results if not result.content:
return result logger.warning(f"Parser returned empty content for file: {file_name}")
elif not result.chunks:
logger.warning(f"Parser returned empty chunks for file: {file_name}")
elif result.chunks[0]:
logger.info(f"First chunk content length: {len(result.chunks[0].content)}")
logger.info(f"Parsed file {file_name}, with {len(result.chunks)} chunks")
return result
except Exception as e: def parse_url(self, url: str, title: str, config: ChunkingConfig) -> Document:
logger.error(f"Error parsing file {file_name}: {str(e)}")
logger.info(f"Detailed traceback: {traceback.format_exc()}")
return None
def parse_url(
self, url: str, title: str, config: ChunkingConfig
) -> Optional[ParseResult]:
""" """
Parse content from a URL using the WebParser. Parse content from a URL using the WebParser.
@@ -163,44 +128,31 @@ class Parser:
""" """
logger.info(f"Parsing URL: {url}, title: {title}") logger.info(f"Parsing URL: {url}, title: {title}")
logger.info( logger.info(
f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, " f"Chunking config: size={config.chunk_size}, "
f"multimodal={config.enable_multimodal}" f"overlap={config.chunk_overlap}, multimodal={config.enable_multimodal}"
) )
parser_instance = None
try: # Create web parser instance
# Create web parser instance logger.info("Creating WebParser instance")
logger.info("Creating WebParser instance") parser = WebParser(
parser_instance = WebParser( title=title,
title=title, chunk_size=config.chunk_size,
chunk_size=config.chunk_size, chunk_overlap=config.chunk_overlap,
chunk_overlap=config.chunk_overlap, separators=config.separators,
separators=config.separators, enable_multimodal=config.enable_multimodal,
enable_multimodal=config.enable_multimodal, max_image_size=1920, # Limit image size
max_image_size=1920, # Limit image size max_concurrent_tasks=5, # Limit concurrent tasks
max_concurrent_tasks=5, # Limit concurrent tasks chunking_config=config,
chunking_config=config, )
)
logger.info(f"Starting to parse URL content") logger.info("Starting to parse URL content")
result = parser_instance.parse(url) result = parser.parse(url.encode())
if result:
logger.info(
f"Successfully parsed URL, generated {len(result.chunks)} chunks"
)
logger.info(
f"First chunk content length: {len(result.chunks[0].content) if result.chunks else 0}"
)
else:
logger.warning(f"Parser returned empty result for URL: {url}")
# Return parse results
return result
except Exception as e:
logger.error(f"Error parsing URL {url}: {str(e)}")
logger.info(f"Detailed traceback: {traceback.format_exc()}")
return None
if not result.content:
logger.warning(f"Parser returned empty content for url: {url}")
elif not result.chunks:
logger.warning(f"Parser returned empty chunks for url: {url}")
elif result.chunks[0]:
logger.info(f"First chunk content length: {len(result.chunks[0].content)}")
logger.info(f"Parsed url {url}, with {len(result.chunks)} chunks")
return result
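Note the changed error contract: get_parser now raises ValueError for unsupported types instead of returning None, so callers should catch the exception rather than test for None. A minimal sketch of the new call path, assuming ChunkingConfig accepts these keyword fields (file name and sizes are illustrative):

from docreader.models.read_config import ChunkingConfig
from docreader.parser import Parser

config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=100,
    separators=["\n\n", "\n"],
    enable_multimodal=False,
)
with open("report.pdf", "rb") as f:
    doc = Parser().parse_file("report.pdf", "pdf", f.read(), config)
for chunk in doc.chunks:
    print(chunk.content[:80])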


@@ -1,113 +1,7 @@
import logging from docreader.parser.chain_parser import FirstParser
import os from docreader.parser.markitdown_parser import MarkitdownParser
import io from docreader.parser.mineru_parser import MinerUParser
from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union
import pdfplumber
import tempfile
from .base_parser import BaseParser
logger = logging.getLogger(__name__) class PDFParser(FirstParser):
_parser_cls = (MinerUParser, MarkitdownParser)
class PDFParser(BaseParser):
"""
PDF Document Parser
This parser handles PDF documents by extracting text content.
It uses the pypdf library for simple text extraction.
"""
def _convert_table_to_markdown(self, table_data: list) -> str:
if not table_data or not table_data[0]: return ""
def clean_cell(cell):
if cell is None: return ""
return str(cell).replace("\n", " <br> ")
try:
markdown = ""
header = [clean_cell(cell) for cell in table_data[0]]
markdown += "| " + " | ".join(header) + " |\n"
markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
for row in table_data[1:]:
if not row: continue
body_row = [clean_cell(cell) for cell in row]
if len(body_row) != len(header):
logger.warning(f"Skipping malformed table row: {body_row}")
continue
markdown += "| " + " | ".join(body_row) + " |\n"
return markdown
except Exception as e:
logger.error(f"Error converting table to markdown: {e}")
return ""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
logger.info(f"Parsing PDF with pdfplumber, content size: {len(content)} bytes")
all_page_content = []
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
temp_pdf_path = temp_pdf.name
try:
temp_pdf.write(content)
temp_pdf.close()
logger.info(f"PDF content written to temporary file: {temp_pdf_path}")
with pdfplumber.open(temp_pdf_path) as pdf:
logger.info(f"PDF has {len(pdf.pages)} pages")
for page_num, page in enumerate(pdf.pages):
page_content_parts = []
# Try-fallback strategy for table detection
default_settings = { "vertical_strategy": "lines", "horizontal_strategy": "lines" }
found_tables = page.find_tables(default_settings)
if not found_tables:
logger.info(f"Page {page_num+1}: Default strategy found no tables. Trying fallback strategy.")
fallback_settings = { "vertical_strategy": "text", "horizontal_strategy": "lines" }
found_tables = page.find_tables(fallback_settings)
table_bboxes = [table.bbox for table in found_tables]
# Define a filter function that keeps objects NOT inside any table bbox.
def not_within_bboxes(obj):
"""Check if an object is outside all table bounding boxes."""
for bbox in table_bboxes:
# Check if the object's vertical center is within a bbox
if bbox[1] <= (obj["top"] + obj["bottom"]) / 2 <= bbox[3]:
return False # It's inside a table, so we DON'T keep it
return True # It's outside all tables, so we DO keep it
                    # Create a filtered version of the page that contains only the non-table text.
non_table_page = page.filter(not_within_bboxes)
# Now, extract text from this filtered page view.
text = non_table_page.extract_text(x_tolerance=2)
if text:
page_content_parts.append(text)
# Process and append the structured Markdown tables
if found_tables:
logger.info(f"Found {len(found_tables)} tables on page {page_num + 1}")
for table in found_tables:
markdown_table = self._convert_table_to_markdown(table.extract())
page_content_parts.append(f"\n\n{markdown_table}\n\n")
all_page_content.append("".join(page_content_parts))
final_text = "\n\n--- Page Break ---\n\n".join(all_page_content)
logger.info(f"PDF parsing complete. Extracted {len(final_text)} text chars.")
return final_text
except Exception as e:
logger.error(f"Failed to parse PDF document: {str(e)}")
return ""
finally:
# This block is GUARANTEED to execute, preventing resource leaks.
if os.path.exists(temp_pdf_path):
try:
os.remove(temp_pdf_path)
logger.info(f"Temporary file cleaned up: {temp_pdf_path}")
except OSError as e:
logger.error(f"Error removing temporary file {temp_pdf_path}: {e}")


@@ -1,64 +1,68 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os
import uuid
import logging
import io import io
import logging
import os
import traceback import traceback
import uuid
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Tuple, Optional from typing import Dict
from qcloud_cos import CosConfig, CosS3Client
from minio import Minio from minio import Minio
from qcloud_cos import CosConfig, CosS3Client
from docreader.utils import endecode
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class Storage(ABC): class Storage(ABC):
"""Abstract base class for object storage operations""" """Abstract base class for object storage operations"""
@abstractmethod @abstractmethod
def upload_file(self, file_path: str) -> str: def upload_file(self, file_path: str) -> str:
"""Upload file to object storage """Upload file to object storage
Args: Args:
file_path: File path file_path: File path
Returns:
File URL
"""
pass
@abstractmethod
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to object storage
Args:
content: Byte content to upload
file_ext: File extension
Returns: Returns:
File URL File URL
""" """
pass pass
@abstractmethod
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to object storage
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
pass
class CosStorage(Storage): class CosStorage(Storage):
"""Tencent Cloud COS storage implementation""" """Tencent Cloud COS storage implementation"""
def __init__(self, storage_config=None): def __init__(self, storage_config=None):
"""Initialize COS storage """Initialize COS storage
Args: Args:
storage_config: Storage configuration storage_config: Storage configuration
""" """
self.storage_config = storage_config self.storage_config = storage_config
self.client, self.bucket_name, self.region, self.prefix = self._init_cos_client() self.client, self.bucket_name, self.region, self.prefix = (
self._init_cos_client()
)
def _init_cos_client(self): def _init_cos_client(self):
"""Initialize Tencent Cloud COS client""" """Initialize Tencent Cloud COS client"""
try: try:
# Use provided COS config if available, otherwise fall back to environment variables # Use provided COS config if available,
# otherwise fall back to environment variables
if self.storage_config and self.storage_config.get("access_key_id") != "": if self.storage_config and self.storage_config.get("access_key_id") != "":
cos_config = self.storage_config cos_config = self.storage_config
secret_id = cos_config.get("access_key_id") secret_id = cos_config.get("access_key_id")
@@ -75,15 +79,16 @@ class CosStorage(Storage):
bucket_name = os.getenv("COS_BUCKET_NAME") bucket_name = os.getenv("COS_BUCKET_NAME")
appid = os.getenv("COS_APP_ID") appid = os.getenv("COS_APP_ID")
prefix = os.getenv("COS_PATH_PREFIX") prefix = os.getenv("COS_PATH_PREFIX")
enable_old_domain = ( enable_old_domain = (
os.getenv("COS_ENABLE_OLD_DOMAIN", "true").lower() == "true" os.getenv("COS_ENABLE_OLD_DOMAIN", "true").lower() == "true"
) )
if not all([secret_id, secret_key, region, bucket_name, appid]): if not all([secret_id, secret_key, region, bucket_name, appid]):
logger.error( logger.error(
"Incomplete COS configuration, missing required environment variables" "Incomplete COS configuration, missing environment variables"
f"secret_id: {secret_id}, secret_key: {secret_key}, region: {region}, bucket_name: {bucket_name}, appid: {appid}" f"secret_id: {secret_id}, secret_key: {secret_key}, "
f"region: {region}, bucket_name: {bucket_name}, appid: {appid}"
) )
return None, None, None, None return None, None, None, None
@@ -105,27 +110,26 @@ class CosStorage(Storage):
except Exception as e: except Exception as e:
logger.error(f"Failed to initialize COS client: {str(e)}") logger.error(f"Failed to initialize COS client: {str(e)}")
return None, None, None, None return None, None, None, None
def _get_download_url(self, bucket_name, region, object_key): def _get_download_url(self, bucket_name, region, object_key):
"""Generate COS object URL """Generate COS object URL
Args: Args:
bucket_name: Bucket name bucket_name: Bucket name
region: Region region: Region
object_key: Object key object_key: Object key
Returns: Returns:
File URL File URL
""" """
return f"https://{bucket_name}.cos.{region}.myqcloud.com/{object_key}" return f"https://{bucket_name}.cos.{region}.myqcloud.com/{object_key}"
def upload_file(self, file_path: str) -> str: def upload_file(self, file_path: str) -> str:
"""Upload file to Tencent Cloud COS """Upload file to Tencent Cloud COS
Args: Args:
file_path: File path file_path: File path
Returns: Returns:
File URL File URL
""" """
@@ -135,16 +139,16 @@ class CosStorage(Storage):
return "" return ""
# Generate object key, use UUID to avoid conflicts # Generate object key, use UUID to avoid conflicts
file_name = os.path.basename(file_path) file_ext = os.path.splitext(file_path)[1]
object_key = ( object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
f"{self.prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
)
logger.info(f"Generated object key: {object_key}") logger.info(f"Generated object key: {object_key}")
# Upload file # Upload file
logger.info("Attempting to upload file to COS") logger.info("Attempting to upload file to COS")
response = self.client.upload_file( self.client.upload_file(
Bucket=self.bucket_name, LocalFilePath=file_path, Key=object_key Bucket=self.bucket_name,
LocalFilePath=file_path,
Key=object_key,
) )
# Get file URL # Get file URL
@@ -156,14 +160,14 @@ class CosStorage(Storage):
except Exception as e: except Exception as e:
logger.error(f"Failed to upload file to COS: {str(e)}") logger.error(f"Failed to upload file to COS: {str(e)}")
return "" return ""
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str: def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to Tencent Cloud COS """Upload bytes to Tencent Cloud COS
Args: Args:
content: Byte content to upload content: Byte content to upload
file_ext: File extension file_ext: File extension
Returns: Returns:
File URL File URL
""" """
@@ -171,10 +175,16 @@ class CosStorage(Storage):
logger.info(f"Uploading bytes content to COS, size: {len(content)} bytes") logger.info(f"Uploading bytes content to COS, size: {len(content)} bytes")
if not self.client: if not self.client:
return "" return ""
object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.prefix else f"images/{uuid.uuid4().hex}{file_ext}" object_key = (
f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
if self.prefix
else f"images/{uuid.uuid4().hex}{file_ext}"
)
logger.info(f"Generated object key: {object_key}") logger.info(f"Generated object key: {object_key}")
self.client.put_object(Bucket=self.bucket_name, Body=content, Key=object_key) self.client.put_object(
Bucket=self.bucket_name, Body=content, Key=object_key
)
file_url = self._get_download_url(self.bucket_name, self.region, object_key) file_url = self._get_download_url(self.bucket_name, self.region, object_key)
logger.info(f"Successfully uploaded bytes to COS: {file_url}") logger.info(f"Successfully uploaded bytes to COS: {file_url}")
return file_url return file_url
@@ -186,16 +196,18 @@ class CosStorage(Storage):
class MinioStorage(Storage): class MinioStorage(Storage):
"""MinIO storage implementation""" """MinIO storage implementation"""
def __init__(self, storage_config=None): def __init__(self, storage_config=None):
"""Initialize MinIO storage """Initialize MinIO storage
Args: Args:
storage_config: Storage configuration storage_config: Storage configuration
""" """
self.storage_config = storage_config self.storage_config = storage_config
self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = self._init_minio_client() self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = (
self._init_minio_client()
)
def _init_minio_client(self): def _init_minio_client(self):
"""Initialize MinIO client from environment variables or injected config. """Initialize MinIO client from environment variables or injected config.
@@ -203,58 +215,69 @@ class MinioStorage(Storage):
prefer those values to override envs. prefer those values to override envs.
""" """
try: try:
endpoint = os.getenv("MINIO_ENDPOINT") endpoint = os.getenv("MINIO_ENDPOINT", "")
use_ssl = os.getenv("MINIO_USE_SSL", "false").lower() == "true" use_ssl = os.getenv("MINIO_USE_SSL", "false").lower() == "true"
if self.storage_config and self.storage_config.get("bucket_name"): if self.storage_config and self.storage_config.get("bucket_name"):
storage_config = self.storage_config storage_config = self.storage_config
bucket_name = storage_config.get("bucket_name") bucket_name = storage_config.get("bucket_name", "")
path_prefix = storage_config.get("path_prefix").strip().strip("/") path_prefix = storage_config.get("path_prefix").strip().strip("/")
access_key = storage_config.get("access_key_id") access_key = storage_config.get("access_key_id")
secret_key = storage_config.get("secret_access_key") secret_key = storage_config.get("secret_access_key")
else: else:
access_key = os.getenv("MINIO_ACCESS_KEY_ID") access_key = os.getenv("MINIO_ACCESS_KEY_ID")
secret_key = os.getenv("MINIO_SECRET_ACCESS_KEY") secret_key = os.getenv("MINIO_SECRET_ACCESS_KEY")
bucket_name = os.getenv("MINIO_BUCKET_NAME") bucket_name = os.getenv("MINIO_BUCKET_NAME", "")
path_prefix = os.getenv("MINIO_PATH_PREFIX", "").strip().strip("/") path_prefix = os.getenv("MINIO_PATH_PREFIX", "").strip().strip("/")
if not all([endpoint, access_key, secret_key, bucket_name]): if not all([endpoint, access_key, secret_key, bucket_name]):
logger.error("Incomplete MinIO configuration, missing required environment variables") logger.error(
"Incomplete MinIO configuration, missing environment variables"
)
return None, None, None, None, None return None, None, None, None, None
# Initialize client # Initialize client
client = Minio(endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl) client = Minio(
endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl
)
# Ensure bucket exists # Ensure bucket exists
found = client.bucket_exists(bucket_name) found = client.bucket_exists(bucket_name)
if not found: if not found:
client.make_bucket(bucket_name) client.make_bucket(bucket_name)
policy = '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}' % (bucket_name, bucket_name) policy = (
'{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}'
% (bucket_name, bucket_name)
)
client.set_bucket_policy(bucket_name, policy) client.set_bucket_policy(bucket_name, policy)
return client, bucket_name, use_ssl, endpoint, path_prefix return client, bucket_name, use_ssl, endpoint, path_prefix
except Exception as e: except Exception as e:
logger.error(f"Failed to initialize MinIO client: {str(e)}") logger.error(f"Failed to initialize MinIO client: {str(e)}")
return None, None, None, None, None return None, None, None, None, None
def _get_download_url(self, bucket_name: str, object_key: str, use_ssl: bool, endpoint: str, public_endpoint: str = None): def _get_download_url(self, object_key: str):
"""Construct a public URL for MinIO object. """Construct a public URL for MinIO object.
If MINIO_PUBLIC_ENDPOINT is provided, use it; otherwise fallback to endpoint. If MINIO_PUBLIC_ENDPOINT is provided, use it; otherwise fallback to endpoint.
""" """
if public_endpoint: # 1. Use public endpoint if provided
base = public_endpoint endpoint = os.getenv("MINIO_PUBLIC_ENDPOINT")
else: if endpoint:
scheme = "https" if use_ssl else "http" return f"{endpoint}/{self.bucket_name}/{object_key}"
base = f"{scheme}://{endpoint}"
# Path-style URL for MinIO # 2. Use SSL if enabled
return f"{base}/{bucket_name}/{object_key}" if self.use_ssl:
return f"https://{self.endpoint}/{self.bucket_name}/{object_key}"
# 3. Use HTTP default
return f"http://{self.endpoint}/{self.bucket_name}/{object_key}"
def upload_file(self, file_path: str) -> str: def upload_file(self, file_path: str) -> str:
"""Upload file to MinIO """Upload file to MinIO
Args: Args:
file_path: File path file_path: File path
Returns: Returns:
File URL File URL
""" """
@@ -265,29 +288,27 @@ class MinioStorage(Storage):
# Generate object key, use UUID to avoid conflicts # Generate object key, use UUID to avoid conflicts
file_name = os.path.basename(file_path) file_name = os.path.basename(file_path)
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" if self.path_prefix else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" object_key = (
f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
if self.path_prefix
else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
)
logger.info(f"Generated MinIO object key: {object_key}") logger.info(f"Generated MinIO object key: {object_key}")
# Upload file # Upload file
logger.info("Attempting to upload file to MinIO") logger.info("Attempting to upload file to MinIO")
with open(file_path, 'rb') as file_data: with open(file_path, "rb") as file_data:
file_size = os.path.getsize(file_path) file_size = os.path.getsize(file_path)
self.client.put_object( self.client.put_object(
bucket_name=self.bucket_name, bucket_name=self.bucket_name or "",
object_name=object_key, object_name=object_key,
data=file_data, data=file_data,
length=file_size, length=file_size,
content_type='application/octet-stream' content_type="application/octet-stream",
) )
# Get file URL # Get file URL
file_url = self._get_download_url( file_url = self._get_download_url(object_key)
self.bucket_name,
object_key,
self.use_ssl,
self.endpoint,
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
)
logger.info(f"Successfully uploaded file to MinIO: {file_url}") logger.info(f"Successfully uploaded file to MinIO: {file_url}")
return file_url return file_url
@@ -295,14 +316,14 @@ class MinioStorage(Storage):
except Exception as e: except Exception as e:
logger.error(f"Failed to upload file to MinIO: {str(e)}") logger.error(f"Failed to upload file to MinIO: {str(e)}")
return "" return ""
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str: def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to MinIO """Upload bytes to MinIO
Args: Args:
content: Byte content to upload content: Byte content to upload
file_ext: File extension file_ext: File extension
Returns: Returns:
File URL File URL
""" """
@@ -310,23 +331,21 @@ class MinioStorage(Storage):
logger.info(f"Uploading bytes content to MinIO, size: {len(content)} bytes") logger.info(f"Uploading bytes content to MinIO, size: {len(content)} bytes")
if not self.client: if not self.client:
return "" return ""
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.path_prefix else f"images/{uuid.uuid4().hex}{file_ext}" object_key = (
f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}"
if self.path_prefix
else f"images/{uuid.uuid4().hex}{file_ext}"
)
logger.info(f"Generated MinIO object key: {object_key}") logger.info(f"Generated MinIO object key: {object_key}")
self.client.put_object( self.client.put_object(
self.bucket_name, self.bucket_name or "",
object_key, object_key,
data=io.BytesIO(content), data=io.BytesIO(content),
length=len(content), length=len(content),
content_type="application/octet-stream" content_type="application/octet-stream",
)
file_url = self._get_download_url(
self.bucket_name,
object_key,
self.use_ssl,
self.endpoint,
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
) )
file_url = self._get_download_url(object_key)
logger.info(f"Successfully uploaded bytes to MinIO: {file_url}") logger.info(f"Successfully uploaded bytes to MinIO: {file_url}")
return file_url return file_url
except Exception as e: except Exception as e:
@@ -335,26 +354,61 @@ class MinioStorage(Storage):
return "" return ""
def create_storage(storage_config=None) -> Storage: class LocalStorage(Storage):
"""Local file system storage implementation"""
def __init__(self, storage_config: Dict[str, str] = {}):
self.storage_config = storage_config
base_dir = storage_config.get(
"base_dir", os.getenv("LOCAL_STORAGE_BASE_DIR", "")
)
self.image_dir = os.path.join(base_dir, "images")
os.makedirs(self.image_dir, exist_ok=True)
def upload_file(self, file_path: str) -> str:
logger.info(f"Uploading file to local storage: {file_path}")
return file_path
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
logger.info(f"Uploading file to local storage: {len(content)} bytes")
fname = os.path.join(self.image_dir, f"{uuid.uuid4()}{file_ext}")
with open(fname, "wb") as f:
f.write(content)
return fname
class Base64Storage(Storage):
def upload_file(self, file_path: str) -> str:
logger.info(f"Uploading file to base64 storage: {file_path}")
return file_path
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
logger.info(f"Uploading file to base64 storage: {len(content)} bytes")
file_ext = file_ext.lstrip(".")
return f"data:image/{file_ext};base64,{endecode.decode_image(content)}"
def create_storage(storage_config: Dict[str, str] | None = None) -> Storage:
"""Create a storage instance based on configuration or environment variables """Create a storage instance based on configuration or environment variables
Args: Args:
storage_config: Storage configuration dictionary storage_config: Storage configuration dictionary
Returns: Returns:
Storage instance Storage instance
""" """
storage_type = os.getenv("STORAGE_TYPE", "cos").lower() storage_type = os.getenv("STORAGE_TYPE", "cos").lower()
if storage_config: if storage_config:
storage_type = str(storage_config.get("provider", storage_type)).lower() storage_type = str(storage_config.get("provider", storage_type)).lower()
logger.info(f"Creating {storage_type} storage instance") logger.info(f"Creating {storage_type} storage instance")
if storage_type == "minio": if storage_type == "minio":
return MinioStorage(storage_config) return MinioStorage(storage_config)
elif storage_type == "cos": elif storage_type == "cos":
# Default to COS
return CosStorage(storage_config) return CosStorage(storage_config)
else: elif storage_type == "local":
return None return LocalStorage(storage_config or {})
elif storage_type == "base64":
return Base64Storage()
raise ValueError(f"Invalid storage type: {storage_type}")
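The two new providers allow running without COS or MinIO credentials. An illustrative sketch; the module import path is assumed, and the byte payloads are placeholders:

import os
from docreader.storage import create_storage

os.environ["STORAGE_TYPE"] = "base64"
inline = create_storage()  # Base64Storage: returns data:image/...;base64,... URIs, no network
print(inline.upload_bytes(b"<png bytes>")[:40])

local = create_storage({"provider": "local", "base_dir": "/tmp/docreader"})
print(local.upload_bytes(b"hello", file_ext=".txt"))  # /tmp/docreader/images/<uuid>.txt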


@@ -1,6 +1,8 @@
import logging import logging
from .base_parser import BaseParser
from typing import Dict, Any, Tuple, Union from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -11,7 +13,7 @@ class TextParser(BaseParser):
This parser handles text extraction and chunking from plain text documents. This parser handles text extraction and chunking from plain text documents.
""" """
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]: def parse_into_text(self, content: bytes) -> Document:
""" """
Parse text document content by decoding bytes to string. Parse text document content by decoding bytes to string.
@@ -25,20 +27,15 @@ class TextParser(BaseParser):
Parsed text content as string Parsed text content as string
""" """
logger.info(f"Parsing text document, content size: {len(content)} bytes") logger.info(f"Parsing text document, content size: {len(content)} bytes")
text = self.decode_bytes(content) text = endecode.decode_bytes(content)
logger.info( logger.info(
f"Successfully parsed text document, extracted {len(text)} characters" f"Successfully parsed text document, extracted {len(text)} characters"
) )
return text return Document(content=text)
if __name__ == "__main__": if __name__ == "__main__":
logging.basicConfig( logger = logging.getLogger(__name__)
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger.info("Running TextParser in standalone mode")
# Sample text for testing # Sample text for testing
text = """## 标题1 text = """## 标题1


@@ -1,11 +1,14 @@
from typing import Any, Optional, Tuple, Dict, Union
import os
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from .base_parser import BaseParser, ParseResult
import logging
import asyncio import asyncio
import logging
import os
from typing import Any
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -59,7 +62,7 @@ class WebParser(BaseParser):
# Return empty BeautifulSoup object on error # Return empty BeautifulSoup object on error
return BeautifulSoup("", "html.parser") return BeautifulSoup("", "html.parser")
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]: def parse_into_text(self, content: bytes) -> Document:
"""Parse web page """Parse web page
Args: Args:
@@ -78,10 +81,10 @@ class WebParser(BaseParser):
# Run async method # Run async method
# Handle content possibly being a string # Handle content possibly being a string
if isinstance(content, bytes): if isinstance(content, bytes):
url = self.decode_bytes(content) url = endecode.decode_bytes(content)
logger.info(f"Decoded URL from bytes: {url}") logger.info(f"Decoded URL from bytes: {url}")
else: else:
url = content url = str(content)
logger.info(f"Using content as URL directly: {url}") logger.info(f"Using content as URL directly: {url}")
logger.info(f"Scraping web page: {url}") logger.info(f"Scraping web page: {url}")
@@ -118,11 +121,11 @@ class WebParser(BaseParser):
logger.info( logger.info(
f"Web page parsing complete, total content: {len(result)} characters" f"Web page parsing complete, total content: {len(result)} characters"
) )
return result return Document(content=result)
except Exception as e: except Exception as e:
logger.error(f"Error parsing web page: {str(e)}") logger.error(f"Error parsing web page: {str(e)}")
return f"Error parsing web page: {str(e)}" return Document(content=f"Error parsing web page: {str(e)}")
finally: finally:
# Close event loop # Close event loop
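For reference, parse() now takes the URL as bytes and returns a Document, matching the parse_url path in parser.py. A small sketch; the constructor arguments mirror those used there, and the URL is a placeholder:

from docreader.parser.web_parser import WebParser

parser = WebParser(title="Example", chunk_size=512, chunk_overlap=100)
doc = parser.parse("https://example.com/post".encode())
print(doc.content[:200])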


@@ -0,0 +1,127 @@
from google.protobuf.internal import containers as _containers
from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from collections.abc import Iterable as _Iterable, Mapping as _Mapping
from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
DESCRIPTOR: _descriptor.FileDescriptor
class StorageProvider(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
__slots__ = ()
STORAGE_PROVIDER_UNSPECIFIED: _ClassVar[StorageProvider]
COS: _ClassVar[StorageProvider]
MINIO: _ClassVar[StorageProvider]
STORAGE_PROVIDER_UNSPECIFIED: StorageProvider
COS: StorageProvider
MINIO: StorageProvider
class StorageConfig(_message.Message):
__slots__ = ("provider", "region", "bucket_name", "access_key_id", "secret_access_key", "app_id", "path_prefix")
PROVIDER_FIELD_NUMBER: _ClassVar[int]
REGION_FIELD_NUMBER: _ClassVar[int]
BUCKET_NAME_FIELD_NUMBER: _ClassVar[int]
ACCESS_KEY_ID_FIELD_NUMBER: _ClassVar[int]
SECRET_ACCESS_KEY_FIELD_NUMBER: _ClassVar[int]
APP_ID_FIELD_NUMBER: _ClassVar[int]
PATH_PREFIX_FIELD_NUMBER: _ClassVar[int]
provider: StorageProvider
region: str
bucket_name: str
access_key_id: str
secret_access_key: str
app_id: str
path_prefix: str
def __init__(self, provider: _Optional[_Union[StorageProvider, str]] = ..., region: _Optional[str] = ..., bucket_name: _Optional[str] = ..., access_key_id: _Optional[str] = ..., secret_access_key: _Optional[str] = ..., app_id: _Optional[str] = ..., path_prefix: _Optional[str] = ...) -> None: ...
class VLMConfig(_message.Message):
__slots__ = ("model_name", "base_url", "api_key", "interface_type")
MODEL_NAME_FIELD_NUMBER: _ClassVar[int]
BASE_URL_FIELD_NUMBER: _ClassVar[int]
API_KEY_FIELD_NUMBER: _ClassVar[int]
INTERFACE_TYPE_FIELD_NUMBER: _ClassVar[int]
model_name: str
base_url: str
api_key: str
interface_type: str
def __init__(self, model_name: _Optional[str] = ..., base_url: _Optional[str] = ..., api_key: _Optional[str] = ..., interface_type: _Optional[str] = ...) -> None: ...
class ReadConfig(_message.Message):
__slots__ = ("chunk_size", "chunk_overlap", "separators", "enable_multimodal", "storage_config", "vlm_config")
CHUNK_SIZE_FIELD_NUMBER: _ClassVar[int]
CHUNK_OVERLAP_FIELD_NUMBER: _ClassVar[int]
SEPARATORS_FIELD_NUMBER: _ClassVar[int]
ENABLE_MULTIMODAL_FIELD_NUMBER: _ClassVar[int]
STORAGE_CONFIG_FIELD_NUMBER: _ClassVar[int]
VLM_CONFIG_FIELD_NUMBER: _ClassVar[int]
chunk_size: int
chunk_overlap: int
separators: _containers.RepeatedScalarFieldContainer[str]
enable_multimodal: bool
storage_config: StorageConfig
vlm_config: VLMConfig
def __init__(self, chunk_size: _Optional[int] = ..., chunk_overlap: _Optional[int] = ..., separators: _Optional[_Iterable[str]] = ..., enable_multimodal: bool = ..., storage_config: _Optional[_Union[StorageConfig, _Mapping]] = ..., vlm_config: _Optional[_Union[VLMConfig, _Mapping]] = ...) -> None: ...
class ReadFromFileRequest(_message.Message):
__slots__ = ("file_content", "file_name", "file_type", "read_config", "request_id")
FILE_CONTENT_FIELD_NUMBER: _ClassVar[int]
FILE_NAME_FIELD_NUMBER: _ClassVar[int]
FILE_TYPE_FIELD_NUMBER: _ClassVar[int]
READ_CONFIG_FIELD_NUMBER: _ClassVar[int]
REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
file_content: bytes
file_name: str
file_type: str
read_config: ReadConfig
request_id: str
def __init__(self, file_content: _Optional[bytes] = ..., file_name: _Optional[str] = ..., file_type: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ...
class ReadFromURLRequest(_message.Message):
__slots__ = ("url", "title", "read_config", "request_id")
URL_FIELD_NUMBER: _ClassVar[int]
TITLE_FIELD_NUMBER: _ClassVar[int]
READ_CONFIG_FIELD_NUMBER: _ClassVar[int]
REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
url: str
title: str
read_config: ReadConfig
request_id: str
def __init__(self, url: _Optional[str] = ..., title: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ...
class Image(_message.Message):
__slots__ = ("url", "caption", "ocr_text", "original_url", "start", "end")
URL_FIELD_NUMBER: _ClassVar[int]
CAPTION_FIELD_NUMBER: _ClassVar[int]
OCR_TEXT_FIELD_NUMBER: _ClassVar[int]
ORIGINAL_URL_FIELD_NUMBER: _ClassVar[int]
START_FIELD_NUMBER: _ClassVar[int]
END_FIELD_NUMBER: _ClassVar[int]
url: str
caption: str
ocr_text: str
original_url: str
start: int
end: int
def __init__(self, url: _Optional[str] = ..., caption: _Optional[str] = ..., ocr_text: _Optional[str] = ..., original_url: _Optional[str] = ..., start: _Optional[int] = ..., end: _Optional[int] = ...) -> None: ...
class Chunk(_message.Message):
__slots__ = ("content", "seq", "start", "end", "images")
CONTENT_FIELD_NUMBER: _ClassVar[int]
SEQ_FIELD_NUMBER: _ClassVar[int]
START_FIELD_NUMBER: _ClassVar[int]
END_FIELD_NUMBER: _ClassVar[int]
IMAGES_FIELD_NUMBER: _ClassVar[int]
content: str
seq: int
start: int
end: int
images: _containers.RepeatedCompositeFieldContainer[Image]
def __init__(self, content: _Optional[str] = ..., seq: _Optional[int] = ..., start: _Optional[int] = ..., end: _Optional[int] = ..., images: _Optional[_Iterable[_Union[Image, _Mapping]]] = ...) -> None: ...
class ReadResponse(_message.Message):
__slots__ = ("chunks", "error")
CHUNKS_FIELD_NUMBER: _ClassVar[int]
ERROR_FIELD_NUMBER: _ClassVar[int]
chunks: _containers.RepeatedCompositeFieldContainer[Chunk]
error: str
def __init__(self, chunks: _Optional[_Iterable[_Union[Chunk, _Mapping]]] = ..., error: _Optional[str] = ...) -> None: ...
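These stubs restore static type checking for the generated messages. A sketch of building a request with them; the server address is a placeholder, and the stub class and method names are assumptions, since the service definition is not part of this diff:

import grpc
from docreader.proto import docreader_pb2, docreader_pb2_grpc

req = docreader_pb2.ReadFromFileRequest(
    file_content=b"hello world",
    file_name="note.txt",
    file_type="txt",
    read_config=docreader_pb2.ReadConfig(chunk_size=512, chunk_overlap=100),
)
channel = grpc.insecure_channel("localhost:50051")
stub = docreader_pb2_grpc.DocReaderStub(channel)  # class name assumed
resp = stub.ReadFromFile(req)                     # method name assumed from the request type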


@@ -3,7 +3,7 @@
import grpc import grpc
import warnings import warnings
from . import docreader_pb2 as docreader__pb2 import docreader_pb2 as docreader__pb2
GRPC_GENERATED_VERSION = '1.76.0' GRPC_GENERATED_VERSION = '1.76.0'
GRPC_VERSION = grpc.__version__ GRPC_VERSION = grpc.__version__


@@ -16,6 +16,7 @@ dependencies = [
"lxml>=6.0.2", "lxml>=6.0.2",
"markdown>=3.10", "markdown>=3.10",
"markdownify>=1.2.0", "markdownify>=1.2.0",
"markitdown[docx,pdf,xls,xlsx]>=0.1.3",
"minio>=7.2.18", "minio>=7.2.18",
"mistletoe>=1.5.0", "mistletoe>=1.5.0",
"ollama>=0.6.0", "ollama>=0.6.0",
@@ -26,6 +27,7 @@ dependencies = [
"pillow>=12.0.0", "pillow>=12.0.0",
"playwright>=1.55.0", "playwright>=1.55.0",
"protobuf>=6.33.0", "protobuf>=6.33.0",
"pydantic>=2.12.3",
"pypdf>=6.1.3", "pypdf>=6.1.3",
"pypdf2>=3.0.1", "pypdf2>=3.0.1",
"python-docx>=1.2.0", "python-docx>=1.2.0",


@@ -2,13 +2,14 @@
set -x set -x
# Set directories # Set directories
PROTO_DIR="proto" PROTO_DIR="docreader/proto"
PYTHON_OUT="proto" PYTHON_OUT="docreader/proto"
GO_OUT="proto" GO_OUT="docreader/proto"
# Generate Python code # Generate Python code
python3 -m grpc_tools.protoc -I${PROTO_DIR} \ python3 -m grpc_tools.protoc -I${PROTO_DIR} \
--python_out=${PYTHON_OUT} \ --python_out=${PYTHON_OUT} \
--pyi_out=${PYTHON_OUT} \
--grpc_python_out=${PYTHON_OUT} \ --grpc_python_out=${PYTHON_OUT} \
${PROTO_DIR}/docreader.proto ${PROTO_DIR}/docreader.proto
@@ -22,10 +23,10 @@ protoc -I${PROTO_DIR} --go_out=${GO_OUT} \
# Fix Python imports (macOS-compatible version) # Fix Python imports (macOS-compatible version)
if [ "$(uname)" == "Darwin" ]; then if [ "$(uname)" == "Darwin" ]; then
# macOS version # macOS version
sed -i '' 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py sed -i '' 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
else else
# Linux version # Linux version
sed -i 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py sed -i 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
fi fi
echo "Proto files generated successfully!" echo "Proto files generated successfully!"


@@ -0,0 +1,112 @@
import re
from typing import Callable, Dict, List, Match, Pattern, Union
from pydantic import BaseModel, Field
class HeaderTrackerHook(BaseModel):
    """Config class for the header-tracking hook; supports header detection in multiple scenarios"""
    start_pattern: Pattern[str] = Field(
        description="Match for the start of a header (regex or string)"
    )
    end_pattern: Pattern[str] = Field(description="Match for the end of a header (regex or string)")
    extract_header_fn: Callable[[Match[str]], str] = Field(
        default=lambda m: m.group(0),
        description="Function extracting the header text from the start match (defaults to the whole match)",
    )
    priority: int = Field(default=0, description="Priority (with multiple configs, higher priority matches first)")
    case_sensitive: bool = Field(
        default=True, description="Case sensitivity (only effective when a string pattern is passed in)"
    )
def __init__(
self,
start_pattern: Union[str, Pattern[str]],
end_pattern: Union[str, Pattern[str]],
**kwargs,
):
flags = 0 if kwargs.get("case_sensitive", True) else re.IGNORECASE
if isinstance(start_pattern, str):
start_pattern = re.compile(start_pattern, flags | re.DOTALL)
if isinstance(end_pattern, str):
end_pattern = re.compile(end_pattern, flags | re.DOTALL)
super().__init__(
start_pattern=start_pattern,
end_pattern=end_pattern,
**kwargs,
)
# Initialize header hook configs (defaults support Markdown tables and code blocks)
DEFAULT_CONFIGS = [
    # Code block config (starts with ``` and ends with ```)
    # HeaderTrackerHook(
    #     # Code block start (language tag supported)
    #     start_pattern=r"^\s*```(\w+).*(?!```$)",
    #     # Code block end
    #     end_pattern=r"^\s*```.*$",
    #     extract_header_fn=lambda m: f"```{m.group(1)}" if m.group(1) else "```",
    #     priority=20,  # code blocks take priority over tables
    #     case_sensitive=True,
    # ),
    # Markdown table config (header row with separator underneath)
    HeaderTrackerHook(
        # Header row + separator row
        start_pattern=r"^\s*(?:\|[^|\n]*)+[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|?[\r\n]+$",
        # Blank line or non-table content
        end_pattern=r"^\s*$|^\s*[^|\s].*$",
        priority=15,
        case_sensitive=False,
    ),
]
DEFAULT_CONFIGS.sort(key=lambda x: -x.priority)
# Hook state data structure
class HeaderTracker(BaseModel):
    """State class for the header-tracking hook"""
header_hook_configs: List[HeaderTrackerHook] = Field(default=DEFAULT_CONFIGS)
active_headers: Dict[int, str] = Field(default_factory=dict)
ended_headers: set[int] = Field(default_factory=set)
    def update(self, split: str) -> Dict[int, str]:
        """Detect header starts/ends in the current split and update hook state"""
new_headers: Dict[int, str] = {}
        # 1. Check whether any header end marker is present
for config in self.header_hook_configs:
if config.priority in self.active_headers and config.end_pattern.search(
split
):
self.ended_headers.add(config.priority)
del self.active_headers[config.priority]
        # 2. Check for new header start markers (only configs that are neither active nor ended)
for config in self.header_hook_configs:
if (
config.priority not in self.active_headers
and config.priority not in self.ended_headers
):
match = config.start_pattern.search(split)
if match:
header = config.extract_header_fn(match)
self.active_headers[config.priority] = header
new_headers[config.priority] = header
        # 3. If no headers remain active, clear the ended markers
if not self.active_headers:
self.ended_headers.clear()
return new_headers
    def get_headers(self) -> str:
        """Get the concatenated text of all active headers (sorted by priority)"""
        # Sort headers by priority in descending order
sorted_headers = sorted(self.active_headers.items(), key=lambda x: -x[0])
return (
"\n".join([header for _, header in sorted_headers])
if sorted_headers
else ""
)
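A short walkthrough of the hook's lifecycle, assuming splits arrive in document order; the table content is illustrative:

from docreader.splitter.header_hook import HeaderTracker

tracker = HeaderTracker()
tracker.update("| name | age |\n| --- | --- |\n")  # header + separator rows activate the table hook
tracker.update("| alice | 30 |\n")                 # body rows keep it active
print(tracker.get_headers())                       # header text, ready to prefix the next chunk
tracker.update("\n")                               # a blank line ends the table
print(tracker.get_headers())                       # now an empty string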


@@ -0,0 +1,313 @@
"""Token splitter."""
import itertools
import logging
import re
from typing import Callable, Generic, List, Pattern, Tuple, TypeVar
from pydantic import BaseModel, Field, PrivateAttr
from docreader.splitter.header_hook import (
HeaderTracker,
)
from docreader.utils.split import split_by_char, split_by_sep
DEFAULT_CHUNK_OVERLAP = 100
DEFAULT_CHUNK_SIZE = 512
T = TypeVar("T")
logger = logging.getLogger(__name__)
class TextSplitter(BaseModel, Generic[T]):
chunk_size: int = Field(description="The token chunk size for each chunk.")
chunk_overlap: int = Field(
description="The token overlap of each chunk when splitting."
)
separators: List[str] = Field(
description="Default separators for splitting into words"
)
# Try to keep the matched characters as a whole.
# If it's too long, the content will be further segmented.
protected_regex: List[str] = Field(
description="Protected regex for splitting into words"
)
len_function: Callable[[str], int] = Field(description="The length function.")
# Header tracking Hook related attributes
header_hook: HeaderTracker = Field(default_factory=HeaderTracker, exclude=True)
_protected_fns: List[Pattern] = PrivateAttr()
_split_fns: List[Callable] = PrivateAttr()
def __init__(
self,
chunk_size: int = DEFAULT_CHUNK_SIZE,
chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
separators: List[str] = ["\n", "", " "],
protected_regex: List[str] = [
# math formula
r"\$\$[\s\S]*?\$\$",
# image
r"!\[.*?\]\(.*?\)",
# link
r"\[.*?\]\(.*?\)",
# table header
r"(?:\|[^|\n]*)+\|[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|[\r\n]+",
# table body
r"(?:\|[^|\n]*)+\|[\r\n]+",
# code header
r"```(?:\w+)[\r\n]+[^\r\n]*",
],
length_function: Callable[[str], int] = lambda x: len(x),
):
"""Initialize with parameters."""
if chunk_overlap > chunk_size:
raise ValueError(
f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
f"({chunk_size}), should be smaller."
)
super().__init__(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=separators,
protected_regex=protected_regex,
len_function=length_function,
)
self._protected_fns = [re.compile(reg) for reg in protected_regex]
self._split_fns = [split_by_sep(sep) for sep in separators] + [split_by_char()]
    def split_text(self, text: str) -> List[Tuple[int, int, str]]:
        """Split text into chunks."""
        if text == "":
            return []
        splits = self._split(text)
        protect = self._split_protected(text)
        splits = self._join(splits, protect)
        assert "".join(splits) == text
        chunks = self._merge(splits)
        return chunks

    def _split(self, text: str) -> List[str]:
        """Break text into splits that are smaller than chunk size.

        NOTE: the splits contain the separators.
        """
        if self.len_function(text) <= self.chunk_size:
            return [text]

        splits = []
        for split_fn in self._split_fns:
            splits = split_fn(text)
            if len(splits) > 1:
                break

        new_splits = []
        for split in splits:
            split_len = self.len_function(split)
            if split_len <= self.chunk_size:
                new_splits.append(split)
            else:
                # recursively split
                new_splits.extend(self._split(split))
        return new_splits
    def _merge(self, splits: List[str]) -> List[Tuple[int, int, str]]:
        """Merge splits into chunks.

        The high-level idea is to keep adding splits to a chunk until we
        exceed the chunk size, then we start a new chunk with overlap.

        When we start a new chunk, we pop off the first element of the previous
        chunk until the total length is less than the chunk size.
        """
        chunks: List[Tuple[int, int, str]] = []
        cur_chunk: List[Tuple[int, int, str]] = []
        cur_headers, cur_len = "", 0
        cur_start, cur_end = 0, 0
        for split in splits:
            cur_end = cur_start + len(split)
            split_len = self.len_function(split)
            if split_len > self.chunk_size:
                logger.error(
                    f"Got a split of size {split_len}, "
                    f"larger than chunk size {self.chunk_size}."
                )

            self.header_hook.update(split)
            cur_headers = self.header_hook.get_headers()
            cur_headers_len = self.len_function(cur_headers)
            if cur_headers_len > self.chunk_size:
                logger.error(
                    f"Got headers of size {cur_headers_len}, "
                    f"larger than chunk size {self.chunk_size}."
                )
                cur_headers, cur_headers_len = "", 0

            # if we exceed the chunk size after adding the new split, then
            # we need to end the current chunk and start a new one
            if cur_len + split_len + cur_headers_len > self.chunk_size:
                # end the previous chunk
                if len(cur_chunk) > 0:
                    chunks.append(
                        (
                            cur_chunk[0][0],
                            cur_chunk[-1][1],
                            "".join([c[2] for c in cur_chunk]),
                        )
                    )
                # start a new chunk with overlap
                # keep popping off the first element of the previous chunk until:
                #   1. the current chunk length is less than chunk overlap
                #   2. the total length is less than chunk size
                while cur_chunk and (
                    cur_len > self.chunk_overlap
                    or cur_len + split_len + cur_headers_len > self.chunk_size
                ):
                    # pop off the first element
                    first_chunk = cur_chunk.pop(0)
                    cur_len -= self.len_function(first_chunk[2])
                if (
                    cur_headers
                    and split_len + cur_headers_len < self.chunk_size
                    and cur_headers not in split
                ):
                    cur_chunk.insert(
                        0,
                        (
                            cur_chunk[0][0] if cur_chunk else cur_start,
                            cur_chunk[0][1] if cur_chunk else cur_end,
                            cur_headers,
                        ),
                    )
                    cur_len += cur_headers_len
            cur_chunk.append((cur_start, cur_end, split))
            cur_len += split_len
            cur_start = cur_end

        # handle the last chunk
        assert cur_chunk
        if cur_headers and cur_len < self.chunk_size:
            cur_chunk.insert(0, (cur_chunk[0][0], cur_chunk[0][1], cur_headers))
        chunks.append(
            (
                cur_chunk[0][0],
                cur_chunk[-1][1],
                "".join([c[2] for c in cur_chunk]),
            )
        )
        return chunks
    def _split_protected(self, text: str) -> List[Tuple[int, str]]:
        matches = [
            (match.start(), match.end())
            for pattern in self._protected_fns
            for match in pattern.finditer(text)
        ]
        matches.sort(key=lambda x: (x[0], -x[1]))
        res = []

        def fold(initial: int, current: Tuple[int, int]) -> int:
            if current[0] >= initial:
                if current[1] - current[0] < self.chunk_size:
                    res.append((current[0], text[current[0] : current[1]]))
                else:
                    logger.warning(f"Protected text ignored (too long): {current}")
            return max(initial, current[1])

        # filter overlapping matches
        list(itertools.accumulate(matches, fold, initial=-1))
        return res
    def _join(self, splits: List[str], protect: List[Tuple[int, str]]) -> List[str]:
        """
        Merges and splits elements in the splits array based on protected substrings.

        The function processes the input splits to ensure all protected substrings
        remain as single items. If a protected substring is concatenated with preceding
        or following content in any split element, it will be separated from
        the adjacent content. The final result maintains the original order of content
        while enforcing the integrity of protected substrings.

        Key behaviors:
        1. Preserves the complete structure of each protected substring
        2. Separates protected substrings from any adjacent non-protected content
        3. Maintains the original sequence of all content, except for the
           separations needed around protected substrings
        4. Handles cases where protected substrings are partially concatenated
        """
        j = 0
        point, start = 0, 0
        res = []
        for split in splits:
            end = start + len(split)
            cur = split[point - start :]
            while j < len(protect):
                p_start, p_content = protect[j]
                p_end = p_start + len(p_content)
                if end <= p_start:
                    break
                if point < p_start:
                    local_end = p_start - point
                    res.append(cur[:local_end])
                    cur = cur[local_end:]
                    point = p_start
                res.append(p_content)
                j += 1
                if point < p_end:
                    local_start = p_end - point
                    cur = cur[local_start:]
                    point = p_end
                if not cur:
                    break
            if cur:
                res.append(cur)
            point = end
            start = end
        return res
if __name__ == "__main__":
    s = """
这是一些普通文本。
| 姓名 | 年龄 | 城市 |
|------|------|------|
| 张三 | 25 | 北京 |
| 李四 | 30 | 上海 |
| 王五 | 28 | 广州 |
| 张三 | 25 | 北京 |
| 李四 | 30 | 上海 |
| 王五 | 28 | 广州 |
这是文本结束。
"""
    sp = TextSplitter(chunk_size=200, chunk_overlap=2)
    ck = sp.split_text(s)
    for c in ck:
        print("------", len(c))
        print(c)
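The protected patterns are easiest to see in action; a small sketch (the module path of TextSplitter is assumed here):

from docreader.splitter.splitter import TextSplitter  # module path assumed

sp = TextSplitter(chunk_size=20, chunk_overlap=5)
for start, end, chunk in sp.split_text("intro text ![](img.png) trailing tail"):
    print(start, end, repr(chunk))
# The image token ![](img.png) matches a protected regex, so it is kept as
# one unbroken unit even though the chunk size is tiny.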

docreader/utils/endecode.py Normal file

@@ -0,0 +1,103 @@
import base64
import binascii
import io
import logging
from typing import List, Union

import numpy as np
from PIL import Image

logger = logging.getLogger(__name__)


def decode_image(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
    """Convert image to base64 encoded string

    Args:
        image: Image file path, bytes, PIL Image object, or numpy array

    Returns:
        Base64 encoded image string

    Raises:
        ValueError: If the image type is unsupported
    """
    if isinstance(image, str):
        # It's a file path
        with open(image, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode()
    elif isinstance(image, bytes):
        # It's bytes data
        return base64.b64encode(image).decode()
    elif isinstance(image, Image.Image):
        # It's a PIL Image
        buffer = io.BytesIO()
        image.save(buffer, format=image.format)
        return base64.b64encode(buffer.getvalue()).decode()
    elif isinstance(image, np.ndarray):
        # It's a numpy array
        pil_image = Image.fromarray(image)
        buffer = io.BytesIO()
        pil_image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode()
    raise ValueError(f"Unsupported image type: {type(image)}")
def encode_image(image: str, errors="strict") -> bytes:
    """
    Decode a base64-encoded image string into raw bytes.

    errors
        The error handling scheme to use for decoding errors.
        The default is 'strict', meaning that invalid base64 input raises
        binascii.Error. 'ignore' returns empty bytes instead of raising.
    """
    try:
        image_bytes = base64.b64decode(image)
    except binascii.Error as e:
        if errors == "ignore":
            return b""
        else:
            raise e
    return image_bytes
def encode_bytes(content: str) -> bytes:
    return content.encode()


def decode_bytes(
    content: bytes,
    encodings: List[str] = [
        "utf-8",
        "gb18030",
        "gb2312",
        "gbk",
        "big5",
        "ascii",
        "latin-1",
    ],
) -> str:
    # Try decoding with each encoding format
    for encoding in encodings:
        try:
            text = content.decode(encoding)
            logger.debug(f"Decode content with {encoding}: {len(text)} characters")
            return text
        except UnicodeDecodeError:
            continue
    text = content.decode(encoding="latin-1", errors="replace")
    logger.warning(
        "Unable to determine correct encoding, using latin-1 as fallback. "
        "This may cause character issues."
    )
    return text
if __name__ == "__main__":
    img = "test![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMgA)test"
    # The string as a whole is not valid base64, so with errors="ignore"
    # this returns b"" instead of raising binascii.Error
    encode_image(img, errors="ignore")
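A quick round trip through these helpers:

from docreader.utils.endecode import decode_bytes, decode_image, encode_image

b64 = decode_image(b"\x89PNG fake bytes")          # bytes -> base64 string
assert encode_image(b64) == b"\x89PNG fake bytes"  # base64 string -> bytes
# decode_bytes walks the encoding list until one decodes cleanly
print(decode_bytes("你好".encode("gb18030")))  # -> 你好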


@@ -1,10 +1,10 @@
-from contextvars import ContextVar
-import logging
-import uuid
import contextlib
+import logging
import time
-from typing import Optional
+import uuid
+from contextvars import ContextVar
from logging import LogRecord
+from typing import Optional

# Configure logging
logger = logging.getLogger(__name__)
@@ -26,21 +26,21 @@ def get_request_id() -> Optional[str]:

class MillisecondFormatter(logging.Formatter):
    """Custom formatter that shows millisecond timestamps (3 digits) instead of microseconds (6)."""

    def formatTime(self, record, datefmt=None):
        """Override formatTime to truncate microseconds down to milliseconds."""
        # Get the standard formatted time first
        result = super().formatTime(record, datefmt)
        # If the format contains .%f, truncate the 6-digit microseconds to 3-digit milliseconds
        if datefmt and ".%f" in datefmt:
            # The formatted time string should end with 6 microsecond digits
-            parts = result.split('.')
+            parts = result.split(".")
            if len(parts) > 1 and len(parts[1]) >= 6:
                # Keep only the first 3 digits as milliseconds
                millis = parts[1][:3]
                result = f"{parts[0]}.{millis}"
        return result
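The truncation rule from the hunk above, shown in isolation:

result = "10:30:02.123456"  # a formatted timestamp with 6 microsecond digits
parts = result.split(".")
if len(parts) > 1 and len(parts[1]) >= 6:
    result = f"{parts[0]}.{parts[1][:3]}"
print(result)  # 10:30:02.123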

docreader/utils/split.py Normal file

@@ -0,0 +1,34 @@
import re
from typing import Callable, List


def split_text_keep_separator(text: str, separator: str) -> List[str]:
    """Split text by separator, keeping the separator attached to the following split."""
    parts = text.split(separator)
    result = [separator + s if i > 0 else s for i, s in enumerate(parts)]
    return [s for s in result if s]


def split_by_sep(sep: str, keep_sep: bool = True) -> Callable[[str], List[str]]:
    """Split text by separator."""
    if keep_sep:
        return lambda text: split_text_keep_separator(text, sep)
    else:
        return lambda text: text.split(sep)


def split_by_char() -> Callable[[str], List[str]]:
    """Split text by character."""
    return lambda text: list(text)


def split_by_regex(regex: str) -> Callable[[str], List[str]]:
    """Split text by regex, keeping the matched separators as items."""
    pattern = re.compile(f"({regex})")
    return lambda text: list(filter(None, pattern.split(text)))


def match_by_regex(regex: str) -> Callable[[str], bool]:
    """Match text against a regex."""
    pattern = re.compile(regex)
    return lambda text: bool(pattern.match(text))
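These helpers form the splitter's fallback chain; for instance:

from docreader.utils.split import split_by_char, split_by_regex, split_by_sep

print(split_by_sep("\n")("a\nb"))      # ['a', '\nb']: separator stays with the next piece
print(split_by_char()("ab"))           # ['a', 'b']
print(split_by_regex(r"\d+")("a12b"))  # ['a', '12', 'b']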


@@ -0,0 +1,77 @@
import logging
import os
import tempfile

logger = logging.getLogger(__name__)


class TempFileContext:
    def __init__(self, file_content: bytes, suffix: str):
        """
        Initialize the context
        :param file_content: Byte data to write to file
        :param suffix: File suffix
        """
        self.file_content = file_content
        self.suffix = suffix
        self.temp_file = None

    def __enter__(self):
        """
        Create file when entering context
        """
        self.temp_file = tempfile.NamedTemporaryFile(suffix=self.suffix, delete=False)
        self.temp_file.write(self.file_content)
        self.temp_file.flush()
        logger.info(
            f"Saved {self.suffix} content to temporary file: {self.temp_file.name}"
        )
        return self.temp_file.name

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Delete file when exiting context
        """
        if self.temp_file:
            self.temp_file.close()
            if os.path.exists(self.temp_file.name):
                os.remove(self.temp_file.name)
                logger.info(f"File {self.temp_file.name} has been deleted.")
        # Return False to propagate exception (if any exception occurred)
        return False
class TempDirContext:
    def __init__(self):
        """
        Initialize the context
        """
        self.temp_dir = None

    def __enter__(self):
        """
        Create directory when entering context
        """
        self.temp_dir = tempfile.TemporaryDirectory()
        logger.info(f"Created temporary directory: {self.temp_dir.name}")
        return self.temp_dir.name

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Delete directory when exiting context
        """
        if self.temp_dir and os.path.exists(self.temp_dir.name):
            self.temp_dir.cleanup()
            logger.info(f"Directory {self.temp_dir.name} has been deleted.")
        # Return False to propagate exception (if any exception occurred)
        return False


if __name__ == "__main__":
    example_bytes = b"Hello, this is a test file."
    suffix = ".txt"
    # Using with statement
    with TempFileContext(example_bytes, suffix) as temp_file:
        # File operations can be performed within the context
        print(f"Does file {temp_file} exist: {os.path.exists(temp_file)}")

docreader/uv.lock generated

@@ -6,17 +6,22 @@
resolution-markers = [
    "python_full_version == '3.13.*' and sys_platform == 'darwin'",
    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
    "python_full_version == '3.12.*' and sys_platform == 'darwin'",
    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
    "python_full_version == '3.11.*' and sys_platform == 'darwin'",
    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
    "python_full_version < '3.11' and sys_platform == 'darwin'",
    "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version < '3.11' and sys_platform == 'win32'",
+    "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]

[[package]]
@@ -423,6 +428,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/db/d3/9dcc0f5797f070ec8edf30fbadfb200e71d9db6b84d211e3b2085a7589a0/click-8.3.0-py3-none-any.whl", hash = "sha256:9b9f285302c6e3064f4330c05f05b81945b2a39544279343e6e7c5f27a9baddc", size = 107295, upload-time = "2025-09-18T17:32:22.42Z" }, { url = "https://files.pythonhosted.org/packages/db/d3/9dcc0f5797f070ec8edf30fbadfb200e71d9db6b84d211e3b2085a7589a0/click-8.3.0-py3-none-any.whl", hash = "sha256:9b9f285302c6e3064f4330c05f05b81945b2a39544279343e6e7c5f27a9baddc", size = 107295, upload-time = "2025-09-18T17:32:22.42Z" },
] ]
[[package]]
name = "cobble"
version = "0.1.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/54/7a/a507c709be2c96e1bb6102eb7b7f4026c5e5e223ef7d745a17d239e9d844/cobble-0.1.4.tar.gz", hash = "sha256:de38be1539992c8a06e569630717c485a5f91be2192c461ea2b220607dfa78aa", size = 3805, upload-time = "2024-06-01T18:11:09.528Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d5/e1/3714a2f371985215c219c2a70953d38e3eed81ef165aed061d21de0e998b/cobble-0.1.4-py3-none-any.whl", hash = "sha256:36c91b1655e599fd428e2b95fdd5f0da1ca2e9f1abb0bc871dec21a0e78a2b44", size = 3984, upload-time = "2024-06-01T18:11:07.911Z" },
]
[[package]]
name = "colorama"
version = "0.4.6"
@@ -432,6 +446,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
] ]
[[package]]
name = "coloredlogs"
version = "15.0.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "humanfriendly" },
]
sdist = { url = "https://files.pythonhosted.org/packages/cc/c7/eed8f27100517e8c0e6b923d5f0845d0cb99763da6fdee00478f91db7325/coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0", size = 278520, upload-time = "2021-06-11T10:22:45.202Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018, upload-time = "2021-06-11T10:22:42.561Z" },
]
[[package]]
name = "cos-python-sdk-v5"
version = "1.9.38"
@@ -587,6 +613,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/18/d5/7a04640bf559bb890455ffb28978daf7d44f667c3f04a4d422c655c1ba92/cython-3.1.6-py3-none-any.whl", hash = "sha256:91dcf7eb9b6a089ce4e9e1140e571d84c3bca834afb77ec269be7aa9d31a8157", size = 1223550, upload-time = "2025-10-23T12:38:16.732Z" }, { url = "https://files.pythonhosted.org/packages/18/d5/7a04640bf559bb890455ffb28978daf7d44f667c3f04a4d422c655c1ba92/cython-3.1.6-py3-none-any.whl", hash = "sha256:91dcf7eb9b6a089ce4e9e1140e571d84c3bca834afb77ec269be7aa9d31a8157", size = 1223550, upload-time = "2025-10-23T12:38:16.732Z" },
] ]
[[package]]
name = "defusedxml"
version = "0.7.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/0f/d5/c66da9b79e5bdb124974bfe172b4daf3c984ebd9c2a06e2b8a4dc7331c72/defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69", size = 75520, upload-time = "2021-03-08T10:59:26.269Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" },
]
[[package]]
name = "distro"
version = "1.9.0"
@@ -612,6 +647,7 @@ dependencies = [
{ name = "lxml" }, { name = "lxml" },
{ name = "markdown" }, { name = "markdown" },
{ name = "markdownify" }, { name = "markdownify" },
{ name = "markitdown", extra = ["docx", "pdf", "xls", "xlsx"] },
{ name = "minio" }, { name = "minio" },
{ name = "mistletoe" }, { name = "mistletoe" },
{ name = "ollama" }, { name = "ollama" },
@@ -622,6 +658,7 @@ dependencies = [
{ name = "pillow" }, { name = "pillow" },
{ name = "playwright" }, { name = "playwright" },
{ name = "protobuf" }, { name = "protobuf" },
{ name = "pydantic" },
{ name = "pypdf" }, { name = "pypdf" },
{ name = "pypdf2" }, { name = "pypdf2" },
{ name = "python-docx" }, { name = "python-docx" },
@@ -643,6 +680,7 @@ requires-dist = [
{ name = "lxml", specifier = ">=6.0.2" }, { name = "lxml", specifier = ">=6.0.2" },
{ name = "markdown", specifier = ">=3.10" }, { name = "markdown", specifier = ">=3.10" },
{ name = "markdownify", specifier = ">=1.2.0" }, { name = "markdownify", specifier = ">=1.2.0" },
{ name = "markitdown", extras = ["docx", "pdf", "xls", "xlsx"], specifier = ">=0.1.3" },
{ name = "minio", specifier = ">=7.2.18" }, { name = "minio", specifier = ">=7.2.18" },
{ name = "mistletoe", specifier = ">=1.5.0" }, { name = "mistletoe", specifier = ">=1.5.0" },
{ name = "ollama", specifier = ">=0.6.0" }, { name = "ollama", specifier = ">=0.6.0" },
@@ -653,6 +691,7 @@ requires-dist = [
{ name = "pillow", specifier = ">=12.0.0" }, { name = "pillow", specifier = ">=12.0.0" },
{ name = "playwright", specifier = ">=1.55.0" }, { name = "playwright", specifier = ">=1.55.0" },
{ name = "protobuf", specifier = ">=6.33.0" }, { name = "protobuf", specifier = ">=6.33.0" },
{ name = "pydantic", specifier = ">=2.12.3" },
{ name = "pypdf", specifier = ">=6.1.3" }, { name = "pypdf", specifier = ">=6.1.3" },
{ name = "pypdf2", specifier = ">=3.0.1" }, { name = "pypdf2", specifier = ">=3.0.1" },
{ name = "python-docx", specifier = ">=1.2.0" }, { name = "python-docx", specifier = ">=1.2.0" },
@@ -683,6 +722,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/bf/ee/aa015c5de8b0dc42a8e507eae8c2de5d1c0e068c896858fec6d502402ed6/ebooklib-0.20-py3-none-any.whl", hash = "sha256:fff5322517a37e31c972d27be7d982cc3928c16b3dcc5fd7e8f7c0f5d7bcf42b", size = 40995, upload-time = "2025-10-26T20:56:19.104Z" }, { url = "https://files.pythonhosted.org/packages/bf/ee/aa015c5de8b0dc42a8e507eae8c2de5d1c0e068c896858fec6d502402ed6/ebooklib-0.20-py3-none-any.whl", hash = "sha256:fff5322517a37e31c972d27be7d982cc3928c16b3dcc5fd7e8f7c0f5d7bcf42b", size = 40995, upload-time = "2025-10-26T20:56:19.104Z" },
] ]
[[package]]
name = "et-xmlfile"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" },
]
[[package]]
name = "exceptiongroup"
version = "1.3.0"
@@ -707,6 +755,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e5/4c/93d0f85318da65923e4b91c1c2ff03d8a458cbefebe3bc612a6693c7906d/fire-0.7.1-py3-none-any.whl", hash = "sha256:e43fd8a5033a9001e7e2973bab96070694b9f12f2e0ecf96d4683971b5ab1882", size = 115945, upload-time = "2025-08-16T20:20:22.87Z" }, { url = "https://files.pythonhosted.org/packages/e5/4c/93d0f85318da65923e4b91c1c2ff03d8a458cbefebe3bc612a6693c7906d/fire-0.7.1-py3-none-any.whl", hash = "sha256:e43fd8a5033a9001e7e2973bab96070694b9f12f2e0ecf96d4683971b5ab1882", size = 115945, upload-time = "2025-08-16T20:20:22.87Z" },
] ]
[[package]]
name = "flatbuffers"
version = "25.9.23"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/9d/1f/3ee70b0a55137442038f2a33469cc5fddd7e0ad2abf83d7497c18a2b6923/flatbuffers-25.9.23.tar.gz", hash = "sha256:676f9fa62750bb50cf531b42a0a2a118ad8f7f797a511eda12881c016f093b12", size = 22067, upload-time = "2025-09-24T05:25:30.106Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ee/1b/00a78aa2e8fbd63f9af08c9c19e6deb3d5d66b4dda677a0f61654680ee89/flatbuffers-25.9.23-py2.py3-none-any.whl", hash = "sha256:255538574d6cb6d0a79a17ec8bc0d30985913b87513a01cce8bcdb6b4c44d0e2", size = 30869, upload-time = "2025-09-24T05:25:28.912Z" },
]
[[package]]
name = "fonttools"
version = "4.60.1"
@@ -850,6 +907,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/7f/91/ae2eb6b7979e2f9b035a9f612cf70f1bf54aad4e1d125129bef1eae96f19/greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d", size = 584358, upload-time = "2025-08-07T13:18:23.708Z" }, { url = "https://files.pythonhosted.org/packages/7f/91/ae2eb6b7979e2f9b035a9f612cf70f1bf54aad4e1d125129bef1eae96f19/greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d", size = 584358, upload-time = "2025-08-07T13:18:23.708Z" },
{ url = "https://files.pythonhosted.org/packages/f7/85/433de0c9c0252b22b16d413c9407e6cb3b41df7389afc366ca204dbc1393/greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5", size = 1113550, upload-time = "2025-08-07T13:42:37.467Z" }, { url = "https://files.pythonhosted.org/packages/f7/85/433de0c9c0252b22b16d413c9407e6cb3b41df7389afc366ca204dbc1393/greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5", size = 1113550, upload-time = "2025-08-07T13:42:37.467Z" },
{ url = "https://files.pythonhosted.org/packages/a1/8d/88f3ebd2bc96bf7747093696f4335a0a8a4c5acfcf1b757717c0d2474ba3/greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f", size = 1137126, upload-time = "2025-08-07T13:18:20.239Z" }, { url = "https://files.pythonhosted.org/packages/a1/8d/88f3ebd2bc96bf7747093696f4335a0a8a4c5acfcf1b757717c0d2474ba3/greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f", size = 1137126, upload-time = "2025-08-07T13:18:20.239Z" },
{ url = "https://files.pythonhosted.org/packages/f1/29/74242b7d72385e29bcc5563fba67dad94943d7cd03552bac320d597f29b2/greenlet-3.2.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f47617f698838ba98f4ff4189aef02e7343952df3a615f847bb575c3feb177a7", size = 1544904, upload-time = "2025-11-04T12:42:04.763Z" },
{ url = "https://files.pythonhosted.org/packages/c8/e2/1572b8eeab0f77df5f6729d6ab6b141e4a84ee8eb9bc8c1e7918f94eda6d/greenlet-3.2.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af41be48a4f60429d5cad9d22175217805098a9ef7c40bfef44f7669fb9d74d8", size = 1611228, upload-time = "2025-11-04T12:42:08.423Z" },
{ url = "https://files.pythonhosted.org/packages/d6/6f/b60b0291d9623c496638c582297ead61f43c4b72eef5e9c926ef4565ec13/greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c", size = 298654, upload-time = "2025-08-07T13:50:00.469Z" }, { url = "https://files.pythonhosted.org/packages/d6/6f/b60b0291d9623c496638c582297ead61f43c4b72eef5e9c926ef4565ec13/greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c", size = 298654, upload-time = "2025-08-07T13:50:00.469Z" },
{ url = "https://files.pythonhosted.org/packages/a4/de/f28ced0a67749cac23fecb02b694f6473f47686dff6afaa211d186e2ef9c/greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2", size = 272305, upload-time = "2025-08-07T13:15:41.288Z" }, { url = "https://files.pythonhosted.org/packages/a4/de/f28ced0a67749cac23fecb02b694f6473f47686dff6afaa211d186e2ef9c/greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2", size = 272305, upload-time = "2025-08-07T13:15:41.288Z" },
{ url = "https://files.pythonhosted.org/packages/09/16/2c3792cba130000bf2a31c5272999113f4764fd9d874fb257ff588ac779a/greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246", size = 632472, upload-time = "2025-08-07T13:42:55.044Z" }, { url = "https://files.pythonhosted.org/packages/09/16/2c3792cba130000bf2a31c5272999113f4764fd9d874fb257ff588ac779a/greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246", size = 632472, upload-time = "2025-08-07T13:42:55.044Z" },
@@ -859,6 +918,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684, upload-time = "2025-08-07T13:18:25.164Z" }, { url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684, upload-time = "2025-08-07T13:18:25.164Z" },
{ url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647, upload-time = "2025-08-07T13:42:38.655Z" }, { url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647, upload-time = "2025-08-07T13:42:38.655Z" },
{ url = "https://files.pythonhosted.org/packages/3f/cc/b07000438a29ac5cfb2194bfc128151d52f333cee74dd7dfe3fb733fc16c/greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa", size = 1142073, upload-time = "2025-08-07T13:18:21.737Z" }, { url = "https://files.pythonhosted.org/packages/3f/cc/b07000438a29ac5cfb2194bfc128151d52f333cee74dd7dfe3fb733fc16c/greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa", size = 1142073, upload-time = "2025-08-07T13:18:21.737Z" },
{ url = "https://files.pythonhosted.org/packages/67/24/28a5b2fa42d12b3d7e5614145f0bd89714c34c08be6aabe39c14dd52db34/greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c", size = 1548385, upload-time = "2025-11-04T12:42:11.067Z" },
{ url = "https://files.pythonhosted.org/packages/6a/05/03f2f0bdd0b0ff9a4f7b99333d57b53a7709c27723ec8123056b084e69cd/greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5", size = 1613329, upload-time = "2025-11-04T12:42:12.928Z" },
{ url = "https://files.pythonhosted.org/packages/d8/0f/30aef242fcab550b0b3520b8e3561156857c94288f0332a79928c31a52cf/greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9", size = 299100, upload-time = "2025-08-07T13:44:12.287Z" }, { url = "https://files.pythonhosted.org/packages/d8/0f/30aef242fcab550b0b3520b8e3561156857c94288f0332a79928c31a52cf/greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9", size = 299100, upload-time = "2025-08-07T13:44:12.287Z" },
{ url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" },
{ url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" },
@@ -868,6 +929,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" },
{ url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" },
{ url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" }, { url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" },
{ url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" },
{ url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" },
{ url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" }, { url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" },
{ url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" },
{ url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" },
@@ -877,6 +940,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" },
{ url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" },
{ url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" }, { url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" },
{ url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" },
{ url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" },
{ url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" }, { url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" },
{ url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" },
{ url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" },
@@ -884,6 +949,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" },
{ url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" },
{ url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" },
{ url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" },
{ url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760, upload-time = "2025-11-04T12:42:25.341Z" },
{ url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" }, { url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" },
] ]
@@ -1061,6 +1128,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
] ]
[[package]]
name = "humanfriendly"
version = "10.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyreadline3", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/cc/3f/2c29224acb2e2df4d2046e4c73ee2662023c58ff5b113c4c1adac0886c43/humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc", size = 360702, upload-time = "2021-09-17T21:40:43.31Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", size = 86794, upload-time = "2021-09-17T21:40:39.897Z" },
]
[[package]]
name = "idna"
version = "3.11"
@@ -1386,6 +1465,38 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6c/77/d7f491cbc05303ac6801651aabeb262d43f319288c1ea96c66b1d2692ff3/lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e", size = 3518768, upload-time = "2025-09-22T04:04:57.097Z" }, { url = "https://files.pythonhosted.org/packages/6c/77/d7f491cbc05303ac6801651aabeb262d43f319288c1ea96c66b1d2692ff3/lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e", size = 3518768, upload-time = "2025-09-22T04:04:57.097Z" },
] ]
[[package]]
name = "magika"
version = "0.6.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "onnxruntime", version = "1.20.1", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "onnxruntime", version = "1.23.2", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "python-dotenv" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a3/f3/3d1dcdd7b9c41d589f5cff252d32ed91cdf86ba84391cfc81d9d8773571d/magika-0.6.3.tar.gz", hash = "sha256:7cc52aa7359af861957043e2bf7265ed4741067251c104532765cd668c0c0cb1", size = 3042784, upload-time = "2025-10-30T15:22:34.499Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a2/e4/35c323beb3280482c94299d61626116856ac2d4ec16ecef50afc4fdd4291/magika-0.6.3-py3-none-any.whl", hash = "sha256:eda443d08006ee495e02083b32e51b98cb3696ab595a7d13900d8e2ef506ec9d", size = 2969474, upload-time = "2025-10-30T15:22:25.298Z" },
{ url = "https://files.pythonhosted.org/packages/25/8f/132b0d7cd51c02c39fd52658a5896276c30c8cc2fd453270b19db8c40f7e/magika-0.6.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:86901e64b05dde5faff408c9b8245495b2e1fd4c226e3393d3d2a3fee65c504b", size = 13358841, upload-time = "2025-10-30T15:22:27.413Z" },
{ url = "https://files.pythonhosted.org/packages/c4/03/5ed859be502903a68b7b393b17ae0283bf34195cfcca79ce2dc25b9290e7/magika-0.6.3-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:3d9661eedbdf445ac9567e97e7ceefb93545d77a6a32858139ea966b5806fb64", size = 15367335, upload-time = "2025-10-30T15:22:29.907Z" },
{ url = "https://files.pythonhosted.org/packages/7b/9e/f8ee7d644affa3b80efdd623a3d75865c8f058f3950cb87fb0c48e3559bc/magika-0.6.3-py3-none-win_amd64.whl", hash = "sha256:e57f75674447b20cab4db928ae58ab264d7d8582b55183a0b876711c2b2787f3", size = 12692831, upload-time = "2025-10-30T15:22:32.063Z" },
]
[[package]]
name = "mammoth"
version = "1.11.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "cobble" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ed/3c/a58418d2af00f2da60d4a51e18cd0311307b72d48d2fffec36a97b4a5e44/mammoth-1.11.0.tar.gz", hash = "sha256:a0f59e442f34d5b6447f4b0999306cbf3e67aaabfa8cb516f878fb1456744637", size = 53142, upload-time = "2025-09-19T10:35:20.373Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ca/54/2e39566a131b13f6d8d193f974cb6a34e81bb7cc2fa6f7e03de067b36588/mammoth-1.11.0-py2.py3-none-any.whl", hash = "sha256:c077ab0d450bd7c0c6ecd529a23bf7e0fa8190c929e28998308ff4eada3f063b", size = 54752, upload-time = "2025-09-19T10:35:18.699Z" },
]
[[package]]
name = "markdown"
version = "3.10"
@@ -1408,6 +1519,41 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6a/e2/7af643acb4cae0741dffffaa7f3f7c9e7ab4046724543ba1777c401d821c/markdownify-1.2.0-py3-none-any.whl", hash = "sha256:48e150a1c4993d4d50f282f725c0111bd9eb25645d41fa2f543708fd44161351", size = 15561, upload-time = "2025-08-09T17:44:14.074Z" }, { url = "https://files.pythonhosted.org/packages/6a/e2/7af643acb4cae0741dffffaa7f3f7c9e7ab4046724543ba1777c401d821c/markdownify-1.2.0-py3-none-any.whl", hash = "sha256:48e150a1c4993d4d50f282f725c0111bd9eb25645d41fa2f543708fd44161351", size = 15561, upload-time = "2025-08-09T17:44:14.074Z" },
] ]
[[package]]
name = "markitdown"
version = "0.1.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "charset-normalizer" },
{ name = "defusedxml" },
{ name = "magika" },
{ name = "markdownify" },
{ name = "onnxruntime", version = "1.20.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'win32'" },
{ name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/87/31/90cef2bc8ecd85c200ed3b3d1e20fc7a724213502685c4b05b5431e02668/markitdown-0.1.3.tar.gz", hash = "sha256:b0d9127c3373a68274dede6af6c9bb0684b78ce364c727c4c304da97a20d6fd9", size = 40039, upload-time = "2025-08-26T22:37:04.4Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/97/83/7b47d2ecbf58650a03aeeb21ba2d59175f202bf4fb81d44f40f1deb82bc0/markitdown-0.1.3-py3-none-any.whl", hash = "sha256:08d9a25770979d78f60dcc0afcb868de6799608e4db65342b2e03304fb091251", size = 58391, upload-time = "2025-08-26T22:37:02.924Z" },
]
[package.optional-dependencies]
docx = [
{ name = "lxml" },
{ name = "mammoth" },
]
pdf = [
{ name = "pdfminer-six" },
]
xls = [
{ name = "pandas" },
{ name = "xlrd" },
]
xlsx = [
{ name = "openpyxl" },
{ name = "pandas" },
]
[[package]]
name = "minio"
version = "7.2.18"
@@ -1433,6 +1579,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3f/c2/bd0fc48dc323035cd65d21c45a3bb5c7b054001bf4ea75e461beb7656e07/mistletoe-1.5.0-py3-none-any.whl", hash = "sha256:d4e77b991b998c5efe3c4eab4e1b472263b5349688acd50d79bd9a6c317a9df1", size = 55262, upload-time = "2025-10-18T16:37:08.376Z" }, { url = "https://files.pythonhosted.org/packages/3f/c2/bd0fc48dc323035cd65d21c45a3bb5c7b054001bf4ea75e461beb7656e07/mistletoe-1.5.0-py3-none-any.whl", hash = "sha256:d4e77b991b998c5efe3c4eab4e1b472263b5349688acd50d79bd9a6c317a9df1", size = 55262, upload-time = "2025-10-18T16:37:08.376Z" },
] ]
[[package]]
name = "mpmath"
version = "1.3.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" },
]
[[package]]
name = "networkx"
version = "3.4.2"
@@ -1440,7 +1595,8 @@
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
    "python_full_version < '3.11' and sys_platform == 'darwin'",
    "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version < '3.11' and sys_platform == 'win32'",
+    "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" }
wheels = [
@@ -1456,14 +1612,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'", "python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'", "python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'", "python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
] ]
sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" } sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" }
wheels = [ wheels = [
@@ -1492,7 +1652,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" }
wheels = [
@@ -1561,14 +1722,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'", "python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'", "python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'", "python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
] ]
sdist = { url = "https://files.pythonhosted.org/packages/b5/f4/098d2270d52b41f1bd7db9fc288aaa0400cb48c2a3e2af6fa365d9720947/numpy-2.3.4.tar.gz", hash = "sha256:a7d018bfedb375a8d979ac758b120ba846a7fe764911a64465fd87b8729f4a6a", size = 20582187, upload-time = "2025-10-15T16:18:11.77Z" } sdist = { url = "https://files.pythonhosted.org/packages/b5/f4/098d2270d52b41f1bd7db9fc288aaa0400cb48c2a3e2af6fa365d9720947/numpy-2.3.4.tar.gz", hash = "sha256:a7d018bfedb375a8d979ac758b120ba846a7fe764911a64465fd87b8729f4a6a", size = 20582187, upload-time = "2025-10-15T16:18:11.77Z" }
wheels = [ wheels = [
@@ -1660,6 +1825,97 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b5/c1/edc9f41b425ca40b26b7c104c5f6841a4537bb2552bfa6ca66e81405bb95/ollama-0.6.0-py3-none-any.whl", hash = "sha256:534511b3ccea2dff419ae06c3b58d7f217c55be7897c8ce5868dfb6b219cf7a0", size = 14130, upload-time = "2025-09-24T22:46:01.19Z" }, { url = "https://files.pythonhosted.org/packages/b5/c1/edc9f41b425ca40b26b7c104c5f6841a4537bb2552bfa6ca66e81405bb95/ollama-0.6.0-py3-none-any.whl", hash = "sha256:534511b3ccea2dff419ae06c3b58d7f217c55be7897c8ce5868dfb6b219cf7a0", size = 14130, upload-time = "2025-09-24T22:46:01.19Z" },
] ]
[[package]]
name = "onnxruntime"
version = "1.20.1"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "coloredlogs", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "flatbuffers", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "packaging", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "protobuf", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "sympy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/6d/c6/c4c0860bee2fde6037bdd9dcd12d323f6e38cf00fcc9a5065b394337fc55/onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7b2908b50101a19e99c4d4e97ebb9905561daf61829403061c1adc1b588bc0de", size = 11954010, upload-time = "2024-11-21T00:48:35.254Z" },
{ url = "https://files.pythonhosted.org/packages/63/47/3dc0b075ab539f16b3d8b09df6b504f51836086ee709690a6278d791737d/onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d82daaec24045a2e87598b8ac2b417b1cce623244e80e663882e9fe1aae86410", size = 13330452, upload-time = "2024-11-21T00:48:40.02Z" },
{ url = "https://files.pythonhosted.org/packages/27/ef/80fab86289ecc01a734b7ddf115dfb93d8b2e004bd1e1977e12881c72b12/onnxruntime-1.20.1-cp310-cp310-win32.whl", hash = "sha256:4c4b251a725a3b8cf2aab284f7d940c26094ecd9d442f07dd81ab5470e99b83f", size = 9813849, upload-time = "2024-11-21T00:48:43.569Z" },
{ url = "https://files.pythonhosted.org/packages/a9/e6/33ab10066c9875a29d55e66ae97c3bf91b9b9b987179455d67c32261a49c/onnxruntime-1.20.1-cp310-cp310-win_amd64.whl", hash = "sha256:d3b616bb53a77a9463707bb313637223380fc327f5064c9a782e8ec69c22e6a2", size = 11329702, upload-time = "2024-11-21T00:48:46.599Z" },
{ url = "https://files.pythonhosted.org/packages/a5/da/c44bf9bd66cd6d9018a921f053f28d819445c4d84b4dd4777271b0fe52a2/onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f6243e34d74423bdd1edf0ae9596dd61023b260f546ee17d701723915f06a9f7", size = 11955227, upload-time = "2024-11-21T00:48:54.556Z" },
{ url = "https://files.pythonhosted.org/packages/11/ac/4120dfb74c8e45cce1c664fc7f7ce010edd587ba67ac41489f7432eb9381/onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5eec64c0269dcdb8d9a9a53dc4d64f87b9e0c19801d9321246a53b7eb5a7d1bc", size = 13331703, upload-time = "2024-11-21T00:48:57.97Z" },
{ url = "https://files.pythonhosted.org/packages/12/f1/cefacac137f7bb7bfba57c50c478150fcd3c54aca72762ac2c05ce0532c1/onnxruntime-1.20.1-cp311-cp311-win32.whl", hash = "sha256:a19bc6e8c70e2485a1725b3d517a2319603acc14c1f1a017dda0afe6d4665b41", size = 9813977, upload-time = "2024-11-21T00:49:00.519Z" },
{ url = "https://files.pythonhosted.org/packages/2c/2d/2d4d202c0bcfb3a4cc2b171abb9328672d7f91d7af9ea52572722c6d8d96/onnxruntime-1.20.1-cp311-cp311-win_amd64.whl", hash = "sha256:8508887eb1c5f9537a4071768723ec7c30c28eb2518a00d0adcd32c89dea3221", size = 11329895, upload-time = "2024-11-21T00:49:03.845Z" },
{ url = "https://files.pythonhosted.org/packages/c5/9d/a42a84e10f1744dd27c6f2f9280cc3fb98f869dd19b7cd042e391ee2ab61/onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1f56e898815963d6dc4ee1c35fc6c36506466eff6d16f3cb9848cea4e8c8172", size = 11952833, upload-time = "2024-11-21T00:49:10.563Z" },
{ url = "https://files.pythonhosted.org/packages/47/42/2f71f5680834688a9c81becbe5c5bb996fd33eaed5c66ae0606c3b1d6a02/onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bb71a814f66517a65628c9e4a2bb530a6edd2cd5d87ffa0af0f6f773a027d99e", size = 13333903, upload-time = "2024-11-21T00:49:12.984Z" },
{ url = "https://files.pythonhosted.org/packages/c8/f1/aabfdf91d013320aa2fc46cf43c88ca0182860ff15df872b4552254a9680/onnxruntime-1.20.1-cp312-cp312-win32.whl", hash = "sha256:bd386cc9ee5f686ee8a75ba74037750aca55183085bf1941da8efcfe12d5b120", size = 9814562, upload-time = "2024-11-21T00:49:15.453Z" },
{ url = "https://files.pythonhosted.org/packages/dd/80/76979e0b744307d488c79e41051117634b956612cc731f1028eb17ee7294/onnxruntime-1.20.1-cp312-cp312-win_amd64.whl", hash = "sha256:19c2d843eb074f385e8bbb753a40df780511061a63f9def1b216bf53860223fb", size = 11331482, upload-time = "2024-11-21T00:49:19.412Z" },
{ url = "https://files.pythonhosted.org/packages/81/0d/13bbd9489be2a6944f4a940084bfe388f1100472f38c07080a46fbd4ab96/onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fb44b08e017a648924dbe91b82d89b0c105b1adcfe31e90d1dc06b8677ad37be", size = 11951459, upload-time = "2024-11-21T00:49:26.269Z" },
{ url = "https://files.pythonhosted.org/packages/c0/ea/4454ae122874fd52bbb8a961262de81c5f932edeb1b72217f594c700d6ef/onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bda6aebdf7917c1d811f21d41633df00c58aff2bef2f598f69289c1f1dabc4b3", size = 13331620, upload-time = "2024-11-21T00:49:28.875Z" },
{ url = "https://files.pythonhosted.org/packages/d8/e0/50db43188ca1c945decaa8fc2a024c33446d31afed40149897d4f9de505f/onnxruntime-1.20.1-cp313-cp313-win_amd64.whl", hash = "sha256:d30367df7e70f1d9fc5a6a68106f5961686d39b54d3221f760085524e8d38e16", size = 11331758, upload-time = "2024-11-21T00:49:31.417Z" },
{ url = "https://files.pythonhosted.org/packages/d8/55/3821c5fd60b52a6c82a00bba18531793c93c4addfe64fbf061e235c5617a/onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9158465745423b2b5d97ed25aa7740c7d38d2993ee2e5c3bfacb0c4145c49d8", size = 11950342, upload-time = "2024-11-21T00:49:34.164Z" },
{ url = "https://files.pythonhosted.org/packages/14/56/fd990ca222cef4f9f4a9400567b9a15b220dee2eafffb16b2adbc55c8281/onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0df6f2df83d61f46e842dbcde610ede27218947c33e994545a22333491e72a3b", size = 13337040, upload-time = "2024-11-21T00:49:37.271Z" },
]
[[package]]
name = "onnxruntime"
version = "1.23.2"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version >= '3.14' and sys_platform == 'darwin'",
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
]
dependencies = [
{ name = "coloredlogs", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "flatbuffers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform == 'darwin')" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.11' and sys_platform == 'darwin')" },
{ name = "packaging", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "protobuf", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "sympy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/35/d6/311b1afea060015b56c742f3531168c1644650767f27ef40062569960587/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:a7730122afe186a784660f6ec5807138bf9d792fa1df76556b27307ea9ebcbe3", size = 17195934, upload-time = "2025-10-27T23:06:14.143Z" },
{ url = "https://files.pythonhosted.org/packages/db/db/81bf3d7cecfbfed9092b6b4052e857a769d62ed90561b410014e0aae18db/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:b28740f4ecef1738ea8f807461dd541b8287d5650b5be33bca7b474e3cbd1f36", size = 19153079, upload-time = "2025-10-27T23:05:57.686Z" },
{ url = "https://files.pythonhosted.org/packages/2e/4d/a382452b17cf70a2313153c520ea4c96ab670c996cb3a95cc5d5ac7bfdac/onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8f7d1fe034090a1e371b7f3ca9d3ccae2fabae8c1d8844fb7371d1ea38e8e8d2", size = 15219883, upload-time = "2025-10-22T03:46:21.66Z" },
{ url = "https://files.pythonhosted.org/packages/fb/56/179bf90679984c85b417664c26aae4f427cba7514bd2d65c43b181b7b08b/onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ca88747e708e5c67337b0f65eed4b7d0dd70d22ac332038c9fc4635760018f7", size = 17370357, upload-time = "2025-10-22T03:46:57.968Z" },
{ url = "https://files.pythonhosted.org/packages/44/be/467b00f09061572f022ffd17e49e49e5a7a789056bad95b54dfd3bee73ff/onnxruntime-1.23.2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:6f91d2c9b0965e86827a5ba01531d5b669770b01775b23199565d6c1f136616c", size = 17196113, upload-time = "2025-10-22T03:47:33.526Z" },
{ url = "https://files.pythonhosted.org/packages/9f/a8/3c23a8f75f93122d2b3410bfb74d06d0f8da4ac663185f91866b03f7da1b/onnxruntime-1.23.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:87d8b6eaf0fbeb6835a60a4265fde7a3b60157cf1b2764773ac47237b4d48612", size = 19153857, upload-time = "2025-10-22T03:46:37.578Z" },
{ url = "https://files.pythonhosted.org/packages/3f/d8/506eed9af03d86f8db4880a4c47cd0dffee973ef7e4f4cff9f1d4bcf7d22/onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bbfd2fca76c855317568c1b36a885ddea2272c13cb0e395002c402f2360429a6", size = 15220095, upload-time = "2025-10-22T03:46:24.769Z" },
{ url = "https://files.pythonhosted.org/packages/e9/80/113381ba832d5e777accedc6cb41d10f9eca82321ae31ebb6bcede530cea/onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da44b99206e77734c5819aa2142c69e64f3b46edc3bd314f6a45a932defc0b3e", size = 17372080, upload-time = "2025-10-22T03:47:00.265Z" },
{ url = "https://files.pythonhosted.org/packages/1b/9e/f748cd64161213adeef83d0cb16cb8ace1e62fa501033acdd9f9341fff57/onnxruntime-1.23.2-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:b8f029a6b98d3cf5be564d52802bb50a8489ab73409fa9db0bf583eabb7c2321", size = 17195929, upload-time = "2025-10-22T03:47:36.24Z" },
{ url = "https://files.pythonhosted.org/packages/91/9d/a81aafd899b900101988ead7fb14974c8a58695338ab6a0f3d6b0100f30b/onnxruntime-1.23.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:218295a8acae83905f6f1aed8cacb8e3eb3bd7513a13fe4ba3b2664a19fc4a6b", size = 19157705, upload-time = "2025-10-22T03:46:40.415Z" },
{ url = "https://files.pythonhosted.org/packages/3c/35/4e40f2fba272a6698d62be2cd21ddc3675edfc1a4b9ddefcc4648f115315/onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76ff670550dc23e58ea9bc53b5149b99a44e63b34b524f7b8547469aaa0dcb8c", size = 15226915, upload-time = "2025-10-22T03:46:27.773Z" },
{ url = "https://files.pythonhosted.org/packages/ef/88/9cc25d2bafe6bc0d4d3c1db3ade98196d5b355c0b273e6a5dc09c5d5d0d5/onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f9b4ae77f8e3c9bee50c27bc1beede83f786fe1d52e99ac85aa8d65a01e9b77", size = 17382649, upload-time = "2025-10-22T03:47:02.782Z" },
{ url = "https://files.pythonhosted.org/packages/3d/41/fba0cabccecefe4a1b5fc8020c44febb334637f133acefc7ec492029dd2c/onnxruntime-1.23.2-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:2ff531ad8496281b4297f32b83b01cdd719617e2351ffe0dba5684fb283afa1f", size = 17196337, upload-time = "2025-10-22T03:46:35.168Z" },
{ url = "https://files.pythonhosted.org/packages/fe/f9/2d49ca491c6a986acce9f1d1d5fc2099108958cc1710c28e89a032c9cfe9/onnxruntime-1.23.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:162f4ca894ec3de1a6fd53589e511e06ecdc3ff646849b62a9da7489dee9ce95", size = 19157691, upload-time = "2025-10-22T03:46:43.518Z" },
{ url = "https://files.pythonhosted.org/packages/1c/a1/428ee29c6eaf09a6f6be56f836213f104618fb35ac6cc586ff0f477263eb/onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45d127d6e1e9b99d1ebeae9bcd8f98617a812f53f46699eafeb976275744826b", size = 15226898, upload-time = "2025-10-22T03:46:30.039Z" },
{ url = "https://files.pythonhosted.org/packages/f2/2b/b57c8a2466a3126dbe0a792f56ad7290949b02f47b86216cd47d857e4b77/onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8bace4e0d46480fbeeb7bbe1ffe1f080e6663a42d1086ff95c1551f2d39e7872", size = 17382518, upload-time = "2025-10-22T03:47:05.407Z" },
{ url = "https://files.pythonhosted.org/packages/7c/3d/6830fa61c69ca8e905f237001dbfc01689a4e4ab06147020a4518318881f/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9d2385e774f46ac38f02b3a91a91e30263d41b2f1f4f26ae34805b2a9ddef466", size = 15229610, upload-time = "2025-10-22T03:46:32.239Z" },
{ url = "https://files.pythonhosted.org/packages/b6/ca/862b1e7a639460f0ca25fd5b6135fb42cf9deea86d398a92e44dfda2279d/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e2b9233c4947907fd1818d0e581c049c41ccc39b2856cc942ff6d26317cee145", size = 17394184, upload-time = "2025-10-22T03:47:08.127Z" },
]
[[package]]
name = "openai"
version = "2.7.1"
@@ -1733,6 +1989,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/86/8a/69176a64335aed183529207ba8bc3d329c2999d852b4f3818027203f50e6/opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca", size = 39402386, upload-time = "2025-01-16T13:52:56.418Z" }, { url = "https://files.pythonhosted.org/packages/86/8a/69176a64335aed183529207ba8bc3d329c2999d852b4f3818027203f50e6/opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca", size = 39402386, upload-time = "2025-01-16T13:52:56.418Z" },
] ]
[[package]]
name = "openpyxl"
version = "3.1.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "et-xmlfile" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" },
]
[[package]]
name = "opt-einsum"
version = "3.3.0"
@@ -1821,6 +2089,68 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/a3/6f/042d73453ad01a2e3dd4adeae4870e25679d9eaa340d70bd54cd409cb982/paddlepaddle-3.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:d16fd322246c4a580ca512971840ac8c16126a77d59a7bceee165d8f2196a6b2", size = 101708504, upload-time = "2025-10-30T13:21:18.125Z" }, { url = "https://files.pythonhosted.org/packages/a3/6f/042d73453ad01a2e3dd4adeae4870e25679d9eaa340d70bd54cd409cb982/paddlepaddle-3.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:d16fd322246c4a580ca512971840ac8c16126a77d59a7bceee165d8f2196a6b2", size = 101708504, upload-time = "2025-10-30T13:21:18.125Z" },
] ]
[[package]]
name = "pandas"
version = "2.3.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "python-dateutil" },
{ name = "pytz" },
{ name = "tzdata" },
]
sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/3d/f7/f425a00df4fcc22b292c6895c6831c0c8ae1d9fac1e024d16f98a9ce8749/pandas-2.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c", size = 11555763, upload-time = "2025-09-29T23:16:53.287Z" },
{ url = "https://files.pythonhosted.org/packages/13/4f/66d99628ff8ce7857aca52fed8f0066ce209f96be2fede6cef9f84e8d04f/pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a", size = 10801217, upload-time = "2025-09-29T23:17:04.522Z" },
{ url = "https://files.pythonhosted.org/packages/1d/03/3fc4a529a7710f890a239cc496fc6d50ad4a0995657dccc1d64695adb9f4/pandas-2.3.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5caf26f64126b6c7aec964f74266f435afef1c1b13da3b0636c7518a1fa3e2b1", size = 12148791, upload-time = "2025-09-29T23:17:18.444Z" },
{ url = "https://files.pythonhosted.org/packages/40/a8/4dac1f8f8235e5d25b9955d02ff6f29396191d4e665d71122c3722ca83c5/pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd7478f1463441ae4ca7308a70e90b33470fa593429f9d4c578dd00d1fa78838", size = 12769373, upload-time = "2025-09-29T23:17:35.846Z" },
{ url = "https://files.pythonhosted.org/packages/df/91/82cc5169b6b25440a7fc0ef3a694582418d875c8e3ebf796a6d6470aa578/pandas-2.3.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4793891684806ae50d1288c9bae9330293ab4e083ccd1c5e383c34549c6e4250", size = 13200444, upload-time = "2025-09-29T23:17:49.341Z" },
{ url = "https://files.pythonhosted.org/packages/10/ae/89b3283800ab58f7af2952704078555fa60c807fff764395bb57ea0b0dbd/pandas-2.3.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:28083c648d9a99a5dd035ec125d42439c6c1c525098c58af0fc38dd1a7a1b3d4", size = 13858459, upload-time = "2025-09-29T23:18:03.722Z" },
{ url = "https://files.pythonhosted.org/packages/85/72/530900610650f54a35a19476eca5104f38555afccda1aa11a92ee14cb21d/pandas-2.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:503cf027cf9940d2ceaa1a93cfb5f8c8c7e6e90720a2850378f0b3f3b1e06826", size = 11346086, upload-time = "2025-09-29T23:18:18.505Z" },
{ url = "https://files.pythonhosted.org/packages/c1/fa/7ac648108144a095b4fb6aa3de1954689f7af60a14cf25583f4960ecb878/pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523", size = 11578790, upload-time = "2025-09-29T23:18:30.065Z" },
{ url = "https://files.pythonhosted.org/packages/9b/35/74442388c6cf008882d4d4bdfc4109be87e9b8b7ccd097ad1e7f006e2e95/pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45", size = 10833831, upload-time = "2025-09-29T23:38:56.071Z" },
{ url = "https://files.pythonhosted.org/packages/fe/e4/de154cbfeee13383ad58d23017da99390b91d73f8c11856f2095e813201b/pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66", size = 12199267, upload-time = "2025-09-29T23:18:41.627Z" },
{ url = "https://files.pythonhosted.org/packages/bf/c9/63f8d545568d9ab91476b1818b4741f521646cbdd151c6efebf40d6de6f7/pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b", size = 12789281, upload-time = "2025-09-29T23:18:56.834Z" },
{ url = "https://files.pythonhosted.org/packages/f2/00/a5ac8c7a0e67fd1a6059e40aa08fa1c52cc00709077d2300e210c3ce0322/pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791", size = 13240453, upload-time = "2025-09-29T23:19:09.247Z" },
{ url = "https://files.pythonhosted.org/packages/27/4d/5c23a5bc7bd209231618dd9e606ce076272c9bc4f12023a70e03a86b4067/pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151", size = 13890361, upload-time = "2025-09-29T23:19:25.342Z" },
{ url = "https://files.pythonhosted.org/packages/8e/59/712db1d7040520de7a4965df15b774348980e6df45c129b8c64d0dbe74ef/pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c", size = 11348702, upload-time = "2025-09-29T23:19:38.296Z" },
{ url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" },
{ url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" },
{ url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" },
{ url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" },
{ url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" },
{ url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" },
{ url = "https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" },
{ url = "https://files.pythonhosted.org/packages/cd/4b/18b035ee18f97c1040d94debd8f2e737000ad70ccc8f5513f4eefad75f4b/pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713", size = 11544671, upload-time = "2025-09-29T23:21:05.024Z" },
{ url = "https://files.pythonhosted.org/packages/31/94/72fac03573102779920099bcac1c3b05975c2cb5f01eac609faf34bed1ca/pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8", size = 10680807, upload-time = "2025-09-29T23:21:15.979Z" },
{ url = "https://files.pythonhosted.org/packages/16/87/9472cf4a487d848476865321de18cc8c920b8cab98453ab79dbbc98db63a/pandas-2.3.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d", size = 11709872, upload-time = "2025-09-29T23:21:27.165Z" },
{ url = "https://files.pythonhosted.org/packages/15/07/284f757f63f8a8d69ed4472bfd85122bd086e637bf4ed09de572d575a693/pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac", size = 12306371, upload-time = "2025-09-29T23:21:40.532Z" },
{ url = "https://files.pythonhosted.org/packages/33/81/a3afc88fca4aa925804a27d2676d22dcd2031c2ebe08aabd0ae55b9ff282/pandas-2.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c", size = 12765333, upload-time = "2025-09-29T23:21:55.77Z" },
{ url = "https://files.pythonhosted.org/packages/8d/0f/b4d4ae743a83742f1153464cf1a8ecfafc3ac59722a0b5c8602310cb7158/pandas-2.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493", size = 13418120, upload-time = "2025-09-29T23:22:10.109Z" },
{ url = "https://files.pythonhosted.org/packages/4f/c7/e54682c96a895d0c808453269e0b5928a07a127a15704fedb643e9b0a4c8/pandas-2.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee", size = 10993991, upload-time = "2025-09-29T23:25:04.889Z" },
{ url = "https://files.pythonhosted.org/packages/f9/ca/3f8d4f49740799189e1395812f3bf23b5e8fc7c190827d55a610da72ce55/pandas-2.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5", size = 12048227, upload-time = "2025-09-29T23:22:24.343Z" },
{ url = "https://files.pythonhosted.org/packages/0e/5a/f43efec3e8c0cc92c4663ccad372dbdff72b60bdb56b2749f04aa1d07d7e/pandas-2.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21", size = 11411056, upload-time = "2025-09-29T23:22:37.762Z" },
{ url = "https://files.pythonhosted.org/packages/46/b1/85331edfc591208c9d1a63a06baa67b21d332e63b7a591a5ba42a10bb507/pandas-2.3.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78", size = 11645189, upload-time = "2025-09-29T23:22:51.688Z" },
{ url = "https://files.pythonhosted.org/packages/44/23/78d645adc35d94d1ac4f2a3c4112ab6f5b8999f4898b8cdf01252f8df4a9/pandas-2.3.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110", size = 12121912, upload-time = "2025-09-29T23:23:05.042Z" },
{ url = "https://files.pythonhosted.org/packages/53/da/d10013df5e6aaef6b425aa0c32e1fc1f3e431e4bcabd420517dceadce354/pandas-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86", size = 12712160, upload-time = "2025-09-29T23:23:28.57Z" },
{ url = "https://files.pythonhosted.org/packages/bd/17/e756653095a083d8a37cbd816cb87148debcfcd920129b25f99dd8d04271/pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc", size = 13199233, upload-time = "2025-09-29T23:24:24.876Z" },
{ url = "https://files.pythonhosted.org/packages/04/fd/74903979833db8390b73b3a8a7d30d146d710bd32703724dd9083950386f/pandas-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0", size = 11540635, upload-time = "2025-09-29T23:25:52.486Z" },
{ url = "https://files.pythonhosted.org/packages/21/00/266d6b357ad5e6d3ad55093a7e8efc7dd245f5a842b584db9f30b0f0a287/pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593", size = 10759079, upload-time = "2025-09-29T23:26:33.204Z" },
{ url = "https://files.pythonhosted.org/packages/ca/05/d01ef80a7a3a12b2f8bbf16daba1e17c98a2f039cbc8e2f77a2c5a63d382/pandas-2.3.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c", size = 11814049, upload-time = "2025-09-29T23:27:15.384Z" },
{ url = "https://files.pythonhosted.org/packages/15/b2/0e62f78c0c5ba7e3d2c5945a82456f4fac76c480940f805e0b97fcbc2f65/pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b", size = 12332638, upload-time = "2025-09-29T23:27:51.625Z" },
{ url = "https://files.pythonhosted.org/packages/c5/33/dd70400631b62b9b29c3c93d2feee1d0964dc2bae2e5ad7a6c73a7f25325/pandas-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6", size = 12886834, upload-time = "2025-09-29T23:28:21.289Z" },
{ url = "https://files.pythonhosted.org/packages/d3/18/b5d48f55821228d0d2692b34fd5034bb185e854bdb592e9c640f6290e012/pandas-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3", size = 13409925, upload-time = "2025-09-29T23:28:58.261Z" },
{ url = "https://files.pythonhosted.org/packages/a6/3d/124ac75fcd0ecc09b8fdccb0246ef65e35b012030defb0e0eba2cbbbe948/pandas-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5", size = 11109071, upload-time = "2025-09-29T23:32:27.484Z" },
{ url = "https://files.pythonhosted.org/packages/89/9c/0e21c895c38a157e0faa1fb64587a9226d6dd46452cac4532d80c3c4a244/pandas-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec", size = 12048504, upload-time = "2025-09-29T23:29:31.47Z" },
{ url = "https://files.pythonhosted.org/packages/d7/82/b69a1c95df796858777b68fbe6a81d37443a33319761d7c652ce77797475/pandas-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7", size = 11410702, upload-time = "2025-09-29T23:29:54.591Z" },
{ url = "https://files.pythonhosted.org/packages/f9/88/702bde3ba0a94b8c73a0181e05144b10f13f29ebfc2150c3a79062a8195d/pandas-2.3.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450", size = 11634535, upload-time = "2025-09-29T23:30:21.003Z" },
{ url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" },
{ url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" },
{ url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" },
]
[[package]]
name = "pdfminer-six"
version = "20250506"
@@ -2266,6 +2596,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ae/43/2b0607ef7f16d63fbe00de728151a090397ef5b3b9147b4aefe975d17106/pypdfium2-5.0.0-py3-none-win_arm64.whl", hash = "sha256:0a2a473fe95802e7a5f4140f25e5cd036cf17f060f27ee2d28c3977206add763", size = 2939015, upload-time = "2025-10-26T13:31:40.531Z" }, { url = "https://files.pythonhosted.org/packages/ae/43/2b0607ef7f16d63fbe00de728151a090397ef5b3b9147b4aefe975d17106/pypdfium2-5.0.0-py3-none-win_arm64.whl", hash = "sha256:0a2a473fe95802e7a5f4140f25e5cd036cf17f060f27ee2d28c3977206add763", size = 2939015, upload-time = "2025-10-26T13:31:40.531Z" },
] ]
[[package]]
name = "pyreadline3"
version = "3.5.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/0f/49/4cea918a08f02817aabae639e3d0ac046fef9f9180518a3ad394e22da148/pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7", size = 99839, upload-time = "2024-09-19T02:40:10.062Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5a/dc/491b7661614ab97483abf2056be1deee4dc2490ecbf7bff9ab5cdbac86e1/pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6", size = 83178, upload-time = "2024-09-19T02:40:08.598Z" },
]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@@ -2291,6 +2630,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7", size = 252987, upload-time = "2025-06-16T20:46:22.506Z" }, { url = "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7", size = 252987, upload-time = "2025-06-16T20:46:22.506Z" },
] ]
[[package]]
name = "python-dotenv"
version = "1.2.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" },
]
[[package]]
name = "python-pptx"
version = "1.0.2"
@@ -2306,6 +2654,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" }, { url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" },
] ]
[[package]]
name = "pytz"
version = "2025.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" },
]
[[package]]
name = "pyyaml"
version = "6.0.3"
@@ -2654,7 +3011,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -2717,14 +3075,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'", "python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'", "python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'", "python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
] ]
dependencies = [ dependencies = [
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -3083,6 +3445,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/97/45/1097a0eb3ad2a1cb5a29d493aefffe4170e454b2726cfe2a76f6652e91af/stringzilla-4.2.3-cp313-cp313-win_arm64.whl", hash = "sha256:1000a8df547fb3b194def48cc3ed6494715fe1c8ac25c6444ed3cfb0b52942c7", size = 99815, upload-time = "2025-10-27T18:35:56.555Z" }, { url = "https://files.pythonhosted.org/packages/97/45/1097a0eb3ad2a1cb5a29d493aefffe4170e454b2726cfe2a76f6652e91af/stringzilla-4.2.3-cp313-cp313-win_arm64.whl", hash = "sha256:1000a8df547fb3b194def48cc3ed6494715fe1c8ac25c6444ed3cfb0b52942c7", size = 99815, upload-time = "2025-10-27T18:35:56.555Z" },
] ]
[[package]]
name = "sympy"
version = "1.14.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "mpmath" },
]
sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" },
]
[[package]]
name = "termcolor"
version = "3.2.0"
@@ -3116,7 +3490,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -3135,14 +3510,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'", "python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'", "python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'", "python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -3185,6 +3564,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
] ]
[[package]]
name = "tzdata"
version = "2025.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
]
[[package]]
name = "unidic-lite"
version = "1.0.8"