diff --git a/.gitignore b/.gitignore
index 8de2b3c..5c4420c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,17 +24,14 @@ node_modules/
tmp/
temp/
-# Docker compose file (local settings)
-# docker-compose.yml
-
WeKnora
/models/
-**/__pycache__
test/data/mswag.txt
data/files/
-.python-version
.venv/
+**/__pycache__
+.python-version
### macOS
# General
diff --git a/docker-compose.yml b/docker-compose.yml
index 0713397..4c210b3 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -127,6 +127,7 @@ services:
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME:-}
- MINIO_USE_SSL=${MINIO_USE_SSL:-}
- WEB_PROXY=${WEB_PROXY:-}
+ - MINERU_ENDPOINT=${MINERU_ENDPOINT:-}
healthcheck:
test: ["CMD", "grpc_health_probe", "-addr=:50051"]
interval: 30s
diff --git a/docker/Dockerfile.docreader b/docker/Dockerfile.docreader
index 67d413f..f73476f 100644
--- a/docker/Dockerfile.docreader
+++ b/docker/Dockerfile.docreader
@@ -80,10 +80,11 @@ RUN pip install uv --break-system-packages && \
python -m uv sync --locked --no-dev
# Copy source code and generation scripts
-COPY docreader .
+COPY docreader docreader
# Generate protobuf code
-RUN chmod +x scripts/generate_proto.sh && bash scripts/generate_proto.sh
+RUN chmod +x docreader/scripts/generate_proto.sh && \
+ bash docreader/scripts/generate_proto.sh
# Ensure the model directory exists
RUN ls -la /root/.paddleocr/whl/
@@ -150,10 +151,11 @@ RUN python -m playwright install-deps webkit
# COPY docreader/scripts/download_deps.py download_deps.py
# RUN python -m download_deps
-COPY --from=builder /app/ ./
+COPY docreader/pyproject.toml docreader/uv.lock ./
+COPY --from=builder /app/docreader docreader
# Expose gRPC port
EXPOSE 50051
# Run the Python service directly (logs go to stdout/stderr)
-CMD ["uv", "run", "main.py"]
\ No newline at end of file
+CMD ["uv", "run", "-m", "docreader.main"]
\ No newline at end of file
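
With the sources copied into a docreader/ package and the lockfiles kept alongside it, the service is now started as a module rather than a script. A rough Python equivalent of the new CMD, for local debugging only (the runpy usage here is illustrative, not part of the image):

```python
# Approximates `uv run -m docreader.main` once the `docreader` package is importable.
import runpy

runpy.run_module("docreader.main", run_name="__main__")
```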
diff --git a/docreader/.pylintrc b/docreader/.pylintrc
new file mode 100644
index 0000000..0f446b4
--- /dev/null
+++ b/docreader/.pylintrc
@@ -0,0 +1,5 @@
+[LOGGING]
+logging-format-style=fstr
+
+[MESSAGES CONTROL]
+; disable=W1203
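
For context on the new lint settings: W1203 is pylint's logging-fstring-interpolation check, which would otherwise flag the f-string logging used throughout this package. A minimal sketch of the two styles (illustrative, not part of the diff):

```python
import logging

logger = logging.getLogger(__name__)
file_name = "example.docx"  # hypothetical value

logger.info(f"Parsing {file_name}")   # f-string interpolation, normally flagged by W1203
logger.info("Parsing %s", file_name)  # lazy %-style formatting, never flagged
```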
diff --git a/docreader/main.py b/docreader/main.py
index bba5256..1a0e2e7 100644
--- a/docreader/main.py
+++ b/docreader/main.py
@@ -1,37 +1,25 @@
-import os
-import sys
import logging
-from concurrent import futures
+import os
+import re
+import sys
import traceback
-import grpc
import uuid
-import atexit
+from concurrent import futures
+from typing import Optional
+
+import grpc
from grpc_health.v1 import health_pb2_grpc
from grpc_health.v1.health import HealthServicer
-# Add parent directory to Python path
-current_dir = os.path.dirname(os.path.abspath(__file__))
-parent_dir = os.path.dirname(current_dir)
-if parent_dir not in sys.path:
- sys.path.insert(0, parent_dir)
+from docreader.models.read_config import ChunkingConfig
+from docreader.parser import Parser
+from docreader.parser.ocr_engine import OCREngine
+from docreader.proto import docreader_pb2_grpc
+from docreader.proto.docreader_pb2 import Chunk, Image, ReadResponse
+from docreader.utils.request import init_logging_request_id, request_id_context
-from proto.docreader_pb2 import ReadResponse, Chunk, Image
-from proto import docreader_pb2_grpc
-from parser import Parser, OCREngine
-from parser.config import ChunkingConfig
-from utils.request import request_id_context, init_logging_request_id
-
-# --- Encoding utilities: sanitize strings to valid UTF-8 and (optionally) multi-encoding read ---
-import re
-from typing import Optional
-
-try:
- # Optional dependency for charset detection; install via `pip install charset-normalizer`
- from charset_normalizer import from_bytes as _cn_from_bytes # type: ignore
-except Exception: # pragma: no cover
- _cn_from_bytes = None # type: ignore
-
-# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values and cannot be encoded to UTF-8
+# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values
+# and cannot be encoded to UTF-8
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]")
@@ -47,29 +35,6 @@ def to_valid_utf8_text(s: Optional[str]) -> str:
return s.encode("utf-8", errors="replace").decode("utf-8")
-def read_text_with_fallback(file_path: str) -> str:
- """Read text from file supporting multiple encodings with graceful fallback.
-
- This server currently receives bytes over gRPC and delegates decoding to the parser.
- This helper is provided for future local-file reads if needed.
- """
- with open(file_path, "rb") as f:
- raw = f.read()
- if _cn_from_bytes is not None:
- try:
- result = _cn_from_bytes(raw).best()
- if result:
- return str(result)
- except Exception:
- pass
- for enc in ("utf-8", "gb18030", "latin-1"):
- try:
- return raw.decode(enc, errors="replace")
- except UnicodeDecodeError:
- continue
- return raw.decode("utf-8", errors="replace")
-
-
# Ensure no existing handlers
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)
@@ -113,7 +78,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
request.file_type or os.path.splitext(request.file_name)[1][1:]
)
logger.info(
- f"Received ReadFromFile request for file: {request.file_name}, type: {file_type}"
+ f"ReadFromFile for file: {request.file_name}, type: {file_type}"
)
logger.info(f"File content size: {len(request.file_content)} bytes")
@@ -124,8 +89,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
enable_multimodal = request.read_config.enable_multimodal or False
logger.info(
- f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
- f"multimodal={enable_multimodal}"
+ f"Using chunking config: size={chunk_size}, "
+ f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
)
# Get Storage and VLM config from request
@@ -144,7 +109,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
"path_prefix": sc.path_prefix,
}
logger.info(
- f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
+ f"Using Storage config: provider={storage_config.get('provider')}, "
+ f"bucket={storage_config['bucket_name']}"
)
vlm_config = {
@@ -170,7 +136,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
)
# Parse file
- logger.info(f"Starting file parsing process")
+ logger.info("Starting file parsing process")
result = self.parser.parse_file(
request.file_name, file_type, request.file_content, chunking_config
)
@@ -184,7 +150,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
# Convert to protobuf message
logger.info(
- f"Successfully parsed file {request.file_name}, returning {len(result.chunks)} chunks"
+        f"Parsed file {request.file_name} with {len(result.chunks)} chunks"
)
# Build response, including image info
@@ -224,8 +190,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
enable_multimodal = request.read_config.enable_multimodal or False
logger.info(
- f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
- f"multimodal={enable_multimodal}"
+ f"Using chunking config: size={chunk_size}, "
+ f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
)
# Get Storage and VLM config from request
@@ -243,7 +209,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
"path_prefix": sc.path_prefix,
}
logger.info(
- f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
+ f"Using Storage config: provider={storage_config.get('provider')}, "
+ f"bucket={storage_config['bucket_name']}"
)
vlm_config = {
@@ -269,7 +236,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
)
# Parse URL
- logger.info(f"Starting URL parsing process")
+ logger.info("Starting URL parsing process")
result = self.parser.parse_url(
request.url, request.title, chunking_config
)
@@ -282,7 +249,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
# Convert to protobuf message, including image info
logger.info(
- f"Successfully parsed URL {request.url}, returning {len(result.chunks)} chunks"
+ f"Parsed URL {request.url}, returning {len(result.chunks)} chunks"
)
response = ReadResponse(
@@ -335,29 +302,15 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
return proto_chunk
-def init_ocr_engine(ocr_backend, ocr_config):
+def init_ocr_engine(ocr_backend: Optional[str] = None, **kwargs):
"""Initialize OCR engine"""
- try:
- logger.info(f"Initializing OCR engine with backend: {ocr_backend}")
- ocr_engine = OCREngine.get_instance(backend_type=ocr_backend, **ocr_config)
- if ocr_engine:
- logger.info("OCR engine initialized successfully")
- return True
- else:
- logger.error("OCR engine initialization failed")
- return False
- except Exception as e:
- logger.error(f"Error initializing OCR engine: {str(e)}")
- return False
+ backend_type = ocr_backend or os.getenv("OCR_BACKEND", "paddle")
+ logger.info(f"Initializing OCR engine with backend: {backend_type}")
+ OCREngine.get_instance(backend_type=backend_type, **kwargs)
def main():
- init_ocr_engine(
- os.getenv("OCR_BACKEND", "paddle"),
- {
- "OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
- },
- )
+ init_ocr_engine()
# Set max number of worker threads
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
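
A minimal sketch (not part of the diff) of the simplified OCR bootstrap: the backend now falls back to the OCR_BACKEND environment variable, and any extra engine options are forwarded as keyword arguments. The OCR_API_BASE_URL keyword mirrors the removed config dict; the endpoint value is a placeholder.

```python
import os

from docreader.parser.ocr_engine import OCREngine

backend = os.getenv("OCR_BACKEND", "paddle")  # same default init_ocr_engine() uses
engine = OCREngine.get_instance(backend_type=backend)

# Remote backends can still receive their options as keyword arguments, e.g.:
# OCREngine.get_instance(backend_type=backend, OCR_API_BASE_URL="http://ocr:8080")
```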
diff --git a/docreader/models/__init__.py b/docreader/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docreader/models/document.py b/docreader/models/document.py
new file mode 100644
index 0000000..1ab8c46
--- /dev/null
+++ b/docreader/models/document.py
@@ -0,0 +1,87 @@
+"""Chunk document schema."""
+
+import json
+from typing import Any, Dict, List
+
+from pydantic import BaseModel, Field
+
+
+class Chunk(BaseModel):
+ """Document Chunk including chunk content, chunk metadata."""
+
+ content: str = Field(default="", description="chunk text content")
+ seq: int = Field(default=0, description="Chunk sequence number")
+ start: int = Field(default=0, description="Chunk start position")
+ end: int = Field(description="Chunk end position")
+ images: List[Dict[str, Any]] = Field(
+ default_factory=list, description="Images in the chunk"
+ )
+
+ metadata: Dict[str, Any] = Field(
+ default_factory=dict,
+ description="metadata fields",
+ )
+
+ def to_dict(self, **kwargs: Any) -> Dict[str, Any]:
+ """Convert Chunk to dict."""
+
+ data = self.model_dump()
+ data.update(kwargs)
+ data["class_name"] = self.__class__.__name__
+ return data
+
+ def to_json(self, **kwargs: Any) -> str:
+ """Convert Chunk to json."""
+ data = self.to_dict(**kwargs)
+ return json.dumps(data)
+
+ def __hash__(self):
+ """Hash function."""
+ return hash((self.content,))
+
+ def __eq__(self, other):
+ """Equal function."""
+ return self.content == other.content
+
+ @classmethod
+ def from_dict(cls, data: Dict[str, Any], **kwargs: Any): # type: ignore
+ """Create Chunk from dict."""
+ if isinstance(kwargs, dict):
+ data.update(kwargs)
+
+ data.pop("class_name", None)
+ return cls(**data)
+
+ @classmethod
+ def from_json(cls, data_str: str, **kwargs: Any): # type: ignore
+ """Create Chunk from json."""
+ data = json.loads(data_str)
+ return cls.from_dict(data, **kwargs)
+
+
+class Document(BaseModel):
+ """Document including document content, document metadata."""
+
+ model_config = {"arbitrary_types_allowed": True}
+
+ content: str = Field(default="", description="document text content")
+ images: Dict[str, str] = Field(
+ default_factory=dict, description="Images in the document"
+ )
+
+ chunks: List[Chunk] = Field(default_factory=list, description="document chunks")
+ metadata: Dict[str, Any] = Field(
+ default_factory=dict,
+ description="metadata fields",
+ )
+
+ def set_content(self, content: str) -> None:
+ """Set document content."""
+ self.content = content
+
+ def get_content(self) -> str:
+ """Get document content."""
+ return self.content
+
+ def is_valid(self) -> bool:
+ return self.content != ""
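
A small usage sketch for the new pydantic models (field values are illustrative):

```python
from docreader.models.document import Chunk, Document

chunk = Chunk(content="Hello world", seq=0, start=0, end=11)

# JSON round-trip through the helpers defined above; equality is content-based.
restored = Chunk.from_json(chunk.to_json())
assert restored == chunk

# A Document is considered valid as soon as it carries non-empty content.
doc = Document(content="Hello world", chunks=[chunk])
assert doc.is_valid()
```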
diff --git a/docreader/models/read_config.py b/docreader/models/read_config.py
new file mode 100644
index 0000000..c2c95d8
--- /dev/null
+++ b/docreader/models/read_config.py
@@ -0,0 +1,27 @@
+from dataclasses import dataclass, field
+
+
+@dataclass
+class ChunkingConfig:
+ """
+ Configuration for text chunking process.
+ Controls how documents are split into smaller pieces for processing.
+ """
+
+ # Maximum size of each chunk in tokens/chars
+ chunk_size: int = 512
+
+ # Number of tokens/chars to overlap between chunks
+ chunk_overlap: int = 50
+
+ # Text separators in order of priority
+ separators: list = field(default_factory=lambda: ["\n\n", "\n", "。"])
+
+ # Whether to enable multimodal processing (text + images)
+ enable_multimodal: bool = False
+
+ # Preferred field name going forward
+ storage_config: dict[str, str] = field(default_factory=dict)
+
+ # VLM configuration for image captioning
+ vlm_config: dict[str, str] = field(default_factory=dict)
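
An illustrative construction of the relocated ChunkingConfig; the dict keys mirror those read in main.py and caption.py, and every value below is a placeholder:

```python
from docreader.models.read_config import ChunkingConfig

config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=50,
    enable_multimodal=True,
    storage_config={"provider": "minio", "bucket_name": "weknora", "path_prefix": "images"},
    vlm_config={"base_url": "http://localhost:8000/v1", "model_name": "qwen-vl"},
)
```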
diff --git a/docreader/parser/__init__.py b/docreader/parser/__init__.py
index c60a018..085b09f 100644
--- a/docreader/parser/__init__.py
+++ b/docreader/parser/__init__.py
@@ -13,22 +13,18 @@ The parsers extract content from documents and can split them into
meaningful chunks for further processing and indexing.
"""
-from .base_parser import BaseParser, ParseResult
-from .docx_parser import DocxParser
from .doc_parser import DocParser
-from .pdf_parser import PDFParser
-from .markdown_parser import MarkdownParser
-from .text_parser import TextParser
+from .docx2_parser import Docx2Parser
from .image_parser import ImageParser
-from .web_parser import WebParser
+from .markdown_parser import MarkdownParser
from .parser import Parser
-from .config import ChunkingConfig
-from .ocr_engine import OCREngine
+from .pdf_parser import PDFParser
+from .text_parser import TextParser
+from .web_parser import WebParser
# Export public classes and modules
__all__ = [
- "BaseParser", # Base parser class that all format parsers inherit from
- "DocxParser", # Parser for .docx files (modern Word documents)
+ "Docx2Parser", # Parser for .docx files (modern Word documents)
"DocParser", # Parser for .doc files (legacy Word documents)
"PDFParser", # Parser for PDF documents
"MarkdownParser", # Parser for Markdown text files
@@ -36,7 +32,4 @@ __all__ = [
"ImageParser", # Parser for images with text content
"WebParser", # Parser for web pages
"Parser", # Main parser factory that selects the appropriate parser
- "ChunkingConfig", # Configuration for text chunking behavior
- "ParseResult", # Standard result format returned by all parsers
- "OCREngine", # OCR engine for extracting text from images
]
diff --git a/docreader/parser/base_parser.py b/docreader/parser/base_parser.py
index 052bc82..8dab374 100644
--- a/docreader/parser/base_parser.py
+++ b/docreader/parser/base_parser.py
@@ -1,65 +1,28 @@
# -*- coding: utf-8 -*-
-import re
-import os
import asyncio
-from typing import List, Dict, Any, Optional, Tuple, Union
-from abc import ABC, abstractmethod
-from dataclasses import dataclass, field
-import logging
-import sys
-import traceback
-import numpy as np
-import time
import io
-import json
-from .ocr_engine import OCREngine
-from .image_utils import image_to_base64
-from .config import ChunkingConfig
-from .storage import create_storage
+import logging
+import os
+import re
+import time
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional, Tuple
+
+import requests
from PIL import Image
-# Add parent directory to Python path for src imports
-current_dir = os.path.dirname(os.path.abspath(__file__))
-parent_dir = os.path.dirname(current_dir)
-if parent_dir not in sys.path:
- sys.path.insert(0, parent_dir)
-
-try:
- from services.docreader.src.parser.caption import Caption
-except ImportError:
- # Fallback: try relative import
- try:
- from .caption import Caption
- except ImportError:
- # If both imports fail, set to None
- Caption = None
- logging.warning(
- "Failed to import Caption, image captioning will be unavailable"
- )
+from docreader.models.document import Chunk, Document
+from docreader.models.read_config import ChunkingConfig
+from docreader.parser.caption import Caption
+from docreader.parser.ocr_engine import OCREngine
+from docreader.parser.storage import create_storage
+from docreader.splitter.splitter import TextSplitter
+from docreader.utils import endecode
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
-@dataclass
-class Chunk:
- """Chunk result"""
-
- content: str # Chunk content
- seq: int # Chunk sequence number
- start: int # Chunk start position
- end: int # Chunk end position
- images: List[Dict[str, Any]] = field(default_factory=list) # Images in the chunk
-
-
-@dataclass
-class ParseResult:
- """Parse result"""
-
- text: str # Extracted text content
- chunks: Optional[List[Chunk]] = None # Chunk results
-
-
class BaseParser(ABC):
"""Base parser interface"""
@@ -97,17 +60,17 @@ class BaseParser(ABC):
def __init__(
self,
file_name: str = "",
- file_type: str = None,
+ file_type: Optional[str] = None,
enable_multimodal: bool = True,
chunk_size: int = 1000,
chunk_overlap: int = 200,
- separators: list = ["\n\n", "\n", "。"],
+ separators: list[str] = ["\n\n", "\n", "。"],
ocr_backend: str = "paddle",
- ocr_config: dict = None,
+ ocr_config: dict = {},
max_image_size: int = 1920, # Maximum image size
max_concurrent_tasks: int = 5, # Max concurrent tasks
max_chunks: int = 1000, # Max number of returned chunks
- chunking_config: ChunkingConfig = None, # Chunking configuration object
+ chunking_config: Optional[ChunkingConfig] = None,
):
"""Initialize parser
@@ -125,7 +88,6 @@ class BaseParser(ABC):
max_chunks: Max number of returned chunks
"""
# Storage client instance
- self._storage = None
self.file_name = file_name
self.file_type = file_type or os.path.splitext(file_name)[1]
self.enable_multimodal = enable_multimodal
@@ -133,15 +95,16 @@ class BaseParser(ABC):
self.chunk_overlap = chunk_overlap
self.separators = separators
self.ocr_backend = os.getenv("OCR_BACKEND", ocr_backend)
- self.ocr_config = ocr_config or {}
+ self.ocr_config = ocr_config
self.max_image_size = max_image_size
self.max_concurrent_tasks = max_concurrent_tasks
self.max_chunks = max_chunks
self.chunking_config = chunking_config
-
- logger.info(
- f"Initializing {self.__class__.__name__} for file: {file_name}, type: {self.file_type}"
+ self.storage = create_storage(
+ self.chunking_config.storage_config if self.chunking_config else None
)
+
+ logger.info(f"Initializing parser for file: {file_name}, type: {file_type}")
logger.info(
f"Parser config: chunk_size={chunk_size}, "
f"overlap={chunk_overlap}, "
@@ -150,16 +113,24 @@ class BaseParser(ABC):
f"max_chunks={max_chunks}"
)
# Only initialize Caption service if multimodal is enabled
- if self.enable_multimodal:
- try:
- self.caption_parser = Caption(self.chunking_config.vlm_config)
- except Exception as e:
- logger.warning(f"Failed to initialize Caption service: {str(e)}")
- self.caption_parser = None
- else:
- self.caption_parser = None
+ vlm_config = self.chunking_config.vlm_config if self.chunking_config else None
+ self.caption_parser = (
+ Caption(vlm_config=vlm_config) if self.enable_multimodal else None
+ )
- def perform_ocr(self, image):
+ @abstractmethod
+ def parse_into_text(self, content: bytes) -> Document:
+ """Parse document content
+
+ Args:
+ content: Document content
+
+ Returns:
+            Document containing the parsed text and any extracted images
+ """
+
+ def perform_ocr(self, image: Image.Image):
"""Execute OCR recognition on the image
Args:
@@ -170,53 +141,23 @@ class BaseParser(ABC):
"""
start_time = time.time()
logger.info("Starting OCR recognition")
- resized_image = None
- try:
- # Resize image to avoid processing large images
- resized_image = self._resize_image_if_needed(image)
+ # Resize image to avoid processing large images
+ resized_image = self._resize_image_if_needed(image)
- # Get OCR engine
- ocr_engine = self.get_ocr_engine(
- backend_type=self.ocr_backend, **self.ocr_config
- )
- if ocr_engine is None:
- logger.error(
- f"OCR engine ({self.ocr_backend}) initialization failed or unavailable, "
- "skipping OCR recognition"
- )
- return ""
+ # Get OCR engine
+ ocr_engine = OCREngine.get_instance(self.ocr_backend)
- # Execute OCR prediction
- logger.info(f"Executing OCR prediction (using {self.ocr_backend} engine)")
- # Add extra exception handling
- try:
- ocr_result = ocr_engine.predict(resized_image)
- except RuntimeError as e:
- # Handle common CUDA memory issues or other runtime errors
- logger.error(f"OCR prediction runtime error: {str(e)}")
- return ""
- except Exception as e:
- # Handle other prediction errors
- logger.error(f"Unexpected OCR prediction error: {str(e)}")
- return ""
+ # Execute OCR prediction
+ logger.info(f"Executing OCR prediction (using {self.ocr_backend} engine)")
+ ocr_result = ocr_engine.predict(resized_image)
- process_time = time.time() - start_time
- logger.info(f"OCR recognition completed, time: {process_time:.2f} seconds")
- return ocr_result
- except Exception as e:
- process_time = time.time() - start_time
- logger.error(
- f"OCR recognition error: {str(e)}, time: {process_time:.2f} seconds"
- )
- return ""
- finally:
- # Release image resources
- if resized_image is not image and hasattr(resized_image, "close"):
- # Only close the new image we created, not the original image
- resized_image.close()
+ process_time = time.time() - start_time
+ logger.info(f"OCR recognition completed, time: {process_time:.2f} seconds")
- def _resize_image_if_needed(self, image):
+ return ocr_result
+
+ def _resize_image_if_needed(self, image: Image.Image) -> Image.Image:
"""Resize image if it exceeds maximum size limit
Args:
@@ -225,102 +166,21 @@ class BaseParser(ABC):
Returns:
Resized image object
"""
- try:
- # If it's a PIL Image
- if hasattr(image, "size"):
- width, height = image.size
- if width > self.max_image_size or height > self.max_image_size:
- logger.info(f"Resizing PIL image, original size: {width}x{height}")
- scale = min(
- self.max_image_size / width, self.max_image_size / height
- )
- new_width = int(width * scale)
- new_height = int(height * scale)
- resized_image = image.resize((new_width, new_height))
- logger.info(f"Resized to: {new_width}x{new_height}")
- return resized_image
- else:
- logger.info(
- f"PIL image size {width}x{height} is within limits, no resizing needed"
- )
- return image
- # If it's a numpy array
- elif hasattr(image, "shape"):
- height, width = image.shape[:2]
- if width > self.max_image_size or height > self.max_image_size:
- logger.info(
- f"Resizing numpy image, original size: {width}x{height}"
- )
- scale = min(
- self.max_image_size / width, self.max_image_size / height
- )
- new_width = int(width * scale)
- new_height = int(height * scale)
- # Use PIL for resizing numpy arrays
- pil_image = Image.fromarray(image)
- resized_pil = pil_image.resize((new_width, new_height))
- resized_image = np.array(resized_pil)
- logger.info(f"Resized to: {new_width}x{new_height}")
- return resized_image
- else:
- logger.info(
- f"Numpy image size {width}x{height} is within limits, no resizing needed"
- )
- return image
- else:
- logger.warning(f"Unknown image type: {type(image)}, cannot resize")
- return image
- except Exception as e:
- logger.error(f"Error resizing image: {str(e)}")
- return image
+ width, height = image.size
+ if width > self.max_image_size or height > self.max_image_size:
+ logger.info(f"Resizing PIL image, original size: {width}x{height}")
+ scale = min(self.max_image_size / width, self.max_image_size / height)
+ new_width = int(width * scale)
+ new_height = int(height * scale)
+ resized_image = image.resize((new_width, new_height))
+ logger.info(f"Resized to: {new_width}x{new_height}")
+ return resized_image
- def process_image(self, image, image_url=None):
- """Process image: first perform OCR, then get caption if text is available
+ logger.info(f"PIL image size is {width}x{height}, no resizing needed")
+ return image
- Args:
- image: Image object (PIL.Image or numpy array)
- image_url: Image URL (if uploaded)
-
- Returns:
- tuple: (ocr_text, caption, image_url)
- - ocr_text: OCR extracted text
- - caption: Image description (if OCR has text) or empty string
- - image_url: Image URL (if provided)
- """
- logger.info("Starting image processing (OCR + optional caption)")
-
- # Resize image
- image = self._resize_image_if_needed(image)
-
- # Perform OCR recognition
- ocr_text = self.perform_ocr(image)
- caption = ""
-
- if self.caption_parser:
- logger.info(
- f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption"
- )
- # Convert image to base64 for caption generation
- img_base64 = image_to_base64(image)
- if img_base64:
- caption = self.get_image_caption(img_base64)
- if caption:
- logger.info(f"Successfully obtained image caption: {caption}")
- else:
- logger.warning("Failed to get caption")
- else:
- logger.warning("Failed to convert image to base64")
- caption = ""
- else:
- logger.info("Caption service not initialized, skipping caption retrieval")
-
- # Release image resources
- del image
-
- return ocr_text, caption, image_url
-
- async def process_image_async(self, image, image_url=None):
- """Asynchronously process image: first perform OCR, then get caption if text is available
+ async def process_image_async(self, image: Image.Image, image_url: str):
+ """Asynchronously process image: first perform OCR, then get caption
Args:
image: Image object (PIL.Image or numpy array)
@@ -333,84 +193,47 @@ class BaseParser(ABC):
- image_url: Image URL (if provided)
"""
logger.info("Starting asynchronous image processing (OCR + optional caption)")
- resized_image = None
+ # Resize image
+ resized_image = self._resize_image_if_needed(image)
try:
- # Resize image
- resized_image = self._resize_image_if_needed(image)
-
- # Perform OCR recognition (using run_in_executor to execute synchronous operations in the event loop)
+ # Perform OCR recognition
loop = asyncio.get_event_loop()
try:
# Add timeout mechanism to avoid infinite blocking (30 seconds timeout)
ocr_task = loop.run_in_executor(None, self.perform_ocr, resized_image)
ocr_text = await asyncio.wait_for(ocr_task, timeout=30.0)
- except asyncio.TimeoutError:
- logger.error(
- "OCR processing timed out (30 seconds), skipping this image"
- )
- ocr_text = ""
except Exception as e:
- logger.error(f"OCR processing error: {str(e)}")
+ logger.error(f"OCR processing error, skipping this image: {str(e)}")
ocr_text = ""
- logger.info(
- f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption"
- )
- caption = ""
- if self.caption_parser:
- try:
- # Convert image to base64 for caption generation
- img_base64 = image_to_base64(resized_image)
- if img_base64:
- # Add timeout to avoid blocking caption retrieval (30 seconds timeout)
- caption_task = self.get_image_caption_async(img_base64)
- image_data, caption = await asyncio.wait_for(
- caption_task, timeout=30.0
- )
- if caption:
- logger.info(
- f"Successfully obtained image caption: {caption}"
- )
- else:
- logger.warning("Failed to get caption")
- else:
- logger.warning("Failed to convert image to base64")
- caption = ""
- except asyncio.TimeoutError:
- logger.warning("Caption retrieval timed out, skipping")
- except Exception as e:
- logger.error(f"Failed to get caption: {str(e)}")
- else:
- logger.info(
- "Caption service not initialized, skipping caption retrieval"
- )
-
+            logger.info(f"Successfully obtained OCR text: {ocr_text}")
+ img_base64 = endecode.decode_image(resized_image)
+ caption = self.get_image_caption(img_base64)
+ logger.info(f"Successfully obtained image caption: {caption}")
return ocr_text, caption, image_url
finally:
- # Release image resources
- if resized_image is not image and hasattr(resized_image, "close"):
- # Only close the new image we created, not the original image
- resized_image.close()
+ resized_image.close()
- async def process_with_limit(self, idx, image, url, semaphore):
+ async def process_with_limit(
+ self, idx: int, image: Image.Image, url: str, semaphore: asyncio.Semaphore
+ ):
"""Function to process a single image using a semaphore"""
try:
- logger.info(f"Waiting to process image {idx+1}")
+ logger.info(f"Waiting to process image {idx + 1}")
async with semaphore: # Use semaphore to control concurrency
- logger.info(f"Starting to process image {idx+1}")
+ logger.info(f"Starting to process image {idx + 1}")
result = await self.process_image_async(image, url)
- logger.info(f"Completed processing image {idx+1}")
+ logger.info(f"Completed processing image {idx + 1}")
return result
except Exception as e:
- logger.error(f"Error processing image {idx+1}: {str(e)}")
+ logger.error(f"Error processing image {idx + 1}: {str(e)}")
return ("", "", url) # Return empty result to avoid overall failure
finally:
# Manually release image resources
- if hasattr(image, "close"):
- image.close()
+ image.close()
- async def process_multiple_images(self, images_data):
+ async def process_multiple_images(self, images_data: List[Tuple[Image.Image, str]]):
"""Process multiple images concurrently
Args:
@@ -450,7 +273,7 @@ class BaseParser(ABC):
for i, result in enumerate(completed_results):
if isinstance(result, Exception):
logger.error(
- f"Image {i+1} processing returned an exception: {str(result)}"
+ f"Image {i + 1} processing returned an exception: {str(result)}"
)
# For exceptions, add empty results
if i < len(images_data):
@@ -467,47 +290,10 @@ class BaseParser(ABC):
logger.info("Image processing resource cleanup complete")
logger.info(
- f"Completed concurrent processing of {len(results)}/{len(images_data)} images"
+        f"Concurrently processed {len(results)}/{len(images_data)} images"
)
return results
- def decode_bytes(self, content: bytes) -> str:
- """Intelligently decode byte stream, supports multiple encodings
-
- Tries to decode in common encodings, if all fail, uses latin-1 as fallback
-
- Args:
- content: Byte stream to decode
-
- Returns:
- Decoded string
- """
- logger.info(f"Attempting to decode bytes of length: {len(content)}")
- # Common encodings, sorted by priority
- encodings = ["utf-8", "gb18030", "gb2312", "gbk", "big5", "ascii", "latin-1"]
- text = None
-
- # Try decoding with each encoding format
- for encoding in encodings:
- try:
- text = content.decode(encoding)
- logger.info(f"Successfully decoded content using {encoding} encoding")
- break
- except UnicodeDecodeError:
- logger.info(f"Failed to decode using {encoding} encoding")
- continue
-
- # If all encodings fail, use latin-1 as fallback
- if text is None:
- text = content.decode("latin-1")
- logger.warning(
- f"Unable to determine correct encoding, using latin-1 as fallback. "
- f"This may cause character issues."
- )
-
- logger.info(f"Decoded text length: {len(text)} characters")
- return text
-
def get_image_caption(self, image_data: str) -> str:
"""Get image description
@@ -517,6 +303,9 @@ class BaseParser(ABC):
Returns:
Image description
"""
+ if not self.caption_parser:
+ logger.warning("Caption parser not initialized")
+ return ""
start_time = time.time()
logger.info(
f"Getting caption for image: {image_data[:250]}..."
@@ -533,80 +322,7 @@ class BaseParser(ABC):
logger.warning("Failed to get caption for image")
return caption
- async def get_image_caption_async(self, image_data: str) -> Tuple[str, str]:
- """Asynchronously get image description
-
- Args:
- image_data: Image data (base64 encoded string or URL)
-
- Returns:
- Tuple[str, str]: Image data and corresponding description
- """
- caption = self.get_image_caption(image_data)
- return image_data, caption
-
- def __init_storage(self):
- """Initialize storage client based on configuration"""
- if self._storage is None:
- storage_config = (
- self.chunking_config.storage_config if self.chunking_config else None
- )
- self._storage = create_storage(storage_config)
- logger.info(
- f"Initialized storage client: {self._storage.__class__.__name__}"
- )
- return self._storage
-
- def upload_file(self, file_path: str) -> str:
- """Upload file to object storage
-
- Args:
- file_path: File path
-
- Returns:
- File URL
- """
- logger.info(f"Uploading file: {file_path}")
- try:
- storage = self.__init_storage()
- return storage.upload_file(file_path)
- except Exception as e:
- logger.error(f"Failed to upload file: {str(e)}")
- return ""
-
- def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
- """Upload bytes to object storage
-
- Args:
- content: Byte content to upload
- file_ext: File extension
-
- Returns:
- File URL
- """
- logger.info(f"Uploading bytes content, size: {len(content)} bytes")
- try:
- storage = self.__init_storage()
- return storage.upload_bytes(content, file_ext)
- except Exception as e:
- logger.error(f"Failed to upload bytes to storage: {str(e)}")
- traceback.print_exc()
- return ""
-
- @abstractmethod
- def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
- """Parse document content
-
- Args:
- content: Document content
-
- Returns:
- Either a string containing the parsed text, or a tuple of (text, image_map)
- where image_map is a dict mapping image URLs to Image objects
- """
- pass
-
- def parse(self, content: bytes) -> ParseResult:
+ def parse(self, content: bytes) -> Document:
"""Parse document content
Args:
@@ -616,17 +332,19 @@ class BaseParser(ABC):
Parse result
"""
logger.info(
- f"Parsing document with {self.__class__.__name__}, content size: {len(content)} bytes"
+ f"Parsing document with {self.__class__.__name__}, bytes: {len(content)}"
)
- parse_result = self.parse_into_text(content)
- if isinstance(parse_result, tuple):
- text, image_map = parse_result
- else:
- text = parse_result
- image_map = {}
- logger.info(f"Extracted {len(text)} characters of text from {self.file_name}")
- logger.info(f"Beginning chunking process for text")
- chunks = self.chunk_text(text)
+ document = self.parse_into_text(content)
+ logger.info(
+ f"Extracted {len(document.content)} characters from {self.file_name}"
+ )
+ splitter = TextSplitter(
+ chunk_size=self.chunk_size,
+ chunk_overlap=self.chunk_overlap,
+ separators=self.separators,
+ )
+ chunk_str = splitter.split_text(document.content)
+ chunks = self._str_to_chunk(chunk_str)
logger.info(f"Created {len(chunks)} chunks from document")
# Limit the number of returned chunks
@@ -636,7 +354,7 @@ class BaseParser(ABC):
)
chunks = chunks[: self.max_chunks]
- # If multimodal is enabled and file type is supported, process images in each chunk
+ # If multimodal is enabled and file type is supported, process images
if self.enable_multimodal:
# Get file extension and convert to lowercase
file_ext = (
@@ -647,11 +365,12 @@ class BaseParser(ABC):
# Define allowed file types for image processing
allowed_types = [
- ".pdf", # PDF files
+            # Document files
+ ".pdf",
".md",
- ".markdown", # Markdown files
+ ".markdown",
".doc",
- ".docx", # Word documents
+ ".docx",
# Image files
".jpg",
".jpeg",
@@ -666,13 +385,21 @@ class BaseParser(ABC):
logger.info(
f"Processing images in each chunk for file type: {file_ext}"
)
- chunks = self.process_chunks_images(chunks, image_map)
+ chunks = self.process_chunks_images(chunks, document.images)
else:
logger.info(
f"Skipping image processing for unsupported file type: {file_ext}"
)
- return ParseResult(text=text, chunks=chunks)
+ document.chunks = chunks
+ return document
+
+ def _str_to_chunk(self, text: List[Tuple[int, int, str]]) -> List[Chunk]:
+        """Convert (start, end, text) spans into Chunk objects"""
+ return [
+ Chunk(seq=i, content=t, start=start, end=end)
+ for i, (start, end, t) in enumerate(text)
+ ]
def _split_into_units(self, text: str) -> List[str]:
"""
@@ -682,9 +409,7 @@ class BaseParser(ABC):
Returns:
List of basic units
"""
- logger.info(
- f"Splitting text into basic units with robust structure protection, text length: {len(text)}"
- )
+ logger.info(f"Splitting text into basic units, text length: {len(text)}")
# Define all structural patterns that must be protected as whole units ---
table_pattern = r"(?m)(^\|.*\|[ \t]*\r?\n(?:[ \t]*\r?\n)?^\|\s*:?--+.*\r?\n(?:^\|.*\|\r?\n?)*)"
@@ -710,7 +435,8 @@ class BaseParser(ABC):
# Sort by start position
protected_ranges.sort(key=lambda x: x[0])
logger.info(
- f"Found {len(protected_ranges)} protected structures (tables, code, formulas, images, links)."
+ f"Found {len(protected_ranges)} protected structures "
+ "(tables, code, formulas, images, links)."
)
# Merge potentially overlapping protected ranges ---
@@ -731,7 +457,7 @@ class BaseParser(ABC):
merged_ranges.append((current_start, current_end))
protected_ranges = merged_ranges
logger.info(
- f"After merging overlaps, {len(protected_ranges)} protected ranges remain."
+        f"After merging, {len(protected_ranges)} protected ranges remain."
)
# Split the text according to protected ranges and separators ---
@@ -749,7 +475,7 @@ class BaseParser(ABC):
segments = re.split(separator_pattern, pre_text)
units.extend([s for s in segments if s])  # add all non-empty parts
-            # b. Add the whole protected block (e.g., a complete table) as a single, indivisible unit
+            # b. Add the whole protected block (e.g., a complete table) as an indivisible unit
protected_text = text[start:end]
units.append(protected_text)
@@ -764,38 +490,6 @@ class BaseParser(ABC):
logger.info(f"Text splitting complete, created {len(units)} final basic units.")
return units
- def _find_complete_units(self, units: List[str], target_size: int) -> List[str]:
- """Find a list of complete units that do not exceed the target size
-
- Args:
- units: List of units
- target_size: Target size
-
- Returns:
- List of complete units
- """
- logger.info(f"Finding complete units with target size: {target_size}")
- result = []
- current_size = 0
-
- for unit in units:
- unit_size = len(unit)
- if current_size + unit_size > target_size and result:
- logger.info(
- f"Reached target size limit at {current_size} characters, stopping"
- )
- break
- result.append(unit)
- current_size += unit_size
- logger.info(
- f"Added unit of size {unit_size}, current total: {current_size}/{target_size}"
- )
-
- logger.info(
- f"Found {len(result)} complete units totaling {current_size} characters"
- )
- return result
-
def chunk_text(self, text: str) -> List[Chunk]:
"""Chunk text, preserving Markdown structure
@@ -825,7 +519,7 @@ class BaseParser(ABC):
for i, unit in enumerate(units):
unit_size = len(unit)
- logger.info(f"Processing unit {i+1}/{len(units)}, size: {unit_size}")
+ logger.info(f"Processing unit {i + 1}/{len(units)}, size: {unit_size}")
# If current chunk plus new unit exceeds size limit, create new chunk
if current_size + unit_size > self.chunk_size and current_chunk:
@@ -855,14 +549,12 @@ class BaseParser(ABC):
for u in reversed(current_chunk):
if overlap_size + len(u) > overlap_target:
logger.info(
- f"Reached overlap target ({overlap_size}/{overlap_target})"
+ f"Overlap target ({overlap_size}/{overlap_target})"
)
break
overlap_units.insert(0, u)
overlap_size += len(u)
- logger.info(
- f"Added unit to overlap, current overlap size: {overlap_size}"
- )
+ logger.info(f"Added unit to overlap, size: {overlap_size}")
# Remove elements from overlap that are included in separators
start_index = 0
@@ -883,7 +575,7 @@ class BaseParser(ABC):
overlap_units = overlap_units[start_index:]
logger.info(
- f"Final overlap: {len(overlap_units)} units, {overlap_size} characters"
+                    f"Overlap: {len(overlap_units)} units, {overlap_size} chars"
)
current_chunk = overlap_units
@@ -899,7 +591,7 @@ class BaseParser(ABC):
current_chunk.append(unit)
current_size += unit_size
logger.info(
- f"Added unit to current chunk, now at {current_size}/{self.chunk_size} characters"
+ f"Added unit to current chunk, at {current_size}/{self.chunk_size}"
)
# Add the last chunk
@@ -925,12 +617,13 @@ class BaseParser(ABC):
chunk: Document chunk
Returns:
- List of image information, each element contains image URL and match position
+ List of image information
"""
logger.info(f"Extracting image information from Chunk #{chunk.seq}")
text = chunk.content
- # Regex to extract image information from text, supporting Markdown images and HTML images
+ # Regex to extract image information from text,
+ # support: Markdown images, HTML images
img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)|<img [^>]*src="([^"]+)" [^>]*>'
# Extract image information
@@ -954,28 +647,28 @@ class BaseParser(ABC):
images_info.append(image_info)
logger.info(
- f"Image in Chunk #{chunk.seq} {match_idx+1}: " f"URL={img_url[:50]}..."
+ f"Image in Chunk #{chunk.seq} {match_idx + 1}: URL={img_url[:50]}..."
if len(img_url) > 50
- else f"Image in Chunk #{chunk.seq} {match_idx+1}: URL={img_url}"
+ else f"Image in Chunk #{chunk.seq} {match_idx + 1}: URL={img_url}"
)
return images_info
- async def download_and_upload_image(self, img_url: str):
- """Download image and upload to object storage, if it's already an object storage path or local path, use directly
+ async def download_and_upload_image(
+ self, img_url: str
+ ) -> Tuple[str, str, Image.Image | None]:
+ """Download image and upload to object storage,
+ if it's already an object storage path or local path, use directly
Args:
img_url: Image URL or local path
Returns:
- tuple: (original URL, storage URL, image object), if failed returns (original URL, None, None)
+ tuple: (original URL, storage URL, image object),
+            on failure returns (original URL, original URL, None)
"""
try:
- import requests
- from PIL import Image
- import io
-
# Check if it's already a storage URL (COS or MinIO)
is_storage_url = any(
pattern in img_url
@@ -997,12 +690,7 @@ class BaseParser(ABC):
response = requests.get(img_url, timeout=5, proxies=proxies)
if response.status_code == 200:
image = Image.open(io.BytesIO(response.content))
- try:
- return img_url, img_url, image
- finally:
- # Ensure image resources are also released after the function returns
- # Image will be closed by the caller
- pass
+ return img_url, img_url, image
else:
logger.warning(
f"Failed to get storage image: {response.status_code}"
@@ -1022,7 +710,7 @@ class BaseParser(ABC):
# Upload to storage
with open(img_url, "rb") as f:
content = f.read()
- storage_url = self.upload_bytes(content)
+ storage_url = self.storage.upload_bytes(content)
logger.info(
f"Successfully uploaded local image to storage: {storage_url}"
)
@@ -1031,7 +719,7 @@ class BaseParser(ABC):
logger.error(f"Error processing local image: {str(e)}")
if image and hasattr(image, "close"):
image.close()
- return img_url, None, None
+ return img_url, img_url, None
# Normal remote URL download handling
else:
@@ -1044,9 +732,7 @@ class BaseParser(ABC):
if https_proxy:
proxies["https"] = https_proxy
- logger.info(
- f"Downloading image {img_url}, using proxy: {proxies if proxies else 'None'}"
- )
+ logger.info(f"Downloading image {img_url}, using proxy: {proxies}")
response = requests.get(img_url, timeout=5, proxies=proxies)
if response.status_code == 200:
@@ -1054,7 +740,7 @@ class BaseParser(ABC):
image = Image.open(io.BytesIO(response.content))
try:
# Upload to storage using the method in BaseParser
- storage_url = self.upload_bytes(response.content)
+ storage_url = self.storage.upload_bytes(response.content)
logger.info(
f"Successfully uploaded image to storage: {storage_url}"
)
@@ -1064,11 +750,11 @@ class BaseParser(ABC):
pass
else:
logger.warning(f"Failed to download image: {response.status_code}")
- return img_url, None, None
+ return img_url, img_url, None
except Exception as e:
logger.error(f"Error downloading or processing image: {str(e)}")
- return img_url, None, None
+ return img_url, img_url, None
async def process_chunk_images_async(
self, chunk, chunk_idx, total_chunks, image_map=None
@@ -1086,18 +772,19 @@ class BaseParser(ABC):
"""
logger.info(
- f"Starting to process images in Chunk #{chunk_idx+1}/{total_chunks}"
+ f"Starting to process images in Chunk #{chunk_idx + 1}/{total_chunks}"
)
# Extract image information from the Chunk
images_info = self.extract_images_from_chunk(chunk)
if not images_info:
- logger.info(f"Chunk #{chunk_idx+1} found no images")
+ logger.info(f"Chunk #{chunk_idx + 1} found no images")
return chunk
# Prepare images that need to be downloaded and processed
images_to_process = []
- url_to_info_map = {} # Map URL to image information
+ # Map URL to image information
+ url_to_info_map = {}
# Record all image URLs that need to be processed
for img_info in images_info:
@@ -1106,14 +793,21 @@ class BaseParser(ABC):
results = []
download_tasks = []
- for img_url in url_to_info_map.keys(): # Check if image is already in the image_map
+ # Check if image is already in the image_map
+ for img_url in url_to_info_map.keys():
if image_map and img_url in image_map:
- logger.info(f"Image already in image_map: {img_url}, using cached object")
- results.append((img_url, img_url, image_map[img_url]))
+ logger.info(
+ f"Image already in image_map: {img_url}, using cached object"
+ )
+ image = Image.open(
+ io.BytesIO(endecode.encode_image(image_map[img_url]))
+ )
+ results.append((img_url, img_url, image))
else:
download_task = self.download_and_upload_image(img_url)
download_tasks.append(download_task)
- # Concurrent download and upload of images, ignore images that are already in the image_map
+ # Concurrent download and upload of images,
+ # ignore images that are already in the image_map
results.extend(await asyncio.gather(*download_tasks))
# Process download results, prepare for OCR processing
@@ -1123,16 +817,17 @@ class BaseParser(ABC):
img_info["cos_url"] = cos_url
images_to_process.append((image, cos_url))
- # If no images were successfully downloaded and uploaded, return the original Chunk
+ # If no images were successfully downloaded and uploaded,
+ # return the original Chunk
if not images_to_process:
logger.info(
- f"Chunk #{chunk_idx+1} found no successfully downloaded and uploaded images"
+                f"Chunk #{chunk_idx + 1} has no successfully downloaded images"
)
return chunk
# Concurrent processing of all images (OCR + caption)
logger.info(
- f"Processing {len(images_to_process)} images in Chunk #{chunk_idx+1}"
+ f"Processing {len(images_to_process)} images in Chunk #{chunk_idx + 1}"
)
# Concurrent processing of all images
@@ -1163,10 +858,12 @@ class BaseParser(ABC):
# Update image information in the Chunk
chunk.images = processed_images
- logger.info(f"Completed image processing in Chunk #{chunk_idx+1}")
+ logger.info(f"Completed image processing in Chunk #{chunk_idx + 1}")
return chunk
- def process_chunks_images(self, chunks: List[Chunk], image_map=None) -> List[Chunk]:
+ def process_chunks_images(
+ self, chunks: List[Chunk], image_map: Dict[str, str] = {}
+ ) -> List[Chunk]:
"""Concurrent processing of images in all Chunks
Args:
@@ -1210,7 +907,7 @@ class BaseParser(ABC):
processed_chunks = []
for i, result in enumerate(results):
if isinstance(result, Exception):
- logger.error(f"Error processing Chunk {i+1}: {str(result)}")
+ logger.error(f"Error processing Chunk {i + 1}: {str(result)}")
# Keep original Chunk
if i < len(chunks):
processed_chunks.append(chunks[i])
@@ -1235,7 +932,7 @@ class BaseParser(ABC):
# Execute processing for all Chunks
processed_chunks = loop.run_until_complete(process_all_chunks())
logger.info(
- f"Successfully completed concurrent processing of {len(processed_chunks)}/{len(chunks)} chunks"
+ f"Completed processing of {len(processed_chunks)}/{len(chunks)} chunks"
)
return processed_chunks
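
A sketch (not part of the diff) of the new chunking flow in BaseParser.parse(): the splitter is assumed, per _str_to_chunk above, to yield (start, end, text) spans, which are then wrapped into Chunk models.

```python
from docreader.models.document import Chunk
from docreader.splitter.splitter import TextSplitter

splitter = TextSplitter(chunk_size=512, chunk_overlap=50, separators=["\n\n", "\n", "。"])
spans = splitter.split_text("First paragraph.\n\nSecond paragraph.")

# Same conversion _str_to_chunk performs on the splitter output.
chunks = [Chunk(seq=i, content=t, start=s, end=e) for i, (s, e, t) in enumerate(spans)]
```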
diff --git a/docreader/parser/caption.py b/docreader/parser/caption.py
index d84bf6d..f3e6f69 100644
--- a/docreader/parser/caption.py
+++ b/docreader/parser/caption.py
@@ -3,11 +3,10 @@ import logging
import os
import time
from dataclasses import dataclass, field
-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union
-import requests
import ollama
-
+import requests
logger = logging.getLogger(__name__)
@@ -158,11 +157,16 @@ class CaptionChatResp:
Returns:
The content string from the first choice, or empty string if no choices
"""
- if self.choices:
- logger.info("Retrieving content from first choice")
- return self.choices[0].message.content
- logger.warning("No choices available in response")
- return ""
+ if (
+ not self.choices
+ or not self.choices[0]
+ or not self.choices[0].message
+ or not self.choices[0].message.content
+ ):
+ logger.warning("No choices available in response")
+ return ""
+ logger.info("Retrieving content from first choice")
+ return self.choices[0].message.content
class Caption:
@@ -171,33 +175,43 @@ class Caption:
Uses an external API to process images and return textual descriptions.
"""
- def __init__(self, vlm_config=None):
- """Initialize the Caption service with configuration from parameters or environment variables."""
+ def __init__(self, vlm_config: Optional[Dict[str, str]] = None):
+ """
+ Initialize the Caption service with configuration
+ from parameters or environment variables.
+ """
logger.info("Initializing Caption service")
self.prompt = """简单凝炼的描述图片的主要内容"""
-
- # Use provided VLM config if available, otherwise fall back to environment variables
+ self.timeout = 30
+
+ # Use provided VLM config if available,
+ # otherwise fall back to environment variables
if vlm_config and vlm_config.get("base_url") and vlm_config.get("model_name"):
self.completion_url = vlm_config.get("base_url", "") + "/chat/completions"
self.model = vlm_config.get("model_name", "")
self.api_key = vlm_config.get("api_key", "")
self.interface_type = vlm_config.get("interface_type", "openai").lower()
else:
- if os.getenv("VLM_MODEL_BASE_URL") == "" or os.getenv("VLM_MODEL_NAME") == "":
+ base_url = os.getenv("VLM_MODEL_BASE_URL")
+ model_name = os.getenv("VLM_MODEL_NAME")
+ if not base_url or not model_name:
logger.error("VLM_MODEL_BASE_URL or VLM_MODEL_NAME is not set")
return
- self.completion_url = os.getenv("VLM_MODEL_BASE_URL") + "/chat/completions"
- self.model = os.getenv("VLM_MODEL_NAME")
- self.api_key = os.getenv("VLM_MODEL_API_KEY")
+ self.completion_url = base_url + "/chat/completions"
+ self.model = model_name
+ self.api_key = os.getenv("VLM_MODEL_API_KEY", "")
self.interface_type = os.getenv("VLM_INTERFACE_TYPE", "openai").lower()
-
+
# Validate the interface type
if self.interface_type not in ["ollama", "openai"]:
- logger.warning(f"Unknown interface type: {self.interface_type}, defaulting to openai")
+ logger.warning(
+ f"Unknown interface type: {self.interface_type}, defaulting to openai"
+ )
self.interface_type = "openai"
-
+
logger.info(
- f"Service configured with model: {self.model}, endpoint: {self.completion_url}, interface: {self.interface_type}"
+ f"Configured with model: {self.model}, "
+ f"endpoint: {self.completion_url}, interface: {self.interface_type}"
)
def _call_caption_api(self, image_data: str) -> Optional[CaptionChatResp]:
@@ -210,8 +224,8 @@ class Caption:
Returns:
CaptionChatResp object if successful, None otherwise
"""
- logger.info(f"Calling Caption API for image captioning")
- logger.info(f"Processing image data: {image_data[:50] if len(image_data) > 50 else image_data}")
+ logger.info("Calling Caption API for image captioning")
+ logger.info(f"Processing image data: {image_data[:50]}...")
# Choose the call path based on the interface type
if self.interface_type == "ollama":
@@ -226,39 +240,35 @@ class Caption:
client = ollama.Client(
host=host,
+ timeout=self.timeout,
)
-
+
try:
logger.info(f"Calling Ollama API with model: {self.model}")
-
+
# Call the Ollama API, passing the base64-encoded image via the images parameter
response = client.generate(
model=self.model,
prompt="简单凝炼的描述图片的主要内容",
- images=[image_base64], # image_base64是base64编码的图片数据
+ images=[image_base64], # image_base64是base64编码的图片数据
options={"temperature": 0.1},
stream=False,
)
-
+
# Build the response object
caption_resp = CaptionChatResp(
id="ollama_response",
created=int(time.time()),
- model=self.model,
+ model=Model(id=self.model),
object="chat.completion",
choices=[
- Choice(
- message=Message(
- role="assistant",
- content=response.response
- )
- )
- ]
+ Choice(message=Message(role="assistant", content=response.response))
+ ],
)
-
+
logger.info("Successfully received response from Ollama API")
return caption_resp
-
+
except Exception as e:
logger.error(f"Error calling Ollama API: {e}")
return None
@@ -266,13 +276,16 @@ class Caption:
def _call_openai_api(self, image_base64: str) -> Optional[CaptionChatResp]:
"""Call OpenAI-compatible API for image captioning."""
logger.info(f"Calling OpenAI-compatible API with model: {self.model}")
-
+
user_msg = UserMessage(
role="user",
content=[
Content(type="text", text=self.prompt),
Content(
- type="image_url", image_url=ImageUrl(url="data:image/png;base64," + image_base64, detail="auto")
+ type="image_url",
+ image_url=ImageUrl(
+ url="data:image/png;base64," + image_base64, detail="auto"
+ ),
),
],
)
@@ -295,23 +308,23 @@ class Caption:
headers["Authorization"] = f"Bearer {self.api_key}"
try:
- logger.info(f"Sending request to OpenAI-compatible API with model: {self.model}")
+ logger.info(
+ f"Sending request to OpenAI-compatible API with model: {self.model}"
+ )
response = requests.post(
self.completion_url,
data=json.dumps(gpt_req, default=lambda o: o.__dict__, indent=4),
headers=headers,
- timeout=30,
+ timeout=self.timeout,
)
if response.status_code != 200:
logger.error(
- f"OpenAI-compatible API returned non-200 status code: {response.status_code}"
+ f"OpenAI API returned non-200 status code: {response.status_code}"
)
response.raise_for_status()
- logger.info(
- f"Successfully received response from OpenAI-compatible API with status: {response.status_code}"
- )
- logger.info(f"Converting response to CaptionChatResp object")
+ logger.info(f"Received from OpenAI with status: {response.status_code}")
+ logger.info("Converting response to CaptionChatResp object")
caption_resp = CaptionChatResp.from_json(response.json())
if caption_resp.usage:
@@ -322,7 +335,7 @@ class Caption:
return caption_resp
except requests.exceptions.Timeout:
- logger.error(f"Timeout while calling OpenAI-compatible API after 30 seconds")
+ logger.error("Timeout while calling OpenAI-compatible API after 30 seconds")
return None
except requests.exceptions.RequestException as e:
logger.error(f"Request error calling OpenAI-compatible API: {e}")
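
An illustrative Caption configuration; the keys match those read in __init__, the endpoint/model/key values are placeholders, and the same settings can instead come from the VLM_MODEL_* environment variables:

```python
from docreader.parser.caption import Caption

caption = Caption(
    vlm_config={
        "base_url": "http://localhost:11434/v1",  # placeholder endpoint
        "model_name": "qwen2.5-vl",               # placeholder model
        "api_key": "",                            # optional bearer token
        "interface_type": "openai",               # or "ollama"
    }
)
```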
diff --git a/docreader/parser/chain_parser.py b/docreader/parser/chain_parser.py
new file mode 100644
index 0000000..45fd1c9
--- /dev/null
+++ b/docreader/parser/chain_parser.py
@@ -0,0 +1,70 @@
+import logging
+from typing import List, Tuple, Type
+
+from docreader.models.document import Document
+from docreader.parser.base_parser import BaseParser
+from docreader.utils import endecode
+
+logger = logging.getLogger(__name__)
+
+
+class FirstParser(BaseParser):
+ _parser_cls: Tuple[Type["BaseParser"], ...] = ()
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ self._parsers: List[BaseParser] = []
+ for parser_cls in self._parser_cls:
+ try:
+ parser = parser_cls(*args, **kwargs)
+ self._parsers.append(parser)
+ except Exception as e:
+ logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
+
+ def parse_into_text(self, content: bytes) -> Document:
+ for p in self._parsers:
+ document = p.parse_into_text(content)
+ if document.is_valid():
+ return document
+ return Document()
+
+ @classmethod
+ def create(cls, *parser_classes: Type["BaseParser"]) -> Type["FirstParser"]:
+ names = "_".join([p.__name__ for p in parser_classes])
+ return type(f"FirstParser_{names}", (cls,), {"_parser_cls": parser_classes})
+
+
+class PipelineParser(BaseParser):
+ _parser_cls: Tuple[Type["BaseParser"], ...] = ()
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ self._parsers: List[BaseParser] = []
+ for parser_cls in self._parser_cls:
+ try:
+ parser = parser_cls(*args, **kwargs)
+ self._parsers.append(parser)
+ except Exception as e:
+ logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
+
+ def parse_into_text(self, content: bytes) -> Document:
+ document = Document()
+ for p in self._parsers:
+ document = p.parse_into_text(content)
+ content = endecode.encode_bytes(document.content)
+ return document
+
+ @classmethod
+ def create(cls, *parser_classes: Type["BaseParser"]) -> Type["PipelineParser"]:
+ names = "_".join([p.__name__ for p in parser_classes])
+ return type(f"PipelineParser_{names}", (cls,), {"_parser_cls": parser_classes})
+
+
+if __name__ == "__main__":
+ from docreader.parser.markdown_parser import MarkdownParser
+
+ cls = FirstParser.create(MarkdownParser)
+ parser = cls()
+ print(parser.parse_into_text(b"aaa"))
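
A companion sketch to the __main__ demo above: PipelineParser chains parsers so the text produced by one stage is re-encoded and fed to the next. The choice and order of parser classes here are purely illustrative.

```python
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownParser
from docreader.parser.text_parser import TextParser

Pipeline = PipelineParser.create(TextParser, MarkdownParser)
parser = Pipeline()
print(parser.parse_into_text(b"# Title\n\nBody text"))
```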
diff --git a/docreader/parser/config.py b/docreader/parser/config.py
deleted file mode 100644
index 85f9cb5..0000000
--- a/docreader/parser/config.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from dataclasses import dataclass, field
-
-
-@dataclass
-class ChunkingConfig:
- """
- Configuration for text chunking process.
- Controls how documents are split into smaller pieces for processing.
- """
-
- chunk_size: int = 512 # Maximum size of each chunk in tokens/chars
- chunk_overlap: int = 50 # Number of tokens/chars to overlap between chunks
- separators: list = field(
- default_factory=lambda: ["\n\n", "\n", "。"]
- ) # Text separators in order of priority
- enable_multimodal: bool = (
- False # Whether to enable multimodal processing (text + images)
- )
- storage_config: dict = None # Preferred field name going forward
- vlm_config: dict = None # VLM configuration for image captioning
-
diff --git a/docreader/parser/doc_parser.py b/docreader/parser/doc_parser.py
index 71fc897..337ab85 100644
--- a/docreader/parser/doc_parser.py
+++ b/docreader/parser/doc_parser.py
@@ -1,134 +1,88 @@
-import asyncio
import logging
-import re
-import tempfile
import os
import subprocess
-import shutil
-from io import BytesIO
-from typing import Optional, List, Tuple
-import textract
-from PIL import Image
-import zipfile
-import xml.etree.ElementTree as ET
+from typing import List, Optional
-from .base_parser import BaseParser
-from .docx_parser import DocxParser, Docx
+import textract
+
+from docreader.models.document import Document
+from docreader.parser.docx2_parser import Docx2Parser
+from docreader.utils.tempfile import TempDirContext, TempFileContext
logger = logging.getLogger(__name__)
-class DocParser(BaseParser):
+class DocParser(Docx2Parser):
"""DOC document parser"""
- def parse_into_text(self, content: bytes) -> str:
- """Parse DOC document
-
- Args:
- content: DOC document content
-
- Returns:
- Parse result
- """
+ def parse_into_text(self, content: bytes) -> Document:
logger.info(f"Parsing DOC document, content size: {len(content)} bytes")
+ handle_chain = [
+ # 1. Try to convert to docx format to extract images
+ self._parse_with_docx,
+ # 2. If image extraction is not needed or conversion failed,
+ # try using antiword to extract text
+ self._parse_with_antiword,
+ # 3. If antiword extraction fails, use textract
+ self._parse_with_textract,
+ ]
+
# Save byte content as a temporary file
- with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as temp_file:
- temp_file_path = temp_file.name
- temp_file.write(content)
- temp_file.flush()
- logger.info(f"Saved DOC content to temporary file: {temp_file_path}")
+ with TempFileContext(content, ".doc") as temp_file_path:
+ for handle in handle_chain:
+ try:
+ document = handle(temp_file_path)
+ if document:
+ return document
+ except Exception as e:
+                    logger.warning(f"Failed to parse DOC with {handle.__name__}: {e}")
- try:
- # First try to convert to docx format to extract images
- if self.enable_multimodal:
- logger.info("Multimodal enabled, attempting to extract images from DOC")
- docx_content = self._convert_doc_to_docx(temp_file_path)
+ return Document(content="")
- if docx_content:
- logger.info("Successfully converted DOC to DOCX, using DocxParser")
- # Use existing DocxParser to parse the converted docx
- docx_parser = DocxParser(
- file_name=self.file_name,
- file_type="docx",
- enable_multimodal=self.enable_multimodal,
- chunk_size=self.chunk_size,
- chunk_overlap=self.chunk_overlap,
- chunking_config=self.chunking_config,
- separators=self.separators,
- )
- text = docx_parser.parse_into_text(docx_content)
- logger.info(f"Extracted {len(text)} characters using DocxParser")
+ def _parse_with_docx(self, temp_file_path: str) -> Document:
+        logger.info("Attempting to convert DOC to DOCX to extract text and images")
- # Clean up temporary file
- os.unlink(temp_file_path)
- logger.info(f"Deleted temporary file: {temp_file_path}")
+ docx_content = self._try_convert_doc_to_docx(temp_file_path)
+ if not docx_content:
+ raise RuntimeError("Failed to convert DOC to DOCX")
- return text
- else:
- logger.warning(
- "Failed to convert DOC to DOCX, falling back to text-only extraction"
- )
+ logger.info("Successfully converted DOC to DOCX, using DocxParser")
+ # Use existing DocxParser to parse the converted docx
+ document = super(Docx2Parser, self).parse_into_text(docx_content)
+ logger.info(f"Extracted {len(document.content)} characters using DocxParser")
+ return document
- # If image extraction is not needed or conversion failed, try using antiword to extract text
- try:
- logger.info("Attempting to parse DOC file with antiword")
- # Check if antiword is installed
- antiword_path = self._find_antiword_path()
+ def _parse_with_antiword(self, temp_file_path: str) -> Document:
+ logger.info("Attempting to parse DOC file with antiword")
- if antiword_path:
- # Use antiword to extract text directly
- logger.info(f"Using antiword at {antiword_path} to extract text")
- process = subprocess.Popen(
- [antiword_path, temp_file_path],
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- )
- stdout, stderr = process.communicate()
+ # Check if antiword is installed
+ antiword_path = self._try_find_antiword()
+ if not antiword_path:
+ raise RuntimeError("antiword not found in PATH")
- if process.returncode == 0:
- text = stdout.decode("utf-8", errors="ignore")
- logger.info(
- f"Successfully extracted {len(text)} characters using antiword"
- )
-
- # Clean up temporary file
- os.unlink(temp_file_path)
- logger.info(f"Deleted temporary file: {temp_file_path}")
-
- return text
- else:
- logger.warning(
- f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
- )
- else:
- logger.warning("antiword not found, falling back to textract")
- except Exception as e:
- logger.warning(
- f"Error using antiword: {str(e)}, falling back to textract"
- )
-
- # If antiword fails, try using textract
- logger.info("Parsing DOC file with textract")
- text = textract.process(temp_file_path, method="antiword").decode("utf-8")
- logger.info(
- f"Successfully extracted {len(text)} characters of text from DOC document using textract"
+ # Use antiword to extract text directly
+ process = subprocess.Popen(
+ [antiword_path, temp_file_path],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ stdout, stderr = process.communicate()
+ if process.returncode != 0:
+ raise RuntimeError(
+ f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
)
+ text = stdout.decode("utf-8", errors="ignore")
+ logger.info(f"Successfully extracted {len(text)} characters using antiword")
+ return Document(content=text)
- # Clean up temporary file
- os.unlink(temp_file_path)
- logger.info(f"Deleted temporary file: {temp_file_path}")
+ def _parse_with_textract(self, temp_file_path: str) -> Document:
+ logger.info(f"Parsing DOC file with textract: {temp_file_path}")
+ text = textract.process(temp_file_path, method="antiword").decode("utf-8")
+        logger.info(f"Extracted {len(text)} characters from DOC using textract")
+ return Document(content=str(text))
- return text
- except Exception as e:
- logger.error(f"Error parsing DOC document: {str(e)}")
- # Ensure temporary file is cleaned up
- if os.path.exists(temp_file_path):
- os.unlink(temp_file_path)
- logger.info(f"Deleted temporary file after error: {temp_file_path}")
- return ""
-
- def _convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
+ def _try_convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
"""Convert DOC file to DOCX format
Uses LibreOffice/OpenOffice for conversion
@@ -141,21 +95,16 @@ class DocParser(BaseParser):
"""
logger.info(f"Converting DOC to DOCX: {doc_path}")
+ # Check if LibreOffice or OpenOffice is installed
+ soffice_path = self._try_find_soffice()
+ if not soffice_path:
+ return None
+
+ # Execute conversion command
+ logger.info(f"Using {soffice_path} to convert DOC to DOCX")
+
# Create a temporary directory to store the converted file
- temp_dir = tempfile.mkdtemp()
- docx_path = os.path.join(temp_dir, "converted.docx")
-
- try:
- # Check if LibreOffice or OpenOffice is installed
- soffice_path = self._find_soffice_path()
- if not soffice_path:
- logger.error(
- "LibreOffice/OpenOffice not found, cannot convert DOC to DOCX"
- )
- return None
-
- # Execute conversion command
- logger.info(f"Using {soffice_path} to convert DOC to DOCX")
+ with TempDirContext() as temp_dir:
cmd = [
soffice_path,
"--headless",
@@ -165,7 +114,6 @@ class DocParser(BaseParser):
temp_dir,
doc_path,
]
-
logger.info(f"Running command: {' '.join(cmd)}")
process = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
@@ -173,41 +121,68 @@ class DocParser(BaseParser):
stdout, stderr = process.communicate()
if process.returncode != 0:
- logger.error(
- f"Error converting DOC to DOCX: {stderr.decode('utf-8', errors='ignore')}"
+ logger.warning(
+ f"Error converting DOC to DOCX: {stderr.decode('utf-8')}"
)
return None
# Find the converted file
- for file in os.listdir(temp_dir):
- if file.endswith(".docx"):
- converted_file = os.path.join(temp_dir, file)
- logger.info(f"Found converted file: {converted_file}")
-
- # Read the converted file content
- with open(converted_file, "rb") as f:
- docx_content = f.read()
+ docx_file = [
+ file for file in os.listdir(temp_dir) if file.endswith(".docx")
+ ]
+ logger.info(f"Found {len(docx_file)} DOCX file(s) in temporary directory")
+ for file in docx_file:
+ converted_file = os.path.join(temp_dir, file)
+ logger.info(f"Found converted file: {converted_file}")
+ # Read the converted file content
+ with open(converted_file, "rb") as f:
+ docx_content = f.read()
logger.info(
- f"Successfully read converted DOCX file, size: {len(docx_content)} bytes"
+ f"Successfully read DOCX file, size: {len(docx_content)}"
)
return docx_content
+ return None
- logger.error("No DOCX file found after conversion")
- return None
+ def _try_find_executable_path(
+ self,
+ executable_name: str,
+ possible_path: List[str] = [],
+ environment_variable: List[str] = [],
+ ) -> Optional[str]:
+ """Find executable path
+ Args:
+ executable_name: Executable name
+ possible_path: List of possible paths
+ environment_variable: List of environment variables to check
+ Returns:
+ Executable path, or None if not found
+ """
+ # Common executable paths
+ paths: List[str] = []
+ paths.extend(possible_path)
+ paths.extend(os.environ.get(env_var, "") for env_var in environment_variable)
+ paths = list(set(paths))
- except Exception as e:
- logger.error(f"Error during DOC to DOCX conversion: {str(e)}")
- return None
- finally:
- # Clean up temporary directory
- try:
- shutil.rmtree(temp_dir)
- logger.info(f"Cleaned up temporary directory: {temp_dir}")
- except Exception as e:
- logger.warning(f"Failed to clean up temporary directory: {str(e)}")
+        # Check candidate locations, including paths from environment variables
+ for path in paths:
+ if os.path.exists(path):
+ logger.info(f"Found {executable_name} at {path}")
+ return path
- def _find_soffice_path(self) -> Optional[str]:
+ # Try to find in PATH
+ result = subprocess.run(
+ ["which", executable_name], capture_output=True, text=True
+ )
+ if result.returncode == 0 and result.stdout.strip():
+ path = result.stdout.strip()
+ logger.info(f"Found {executable_name} at {path}")
+ return path
+
+ logger.warning(f"Failed to find {executable_name}")
+ return None
+
+ def _try_find_soffice(self) -> Optional[str]:
"""Find LibreOffice/OpenOffice executable path
Returns:
@@ -225,32 +200,13 @@ class DocParser(BaseParser):
"C:\\Program Files\\LibreOffice\\program\\soffice.exe",
"C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe",
]
+ return self._try_find_executable_path(
+ executable_name="soffice",
+ possible_path=possible_paths,
+ environment_variable=["LIBREOFFICE_PATH"],
+ )
- # Check if path is set in environment variable
- if os.environ.get("LIBREOFFICE_PATH"):
- possible_paths.insert(0, os.environ.get("LIBREOFFICE_PATH"))
-
- for path in possible_paths:
- if os.path.exists(path):
- logger.info(f"Found LibreOffice/OpenOffice at: {path}")
- return path
-
- # Try to find in PATH
- try:
- result = subprocess.run(
- ["which", "soffice"], capture_output=True, text=True
- )
- if result.returncode == 0 and result.stdout.strip():
- path = result.stdout.strip()
- logger.info(f"Found LibreOffice/OpenOffice in PATH: {path}")
- return path
- except Exception:
- pass
-
- logger.warning("LibreOffice/OpenOffice not found")
- return None
-
- def _find_antiword_path(self) -> Optional[str]:
+ def _try_find_antiword(self) -> Optional[str]:
"""Find antiword executable path
Returns:
@@ -265,51 +221,27 @@ class DocParser(BaseParser):
"C:\\Program Files\\Antiword\\antiword.exe",
"C:\\Program Files (x86)\\Antiword\\antiword.exe",
]
-
- # Check if path is set in environment variable
- if os.environ.get("ANTIWORD_PATH"):
- possible_paths.insert(0, os.environ.get("ANTIWORD_PATH"))
-
- for path in possible_paths:
- if os.path.exists(path):
- logger.info(f"Found antiword at: {path}")
- return path
-
- # Try to find in PATH
- try:
- result = subprocess.run(
- ["which", "antiword"], capture_output=True, text=True
- )
- if result.returncode == 0 and result.stdout.strip():
- path = result.stdout.strip()
- logger.info(f"Found antiword in PATH: {path}")
- return path
- except Exception:
- pass
-
- logger.warning("antiword not found")
- return None
+ return self._try_find_executable_path(
+ executable_name="antiword",
+ possible_path=possible_paths,
+ environment_variable=["ANTIWORD_PATH"],
+ )
if __name__ == "__main__":
- logging.basicConfig(
- level=logging.INFO,
- format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
- datefmt="%Y-%m-%d %H:%M:%S",
- )
- logger.info("Running DocParser in standalone mode")
+ logging.basicConfig(level=logging.DEBUG)
file_name = "/path/to/your/test.doc"
logger.info(f"Processing file: {file_name}")
-
doc_parser = DocParser(
- file_name, enable_multimodal=True, chunk_size=512, chunk_overlap=60
+ file_name=file_name,
+ enable_multimodal=True,
+ chunk_size=512,
+ chunk_overlap=60,
)
- logger.info("Parser initialized, starting processing")
-
with open(file_name, "rb") as f:
content = f.read()
- text = doc_parser.parse_into_text(content)
- logger.info(f"Processing complete, extracted text length: {len(text)}")
- logger.info(f"Sample text: {text[:200]}...")
+ document = doc_parser.parse_into_text(content)
+ logger.info(f"Processing complete, extracted text length: {len(document.content)}")
+ logger.info(f"Sample text: {document.content[:200]}...")
diff --git a/docreader/parser/docx2_parser.py b/docreader/parser/docx2_parser.py
new file mode 100644
index 0000000..872b3ef
--- /dev/null
+++ b/docreader/parser/docx2_parser.py
@@ -0,0 +1,28 @@
+import logging
+
+from docreader.parser.chain_parser import FirstParser
+from docreader.parser.docx_parser import DocxParser
+from docreader.parser.markitdown_parser import MarkitdownParser
+
+logger = logging.getLogger(__name__)
+
+
+class Docx2Parser(FirstParser):
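+    """DOCX parsing chain: try the MarkItDown-based parser first and fall
+    back to the python-docx based DocxParser if it yields no valid result."""
+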
+ _parser_cls = (MarkitdownParser, DocxParser)
+
+
+if __name__ == "__main__":
+ logging.basicConfig(level=logging.DEBUG)
+
+ your_file = "/path/to/your/file.docx"
+ parser = Docx2Parser(separators=[".", "?", "!", "。", "?", "!"])
+ with open(your_file, "rb") as f:
+ content = f.read()
+
+ document = parser.parse(content)
+ for cc in document.chunks:
+ logger.info(f"chunk: {cc}")
+
+ # document = parser.parse_into_text(content)
+ # logger.info(f"docx content: {document.content}")
+ # logger.info(f"find images {document.images.keys()}")
diff --git a/docreader/parser/docx_parser.py b/docreader/parser/docx_parser.py
index ee0cb90..979a4fc 100644
--- a/docreader/parser/docx_parser.py
+++ b/docreader/parser/docx_parser.py
@@ -1,37 +1,36 @@
import logging
-import tempfile
import os
-import sys
-import time
-from io import BytesIO
-from typing import Optional, Dict, Any, Tuple, List, Union
-from dataclasses import dataclass, field
-from PIL import Image
-from docx import Document
-from docx.image.exceptions import (
- UnrecognizedImageError,
- UnexpectedEndOfFileError,
- InvalidImageStreamError,
-)
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
+import re
import tempfile
import threading
+import time
import traceback
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from dataclasses import dataclass, field
+from io import BytesIO
from multiprocessing import Manager
-import re
+from typing import Any, Dict, List, Optional, Tuple
-from .base_parser import BaseParser
+from docx import Document
+from docx.image.exceptions import (
+ InvalidImageStreamError,
+ UnexpectedEndOfFileError,
+ UnrecognizedImageError,
+)
+from PIL import Image
+
+from docreader.models.document import Document as DocumentModel
+from docreader.parser.base_parser import BaseParser
+from docreader.utils import endecode
logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-# Add thread local storage to track the processing status of each thread
-thread_local = threading.local()
class ImageData:
"""Represents a processed image of document content"""
+
local_path: str = ""
- object: Image.Image = None
+ object: Optional[Image.Image] = None
url: str = ""
@@ -40,7 +39,9 @@ class LineData:
"""Represents a processed line of document content with associated images"""
text: str = "" # Extracted text content
- images: List[ImageData] = field(default_factory=list) # List of images or image paths
+ images: List[ImageData] = field(
+ default_factory=list
+ ) # List of images or image paths
extra_info: str = "" # Placeholder for additional info (currently unused)
page_num: int = 0 # Page number
content_sequence: List[Tuple[str, Any]] = field(
@@ -53,18 +54,8 @@ class DocxParser(BaseParser):
def __init__(
self,
- file_name: str = "",
- file_type: str = None,
- enable_multimodal: bool = True,
- chunk_size: int = 1000,
- chunk_overlap: int = 200,
- separators: list = ["\n\n", "\n", "。"],
- ocr_backend: str = "paddle",
- ocr_config: dict = None,
- max_image_size: int = 1920,
- max_concurrent_tasks: int = 5,
- max_pages: int = 100, # Maximum number of pages to process, default to 50 pages
- chunking_config=None,
+ max_pages: int = 100, # Maximum number of pages to process
+ **kwargs,
):
"""Initialize DOCX document parser
@@ -79,37 +70,16 @@ class DocxParser(BaseParser):
ocr_config: OCR engine configuration
max_image_size: Maximum image size limit
max_concurrent_tasks: Maximum number of concurrent tasks
- max_pages: Maximum number of pages to process, if more than this, only process the first max_pages pages
+ max_pages: Maximum number of pages to process
"""
- super().__init__(
- file_name=file_name,
- file_type=file_type,
- enable_multimodal=enable_multimodal,
- chunk_size=chunk_size,
- chunk_overlap=chunk_overlap,
- separators=separators,
- ocr_backend=ocr_backend,
- ocr_config=ocr_config,
- max_image_size=max_image_size,
- max_concurrent_tasks=max_concurrent_tasks,
- chunking_config=chunking_config,
- )
+ super().__init__(**kwargs)
self.max_pages = max_pages
logger.info(f"DocxParser initialized with max_pages={max_pages}")
- def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
- """Parse DOCX document, extract text content and image Markdown links
-
- Args:
- content: DOCX document content
-
- Returns:
- Tuple of (parsed_text, image_map) where image_map maps image URLs to Image objects
- All LineData objects are used internally but not returned directly through this interface
- """
+ def parse_into_text(self, content: bytes) -> DocumentModel:
+ """Parse DOCX document, extract text content and image Markdown links"""
logger.info(f"Parsing DOCX document, content size: {len(content)} bytes")
logger.info(f"Max pages limit set to: {self.max_pages}")
- logger.info("Converting DOCX content to sections and tables")
start_time = time.time()
# Use concurrent processing to handle the document
@@ -123,7 +93,7 @@ class DocxParser(BaseParser):
docx_processor = Docx(
max_image_size=self.max_image_size,
enable_multimodal=self.enable_multimodal,
- upload_file=self.upload_file,
+ upload_file=self.storage.upload_file,
)
all_lines, tables = docx_processor(
binary=content,
@@ -140,7 +110,7 @@ class DocxParser(BaseParser):
section_start_time = time.time()
text_parts = []
- image_parts = {}
+ image_parts: Dict[str, str] = {}
for sec_idx, line in enumerate(all_lines):
try:
@@ -148,16 +118,19 @@ class DocxParser(BaseParser):
text_parts.append(line.text)
if sec_idx < 3 or sec_idx % 50 == 0:
logger.info(
- f"Added section {sec_idx+1} text: {line.text[:50]}..."
+ f"Added section {sec_idx + 1} text: {line.text[:50]}..."
if len(line.text) > 50
- else f"Added section {sec_idx+1} text: {line.text}"
+ else f"Added section {sec_idx + 1} text: {line.text}"
)
if line.images:
for image_data in line.images:
- if image_data.url:
- image_parts[image_data.url] = image_data.object
+ if image_data.url and image_data.object:
+ image_parts[image_data.url] = endecode.decode_image(
+ image_data.object
+ )
+ image_data.object.close()
except Exception as e:
- logger.error(f"Error processing section {sec_idx+1}: {str(e)}")
+ logger.error(f"Error processing section {sec_idx + 1}: {str(e)}")
logger.error(f"Detailed stack trace: {traceback.format_exc()}")
continue
@@ -176,17 +149,17 @@ class DocxParser(BaseParser):
total_processing_time = time.time() - start_time
logger.info(
- f"Parsing complete in {total_processing_time:.2f}s, generated {len(text)} characters of text "
+ f"Parsing complete in {total_processing_time:.2f}s, "
+ f"generated {len(text)} characters of text"
)
- return text, image_parts
+ return DocumentModel(content=text, images=image_parts)
except Exception as e:
logger.error(f"Error parsing DOCX document: {str(e)}")
logger.error(f"Detailed stack trace: {traceback.format_exc()}")
- fallback_text = self._parse_using_simple_method(content)
- return fallback_text, {}
+ return self._parse_using_simple_method(content)
- def _parse_using_simple_method(self, content: bytes) -> str:
+ def _parse_using_simple_method(self, content: bytes) -> DocumentModel:
"""Parse document using a simplified method, as a fallback
Args:
@@ -201,7 +174,8 @@ class DocxParser(BaseParser):
doc = Document(BytesIO(content))
logger.info(
f"Successfully loaded document in simplified method, "
- f"contains {len(doc.paragraphs)} paragraphs and {len(doc.tables)} tables"
+ f"contains {len(doc.paragraphs)} paragraphs "
+ f"and {len(doc.tables)} tables"
)
text_parts = []
@@ -211,7 +185,7 @@ class DocxParser(BaseParser):
para_with_text = 0
for i, para in enumerate(doc.paragraphs):
if i % 100 == 0:
- logger.info(f"Processing paragraph {i+1}/{para_count}")
+ logger.info(f"Processing paragraph {i + 1}/{para_count}")
if para.text.strip():
text_parts.append(para.text.strip())
para_with_text += 1
@@ -225,7 +199,7 @@ class DocxParser(BaseParser):
rows_processed = 0
for i, table in enumerate(doc.tables):
if i % 10 == 0:
- logger.info(f"Processing table {i+1}/{table_count}")
+ logger.info(f"Processing table {i + 1}/{table_count}")
table_has_content = False
for row in table.rows:
@@ -256,25 +230,24 @@ class DocxParser(BaseParser):
# If the result is still empty, return an error message
if not result_text:
logger.warning("No text extracted using simplified method")
- return "", {}
+ return DocumentModel()
- return result_text, {}
+ return DocumentModel(content=result_text)
except Exception as backup_error:
processing_time = time.time() - start_time
logger.error(
- f"Simplified parsing failed after {processing_time:.2f}s: {str(backup_error)}"
+ f"Simplified parsing failed {processing_time:.2f}s: {backup_error}"
)
logger.error(f"Detailed traceback: {traceback.format_exc()}")
- return "", {}
+ return DocumentModel()
class Docx:
def __init__(self, max_image_size=1920, enable_multimodal=False, upload_file=None):
logger.info("Initializing DOCX processor")
self.max_image_size = max_image_size # Maximum image size limit
- self.picture_cache = (
- {}
- ) # Image cache to avoid processing the same image repeatedly
+ # Image cache to avoid processing the same image repeatedly
+ self.picture_cache = {}
self.enable_multimodal = enable_multimodal
self.upload_file = upload_file
@@ -454,7 +427,6 @@ class Docx:
return page_to_paragraphs
-
def __call__(
self,
binary: Optional[bytes] = None,
@@ -611,7 +583,6 @@ class Docx:
return pages_to_process
-
def _process_document(
self,
binary,
@@ -806,7 +777,9 @@ class Docx:
# Collect temporary image paths for later cleanup
for line in page_lines:
for image_data in line.images:
- if image_data.local_path and image_data.local_path.startswith("/tmp/docx_img_"):
+ if image_data.local_path and image_data.local_path.startswith(
+ "/tmp/docx_img_"
+ ):
temp_img_paths.add(image_data.local_path)
results.extend(page_lines)
@@ -876,7 +849,11 @@ class Docx:
# Process all image data objects
for image_data in image_paths:
- if image_data.local_path and os.path.exists(image_data.local_path) and image_data.local_path not in image_url_map:
+ if (
+ image_data.local_path
+ and os.path.exists(image_data.local_path)
+ and image_data.local_path not in image_url_map
+ ):
try:
# Upload the image if it doesn't have a URL yet
if not image_data.url:
@@ -886,12 +863,16 @@ class Docx:
image_data.url = image_url
# Add image URL as Markdown format
markdown_image = f""
- image_url_map[image_data.local_path] = markdown_image
+ image_url_map[image_data.local_path] = (
+ markdown_image
+ )
logger.info(
f"Added image URL for {image_data.local_path}: {image_url}"
)
else:
- logger.warning(f"Failed to upload image: {image_data.local_path}")
+ logger.warning(
+ f"Failed to upload image: {image_data.local_path}"
+ )
else:
# Already has a URL, use it
markdown_image = f""
@@ -925,12 +906,19 @@ class Docx:
# For ImageData objects, use the URL
if isinstance(content, str) and content in image_url_map:
combined_parts.append(image_url_map[content])
- elif hasattr(content, 'local_path') and content.local_path in image_url_map:
+ elif (
+ hasattr(content, "local_path")
+ and content.local_path in image_url_map
+ ):
combined_parts.append(image_url_map[content.local_path])
# Create the final text with proper ordering
final_text = "\n\n".join(part for part in combined_parts if part)
- processed_lines.append(LineData(text=final_text, page_num=page_num, images=line_data.images))
+ processed_lines.append(
+ LineData(
+ text=final_text, page_num=page_num, images=line_data.images
+ )
+ )
else:
processed_lines = lines
@@ -1003,11 +991,11 @@ class Docx:
logger.info(f"Processing {table_count} tables")
for tb_idx, tb in enumerate(self.doc.tables):
if tb_idx % 10 == 0: # Log only every 10 tables to reduce log volume
- logger.info(f"Processing table {tb_idx+1}/{table_count}")
+ logger.info(f"Processing table {tb_idx + 1}/{table_count}")
# Optimize: Check if table is empty
if len(tb.rows) == 0 or all(len(r.cells) == 0 for r in tb.rows):
- logger.info(f"Skipping empty table {tb_idx+1}")
+ logger.info(f"Skipping empty table {tb_idx + 1}")
continue
table_html = self._convert_table_to_html(tb)
@@ -1111,8 +1099,8 @@ def _save_image_to_temp(logger, image, page_num, img_idx):
if not image:
return None
- import tempfile
import os
+ import tempfile
try:
# Create a temporary file
@@ -1187,8 +1175,15 @@ def process_page_multiprocess(
return []
# Extract page content
- combined_text, image_objects, content_sequence = _extract_page_content_in_process(
- process_logger, doc, page_num, paragraphs, enable_multimodal, max_image_size
+ combined_text, image_objects, content_sequence = (
+ _extract_page_content_in_process(
+ process_logger,
+ doc,
+ page_num,
+ paragraphs,
+ enable_multimodal,
+ max_image_size,
+ )
)
# Process content sequence to maintain order between processes
@@ -1199,7 +1194,9 @@ def process_page_multiprocess(
if enable_multimodal:
# First pass: save all images to temporary files
for i, image_object in enumerate(image_objects):
- img_path = _save_image_to_temp(process_logger, image_object, page_num, i)
+ img_path = _save_image_to_temp(
+ process_logger, image_object, page_num, i
+ )
if img_path:
# Create ImageData object
image_data = ImageData()
diff --git a/docreader/parser/image_parser.py b/docreader/parser/image_parser.py
index 4ebbcee..5c054bc 100644
--- a/docreader/parser/image_parser.py
+++ b/docreader/parser/image_parser.py
@@ -1,15 +1,13 @@
+import base64
import logging
import os
-import asyncio
-from PIL import Image
-import io
-from typing import Dict, Any, Tuple, Union
-from .base_parser import BaseParser, ParseResult
-import numpy as np
+
+from docreader.models.document import Document
+from docreader.parser.base_parser import BaseParser
# Set up logger for this module
logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
+
class ImageParser(BaseParser):
"""
@@ -23,46 +21,24 @@ class ImageParser(BaseParser):
4. Returning a combined result with both text and image reference
"""
- def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
+ def parse_into_text(self, content: bytes) -> Document:
"""
- Parse image content, upload the image and return Markdown reference along with image map.
-
- Args:
- content: Raw image data (bytes)
-
- Returns:
- Tuple of (markdown_text, image_map) where image_map maps image URLs to PIL Image objects
+ Parse image content into markdown text
+ :param content: bytes content of the image
+ :return: Document object
"""
logger.info(f"Parsing image content, size: {len(content)} bytes")
- image_map = {}
-
- try:
- # Upload image to storage service
- logger.info("Uploading image to storage")
- _, ext = os.path.splitext(self.file_name)
- image_url = self.upload_bytes(content, file_ext=ext)
- if not image_url:
- logger.error("Failed to upload image to storage")
- return "", {}
- logger.info(
- f"Successfully uploaded image, URL: {image_url[:50]}..."
- if len(image_url) > 50
- else f"Successfully uploaded image, URL: {image_url}"
- )
- # Create image object and add to map
- try:
- from PIL import Image
- import io
- image = Image.open(io.BytesIO(content))
- image_map[image_url] = image
- logger.info(f"Added image to image_map for URL: {image_url}")
- except Exception as img_err:
- logger.error(f"Error creating image object: {str(img_err)}")
+ # Get file extension
+ ext = os.path.splitext(self.file_name)[1].lower()
- markdown_text = f""
- return markdown_text, image_map
+ # Upload image to storage
+ image_url = self.storage.upload_bytes(content, file_ext=ext)
+ logger.info(f"Successfully uploaded image, URL: {image_url[:50]}...")
- except Exception as e:
- logger.error(f"Error parsing image: {str(e)}")
- return "", {}
+ # Generate markdown text
+        text = f"![]({image_url})"
+ images = {image_url: base64.b64encode(content).decode()}
+
+        # Return document with the markdown reference and base64 image payload
+ return Document(content=text, images=images)
diff --git a/docreader/parser/image_utils.py b/docreader/parser/image_utils.py
deleted file mode 100644
index 55cb474..0000000
--- a/docreader/parser/image_utils.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import base64
-import io
-import logging
-from typing import Union
-from PIL import Image
-import numpy as np
-
-logger = logging.getLogger(__name__)
-
-def image_to_base64(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
- """Convert image to base64 encoded string
-
- Args:
- image: Image file path, bytes, PIL Image object, or numpy array
-
- Returns:
- Base64 encoded image string, or empty string if conversion fails
- """
- try:
- if isinstance(image, str):
- # It's a file path
- with open(image, "rb") as image_file:
- return base64.b64encode(image_file.read()).decode("utf-8")
- elif isinstance(image, bytes):
- # It's bytes data
- return base64.b64encode(image).decode("utf-8")
- elif isinstance(image, Image.Image):
- # It's a PIL Image
- buffer = io.BytesIO()
- image.save(buffer, format="PNG")
- return base64.b64encode(buffer.getvalue()).decode("utf-8")
- elif isinstance(image, np.ndarray):
- # It's a numpy array
- pil_image = Image.fromarray(image)
- buffer = io.BytesIO()
- pil_image.save(buffer, format="PNG")
- return base64.b64encode(buffer.getvalue()).decode("utf-8")
- else:
- logger.error(f"Unsupported image type: {type(image)}")
- return ""
- except Exception as e:
- logger.error(f"Error converting image to base64: {str(e)}")
- return ""
diff --git a/docreader/parser/markdown_image_util.py b/docreader/parser/markdown_image_util.py
new file mode 100644
index 0000000..b748db1
--- /dev/null
+++ b/docreader/parser/markdown_image_util.py
@@ -0,0 +1,111 @@
+import logging
+import re
+import uuid
+from typing import Dict, List, Match, Optional, Tuple
+
+from docreader.utils import endecode
+
+# Get logger object
+logger = logging.getLogger(__name__)
+
+
+class MarkdownImageUtil:
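+    """Regex helpers for extracting and rewriting image references
+    (including inline base64 data URIs) in Markdown text."""
+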
+ def __init__(self):
+ self.b64_pattern = re.compile(
+ r"!\[([^\]]*)\]\(data:image/(\w+);base64,([^\)]+)\)"
+ )
+ self.image_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
+ self.replace_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
+
+ def extract_image(
+ self,
+ content: str,
+ path_prefix: Optional[str] = None,
+ replace: bool = True,
+ ) -> Tuple[str, List[str]]:
+        """Extract image references (paths or URLs) from Markdown content"""
+
+        # collected image paths
+        images: List[str] = []
+
+ def repl(match: Match[str]) -> str:
+ title = match.group(1)
+ image_path = match.group(2)
+ if path_prefix:
+ image_path = f"{path_prefix}/{image_path}"
+
+ images.append(image_path)
+
+ if not replace:
+ return match.group(0)
+
+            # Rewrite the reference using the (possibly prefixed) image path
+            return f"![{title}]({image_path})"
+
+ text = self.image_pattern.sub(repl, content)
+ logger.debug(f"Extracted {len(images)} images from markdown")
+ return text, images
+
+ def extract_base64(
+ self,
+ content: str,
+ path_prefix: Optional[str] = None,
+ replace: bool = True,
+ ) -> Tuple[str, Dict[str, bytes]]:
+ """Extract base64 encoded images from Markdown content"""
+
+ # image_path => base64 bytes
+ images: Dict[str, bytes] = {}
+
+ def repl(match: Match[str]) -> str:
+ title = match.group(1)
+ img_ext = match.group(2)
+ img_b64 = match.group(3)
+
+ image_byte = endecode.encode_image(img_b64, errors="ignore")
+ if not image_byte:
+                logger.error(f"Failed to decode base64 image, skipping it: {img_b64}")
+ return title
+
+ image_path = f"{uuid.uuid4()}.{img_ext}"
+ if path_prefix:
+ image_path = f"{path_prefix}/{image_path}"
+ images[image_path] = image_byte
+
+ if not replace:
+ return match.group(0)
+
+            # Replace the inline base64 data with the extracted image path
+            return f"![{title}]({image_path})"
+
+ text = self.b64_pattern.sub(repl, content)
+ logger.debug(f"Extracted {len(images)} base64 images from markdown")
+ return text, images
+
+ def replace_path(self, content: str, images: Dict[str, str]) -> str:
+ content_replace: set = set()
+
+ def repl(match: Match[str]) -> str:
+ title = match.group(1)
+ image_path = match.group(2)
+ if image_path not in images:
+ return match.group(0)
+
+ content_replace.add(image_path)
+ image_path = images[image_path]
+            return f"![{title}]({image_path})"
+
+ text = self.replace_pattern.sub(repl, content)
+ logger.debug(f"Replaced {len(content_replace)} images in markdown")
+ return text
+
+
+if __name__ == "__main__":
+ your_content = "testtest"
+ image_handle = MarkdownImageUtil()
+ text, images = image_handle.extract_base64(your_content)
+ print(text)
+
+ for image_url, image_byte in images.items():
+ with open(image_url, "wb") as f:
+ f.write(image_byte)
diff --git a/docreader/parser/markdown_parser.py b/docreader/parser/markdown_parser.py
index 330d5a0..1758dcd 100644
--- a/docreader/parser/markdown_parser.py
+++ b/docreader/parser/markdown_parser.py
@@ -1,33 +1,53 @@
-import asyncio
-import re
+import base64
import logging
-import numpy as np
-import os # Import os module to get environment variables
-from typing import Dict, List, Optional, Tuple, Union, Any
-from .base_parser import BaseParser
+import os
+from typing import Dict
+
+from docreader.models.document import Document
+from docreader.parser.base_parser import BaseParser
+from docreader.parser.chain_parser import PipelineParser
+from docreader.parser.markdown_image_util import MarkdownImageUtil
+from docreader.utils import endecode
# Get logger object
logger = logging.getLogger(__name__)
-class MarkdownParser(BaseParser):
- """Markdown document parser"""
-
- def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
- """Parse Markdown document, only extract text content, do not process images
-
- Args:
- content: Markdown document content
-
- Returns:
- Parsed text result
- """
- logger.info(f"Parsing Markdown document, content size: {len(content)} bytes")
+class MarkdownImageBase64(BaseParser):
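+    """Extract base64-embedded images from Markdown, upload them to storage,
+    and rewrite the image references to point at the uploaded URLs."""
+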
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.image_helper = MarkdownImageUtil()
+ def parse_into_text(self, content: bytes) -> Document:
# Convert byte content to string using universal decoding method
- text = self.decode_bytes(content)
- logger.info(f"Decoded Markdown content, text length: {len(text)} characters")
+ text = endecode.decode_bytes(content)
+ text, img_b64 = self.image_helper.extract_base64(text, path_prefix="images")
- logger.info(f"Markdown parsing complete, extracted {len(text)} characters of text")
- return text
+ images: Dict[str, str] = {}
+ image_replace: Dict[str, str] = {}
+ logger.debug(f"Uploading {len(img_b64)} images from markdown")
+ for ipath, b64_bytes in img_b64.items():
+ ext = os.path.splitext(ipath)[1].lower()
+ image_url = self.storage.upload_bytes(b64_bytes, ext)
+
+ image_replace[ipath] = image_url
+ images[image_url] = base64.b64encode(b64_bytes).decode()
+
+ text = self.image_helper.replace_path(text, image_replace)
+ return Document(content=text, images=images)
+
+
+class MarkdownParser(PipelineParser):
+ _parser_cls = (MarkdownImageBase64,)
+
+
+if __name__ == "__main__":
+ logging.basicConfig(level=logging.DEBUG)
+
+ your_content = "testtest"
+ parser = MarkdownParser()
+
+ document = parser.parse_into_text(your_content.encode())
+ logger.info(document.content)
+ logger.info(f"Images: {len(document.images)}, name: {document.images.keys()}")
diff --git a/docreader/parser/markitdown_parser.py b/docreader/parser/markitdown_parser.py
new file mode 100644
index 0000000..c067cd7
--- /dev/null
+++ b/docreader/parser/markitdown_parser.py
@@ -0,0 +1,31 @@
+import io
+import logging
+
+from markitdown import MarkItDown
+
+from docreader.models.document import Document
+from docreader.parser.base_parser import BaseParser
+from docreader.parser.chain_parser import PipelineParser
+from docreader.parser.markdown_parser import MarkdownParser
+
+logger = logging.getLogger(__name__)
+
+
+class StdMarkitdownParser(BaseParser):
+ """
+    Generic document parser built on the `markitdown` library.
+
+    It converts the input bytes (e.g. DOCX) to Markdown text, keeping
+    embedded images as base64 data URIs for downstream processing.
+ """
+
+ def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.markitdown = MarkItDown()
+
+ def parse_into_text(self, content: bytes) -> Document:
+ result = self.markitdown.convert(io.BytesIO(content), keep_data_uris=True)
+ return Document(content=result.text_content)
+
+
+class MarkitdownParser(PipelineParser):
+ _parser_cls = (StdMarkitdownParser, MarkdownParser)
diff --git a/docreader/parser/mineru_parser.py b/docreader/parser/mineru_parser.py
new file mode 100644
index 0000000..1e182de
--- /dev/null
+++ b/docreader/parser/mineru_parser.py
@@ -0,0 +1,124 @@
+import logging
+import os
+import re
+from typing import Dict
+
+import markdownify
+import requests
+
+from docreader.models.document import Document
+from docreader.parser.base_parser import BaseParser
+from docreader.parser.markdown_parser import MarkdownImageUtil
+from docreader.utils import endecode
+
+logger = logging.getLogger(__name__)
+
+
+class MinerUParser(BaseParser):
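+    """Parse documents by sending them to an external MinerU service and
+    converting the returned Markdown and images into a Document."""
+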
+ def __init__(
+ self,
+ enable_markdownify: bool = True,
+ mineru_endpoint: str = "",
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.minerU = os.getenv("MINERU_ENDPOINT", mineru_endpoint)
+ self.enable_markdownify = enable_markdownify
+ self.image_helper = MarkdownImageUtil()
+ self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
+ self.enable = self.ping()
+        assert self.enable, "MinerU API is not reachable"
+
+ def ping(self, timeout: int = 5) -> bool:
+ try:
+ response = requests.get(
+ self.minerU + "/docs", timeout=timeout, allow_redirects=True
+ )
+ response.raise_for_status()
+ return True
+ except Exception:
+ return False
+
+ def parse_into_text(self, content: bytes) -> Document:
+ logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)")
+ md_content: str = ""
+ images_b64: Dict[str, str] = {}
+ try:
+ response = requests.post(
+ url=self.minerU + "/file_parse",
+ data={
+ "return_md": True,
+ "return_images": True,
+ "lang_list": ["ch", "en"],
+ "table_enable": True,
+ "formula_enable": True,
+ "parse_method": "auto",
+ "start_page_id": 0,
+ "end_page_id": 99999,
+ "backend": "pipeline",
+ "response_format_zip": False,
+ "return_middle_json": False,
+ "return_model_output": False,
+ "return_content_list": False,
+ },
+ files={"files": content},
+ timeout=1000,
+ )
+ response.raise_for_status()
+ result = response.json()["results"]["files"]
+ md_content = result["md_content"]
+ images_b64 = result.get("images", {})
+ except Exception as e:
+ logger.error(f"MinerU parsing failed: {e}", exc_info=True)
+ return Document()
+
+ # convert table(HTML) in markdown to markdown table
+ if self.enable_markdownify:
+ logger.debug("Converting HTML to Markdown")
+ md_content = markdownify.markdownify(md_content)
+
+ images = {}
+ image_replace = {}
+        # Images in images_b64 may not be referenced in md_content
+        # (e.g. images that only appear inside embedded HTML tables),
+        # so filter out the unused ones
+ for ipath, b64_str in images_b64.items():
+ if f"images/{ipath}" not in md_content:
+ logger.debug(f"Image {ipath} not used in markdown")
+ continue
+            # Fall back to png when the value has no data URI prefix
+            file_ext = "png"
+            match = self.base64_pattern.match(b64_str)
+            if match:
+                file_ext = match.group(1)
+                b64_str = match.group(2)
+
+ image_bytes = endecode.encode_image(b64_str, errors="ignore")
+ if not image_bytes:
+                logger.error("Failed to decode base64 image, skipping it")
+ continue
+
+ image_url = self.storage.upload_bytes(
+ image_bytes, file_ext=f".{file_ext}"
+ )
+
+ images[image_url] = b64_str
+ image_replace[f"images/{ipath}"] = image_url
+
+ logger.info(f"Replaced {len(image_replace)} images in markdown")
+ text = self.image_helper.replace_path(md_content, image_replace)
+
+ logger.info(
+ f"Successfully parsed PDF, text: {len(text)}, images: {len(images)}"
+ )
+ return Document(content=text, images=images)
+
+
+if __name__ == "__main__":
+ logging.basicConfig(level=logging.DEBUG)
+
+ your_file = "/path/to/your/file.pdf"
+ your_mineru = "http://host.docker.internal:9987"
+ parser = MinerUParser(mineru_endpoint=your_mineru)
+ with open(your_file, "rb") as f:
+ content = f.read()
+ document = parser.parse_into_text(content)
+ logger.error(document.content)
diff --git a/docreader/parser/ocr_engine.py b/docreader/parser/ocr_engine.py
index 13c3e88..0a999b9 100644
--- a/docreader/parser/ocr_engine.py
+++ b/docreader/parser/ocr_engine.py
@@ -1,71 +1,96 @@
-import os
-import logging
-import base64
-from typing import Optional, Union, Dict, Any
-from abc import ABC, abstractmethod
-from PIL import Image
import io
+import logging
+import os
+import platform
+import subprocess
+from abc import ABC, abstractmethod
+from typing import Dict, Union
+
import numpy as np
-from .image_utils import image_to_base64
+from openai import OpenAI
+from PIL import Image
+
+from docreader.utils import endecode
logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
+
class OCRBackend(ABC):
"""Base class for OCR backends"""
-
+
@abstractmethod
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image
-
+
Args:
image: Image file path, bytes, or PIL Image object
-
+
Returns:
Extracted text
"""
pass
+
+class DummyOCRBackend(OCRBackend):
+ """Dummy OCR backend implementation"""
+
+ def predict(self, image: Union[str, bytes, Image.Image]) -> str:
+ logger.warning("Dummy OCR backend is used")
+ return ""
+
+
class PaddleOCRBackend(OCRBackend):
"""PaddleOCR backend implementation"""
-
- def __init__(self, **kwargs):
+
+ def __init__(self):
"""Initialize PaddleOCR backend"""
self.ocr = None
try:
- import os
import paddle
-
+
# Set PaddlePaddle to use CPU and disable GPU
- os.environ['CUDA_VISIBLE_DEVICES'] = ''
- paddle.set_device('cpu')
-
+ os.environ["CUDA_VISIBLE_DEVICES"] = ""
+ paddle.device.set_device("cpu")
+
# 尝试检测CPU是否支持AVX指令集
try:
- import subprocess
- import platform
-
# 检测CPU是否支持AVX
if platform.system() == "Linux":
try:
- result = subprocess.run(['grep', '-o', 'avx', '/proc/cpuinfo'],
- capture_output=True, text=True, timeout=5)
- has_avx = 'avx' in result.stdout.lower()
+ result = subprocess.run(
+ ["grep", "-o", "avx", "/proc/cpuinfo"],
+ capture_output=True,
+ text=True,
+ timeout=5,
+ )
+ has_avx = "avx" in result.stdout.lower()
if not has_avx:
- logger.warning("CPU does not support AVX instructions, using compatibility mode")
+ logger.warning(
+ "CPU does not support AVX instructions, "
+ "using compatibility mode"
+ )
# 进一步限制指令集使用
- os.environ['FLAGS_use_avx2'] = '0'
- os.environ['FLAGS_use_avx'] = '1'
- except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError):
- logger.warning("Could not detect AVX support, using compatibility mode")
- os.environ['FLAGS_use_avx2'] = '0'
- os.environ['FLAGS_use_avx'] = '1'
+ os.environ["FLAGS_use_avx2"] = "0"
+ os.environ["FLAGS_use_avx"] = "1"
+ except (
+ subprocess.TimeoutExpired,
+ FileNotFoundError,
+ subprocess.SubprocessError,
+ ):
+ logger.warning(
+ "Could not detect AVX support, using compatibility mode"
+ )
+ os.environ["FLAGS_use_avx2"] = "0"
+ os.environ["FLAGS_use_avx"] = "1"
except Exception as e:
- logger.warning(f"Error detecting CPU capabilities: {e}, using compatibility mode")
- os.environ['FLAGS_use_avx2'] = '0'
- os.environ['FLAGS_use_avx'] = '1'
-
+ logger.warning(
+ f"Error detecting CPU capabilities: {e}, using compatibility mode"
+ )
+ os.environ["FLAGS_use_avx2"] = "0"
+ os.environ["FLAGS_use_avx"] = "1"
+
from paddleocr import PaddleOCR
+
# OCR configuration with text orientation classification enabled
ocr_config = {
"use_gpu": False,
@@ -86,23 +111,53 @@ class PaddleOCRBackend(OCRBackend):
"use_dilation": True, # improves accuracy
"det_db_score_mode": "slow", # improves accuracy
}
-
+
self.ocr = PaddleOCR(**ocr_config)
logger.info("PaddleOCR engine initialized successfully")
-
+
except ImportError as e:
- logger.error(f"Failed to import paddleocr: {str(e)}. Please install it with 'pip install paddleocr'")
+ logger.error(
+ f"Failed to import paddleocr: {str(e)}. "
+ "Please install it with 'pip install paddleocr'"
+ )
except OSError as e:
if "Illegal instruction" in str(e) or "core dumped" in str(e):
- logger.error(f"PaddlePaddle crashed due to CPU instruction set incompatibility: {str(e)}")
- logger.error("This usually happens when the CPU doesn't support AVX instructions.")
- logger.error("Please try installing a CPU-only version of PaddlePaddle or use a different OCR backend.")
+ logger.error(
+ f"PaddlePaddle crashed due to CPU instruction set incompatibility:"
+ f"{e}"
+ )
+ logger.error(
+ "This happens when the CPU doesn't support AVX instructions. "
+ "Try install CPU-only version of PaddlePaddle, "
+ "or use a different OCR backend."
+ )
else:
- logger.error(f"Failed to initialize PaddleOCR due to OS error: {str(e)}")
+ logger.error(
+ f"Failed to initialize PaddleOCR due to OS error: {str(e)}"
+ )
except Exception as e:
logger.error(f"Failed to initialize PaddleOCR: {str(e)}")
-
- def predict(self, image):
+
+ def predict(self, image: Union[str, bytes, Image.Image]) -> str:
+ """Extract text from an image
+
+ Args:
+ image: Image file path, bytes, or PIL Image object
+
+ Returns:
+ Extracted text
+ """
+ if isinstance(image, str):
+ image = Image.open(image)
+ elif isinstance(image, bytes):
+ image = Image.open(io.BytesIO(image))
+
+ if not isinstance(image, Image.Image):
+ raise TypeError("image must be a string, bytes, or PIL Image object")
+
+ return self._predict(image)
+
+ def _predict(self, image: Image.Image) -> str:
"""Perform OCR recognition on the image
Args:
@@ -111,63 +166,59 @@ class PaddleOCRBackend(OCRBackend):
Returns:
Extracted text string
"""
+ if self.ocr is None:
+ logger.error("PaddleOCR engine not initialized")
+ return ""
try:
# Ensure image is in RGB format
- if hasattr(image, "convert") and image.mode != "RGB":
+ if image.mode != "RGB":
image = image.convert("RGB")
# Convert to numpy array if needed
- if hasattr(image, "convert"):
- image_array = np.array(image)
- else:
- image_array = image
+ image_array = np.array(image)
# Perform OCR
ocr_result = self.ocr.ocr(image_array, cls=False)
-
+
# Extract text
ocr_text = ""
if ocr_result and ocr_result[0]:
- for line in ocr_result[0]:
- if line and len(line) >= 2:
- text = line[1][0] if line[1] else ""
- if text:
- ocr_text += text + " "
-
- text_length = len(ocr_text.strip())
- if text_length > 0:
- logger.info(f"OCR extracted {text_length} characters")
- return ocr_text.strip()
- else:
- logger.warning("OCR returned empty result")
- return ""
-
+ text = [
+ line[1][0] if line and len(line) >= 2 and line[1] else ""
+ for line in ocr_result[0]
+ ]
+ text = [t.strip() for t in text if t]
+ ocr_text = " ".join(text)
+
+ logger.info(f"OCR extracted {len(ocr_text)} characters")
+ return ocr_text
+
except Exception as e:
logger.error(f"OCR recognition error: {str(e)}")
return ""
-
+
+
class NanonetsOCRBackend(OCRBackend):
"""Nanonets OCR backend implementation using OpenAI API format"""
-
- def __init__(self, **kwargs):
+
+ def __init__(self):
"""Initialize Nanonets OCR backend
-
+
Args:
api_key: API key for OpenAI API
base_url: Base URL for OpenAI API
model: Model name
"""
- try:
- from openai import OpenAI
- self.api_key = kwargs.get("api_key", "123")
- self.base_url = kwargs.get("base_url", "http://localhost:8000/v1")
- self.model = kwargs.get("model", "nanonets/Nanonets-OCR-s")
- self.temperature = kwargs.get("temperature", 0.0)
- self.max_tokens = kwargs.get("max_tokens", 15000)
-
- self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
- self.prompt = """
-## 任务说明
+ base_url = os.getenv("OCR_API_BASE_URL", "http://localhost:8000/v1")
+ api_key = os.getenv("OCR_API_KEY", "123")
+ timeout = 30
+ self.client = OpenAI(api_key=api_key, base_url=base_url, timeout=timeout)
+
+ self.model = os.getenv("OCR_MODEL", "nanonets/Nanonets-OCR-s")
+ logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
+ self.temperature = 0.0
+ self.max_tokens = 15000
+ self.prompt = """## 任务说明
请从上传的文档中提取文字内容,严格按自然阅读顺序(从上到下,从左到右)输出,并遵循以下格式规范。
@@ -192,33 +243,26 @@ class NanonetsOCRBackend(OCRBackend):
* 不要猜测或补全不确定的链接地址。
"""
- logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
- except ImportError:
- logger.error("Failed to import openai. Please install it with 'pip install openai'")
- self.client = None
- except Exception as e:
- logger.error(f"Failed to initialize Nanonets OCR: {str(e)}")
- self.client = None
-
+
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image using Nanonets OCR
-
+
Args:
image: Image file path, bytes, or PIL Image object
-
+
Returns:
Extracted text
"""
if self.client is None:
logger.error("Nanonets OCR client not initialized")
return ""
-
+
try:
# Encode image to base64
- img_base64 = image_to_base64(image)
+ img_base64 = endecode.decode_image(image)
if not img_base64:
return ""
-
+
# Call Nanonets OCR API
logger.info(f"Calling Nanonets OCR API with model: {self.model}")
response = self.client.chat.completions.create(
@@ -229,7 +273,9 @@ class NanonetsOCRBackend(OCRBackend):
"content": [
{
"type": "image_url",
- "image_url": {"url": f"data:image/png;base64,{img_base64}"},
+ "image_url": {
+ "url": f"data:image/png;base64,{img_base64}"
+ },
},
{
"type": "text",
@@ -239,40 +285,43 @@ class NanonetsOCRBackend(OCRBackend):
}
],
temperature=self.temperature,
- max_tokens=self.max_tokens
+ max_tokens=self.max_tokens,
)
-
- return response.choices[0].message.content
+ return response.choices[0].message.content or ""
except Exception as e:
logger.error(f"Nanonets OCR prediction error: {str(e)}")
return ""
+
class OCREngine:
"""OCR Engine factory class"""
-
- _instance = None
-
+
+ _instance: Dict[str, OCRBackend] = {}
+
@classmethod
- def get_instance(cls, backend_type="paddle", **kwargs) -> Optional[OCRBackend]:
+ def get_instance(cls, backend_type: str) -> OCRBackend:
"""Get OCR engine instance
-
+
Args:
backend_type: OCR backend type, one of: "paddle", "nanonets"
**kwargs: Additional arguments for the backend
-
+
Returns:
OCR engine instance or None if initialization fails
"""
- if cls._instance is None:
- logger.info(f"Initializing OCR engine with backend: {backend_type}")
-
- if backend_type.lower() == "paddle":
- cls._instance = PaddleOCRBackend(**kwargs)
- elif backend_type.lower() == "nanonets":
- cls._instance = NanonetsOCRBackend(**kwargs)
- else:
- logger.error(f"Unknown OCR backend type: {backend_type}")
- return None
-
- return cls._instance
-
+ backend_type = backend_type.lower()
+ if cls._instance.get(backend_type):
+ return cls._instance[backend_type]
+
+ logger.info(f"Initializing OCR engine with backend: {backend_type}")
+
+ if backend_type == "paddle":
+ cls._instance[backend_type] = PaddleOCRBackend()
+
+ elif backend_type == "nanonets":
+ cls._instance[backend_type] = NanonetsOCRBackend()
+
+ else:
+ cls._instance[backend_type] = DummyOCRBackend()
+
+ return cls._instance[backend_type]
diff --git a/docreader/parser/parser.py b/docreader/parser/parser.py
index 8e1668d..b53448d 100644
--- a/docreader/parser/parser.py
+++ b/docreader/parser/parser.py
@@ -1,30 +1,19 @@
import logging
-from dataclasses import dataclass, field
-from typing import Dict, Any, Optional, Type
+from typing import Dict, Type
-from .base_parser import BaseParser, ParseResult
-from .docx_parser import DocxParser
-from .doc_parser import DocParser
-from .pdf_parser import PDFParser
-from .markdown_parser import MarkdownParser
-from .text_parser import TextParser
-from .image_parser import ImageParser
-from .web_parser import WebParser
-from .config import ChunkingConfig
-import traceback
+from docreader.models.document import Document
+from docreader.models.read_config import ChunkingConfig
+from docreader.parser.base_parser import BaseParser
+from docreader.parser.doc_parser import DocParser
+from docreader.parser.docx2_parser import Docx2Parser
+from docreader.parser.image_parser import ImageParser
+from docreader.parser.markdown_parser import MarkdownParser
+from docreader.parser.pdf_parser import PDFParser
+from docreader.parser.text_parser import TextParser
+from docreader.parser.web_parser import WebParser
logger = logging.getLogger(__name__)
-@dataclass
-class Chunk:
- """
- Represents a single text chunk with associated metadata.
- Basic unit for document processing and embedding.
- """
-
- content: str # Text content of the chunk
- metadata: Dict[str, Any] = None # Associated metadata (source, page number, etc.)
-
class Parser:
"""
@@ -33,10 +22,9 @@ class Parser:
"""
def __init__(self):
- logger.info("Initializing document parser")
# Initialize all parser types
self.parsers: Dict[str, Type[BaseParser]] = {
- "docx": DocxParser,
+ "docx": Docx2Parser,
"doc": DocParser,
"pdf": PDFParser,
"md": MarkdownParser,
@@ -56,8 +44,7 @@ class Parser:
", ".join(self.parsers.keys()),
)
-
- def get_parser(self, file_type: str) -> Optional[Type[BaseParser]]:
+ def get_parser(self, file_type: str) -> Type[BaseParser]:
"""
Get parser class for the specified file type.
@@ -67,12 +54,9 @@ class Parser:
Returns:
Parser class for the file type, or None if unsupported
"""
- file_type = file_type.lower()
- parser = self.parsers.get(file_type)
- if parser:
- logger.info(f"Found parser for file type: {file_type}")
- else:
- logger.warning(f"No parser found for file type: {file_type}")
+ parser = self.parsers.get(file_type.lower())
+ if not parser:
+ raise ValueError(f"Unsupported file type: {file_type}")
return parser
def parse_file(
@@ -81,7 +65,7 @@ class Parser:
file_type: str,
content: bytes,
config: ChunkingConfig,
- ) -> Optional[ParseResult]:
+ ) -> Document:
"""
Parse file content using appropriate parser based on file type.
@@ -96,60 +80,41 @@ class Parser:
"""
logger.info(f"Parsing file: {file_name} with type: {file_type}")
logger.info(
- f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, "
+ f"Chunking config: size={config.chunk_size}, "
+ f"overlap={config.chunk_overlap}, "
f"multimodal={config.enable_multimodal}"
)
-
- parser_instance = None
-
- try:
- # Get appropriate parser for file type
- cls = self.get_parser(file_type)
- if cls is None:
- logger.error(f"Unsupported file type: {file_type}")
- return None
- # Parse file content
- logger.info(f"Creating parser instance for {file_type} file")
- parser_instance = cls(
- file_name=file_name,
- file_type=file_type,
- chunk_size=config.chunk_size,
- chunk_overlap=config.chunk_overlap,
- separators=config.separators,
- enable_multimodal=config.enable_multimodal,
- max_image_size=1920, # Limit image size to 1920px
- max_concurrent_tasks=5, # Limit concurrent tasks to 5
- chunking_config=config, # Pass the entire chunking config
- )
+ # Get appropriate parser for file type
+ cls = self.get_parser(file_type)
- logger.info(f"Starting to parse file content, size: {len(content)} bytes")
- result = parser_instance.parse(content)
+ # Parse file content
+ logger.info(f"Creating parser instance for {file_type} file")
+ parser = cls(
+ file_name=file_name,
+ file_type=file_type,
+ chunk_size=config.chunk_size,
+ chunk_overlap=config.chunk_overlap,
+ separators=config.separators,
+ enable_multimodal=config.enable_multimodal,
+ max_image_size=1920, # Limit image size to 1920px
+ max_concurrent_tasks=5, # Limit concurrent tasks to 5
+ chunking_config=config, # Pass the entire chunking config
+ )
- if result:
- logger.info(
- f"Successfully parsed file {file_name}, generated {len(result.chunks)} chunks"
- )
- if result.chunks and len(result.chunks) > 0:
- logger.info(
- f"First chunk content length: {len(result.chunks[0].content)}"
- )
- else:
- logger.warning(f"Parser returned empty chunks for file: {file_name}")
- else:
- logger.warning(f"Parser returned None result for file: {file_name}")
+ logger.info(f"Starting to parse file content, size: {len(content)} bytes")
+ result = parser.parse(content)
- # Return parse results
- return result
+ if not result.content:
+ logger.warning(f"Parser returned empty content for file: {file_name}")
+ elif not result.chunks:
+ logger.warning(f"Parser returned empty chunks for file: {file_name}")
+ elif result.chunks[0]:
+ logger.info(f"First chunk content length: {len(result.chunks[0].content)}")
+        logger.info(f"Parsed file {file_name} with {len(result.chunks)} chunks")
+ return result
- except Exception as e:
- logger.error(f"Error parsing file {file_name}: {str(e)}")
- logger.info(f"Detailed traceback: {traceback.format_exc()}")
- return None
-
- def parse_url(
- self, url: str, title: str, config: ChunkingConfig
- ) -> Optional[ParseResult]:
+ def parse_url(self, url: str, title: str, config: ChunkingConfig) -> Document:
"""
Parse content from a URL using the WebParser.
@@ -163,44 +128,31 @@ class Parser:
"""
logger.info(f"Parsing URL: {url}, title: {title}")
logger.info(
- f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, "
- f"multimodal={config.enable_multimodal}"
+ f"Chunking config: size={config.chunk_size}, "
+ f"overlap={config.chunk_overlap}, multimodal={config.enable_multimodal}"
)
-
- parser_instance = None
- try:
- # Create web parser instance
- logger.info("Creating WebParser instance")
- parser_instance = WebParser(
- title=title,
- chunk_size=config.chunk_size,
- chunk_overlap=config.chunk_overlap,
- separators=config.separators,
- enable_multimodal=config.enable_multimodal,
- max_image_size=1920, # Limit image size
- max_concurrent_tasks=5, # Limit concurrent tasks
- chunking_config=config,
- )
+ # Create web parser instance
+ logger.info("Creating WebParser instance")
+ parser = WebParser(
+ title=title,
+ chunk_size=config.chunk_size,
+ chunk_overlap=config.chunk_overlap,
+ separators=config.separators,
+ enable_multimodal=config.enable_multimodal,
+ max_image_size=1920, # Limit image size
+ max_concurrent_tasks=5, # Limit concurrent tasks
+ chunking_config=config,
+ )
- logger.info(f"Starting to parse URL content")
- result = parser_instance.parse(url)
-
- if result:
- logger.info(
- f"Successfully parsed URL, generated {len(result.chunks)} chunks"
- )
- logger.info(
- f"First chunk content length: {len(result.chunks[0].content) if result.chunks else 0}"
- )
- else:
- logger.warning(f"Parser returned empty result for URL: {url}")
-
- # Return parse results
- return result
-
- except Exception as e:
- logger.error(f"Error parsing URL {url}: {str(e)}")
- logger.info(f"Detailed traceback: {traceback.format_exc()}")
- return None
+ logger.info("Starting to parse URL content")
+ result = parser.parse(url.encode())
+        if not result.content:
+            logger.warning(f"Parser returned empty content for URL: {url}")
+        elif not result.chunks:
+            logger.warning(f"Parser returned empty chunks for URL: {url}")
+        elif result.chunks[0]:
+            logger.info(f"First chunk content length: {len(result.chunks[0].content)}")
+        logger.info(f"Parsed URL {url} with {len(result.chunks)} chunks")
+ return result
diff --git a/docreader/parser/pdf_parser.py b/docreader/parser/pdf_parser.py
index 94d9f9a..c17184d 100644
--- a/docreader/parser/pdf_parser.py
+++ b/docreader/parser/pdf_parser.py
@@ -1,113 +1,7 @@
-import logging
-import os
-import io
-from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union
+from docreader.parser.chain_parser import FirstParser
+from docreader.parser.markitdown_parser import MarkitdownParser
+from docreader.parser.mineru_parser import MinerUParser
-import pdfplumber
-import tempfile
-from .base_parser import BaseParser
-logger = logging.getLogger(__name__)
-
-class PDFParser(BaseParser):
- """
- PDF Document Parser
-
- This parser handles PDF documents by extracting text content.
- It uses the pypdf library for simple text extraction.
- """
- def _convert_table_to_markdown(self, table_data: list) -> str:
-
- if not table_data or not table_data[0]: return ""
- def clean_cell(cell):
- if cell is None: return ""
-            return str(cell).replace("\n", " ")
- try:
- markdown = ""
- header = [clean_cell(cell) for cell in table_data[0]]
- markdown += "| " + " | ".join(header) + " |\n"
- markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
- for row in table_data[1:]:
- if not row: continue
- body_row = [clean_cell(cell) for cell in row]
- if len(body_row) != len(header):
- logger.warning(f"Skipping malformed table row: {body_row}")
- continue
- markdown += "| " + " | ".join(body_row) + " |\n"
- return markdown
- except Exception as e:
- logger.error(f"Error converting table to markdown: {e}")
- return ""
- def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
-
- logger.info(f"Parsing PDF with pdfplumber, content size: {len(content)} bytes")
-
- all_page_content = []
-
-
- temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
- temp_pdf_path = temp_pdf.name
-
- try:
- temp_pdf.write(content)
- temp_pdf.close()
- logger.info(f"PDF content written to temporary file: {temp_pdf_path}")
-
- with pdfplumber.open(temp_pdf_path) as pdf:
- logger.info(f"PDF has {len(pdf.pages)} pages")
-
- for page_num, page in enumerate(pdf.pages):
- page_content_parts = []
-
- # Try-fallback strategy for table detection
- default_settings = { "vertical_strategy": "lines", "horizontal_strategy": "lines" }
- found_tables = page.find_tables(default_settings)
- if not found_tables:
- logger.info(f"Page {page_num+1}: Default strategy found no tables. Trying fallback strategy.")
- fallback_settings = { "vertical_strategy": "text", "horizontal_strategy": "lines" }
- found_tables = page.find_tables(fallback_settings)
-
- table_bboxes = [table.bbox for table in found_tables]
- # Define a filter function that keeps objects NOT inside any table bbox.
- def not_within_bboxes(obj):
- """Check if an object is outside all table bounding boxes."""
- for bbox in table_bboxes:
- # Check if the object's vertical center is within a bbox
- if bbox[1] <= (obj["top"] + obj["bottom"]) / 2 <= bbox[3]:
- return False # It's inside a table, so we DON'T keep it
- return True # It's outside all tables, so we DO keep it
-
- # that contains only the non-table text.
- non_table_page = page.filter(not_within_bboxes)
-
- # Now, extract text from this filtered page view.
- text = non_table_page.extract_text(x_tolerance=2)
- if text:
- page_content_parts.append(text)
-
- # Process and append the structured Markdown tables
- if found_tables:
- logger.info(f"Found {len(found_tables)} tables on page {page_num + 1}")
- for table in found_tables:
- markdown_table = self._convert_table_to_markdown(table.extract())
- page_content_parts.append(f"\n\n{markdown_table}\n\n")
-
-
- all_page_content.append("".join(page_content_parts))
-
- final_text = "\n\n--- Page Break ---\n\n".join(all_page_content)
- logger.info(f"PDF parsing complete. Extracted {len(final_text)} text chars.")
-
- return final_text
-
- except Exception as e:
- logger.error(f"Failed to parse PDF document: {str(e)}")
- return ""
- finally:
- # This block is GUARANTEED to execute, preventing resource leaks.
- if os.path.exists(temp_pdf_path):
- try:
- os.remove(temp_pdf_path)
- logger.info(f"Temporary file cleaned up: {temp_pdf_path}")
- except OSError as e:
- logger.error(f"Error removing temporary file {temp_pdf_path}: {e}")
+class PDFParser(FirstParser):
+ _parser_cls = (MinerUParser, MarkitdownParser)
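+    # Backends are listed in priority order; assuming FirstParser's chain
+    # semantics, MinerU output is preferred and Markitdown is the fallback.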
diff --git a/docreader/parser/storage.py b/docreader/parser/storage.py
index 33cb9a2..767ab0a 100644
--- a/docreader/parser/storage.py
+++ b/docreader/parser/storage.py
@@ -1,64 +1,68 @@
# -*- coding: utf-8 -*-
-import os
-import uuid
-import logging
import io
+import logging
+import os
import traceback
+import uuid
from abc import ABC, abstractmethod
-from typing import Tuple, Optional
+from typing import Dict
-from qcloud_cos import CosConfig, CosS3Client
from minio import Minio
+from qcloud_cos import CosConfig, CosS3Client
+
+from docreader.utils import endecode
logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
class Storage(ABC):
"""Abstract base class for object storage operations"""
-
+
@abstractmethod
def upload_file(self, file_path: str) -> str:
"""Upload file to object storage
-
+
Args:
file_path: File path
-
- Returns:
- File URL
- """
- pass
-
- @abstractmethod
- def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
- """Upload bytes to object storage
-
- Args:
- content: Byte content to upload
- file_ext: File extension
-
+
Returns:
File URL
"""
pass
-
+ @abstractmethod
+ def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
+ """Upload bytes to object storage
+
+ Args:
+ content: Byte content to upload
+ file_ext: File extension
+
+ Returns:
+ File URL
+ """
+ pass
+
+
class CosStorage(Storage):
"""Tencent Cloud COS storage implementation"""
-
+
def __init__(self, storage_config=None):
"""Initialize COS storage
-
+
Args:
storage_config: Storage configuration
"""
self.storage_config = storage_config
- self.client, self.bucket_name, self.region, self.prefix = self._init_cos_client()
-
+ self.client, self.bucket_name, self.region, self.prefix = (
+ self._init_cos_client()
+ )
+
def _init_cos_client(self):
"""Initialize Tencent Cloud COS client"""
try:
- # Use provided COS config if available, otherwise fall back to environment variables
+ # Use provided COS config if available,
+ # otherwise fall back to environment variables
if self.storage_config and self.storage_config.get("access_key_id") != "":
cos_config = self.storage_config
secret_id = cos_config.get("access_key_id")
@@ -75,15 +79,16 @@ class CosStorage(Storage):
bucket_name = os.getenv("COS_BUCKET_NAME")
appid = os.getenv("COS_APP_ID")
prefix = os.getenv("COS_PATH_PREFIX")
-
+
enable_old_domain = (
os.getenv("COS_ENABLE_OLD_DOMAIN", "true").lower() == "true"
)
if not all([secret_id, secret_key, region, bucket_name, appid]):
logger.error(
- "Incomplete COS configuration, missing required environment variables"
- f"secret_id: {secret_id}, secret_key: {secret_key}, region: {region}, bucket_name: {bucket_name}, appid: {appid}"
+                    "Incomplete COS configuration, missing environment variables: "
+ f"secret_id: {secret_id}, secret_key: {secret_key}, "
+ f"region: {region}, bucket_name: {bucket_name}, appid: {appid}"
)
return None, None, None, None
@@ -105,27 +110,26 @@ class CosStorage(Storage):
except Exception as e:
logger.error(f"Failed to initialize COS client: {str(e)}")
return None, None, None, None
-
+
def _get_download_url(self, bucket_name, region, object_key):
"""Generate COS object URL
-
+
Args:
bucket_name: Bucket name
region: Region
object_key: Object key
-
+
Returns:
File URL
"""
return f"https://{bucket_name}.cos.{region}.myqcloud.com/{object_key}"
-
-
+
def upload_file(self, file_path: str) -> str:
"""Upload file to Tencent Cloud COS
-
+
Args:
file_path: File path
-
+
Returns:
File URL
"""
@@ -135,16 +139,16 @@ class CosStorage(Storage):
return ""
# Generate object key, use UUID to avoid conflicts
- file_name = os.path.basename(file_path)
- object_key = (
- f"{self.prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
- )
+ file_ext = os.path.splitext(file_path)[1]
+ object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
logger.info(f"Generated object key: {object_key}")
# Upload file
logger.info("Attempting to upload file to COS")
- response = self.client.upload_file(
- Bucket=self.bucket_name, LocalFilePath=file_path, Key=object_key
+ self.client.upload_file(
+ Bucket=self.bucket_name,
+ LocalFilePath=file_path,
+ Key=object_key,
)
# Get file URL
@@ -156,14 +160,14 @@ class CosStorage(Storage):
except Exception as e:
logger.error(f"Failed to upload file to COS: {str(e)}")
return ""
-
+
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to Tencent Cloud COS
-
+
Args:
content: Byte content to upload
file_ext: File extension
-
+
Returns:
File URL
"""
@@ -171,10 +175,16 @@ class CosStorage(Storage):
logger.info(f"Uploading bytes content to COS, size: {len(content)} bytes")
if not self.client:
return ""
-
- object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.prefix else f"images/{uuid.uuid4().hex}{file_ext}"
+
+ object_key = (
+ f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
+ if self.prefix
+ else f"images/{uuid.uuid4().hex}{file_ext}"
+ )
logger.info(f"Generated object key: {object_key}")
- self.client.put_object(Bucket=self.bucket_name, Body=content, Key=object_key)
+ self.client.put_object(
+ Bucket=self.bucket_name, Body=content, Key=object_key
+ )
file_url = self._get_download_url(self.bucket_name, self.region, object_key)
logger.info(f"Successfully uploaded bytes to COS: {file_url}")
return file_url
@@ -186,16 +196,18 @@ class CosStorage(Storage):
class MinioStorage(Storage):
"""MinIO storage implementation"""
-
+
def __init__(self, storage_config=None):
"""Initialize MinIO storage
-
+
Args:
storage_config: Storage configuration
"""
self.storage_config = storage_config
- self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = self._init_minio_client()
-
+ self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = (
+ self._init_minio_client()
+ )
+
def _init_minio_client(self):
"""Initialize MinIO client from environment variables or injected config.
@@ -203,58 +215,69 @@ class MinioStorage(Storage):
prefer those values to override envs.
"""
try:
- endpoint = os.getenv("MINIO_ENDPOINT")
+ endpoint = os.getenv("MINIO_ENDPOINT", "")
use_ssl = os.getenv("MINIO_USE_SSL", "false").lower() == "true"
if self.storage_config and self.storage_config.get("bucket_name"):
storage_config = self.storage_config
- bucket_name = storage_config.get("bucket_name")
+ bucket_name = storage_config.get("bucket_name", "")
path_prefix = storage_config.get("path_prefix").strip().strip("/")
access_key = storage_config.get("access_key_id")
secret_key = storage_config.get("secret_access_key")
else:
access_key = os.getenv("MINIO_ACCESS_KEY_ID")
secret_key = os.getenv("MINIO_SECRET_ACCESS_KEY")
- bucket_name = os.getenv("MINIO_BUCKET_NAME")
+ bucket_name = os.getenv("MINIO_BUCKET_NAME", "")
path_prefix = os.getenv("MINIO_PATH_PREFIX", "").strip().strip("/")
if not all([endpoint, access_key, secret_key, bucket_name]):
- logger.error("Incomplete MinIO configuration, missing required environment variables")
+ logger.error(
+ "Incomplete MinIO configuration, missing environment variables"
+ )
return None, None, None, None, None
# Initialize client
- client = Minio(endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl)
+ client = Minio(
+ endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl
+ )
# Ensure bucket exists
found = client.bucket_exists(bucket_name)
if not found:
client.make_bucket(bucket_name)
- policy = '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}' % (bucket_name, bucket_name)
+ policy = (
+ '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}'
+ % (bucket_name, bucket_name)
+ )
client.set_bucket_policy(bucket_name, policy)
return client, bucket_name, use_ssl, endpoint, path_prefix
except Exception as e:
logger.error(f"Failed to initialize MinIO client: {str(e)}")
return None, None, None, None, None
-
- def _get_download_url(self, bucket_name: str, object_key: str, use_ssl: bool, endpoint: str, public_endpoint: str = None):
+
+ def _get_download_url(self, object_key: str):
"""Construct a public URL for MinIO object.
If MINIO_PUBLIC_ENDPOINT is provided, use it; otherwise fallback to endpoint.
"""
- if public_endpoint:
- base = public_endpoint
- else:
- scheme = "https" if use_ssl else "http"
- base = f"{scheme}://{endpoint}"
- # Path-style URL for MinIO
- return f"{base}/{bucket_name}/{object_key}"
-
+ # 1. Use public endpoint if provided
+ endpoint = os.getenv("MINIO_PUBLIC_ENDPOINT")
+ if endpoint:
+ return f"{endpoint}/{self.bucket_name}/{object_key}"
+
+ # 2. Use SSL if enabled
+ if self.use_ssl:
+ return f"https://{self.endpoint}/{self.bucket_name}/{object_key}"
+
+ # 3. Use HTTP default
+ return f"http://{self.endpoint}/{self.bucket_name}/{object_key}"
+
def upload_file(self, file_path: str) -> str:
"""Upload file to MinIO
-
+
Args:
file_path: File path
-
+
Returns:
File URL
"""
@@ -265,29 +288,27 @@ class MinioStorage(Storage):
# Generate object key, use UUID to avoid conflicts
file_name = os.path.basename(file_path)
- object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" if self.path_prefix else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
+ object_key = (
+ f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
+ if self.path_prefix
+ else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
+ )
logger.info(f"Generated MinIO object key: {object_key}")
# Upload file
logger.info("Attempting to upload file to MinIO")
- with open(file_path, 'rb') as file_data:
+ with open(file_path, "rb") as file_data:
file_size = os.path.getsize(file_path)
self.client.put_object(
- bucket_name=self.bucket_name,
+ bucket_name=self.bucket_name or "",
object_name=object_key,
data=file_data,
length=file_size,
- content_type='application/octet-stream'
+ content_type="application/octet-stream",
)
# Get file URL
- file_url = self._get_download_url(
- self.bucket_name,
- object_key,
- self.use_ssl,
- self.endpoint,
- os.getenv("MINIO_PUBLIC_ENDPOINT", None)
- )
+ file_url = self._get_download_url(object_key)
logger.info(f"Successfully uploaded file to MinIO: {file_url}")
return file_url
@@ -295,14 +316,14 @@ class MinioStorage(Storage):
except Exception as e:
logger.error(f"Failed to upload file to MinIO: {str(e)}")
return ""
-
+
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to MinIO
-
+
Args:
content: Byte content to upload
file_ext: File extension
-
+
Returns:
File URL
"""
@@ -310,23 +331,21 @@ class MinioStorage(Storage):
logger.info(f"Uploading bytes content to MinIO, size: {len(content)} bytes")
if not self.client:
return ""
-
- object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.path_prefix else f"images/{uuid.uuid4().hex}{file_ext}"
+
+ object_key = (
+ f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}"
+ if self.path_prefix
+ else f"images/{uuid.uuid4().hex}{file_ext}"
+ )
logger.info(f"Generated MinIO object key: {object_key}")
self.client.put_object(
- self.bucket_name,
- object_key,
- data=io.BytesIO(content),
- length=len(content),
- content_type="application/octet-stream"
- )
- file_url = self._get_download_url(
- self.bucket_name,
- object_key,
- self.use_ssl,
- self.endpoint,
- os.getenv("MINIO_PUBLIC_ENDPOINT", None)
+ self.bucket_name or "",
+ object_key,
+ data=io.BytesIO(content),
+ length=len(content),
+ content_type="application/octet-stream",
)
+ file_url = self._get_download_url(object_key)
logger.info(f"Successfully uploaded bytes to MinIO: {file_url}")
return file_url
except Exception as e:
@@ -335,26 +354,61 @@ class MinioStorage(Storage):
return ""
-def create_storage(storage_config=None) -> Storage:
+class LocalStorage(Storage):
+ """Local file system storage implementation"""
+
+    def __init__(self, storage_config: Dict[str, str] | None = None):
+        self.storage_config = storage_config or {}
+        base_dir = self.storage_config.get(
+            "base_dir", os.getenv("LOCAL_STORAGE_BASE_DIR", "")
+        )
+ self.image_dir = os.path.join(base_dir, "images")
+ os.makedirs(self.image_dir, exist_ok=True)
+
+ def upload_file(self, file_path: str) -> str:
+ logger.info(f"Uploading file to local storage: {file_path}")
+ return file_path
+
+ def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
+ logger.info(f"Uploading file to local storage: {len(content)} bytes")
+ fname = os.path.join(self.image_dir, f"{uuid.uuid4()}{file_ext}")
+ with open(fname, "wb") as f:
+ f.write(content)
+ return fname
+
+
+class Base64Storage(Storage):
+ def upload_file(self, file_path: str) -> str:
+ logger.info(f"Uploading file to base64 storage: {file_path}")
+ return file_path
+
+ def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
+ logger.info(f"Uploading file to base64 storage: {len(content)} bytes")
+ file_ext = file_ext.lstrip(".")
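+        # Result is an inline data URI, e.g. "data:image/png;base64,iVBORw..."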
+ return f"data:image/{file_ext};base64,{endecode.decode_image(content)}"
+
+
+def create_storage(storage_config: Dict[str, str] | None = None) -> Storage:
"""Create a storage instance based on configuration or environment variables
-
+
Args:
storage_config: Storage configuration dictionary
-
+
Returns:
Storage instance
"""
storage_type = os.getenv("STORAGE_TYPE", "cos").lower()
-
if storage_config:
storage_type = str(storage_config.get("provider", storage_type)).lower()
-
logger.info(f"Creating {storage_type} storage instance")
-
+
if storage_type == "minio":
return MinioStorage(storage_config)
elif storage_type == "cos":
- # Default to COS
return CosStorage(storage_config)
- else:
- return None
\ No newline at end of file
+ elif storage_type == "local":
+ return LocalStorage(storage_config or {})
+ elif storage_type == "base64":
+ return Base64Storage()
+
+ raise ValueError(f"Invalid storage type: {storage_type}")
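+
+
+if __name__ == "__main__":
+    # Illustrative usage: the base64 backend needs no credentials or network.
+    os.environ.setdefault("STORAGE_TYPE", "base64")
+    print(create_storage().upload_bytes(b"\x89PNG", file_ext=".png"))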
diff --git a/docreader/parser/text_parser.py b/docreader/parser/text_parser.py
index 0bd0dd7..7675f17 100644
--- a/docreader/parser/text_parser.py
+++ b/docreader/parser/text_parser.py
@@ -1,6 +1,8 @@
import logging
-from .base_parser import BaseParser
-from typing import Dict, Any, Tuple, Union
+
+from docreader.models.document import Document
+from docreader.parser.base_parser import BaseParser
+from docreader.utils import endecode
logger = logging.getLogger(__name__)
@@ -11,7 +13,7 @@ class TextParser(BaseParser):
This parser handles text extraction and chunking from plain text documents.
"""
- def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
+ def parse_into_text(self, content: bytes) -> Document:
"""
Parse text document content by decoding bytes to string.
@@ -25,20 +27,15 @@ class TextParser(BaseParser):
Parsed text content as string
"""
logger.info(f"Parsing text document, content size: {len(content)} bytes")
- text = self.decode_bytes(content)
+ text = endecode.decode_bytes(content)
logger.info(
f"Successfully parsed text document, extracted {len(text)} characters"
)
- return text
+ return Document(content=text)
if __name__ == "__main__":
- logging.basicConfig(
- level=logging.INFO,
- format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
- datefmt="%Y-%m-%d %H:%M:%S",
- )
- logger.info("Running TextParser in standalone mode")
+ logger = logging.getLogger(__name__)
# Sample text for testing
text = """## 标题1
diff --git a/docreader/parser/web_parser.py b/docreader/parser/web_parser.py
index 44c883a..e7291a0 100644
--- a/docreader/parser/web_parser.py
+++ b/docreader/parser/web_parser.py
@@ -1,11 +1,14 @@
-from typing import Any, Optional, Tuple, Dict, Union
-import os
-
-from playwright.async_api import async_playwright
-from bs4 import BeautifulSoup
-from .base_parser import BaseParser, ParseResult
-import logging
import asyncio
+import logging
+import os
+from typing import Any
+
+from bs4 import BeautifulSoup
+from playwright.async_api import async_playwright
+
+from docreader.models.document import Document
+from docreader.parser.base_parser import BaseParser
+from docreader.utils import endecode
logger = logging.getLogger(__name__)
@@ -59,7 +62,7 @@ class WebParser(BaseParser):
# Return empty BeautifulSoup object on error
return BeautifulSoup("", "html.parser")
- def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
+ def parse_into_text(self, content: bytes) -> Document:
"""Parse web page
Args:
@@ -78,10 +81,10 @@ class WebParser(BaseParser):
# Run async method
# Handle content possibly being a string
if isinstance(content, bytes):
- url = self.decode_bytes(content)
+ url = endecode.decode_bytes(content)
logger.info(f"Decoded URL from bytes: {url}")
else:
- url = content
+ url = str(content)
logger.info(f"Using content as URL directly: {url}")
logger.info(f"Scraping web page: {url}")
@@ -118,11 +121,11 @@ class WebParser(BaseParser):
logger.info(
f"Web page parsing complete, total content: {len(result)} characters"
)
- return result
+ return Document(content=result)
except Exception as e:
logger.error(f"Error parsing web page: {str(e)}")
- return f"Error parsing web page: {str(e)}"
+ return Document(content=f"Error parsing web page: {str(e)}")
finally:
# Close event loop
diff --git a/docreader/proto/docreader_pb2.pyi b/docreader/proto/docreader_pb2.pyi
new file mode 100644
index 0000000..9f7cdf6
--- /dev/null
+++ b/docreader/proto/docreader_pb2.pyi
@@ -0,0 +1,127 @@
+from google.protobuf.internal import containers as _containers
+from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from collections.abc import Iterable as _Iterable, Mapping as _Mapping
+from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
+
+DESCRIPTOR: _descriptor.FileDescriptor
+
+class StorageProvider(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
+ __slots__ = ()
+ STORAGE_PROVIDER_UNSPECIFIED: _ClassVar[StorageProvider]
+ COS: _ClassVar[StorageProvider]
+ MINIO: _ClassVar[StorageProvider]
+STORAGE_PROVIDER_UNSPECIFIED: StorageProvider
+COS: StorageProvider
+MINIO: StorageProvider
+
+class StorageConfig(_message.Message):
+ __slots__ = ("provider", "region", "bucket_name", "access_key_id", "secret_access_key", "app_id", "path_prefix")
+ PROVIDER_FIELD_NUMBER: _ClassVar[int]
+ REGION_FIELD_NUMBER: _ClassVar[int]
+ BUCKET_NAME_FIELD_NUMBER: _ClassVar[int]
+ ACCESS_KEY_ID_FIELD_NUMBER: _ClassVar[int]
+ SECRET_ACCESS_KEY_FIELD_NUMBER: _ClassVar[int]
+ APP_ID_FIELD_NUMBER: _ClassVar[int]
+ PATH_PREFIX_FIELD_NUMBER: _ClassVar[int]
+ provider: StorageProvider
+ region: str
+ bucket_name: str
+ access_key_id: str
+ secret_access_key: str
+ app_id: str
+ path_prefix: str
+ def __init__(self, provider: _Optional[_Union[StorageProvider, str]] = ..., region: _Optional[str] = ..., bucket_name: _Optional[str] = ..., access_key_id: _Optional[str] = ..., secret_access_key: _Optional[str] = ..., app_id: _Optional[str] = ..., path_prefix: _Optional[str] = ...) -> None: ...
+
+class VLMConfig(_message.Message):
+ __slots__ = ("model_name", "base_url", "api_key", "interface_type")
+ MODEL_NAME_FIELD_NUMBER: _ClassVar[int]
+ BASE_URL_FIELD_NUMBER: _ClassVar[int]
+ API_KEY_FIELD_NUMBER: _ClassVar[int]
+ INTERFACE_TYPE_FIELD_NUMBER: _ClassVar[int]
+ model_name: str
+ base_url: str
+ api_key: str
+ interface_type: str
+ def __init__(self, model_name: _Optional[str] = ..., base_url: _Optional[str] = ..., api_key: _Optional[str] = ..., interface_type: _Optional[str] = ...) -> None: ...
+
+class ReadConfig(_message.Message):
+ __slots__ = ("chunk_size", "chunk_overlap", "separators", "enable_multimodal", "storage_config", "vlm_config")
+ CHUNK_SIZE_FIELD_NUMBER: _ClassVar[int]
+ CHUNK_OVERLAP_FIELD_NUMBER: _ClassVar[int]
+ SEPARATORS_FIELD_NUMBER: _ClassVar[int]
+ ENABLE_MULTIMODAL_FIELD_NUMBER: _ClassVar[int]
+ STORAGE_CONFIG_FIELD_NUMBER: _ClassVar[int]
+ VLM_CONFIG_FIELD_NUMBER: _ClassVar[int]
+ chunk_size: int
+ chunk_overlap: int
+ separators: _containers.RepeatedScalarFieldContainer[str]
+ enable_multimodal: bool
+ storage_config: StorageConfig
+ vlm_config: VLMConfig
+ def __init__(self, chunk_size: _Optional[int] = ..., chunk_overlap: _Optional[int] = ..., separators: _Optional[_Iterable[str]] = ..., enable_multimodal: bool = ..., storage_config: _Optional[_Union[StorageConfig, _Mapping]] = ..., vlm_config: _Optional[_Union[VLMConfig, _Mapping]] = ...) -> None: ...
+
+class ReadFromFileRequest(_message.Message):
+ __slots__ = ("file_content", "file_name", "file_type", "read_config", "request_id")
+ FILE_CONTENT_FIELD_NUMBER: _ClassVar[int]
+ FILE_NAME_FIELD_NUMBER: _ClassVar[int]
+ FILE_TYPE_FIELD_NUMBER: _ClassVar[int]
+ READ_CONFIG_FIELD_NUMBER: _ClassVar[int]
+ REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
+ file_content: bytes
+ file_name: str
+ file_type: str
+ read_config: ReadConfig
+ request_id: str
+ def __init__(self, file_content: _Optional[bytes] = ..., file_name: _Optional[str] = ..., file_type: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ...
+
+class ReadFromURLRequest(_message.Message):
+ __slots__ = ("url", "title", "read_config", "request_id")
+ URL_FIELD_NUMBER: _ClassVar[int]
+ TITLE_FIELD_NUMBER: _ClassVar[int]
+ READ_CONFIG_FIELD_NUMBER: _ClassVar[int]
+ REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
+ url: str
+ title: str
+ read_config: ReadConfig
+ request_id: str
+ def __init__(self, url: _Optional[str] = ..., title: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ...
+
+class Image(_message.Message):
+ __slots__ = ("url", "caption", "ocr_text", "original_url", "start", "end")
+ URL_FIELD_NUMBER: _ClassVar[int]
+ CAPTION_FIELD_NUMBER: _ClassVar[int]
+ OCR_TEXT_FIELD_NUMBER: _ClassVar[int]
+ ORIGINAL_URL_FIELD_NUMBER: _ClassVar[int]
+ START_FIELD_NUMBER: _ClassVar[int]
+ END_FIELD_NUMBER: _ClassVar[int]
+ url: str
+ caption: str
+ ocr_text: str
+ original_url: str
+ start: int
+ end: int
+ def __init__(self, url: _Optional[str] = ..., caption: _Optional[str] = ..., ocr_text: _Optional[str] = ..., original_url: _Optional[str] = ..., start: _Optional[int] = ..., end: _Optional[int] = ...) -> None: ...
+
+class Chunk(_message.Message):
+ __slots__ = ("content", "seq", "start", "end", "images")
+ CONTENT_FIELD_NUMBER: _ClassVar[int]
+ SEQ_FIELD_NUMBER: _ClassVar[int]
+ START_FIELD_NUMBER: _ClassVar[int]
+ END_FIELD_NUMBER: _ClassVar[int]
+ IMAGES_FIELD_NUMBER: _ClassVar[int]
+ content: str
+ seq: int
+ start: int
+ end: int
+ images: _containers.RepeatedCompositeFieldContainer[Image]
+ def __init__(self, content: _Optional[str] = ..., seq: _Optional[int] = ..., start: _Optional[int] = ..., end: _Optional[int] = ..., images: _Optional[_Iterable[_Union[Image, _Mapping]]] = ...) -> None: ...
+
+class ReadResponse(_message.Message):
+ __slots__ = ("chunks", "error")
+ CHUNKS_FIELD_NUMBER: _ClassVar[int]
+ ERROR_FIELD_NUMBER: _ClassVar[int]
+ chunks: _containers.RepeatedCompositeFieldContainer[Chunk]
+ error: str
+ def __init__(self, chunks: _Optional[_Iterable[_Union[Chunk, _Mapping]]] = ..., error: _Optional[str] = ...) -> None: ...
diff --git a/docreader/proto/docreader_pb2_grpc.py b/docreader/proto/docreader_pb2_grpc.py
index 7cfcba5..b2c9f11 100644
--- a/docreader/proto/docreader_pb2_grpc.py
+++ b/docreader/proto/docreader_pb2_grpc.py
@@ -3,7 +3,7 @@
import grpc
import warnings
-from . import docreader_pb2 as docreader__pb2
+import docreader_pb2 as docreader__pb2
GRPC_GENERATED_VERSION = '1.76.0'
GRPC_VERSION = grpc.__version__
diff --git a/docreader/pyproject.toml b/docreader/pyproject.toml
index 3d24590..c8ccc36 100644
--- a/docreader/pyproject.toml
+++ b/docreader/pyproject.toml
@@ -16,6 +16,7 @@ dependencies = [
"lxml>=6.0.2",
"markdown>=3.10",
"markdownify>=1.2.0",
+ "markitdown[docx,pdf,xls,xlsx]>=0.1.3",
"minio>=7.2.18",
"mistletoe>=1.5.0",
"ollama>=0.6.0",
@@ -26,6 +27,7 @@ dependencies = [
"pillow>=12.0.0",
"playwright>=1.55.0",
"protobuf>=6.33.0",
+ "pydantic>=2.12.3",
"pypdf>=6.1.3",
"pypdf2>=3.0.1",
"python-docx>=1.2.0",
diff --git a/docreader/scripts/generate_proto.sh b/docreader/scripts/generate_proto.sh
index 21516ae..4bef407 100755
--- a/docreader/scripts/generate_proto.sh
+++ b/docreader/scripts/generate_proto.sh
@@ -2,13 +2,14 @@
set -x
# 设置目录
-PROTO_DIR="proto"
-PYTHON_OUT="proto"
-GO_OUT="proto"
+PROTO_DIR="docreader/proto"
+PYTHON_OUT="docreader/proto"
+GO_OUT="docreader/proto"
# 生成Python代码
python3 -m grpc_tools.protoc -I${PROTO_DIR} \
--python_out=${PYTHON_OUT} \
+ --pyi_out=${PYTHON_OUT} \
--grpc_python_out=${PYTHON_OUT} \
${PROTO_DIR}/docreader.proto
@@ -22,10 +23,10 @@ protoc -I${PROTO_DIR} --go_out=${GO_OUT} \
# 修复Python导入问题(MacOS兼容版本)
if [ "$(uname)" == "Darwin" ]; then
# MacOS版本
- sed -i '' 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
+ sed -i '' 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
else
# Linux版本
- sed -i 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
+ sed -i 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
fi
echo "Proto files generated successfully!"
\ No newline at end of file
diff --git a/docreader/splitter/header_hook.py b/docreader/splitter/header_hook.py
new file mode 100644
index 0000000..a178e27
--- /dev/null
+++ b/docreader/splitter/header_hook.py
@@ -0,0 +1,112 @@
+import re
+from typing import Callable, Dict, List, Match, Pattern, Union
+
+from pydantic import BaseModel, Field
+
+
+class HeaderTrackerHook(BaseModel):
+    """Configuration for a header-tracking hook; supports several header styles."""
+
+ start_pattern: Pattern[str] = Field(
+        description="Header start marker (regex or string)"
+ )
+    end_pattern: Pattern[str] = Field(
+        description="Header end marker (regex or string)"
+    )
+ extract_header_fn: Callable[[Match[str]], str] = Field(
+ default=lambda m: m.group(0),
+        description="Extract header text from the start match (default: whole match)",
+ )
+    priority: int = Field(default=0, description="Higher priority configs match first")
+ case_sensitive: bool = Field(
+        default=True, description="Case sensitivity (string patterns only)"
+ )
+
+ def __init__(
+ self,
+ start_pattern: Union[str, Pattern[str]],
+ end_pattern: Union[str, Pattern[str]],
+ **kwargs,
+ ):
+ flags = 0 if kwargs.get("case_sensitive", True) else re.IGNORECASE
+ if isinstance(start_pattern, str):
+ start_pattern = re.compile(start_pattern, flags | re.DOTALL)
+ if isinstance(end_pattern, str):
+ end_pattern = re.compile(end_pattern, flags | re.DOTALL)
+ super().__init__(
+ start_pattern=start_pattern,
+ end_pattern=end_pattern,
+ **kwargs,
+ )
+
+
+# Default header hook configs (Markdown tables; the code-block config is disabled)
+DEFAULT_CONFIGS = [
+    # Code block config (opens with ```, closes with ```)
+    # HeaderTrackerHook(
+    #     # Code block start (optional language tag)
+    #     start_pattern=r"^\s*```(\w+).*(?!```$)",
+    #     # Code block end
+    #     end_pattern=r"^\s*```.*$",
+    #     extract_header_fn=lambda m: f"```{m.group(1)}" if m.group(1) else "```",
+    #     priority=20,  # code blocks take priority over tables
+    #     case_sensitive=True,
+    # ),
+    # Markdown table config (header row followed by a separator row)
+    HeaderTrackerHook(
+        # Header row + separator row
+        start_pattern=r"^\s*(?:\|[^|\n]*)+[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|?[\r\n]+$",
+        # Blank line or non-table content
+ end_pattern=r"^\s*$|^\s*[^|\s].*$",
+ priority=15,
+ case_sensitive=False,
+ ),
+]
+DEFAULT_CONFIGS.sort(key=lambda x: -x.priority)
+
+
+# Hook state data structure
+class HeaderTracker(BaseModel):
+    """State holder for the header-tracking hook"""
+
+ header_hook_configs: List[HeaderTrackerHook] = Field(default=DEFAULT_CONFIGS)
+ active_headers: Dict[int, str] = Field(default_factory=dict)
+ ended_headers: set[int] = Field(default_factory=set)
+
+ def update(self, split: str) -> Dict[int, str]:
+        """Detect header start/end markers in the current split and update state."""
+ new_headers: Dict[int, str] = {}
+
+        # 1. Check whether any active header has ended
+ for config in self.header_hook_configs:
+ if config.priority in self.active_headers and config.end_pattern.search(
+ split
+ ):
+ self.ended_headers.add(config.priority)
+ del self.active_headers[config.priority]
+
+        # 2. Check for new header starts (only configs neither active nor ended)
+ for config in self.header_hook_configs:
+ if (
+ config.priority not in self.active_headers
+ and config.priority not in self.ended_headers
+ ):
+ match = config.start_pattern.search(split)
+ if match:
+ header = config.extract_header_fn(match)
+ self.active_headers[config.priority] = header
+ new_headers[config.priority] = header
+
+        # 3. Once no headers remain active, clear the ended markers
+ if not self.active_headers:
+ self.ended_headers.clear()
+
+ return new_headers
+
+ def get_headers(self) -> str:
+        """Return the concatenated text of all active headers, ordered by priority."""
+        # Sort headers by priority, descending
+ sorted_headers = sorted(self.active_headers.items(), key=lambda x: -x[0])
+ return (
+ "\n".join([header for _, header in sorted_headers])
+ if sorted_headers
+ else ""
+ )
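+
+
+if __name__ == "__main__":
+    # Minimal illustration using the default Markdown-table config.
+    tracker = HeaderTracker()
+    tracker.update("| name | age |\n|------|-----|\n")
+    print(tracker.get_headers())        # the captured table header lines
+    tracker.update("\n")                # a blank split ends the tracked header
+    print(repr(tracker.get_headers()))  # ''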
diff --git a/docreader/splitter/splitter.py b/docreader/splitter/splitter.py
new file mode 100644
index 0000000..abb4851
--- /dev/null
+++ b/docreader/splitter/splitter.py
@@ -0,0 +1,313 @@
+"""Token splitter."""
+
+import itertools
+import logging
+import re
+from typing import Callable, Generic, List, Pattern, Tuple, TypeVar
+
+from pydantic import BaseModel, Field, PrivateAttr
+
+from docreader.splitter.header_hook import (
+ HeaderTracker,
+)
+from docreader.utils.split import split_by_char, split_by_sep
+
+DEFAULT_CHUNK_OVERLAP = 100
+DEFAULT_CHUNK_SIZE = 512
+
+T = TypeVar("T")
+
+logger = logging.getLogger(__name__)
+
+
+class TextSplitter(BaseModel, Generic[T]):
+ chunk_size: int = Field(description="The token chunk size for each chunk.")
+ chunk_overlap: int = Field(
+ description="The token overlap of each chunk when splitting."
+ )
+ separators: List[str] = Field(
+ description="Default separators for splitting into words"
+ )
+
+ # Try to keep the matched characters as a whole.
+ # If it's too long, the content will be further segmented.
+ protected_regex: List[str] = Field(
+ description="Protected regex for splitting into words"
+ )
+ len_function: Callable[[str], int] = Field(description="The length function.")
+ # Header tracking Hook related attributes
+ header_hook: HeaderTracker = Field(default_factory=HeaderTracker, exclude=True)
+
+ _protected_fns: List[Pattern] = PrivateAttr()
+ _split_fns: List[Callable] = PrivateAttr()
+
+ def __init__(
+ self,
+ chunk_size: int = DEFAULT_CHUNK_SIZE,
+ chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+ separators: List[str] = ["\n", "。", " "],
+ protected_regex: List[str] = [
+ # math formula
+ r"\$\$[\s\S]*?\$\$",
+ # image
+ r"!\[.*?\]\(.*?\)",
+ # link
+ r"\[.*?\]\(.*?\)",
+ # table header
+ r"(?:\|[^|\n]*)+\|[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|[\r\n]+",
+ # table body
+ r"(?:\|[^|\n]*)+\|[\r\n]+",
+ # code header
+ r"```(?:\w+)[\r\n]+[^\r\n]*",
+ ],
+ length_function: Callable[[str], int] = lambda x: len(x),
+ ):
+ """Initialize with parameters."""
+ if chunk_overlap > chunk_size:
+ raise ValueError(
+ f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
+ f"({chunk_size}), should be smaller."
+ )
+
+ super().__init__(
+ chunk_size=chunk_size,
+ chunk_overlap=chunk_overlap,
+ separators=separators,
+ protected_regex=protected_regex,
+ len_function=length_function,
+ )
+ self._protected_fns = [re.compile(reg) for reg in protected_regex]
+ self._split_fns = [split_by_sep(sep) for sep in separators] + [split_by_char()]
+
+ def split_text(self, text: str) -> List[Tuple[int, int, str]]:
+ """Split text into chunks."""
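+        # Chunks are (start, end, text) tuples; start/end are character offsets
+        # of the first and last split that make up the chunk.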
+ if text == "":
+ return []
+
+ splits = self._split(text)
+ protect = self._split_protected(text)
+ splits = self._join(splits, protect)
+
+ assert "".join(splits) == text
+
+ chunks = self._merge(splits)
+ return chunks
+
+ def _split(self, text: str) -> List[str]:
+ """Break text into splits that are smaller than chunk size.
+
+ NOTE: the splits contain the separators.
+ """
+ if self.len_function(text) <= self.chunk_size:
+ return [text]
+
+ splits = []
+ for split_fn in self._split_fns:
+ splits = split_fn(text)
+ if len(splits) > 1:
+ break
+
+ new_splits = []
+ for split in splits:
+ split_len = self.len_function(split)
+ if split_len <= self.chunk_size:
+ new_splits.append(split)
+ else:
+ # recursively split
+ new_splits.extend(self._split(split))
+ return new_splits
+
+ def _merge(self, splits: List[str]) -> List[Tuple[int, int, str]]:
+ """Merge splits into chunks.
+
+ The high-level idea is to keep adding splits to a chunk until we
+ exceed the chunk size, then we start a new chunk with overlap.
+
+ When we start a new chunk, we pop off the first element of the previous
+ chunk until the total length is less than the chunk size.
+ """
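+        # Illustrative walk-through (chunk_size=10, chunk_overlap=4, no headers):
+        # splits ["aaaa", "bbbb", "cccc"] -> first chunk "aaaabbbb"; adding "cccc"
+        # would overflow, so "aaaa" is popped and the last chunk is "bbbbcccc".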
+ chunks: List[Tuple[int, int, str]] = []
+
+ cur_chunk: List[Tuple[int, int, str]] = []
+
+ cur_headers, cur_len = "", 0
+ cur_start, cur_end = 0, 0
+ for split in splits:
+ cur_end = cur_start + len(split)
+ split_len = self.len_function(split)
+ if split_len > self.chunk_size:
+                logger.error(
+                    f"Got a split of size {split_len}, "
+                    f"larger than chunk size {self.chunk_size}."
+                )
+
+ self.header_hook.update(split)
+ cur_headers = self.header_hook.get_headers()
+ cur_headers_len = self.len_function(cur_headers)
+
+ if cur_headers_len > self.chunk_size:
+                logger.error(
+                    f"Got headers of size {cur_headers_len}, "
+                    f"larger than chunk size {self.chunk_size}."
+                )
+ cur_headers, cur_headers_len = "", 0
+
+ # if we exceed the chunk size after adding the new split, then
+ # we need to end the current chunk and start a new one
+ if cur_len + split_len + cur_headers_len > self.chunk_size:
+ # end the previous chunk
+ if len(cur_chunk) > 0:
+ chunks.append(
+ (
+ cur_chunk[0][0],
+ cur_chunk[-1][1],
+ "".join([c[2] for c in cur_chunk]),
+ )
+ )
+
+ # start a new chunk with overlap
+ # keep popping off the first element of the previous chunk until:
+ # 1. the current chunk length is less than chunk overlap
+ # 2. the total length is less than chunk size
+ while cur_chunk and (
+ cur_len > self.chunk_overlap
+ or cur_len + split_len + cur_headers_len > self.chunk_size
+ ):
+ # pop off the first element
+ first_chunk = cur_chunk.pop(0)
+ cur_len -= self.len_function(first_chunk[2])
+
+ if (
+ cur_headers
+ and split_len + cur_headers_len < self.chunk_size
+ and cur_headers not in split
+ ):
+ cur_chunk.insert(
+ 0,
+ (
+ cur_chunk[0][0] if cur_chunk else cur_start,
+ cur_chunk[0][1] if cur_chunk else cur_end,
+ cur_headers,
+ ),
+ )
+ cur_len += cur_headers_len
+
+ cur_chunk.append((cur_start, cur_end, split))
+ cur_len += split_len
+ cur_start = cur_end
+
+ # handle the last chunk
+ assert cur_chunk
+ if cur_headers and cur_len < self.chunk_size:
+ cur_chunk.insert(0, (cur_chunk[0][0], cur_chunk[0][1], cur_headers))
+ chunks.append(
+ (
+ cur_chunk[0][0],
+ cur_chunk[-1][1],
+ "".join([c[2] for c in cur_chunk]),
+ )
+ )
+
+ return chunks
+
+ def _split_protected(self, text: str) -> List[Tuple[int, str]]:
+ matches = [
+ (match.start(), match.end())
+ for pattern in self._protected_fns
+ for match in pattern.finditer(text)
+ ]
+ matches.sort(key=lambda x: (x[0], -x[1]))
+
+ res = []
+
+ def fold(initial: int, current: Tuple[int, int]) -> int:
+ if current[0] >= initial:
+ if current[1] - current[0] < self.chunk_size:
+ res.append((current[0], text[current[0] : current[1]]))
+ else:
+                    logger.warning(f"Protected span too long, ignored: {current}")
+ return max(initial, current[1])
+
+ # filter overlapping matches
+ list(itertools.accumulate(matches, fold, initial=-1))
+ return res
+
+ def _join(self, splits: List[str], protect: List[Tuple[int, str]]) -> List[str]:
+ """
+ Merges and splits elements in splits array based on protected substrings.
+
+ The function processes the input splits to ensure all protected substrings
+ remain as single items. If a protected substring is concatenated with preceding
+ or following content in any split element, it will be separated from
+ the adjacent content. The final result maintains the original order of content
+ while enforcing the integrity of protected substrings.
+
+ Key behaviors:
+ 1. Preserves the complete structure of each protected substring
+ 2. Separates protected substrings from any adjacent non-protected content
+        3. Maintains the original sequence of content, adding only the splits
+           needed to isolate protected substrings
+ 4. Handles cases where protected substrings are partially concatenated
+ """
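+        # Illustrative example: splits ["ab![i", "mg](x)cd"] with the protected
+        # span "![img](x)" at offset 2 re-join as ["ab", "![img](x)", "cd"].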
+ j = 0
+ point, start = 0, 0
+ res = []
+
+ for split in splits:
+ end = start + len(split)
+
+ cur = split[point - start :]
+ while j < len(protect):
+ p_start, p_content = protect[j]
+ p_end = p_start + len(p_content)
+
+ if end <= p_start:
+ break
+
+ if point < p_start:
+ local_end = p_start - point
+ res.append(cur[:local_end])
+ cur = cur[local_end:]
+ point = p_start
+
+ res.append(p_content)
+ j += 1
+
+ if point < p_end:
+ local_start = p_end - point
+ cur = cur[local_start:]
+ point = p_end
+
+ if not cur:
+ break
+
+ if cur:
+ res.append(cur)
+ point = end
+
+ start = end
+ return res
+
+
+if __name__ == "__main__":
+ s = """
+ 这是一些普通文本。
+
+ | 姓名 | 年龄 | 城市 |
+ |------|------|------|
+ | 张三 | 25 | 北京 |
+ | 李四 | 30 | 上海 |
+ | 王五 | 28 | 广州 |
+ | 张三 | 25 | 北京 |
+ | 李四 | 30 | 上海 |
+ | 王五 | 28 | 广州 |
+
+ 这是文本结束。
+
+"""
+
+    sp = TextSplitter(chunk_size=200, chunk_overlap=2)
+    ck = sp.split_text(s)
+    for _, _, chunk in ck:
+        print("------", len(chunk))
+        print(chunk)
diff --git a/docreader/utils/endecode.py b/docreader/utils/endecode.py
new file mode 100644
index 0000000..2457d07
--- /dev/null
+++ b/docreader/utils/endecode.py
@@ -0,0 +1,103 @@
+import base64
+import binascii
+import io
+import logging
+from typing import List, Union
+
+import numpy as np
+from PIL import Image
+
+logger = logging.getLogger(__name__)
+
+
+def decode_image(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
+ """Convert image to base64 encoded string
+
+ Args:
+ image: Image file path, bytes, PIL Image object, or numpy array
+
+ Returns:
+        Base64 encoded image string; raises ValueError for unsupported types
+ """
+ if isinstance(image, str):
+ # It's a file path
+ with open(image, "rb") as image_file:
+ return base64.b64encode(image_file.read()).decode()
+
+ elif isinstance(image, bytes):
+ # It's bytes data
+ return base64.b64encode(image).decode()
+
+ elif isinstance(image, Image.Image):
+ # It's a PIL Image
+ buffer = io.BytesIO()
+        image.save(buffer, format=image.format or "PNG")
+ return base64.b64encode(buffer.getvalue()).decode()
+
+ elif isinstance(image, np.ndarray):
+ # It's a numpy array
+ pil_image = Image.fromarray(image)
+ buffer = io.BytesIO()
+ pil_image.save(buffer, format="PNG")
+ return base64.b64encode(buffer.getvalue()).decode()
+
+ raise ValueError(f"Unsupported image type: {type(image)}")
+
+
+def encode_image(image: str, errors="strict") -> bytes:
+ """
+    Decode a base64-encoded image string into raw bytes.
+
+    errors
+        The error handling scheme for malformed base64 input.
+        The default is 'strict', meaning that decoding errors raise a
+        binascii.Error; 'ignore' returns empty bytes instead.
+ """
+ try:
+ image_bytes = base64.b64decode(image)
+ except binascii.Error as e:
+ if errors == "ignore":
+ return b""
+ else:
+ raise e
+ return image_bytes
+
+
+def encode_bytes(content: str) -> bytes:
+ return content.encode()
+
+
+def decode_bytes(
+ content: bytes,
+ encodings: List[str] = [
+ "utf-8",
+ "gb18030",
+ "gb2312",
+ "gbk",
+ "big5",
+ "ascii",
+ "latin-1",
+ ],
+) -> str:
+ # Try decoding with each encoding format
+ for encoding in encodings:
+ try:
+ text = content.decode(encoding)
+ logger.debug(f"Decode content with {encoding}: {len(text)} characters")
+ return text
+ except UnicodeDecodeError:
+ continue
+
+ text = content.decode(encoding="latin-1", errors="replace")
+ logger.warning(
+ "Unable to determine correct encoding, using latin-1 as fallback. "
+ "This may cause character issues."
+ )
+ return text
+
+
+if __name__ == "__main__":
+ img = "testtest"
+ encode_image(img, errors="ignore")
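+    # decode_bytes walks the encoding list until one succeeds: here utf-8
+    # fails and gb18030 (a superset of GBK) decodes the bytes.
+    print(decode_bytes("中文".encode("gbk")))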
diff --git a/docreader/utils/request.py b/docreader/utils/request.py
index 867892c..0a6af5b 100644
--- a/docreader/utils/request.py
+++ b/docreader/utils/request.py
@@ -1,10 +1,10 @@
-from contextvars import ContextVar
-import logging
-import uuid
import contextlib
+import logging
import time
-from typing import Optional
+import uuid
+from contextvars import ContextVar
from logging import LogRecord
+from typing import Optional
# 配置日志
logger = logging.getLogger(__name__)
@@ -26,21 +26,21 @@ def get_request_id() -> Optional[str]:
class MillisecondFormatter(logging.Formatter):
"""自定义日志格式化器,只显示毫秒级时间戳(3位数字)而不是微秒(6位)"""
-
+
def formatTime(self, record, datefmt=None):
"""重写formatTime方法,将微秒格式化为毫秒"""
# 先获取标准的格式化时间
result = super().formatTime(record, datefmt)
-
+
# 如果使用了包含.%f的格式,则将微秒(6位)截断为毫秒(3位)
if datefmt and ".%f" in datefmt:
# 格式化的时间字符串应该在最后有6位微秒数
- parts = result.split('.')
+ parts = result.split(".")
if len(parts) > 1 and len(parts[1]) >= 6:
# 只保留前3位作为毫秒
millis = parts[1][:3]
result = f"{parts[0]}.{millis}"
-
+
return result
diff --git a/docreader/utils/split.py b/docreader/utils/split.py
new file mode 100644
index 0000000..6442c4f
--- /dev/null
+++ b/docreader/utils/split.py
@@ -0,0 +1,34 @@
+import re
+from typing import Callable, List
+
+
+def split_text_keep_separator(text: str, separator: str) -> List[str]:
+    """Split text by separator, keeping it at the start of each subsequent split."""
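+    # e.g. split_text_keep_separator("a\nb\nc", "\n") -> ["a", "\nb", "\nc"]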
+ parts = text.split(separator)
+ result = [separator + s if i > 0 else s for i, s in enumerate(parts)]
+ return [s for s in result if s]
+
+
+def split_by_sep(sep: str, keep_sep: bool = True) -> Callable[[str], List[str]]:
+ """Split text by separator."""
+ if keep_sep:
+ return lambda text: split_text_keep_separator(text, sep)
+ else:
+ return lambda text: text.split(sep)
+
+
+def split_by_char() -> Callable[[str], List[str]]:
+ """Split text by character."""
+ return lambda text: list(text)
+
+
+def split_by_regex(regex: str) -> Callable[[str], List[str]]:
+ """Split text by regex."""
+ pattern = re.compile(f"({regex})")
+ return lambda text: list(filter(None, pattern.split(text)))
+
+
+def match_by_regex(regex: str) -> Callable[[str], bool]:
+    """Return a predicate that tests whether text matches the regex."""
+ pattern = re.compile(regex)
+ return lambda text: bool(pattern.match(text))
diff --git a/docreader/utils/tempfile.py b/docreader/utils/tempfile.py
new file mode 100644
index 0000000..ab61619
--- /dev/null
+++ b/docreader/utils/tempfile.py
@@ -0,0 +1,77 @@
+import logging
+import os
+import tempfile
+
+logger = logging.getLogger(__name__)
+
+
+class TempFileContext:
+ def __init__(self, file_content: bytes, suffix: str):
+ """
+ Initialize the context
+ :param file_content: Byte data to write to file
+ :param suffix: File suffix
+ """
+ self.file_content = file_content
+ self.suffix = suffix
+        self.temp_file = None
+
+ def __enter__(self):
+ """
+ Create file when entering context
+ """
+ self.temp_file = tempfile.NamedTemporaryFile(suffix=self.suffix, delete=False)
+ self.temp_file.write(self.file_content)
+ self.temp_file.flush()
+ logger.info(
+ f"Saved {self.suffix} content to temporary file: {self.temp_file.name}"
+ )
+ return self.temp_file.name
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ """
+ Delete file when exiting context
+ """
+ if self.temp_file:
+ self.temp_file.close()
+ if os.path.exists(self.temp_file.name):
+ os.remove(self.temp_file.name)
+ logger.info(f"File {self.temp_file.name} has been deleted.")
+ # Return False to propagate exception (if any exception occurred)
+ return False
+
+
+class TempDirContext:
+ def __init__(self):
+ """
+ Initialize the context
+ """
+ self.temp_dir = None
+
+ def __enter__(self):
+ """
+ Create directory when entering context
+ """
+ self.temp_dir = tempfile.TemporaryDirectory()
+ logger.info(f"Created temporary directory: {self.temp_dir.name}")
+ return self.temp_dir.name
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ """
+ Delete directory when exiting context
+ """
+ if self.temp_dir and os.path.exists(self.temp_dir.name):
+ self.temp_dir.cleanup()
+ logger.info(f"Directory {self.temp_dir.name} has been deleted.")
+ # Return False to propagate exception (if any exception occurred)
+ return False
+
+
+if __name__ == "__main__":
+ example_bytes = b"Hello, this is a test file."
+    suffix = ".txt"
+
+    # Using with statement
+    with TempFileContext(example_bytes, suffix) as temp_file:
+        # File operations can be performed within the context
+        print(f"Does file {temp_file} exist: {os.path.exists(temp_file)}")
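+
+    # TempDirContext usage (illustrative): the directory is removed on exit.
+    with TempDirContext() as temp_dir:
+        print(f"Does directory {temp_dir} exist: {os.path.exists(temp_dir)}")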
diff --git a/docreader/uv.lock b/docreader/uv.lock
index c5bfad0..8e53ad9 100644
--- a/docreader/uv.lock
+++ b/docreader/uv.lock
@@ -6,17 +6,22 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
- "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version >= '3.14' and sys_platform == 'win32'",
+ "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
+ "python_full_version == '3.13.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version == '3.12.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version == '3.11.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version < '3.11' and sys_platform == 'win32'",
+ "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
[[package]]
@@ -423,6 +428,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/db/d3/9dcc0f5797f070ec8edf30fbadfb200e71d9db6b84d211e3b2085a7589a0/click-8.3.0-py3-none-any.whl", hash = "sha256:9b9f285302c6e3064f4330c05f05b81945b2a39544279343e6e7c5f27a9baddc", size = 107295, upload-time = "2025-09-18T17:32:22.42Z" },
]
+[[package]]
+name = "cobble"
+version = "0.1.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/54/7a/a507c709be2c96e1bb6102eb7b7f4026c5e5e223ef7d745a17d239e9d844/cobble-0.1.4.tar.gz", hash = "sha256:de38be1539992c8a06e569630717c485a5f91be2192c461ea2b220607dfa78aa", size = 3805, upload-time = "2024-06-01T18:11:09.528Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/d5/e1/3714a2f371985215c219c2a70953d38e3eed81ef165aed061d21de0e998b/cobble-0.1.4-py3-none-any.whl", hash = "sha256:36c91b1655e599fd428e2b95fdd5f0da1ca2e9f1abb0bc871dec21a0e78a2b44", size = 3984, upload-time = "2024-06-01T18:11:07.911Z" },
+]
+
[[package]]
name = "colorama"
version = "0.4.6"
@@ -432,6 +446,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
]
+[[package]]
+name = "coloredlogs"
+version = "15.0.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "humanfriendly" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/cc/c7/eed8f27100517e8c0e6b923d5f0845d0cb99763da6fdee00478f91db7325/coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0", size = 278520, upload-time = "2021-06-11T10:22:45.202Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018, upload-time = "2021-06-11T10:22:42.561Z" },
+]
+
[[package]]
name = "cos-python-sdk-v5"
version = "1.9.38"
@@ -587,6 +613,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/18/d5/7a04640bf559bb890455ffb28978daf7d44f667c3f04a4d422c655c1ba92/cython-3.1.6-py3-none-any.whl", hash = "sha256:91dcf7eb9b6a089ce4e9e1140e571d84c3bca834afb77ec269be7aa9d31a8157", size = 1223550, upload-time = "2025-10-23T12:38:16.732Z" },
]
+[[package]]
+name = "defusedxml"
+version = "0.7.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/0f/d5/c66da9b79e5bdb124974bfe172b4daf3c984ebd9c2a06e2b8a4dc7331c72/defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69", size = 75520, upload-time = "2021-03-08T10:59:26.269Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" },
+]
+
[[package]]
name = "distro"
version = "1.9.0"
@@ -612,6 +647,7 @@ dependencies = [
{ name = "lxml" },
{ name = "markdown" },
{ name = "markdownify" },
+ { name = "markitdown", extra = ["docx", "pdf", "xls", "xlsx"] },
{ name = "minio" },
{ name = "mistletoe" },
{ name = "ollama" },
@@ -622,6 +658,7 @@ dependencies = [
{ name = "pillow" },
{ name = "playwright" },
{ name = "protobuf" },
+ { name = "pydantic" },
{ name = "pypdf" },
{ name = "pypdf2" },
{ name = "python-docx" },
@@ -643,6 +680,7 @@ requires-dist = [
{ name = "lxml", specifier = ">=6.0.2" },
{ name = "markdown", specifier = ">=3.10" },
{ name = "markdownify", specifier = ">=1.2.0" },
+ { name = "markitdown", extras = ["docx", "pdf", "xls", "xlsx"], specifier = ">=0.1.3" },
{ name = "minio", specifier = ">=7.2.18" },
{ name = "mistletoe", specifier = ">=1.5.0" },
{ name = "ollama", specifier = ">=0.6.0" },
@@ -653,6 +691,7 @@ requires-dist = [
{ name = "pillow", specifier = ">=12.0.0" },
{ name = "playwright", specifier = ">=1.55.0" },
{ name = "protobuf", specifier = ">=6.33.0" },
+ { name = "pydantic", specifier = ">=2.12.3" },
{ name = "pypdf", specifier = ">=6.1.3" },
{ name = "pypdf2", specifier = ">=3.0.1" },
{ name = "python-docx", specifier = ">=1.2.0" },
@@ -683,6 +722,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/bf/ee/aa015c5de8b0dc42a8e507eae8c2de5d1c0e068c896858fec6d502402ed6/ebooklib-0.20-py3-none-any.whl", hash = "sha256:fff5322517a37e31c972d27be7d982cc3928c16b3dcc5fd7e8f7c0f5d7bcf42b", size = 40995, upload-time = "2025-10-26T20:56:19.104Z" },
]
+[[package]]
+name = "et-xmlfile"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" },
+]
+
[[package]]
name = "exceptiongroup"
version = "1.3.0"
@@ -707,6 +755,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e5/4c/93d0f85318da65923e4b91c1c2ff03d8a458cbefebe3bc612a6693c7906d/fire-0.7.1-py3-none-any.whl", hash = "sha256:e43fd8a5033a9001e7e2973bab96070694b9f12f2e0ecf96d4683971b5ab1882", size = 115945, upload-time = "2025-08-16T20:20:22.87Z" },
]
+[[package]]
+name = "flatbuffers"
+version = "25.9.23"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9d/1f/3ee70b0a55137442038f2a33469cc5fddd7e0ad2abf83d7497c18a2b6923/flatbuffers-25.9.23.tar.gz", hash = "sha256:676f9fa62750bb50cf531b42a0a2a118ad8f7f797a511eda12881c016f093b12", size = 22067, upload-time = "2025-09-24T05:25:30.106Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/ee/1b/00a78aa2e8fbd63f9af08c9c19e6deb3d5d66b4dda677a0f61654680ee89/flatbuffers-25.9.23-py2.py3-none-any.whl", hash = "sha256:255538574d6cb6d0a79a17ec8bc0d30985913b87513a01cce8bcdb6b4c44d0e2", size = 30869, upload-time = "2025-09-24T05:25:28.912Z" },
+]
+
[[package]]
name = "fonttools"
version = "4.60.1"
@@ -850,6 +907,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/7f/91/ae2eb6b7979e2f9b035a9f612cf70f1bf54aad4e1d125129bef1eae96f19/greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d", size = 584358, upload-time = "2025-08-07T13:18:23.708Z" },
{ url = "https://files.pythonhosted.org/packages/f7/85/433de0c9c0252b22b16d413c9407e6cb3b41df7389afc366ca204dbc1393/greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5", size = 1113550, upload-time = "2025-08-07T13:42:37.467Z" },
{ url = "https://files.pythonhosted.org/packages/a1/8d/88f3ebd2bc96bf7747093696f4335a0a8a4c5acfcf1b757717c0d2474ba3/greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f", size = 1137126, upload-time = "2025-08-07T13:18:20.239Z" },
+ { url = "https://files.pythonhosted.org/packages/f1/29/74242b7d72385e29bcc5563fba67dad94943d7cd03552bac320d597f29b2/greenlet-3.2.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f47617f698838ba98f4ff4189aef02e7343952df3a615f847bb575c3feb177a7", size = 1544904, upload-time = "2025-11-04T12:42:04.763Z" },
+ { url = "https://files.pythonhosted.org/packages/c8/e2/1572b8eeab0f77df5f6729d6ab6b141e4a84ee8eb9bc8c1e7918f94eda6d/greenlet-3.2.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af41be48a4f60429d5cad9d22175217805098a9ef7c40bfef44f7669fb9d74d8", size = 1611228, upload-time = "2025-11-04T12:42:08.423Z" },
{ url = "https://files.pythonhosted.org/packages/d6/6f/b60b0291d9623c496638c582297ead61f43c4b72eef5e9c926ef4565ec13/greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c", size = 298654, upload-time = "2025-08-07T13:50:00.469Z" },
{ url = "https://files.pythonhosted.org/packages/a4/de/f28ced0a67749cac23fecb02b694f6473f47686dff6afaa211d186e2ef9c/greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2", size = 272305, upload-time = "2025-08-07T13:15:41.288Z" },
{ url = "https://files.pythonhosted.org/packages/09/16/2c3792cba130000bf2a31c5272999113f4764fd9d874fb257ff588ac779a/greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246", size = 632472, upload-time = "2025-08-07T13:42:55.044Z" },
@@ -859,6 +918,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684, upload-time = "2025-08-07T13:18:25.164Z" },
{ url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647, upload-time = "2025-08-07T13:42:38.655Z" },
{ url = "https://files.pythonhosted.org/packages/3f/cc/b07000438a29ac5cfb2194bfc128151d52f333cee74dd7dfe3fb733fc16c/greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa", size = 1142073, upload-time = "2025-08-07T13:18:21.737Z" },
+ { url = "https://files.pythonhosted.org/packages/67/24/28a5b2fa42d12b3d7e5614145f0bd89714c34c08be6aabe39c14dd52db34/greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c", size = 1548385, upload-time = "2025-11-04T12:42:11.067Z" },
+ { url = "https://files.pythonhosted.org/packages/6a/05/03f2f0bdd0b0ff9a4f7b99333d57b53a7709c27723ec8123056b084e69cd/greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5", size = 1613329, upload-time = "2025-11-04T12:42:12.928Z" },
{ url = "https://files.pythonhosted.org/packages/d8/0f/30aef242fcab550b0b3520b8e3561156857c94288f0332a79928c31a52cf/greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9", size = 299100, upload-time = "2025-08-07T13:44:12.287Z" },
{ url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" },
{ url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" },
@@ -868,6 +929,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" },
{ url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" },
{ url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" },
+ { url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" },
+ { url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" },
{ url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" },
{ url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" },
{ url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" },
@@ -877,6 +940,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" },
{ url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" },
{ url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" },
+ { url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" },
+ { url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" },
{ url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" },
{ url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" },
{ url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" },
@@ -884,6 +949,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" },
{ url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" },
{ url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" },
+ { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" },
+ { url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760, upload-time = "2025-11-04T12:42:25.341Z" },
{ url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" },
]
@@ -1061,6 +1128,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
]
+[[package]]
+name = "humanfriendly"
+version = "10.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "pyreadline3", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/cc/3f/2c29224acb2e2df4d2046e4c73ee2662023c58ff5b113c4c1adac0886c43/humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc", size = 360702, upload-time = "2021-09-17T21:40:43.31Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", size = 86794, upload-time = "2021-09-17T21:40:39.897Z" },
+]
+
[[package]]
name = "idna"
version = "3.11"
@@ -1386,6 +1465,38 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6c/77/d7f491cbc05303ac6801651aabeb262d43f319288c1ea96c66b1d2692ff3/lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e", size = 3518768, upload-time = "2025-09-22T04:04:57.097Z" },
]
+[[package]]
+name = "magika"
+version = "0.6.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "click" },
+ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+ { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+ { name = "onnxruntime", version = "1.20.1", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
+ { name = "onnxruntime", version = "1.23.2", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
+ { name = "python-dotenv" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a3/f3/3d1dcdd7b9c41d589f5cff252d32ed91cdf86ba84391cfc81d9d8773571d/magika-0.6.3.tar.gz", hash = "sha256:7cc52aa7359af861957043e2bf7265ed4741067251c104532765cd668c0c0cb1", size = 3042784, upload-time = "2025-10-30T15:22:34.499Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/a2/e4/35c323beb3280482c94299d61626116856ac2d4ec16ecef50afc4fdd4291/magika-0.6.3-py3-none-any.whl", hash = "sha256:eda443d08006ee495e02083b32e51b98cb3696ab595a7d13900d8e2ef506ec9d", size = 2969474, upload-time = "2025-10-30T15:22:25.298Z" },
+ { url = "https://files.pythonhosted.org/packages/25/8f/132b0d7cd51c02c39fd52658a5896276c30c8cc2fd453270b19db8c40f7e/magika-0.6.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:86901e64b05dde5faff408c9b8245495b2e1fd4c226e3393d3d2a3fee65c504b", size = 13358841, upload-time = "2025-10-30T15:22:27.413Z" },
+ { url = "https://files.pythonhosted.org/packages/c4/03/5ed859be502903a68b7b393b17ae0283bf34195cfcca79ce2dc25b9290e7/magika-0.6.3-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:3d9661eedbdf445ac9567e97e7ceefb93545d77a6a32858139ea966b5806fb64", size = 15367335, upload-time = "2025-10-30T15:22:29.907Z" },
+ { url = "https://files.pythonhosted.org/packages/7b/9e/f8ee7d644affa3b80efdd623a3d75865c8f058f3950cb87fb0c48e3559bc/magika-0.6.3-py3-none-win_amd64.whl", hash = "sha256:e57f75674447b20cab4db928ae58ab264d7d8582b55183a0b876711c2b2787f3", size = 12692831, upload-time = "2025-10-30T15:22:32.063Z" },
+]
+
+[[package]]
+name = "mammoth"
+version = "1.11.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "cobble" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ed/3c/a58418d2af00f2da60d4a51e18cd0311307b72d48d2fffec36a97b4a5e44/mammoth-1.11.0.tar.gz", hash = "sha256:a0f59e442f34d5b6447f4b0999306cbf3e67aaabfa8cb516f878fb1456744637", size = 53142, upload-time = "2025-09-19T10:35:20.373Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/ca/54/2e39566a131b13f6d8d193f974cb6a34e81bb7cc2fa6f7e03de067b36588/mammoth-1.11.0-py2.py3-none-any.whl", hash = "sha256:c077ab0d450bd7c0c6ecd529a23bf7e0fa8190c929e28998308ff4eada3f063b", size = 54752, upload-time = "2025-09-19T10:35:18.699Z" },
+]
+
[[package]]
name = "markdown"
version = "3.10"
@@ -1408,6 +1519,41 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6a/e2/7af643acb4cae0741dffffaa7f3f7c9e7ab4046724543ba1777c401d821c/markdownify-1.2.0-py3-none-any.whl", hash = "sha256:48e150a1c4993d4d50f282f725c0111bd9eb25645d41fa2f543708fd44161351", size = 15561, upload-time = "2025-08-09T17:44:14.074Z" },
]
+[[package]]
+name = "markitdown"
+version = "0.1.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "beautifulsoup4" },
+ { name = "charset-normalizer" },
+ { name = "defusedxml" },
+ { name = "magika" },
+ { name = "markdownify" },
+ { name = "onnxruntime", version = "1.20.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'win32'" },
+ { name = "requests" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/87/31/90cef2bc8ecd85c200ed3b3d1e20fc7a724213502685c4b05b5431e02668/markitdown-0.1.3.tar.gz", hash = "sha256:b0d9127c3373a68274dede6af6c9bb0684b78ce364c727c4c304da97a20d6fd9", size = 40039, upload-time = "2025-08-26T22:37:04.4Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/97/83/7b47d2ecbf58650a03aeeb21ba2d59175f202bf4fb81d44f40f1deb82bc0/markitdown-0.1.3-py3-none-any.whl", hash = "sha256:08d9a25770979d78f60dcc0afcb868de6799608e4db65342b2e03304fb091251", size = 58391, upload-time = "2025-08-26T22:37:02.924Z" },
+]
+
+[package.optional-dependencies]
+docx = [
+ { name = "lxml" },
+ { name = "mammoth" },
+]
+pdf = [
+ { name = "pdfminer-six" },
+]
+xls = [
+ { name = "pandas" },
+ { name = "xlrd" },
+]
+xlsx = [
+ { name = "openpyxl" },
+ { name = "pandas" },
+]
+
[[package]]
name = "minio"
version = "7.2.18"
@@ -1433,6 +1579,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3f/c2/bd0fc48dc323035cd65d21c45a3bb5c7b054001bf4ea75e461beb7656e07/mistletoe-1.5.0-py3-none-any.whl", hash = "sha256:d4e77b991b998c5efe3c4eab4e1b472263b5349688acd50d79bd9a6c317a9df1", size = 55262, upload-time = "2025-10-18T16:37:08.376Z" },
]
+[[package]]
+name = "mpmath"
+version = "1.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" },
+]
+
[[package]]
name = "networkx"
version = "3.4.2"
@@ -1440,7 +1595,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version < '3.11' and sys_platform == 'win32'",
+ "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" }
wheels = [
@@ -1456,14 +1612,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
- "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version >= '3.14' and sys_platform == 'win32'",
+ "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
+ "python_full_version == '3.13.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version == '3.12.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version == '3.11.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" }
wheels = [
@@ -1492,7 +1652,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version < '3.11' and sys_platform == 'win32'",
+ "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" }
wheels = [
@@ -1561,14 +1722,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
- "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version >= '3.14' and sys_platform == 'win32'",
+ "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
+ "python_full_version == '3.13.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version == '3.12.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version == '3.11.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
sdist = { url = "https://files.pythonhosted.org/packages/b5/f4/098d2270d52b41f1bd7db9fc288aaa0400cb48c2a3e2af6fa365d9720947/numpy-2.3.4.tar.gz", hash = "sha256:a7d018bfedb375a8d979ac758b120ba846a7fe764911a64465fd87b8729f4a6a", size = 20582187, upload-time = "2025-10-15T16:18:11.77Z" }
wheels = [
@@ -1660,6 +1825,97 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b5/c1/edc9f41b425ca40b26b7c104c5f6841a4537bb2552bfa6ca66e81405bb95/ollama-0.6.0-py3-none-any.whl", hash = "sha256:534511b3ccea2dff419ae06c3b58d7f217c55be7897c8ce5868dfb6b219cf7a0", size = 14130, upload-time = "2025-09-24T22:46:01.19Z" },
]
+[[package]]
+name = "onnxruntime"
+version = "1.20.1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version >= '3.14' and sys_platform == 'win32'",
+ "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
+ "python_full_version == '3.13.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
+ "python_full_version == '3.12.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
+ "python_full_version == '3.11.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
+ "python_full_version < '3.11' and sys_platform == 'win32'",
+ "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
+]
+dependencies = [
+ { name = "coloredlogs", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
+ { name = "flatbuffers", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
+ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')" },
+ { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')" },
+ { name = "packaging", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
+ { name = "protobuf", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
+ { name = "sympy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
+]
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/6d/c6/c4c0860bee2fde6037bdd9dcd12d323f6e38cf00fcc9a5065b394337fc55/onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7b2908b50101a19e99c4d4e97ebb9905561daf61829403061c1adc1b588bc0de", size = 11954010, upload-time = "2024-11-21T00:48:35.254Z" },
+ { url = "https://files.pythonhosted.org/packages/63/47/3dc0b075ab539f16b3d8b09df6b504f51836086ee709690a6278d791737d/onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d82daaec24045a2e87598b8ac2b417b1cce623244e80e663882e9fe1aae86410", size = 13330452, upload-time = "2024-11-21T00:48:40.02Z" },
+ { url = "https://files.pythonhosted.org/packages/27/ef/80fab86289ecc01a734b7ddf115dfb93d8b2e004bd1e1977e12881c72b12/onnxruntime-1.20.1-cp310-cp310-win32.whl", hash = "sha256:4c4b251a725a3b8cf2aab284f7d940c26094ecd9d442f07dd81ab5470e99b83f", size = 9813849, upload-time = "2024-11-21T00:48:43.569Z" },
+ { url = "https://files.pythonhosted.org/packages/a9/e6/33ab10066c9875a29d55e66ae97c3bf91b9b9b987179455d67c32261a49c/onnxruntime-1.20.1-cp310-cp310-win_amd64.whl", hash = "sha256:d3b616bb53a77a9463707bb313637223380fc327f5064c9a782e8ec69c22e6a2", size = 11329702, upload-time = "2024-11-21T00:48:46.599Z" },
+ { url = "https://files.pythonhosted.org/packages/a5/da/c44bf9bd66cd6d9018a921f053f28d819445c4d84b4dd4777271b0fe52a2/onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f6243e34d74423bdd1edf0ae9596dd61023b260f546ee17d701723915f06a9f7", size = 11955227, upload-time = "2024-11-21T00:48:54.556Z" },
+ { url = "https://files.pythonhosted.org/packages/11/ac/4120dfb74c8e45cce1c664fc7f7ce010edd587ba67ac41489f7432eb9381/onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5eec64c0269dcdb8d9a9a53dc4d64f87b9e0c19801d9321246a53b7eb5a7d1bc", size = 13331703, upload-time = "2024-11-21T00:48:57.97Z" },
+ { url = "https://files.pythonhosted.org/packages/12/f1/cefacac137f7bb7bfba57c50c478150fcd3c54aca72762ac2c05ce0532c1/onnxruntime-1.20.1-cp311-cp311-win32.whl", hash = "sha256:a19bc6e8c70e2485a1725b3d517a2319603acc14c1f1a017dda0afe6d4665b41", size = 9813977, upload-time = "2024-11-21T00:49:00.519Z" },
+ { url = "https://files.pythonhosted.org/packages/2c/2d/2d4d202c0bcfb3a4cc2b171abb9328672d7f91d7af9ea52572722c6d8d96/onnxruntime-1.20.1-cp311-cp311-win_amd64.whl", hash = "sha256:8508887eb1c5f9537a4071768723ec7c30c28eb2518a00d0adcd32c89dea3221", size = 11329895, upload-time = "2024-11-21T00:49:03.845Z" },
+ { url = "https://files.pythonhosted.org/packages/c5/9d/a42a84e10f1744dd27c6f2f9280cc3fb98f869dd19b7cd042e391ee2ab61/onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1f56e898815963d6dc4ee1c35fc6c36506466eff6d16f3cb9848cea4e8c8172", size = 11952833, upload-time = "2024-11-21T00:49:10.563Z" },
+ { url = "https://files.pythonhosted.org/packages/47/42/2f71f5680834688a9c81becbe5c5bb996fd33eaed5c66ae0606c3b1d6a02/onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bb71a814f66517a65628c9e4a2bb530a6edd2cd5d87ffa0af0f6f773a027d99e", size = 13333903, upload-time = "2024-11-21T00:49:12.984Z" },
+ { url = "https://files.pythonhosted.org/packages/c8/f1/aabfdf91d013320aa2fc46cf43c88ca0182860ff15df872b4552254a9680/onnxruntime-1.20.1-cp312-cp312-win32.whl", hash = "sha256:bd386cc9ee5f686ee8a75ba74037750aca55183085bf1941da8efcfe12d5b120", size = 9814562, upload-time = "2024-11-21T00:49:15.453Z" },
+ { url = "https://files.pythonhosted.org/packages/dd/80/76979e0b744307d488c79e41051117634b956612cc731f1028eb17ee7294/onnxruntime-1.20.1-cp312-cp312-win_amd64.whl", hash = "sha256:19c2d843eb074f385e8bbb753a40df780511061a63f9def1b216bf53860223fb", size = 11331482, upload-time = "2024-11-21T00:49:19.412Z" },
+ { url = "https://files.pythonhosted.org/packages/81/0d/13bbd9489be2a6944f4a940084bfe388f1100472f38c07080a46fbd4ab96/onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fb44b08e017a648924dbe91b82d89b0c105b1adcfe31e90d1dc06b8677ad37be", size = 11951459, upload-time = "2024-11-21T00:49:26.269Z" },
+ { url = "https://files.pythonhosted.org/packages/c0/ea/4454ae122874fd52bbb8a961262de81c5f932edeb1b72217f594c700d6ef/onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bda6aebdf7917c1d811f21d41633df00c58aff2bef2f598f69289c1f1dabc4b3", size = 13331620, upload-time = "2024-11-21T00:49:28.875Z" },
+ { url = "https://files.pythonhosted.org/packages/d8/e0/50db43188ca1c945decaa8fc2a024c33446d31afed40149897d4f9de505f/onnxruntime-1.20.1-cp313-cp313-win_amd64.whl", hash = "sha256:d30367df7e70f1d9fc5a6a68106f5961686d39b54d3221f760085524e8d38e16", size = 11331758, upload-time = "2024-11-21T00:49:31.417Z" },
+ { url = "https://files.pythonhosted.org/packages/d8/55/3821c5fd60b52a6c82a00bba18531793c93c4addfe64fbf061e235c5617a/onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9158465745423b2b5d97ed25aa7740c7d38d2993ee2e5c3bfacb0c4145c49d8", size = 11950342, upload-time = "2024-11-21T00:49:34.164Z" },
+ { url = "https://files.pythonhosted.org/packages/14/56/fd990ca222cef4f9f4a9400567b9a15b220dee2eafffb16b2adbc55c8281/onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0df6f2df83d61f46e842dbcde610ede27218947c33e994545a22333491e72a3b", size = 13337040, upload-time = "2024-11-21T00:49:37.271Z" },
+]
+
+[[package]]
+name = "onnxruntime"
+version = "1.23.2"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version >= '3.14' and sys_platform == 'darwin'",
+ "python_full_version == '3.13.*' and sys_platform == 'darwin'",
+ "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+ "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+ "python_full_version == '3.12.*' and sys_platform == 'darwin'",
+ "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+ "python_full_version == '3.11.*' and sys_platform == 'darwin'",
+ "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+ "python_full_version < '3.11' and sys_platform == 'darwin'",
+ "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+]
+dependencies = [
+ { name = "coloredlogs", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
+ { name = "flatbuffers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
+ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform == 'darwin')" },
+ { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.11' and sys_platform == 'darwin')" },
+ { name = "packaging", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
+ { name = "protobuf", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
+ { name = "sympy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
+]
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/35/d6/311b1afea060015b56c742f3531168c1644650767f27ef40062569960587/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:a7730122afe186a784660f6ec5807138bf9d792fa1df76556b27307ea9ebcbe3", size = 17195934, upload-time = "2025-10-27T23:06:14.143Z" },
+ { url = "https://files.pythonhosted.org/packages/db/db/81bf3d7cecfbfed9092b6b4052e857a769d62ed90561b410014e0aae18db/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:b28740f4ecef1738ea8f807461dd541b8287d5650b5be33bca7b474e3cbd1f36", size = 19153079, upload-time = "2025-10-27T23:05:57.686Z" },
+ { url = "https://files.pythonhosted.org/packages/2e/4d/a382452b17cf70a2313153c520ea4c96ab670c996cb3a95cc5d5ac7bfdac/onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8f7d1fe034090a1e371b7f3ca9d3ccae2fabae8c1d8844fb7371d1ea38e8e8d2", size = 15219883, upload-time = "2025-10-22T03:46:21.66Z" },
+ { url = "https://files.pythonhosted.org/packages/fb/56/179bf90679984c85b417664c26aae4f427cba7514bd2d65c43b181b7b08b/onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ca88747e708e5c67337b0f65eed4b7d0dd70d22ac332038c9fc4635760018f7", size = 17370357, upload-time = "2025-10-22T03:46:57.968Z" },
+ { url = "https://files.pythonhosted.org/packages/44/be/467b00f09061572f022ffd17e49e49e5a7a789056bad95b54dfd3bee73ff/onnxruntime-1.23.2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:6f91d2c9b0965e86827a5ba01531d5b669770b01775b23199565d6c1f136616c", size = 17196113, upload-time = "2025-10-22T03:47:33.526Z" },
+ { url = "https://files.pythonhosted.org/packages/9f/a8/3c23a8f75f93122d2b3410bfb74d06d0f8da4ac663185f91866b03f7da1b/onnxruntime-1.23.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:87d8b6eaf0fbeb6835a60a4265fde7a3b60157cf1b2764773ac47237b4d48612", size = 19153857, upload-time = "2025-10-22T03:46:37.578Z" },
+ { url = "https://files.pythonhosted.org/packages/3f/d8/506eed9af03d86f8db4880a4c47cd0dffee973ef7e4f4cff9f1d4bcf7d22/onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bbfd2fca76c855317568c1b36a885ddea2272c13cb0e395002c402f2360429a6", size = 15220095, upload-time = "2025-10-22T03:46:24.769Z" },
+ { url = "https://files.pythonhosted.org/packages/e9/80/113381ba832d5e777accedc6cb41d10f9eca82321ae31ebb6bcede530cea/onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da44b99206e77734c5819aa2142c69e64f3b46edc3bd314f6a45a932defc0b3e", size = 17372080, upload-time = "2025-10-22T03:47:00.265Z" },
+ { url = "https://files.pythonhosted.org/packages/1b/9e/f748cd64161213adeef83d0cb16cb8ace1e62fa501033acdd9f9341fff57/onnxruntime-1.23.2-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:b8f029a6b98d3cf5be564d52802bb50a8489ab73409fa9db0bf583eabb7c2321", size = 17195929, upload-time = "2025-10-22T03:47:36.24Z" },
+ { url = "https://files.pythonhosted.org/packages/91/9d/a81aafd899b900101988ead7fb14974c8a58695338ab6a0f3d6b0100f30b/onnxruntime-1.23.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:218295a8acae83905f6f1aed8cacb8e3eb3bd7513a13fe4ba3b2664a19fc4a6b", size = 19157705, upload-time = "2025-10-22T03:46:40.415Z" },
+ { url = "https://files.pythonhosted.org/packages/3c/35/4e40f2fba272a6698d62be2cd21ddc3675edfc1a4b9ddefcc4648f115315/onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76ff670550dc23e58ea9bc53b5149b99a44e63b34b524f7b8547469aaa0dcb8c", size = 15226915, upload-time = "2025-10-22T03:46:27.773Z" },
+ { url = "https://files.pythonhosted.org/packages/ef/88/9cc25d2bafe6bc0d4d3c1db3ade98196d5b355c0b273e6a5dc09c5d5d0d5/onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f9b4ae77f8e3c9bee50c27bc1beede83f786fe1d52e99ac85aa8d65a01e9b77", size = 17382649, upload-time = "2025-10-22T03:47:02.782Z" },
+ { url = "https://files.pythonhosted.org/packages/3d/41/fba0cabccecefe4a1b5fc8020c44febb334637f133acefc7ec492029dd2c/onnxruntime-1.23.2-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:2ff531ad8496281b4297f32b83b01cdd719617e2351ffe0dba5684fb283afa1f", size = 17196337, upload-time = "2025-10-22T03:46:35.168Z" },
+ { url = "https://files.pythonhosted.org/packages/fe/f9/2d49ca491c6a986acce9f1d1d5fc2099108958cc1710c28e89a032c9cfe9/onnxruntime-1.23.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:162f4ca894ec3de1a6fd53589e511e06ecdc3ff646849b62a9da7489dee9ce95", size = 19157691, upload-time = "2025-10-22T03:46:43.518Z" },
+ { url = "https://files.pythonhosted.org/packages/1c/a1/428ee29c6eaf09a6f6be56f836213f104618fb35ac6cc586ff0f477263eb/onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45d127d6e1e9b99d1ebeae9bcd8f98617a812f53f46699eafeb976275744826b", size = 15226898, upload-time = "2025-10-22T03:46:30.039Z" },
+ { url = "https://files.pythonhosted.org/packages/f2/2b/b57c8a2466a3126dbe0a792f56ad7290949b02f47b86216cd47d857e4b77/onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8bace4e0d46480fbeeb7bbe1ffe1f080e6663a42d1086ff95c1551f2d39e7872", size = 17382518, upload-time = "2025-10-22T03:47:05.407Z" },
+ { url = "https://files.pythonhosted.org/packages/7c/3d/6830fa61c69ca8e905f237001dbfc01689a4e4ab06147020a4518318881f/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9d2385e774f46ac38f02b3a91a91e30263d41b2f1f4f26ae34805b2a9ddef466", size = 15229610, upload-time = "2025-10-22T03:46:32.239Z" },
+ { url = "https://files.pythonhosted.org/packages/b6/ca/862b1e7a639460f0ca25fd5b6135fb42cf9deea86d398a92e44dfda2279d/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e2b9233c4947907fd1818d0e581c049c41ccc39b2856cc942ff6d26317cee145", size = 17394184, upload-time = "2025-10-22T03:47:08.127Z" },
+]
+
[[package]]
name = "openai"
version = "2.7.1"
@@ -1733,6 +1989,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/86/8a/69176a64335aed183529207ba8bc3d329c2999d852b4f3818027203f50e6/opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca", size = 39402386, upload-time = "2025-01-16T13:52:56.418Z" },
]
+[[package]]
+name = "openpyxl"
+version = "3.1.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "et-xmlfile" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" },
+]
+
[[package]]
name = "opt-einsum"
version = "3.3.0"
@@ -1821,6 +2089,68 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/a3/6f/042d73453ad01a2e3dd4adeae4870e25679d9eaa340d70bd54cd409cb982/paddlepaddle-3.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:d16fd322246c4a580ca512971840ac8c16126a77d59a7bceee165d8f2196a6b2", size = 101708504, upload-time = "2025-10-30T13:21:18.125Z" },
]
+[[package]]
+name = "pandas"
+version = "2.3.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+ { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+ { name = "python-dateutil" },
+ { name = "pytz" },
+ { name = "tzdata" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/3d/f7/f425a00df4fcc22b292c6895c6831c0c8ae1d9fac1e024d16f98a9ce8749/pandas-2.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c", size = 11555763, upload-time = "2025-09-29T23:16:53.287Z" },
+ { url = "https://files.pythonhosted.org/packages/13/4f/66d99628ff8ce7857aca52fed8f0066ce209f96be2fede6cef9f84e8d04f/pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a", size = 10801217, upload-time = "2025-09-29T23:17:04.522Z" },
+ { url = "https://files.pythonhosted.org/packages/1d/03/3fc4a529a7710f890a239cc496fc6d50ad4a0995657dccc1d64695adb9f4/pandas-2.3.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5caf26f64126b6c7aec964f74266f435afef1c1b13da3b0636c7518a1fa3e2b1", size = 12148791, upload-time = "2025-09-29T23:17:18.444Z" },
+ { url = "https://files.pythonhosted.org/packages/40/a8/4dac1f8f8235e5d25b9955d02ff6f29396191d4e665d71122c3722ca83c5/pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd7478f1463441ae4ca7308a70e90b33470fa593429f9d4c578dd00d1fa78838", size = 12769373, upload-time = "2025-09-29T23:17:35.846Z" },
+ { url = "https://files.pythonhosted.org/packages/df/91/82cc5169b6b25440a7fc0ef3a694582418d875c8e3ebf796a6d6470aa578/pandas-2.3.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4793891684806ae50d1288c9bae9330293ab4e083ccd1c5e383c34549c6e4250", size = 13200444, upload-time = "2025-09-29T23:17:49.341Z" },
+ { url = "https://files.pythonhosted.org/packages/10/ae/89b3283800ab58f7af2952704078555fa60c807fff764395bb57ea0b0dbd/pandas-2.3.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:28083c648d9a99a5dd035ec125d42439c6c1c525098c58af0fc38dd1a7a1b3d4", size = 13858459, upload-time = "2025-09-29T23:18:03.722Z" },
+ { url = "https://files.pythonhosted.org/packages/85/72/530900610650f54a35a19476eca5104f38555afccda1aa11a92ee14cb21d/pandas-2.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:503cf027cf9940d2ceaa1a93cfb5f8c8c7e6e90720a2850378f0b3f3b1e06826", size = 11346086, upload-time = "2025-09-29T23:18:18.505Z" },
+ { url = "https://files.pythonhosted.org/packages/c1/fa/7ac648108144a095b4fb6aa3de1954689f7af60a14cf25583f4960ecb878/pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523", size = 11578790, upload-time = "2025-09-29T23:18:30.065Z" },
+ { url = "https://files.pythonhosted.org/packages/9b/35/74442388c6cf008882d4d4bdfc4109be87e9b8b7ccd097ad1e7f006e2e95/pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45", size = 10833831, upload-time = "2025-09-29T23:38:56.071Z" },
+ { url = "https://files.pythonhosted.org/packages/fe/e4/de154cbfeee13383ad58d23017da99390b91d73f8c11856f2095e813201b/pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66", size = 12199267, upload-time = "2025-09-29T23:18:41.627Z" },
+ { url = "https://files.pythonhosted.org/packages/bf/c9/63f8d545568d9ab91476b1818b4741f521646cbdd151c6efebf40d6de6f7/pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b", size = 12789281, upload-time = "2025-09-29T23:18:56.834Z" },
+ { url = "https://files.pythonhosted.org/packages/f2/00/a5ac8c7a0e67fd1a6059e40aa08fa1c52cc00709077d2300e210c3ce0322/pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791", size = 13240453, upload-time = "2025-09-29T23:19:09.247Z" },
+ { url = "https://files.pythonhosted.org/packages/27/4d/5c23a5bc7bd209231618dd9e606ce076272c9bc4f12023a70e03a86b4067/pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151", size = 13890361, upload-time = "2025-09-29T23:19:25.342Z" },
+ { url = "https://files.pythonhosted.org/packages/8e/59/712db1d7040520de7a4965df15b774348980e6df45c129b8c64d0dbe74ef/pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c", size = 11348702, upload-time = "2025-09-29T23:19:38.296Z" },
+ { url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" },
+ { url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" },
+ { url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" },
+ { url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" },
+ { url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" },
+ { url = "https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" },
+ { url = "https://files.pythonhosted.org/packages/cd/4b/18b035ee18f97c1040d94debd8f2e737000ad70ccc8f5513f4eefad75f4b/pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713", size = 11544671, upload-time = "2025-09-29T23:21:05.024Z" },
+ { url = "https://files.pythonhosted.org/packages/31/94/72fac03573102779920099bcac1c3b05975c2cb5f01eac609faf34bed1ca/pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8", size = 10680807, upload-time = "2025-09-29T23:21:15.979Z" },
+ { url = "https://files.pythonhosted.org/packages/16/87/9472cf4a487d848476865321de18cc8c920b8cab98453ab79dbbc98db63a/pandas-2.3.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d", size = 11709872, upload-time = "2025-09-29T23:21:27.165Z" },
+ { url = "https://files.pythonhosted.org/packages/15/07/284f757f63f8a8d69ed4472bfd85122bd086e637bf4ed09de572d575a693/pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac", size = 12306371, upload-time = "2025-09-29T23:21:40.532Z" },
+ { url = "https://files.pythonhosted.org/packages/33/81/a3afc88fca4aa925804a27d2676d22dcd2031c2ebe08aabd0ae55b9ff282/pandas-2.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c", size = 12765333, upload-time = "2025-09-29T23:21:55.77Z" },
+ { url = "https://files.pythonhosted.org/packages/8d/0f/b4d4ae743a83742f1153464cf1a8ecfafc3ac59722a0b5c8602310cb7158/pandas-2.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493", size = 13418120, upload-time = "2025-09-29T23:22:10.109Z" },
+ { url = "https://files.pythonhosted.org/packages/4f/c7/e54682c96a895d0c808453269e0b5928a07a127a15704fedb643e9b0a4c8/pandas-2.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee", size = 10993991, upload-time = "2025-09-29T23:25:04.889Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/ca/3f8d4f49740799189e1395812f3bf23b5e8fc7c190827d55a610da72ce55/pandas-2.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5", size = 12048227, upload-time = "2025-09-29T23:22:24.343Z" },
+ { url = "https://files.pythonhosted.org/packages/0e/5a/f43efec3e8c0cc92c4663ccad372dbdff72b60bdb56b2749f04aa1d07d7e/pandas-2.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21", size = 11411056, upload-time = "2025-09-29T23:22:37.762Z" },
+ { url = "https://files.pythonhosted.org/packages/46/b1/85331edfc591208c9d1a63a06baa67b21d332e63b7a591a5ba42a10bb507/pandas-2.3.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78", size = 11645189, upload-time = "2025-09-29T23:22:51.688Z" },
+ { url = "https://files.pythonhosted.org/packages/44/23/78d645adc35d94d1ac4f2a3c4112ab6f5b8999f4898b8cdf01252f8df4a9/pandas-2.3.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110", size = 12121912, upload-time = "2025-09-29T23:23:05.042Z" },
+ { url = "https://files.pythonhosted.org/packages/53/da/d10013df5e6aaef6b425aa0c32e1fc1f3e431e4bcabd420517dceadce354/pandas-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86", size = 12712160, upload-time = "2025-09-29T23:23:28.57Z" },
+ { url = "https://files.pythonhosted.org/packages/bd/17/e756653095a083d8a37cbd816cb87148debcfcd920129b25f99dd8d04271/pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc", size = 13199233, upload-time = "2025-09-29T23:24:24.876Z" },
+ { url = "https://files.pythonhosted.org/packages/04/fd/74903979833db8390b73b3a8a7d30d146d710bd32703724dd9083950386f/pandas-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0", size = 11540635, upload-time = "2025-09-29T23:25:52.486Z" },
+ { url = "https://files.pythonhosted.org/packages/21/00/266d6b357ad5e6d3ad55093a7e8efc7dd245f5a842b584db9f30b0f0a287/pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593", size = 10759079, upload-time = "2025-09-29T23:26:33.204Z" },
+ { url = "https://files.pythonhosted.org/packages/ca/05/d01ef80a7a3a12b2f8bbf16daba1e17c98a2f039cbc8e2f77a2c5a63d382/pandas-2.3.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c", size = 11814049, upload-time = "2025-09-29T23:27:15.384Z" },
+ { url = "https://files.pythonhosted.org/packages/15/b2/0e62f78c0c5ba7e3d2c5945a82456f4fac76c480940f805e0b97fcbc2f65/pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b", size = 12332638, upload-time = "2025-09-29T23:27:51.625Z" },
+ { url = "https://files.pythonhosted.org/packages/c5/33/dd70400631b62b9b29c3c93d2feee1d0964dc2bae2e5ad7a6c73a7f25325/pandas-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6", size = 12886834, upload-time = "2025-09-29T23:28:21.289Z" },
+ { url = "https://files.pythonhosted.org/packages/d3/18/b5d48f55821228d0d2692b34fd5034bb185e854bdb592e9c640f6290e012/pandas-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3", size = 13409925, upload-time = "2025-09-29T23:28:58.261Z" },
+ { url = "https://files.pythonhosted.org/packages/a6/3d/124ac75fcd0ecc09b8fdccb0246ef65e35b012030defb0e0eba2cbbbe948/pandas-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5", size = 11109071, upload-time = "2025-09-29T23:32:27.484Z" },
+ { url = "https://files.pythonhosted.org/packages/89/9c/0e21c895c38a157e0faa1fb64587a9226d6dd46452cac4532d80c3c4a244/pandas-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec", size = 12048504, upload-time = "2025-09-29T23:29:31.47Z" },
+ { url = "https://files.pythonhosted.org/packages/d7/82/b69a1c95df796858777b68fbe6a81d37443a33319761d7c652ce77797475/pandas-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7", size = 11410702, upload-time = "2025-09-29T23:29:54.591Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/88/702bde3ba0a94b8c73a0181e05144b10f13f29ebfc2150c3a79062a8195d/pandas-2.3.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450", size = 11634535, upload-time = "2025-09-29T23:30:21.003Z" },
+ { url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" },
+ { url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" },
+ { url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" },
+]
+
[[package]]
name = "pdfminer-six"
version = "20250506"
@@ -2266,6 +2596,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ae/43/2b0607ef7f16d63fbe00de728151a090397ef5b3b9147b4aefe975d17106/pypdfium2-5.0.0-py3-none-win_arm64.whl", hash = "sha256:0a2a473fe95802e7a5f4140f25e5cd036cf17f060f27ee2d28c3977206add763", size = 2939015, upload-time = "2025-10-26T13:31:40.531Z" },
]

+[[package]]
+name = "pyreadline3"
+version = "3.5.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/0f/49/4cea918a08f02817aabae639e3d0ac046fef9f9180518a3ad394e22da148/pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7", size = 99839, upload-time = "2024-09-19T02:40:10.062Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/5a/dc/491b7661614ab97483abf2056be1deee4dc2490ecbf7bff9ab5cdbac86e1/pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6", size = 83178, upload-time = "2024-09-19T02:40:08.598Z" },
+]
+
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@@ -2291,6 +2630,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7", size = 252987, upload-time = "2025-06-16T20:46:22.506Z" },
]

+[[package]]
+name = "python-dotenv"
+version = "1.2.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" },
+]
+
[[package]]
name = "python-pptx"
version = "1.0.2"
@@ -2306,6 +2654,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" },
]

+[[package]]
+name = "pytz"
+version = "2025.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" },
+]
+
[[package]]
name = "pyyaml"
version = "6.0.3"
@@ -2654,7 +3011,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version < '3.11' and sys_platform == 'win32'",
+ "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -2717,14 +3075,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
- "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version >= '3.14' and sys_platform == 'win32'",
+ "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
+ "python_full_version == '3.13.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version == '3.12.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version == '3.11.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -3083,6 +3445,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/97/45/1097a0eb3ad2a1cb5a29d493aefffe4170e454b2726cfe2a76f6652e91af/stringzilla-4.2.3-cp313-cp313-win_arm64.whl", hash = "sha256:1000a8df547fb3b194def48cc3ed6494715fe1c8ac25c6444ed3cfb0b52942c7", size = 99815, upload-time = "2025-10-27T18:35:56.555Z" },
]

+[[package]]
+name = "sympy"
+version = "1.14.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "mpmath" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" },
+]
+
[[package]]
name = "termcolor"
version = "3.2.0"
@@ -3116,7 +3490,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version < '3.11' and sys_platform == 'win32'",
+ "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -3135,14 +3510,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
- "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version >= '3.14' and sys_platform == 'win32'",
+ "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
+ "python_full_version == '3.13.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version == '3.12.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+ "python_full_version == '3.11.*' and sys_platform == 'win32'",
+ "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -3185,6 +3564,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
]

+[[package]]
+name = "tzdata"
+version = "2025.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
+]
+
[[package]]
name = "unidic-lite"
version = "1.0.8"