feat: add document model classes, adjust config and parsing logic, improve logging and imports

Remove logging setup and redundant code; improve imports, type hints, and OCR backend management
Convert module import paths across all files to absolute imports
Adjust import paths, remove some imports, tidy logging and comments
Upgrade the document parser to Docx2Parser; improve timeout and image handling logic
begoniezhao
2025-11-07 10:30:02 +08:00
committed by lyingbug
parent af620806e0
commit 2d66abedf0
39 changed files with 2676 additions and 1570 deletions

7
.gitignore vendored
View File

@@ -24,17 +24,14 @@ node_modules/
tmp/
temp/
# Docker compose файл (локальные настройки)
# docker-compose.yml
WeKnora
/models/
**/__pycache__
test/data/mswag.txt
data/files/
.python-version
.venv/
**/__pycache__
.python-version
### macOS
# General

View File

@@ -127,6 +127,7 @@ services:
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME:-}
- MINIO_USE_SSL=${MINIO_USE_SSL:-}
- WEB_PROXY=${WEB_PROXY:-}
- MINERU_ENDPOINT=${MINERU_ENDPOINT:-}
healthcheck:
test: ["CMD", "grpc_health_probe", "-addr=:50051"]
interval: 30s

View File

@@ -80,10 +80,11 @@ RUN pip install uv --break-system-packages && \
python -m uv sync --locked --no-dev
# 复制源代码和生成脚本
COPY docreader .
COPY docreader docreader
# 生成 protobuf 代码
RUN chmod +x scripts/generate_proto.sh && bash scripts/generate_proto.sh
RUN chmod +x docreader/scripts/generate_proto.sh && \
bash docreader/scripts/generate_proto.sh
# 确保模型目录存在
RUN ls -la /root/.paddleocr/whl/
@@ -150,10 +151,11 @@ RUN python -m playwright install-deps webkit
# COPY docreader/scripts/download_deps.py download_deps.py
# RUN python -m download_deps
COPY --from=builder /app/ ./
COPY docreader/pyproject.toml docreader/uv.lock ./
COPY --from=builder /app/docreader docreader
# 暴露 gRPC 端口
EXPOSE 50051
# 直接运行 Python 服务(日志输出到 stdout/stderr
CMD ["uv", "run", "main.py"]
CMD ["uv", "run", "-m", "docreader.main"]

5
docreader/.pylintrc Normal file
View File

@@ -0,0 +1,5 @@
[LOGGING]
logging-format-style=fstr
[MESSAGES CONTROL]
; disable=W1203
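The new .pylintrc is aimed at the f-string logging style used throughout the service; W1203 (logging-fstring-interpolation) is the pylint message that would otherwise flag such calls. A minimal illustration (logger name and message are placeholders):

import logging

logger = logging.getLogger(__name__)
file_name = "report.docx"

# Lazy %-formatting, the style pylint recommends by default.
logger.info("Parsing file: %s", file_name)

# f-string interpolation, the style used across docreader and the one
# this .pylintrc is written to accommodate.
logger.info(f"Parsing file: {file_name}")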

View File

@@ -1,37 +1,25 @@
import os
import sys
import logging
from concurrent import futures
import os
import re
import sys
import traceback
import grpc
import uuid
import atexit
from concurrent import futures
from typing import Optional
import grpc
from grpc_health.v1 import health_pb2_grpc
from grpc_health.v1.health import HealthServicer
# Add parent directory to Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
from docreader.models.read_config import ChunkingConfig
from docreader.parser import Parser
from docreader.parser.ocr_engine import OCREngine
from docreader.proto import docreader_pb2_grpc
from docreader.proto.docreader_pb2 import Chunk, Image, ReadResponse
from docreader.utils.request import init_logging_request_id, request_id_context
from proto.docreader_pb2 import ReadResponse, Chunk, Image
from proto import docreader_pb2_grpc
from parser import Parser, OCREngine
from parser.config import ChunkingConfig
from utils.request import request_id_context, init_logging_request_id
# --- Encoding utilities: sanitize strings to valid UTF-8 and (optionally) multi-encoding read ---
import re
from typing import Optional
try:
# Optional dependency for charset detection; install via `pip install charset-normalizer`
from charset_normalizer import from_bytes as _cn_from_bytes # type: ignore
except Exception: # pragma: no cover
_cn_from_bytes = None # type: ignore
# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values and cannot be encoded to UTF-8
# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values
# cannot be encoded to UTF-8
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]")
@@ -47,29 +35,6 @@ def to_valid_utf8_text(s: Optional[str]) -> str:
return s.encode("utf-8", errors="replace").decode("utf-8")
def read_text_with_fallback(file_path: str) -> str:
"""Read text from file supporting multiple encodings with graceful fallback.
This server currently receives bytes over gRPC and delegates decoding to the parser.
This helper is provided for future local-file reads if needed.
"""
with open(file_path, "rb") as f:
raw = f.read()
if _cn_from_bytes is not None:
try:
result = _cn_from_bytes(raw).best()
if result:
return str(result)
except Exception:
pass
for enc in ("utf-8", "gb18030", "latin-1"):
try:
return raw.decode(enc, errors="replace")
except UnicodeDecodeError:
continue
return raw.decode("utf-8", errors="replace")
# Ensure no existing handlers
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)
@@ -113,7 +78,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
request.file_type or os.path.splitext(request.file_name)[1][1:]
)
logger.info(
f"Received ReadFromFile request for file: {request.file_name}, type: {file_type}"
f"ReadFromFile for file: {request.file_name}, type: {file_type}"
)
logger.info(f"File content size: {len(request.file_content)} bytes")
@@ -124,8 +89,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
enable_multimodal = request.read_config.enable_multimodal or False
logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
f"multimodal={enable_multimodal}"
f"Using chunking config: size={chunk_size}, "
f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
)
# Get Storage and VLM config from request
@@ -144,7 +109,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
"path_prefix": sc.path_prefix,
}
logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
f"Using Storage config: provider={storage_config.get('provider')}, "
f"bucket={storage_config['bucket_name']}"
)
vlm_config = {
@@ -170,7 +136,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
)
# Parse file
logger.info(f"Starting file parsing process")
logger.info("Starting file parsing process")
result = self.parser.parse_file(
request.file_name, file_type, request.file_content, chunking_config
)
@@ -184,7 +150,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
# Convert to protobuf message
logger.info(
f"Successfully parsed file {request.file_name}, returning {len(result.chunks)} chunks"
f"Parsed file {request.file_name}, with {len(result.chunks)} chunks"
)
# Build response, including image info
@@ -224,8 +190,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
enable_multimodal = request.read_config.enable_multimodal or False
logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
f"multimodal={enable_multimodal}"
f"Using chunking config: size={chunk_size}, "
f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
)
# Get Storage and VLM config from request
@@ -243,7 +209,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
"path_prefix": sc.path_prefix,
}
logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
f"Using Storage config: provider={storage_config.get('provider')}, "
f"bucket={storage_config['bucket_name']}"
)
vlm_config = {
@@ -269,7 +236,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
)
# Parse URL
logger.info(f"Starting URL parsing process")
logger.info("Starting URL parsing process")
result = self.parser.parse_url(
request.url, request.title, chunking_config
)
@@ -282,7 +249,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
# Convert to protobuf message, including image info
logger.info(
f"Successfully parsed URL {request.url}, returning {len(result.chunks)} chunks"
f"Parsed URL {request.url}, returning {len(result.chunks)} chunks"
)
response = ReadResponse(
@@ -335,29 +302,15 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
return proto_chunk
def init_ocr_engine(ocr_backend, ocr_config):
def init_ocr_engine(ocr_backend: Optional[str] = None, **kwargs):
"""Initialize OCR engine"""
try:
logger.info(f"Initializing OCR engine with backend: {ocr_backend}")
ocr_engine = OCREngine.get_instance(backend_type=ocr_backend, **ocr_config)
if ocr_engine:
logger.info("OCR engine initialized successfully")
return True
else:
logger.error("OCR engine initialization failed")
return False
except Exception as e:
logger.error(f"Error initializing OCR engine: {str(e)}")
return False
backend_type = ocr_backend or os.getenv("OCR_BACKEND", "paddle")
logger.info(f"Initializing OCR engine with backend: {backend_type}")
OCREngine.get_instance(backend_type=backend_type, **kwargs)
def main():
init_ocr_engine(
os.getenv("OCR_BACKEND", "paddle"),
{
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
},
)
init_ocr_engine()
# Set max number of worker threads
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
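A sketch of driving the simplified initializer, assuming the server module is importable as docreader.main (as the Dockerfile CMD suggests) and that OCREngine.get_instance forwards extra keyword config to the selected backend:

import os

from docreader.main import init_ocr_engine

# Rely on the environment default: OCR_BACKEND, falling back to "paddle".
init_ocr_engine()

# Or pick a backend explicitly and pass backend-specific settings; the
# OCR_API_BASE_URL keyword mirrors the old dict-based call and is illustrative.
init_ocr_engine("paddle", OCR_API_BASE_URL=os.getenv("OCR_API_BASE_URL", ""))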

View File

View File

@@ -0,0 +1,87 @@
"""Chunk document schema."""
import json
from typing import Any, Dict, List
from pydantic import BaseModel, Field
class Chunk(BaseModel):
"""Document Chunk including chunk content, chunk metadata."""
content: str = Field(default="", description="chunk text content")
seq: int = Field(default=0, description="Chunk sequence number")
start: int = Field(default=0, description="Chunk start position")
end: int = Field(description="Chunk end position")
images: List[Dict[str, Any]] = Field(
default_factory=list, description="Images in the chunk"
)
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="metadata fields",
)
def to_dict(self, **kwargs: Any) -> Dict[str, Any]:
"""Convert Chunk to dict."""
data = self.model_dump()
data.update(kwargs)
data["class_name"] = self.__class__.__name__
return data
def to_json(self, **kwargs: Any) -> str:
"""Convert Chunk to json."""
data = self.to_dict(**kwargs)
return json.dumps(data)
def __hash__(self):
"""Hash function."""
return hash((self.content,))
def __eq__(self, other):
"""Equal function."""
return self.content == other.content
@classmethod
def from_dict(cls, data: Dict[str, Any], **kwargs: Any): # type: ignore
"""Create Chunk from dict."""
if isinstance(kwargs, dict):
data.update(kwargs)
data.pop("class_name", None)
return cls(**data)
@classmethod
def from_json(cls, data_str: str, **kwargs: Any): # type: ignore
"""Create Chunk from json."""
data = json.loads(data_str)
return cls.from_dict(data, **kwargs)
class Document(BaseModel):
"""Document including document content, document metadata."""
model_config = {"arbitrary_types_allowed": True}
content: str = Field(default="", description="document text content")
images: Dict[str, str] = Field(
default_factory=dict, description="Images in the document"
)
chunks: List[Chunk] = Field(default_factory=list, description="document chunks")
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="metadata fields",
)
def set_content(self, content: str) -> None:
"""Set document content."""
self.content = content
def get_content(self) -> str:
"""Get document content."""
return self.content
def is_valid(self) -> bool:
return self.content != ""
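A minimal usage sketch of the new models with the fields defined above; note that Chunk.end has no default and must be supplied (the offsets here are illustrative):

from docreader.models.document import Chunk, Document

doc = Document(content="# Title\n\nHello world")
doc.chunks = [
    Chunk(content="# Title", seq=0, start=0, end=7),
    Chunk(content="Hello world", seq=1, start=9, end=20),
]

assert doc.is_valid()                    # non-empty content
payload = doc.chunks[0].to_json()        # adds class_name="Chunk"
restored = Chunk.from_json(payload)      # from_dict drops class_name again
assert restored == doc.chunks[0]         # equality compares content only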

View File

@@ -0,0 +1,27 @@
from dataclasses import dataclass, field
@dataclass
class ChunkingConfig:
"""
Configuration for text chunking process.
Controls how documents are split into smaller pieces for processing.
"""
# Maximum size of each chunk in tokens/chars
chunk_size: int = 512
# Number of tokens/chars to overlap between chunks
chunk_overlap: int = 50
# Text separators in order of priority
separators: list = field(default_factory=lambda: ["\n\n", "\n", ""])
# Whether to enable multimodal processing (text + images)
enable_multimodal: bool = False
# Preferred field name going forward
storage_config: dict[str, str] = field(default_factory=dict)
# VLM configuration for image captioning
vlm_config: dict[str, str] = field(default_factory=dict)
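A sketch of building the config the way the gRPC servicer does, using only the fields defined here (the dict values are placeholders):

from docreader.models.read_config import ChunkingConfig

config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=50,
    enable_multimodal=True,
    # Plain str->str mappings handed through to the storage backend
    # and to the VLM caption service respectively.
    storage_config={"provider": "minio", "bucket_name": "weknora-docs"},
    vlm_config={"base_url": "http://localhost:8000/v1", "model_name": "qwen-vl"},
)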

View File

@@ -13,22 +13,18 @@ The parsers extract content from documents and can split them into
meaningful chunks for further processing and indexing.
"""
from .base_parser import BaseParser, ParseResult
from .docx_parser import DocxParser
from .doc_parser import DocParser
from .pdf_parser import PDFParser
from .markdown_parser import MarkdownParser
from .text_parser import TextParser
from .docx2_parser import Docx2Parser
from .image_parser import ImageParser
from .web_parser import WebParser
from .markdown_parser import MarkdownParser
from .parser import Parser
from .config import ChunkingConfig
from .ocr_engine import OCREngine
from .pdf_parser import PDFParser
from .text_parser import TextParser
from .web_parser import WebParser
# Export public classes and modules
__all__ = [
"BaseParser", # Base parser class that all format parsers inherit from
"DocxParser", # Parser for .docx files (modern Word documents)
"Docx2Parser", # Parser for .docx files (modern Word documents)
"DocParser", # Parser for .doc files (legacy Word documents)
"PDFParser", # Parser for PDF documents
"MarkdownParser", # Parser for Markdown text files
@@ -36,7 +32,4 @@ __all__ = [
"ImageParser", # Parser for images with text content
"WebParser", # Parser for web pages
"Parser", # Main parser factory that selects the appropriate parser
"ChunkingConfig", # Configuration for text chunking behavior
"ParseResult", # Standard result format returned by all parsers
"OCREngine", # OCR engine for extracting text from images
]

View File

@@ -1,65 +1,28 @@
# -*- coding: utf-8 -*-
import re
import os
import asyncio
from typing import List, Dict, Any, Optional, Tuple, Union
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
import logging
import sys
import traceback
import numpy as np
import time
import io
import json
from .ocr_engine import OCREngine
from .image_utils import image_to_base64
from .config import ChunkingConfig
from .storage import create_storage
import logging
import os
import re
import time
from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Tuple
import requests
from PIL import Image
# Add parent directory to Python path for src imports
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
try:
from services.docreader.src.parser.caption import Caption
except ImportError:
# Fallback: try relative import
try:
from .caption import Caption
except ImportError:
# If both imports fail, set to None
Caption = None
logging.warning(
"Failed to import Caption, image captioning will be unavailable"
)
from docreader.models.document import Chunk, Document
from docreader.models.read_config import ChunkingConfig
from docreader.parser.caption import Caption
from docreader.parser.ocr_engine import OCREngine
from docreader.parser.storage import create_storage
from docreader.splitter.splitter import TextSplitter
from docreader.utils import endecode
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
@dataclass
class Chunk:
"""Chunk result"""
content: str # Chunk content
seq: int # Chunk sequence number
start: int # Chunk start position
end: int # Chunk end position
images: List[Dict[str, Any]] = field(default_factory=list) # Images in the chunk
@dataclass
class ParseResult:
"""Parse result"""
text: str # Extracted text content
chunks: Optional[List[Chunk]] = None # Chunk results
class BaseParser(ABC):
"""Base parser interface"""
@@ -97,17 +60,17 @@ class BaseParser(ABC):
def __init__(
self,
file_name: str = "",
file_type: str = None,
file_type: Optional[str] = None,
enable_multimodal: bool = True,
chunk_size: int = 1000,
chunk_overlap: int = 200,
separators: list = ["\n\n", "\n", ""],
separators: list[str] = ["\n\n", "\n", ""],
ocr_backend: str = "paddle",
ocr_config: dict = None,
ocr_config: dict = {},
max_image_size: int = 1920, # Maximum image size
max_concurrent_tasks: int = 5, # Max concurrent tasks
max_chunks: int = 1000, # Max number of returned chunks
chunking_config: ChunkingConfig = None, # Chunking configuration object
chunking_config: Optional[ChunkingConfig] = None,
):
"""Initialize parser
@@ -125,7 +88,6 @@ class BaseParser(ABC):
max_chunks: Max number of returned chunks
"""
# Storage client instance
self._storage = None
self.file_name = file_name
self.file_type = file_type or os.path.splitext(file_name)[1]
self.enable_multimodal = enable_multimodal
@@ -133,15 +95,16 @@ class BaseParser(ABC):
self.chunk_overlap = chunk_overlap
self.separators = separators
self.ocr_backend = os.getenv("OCR_BACKEND", ocr_backend)
self.ocr_config = ocr_config or {}
self.ocr_config = ocr_config
self.max_image_size = max_image_size
self.max_concurrent_tasks = max_concurrent_tasks
self.max_chunks = max_chunks
self.chunking_config = chunking_config
logger.info(
f"Initializing {self.__class__.__name__} for file: {file_name}, type: {self.file_type}"
self.storage = create_storage(
self.chunking_config.storage_config if self.chunking_config else None
)
logger.info(f"Initializing parser for file: {file_name}, type: {file_type}")
logger.info(
f"Parser config: chunk_size={chunk_size}, "
f"overlap={chunk_overlap}, "
@@ -150,16 +113,24 @@ class BaseParser(ABC):
f"max_chunks={max_chunks}"
)
# Only initialize Caption service if multimodal is enabled
if self.enable_multimodal:
try:
self.caption_parser = Caption(self.chunking_config.vlm_config)
except Exception as e:
logger.warning(f"Failed to initialize Caption service: {str(e)}")
self.caption_parser = None
else:
self.caption_parser = None
vlm_config = self.chunking_config.vlm_config if self.chunking_config else None
self.caption_parser = (
Caption(vlm_config=vlm_config) if self.enable_multimodal else None
)
def perform_ocr(self, image):
@abstractmethod
def parse_into_text(self, content: bytes) -> Document:
"""Parse document content
Args:
content: Document content
Returns:
Either a string containing the parsed text, or a tuple of (text, image_map)
where image_map is a dict mapping image URLs to Image objects
"""
def perform_ocr(self, image: Image.Image):
"""Execute OCR recognition on the image
Args:
@@ -170,53 +141,23 @@ class BaseParser(ABC):
"""
start_time = time.time()
logger.info("Starting OCR recognition")
resized_image = None
try:
# Resize image to avoid processing large images
resized_image = self._resize_image_if_needed(image)
# Resize image to avoid processing large images
resized_image = self._resize_image_if_needed(image)
# Get OCR engine
ocr_engine = self.get_ocr_engine(
backend_type=self.ocr_backend, **self.ocr_config
)
if ocr_engine is None:
logger.error(
f"OCR engine ({self.ocr_backend}) initialization failed or unavailable, "
"skipping OCR recognition"
)
return ""
# Get OCR engine
ocr_engine = OCREngine.get_instance(self.ocr_backend)
# Execute OCR prediction
logger.info(f"Executing OCR prediction (using {self.ocr_backend} engine)")
# Add extra exception handling
try:
ocr_result = ocr_engine.predict(resized_image)
except RuntimeError as e:
# Handle common CUDA memory issues or other runtime errors
logger.error(f"OCR prediction runtime error: {str(e)}")
return ""
except Exception as e:
# Handle other prediction errors
logger.error(f"Unexpected OCR prediction error: {str(e)}")
return ""
# Execute OCR prediction
logger.info(f"Executing OCR prediction (using {self.ocr_backend} engine)")
ocr_result = ocr_engine.predict(resized_image)
process_time = time.time() - start_time
logger.info(f"OCR recognition completed, time: {process_time:.2f} seconds")
return ocr_result
except Exception as e:
process_time = time.time() - start_time
logger.error(
f"OCR recognition error: {str(e)}, time: {process_time:.2f} seconds"
)
return ""
finally:
# Release image resources
if resized_image is not image and hasattr(resized_image, "close"):
# Only close the new image we created, not the original image
resized_image.close()
process_time = time.time() - start_time
logger.info(f"OCR recognition completed, time: {process_time:.2f} seconds")
def _resize_image_if_needed(self, image):
return ocr_result
def _resize_image_if_needed(self, image: Image.Image) -> Image.Image:
"""Resize image if it exceeds maximum size limit
Args:
@@ -225,102 +166,21 @@ class BaseParser(ABC):
Returns:
Resized image object
"""
try:
# If it's a PIL Image
if hasattr(image, "size"):
width, height = image.size
if width > self.max_image_size or height > self.max_image_size:
logger.info(f"Resizing PIL image, original size: {width}x{height}")
scale = min(
self.max_image_size / width, self.max_image_size / height
)
new_width = int(width * scale)
new_height = int(height * scale)
resized_image = image.resize((new_width, new_height))
logger.info(f"Resized to: {new_width}x{new_height}")
return resized_image
else:
logger.info(
f"PIL image size {width}x{height} is within limits, no resizing needed"
)
return image
# If it's a numpy array
elif hasattr(image, "shape"):
height, width = image.shape[:2]
if width > self.max_image_size or height > self.max_image_size:
logger.info(
f"Resizing numpy image, original size: {width}x{height}"
)
scale = min(
self.max_image_size / width, self.max_image_size / height
)
new_width = int(width * scale)
new_height = int(height * scale)
# Use PIL for resizing numpy arrays
pil_image = Image.fromarray(image)
resized_pil = pil_image.resize((new_width, new_height))
resized_image = np.array(resized_pil)
logger.info(f"Resized to: {new_width}x{new_height}")
return resized_image
else:
logger.info(
f"Numpy image size {width}x{height} is within limits, no resizing needed"
)
return image
else:
logger.warning(f"Unknown image type: {type(image)}, cannot resize")
return image
except Exception as e:
logger.error(f"Error resizing image: {str(e)}")
return image
width, height = image.size
if width > self.max_image_size or height > self.max_image_size:
logger.info(f"Resizing PIL image, original size: {width}x{height}")
scale = min(self.max_image_size / width, self.max_image_size / height)
new_width = int(width * scale)
new_height = int(height * scale)
resized_image = image.resize((new_width, new_height))
logger.info(f"Resized to: {new_width}x{new_height}")
return resized_image
def process_image(self, image, image_url=None):
"""Process image: first perform OCR, then get caption if text is available
logger.info(f"PIL image size is {width}x{height}, no resizing needed")
return image
Args:
image: Image object (PIL.Image or numpy array)
image_url: Image URL (if uploaded)
Returns:
tuple: (ocr_text, caption, image_url)
- ocr_text: OCR extracted text
- caption: Image description (if OCR has text) or empty string
- image_url: Image URL (if provided)
"""
logger.info("Starting image processing (OCR + optional caption)")
# Resize image
image = self._resize_image_if_needed(image)
# Perform OCR recognition
ocr_text = self.perform_ocr(image)
caption = ""
if self.caption_parser:
logger.info(
f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption"
)
# Convert image to base64 for caption generation
img_base64 = image_to_base64(image)
if img_base64:
caption = self.get_image_caption(img_base64)
if caption:
logger.info(f"Successfully obtained image caption: {caption}")
else:
logger.warning("Failed to get caption")
else:
logger.warning("Failed to convert image to base64")
caption = ""
else:
logger.info("Caption service not initialized, skipping caption retrieval")
# Release image resources
del image
return ocr_text, caption, image_url
async def process_image_async(self, image, image_url=None):
"""Asynchronously process image: first perform OCR, then get caption if text is available
async def process_image_async(self, image: Image.Image, image_url: str):
"""Asynchronously process image: first perform OCR, then get caption
Args:
image: Image object (PIL.Image or numpy array)
@@ -333,84 +193,47 @@ class BaseParser(ABC):
- image_url: Image URL (if provided)
"""
logger.info("Starting asynchronous image processing (OCR + optional caption)")
resized_image = None
# Resize image
resized_image = self._resize_image_if_needed(image)
try:
# Resize image
resized_image = self._resize_image_if_needed(image)
# Perform OCR recognition (using run_in_executor to execute synchronous operations in the event loop)
# Perform OCR recognition
loop = asyncio.get_event_loop()
try:
# Add timeout mechanism to avoid infinite blocking (30 seconds timeout)
ocr_task = loop.run_in_executor(None, self.perform_ocr, resized_image)
ocr_text = await asyncio.wait_for(ocr_task, timeout=30.0)
except asyncio.TimeoutError:
logger.error(
"OCR processing timed out (30 seconds), skipping this image"
)
ocr_text = ""
except Exception as e:
logger.error(f"OCR processing error: {str(e)}")
logger.error(f"OCR processing error, skipping this image: {str(e)}")
ocr_text = ""
logger.info(
f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption"
)
caption = ""
if self.caption_parser:
try:
# Convert image to base64 for caption generation
img_base64 = image_to_base64(resized_image)
if img_base64:
# Add timeout to avoid blocking caption retrieval (30 seconds timeout)
caption_task = self.get_image_caption_async(img_base64)
image_data, caption = await asyncio.wait_for(
caption_task, timeout=30.0
)
if caption:
logger.info(
f"Successfully obtained image caption: {caption}"
)
else:
logger.warning("Failed to get caption")
else:
logger.warning("Failed to convert image to base64")
caption = ""
except asyncio.TimeoutError:
logger.warning("Caption retrieval timed out, skipping")
except Exception as e:
logger.error(f"Failed to get caption: {str(e)}")
else:
logger.info(
"Caption service not initialized, skipping caption retrieval"
)
logger.info(f"Successfully obtained image ocr: {ocr_text}")
img_base64 = endecode.decode_image(resized_image)
caption = self.get_image_caption(img_base64)
logger.info(f"Successfully obtained image caption: {caption}")
return ocr_text, caption, image_url
finally:
# Release image resources
if resized_image is not image and hasattr(resized_image, "close"):
# Only close the new image we created, not the original image
resized_image.close()
resized_image.close()
async def process_with_limit(self, idx, image, url, semaphore):
async def process_with_limit(
self, idx: int, image: Image.Image, url: str, semaphore: asyncio.Semaphore
):
"""Function to process a single image using a semaphore"""
try:
logger.info(f"Waiting to process image {idx+1}")
logger.info(f"Waiting to process image {idx + 1}")
async with semaphore: # Use semaphore to control concurrency
logger.info(f"Starting to process image {idx+1}")
logger.info(f"Starting to process image {idx + 1}")
result = await self.process_image_async(image, url)
logger.info(f"Completed processing image {idx+1}")
logger.info(f"Completed processing image {idx + 1}")
return result
except Exception as e:
logger.error(f"Error processing image {idx+1}: {str(e)}")
logger.error(f"Error processing image {idx + 1}: {str(e)}")
return ("", "", url) # Return empty result to avoid overall failure
finally:
# Manually release image resources
if hasattr(image, "close"):
image.close()
image.close()
async def process_multiple_images(self, images_data):
async def process_multiple_images(self, images_data: List[Tuple[Image.Image, str]]):
"""Process multiple images concurrently
Args:
@@ -450,7 +273,7 @@ class BaseParser(ABC):
for i, result in enumerate(completed_results):
if isinstance(result, Exception):
logger.error(
f"Image {i+1} processing returned an exception: {str(result)}"
f"Image {i + 1} processing returned an exception: {str(result)}"
)
# For exceptions, add empty results
if i < len(images_data):
@@ -467,47 +290,10 @@ class BaseParser(ABC):
logger.info("Image processing resource cleanup complete")
logger.info(
f"Completed concurrent processing of {len(results)}/{len(images_data)} images"
f"Concurrent processing of {len(results)}/{len(images_data)} images"
)
return results
def decode_bytes(self, content: bytes) -> str:
"""Intelligently decode byte stream, supports multiple encodings
Tries to decode in common encodings, if all fail, uses latin-1 as fallback
Args:
content: Byte stream to decode
Returns:
Decoded string
"""
logger.info(f"Attempting to decode bytes of length: {len(content)}")
# Common encodings, sorted by priority
encodings = ["utf-8", "gb18030", "gb2312", "gbk", "big5", "ascii", "latin-1"]
text = None
# Try decoding with each encoding format
for encoding in encodings:
try:
text = content.decode(encoding)
logger.info(f"Successfully decoded content using {encoding} encoding")
break
except UnicodeDecodeError:
logger.info(f"Failed to decode using {encoding} encoding")
continue
# If all encodings fail, use latin-1 as fallback
if text is None:
text = content.decode("latin-1")
logger.warning(
f"Unable to determine correct encoding, using latin-1 as fallback. "
f"This may cause character issues."
)
logger.info(f"Decoded text length: {len(text)} characters")
return text
def get_image_caption(self, image_data: str) -> str:
"""Get image description
@@ -517,6 +303,9 @@ class BaseParser(ABC):
Returns:
Image description
"""
if not self.caption_parser:
logger.warning("Caption parser not initialized")
return ""
start_time = time.time()
logger.info(
f"Getting caption for image: {image_data[:250]}..."
@@ -533,80 +322,7 @@ class BaseParser(ABC):
logger.warning("Failed to get caption for image")
return caption
async def get_image_caption_async(self, image_data: str) -> Tuple[str, str]:
"""Asynchronously get image description
Args:
image_data: Image data (base64 encoded string or URL)
Returns:
Tuple[str, str]: Image data and corresponding description
"""
caption = self.get_image_caption(image_data)
return image_data, caption
def __init_storage(self):
"""Initialize storage client based on configuration"""
if self._storage is None:
storage_config = (
self.chunking_config.storage_config if self.chunking_config else None
)
self._storage = create_storage(storage_config)
logger.info(
f"Initialized storage client: {self._storage.__class__.__name__}"
)
return self._storage
def upload_file(self, file_path: str) -> str:
"""Upload file to object storage
Args:
file_path: File path
Returns:
File URL
"""
logger.info(f"Uploading file: {file_path}")
try:
storage = self.__init_storage()
return storage.upload_file(file_path)
except Exception as e:
logger.error(f"Failed to upload file: {str(e)}")
return ""
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to object storage
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
logger.info(f"Uploading bytes content, size: {len(content)} bytes")
try:
storage = self.__init_storage()
return storage.upload_bytes(content, file_ext)
except Exception as e:
logger.error(f"Failed to upload bytes to storage: {str(e)}")
traceback.print_exc()
return ""
@abstractmethod
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""Parse document content
Args:
content: Document content
Returns:
Either a string containing the parsed text, or a tuple of (text, image_map)
where image_map is a dict mapping image URLs to Image objects
"""
pass
def parse(self, content: bytes) -> ParseResult:
def parse(self, content: bytes) -> Document:
"""Parse document content
Args:
@@ -616,17 +332,19 @@ class BaseParser(ABC):
Parse result
"""
logger.info(
f"Parsing document with {self.__class__.__name__}, content size: {len(content)} bytes"
f"Parsing document with {self.__class__.__name__}, bytes: {len(content)}"
)
parse_result = self.parse_into_text(content)
if isinstance(parse_result, tuple):
text, image_map = parse_result
else:
text = parse_result
image_map = {}
logger.info(f"Extracted {len(text)} characters of text from {self.file_name}")
logger.info(f"Beginning chunking process for text")
chunks = self.chunk_text(text)
document = self.parse_into_text(content)
logger.info(
f"Extracted {len(document.content)} characters from {self.file_name}"
)
splitter = TextSplitter(
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
separators=self.separators,
)
chunk_str = splitter.split_text(document.content)
chunks = self._str_to_chunk(chunk_str)
logger.info(f"Created {len(chunks)} chunks from document")
# Limit the number of returned chunks
@@ -636,7 +354,7 @@ class BaseParser(ABC):
)
chunks = chunks[: self.max_chunks]
# If multimodal is enabled and file type is supported, process images in each chunk
# If multimodal is enabled and file type is supported, process images
if self.enable_multimodal:
# Get file extension and convert to lowercase
file_ext = (
@@ -647,11 +365,12 @@ class BaseParser(ABC):
# Define allowed file types for image processing
allowed_types = [
".pdf", # PDF files
# Text files
".pdf",
".md",
".markdown", # Markdown files
".markdown",
".doc",
".docx", # Word documents
".docx",
# Image files
".jpg",
".jpeg",
@@ -666,13 +385,21 @@ class BaseParser(ABC):
logger.info(
f"Processing images in each chunk for file type: {file_ext}"
)
chunks = self.process_chunks_images(chunks, image_map)
chunks = self.process_chunks_images(chunks, document.images)
else:
logger.info(
f"Skipping image processing for unsupported file type: {file_ext}"
)
return ParseResult(text=text, chunks=chunks)
document.chunks = chunks
return document
def _str_to_chunk(self, text: List[Tuple[int, int, str]]) -> List[Chunk]:
"""Convert string to Chunk object"""
return [
Chunk(seq=i, content=t, start=start, end=end)
for i, (start, end, t) in enumerate(text)
]
def _split_into_units(self, text: str) -> List[str]:
"""
@@ -682,9 +409,7 @@ class BaseParser(ABC):
Returns:
基本单元的列表
"""
logger.info(
f"Splitting text into basic units with robust structure protection, text length: {len(text)}"
)
logger.info(f"Splitting text into basic units, text length: {len(text)}")
# 定义所有需要作为整体保护的结构模式 ---
table_pattern = r"(?m)(^\|.*\|[ \t]*\r?\n(?:[ \t]*\r?\n)?^\|\s*:?--+.*\r?\n(?:^\|.*\|\r?\n?)*)"
@@ -710,7 +435,8 @@ class BaseParser(ABC):
# 按起始位置排序
protected_ranges.sort(key=lambda x: x[0])
logger.info(
f"Found {len(protected_ranges)} protected structures (tables, code, formulas, images, links)."
f"Found {len(protected_ranges)} protected structures "
"(tables, code, formulas, images, links)."
)
# 合并可能重叠的保护范围 ---
@@ -731,7 +457,7 @@ class BaseParser(ABC):
merged_ranges.append((current_start, current_end))
protected_ranges = merged_ranges
logger.info(
f"After merging overlaps, {len(protected_ranges)} protected ranges remain."
f"After overlaps, {len(protected_ranges)} protected ranges remain."
)
# 根据保护范围和分隔符来分割文本 ---
@@ -749,7 +475,7 @@ class BaseParser(ABC):
segments = re.split(separator_pattern, pre_text)
units.extend([s for s in segments if s]) # 添加所有非空部分
# b. 将整个受保护的块(例如,一个完整的表格)作为一个单独的、不可分割的单元添加
# b. 将整个受保护的块(例如,一个完整的表格)作为一个不可分割的单元添加
protected_text = text[start:end]
units.append(protected_text)
@@ -764,38 +490,6 @@ class BaseParser(ABC):
logger.info(f"Text splitting complete, created {len(units)} final basic units.")
return units
def _find_complete_units(self, units: List[str], target_size: int) -> List[str]:
"""Find a list of complete units that do not exceed the target size
Args:
units: List of units
target_size: Target size
Returns:
List of complete units
"""
logger.info(f"Finding complete units with target size: {target_size}")
result = []
current_size = 0
for unit in units:
unit_size = len(unit)
if current_size + unit_size > target_size and result:
logger.info(
f"Reached target size limit at {current_size} characters, stopping"
)
break
result.append(unit)
current_size += unit_size
logger.info(
f"Added unit of size {unit_size}, current total: {current_size}/{target_size}"
)
logger.info(
f"Found {len(result)} complete units totaling {current_size} characters"
)
return result
def chunk_text(self, text: str) -> List[Chunk]:
"""Chunk text, preserving Markdown structure
@@ -825,7 +519,7 @@ class BaseParser(ABC):
for i, unit in enumerate(units):
unit_size = len(unit)
logger.info(f"Processing unit {i+1}/{len(units)}, size: {unit_size}")
logger.info(f"Processing unit {i + 1}/{len(units)}, size: {unit_size}")
# If current chunk plus new unit exceeds size limit, create new chunk
if current_size + unit_size > self.chunk_size and current_chunk:
@@ -855,14 +549,12 @@ class BaseParser(ABC):
for u in reversed(current_chunk):
if overlap_size + len(u) > overlap_target:
logger.info(
f"Reached overlap target ({overlap_size}/{overlap_target})"
f"Overlap target ({overlap_size}/{overlap_target})"
)
break
overlap_units.insert(0, u)
overlap_size += len(u)
logger.info(
f"Added unit to overlap, current overlap size: {overlap_size}"
)
logger.info(f"Added unit to overlap, size: {overlap_size}")
# Remove elements from overlap that are included in separators
start_index = 0
@@ -883,7 +575,7 @@ class BaseParser(ABC):
overlap_units = overlap_units[start_index:]
logger.info(
f"Final overlap: {len(overlap_units)} units, {overlap_size} characters"
f"Overlap: {len(overlap_units)} units, {overlap_size} size"
)
current_chunk = overlap_units
@@ -899,7 +591,7 @@ class BaseParser(ABC):
current_chunk.append(unit)
current_size += unit_size
logger.info(
f"Added unit to current chunk, now at {current_size}/{self.chunk_size} characters"
f"Added unit to current chunk, at {current_size}/{self.chunk_size}"
)
# Add the last chunk
@@ -925,12 +617,13 @@ class BaseParser(ABC):
chunk: Document chunk
Returns:
List of image information, each element contains image URL and match position
List of image information
"""
logger.info(f"Extracting image information from Chunk #{chunk.seq}")
text = chunk.content
# Regex to extract image information from text, supporting Markdown images and HTML images
# Regex to extract image information from text,
# support: Markdown images, HTML images
img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)|<img [^>]*src="([^"]+)" [^>]*>'
# Extract image information
@@ -954,28 +647,28 @@ class BaseParser(ABC):
images_info.append(image_info)
logger.info(
f"Image in Chunk #{chunk.seq} {match_idx+1}: " f"URL={img_url[:50]}..."
f"Image in Chunk #{chunk.seq} {match_idx + 1}: URL={img_url[:50]}..."
if len(img_url) > 50
else f"Image in Chunk #{chunk.seq} {match_idx+1}: URL={img_url}"
else f"Image in Chunk #{chunk.seq} {match_idx + 1}: URL={img_url}"
)
return images_info
async def download_and_upload_image(self, img_url: str):
"""Download image and upload to object storage, if it's already an object storage path or local path, use directly
async def download_and_upload_image(
self, img_url: str
) -> Tuple[str, str, Image.Image | None]:
"""Download image and upload to object storage,
if it's already an object storage path or local path, use directly
Args:
img_url: Image URL or local path
Returns:
tuple: (original URL, storage URL, image object), if failed returns (original URL, None, None)
tuple: (original URL, storage URL, image object),
if failed returns (original URL, None, None)
"""
try:
import requests
from PIL import Image
import io
# Check if it's already a storage URL (COS or MinIO)
is_storage_url = any(
pattern in img_url
@@ -997,12 +690,7 @@ class BaseParser(ABC):
response = requests.get(img_url, timeout=5, proxies=proxies)
if response.status_code == 200:
image = Image.open(io.BytesIO(response.content))
try:
return img_url, img_url, image
finally:
# Ensure image resources are also released after the function returns
# Image will be closed by the caller
pass
return img_url, img_url, image
else:
logger.warning(
f"Failed to get storage image: {response.status_code}"
@@ -1022,7 +710,7 @@ class BaseParser(ABC):
# Upload to storage
with open(img_url, "rb") as f:
content = f.read()
storage_url = self.upload_bytes(content)
storage_url = self.storage.upload_bytes(content)
logger.info(
f"Successfully uploaded local image to storage: {storage_url}"
)
@@ -1031,7 +719,7 @@ class BaseParser(ABC):
logger.error(f"Error processing local image: {str(e)}")
if image and hasattr(image, "close"):
image.close()
return img_url, None, None
return img_url, img_url, None
# Normal remote URL download handling
else:
@@ -1044,9 +732,7 @@ class BaseParser(ABC):
if https_proxy:
proxies["https"] = https_proxy
logger.info(
f"Downloading image {img_url}, using proxy: {proxies if proxies else 'None'}"
)
logger.info(f"Downloading image {img_url}, using proxy: {proxies}")
response = requests.get(img_url, timeout=5, proxies=proxies)
if response.status_code == 200:
@@ -1054,7 +740,7 @@ class BaseParser(ABC):
image = Image.open(io.BytesIO(response.content))
try:
# Upload to storage using the method in BaseParser
storage_url = self.upload_bytes(response.content)
storage_url = self.storage.upload_bytes(response.content)
logger.info(
f"Successfully uploaded image to storage: {storage_url}"
)
@@ -1064,11 +750,11 @@ class BaseParser(ABC):
pass
else:
logger.warning(f"Failed to download image: {response.status_code}")
return img_url, None, None
return img_url, img_url, None
except Exception as e:
logger.error(f"Error downloading or processing image: {str(e)}")
return img_url, None, None
return img_url, img_url, None
async def process_chunk_images_async(
self, chunk, chunk_idx, total_chunks, image_map=None
@@ -1086,18 +772,19 @@ class BaseParser(ABC):
"""
logger.info(
f"Starting to process images in Chunk #{chunk_idx+1}/{total_chunks}"
f"Starting to process images in Chunk #{chunk_idx + 1}/{total_chunks}"
)
# Extract image information from the Chunk
images_info = self.extract_images_from_chunk(chunk)
if not images_info:
logger.info(f"Chunk #{chunk_idx+1} found no images")
logger.info(f"Chunk #{chunk_idx + 1} found no images")
return chunk
# Prepare images that need to be downloaded and processed
images_to_process = []
url_to_info_map = {} # Map URL to image information
# Map URL to image information
url_to_info_map = {}
# Record all image URLs that need to be processed
for img_info in images_info:
@@ -1106,14 +793,21 @@ class BaseParser(ABC):
results = []
download_tasks = []
for img_url in url_to_info_map.keys(): # Check if image is already in the image_map
# Check if image is already in the image_map
for img_url in url_to_info_map.keys():
if image_map and img_url in image_map:
logger.info(f"Image already in image_map: {img_url}, using cached object")
results.append((img_url, img_url, image_map[img_url]))
logger.info(
f"Image already in image_map: {img_url}, using cached object"
)
image = Image.open(
io.BytesIO(endecode.encode_image(image_map[img_url]))
)
results.append((img_url, img_url, image))
else:
download_task = self.download_and_upload_image(img_url)
download_tasks.append(download_task)
# Concurrent download and upload of images, ignore images that are already in the image_map
# Concurrent download and upload of images,
# ignore images that are already in the image_map
results.extend(await asyncio.gather(*download_tasks))
# Process download results, prepare for OCR processing
@@ -1123,16 +817,17 @@ class BaseParser(ABC):
img_info["cos_url"] = cos_url
images_to_process.append((image, cos_url))
# If no images were successfully downloaded and uploaded, return the original Chunk
# If no images were successfully downloaded and uploaded,
# return the original Chunk
if not images_to_process:
logger.info(
f"Chunk #{chunk_idx+1} found no successfully downloaded and uploaded images"
f"Chunk #{chunk_idx + 1} not found downloaded and uploaded images"
)
return chunk
# Concurrent processing of all images (OCR + caption)
logger.info(
f"Processing {len(images_to_process)} images in Chunk #{chunk_idx+1}"
f"Processing {len(images_to_process)} images in Chunk #{chunk_idx + 1}"
)
# Concurrent processing of all images
@@ -1163,10 +858,12 @@ class BaseParser(ABC):
# Update image information in the Chunk
chunk.images = processed_images
logger.info(f"Completed image processing in Chunk #{chunk_idx+1}")
logger.info(f"Completed image processing in Chunk #{chunk_idx + 1}")
return chunk
def process_chunks_images(self, chunks: List[Chunk], image_map=None) -> List[Chunk]:
def process_chunks_images(
self, chunks: List[Chunk], image_map: Dict[str, str] = {}
) -> List[Chunk]:
"""Concurrent processing of images in all Chunks
Args:
@@ -1210,7 +907,7 @@ class BaseParser(ABC):
processed_chunks = []
for i, result in enumerate(results):
if isinstance(result, Exception):
logger.error(f"Error processing Chunk {i+1}: {str(result)}")
logger.error(f"Error processing Chunk {i + 1}: {str(result)}")
# Keep original Chunk
if i < len(chunks):
processed_chunks.append(chunks[i])
@@ -1235,7 +932,7 @@ class BaseParser(ABC):
# Execute processing for all Chunks
processed_chunks = loop.run_until_complete(process_all_chunks())
logger.info(
f"Successfully completed concurrent processing of {len(processed_chunks)}/{len(chunks)} chunks"
f"Completed processing of {len(processed_chunks)}/{len(chunks)} chunks"
)
return processed_chunks
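The reworked parse() now delegates chunking to TextSplitter and converts its (start, end, text) spans through _str_to_chunk; a condensed, standalone sketch of that step under the signatures shown above:

from docreader.models.document import Chunk, Document
from docreader.splitter.splitter import TextSplitter


def chunk_document(document: Document, chunk_size: int = 512,
                   chunk_overlap: int = 50) -> Document:
    """The chunking step that parse() performs, isolated for illustration."""
    splitter = TextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ""],
    )
    # split_text is expected to yield (start, end, text) tuples, which is
    # exactly what _str_to_chunk destructures when numbering chunks.
    spans = splitter.split_text(document.content)
    document.chunks = [
        Chunk(seq=i, content=text, start=start, end=end)
        for i, (start, end, text) in enumerate(spans)
    ]
    return document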

View File

@@ -3,11 +3,10 @@ import logging
import os
import time
from dataclasses import dataclass, field
from typing import List, Optional, Union
from typing import Dict, List, Optional, Union
import requests
import ollama
import requests
logger = logging.getLogger(__name__)
@@ -158,11 +157,16 @@ class CaptionChatResp:
Returns:
The content string from the first choice, or empty string if no choices
"""
if self.choices:
logger.info("Retrieving content from first choice")
return self.choices[0].message.content
logger.warning("No choices available in response")
return ""
if (
not self.choices
or not self.choices[0]
or not self.choices[0].message
or not self.choices[0].message.content
):
logger.warning("No choices available in response")
return ""
logger.info("Retrieving content from first choice")
return self.choices[0].message.content
class Caption:
@@ -171,33 +175,43 @@ class Caption:
Uses an external API to process images and return textual descriptions.
"""
def __init__(self, vlm_config=None):
"""Initialize the Caption service with configuration from parameters or environment variables."""
def __init__(self, vlm_config: Optional[Dict[str, str]] = None):
"""
Initialize the Caption service with configuration
from parameters or environment variables.
"""
logger.info("Initializing Caption service")
self.prompt = """简单凝炼的描述图片的主要内容"""
# Use provided VLM config if available, otherwise fall back to environment variables
self.timeout = 30
# Use provided VLM config if available,
# otherwise fall back to environment variables
if vlm_config and vlm_config.get("base_url") and vlm_config.get("model_name"):
self.completion_url = vlm_config.get("base_url", "") + "/chat/completions"
self.model = vlm_config.get("model_name", "")
self.api_key = vlm_config.get("api_key", "")
self.interface_type = vlm_config.get("interface_type", "openai").lower()
else:
if os.getenv("VLM_MODEL_BASE_URL") == "" or os.getenv("VLM_MODEL_NAME") == "":
base_url = os.getenv("VLM_MODEL_BASE_URL")
model_name = os.getenv("VLM_MODEL_NAME")
if not base_url or not model_name:
logger.error("VLM_MODEL_BASE_URL or VLM_MODEL_NAME is not set")
return
self.completion_url = os.getenv("VLM_MODEL_BASE_URL") + "/chat/completions"
self.model = os.getenv("VLM_MODEL_NAME")
self.api_key = os.getenv("VLM_MODEL_API_KEY")
self.completion_url = base_url + "/chat/completions"
self.model = model_name
self.api_key = os.getenv("VLM_MODEL_API_KEY", "")
self.interface_type = os.getenv("VLM_INTERFACE_TYPE", "openai").lower()
# 验证接口类型
if self.interface_type not in ["ollama", "openai"]:
logger.warning(f"Unknown interface type: {self.interface_type}, defaulting to openai")
logger.warning(
f"Unknown interface type: {self.interface_type}, defaulting to openai"
)
self.interface_type = "openai"
logger.info(
f"Service configured with model: {self.model}, endpoint: {self.completion_url}, interface: {self.interface_type}"
f"Configured with model: {self.model}, "
f"endpoint: {self.completion_url}, interface: {self.interface_type}"
)
def _call_caption_api(self, image_data: str) -> Optional[CaptionChatResp]:
@@ -210,8 +224,8 @@ class Caption:
Returns:
CaptionChatResp object if successful, None otherwise
"""
logger.info(f"Calling Caption API for image captioning")
logger.info(f"Processing image data: {image_data[:50] if len(image_data) > 50 else image_data}")
logger.info("Calling Caption API for image captioning")
logger.info(f"Processing image data: {image_data[:50]}...")
# 根据接口类型选择调用方式
if self.interface_type == "ollama":
@@ -226,39 +240,35 @@ class Caption:
client = ollama.Client(
host=host,
timeout=self.timeout,
)
try:
logger.info(f"Calling Ollama API with model: {self.model}")
# 调用Ollama API使用images参数传递base64编码的图片
response = client.generate(
model=self.model,
prompt="简单凝炼的描述图片的主要内容",
images=[image_base64], # image_base64是base64编码的图片数据
images=[image_base64], # image_base64是base64编码的图片数据
options={"temperature": 0.1},
stream=False,
)
# 构造响应对象
caption_resp = CaptionChatResp(
id="ollama_response",
created=int(time.time()),
model=self.model,
model=Model(id=self.model),
object="chat.completion",
choices=[
Choice(
message=Message(
role="assistant",
content=response.response
)
)
]
Choice(message=Message(role="assistant", content=response.response))
],
)
logger.info("Successfully received response from Ollama API")
return caption_resp
except Exception as e:
logger.error(f"Error calling Ollama API: {e}")
return None
@@ -266,13 +276,16 @@ class Caption:
def _call_openai_api(self, image_base64: str) -> Optional[CaptionChatResp]:
"""Call OpenAI-compatible API for image captioning."""
logger.info(f"Calling OpenAI-compatible API with model: {self.model}")
user_msg = UserMessage(
role="user",
content=[
Content(type="text", text=self.prompt),
Content(
type="image_url", image_url=ImageUrl(url="data:image/png;base64," + image_base64, detail="auto")
type="image_url",
image_url=ImageUrl(
url="data:image/png;base64," + image_base64, detail="auto"
),
),
],
)
@@ -295,23 +308,23 @@ class Caption:
headers["Authorization"] = f"Bearer {self.api_key}"
try:
logger.info(f"Sending request to OpenAI-compatible API with model: {self.model}")
logger.info(
f"Sending request to OpenAI-compatible API with model: {self.model}"
)
response = requests.post(
self.completion_url,
data=json.dumps(gpt_req, default=lambda o: o.__dict__, indent=4),
headers=headers,
timeout=30,
timeout=self.timeout,
)
if response.status_code != 200:
logger.error(
f"OpenAI-compatible API returned non-200 status code: {response.status_code}"
f"OpenAI API returned non-200 status code: {response.status_code}"
)
response.raise_for_status()
logger.info(
f"Successfully received response from OpenAI-compatible API with status: {response.status_code}"
)
logger.info(f"Converting response to CaptionChatResp object")
logger.info(f"Received from OpenAI with status: {response.status_code}")
logger.info("Converting response to CaptionChatResp object")
caption_resp = CaptionChatResp.from_json(response.json())
if caption_resp.usage:
@@ -322,7 +335,7 @@ class Caption:
return caption_resp
except requests.exceptions.Timeout:
logger.error(f"Timeout while calling OpenAI-compatible API after 30 seconds")
logger.error("Timeout while calling OpenAI-compatible API after 30 seconds")
return None
except requests.exceptions.RequestException as e:
logger.error(f"Request error calling OpenAI-compatible API: {e}")

View File

@@ -0,0 +1,70 @@
import logging
from typing import List, Tuple, Type
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
logger = logging.getLogger(__name__)
class FirstParser(BaseParser):
_parser_cls: Tuple[Type["BaseParser"], ...] = ()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._parsers: List[BaseParser] = []
for parser_cls in self._parser_cls:
try:
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
except Exception as e:
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
def parse_into_text(self, content: bytes) -> Document:
for p in self._parsers:
document = p.parse_into_text(content)
if document.is_valid():
return document
return Document()
@classmethod
def create(cls, *parser_classes: Type["BaseParser"]) -> Type["FirstParser"]:
names = "_".join([p.__name__ for p in parser_classes])
return type(f"FirstParser_{names}", (cls,), {"_parser_cls": parser_classes})
class PipelineParser(BaseParser):
_parser_cls: Tuple[Type["BaseParser"], ...] = ()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._parsers: List[BaseParser] = []
for parser_cls in self._parser_cls:
try:
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
except Exception as e:
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
def parse_into_text(self, content: bytes) -> Document:
document = Document()
for p in self._parsers:
document = p.parse_into_text(content)
content = endecode.encode_bytes(document.content)
return document
@classmethod
def create(cls, *parser_classes: Type["BaseParser"]) -> Type["PipelineParser"]:
names = "_".join([p.__name__ for p in parser_classes])
return type(f"PipelineParser_{names}", (cls,), {"_parser_cls": parser_classes})
if __name__ == "__main__":
from docreader.parser.markdown_parser import MarkdownParser
cls = FirstParser.create(MarkdownParser)
parser = cls()
print(parser.parse_into_text(b"aaa"))
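The __main__ block above exercises FirstParser; a companion sketch for PipelineParser, assuming it is run from the same module and composed from two parsers exported by docreader.parser (each stage re-parses the previous stage's text):

from docreader.parser.markdown_parser import MarkdownParser
from docreader.parser.text_parser import TextParser

PipelineCls = PipelineParser.create(TextParser, MarkdownParser)
pipeline = PipelineCls()
print(pipeline.parse_into_text(b"# heading\n\nbody text").content)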

View File

@@ -1,21 +0,0 @@
from dataclasses import dataclass, field
@dataclass
class ChunkingConfig:
"""
Configuration for text chunking process.
Controls how documents are split into smaller pieces for processing.
"""
chunk_size: int = 512 # Maximum size of each chunk in tokens/chars
chunk_overlap: int = 50 # Number of tokens/chars to overlap between chunks
separators: list = field(
default_factory=lambda: ["\n\n", "\n", ""]
) # Text separators in order of priority
enable_multimodal: bool = (
False # Whether to enable multimodal processing (text + images)
)
storage_config: dict = None # Preferred field name going forward
vlm_config: dict = None # VLM configuration for image captioning

View File

@@ -1,134 +1,88 @@
import asyncio
import logging
import re
import tempfile
import os
import subprocess
import shutil
from io import BytesIO
from typing import Optional, List, Tuple
import textract
from PIL import Image
import zipfile
import xml.etree.ElementTree as ET
from typing import List, Optional
from .base_parser import BaseParser
from .docx_parser import DocxParser, Docx
import textract
from docreader.models.document import Document
from docreader.parser.docx2_parser import Docx2Parser
from docreader.utils.tempfile import TempDirContext, TempFileContext
logger = logging.getLogger(__name__)
class DocParser(BaseParser):
class DocParser(Docx2Parser):
"""DOC document parser"""
def parse_into_text(self, content: bytes) -> str:
"""Parse DOC document
Args:
content: DOC document content
Returns:
Parse result
"""
def parse_into_text(self, content: bytes) -> Document:
logger.info(f"Parsing DOC document, content size: {len(content)} bytes")
handle_chain = [
# 1. Try to convert to docx format to extract images
self._parse_with_docx,
# 2. If image extraction is not needed or conversion failed,
# try using antiword to extract text
self._parse_with_antiword,
# 3. If antiword extraction fails, use textract
self._parse_with_textract,
]
# Save byte content as a temporary file
with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as temp_file:
temp_file_path = temp_file.name
temp_file.write(content)
temp_file.flush()
logger.info(f"Saved DOC content to temporary file: {temp_file_path}")
with TempFileContext(content, ".doc") as temp_file_path:
for handle in handle_chain:
try:
document = handle(temp_file_path)
if document:
return document
except Exception as e:
logger.warning(f"Failed to parse DOC with {handle.__name__} {e}")
try:
# First try to convert to docx format to extract images
if self.enable_multimodal:
logger.info("Multimodal enabled, attempting to extract images from DOC")
docx_content = self._convert_doc_to_docx(temp_file_path)
return Document(content="")
if docx_content:
logger.info("Successfully converted DOC to DOCX, using DocxParser")
# Use existing DocxParser to parse the converted docx
docx_parser = DocxParser(
file_name=self.file_name,
file_type="docx",
enable_multimodal=self.enable_multimodal,
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
chunking_config=self.chunking_config,
separators=self.separators,
)
text = docx_parser.parse_into_text(docx_content)
logger.info(f"Extracted {len(text)} characters using DocxParser")
def _parse_with_docx(self, temp_file_path: str) -> Document:
logger.info("Multimodal enabled, attempting to extract images from DOC")
# Clean up temporary file
os.unlink(temp_file_path)
logger.info(f"Deleted temporary file: {temp_file_path}")
docx_content = self._try_convert_doc_to_docx(temp_file_path)
if not docx_content:
raise RuntimeError("Failed to convert DOC to DOCX")
return text
else:
logger.warning(
"Failed to convert DOC to DOCX, falling back to text-only extraction"
)
logger.info("Successfully converted DOC to DOCX, using DocxParser")
# Use existing DocxParser to parse the converted docx
document = super(Docx2Parser, self).parse_into_text(docx_content)
logger.info(f"Extracted {len(document.content)} characters using DocxParser")
return document
# If image extraction is not needed or conversion failed, try using antiword to extract text
try:
logger.info("Attempting to parse DOC file with antiword")
# Check if antiword is installed
antiword_path = self._find_antiword_path()
def _parse_with_antiword(self, temp_file_path: str) -> Document:
logger.info("Attempting to parse DOC file with antiword")
if antiword_path:
# Use antiword to extract text directly
logger.info(f"Using antiword at {antiword_path} to extract text")
process = subprocess.Popen(
[antiword_path, temp_file_path],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout, stderr = process.communicate()
# Check if antiword is installed
antiword_path = self._try_find_antiword()
if not antiword_path:
raise RuntimeError("antiword not found in PATH")
if process.returncode == 0:
text = stdout.decode("utf-8", errors="ignore")
logger.info(
f"Successfully extracted {len(text)} characters using antiword"
)
# Clean up temporary file
os.unlink(temp_file_path)
logger.info(f"Deleted temporary file: {temp_file_path}")
return text
else:
logger.warning(
f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
)
else:
logger.warning("antiword not found, falling back to textract")
except Exception as e:
logger.warning(
f"Error using antiword: {str(e)}, falling back to textract"
)
# If antiword fails, try using textract
logger.info("Parsing DOC file with textract")
text = textract.process(temp_file_path, method="antiword").decode("utf-8")
logger.info(
f"Successfully extracted {len(text)} characters of text from DOC document using textract"
# Use antiword to extract text directly
process = subprocess.Popen(
[antiword_path, temp_file_path],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout, stderr = process.communicate()
if process.returncode != 0:
raise RuntimeError(
f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
)
text = stdout.decode("utf-8", errors="ignore")
logger.info(f"Successfully extracted {len(text)} characters using antiword")
return Document(content=text)
# Clean up temporary file
os.unlink(temp_file_path)
logger.info(f"Deleted temporary file: {temp_file_path}")
def _parse_with_textract(self, temp_file_path: str) -> Document:
logger.info(f"Parsing DOC file with textract: {temp_file_path}")
text = textract.process(temp_file_path, method="antiword").decode("utf-8")
logger.info(f"Successfully extracted {len(text)} bytes of DOC using textract")
return Document(content=str(text))
return text
except Exception as e:
logger.error(f"Error parsing DOC document: {str(e)}")
# Ensure temporary file is cleaned up
if os.path.exists(temp_file_path):
os.unlink(temp_file_path)
logger.info(f"Deleted temporary file after error: {temp_file_path}")
return ""
def _convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
def _try_convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
"""Convert DOC file to DOCX format
Uses LibreOffice/OpenOffice for conversion
@@ -141,21 +95,16 @@ class DocParser(BaseParser):
"""
logger.info(f"Converting DOC to DOCX: {doc_path}")
# Check if LibreOffice or OpenOffice is installed
soffice_path = self._try_find_soffice()
if not soffice_path:
return None
# Execute conversion command
logger.info(f"Using {soffice_path} to convert DOC to DOCX")
# Create a temporary directory to store the converted file
temp_dir = tempfile.mkdtemp()
docx_path = os.path.join(temp_dir, "converted.docx")
try:
# Check if LibreOffice or OpenOffice is installed
soffice_path = self._find_soffice_path()
if not soffice_path:
logger.error(
"LibreOffice/OpenOffice not found, cannot convert DOC to DOCX"
)
return None
# Execute conversion command
logger.info(f"Using {soffice_path} to convert DOC to DOCX")
with TempDirContext() as temp_dir:
cmd = [
soffice_path,
"--headless",
@@ -165,7 +114,6 @@ class DocParser(BaseParser):
temp_dir,
doc_path,
]
logger.info(f"Running command: {' '.join(cmd)}")
process = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
@@ -173,41 +121,68 @@ class DocParser(BaseParser):
stdout, stderr = process.communicate()
if process.returncode != 0:
logger.error(
f"Error converting DOC to DOCX: {stderr.decode('utf-8', errors='ignore')}"
logger.warning(
f"Error converting DOC to DOCX: {stderr.decode('utf-8')}"
)
return None
# Find the converted file
for file in os.listdir(temp_dir):
if file.endswith(".docx"):
converted_file = os.path.join(temp_dir, file)
logger.info(f"Found converted file: {converted_file}")
# Read the converted file content
with open(converted_file, "rb") as f:
docx_content = f.read()
docx_file = [
file for file in os.listdir(temp_dir) if file.endswith(".docx")
]
logger.info(f"Found {len(docx_file)} DOCX file(s) in temporary directory")
for file in docx_file:
converted_file = os.path.join(temp_dir, file)
logger.info(f"Found converted file: {converted_file}")
# Read the converted file content
with open(converted_file, "rb") as f:
docx_content = f.read()
logger.info(
f"Successfully read converted DOCX file, size: {len(docx_content)} bytes"
f"Successfully read DOCX file, size: {len(docx_content)}"
)
return docx_content
return None
logger.error("No DOCX file found after conversion")
return None
def _try_find_executable_path(
self,
executable_name: str,
possible_path: List[str] = [],
environment_variable: List[str] = [],
) -> Optional[str]:
"""Find executable path
Args:
executable_name: Executable name
possible_path: List of possible paths
environment_variable: List of environment variables to check
Returns:
Executable path, or None if not found
"""
# Common executable paths
paths: List[str] = []
paths.extend(possible_path)
paths.extend(os.environ.get(env_var, "") for env_var in environment_variable)
paths = list(set(paths))
except Exception as e:
logger.error(f"Error during DOC to DOCX conversion: {str(e)}")
return None
finally:
# Clean up temporary directory
try:
shutil.rmtree(temp_dir)
logger.info(f"Cleaned up temporary directory: {temp_dir}")
except Exception as e:
logger.warning(f"Failed to clean up temporary directory: {str(e)}")
# Check each candidate path (explicit paths plus values from environment variables)
for path in paths:
if os.path.exists(path):
logger.info(f"Found {executable_name} at {path}")
return path
def _find_soffice_path(self) -> Optional[str]:
# Try to find in PATH
result = subprocess.run(
["which", executable_name], capture_output=True, text=True
)
if result.returncode == 0 and result.stdout.strip():
path = result.stdout.strip()
logger.info(f"Found {executable_name} at {path}")
return path
logger.warning(f"Failed to find {executable_name}")
return None
def _try_find_soffice(self) -> Optional[str]:
"""Find LibreOffice/OpenOffice executable path
Returns:
@@ -225,32 +200,13 @@ class DocParser(BaseParser):
"C:\\Program Files\\LibreOffice\\program\\soffice.exe",
"C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe",
]
return self._try_find_executable_path(
executable_name="soffice",
possible_path=possible_paths,
environment_variable=["LIBREOFFICE_PATH"],
)
# Check if path is set in environment variable
if os.environ.get("LIBREOFFICE_PATH"):
possible_paths.insert(0, os.environ.get("LIBREOFFICE_PATH"))
for path in possible_paths:
if os.path.exists(path):
logger.info(f"Found LibreOffice/OpenOffice at: {path}")
return path
# Try to find in PATH
try:
result = subprocess.run(
["which", "soffice"], capture_output=True, text=True
)
if result.returncode == 0 and result.stdout.strip():
path = result.stdout.strip()
logger.info(f"Found LibreOffice/OpenOffice in PATH: {path}")
return path
except Exception:
pass
logger.warning("LibreOffice/OpenOffice not found")
return None
def _find_antiword_path(self) -> Optional[str]:
def _try_find_antiword(self) -> Optional[str]:
"""Find antiword executable path
Returns:
@@ -265,51 +221,27 @@ class DocParser(BaseParser):
"C:\\Program Files\\Antiword\\antiword.exe",
"C:\\Program Files (x86)\\Antiword\\antiword.exe",
]
# Check if path is set in environment variable
if os.environ.get("ANTIWORD_PATH"):
possible_paths.insert(0, os.environ.get("ANTIWORD_PATH"))
for path in possible_paths:
if os.path.exists(path):
logger.info(f"Found antiword at: {path}")
return path
# Try to find in PATH
try:
result = subprocess.run(
["which", "antiword"], capture_output=True, text=True
)
if result.returncode == 0 and result.stdout.strip():
path = result.stdout.strip()
logger.info(f"Found antiword in PATH: {path}")
return path
except Exception:
pass
logger.warning("antiword not found")
return None
return self._try_find_executable_path(
executable_name="antiword",
possible_path=possible_paths,
environment_variable=["ANTIWORD_PATH"],
)
if __name__ == "__main__":
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger.info("Running DocParser in standalone mode")
logging.basicConfig(level=logging.DEBUG)
file_name = "/path/to/your/test.doc"
logger.info(f"Processing file: {file_name}")
doc_parser = DocParser(
file_name, enable_multimodal=True, chunk_size=512, chunk_overlap=60
file_name=file_name,
enable_multimodal=True,
chunk_size=512,
chunk_overlap=60,
)
logger.info("Parser initialized, starting processing")
with open(file_name, "rb") as f:
content = f.read()
text = doc_parser.parse_into_text(content)
logger.info(f"Processing complete, extracted text length: {len(text)}")
logger.info(f"Sample text: {text[:200]}...")
document = doc_parser.parse_into_text(content)
logger.info(f"Processing complete, extracted text length: {len(document.content)}")
logger.info(f"Sample text: {document.content[:200]}...")

View File

@@ -0,0 +1,28 @@
import logging
from docreader.parser.chain_parser import FirstParser
from docreader.parser.docx_parser import DocxParser
from docreader.parser.markitdown_parser import MarkitdownParser
logger = logging.getLogger(__name__)
class Docx2Parser(FirstParser):
_parser_cls = (MarkitdownParser, DocxParser)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
your_file = "/path/to/your/file.docx"
parser = Docx2Parser(separators=[".", "?", "!", "。", "？", "！"])
with open(your_file, "rb") as f:
content = f.read()
document = parser.parse(content)
for cc in document.chunks:
logger.info(f"chunk: {cc}")
# document = parser.parse_into_text(content)
# logger.info(f"docx content: {document.content}")
# logger.info(f"find images {document.images.keys()}")

View File

@@ -1,37 +1,36 @@
import logging
import tempfile
import os
import sys
import time
from io import BytesIO
from typing import Optional, Dict, Any, Tuple, List, Union
from dataclasses import dataclass, field
from PIL import Image
from docx import Document
from docx.image.exceptions import (
UnrecognizedImageError,
UnexpectedEndOfFileError,
InvalidImageStreamError,
)
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
import re
import tempfile
import threading
import time
import traceback
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass, field
from io import BytesIO
from multiprocessing import Manager
import re
from typing import Any, Dict, List, Optional, Tuple
from .base_parser import BaseParser
from docx import Document
from docx.image.exceptions import (
InvalidImageStreamError,
UnexpectedEndOfFileError,
UnrecognizedImageError,
)
from PIL import Image
from docreader.models.document import Document as DocumentModel
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Add thread local storage to track the processing status of each thread
thread_local = threading.local()
class ImageData:
"""Represents a processed image of document content"""
local_path: str = ""
object: Image.Image = None
object: Optional[Image.Image] = None
url: str = ""
@@ -40,7 +39,9 @@ class LineData:
"""Represents a processed line of document content with associated images"""
text: str = "" # Extracted text content
images: List[ImageData] = field(default_factory=list) # List of images or image paths
images: List[ImageData] = field(
default_factory=list
) # List of images or image paths
extra_info: str = "" # Placeholder for additional info (currently unused)
page_num: int = 0 # Page number
content_sequence: List[Tuple[str, Any]] = field(
@@ -53,18 +54,8 @@ class DocxParser(BaseParser):
def __init__(
self,
file_name: str = "",
file_type: str = None,
enable_multimodal: bool = True,
chunk_size: int = 1000,
chunk_overlap: int = 200,
separators: list = ["\n\n", "\n", ""],
ocr_backend: str = "paddle",
ocr_config: dict = None,
max_image_size: int = 1920,
max_concurrent_tasks: int = 5,
max_pages: int = 100, # Maximum number of pages to process, default to 50 pages
chunking_config=None,
max_pages: int = 100, # Maximum number of pages to process
**kwargs,
):
"""Initialize DOCX document parser
@@ -79,37 +70,16 @@ class DocxParser(BaseParser):
ocr_config: OCR engine configuration
max_image_size: Maximum image size limit
max_concurrent_tasks: Maximum number of concurrent tasks
max_pages: Maximum number of pages to process, if more than this, only process the first max_pages pages
max_pages: Maximum number of pages to process
"""
super().__init__(
file_name=file_name,
file_type=file_type,
enable_multimodal=enable_multimodal,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=separators,
ocr_backend=ocr_backend,
ocr_config=ocr_config,
max_image_size=max_image_size,
max_concurrent_tasks=max_concurrent_tasks,
chunking_config=chunking_config,
)
super().__init__(**kwargs)
self.max_pages = max_pages
logger.info(f"DocxParser initialized with max_pages={max_pages}")
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""Parse DOCX document, extract text content and image Markdown links
Args:
content: DOCX document content
Returns:
Tuple of (parsed_text, image_map) where image_map maps image URLs to Image objects
All LineData objects are used internally but not returned directly through this interface
"""
def parse_into_text(self, content: bytes) -> DocumentModel:
"""Parse DOCX document, extract text content and image Markdown links"""
logger.info(f"Parsing DOCX document, content size: {len(content)} bytes")
logger.info(f"Max pages limit set to: {self.max_pages}")
logger.info("Converting DOCX content to sections and tables")
start_time = time.time()
# Use concurrent processing to handle the document
@@ -123,7 +93,7 @@ class DocxParser(BaseParser):
docx_processor = Docx(
max_image_size=self.max_image_size,
enable_multimodal=self.enable_multimodal,
upload_file=self.upload_file,
upload_file=self.storage.upload_file,
)
all_lines, tables = docx_processor(
binary=content,
@@ -140,7 +110,7 @@ class DocxParser(BaseParser):
section_start_time = time.time()
text_parts = []
image_parts = {}
image_parts: Dict[str, str] = {}
for sec_idx, line in enumerate(all_lines):
try:
@@ -148,16 +118,19 @@ class DocxParser(BaseParser):
text_parts.append(line.text)
if sec_idx < 3 or sec_idx % 50 == 0:
logger.info(
f"Added section {sec_idx+1} text: {line.text[:50]}..."
f"Added section {sec_idx + 1} text: {line.text[:50]}..."
if len(line.text) > 50
else f"Added section {sec_idx+1} text: {line.text}"
else f"Added section {sec_idx + 1} text: {line.text}"
)
if line.images:
for image_data in line.images:
if image_data.url:
image_parts[image_data.url] = image_data.object
if image_data.url and image_data.object:
image_parts[image_data.url] = endecode.decode_image(
image_data.object
)
image_data.object.close()
except Exception as e:
logger.error(f"Error processing section {sec_idx+1}: {str(e)}")
logger.error(f"Error processing section {sec_idx + 1}: {str(e)}")
logger.error(f"Detailed stack trace: {traceback.format_exc()}")
continue
@@ -176,17 +149,17 @@ class DocxParser(BaseParser):
total_processing_time = time.time() - start_time
logger.info(
f"Parsing complete in {total_processing_time:.2f}s, generated {len(text)} characters of text "
f"Parsing complete in {total_processing_time:.2f}s, "
f"generated {len(text)} characters of text"
)
return text, image_parts
return DocumentModel(content=text, images=image_parts)
except Exception as e:
logger.error(f"Error parsing DOCX document: {str(e)}")
logger.error(f"Detailed stack trace: {traceback.format_exc()}")
fallback_text = self._parse_using_simple_method(content)
return fallback_text, {}
return self._parse_using_simple_method(content)
def _parse_using_simple_method(self, content: bytes) -> str:
def _parse_using_simple_method(self, content: bytes) -> DocumentModel:
"""Parse document using a simplified method, as a fallback
Args:
@@ -201,7 +174,8 @@ class DocxParser(BaseParser):
doc = Document(BytesIO(content))
logger.info(
f"Successfully loaded document in simplified method, "
f"contains {len(doc.paragraphs)} paragraphs and {len(doc.tables)} tables"
f"contains {len(doc.paragraphs)} paragraphs "
f"and {len(doc.tables)} tables"
)
text_parts = []
@@ -211,7 +185,7 @@ class DocxParser(BaseParser):
para_with_text = 0
for i, para in enumerate(doc.paragraphs):
if i % 100 == 0:
logger.info(f"Processing paragraph {i+1}/{para_count}")
logger.info(f"Processing paragraph {i + 1}/{para_count}")
if para.text.strip():
text_parts.append(para.text.strip())
para_with_text += 1
@@ -225,7 +199,7 @@ class DocxParser(BaseParser):
rows_processed = 0
for i, table in enumerate(doc.tables):
if i % 10 == 0:
logger.info(f"Processing table {i+1}/{table_count}")
logger.info(f"Processing table {i + 1}/{table_count}")
table_has_content = False
for row in table.rows:
@@ -256,25 +230,24 @@ class DocxParser(BaseParser):
# If the result is still empty, return an error message
if not result_text:
logger.warning("No text extracted using simplified method")
return "", {}
return DocumentModel()
return result_text, {}
return DocumentModel(content=result_text)
except Exception as backup_error:
processing_time = time.time() - start_time
logger.error(
f"Simplified parsing failed after {processing_time:.2f}s: {str(backup_error)}"
f"Simplified parsing failed {processing_time:.2f}s: {backup_error}"
)
logger.error(f"Detailed traceback: {traceback.format_exc()}")
return "", {}
return DocumentModel()
class Docx:
def __init__(self, max_image_size=1920, enable_multimodal=False, upload_file=None):
logger.info("Initializing DOCX processor")
self.max_image_size = max_image_size # Maximum image size limit
self.picture_cache = (
{}
) # Image cache to avoid processing the same image repeatedly
# Image cache to avoid processing the same image repeatedly
self.picture_cache = {}
self.enable_multimodal = enable_multimodal
self.upload_file = upload_file
@@ -454,7 +427,6 @@ class Docx:
return page_to_paragraphs
def __call__(
self,
binary: Optional[bytes] = None,
@@ -611,7 +583,6 @@ class Docx:
return pages_to_process
def _process_document(
self,
binary,
@@ -806,7 +777,9 @@ class Docx:
# Collect temporary image paths for later cleanup
for line in page_lines:
for image_data in line.images:
if image_data.local_path and image_data.local_path.startswith("/tmp/docx_img_"):
if image_data.local_path and image_data.local_path.startswith(
"/tmp/docx_img_"
):
temp_img_paths.add(image_data.local_path)
results.extend(page_lines)
@@ -876,7 +849,11 @@ class Docx:
# Process all image data objects
for image_data in image_paths:
if image_data.local_path and os.path.exists(image_data.local_path) and image_data.local_path not in image_url_map:
if (
image_data.local_path
and os.path.exists(image_data.local_path)
and image_data.local_path not in image_url_map
):
try:
# Upload the image if it doesn't have a URL yet
if not image_data.url:
@@ -886,12 +863,16 @@ class Docx:
image_data.url = image_url
# Add image URL as Markdown format
markdown_image = f"![]({image_url})"
image_url_map[image_data.local_path] = markdown_image
image_url_map[image_data.local_path] = (
markdown_image
)
logger.info(
f"Added image URL for {image_data.local_path}: {image_url}"
)
else:
logger.warning(f"Failed to upload image: {image_data.local_path}")
logger.warning(
f"Failed to upload image: {image_data.local_path}"
)
else:
# Already has a URL, use it
markdown_image = f"![]({image_data.url})"
@@ -925,12 +906,19 @@ class Docx:
# For ImageData objects, use the URL
if isinstance(content, str) and content in image_url_map:
combined_parts.append(image_url_map[content])
elif hasattr(content, 'local_path') and content.local_path in image_url_map:
elif (
hasattr(content, "local_path")
and content.local_path in image_url_map
):
combined_parts.append(image_url_map[content.local_path])
# Create the final text with proper ordering
final_text = "\n\n".join(part for part in combined_parts if part)
processed_lines.append(LineData(text=final_text, page_num=page_num, images=line_data.images))
processed_lines.append(
LineData(
text=final_text, page_num=page_num, images=line_data.images
)
)
else:
processed_lines = lines
@@ -1003,11 +991,11 @@ class Docx:
logger.info(f"Processing {table_count} tables")
for tb_idx, tb in enumerate(self.doc.tables):
if tb_idx % 10 == 0: # Log only every 10 tables to reduce log volume
logger.info(f"Processing table {tb_idx+1}/{table_count}")
logger.info(f"Processing table {tb_idx + 1}/{table_count}")
# Optimize: Check if table is empty
if len(tb.rows) == 0 or all(len(r.cells) == 0 for r in tb.rows):
logger.info(f"Skipping empty table {tb_idx+1}")
logger.info(f"Skipping empty table {tb_idx + 1}")
continue
table_html = self._convert_table_to_html(tb)
@@ -1111,8 +1099,8 @@ def _save_image_to_temp(logger, image, page_num, img_idx):
if not image:
return None
import tempfile
import os
import tempfile
try:
# Create a temporary file
@@ -1187,8 +1175,15 @@ def process_page_multiprocess(
return []
# Extract page content
combined_text, image_objects, content_sequence = _extract_page_content_in_process(
process_logger, doc, page_num, paragraphs, enable_multimodal, max_image_size
combined_text, image_objects, content_sequence = (
_extract_page_content_in_process(
process_logger,
doc,
page_num,
paragraphs,
enable_multimodal,
max_image_size,
)
)
# Process content sequence to maintain order between processes
@@ -1199,7 +1194,9 @@ def process_page_multiprocess(
if enable_multimodal:
# First pass: save all images to temporary files
for i, image_object in enumerate(image_objects):
img_path = _save_image_to_temp(process_logger, image_object, page_num, i)
img_path = _save_image_to_temp(
process_logger, image_object, page_num, i
)
if img_path:
# Create ImageData object
image_data = ImageData()

View File

@@ -1,15 +1,13 @@
import base64
import logging
import os
import asyncio
from PIL import Image
import io
from typing import Dict, Any, Tuple, Union
from .base_parser import BaseParser, ParseResult
import numpy as np
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
# Set up logger for this module
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class ImageParser(BaseParser):
"""
@@ -23,46 +21,24 @@ class ImageParser(BaseParser):
4. Returning a combined result with both text and image reference
"""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
def parse_into_text(self, content: bytes) -> Document:
"""
Parse image content, upload the image and return Markdown reference along with image map.
Args:
content: Raw image data (bytes)
Returns:
Tuple of (markdown_text, image_map) where image_map maps image URLs to PIL Image objects
Parse image content into markdown text
:param content: bytes content of the image
:return: Document object
"""
logger.info(f"Parsing image content, size: {len(content)} bytes")
image_map = {}
try:
# Upload image to storage service
logger.info("Uploading image to storage")
_, ext = os.path.splitext(self.file_name)
image_url = self.upload_bytes(content, file_ext=ext)
if not image_url:
logger.error("Failed to upload image to storage")
return "", {}
logger.info(
f"Successfully uploaded image, URL: {image_url[:50]}..."
if len(image_url) > 50
else f"Successfully uploaded image, URL: {image_url}"
)
# Create image object and add to map
try:
from PIL import Image
import io
image = Image.open(io.BytesIO(content))
image_map[image_url] = image
logger.info(f"Added image to image_map for URL: {image_url}")
except Exception as img_err:
logger.error(f"Error creating image object: {str(img_err)}")
# Get file extension
ext = os.path.splitext(self.file_name)[1].lower()
markdown_text = f"![{self.file_name}]({image_url})"
return markdown_text, image_map
# Upload image to storage
image_url = self.storage.upload_bytes(content, file_ext=ext)
logger.info(f"Successfully uploaded image, URL: {image_url[:50]}...")
except Exception as e:
logger.error(f"Error parsing image: {str(e)}")
return "", {}
# Generate markdown text
text = f"![{self.file_name}]({image_url})"
images = {image_url: base64.b64encode(content).decode()}
# Create image object and add to map
return Document(content=text, images=images)

View File

@@ -1,43 +0,0 @@
import base64
import io
import logging
from typing import Union
from PIL import Image
import numpy as np
logger = logging.getLogger(__name__)
def image_to_base64(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
"""Convert image to base64 encoded string
Args:
image: Image file path, bytes, PIL Image object, or numpy array
Returns:
Base64 encoded image string, or empty string if conversion fails
"""
try:
if isinstance(image, str):
# It's a file path
with open(image, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
elif isinstance(image, bytes):
# It's bytes data
return base64.b64encode(image).decode("utf-8")
elif isinstance(image, Image.Image):
# It's a PIL Image
buffer = io.BytesIO()
image.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("utf-8")
elif isinstance(image, np.ndarray):
# It's a numpy array
pil_image = Image.fromarray(image)
buffer = io.BytesIO()
pil_image.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("utf-8")
else:
logger.error(f"Unsupported image type: {type(image)}")
return ""
except Exception as e:
logger.error(f"Error converting image to base64: {str(e)}")
return ""

View File

@@ -0,0 +1,111 @@
import logging
import re
import uuid
from typing import Dict, List, Match, Optional, Tuple
from docreader.utils import endecode
# Get logger object
logger = logging.getLogger(__name__)
class MarkdownImageUtil:
def __init__(self):
self.b64_pattern = re.compile(
r"!\[([^\]]*)\]\(data:image/(\w+);base64,([^\)]+)\)"
)
self.image_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
self.replace_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
def extract_image(
self,
content: str,
path_prefix: Optional[str] = None,
replace: bool = True,
) -> Tuple[str, List[str]]:
"""Extract base64 encoded images from Markdown content"""
# image_path => base64 bytes
images: List[str] = []
def repl(match: Match[str]) -> str:
title = match.group(1)
image_path = match.group(2)
if path_prefix:
image_path = f"{path_prefix}/{image_path}"
images.append(image_path)
if not replace:
return match.group(0)
# Rewrite the image reference with the prefixed path
return f"![{title}]({image_path})"
text = self.image_pattern.sub(repl, content)
logger.debug(f"Extracted {len(images)} images from markdown")
return text, images
def extract_base64(
self,
content: str,
path_prefix: Optional[str] = None,
replace: bool = True,
) -> Tuple[str, Dict[str, bytes]]:
"""Extract base64 encoded images from Markdown content"""
# image_path => base64 bytes
images: Dict[str, bytes] = {}
def repl(match: Match[str]) -> str:
title = match.group(1)
img_ext = match.group(2)
img_b64 = match.group(3)
image_byte = endecode.encode_image(img_b64, errors="ignore")
if not image_byte:
logger.error(f"Failed to decode base64 image skip it: {img_b64}")
return title
image_path = f"{uuid.uuid4()}.{img_ext}"
if path_prefix:
image_path = f"{path_prefix}/{image_path}"
images[image_path] = image_byte
if not replace:
return match.group(0)
# Replace image path with URL
return f"![{title}]({image_path})"
text = self.b64_pattern.sub(repl, content)
logger.debug(f"Extracted {len(images)} base64 images from markdown")
return text, images
def replace_path(self, content: str, images: Dict[str, str]) -> str:
content_replace: set = set()
def repl(match: Match[str]) -> str:
title = match.group(1)
image_path = match.group(2)
if image_path not in images:
return match.group(0)
content_replace.add(image_path)
image_path = images[image_path]
return f"![{title}]({image_path})"
text = self.replace_pattern.sub(repl, content)
logger.debug(f"Replaced {len(content_replace)} images in markdown")
return text
if __name__ == "__main__":
your_content = "test![](data:image/png;base64,iVBORw0KGgoAAAA)test"
image_handle = MarkdownImageUtil()
text, images = image_handle.extract_base64(your_content)
print(text)
for image_url, image_byte in images.items():
with open(image_url, "wb") as f:
f.write(image_byte)

View File

@@ -1,33 +1,53 @@
import asyncio
import re
import base64
import logging
import numpy as np
import os # Import os module to get environment variables
from typing import Dict, List, Optional, Tuple, Union, Any
from .base_parser import BaseParser
import os
from typing import Dict
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_image_util import MarkdownImageUtil
from docreader.utils import endecode
# Get logger object
logger = logging.getLogger(__name__)
class MarkdownParser(BaseParser):
"""Markdown document parser"""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""Parse Markdown document, only extract text content, do not process images
Args:
content: Markdown document content
Returns:
Parsed text result
"""
logger.info(f"Parsing Markdown document, content size: {len(content)} bytes")
class MarkdownImageBase64(BaseParser):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.image_helper = MarkdownImageUtil()
def parse_into_text(self, content: bytes) -> Document:
# Convert byte content to string using universal decoding method
text = self.decode_bytes(content)
logger.info(f"Decoded Markdown content, text length: {len(text)} characters")
text = endecode.decode_bytes(content)
text, img_b64 = self.image_helper.extract_base64(text, path_prefix="images")
logger.info(f"Markdown parsing complete, extracted {len(text)} characters of text")
return text
images: Dict[str, str] = {}
image_replace: Dict[str, str] = {}
logger.debug(f"Uploading {len(img_b64)} images from markdown")
for ipath, b64_bytes in img_b64.items():
ext = os.path.splitext(ipath)[1].lower()
image_url = self.storage.upload_bytes(b64_bytes, ext)
image_replace[ipath] = image_url
images[image_url] = base64.b64encode(b64_bytes).decode()
text = self.image_helper.replace_path(text, image_replace)
return Document(content=text, images=images)
class MarkdownParser(PipelineParser):
_parser_cls = (MarkdownImageBase64,)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
your_content = "test![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMgA)test"
parser = MarkdownParser()
document = parser.parse_into_text(your_content.encode())
logger.info(document.content)
logger.info(f"Images: {len(document.images)}, name: {document.images.keys()}")

View File

@@ -0,0 +1,31 @@
import io
import logging
from markitdown import MarkItDown
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownParser
logger = logging.getLogger(__name__)
class StdMarkitdownParser(BaseParser):
"""
MarkItDown-based document parser
This parser converts supported document formats to Markdown text.
It uses the markitdown library and keeps image data URIs for downstream processing.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.markitdown = MarkItDown()
def parse_into_text(self, content: bytes) -> Document:
result = self.markitdown.convert(io.BytesIO(content), keep_data_uris=True)
return Document(content=result.text_content)
class MarkitdownParser(PipelineParser):
_parser_cls = (StdMarkitdownParser, MarkdownParser)
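PipelineParser is defined in the same chain_parser.py module that is not shown in this diff; the composition suggests the stages run in sequence: StdMarkitdownParser converts the source to Markdown while keeping images as data URIs (keep_data_uris=True), and MarkdownParser then extracts those base64 images, uploads them through storage, and rewrites the links. A hedged usage sketch (file name and multimodal flag are illustrative):

# Hypothetical usage of the composed parser.
parser = MarkitdownParser(file_name="report.docx", enable_multimodal=True)
with open("report.docx", "rb") as f:
    document = parser.parse_into_text(f.read())
print(document.content[:200])        # Markdown with uploaded image URLs
print(list(document.images.keys()))  # image URL -> base64 payload map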

View File

@@ -0,0 +1,124 @@
import logging
import os
import re
from typing import Dict
import markdownify
import requests
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.markdown_parser import MarkdownImageUtil
from docreader.utils import endecode
logger = logging.getLogger(__name__)
class MinerUParser(BaseParser):
def __init__(
self,
enable_markdownify: bool = True,
mineru_endpoint: str = "",
**kwargs,
):
super().__init__(**kwargs)
self.minerU = os.getenv("MINERU_ENDPOINT", mineru_endpoint)
self.enable_markdownify = enable_markdownify
self.image_helper = MarkdownImageUtil()
self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
self.enable = self.ping()
assert self.enable, "MinerU API is not reachable"
def ping(self, timeout: int = 5) -> bool:
try:
response = requests.get(
self.minerU + "/docs", timeout=timeout, allow_redirects=True
)
response.raise_for_status()
return True
except Exception:
return False
def parse_into_text(self, content: bytes) -> Document:
logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)")
md_content: str = ""
images_b64: Dict[str, str] = {}
try:
response = requests.post(
url=self.minerU + "/file_parse",
data={
"return_md": True,
"return_images": True,
"lang_list": ["ch", "en"],
"table_enable": True,
"formula_enable": True,
"parse_method": "auto",
"start_page_id": 0,
"end_page_id": 99999,
"backend": "pipeline",
"response_format_zip": False,
"return_middle_json": False,
"return_model_output": False,
"return_content_list": False,
},
files={"files": content},
timeout=1000,
)
response.raise_for_status()
result = response.json()["results"]["files"]
md_content = result["md_content"]
images_b64 = result.get("images", {})
except Exception as e:
logger.error(f"MinerU parsing failed: {e}", exc_info=True)
return Document()
# convert table(HTML) in markdown to markdown table
if self.enable_markdownify:
logger.debug("Converting HTML to Markdown")
md_content = markdownify.markdownify(md_content)
images = {}
image_replace = {}
# image in images_bs64 may not be used in md_content
# such as: table ...
# so we need to filter them
for ipath, b64_str in images_b64.items():
if f"images/{ipath}" not in md_content:
logger.debug(f"Image {ipath} not used in markdown")
continue
# Derive a default extension so file_ext is always defined before upload
file_ext = os.path.splitext(ipath)[1].lstrip(".") or "png"
match = self.base64_pattern.match(b64_str)
if match:
file_ext = match.group(1)
b64_str = match.group(2)
image_bytes = endecode.encode_image(b64_str, errors="ignore")
if not image_bytes:
logger.error("Failed to decode base64 image skip it")
continue
image_url = self.storage.upload_bytes(
image_bytes, file_ext=f".{file_ext}"
)
images[image_url] = b64_str
image_replace[f"images/{ipath}"] = image_url
logger.info(f"Replaced {len(image_replace)} images in markdown")
text = self.image_helper.replace_path(md_content, image_replace)
logger.info(
f"Successfully parsed PDF, text: {len(text)}, images: {len(images)}"
)
return Document(content=text, images=images)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
your_file = "/path/to/your/file.pdf"
your_mineru = "http://host.docker.internal:9987"
parser = MinerUParser(mineru_endpoint=your_mineru)
with open(your_file, "rb") as f:
content = f.read()
document = parser.parse_into_text(content)
logger.error(document.content)
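The parsing logic above reads only results.files.md_content and results.files.images from the /file_parse response. A stub of that shape can be useful for testing without a MinerU deployment (field names are taken from the code above; everything else about the real MinerU API is an assumption):

# Hypothetical stub consistent with the fields consumed in parse_into_text.
stub_response = {
    "results": {
        "files": {
            "md_content": "# Title\n\n![](images/0001.png)",
            "images": {"0001.png": "data:image/png;base64,iVBORw0KGgo..."},
        }
    }
}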

View File

@@ -1,71 +1,96 @@
import os
import logging
import base64
from typing import Optional, Union, Dict, Any
from abc import ABC, abstractmethod
from PIL import Image
import io
import logging
import os
import platform
import subprocess
from abc import ABC, abstractmethod
from typing import Dict, Union
import numpy as np
from .image_utils import image_to_base64
from openai import OpenAI
from PIL import Image
from docreader.utils import endecode
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class OCRBackend(ABC):
"""Base class for OCR backends"""
@abstractmethod
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image
Args:
image: Image file path, bytes, or PIL Image object
Returns:
Extracted text
"""
pass
class DummyOCRBackend(OCRBackend):
"""Dummy OCR backend implementation"""
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
logger.warning("Dummy OCR backend is used")
return ""
class PaddleOCRBackend(OCRBackend):
"""PaddleOCR backend implementation"""
def __init__(self, **kwargs):
def __init__(self):
"""Initialize PaddleOCR backend"""
self.ocr = None
try:
import os
import paddle
# Set PaddlePaddle to use CPU and disable GPU
os.environ['CUDA_VISIBLE_DEVICES'] = ''
paddle.set_device('cpu')
os.environ["CUDA_VISIBLE_DEVICES"] = ""
paddle.device.set_device("cpu")
# Try to detect whether the CPU supports the AVX instruction set
try:
import subprocess
import platform
# Detect whether the CPU supports AVX
if platform.system() == "Linux":
try:
result = subprocess.run(['grep', '-o', 'avx', '/proc/cpuinfo'],
capture_output=True, text=True, timeout=5)
has_avx = 'avx' in result.stdout.lower()
result = subprocess.run(
["grep", "-o", "avx", "/proc/cpuinfo"],
capture_output=True,
text=True,
timeout=5,
)
has_avx = "avx" in result.stdout.lower()
if not has_avx:
logger.warning("CPU does not support AVX instructions, using compatibility mode")
logger.warning(
"CPU does not support AVX instructions, "
"using compatibility mode"
)
# Further restrict the instruction sets used
os.environ['FLAGS_use_avx2'] = '0'
os.environ['FLAGS_use_avx'] = '1'
except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError):
logger.warning("Could not detect AVX support, using compatibility mode")
os.environ['FLAGS_use_avx2'] = '0'
os.environ['FLAGS_use_avx'] = '1'
os.environ["FLAGS_use_avx2"] = "0"
os.environ["FLAGS_use_avx"] = "1"
except (
subprocess.TimeoutExpired,
FileNotFoundError,
subprocess.SubprocessError,
):
logger.warning(
"Could not detect AVX support, using compatibility mode"
)
os.environ["FLAGS_use_avx2"] = "0"
os.environ["FLAGS_use_avx"] = "1"
except Exception as e:
logger.warning(f"Error detecting CPU capabilities: {e}, using compatibility mode")
os.environ['FLAGS_use_avx2'] = '0'
os.environ['FLAGS_use_avx'] = '1'
logger.warning(
f"Error detecting CPU capabilities: {e}, using compatibility mode"
)
os.environ["FLAGS_use_avx2"] = "0"
os.environ["FLAGS_use_avx"] = "1"
from paddleocr import PaddleOCR
# OCR configuration with text orientation classification enabled
ocr_config = {
"use_gpu": False,
@@ -86,23 +111,53 @@ class PaddleOCRBackend(OCRBackend):
"use_dilation": True, # improves accuracy
"det_db_score_mode": "slow", # improves accuracy
}
self.ocr = PaddleOCR(**ocr_config)
logger.info("PaddleOCR engine initialized successfully")
except ImportError as e:
logger.error(f"Failed to import paddleocr: {str(e)}. Please install it with 'pip install paddleocr'")
logger.error(
f"Failed to import paddleocr: {str(e)}. "
"Please install it with 'pip install paddleocr'"
)
except OSError as e:
if "Illegal instruction" in str(e) or "core dumped" in str(e):
logger.error(f"PaddlePaddle crashed due to CPU instruction set incompatibility: {str(e)}")
logger.error("This usually happens when the CPU doesn't support AVX instructions.")
logger.error("Please try installing a CPU-only version of PaddlePaddle or use a different OCR backend.")
logger.error(
f"PaddlePaddle crashed due to CPU instruction set incompatibility:"
f"{e}"
)
logger.error(
"This happens when the CPU doesn't support AVX instructions. "
"Try install CPU-only version of PaddlePaddle, "
"or use a different OCR backend."
)
else:
logger.error(f"Failed to initialize PaddleOCR due to OS error: {str(e)}")
logger.error(
f"Failed to initialize PaddleOCR due to OS error: {str(e)}"
)
except Exception as e:
logger.error(f"Failed to initialize PaddleOCR: {str(e)}")
def predict(self, image):
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image
Args:
image: Image file path, bytes, or PIL Image object
Returns:
Extracted text
"""
if isinstance(image, str):
image = Image.open(image)
elif isinstance(image, bytes):
image = Image.open(io.BytesIO(image))
if not isinstance(image, Image.Image):
raise TypeError("image must be a string, bytes, or PIL Image object")
return self._predict(image)
def _predict(self, image: Image.Image) -> str:
"""Perform OCR recognition on the image
Args:
@@ -111,63 +166,59 @@ class PaddleOCRBackend(OCRBackend):
Returns:
Extracted text string
"""
if self.ocr is None:
logger.error("PaddleOCR engine not initialized")
return ""
try:
# Ensure image is in RGB format
if hasattr(image, "convert") and image.mode != "RGB":
if image.mode != "RGB":
image = image.convert("RGB")
# Convert to numpy array if needed
if hasattr(image, "convert"):
image_array = np.array(image)
else:
image_array = image
image_array = np.array(image)
# Perform OCR
ocr_result = self.ocr.ocr(image_array, cls=False)
# Extract text
ocr_text = ""
if ocr_result and ocr_result[0]:
for line in ocr_result[0]:
if line and len(line) >= 2:
text = line[1][0] if line[1] else ""
if text:
ocr_text += text + " "
text_length = len(ocr_text.strip())
if text_length > 0:
logger.info(f"OCR extracted {text_length} characters")
return ocr_text.strip()
else:
logger.warning("OCR returned empty result")
return ""
text = [
line[1][0] if line and len(line) >= 2 and line[1] else ""
for line in ocr_result[0]
]
text = [t.strip() for t in text if t]
ocr_text = " ".join(text)
logger.info(f"OCR extracted {len(ocr_text)} characters")
return ocr_text
except Exception as e:
logger.error(f"OCR recognition error: {str(e)}")
return ""
class NanonetsOCRBackend(OCRBackend):
"""Nanonets OCR backend implementation using OpenAI API format"""
def __init__(self, **kwargs):
def __init__(self):
"""Initialize Nanonets OCR backend
Args:
api_key: API key for OpenAI API
base_url: Base URL for OpenAI API
model: Model name
"""
try:
from openai import OpenAI
self.api_key = kwargs.get("api_key", "123")
self.base_url = kwargs.get("base_url", "http://localhost:8000/v1")
self.model = kwargs.get("model", "nanonets/Nanonets-OCR-s")
self.temperature = kwargs.get("temperature", 0.0)
self.max_tokens = kwargs.get("max_tokens", 15000)
self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
self.prompt = """
## 任务说明
base_url = os.getenv("OCR_API_BASE_URL", "http://localhost:8000/v1")
api_key = os.getenv("OCR_API_KEY", "123")
timeout = 30
self.client = OpenAI(api_key=api_key, base_url=base_url, timeout=timeout)
self.model = os.getenv("OCR_MODEL", "nanonets/Nanonets-OCR-s")
logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
self.temperature = 0.0
self.max_tokens = 15000
self.prompt = """## 任务说明
请从上传的文档中提取文字内容,严格按自然阅读顺序(从上到下,从左到右)输出,并遵循以下格式规范。
@@ -192,33 +243,26 @@ class NanonetsOCRBackend(OCRBackend):
* 不要猜测或补全不确定的链接地址。
"""
logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
except ImportError:
logger.error("Failed to import openai. Please install it with 'pip install openai'")
self.client = None
except Exception as e:
logger.error(f"Failed to initialize Nanonets OCR: {str(e)}")
self.client = None
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image using Nanonets OCR
Args:
image: Image file path, bytes, or PIL Image object
Returns:
Extracted text
"""
if self.client is None:
logger.error("Nanonets OCR client not initialized")
return ""
try:
# Encode image to base64
img_base64 = image_to_base64(image)
img_base64 = endecode.decode_image(image)
if not img_base64:
return ""
# Call Nanonets OCR API
logger.info(f"Calling Nanonets OCR API with model: {self.model}")
response = self.client.chat.completions.create(
@@ -229,7 +273,9 @@ class NanonetsOCRBackend(OCRBackend):
"content": [
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{img_base64}"},
"image_url": {
"url": f"data:image/png;base64,{img_base64}"
},
},
{
"type": "text",
@@ -239,40 +285,43 @@ class NanonetsOCRBackend(OCRBackend):
}
],
temperature=self.temperature,
max_tokens=self.max_tokens
max_tokens=self.max_tokens,
)
return response.choices[0].message.content
return response.choices[0].message.content or ""
except Exception as e:
logger.error(f"Nanonets OCR prediction error: {str(e)}")
return ""
class OCREngine:
"""OCR Engine factory class"""
_instance = None
_instance: Dict[str, OCRBackend] = {}
@classmethod
def get_instance(cls, backend_type="paddle", **kwargs) -> Optional[OCRBackend]:
def get_instance(cls, backend_type: str) -> OCRBackend:
"""Get OCR engine instance
Args:
backend_type: OCR backend type, one of: "paddle", "nanonets"
Returns:
Cached OCR engine instance; unknown backend types fall back to DummyOCRBackend
"""
if cls._instance is None:
logger.info(f"Initializing OCR engine with backend: {backend_type}")
if backend_type.lower() == "paddle":
cls._instance = PaddleOCRBackend(**kwargs)
elif backend_type.lower() == "nanonets":
cls._instance = NanonetsOCRBackend(**kwargs)
else:
logger.error(f"Unknown OCR backend type: {backend_type}")
return None
return cls._instance
backend_type = backend_type.lower()
if cls._instance.get(backend_type):
return cls._instance[backend_type]
logger.info(f"Initializing OCR engine with backend: {backend_type}")
if backend_type == "paddle":
cls._instance[backend_type] = PaddleOCRBackend()
elif backend_type == "nanonets":
cls._instance[backend_type] = NanonetsOCRBackend()
else:
cls._instance[backend_type] = DummyOCRBackend()
return cls._instance[backend_type]
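Backends are now cached per backend_type, so repeated lookups reuse one engine instance. A short usage sketch (the image path is a placeholder; which backend actually works depends on the installed dependencies and the OCR_API_* environment variables above):

# Hypothetical usage of the factory above.
engine = OCREngine.get_instance("paddle")   # unknown types fall back to DummyOCRBackend
with open("/path/to/scan.png", "rb") as f:
    text = engine.predict(f.read())         # accepts a file path, bytes, or a PIL Image
print(text)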

View File

@@ -1,30 +1,19 @@
import logging
from dataclasses import dataclass, field
from typing import Dict, Any, Optional, Type
from typing import Dict, Type
from .base_parser import BaseParser, ParseResult
from .docx_parser import DocxParser
from .doc_parser import DocParser
from .pdf_parser import PDFParser
from .markdown_parser import MarkdownParser
from .text_parser import TextParser
from .image_parser import ImageParser
from .web_parser import WebParser
from .config import ChunkingConfig
import traceback
from docreader.models.document import Document
from docreader.models.read_config import ChunkingConfig
from docreader.parser.base_parser import BaseParser
from docreader.parser.doc_parser import DocParser
from docreader.parser.docx2_parser import Docx2Parser
from docreader.parser.image_parser import ImageParser
from docreader.parser.markdown_parser import MarkdownParser
from docreader.parser.pdf_parser import PDFParser
from docreader.parser.text_parser import TextParser
from docreader.parser.web_parser import WebParser
logger = logging.getLogger(__name__)
@dataclass
class Chunk:
"""
Represents a single text chunk with associated metadata.
Basic unit for document processing and embedding.
"""
content: str # Text content of the chunk
metadata: Dict[str, Any] = None # Associated metadata (source, page number, etc.)
class Parser:
"""
@@ -33,10 +22,9 @@ class Parser:
"""
def __init__(self):
logger.info("Initializing document parser")
# Initialize all parser types
self.parsers: Dict[str, Type[BaseParser]] = {
"docx": DocxParser,
"docx": Docx2Parser,
"doc": DocParser,
"pdf": PDFParser,
"md": MarkdownParser,
@@ -56,8 +44,7 @@ class Parser:
", ".join(self.parsers.keys()),
)
def get_parser(self, file_type: str) -> Optional[Type[BaseParser]]:
def get_parser(self, file_type: str) -> Type[BaseParser]:
"""
Get parser class for the specified file type.
@@ -67,12 +54,9 @@ class Parser:
Returns:
Parser class for the file type, or None if unsupported
"""
file_type = file_type.lower()
parser = self.parsers.get(file_type)
if parser:
logger.info(f"Found parser for file type: {file_type}")
else:
logger.warning(f"No parser found for file type: {file_type}")
parser = self.parsers.get(file_type.lower())
if not parser:
raise ValueError(f"Unsupported file type: {file_type}")
return parser
def parse_file(
@@ -81,7 +65,7 @@ class Parser:
file_type: str,
content: bytes,
config: ChunkingConfig,
) -> Optional[ParseResult]:
) -> Document:
"""
Parse file content using appropriate parser based on file type.
@@ -96,60 +80,41 @@ class Parser:
"""
logger.info(f"Parsing file: {file_name} with type: {file_type}")
logger.info(
f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, "
f"Chunking config: size={config.chunk_size}, "
f"overlap={config.chunk_overlap}, "
f"multimodal={config.enable_multimodal}"
)
parser_instance = None
try:
# Get appropriate parser for file type
cls = self.get_parser(file_type)
if cls is None:
logger.error(f"Unsupported file type: {file_type}")
return None
# Parse file content
logger.info(f"Creating parser instance for {file_type} file")
parser_instance = cls(
file_name=file_name,
file_type=file_type,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
separators=config.separators,
enable_multimodal=config.enable_multimodal,
max_image_size=1920, # Limit image size to 1920px
max_concurrent_tasks=5, # Limit concurrent tasks to 5
chunking_config=config, # Pass the entire chunking config
)
# Get appropriate parser for file type
cls = self.get_parser(file_type)
logger.info(f"Starting to parse file content, size: {len(content)} bytes")
result = parser_instance.parse(content)
# Parse file content
logger.info(f"Creating parser instance for {file_type} file")
parser = cls(
file_name=file_name,
file_type=file_type,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
separators=config.separators,
enable_multimodal=config.enable_multimodal,
max_image_size=1920, # Limit image size to 1920px
max_concurrent_tasks=5, # Limit concurrent tasks to 5
chunking_config=config, # Pass the entire chunking config
)
if result:
logger.info(
f"Successfully parsed file {file_name}, generated {len(result.chunks)} chunks"
)
if result.chunks and len(result.chunks) > 0:
logger.info(
f"First chunk content length: {len(result.chunks[0].content)}"
)
else:
logger.warning(f"Parser returned empty chunks for file: {file_name}")
else:
logger.warning(f"Parser returned None result for file: {file_name}")
logger.info(f"Starting to parse file content, size: {len(content)} bytes")
result = parser.parse(content)
# Return parse results
return result
if not result.content:
logger.warning(f"Parser returned empty content for file: {file_name}")
elif not result.chunks:
logger.warning(f"Parser returned empty chunks for file: {file_name}")
elif result.chunks[0]:
logger.info(f"First chunk content length: {len(result.chunks[0].content)}")
logger.info(f"Parsed file {file_name}, with {len(result.chunks)} chunks")
return result
except Exception as e:
logger.error(f"Error parsing file {file_name}: {str(e)}")
logger.info(f"Detailed traceback: {traceback.format_exc()}")
return None
def parse_url(
self, url: str, title: str, config: ChunkingConfig
) -> Optional[ParseResult]:
def parse_url(self, url: str, title: str, config: ChunkingConfig) -> Document:
"""
Parse content from a URL using the WebParser.
@@ -163,44 +128,31 @@ class Parser:
"""
logger.info(f"Parsing URL: {url}, title: {title}")
logger.info(
f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, "
f"multimodal={config.enable_multimodal}"
f"Chunking config: size={config.chunk_size}, "
f"overlap={config.chunk_overlap}, multimodal={config.enable_multimodal}"
)
parser_instance = None
try:
# Create web parser instance
logger.info("Creating WebParser instance")
parser_instance = WebParser(
title=title,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
separators=config.separators,
enable_multimodal=config.enable_multimodal,
max_image_size=1920, # Limit image size
max_concurrent_tasks=5, # Limit concurrent tasks
chunking_config=config,
)
# Create web parser instance
logger.info("Creating WebParser instance")
parser = WebParser(
title=title,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
separators=config.separators,
enable_multimodal=config.enable_multimodal,
max_image_size=1920, # Limit image size
max_concurrent_tasks=5, # Limit concurrent tasks
chunking_config=config,
)
logger.info(f"Starting to parse URL content")
result = parser_instance.parse(url)
if result:
logger.info(
f"Successfully parsed URL, generated {len(result.chunks)} chunks"
)
logger.info(
f"First chunk content length: {len(result.chunks[0].content) if result.chunks else 0}"
)
else:
logger.warning(f"Parser returned empty result for URL: {url}")
# Return parse results
return result
except Exception as e:
logger.error(f"Error parsing URL {url}: {str(e)}")
logger.info(f"Detailed traceback: {traceback.format_exc()}")
return None
logger.info("Starting to parse URL content")
result = parser.parse(url.encode())
if not result.content:
logger.warning(f"Parser returned empty content for url: {url}")
elif not result.chunks:
logger.warning(f"Parser returned empty chunks for url: {url}")
elif result.chunks[0]:
logger.info(f"First chunk content length: {len(result.chunks[0].content)}")
logger.info(f"Parsed url {url}, with {len(result.chunks)} chunks")
return result
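End to end, parse_file picks the parser for the file type, applies the ChunkingConfig, and returns a Document with chunks. A hedged driver sketch (field values are illustrative and ChunkingConfig may carry more fields than appear in this diff):

# Hypothetical driver; only fields visible in this diff are used.
config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=60,
    separators=["\n\n", "\n", "。"],
    enable_multimodal=False,
)
parser = Parser()
with open("manual.pdf", "rb") as f:
    document = parser.parse_file("manual.pdf", "pdf", f.read(), config)
for chunk in document.chunks:
    print(len(chunk.content))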

View File

@@ -1,113 +1,7 @@
import logging
import os
import io
from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union
from docreader.parser.chain_parser import FirstParser
from docreader.parser.markitdown_parser import MarkitdownParser
from docreader.parser.mineru_parser import MinerUParser
import pdfplumber
import tempfile
from .base_parser import BaseParser
logger = logging.getLogger(__name__)
class PDFParser(BaseParser):
"""
PDF Document Parser
This parser handles PDF documents by extracting text content.
It uses the pypdf library for simple text extraction.
"""
def _convert_table_to_markdown(self, table_data: list) -> str:
if not table_data or not table_data[0]: return ""
def clean_cell(cell):
if cell is None: return ""
return str(cell).replace("\n", " <br> ")
try:
markdown = ""
header = [clean_cell(cell) for cell in table_data[0]]
markdown += "| " + " | ".join(header) + " |\n"
markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
for row in table_data[1:]:
if not row: continue
body_row = [clean_cell(cell) for cell in row]
if len(body_row) != len(header):
logger.warning(f"Skipping malformed table row: {body_row}")
continue
markdown += "| " + " | ".join(body_row) + " |\n"
return markdown
except Exception as e:
logger.error(f"Error converting table to markdown: {e}")
return ""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
logger.info(f"Parsing PDF with pdfplumber, content size: {len(content)} bytes")
all_page_content = []
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
temp_pdf_path = temp_pdf.name
try:
temp_pdf.write(content)
temp_pdf.close()
logger.info(f"PDF content written to temporary file: {temp_pdf_path}")
with pdfplumber.open(temp_pdf_path) as pdf:
logger.info(f"PDF has {len(pdf.pages)} pages")
for page_num, page in enumerate(pdf.pages):
page_content_parts = []
# Try-fallback strategy for table detection
default_settings = { "vertical_strategy": "lines", "horizontal_strategy": "lines" }
found_tables = page.find_tables(default_settings)
if not found_tables:
logger.info(f"Page {page_num+1}: Default strategy found no tables. Trying fallback strategy.")
fallback_settings = { "vertical_strategy": "text", "horizontal_strategy": "lines" }
found_tables = page.find_tables(fallback_settings)
table_bboxes = [table.bbox for table in found_tables]
# Define a filter function that keeps objects NOT inside any table bbox.
def not_within_bboxes(obj):
"""Check if an object is outside all table bounding boxes."""
for bbox in table_bboxes:
# Check if the object's vertical center is within a bbox
if bbox[1] <= (obj["top"] + obj["bottom"]) / 2 <= bbox[3]:
return False # It's inside a table, so we DON'T keep it
return True # It's outside all tables, so we DO keep it
# that contains only the non-table text.
non_table_page = page.filter(not_within_bboxes)
# Now, extract text from this filtered page view.
text = non_table_page.extract_text(x_tolerance=2)
if text:
page_content_parts.append(text)
# Process and append the structured Markdown tables
if found_tables:
logger.info(f"Found {len(found_tables)} tables on page {page_num + 1}")
for table in found_tables:
markdown_table = self._convert_table_to_markdown(table.extract())
page_content_parts.append(f"\n\n{markdown_table}\n\n")
all_page_content.append("".join(page_content_parts))
final_text = "\n\n--- Page Break ---\n\n".join(all_page_content)
logger.info(f"PDF parsing complete. Extracted {len(final_text)} text chars.")
return final_text
except Exception as e:
logger.error(f"Failed to parse PDF document: {str(e)}")
return ""
finally:
# This block is GUARANTEED to execute, preventing resource leaks.
if os.path.exists(temp_pdf_path):
try:
os.remove(temp_pdf_path)
logger.info(f"Temporary file cleaned up: {temp_pdf_path}")
except OSError as e:
logger.error(f"Error removing temporary file {temp_pdf_path}: {e}")
class PDFParser(FirstParser):
_parser_cls = (MinerUParser, MarkitdownParser)

View File

@@ -1,64 +1,68 @@
# -*- coding: utf-8 -*-
import os
import uuid
import logging
import io
import logging
import os
import traceback
import uuid
from abc import ABC, abstractmethod
from typing import Tuple, Optional
from typing import Dict
from qcloud_cos import CosConfig, CosS3Client
from minio import Minio
from qcloud_cos import CosConfig, CosS3Client
from docreader.utils import endecode
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class Storage(ABC):
"""Abstract base class for object storage operations"""
@abstractmethod
def upload_file(self, file_path: str) -> str:
"""Upload file to object storage
Args:
file_path: File path
Returns:
File URL
"""
pass
@abstractmethod
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to object storage
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
pass
class CosStorage(Storage):
"""Tencent Cloud COS storage implementation"""
def __init__(self, storage_config=None):
"""Initialize COS storage
Args:
storage_config: Storage configuration
"""
self.storage_config = storage_config
self.client, self.bucket_name, self.region, self.prefix = self._init_cos_client()
self.client, self.bucket_name, self.region, self.prefix = (
self._init_cos_client()
)
def _init_cos_client(self):
"""Initialize Tencent Cloud COS client"""
try:
# Use provided COS config if available, otherwise fall back to environment variables
# Use provided COS config if available,
# otherwise fall back to environment variables
if self.storage_config and self.storage_config.get("access_key_id") != "":
cos_config = self.storage_config
secret_id = cos_config.get("access_key_id")
@@ -75,15 +79,16 @@ class CosStorage(Storage):
bucket_name = os.getenv("COS_BUCKET_NAME")
appid = os.getenv("COS_APP_ID")
prefix = os.getenv("COS_PATH_PREFIX")
enable_old_domain = (
os.getenv("COS_ENABLE_OLD_DOMAIN", "true").lower() == "true"
)
if not all([secret_id, secret_key, region, bucket_name, appid]):
logger.error(
"Incomplete COS configuration, missing required environment variables"
f"secret_id: {secret_id}, secret_key: {secret_key}, region: {region}, bucket_name: {bucket_name}, appid: {appid}"
"Incomplete COS configuration, missing environment variables"
f"secret_id: {secret_id}, secret_key: {secret_key}, "
f"region: {region}, bucket_name: {bucket_name}, appid: {appid}"
)
return None, None, None, None
@@ -105,27 +110,26 @@ class CosStorage(Storage):
except Exception as e:
logger.error(f"Failed to initialize COS client: {str(e)}")
return None, None, None, None
def _get_download_url(self, bucket_name, region, object_key):
"""Generate COS object URL
Args:
bucket_name: Bucket name
region: Region
object_key: Object key
Returns:
File URL
"""
return f"https://{bucket_name}.cos.{region}.myqcloud.com/{object_key}"
def upload_file(self, file_path: str) -> str:
"""Upload file to Tencent Cloud COS
Args:
file_path: File path
Returns:
File URL
"""
@@ -135,16 +139,16 @@ class CosStorage(Storage):
return ""
# Generate object key, use UUID to avoid conflicts
file_name = os.path.basename(file_path)
object_key = (
f"{self.prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
)
file_ext = os.path.splitext(file_path)[1]
object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
logger.info(f"Generated object key: {object_key}")
# Upload file
logger.info("Attempting to upload file to COS")
response = self.client.upload_file(
Bucket=self.bucket_name, LocalFilePath=file_path, Key=object_key
self.client.upload_file(
Bucket=self.bucket_name,
LocalFilePath=file_path,
Key=object_key,
)
# Get file URL
@@ -156,14 +160,14 @@ class CosStorage(Storage):
except Exception as e:
logger.error(f"Failed to upload file to COS: {str(e)}")
return ""
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to Tencent Cloud COS
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
@@ -171,10 +175,16 @@ class CosStorage(Storage):
logger.info(f"Uploading bytes content to COS, size: {len(content)} bytes")
if not self.client:
return ""
object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.prefix else f"images/{uuid.uuid4().hex}{file_ext}"
object_key = (
f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
if self.prefix
else f"images/{uuid.uuid4().hex}{file_ext}"
)
logger.info(f"Generated object key: {object_key}")
self.client.put_object(Bucket=self.bucket_name, Body=content, Key=object_key)
self.client.put_object(
Bucket=self.bucket_name, Body=content, Key=object_key
)
file_url = self._get_download_url(self.bucket_name, self.region, object_key)
logger.info(f"Successfully uploaded bytes to COS: {file_url}")
return file_url
@@ -186,16 +196,18 @@ class CosStorage(Storage):
class MinioStorage(Storage):
"""MinIO storage implementation"""
def __init__(self, storage_config=None):
"""Initialize MinIO storage
Args:
storage_config: Storage configuration
"""
self.storage_config = storage_config
self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = self._init_minio_client()
self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = (
self._init_minio_client()
)
def _init_minio_client(self):
"""Initialize MinIO client from environment variables or injected config.
@@ -203,58 +215,69 @@ class MinioStorage(Storage):
prefer those values to override envs.
"""
try:
endpoint = os.getenv("MINIO_ENDPOINT")
endpoint = os.getenv("MINIO_ENDPOINT", "")
use_ssl = os.getenv("MINIO_USE_SSL", "false").lower() == "true"
if self.storage_config and self.storage_config.get("bucket_name"):
storage_config = self.storage_config
bucket_name = storage_config.get("bucket_name")
bucket_name = storage_config.get("bucket_name", "")
path_prefix = storage_config.get("path_prefix").strip().strip("/")
access_key = storage_config.get("access_key_id")
secret_key = storage_config.get("secret_access_key")
else:
access_key = os.getenv("MINIO_ACCESS_KEY_ID")
secret_key = os.getenv("MINIO_SECRET_ACCESS_KEY")
bucket_name = os.getenv("MINIO_BUCKET_NAME")
bucket_name = os.getenv("MINIO_BUCKET_NAME", "")
path_prefix = os.getenv("MINIO_PATH_PREFIX", "").strip().strip("/")
if not all([endpoint, access_key, secret_key, bucket_name]):
logger.error("Incomplete MinIO configuration, missing required environment variables")
logger.error(
"Incomplete MinIO configuration, missing environment variables"
)
return None, None, None, None, None
# Initialize client
client = Minio(endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl)
client = Minio(
endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl
)
# Ensure bucket exists
found = client.bucket_exists(bucket_name)
if not found:
client.make_bucket(bucket_name)
policy = '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}' % (bucket_name, bucket_name)
policy = (
'{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}'
% (bucket_name, bucket_name)
)
client.set_bucket_policy(bucket_name, policy)
return client, bucket_name, use_ssl, endpoint, path_prefix
except Exception as e:
logger.error(f"Failed to initialize MinIO client: {str(e)}")
return None, None, None, None, None
def _get_download_url(self, bucket_name: str, object_key: str, use_ssl: bool, endpoint: str, public_endpoint: str = None):
def _get_download_url(self, object_key: str):
"""Construct a public URL for MinIO object.
If MINIO_PUBLIC_ENDPOINT is provided, use it; otherwise fallback to endpoint.
"""
if public_endpoint:
base = public_endpoint
else:
scheme = "https" if use_ssl else "http"
base = f"{scheme}://{endpoint}"
# Path-style URL for MinIO
return f"{base}/{bucket_name}/{object_key}"
# 1. Use public endpoint if provided
endpoint = os.getenv("MINIO_PUBLIC_ENDPOINT")
if endpoint:
return f"{endpoint}/{self.bucket_name}/{object_key}"
# 2. Use SSL if enabled
if self.use_ssl:
return f"https://{self.endpoint}/{self.bucket_name}/{object_key}"
# 3. Use HTTP default
return f"http://{self.endpoint}/{self.bucket_name}/{object_key}"
def upload_file(self, file_path: str) -> str:
"""Upload file to MinIO
Args:
file_path: File path
Returns:
File URL
"""
@@ -265,29 +288,27 @@ class MinioStorage(Storage):
# Generate object key, use UUID to avoid conflicts
file_name = os.path.basename(file_path)
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" if self.path_prefix else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
object_key = (
f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
if self.path_prefix
else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
)
logger.info(f"Generated MinIO object key: {object_key}")
# Upload file
logger.info("Attempting to upload file to MinIO")
with open(file_path, 'rb') as file_data:
with open(file_path, "rb") as file_data:
file_size = os.path.getsize(file_path)
self.client.put_object(
bucket_name=self.bucket_name,
bucket_name=self.bucket_name or "",
object_name=object_key,
data=file_data,
length=file_size,
content_type='application/octet-stream'
content_type="application/octet-stream",
)
# Get file URL
file_url = self._get_download_url(
self.bucket_name,
object_key,
self.use_ssl,
self.endpoint,
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
)
file_url = self._get_download_url(object_key)
logger.info(f"Successfully uploaded file to MinIO: {file_url}")
return file_url
@@ -295,14 +316,14 @@ class MinioStorage(Storage):
except Exception as e:
logger.error(f"Failed to upload file to MinIO: {str(e)}")
return ""
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to MinIO
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
@@ -310,23 +331,21 @@ class MinioStorage(Storage):
logger.info(f"Uploading bytes content to MinIO, size: {len(content)} bytes")
if not self.client:
return ""
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.path_prefix else f"images/{uuid.uuid4().hex}{file_ext}"
object_key = (
f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}"
if self.path_prefix
else f"images/{uuid.uuid4().hex}{file_ext}"
)
logger.info(f"Generated MinIO object key: {object_key}")
self.client.put_object(
self.bucket_name,
object_key,
data=io.BytesIO(content),
length=len(content),
content_type="application/octet-stream"
)
file_url = self._get_download_url(
self.bucket_name,
object_key,
self.use_ssl,
self.endpoint,
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
self.bucket_name or "",
object_key,
data=io.BytesIO(content),
length=len(content),
content_type="application/octet-stream",
)
file_url = self._get_download_url(object_key)
logger.info(f"Successfully uploaded bytes to MinIO: {file_url}")
return file_url
except Exception as e:
@@ -335,26 +354,61 @@ class MinioStorage(Storage):
return ""
def create_storage(storage_config=None) -> Storage:
class LocalStorage(Storage):
"""Local file system storage implementation"""
def __init__(self, storage_config: Dict[str, str] = {}):
self.storage_config = storage_config
base_dir = storage_config.get(
"base_dir", os.getenv("LOCAL_STORAGE_BASE_DIR", "")
)
self.image_dir = os.path.join(base_dir, "images")
os.makedirs(self.image_dir, exist_ok=True)
def upload_file(self, file_path: str) -> str:
logger.info(f"Uploading file to local storage: {file_path}")
return file_path
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
logger.info(f"Uploading file to local storage: {len(content)} bytes")
fname = os.path.join(self.image_dir, f"{uuid.uuid4()}{file_ext}")
with open(fname, "wb") as f:
f.write(content)
return fname
class Base64Storage(Storage):
def upload_file(self, file_path: str) -> str:
logger.info(f"Uploading file to base64 storage: {file_path}")
return file_path
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
logger.info(f"Uploading file to base64 storage: {len(content)} bytes")
file_ext = file_ext.lstrip(".")
return f"data:image/{file_ext};base64,{endecode.decode_image(content)}"
def create_storage(storage_config: Dict[str, str] | None = None) -> Storage:
"""Create a storage instance based on configuration or environment variables
Args:
storage_config: Storage configuration dictionary
Returns:
Storage instance
"""
storage_type = os.getenv("STORAGE_TYPE", "cos").lower()
if storage_config:
storage_type = str(storage_config.get("provider", storage_type)).lower()
logger.info(f"Creating {storage_type} storage instance")
if storage_type == "minio":
return MinioStorage(storage_config)
elif storage_type == "cos":
# Default to COS
return CosStorage(storage_config)
else:
return None
elif storage_type == "local":
return LocalStorage(storage_config or {})
elif storage_type == "base64":
return Base64Storage()
raise ValueError(f"Invalid storage type: {storage_type}")

View File

@@ -1,6 +1,8 @@
import logging
from .base_parser import BaseParser
from typing import Dict, Any, Tuple, Union
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
logger = logging.getLogger(__name__)
@@ -11,7 +13,7 @@ class TextParser(BaseParser):
This parser handles text extraction and chunking from plain text documents.
"""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
def parse_into_text(self, content: bytes) -> Document:
"""
Parse text document content by decoding bytes to string.
@@ -25,20 +27,15 @@ class TextParser(BaseParser):
Parsed text content as string
"""
logger.info(f"Parsing text document, content size: {len(content)} bytes")
text = self.decode_bytes(content)
text = endecode.decode_bytes(content)
logger.info(
f"Successfully parsed text document, extracted {len(text)} characters"
)
return text
return Document(content=text)
if __name__ == "__main__":
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger.info("Running TextParser in standalone mode")
logger = logging.getLogger(__name__)
# Sample text for testing
text = """## 标题1

View File

@@ -1,11 +1,14 @@
from typing import Any, Optional, Tuple, Dict, Union
import os
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from .base_parser import BaseParser, ParseResult
import logging
import asyncio
import logging
import os
from typing import Any
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
logger = logging.getLogger(__name__)
@@ -59,7 +62,7 @@ class WebParser(BaseParser):
# Return empty BeautifulSoup object on error
return BeautifulSoup("", "html.parser")
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
def parse_into_text(self, content: bytes) -> Document:
"""Parse web page
Args:
@@ -78,10 +81,10 @@ class WebParser(BaseParser):
# Run async method
# Handle content possibly being a string
if isinstance(content, bytes):
url = self.decode_bytes(content)
url = endecode.decode_bytes(content)
logger.info(f"Decoded URL from bytes: {url}")
else:
url = content
url = str(content)
logger.info(f"Using content as URL directly: {url}")
logger.info(f"Scraping web page: {url}")
@@ -118,11 +121,11 @@ class WebParser(BaseParser):
logger.info(
f"Web page parsing complete, total content: {len(result)} characters"
)
return result
return Document(content=result)
except Exception as e:
logger.error(f"Error parsing web page: {str(e)}")
return f"Error parsing web page: {str(e)}"
return Document(content=f"Error parsing web page: {str(e)}")
finally:
# Close event loop

View File

@@ -0,0 +1,127 @@
from google.protobuf.internal import containers as _containers
from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from collections.abc import Iterable as _Iterable, Mapping as _Mapping
from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
DESCRIPTOR: _descriptor.FileDescriptor
class StorageProvider(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
__slots__ = ()
STORAGE_PROVIDER_UNSPECIFIED: _ClassVar[StorageProvider]
COS: _ClassVar[StorageProvider]
MINIO: _ClassVar[StorageProvider]
STORAGE_PROVIDER_UNSPECIFIED: StorageProvider
COS: StorageProvider
MINIO: StorageProvider
class StorageConfig(_message.Message):
__slots__ = ("provider", "region", "bucket_name", "access_key_id", "secret_access_key", "app_id", "path_prefix")
PROVIDER_FIELD_NUMBER: _ClassVar[int]
REGION_FIELD_NUMBER: _ClassVar[int]
BUCKET_NAME_FIELD_NUMBER: _ClassVar[int]
ACCESS_KEY_ID_FIELD_NUMBER: _ClassVar[int]
SECRET_ACCESS_KEY_FIELD_NUMBER: _ClassVar[int]
APP_ID_FIELD_NUMBER: _ClassVar[int]
PATH_PREFIX_FIELD_NUMBER: _ClassVar[int]
provider: StorageProvider
region: str
bucket_name: str
access_key_id: str
secret_access_key: str
app_id: str
path_prefix: str
def __init__(self, provider: _Optional[_Union[StorageProvider, str]] = ..., region: _Optional[str] = ..., bucket_name: _Optional[str] = ..., access_key_id: _Optional[str] = ..., secret_access_key: _Optional[str] = ..., app_id: _Optional[str] = ..., path_prefix: _Optional[str] = ...) -> None: ...
class VLMConfig(_message.Message):
__slots__ = ("model_name", "base_url", "api_key", "interface_type")
MODEL_NAME_FIELD_NUMBER: _ClassVar[int]
BASE_URL_FIELD_NUMBER: _ClassVar[int]
API_KEY_FIELD_NUMBER: _ClassVar[int]
INTERFACE_TYPE_FIELD_NUMBER: _ClassVar[int]
model_name: str
base_url: str
api_key: str
interface_type: str
def __init__(self, model_name: _Optional[str] = ..., base_url: _Optional[str] = ..., api_key: _Optional[str] = ..., interface_type: _Optional[str] = ...) -> None: ...
class ReadConfig(_message.Message):
__slots__ = ("chunk_size", "chunk_overlap", "separators", "enable_multimodal", "storage_config", "vlm_config")
CHUNK_SIZE_FIELD_NUMBER: _ClassVar[int]
CHUNK_OVERLAP_FIELD_NUMBER: _ClassVar[int]
SEPARATORS_FIELD_NUMBER: _ClassVar[int]
ENABLE_MULTIMODAL_FIELD_NUMBER: _ClassVar[int]
STORAGE_CONFIG_FIELD_NUMBER: _ClassVar[int]
VLM_CONFIG_FIELD_NUMBER: _ClassVar[int]
chunk_size: int
chunk_overlap: int
separators: _containers.RepeatedScalarFieldContainer[str]
enable_multimodal: bool
storage_config: StorageConfig
vlm_config: VLMConfig
def __init__(self, chunk_size: _Optional[int] = ..., chunk_overlap: _Optional[int] = ..., separators: _Optional[_Iterable[str]] = ..., enable_multimodal: bool = ..., storage_config: _Optional[_Union[StorageConfig, _Mapping]] = ..., vlm_config: _Optional[_Union[VLMConfig, _Mapping]] = ...) -> None: ...
class ReadFromFileRequest(_message.Message):
__slots__ = ("file_content", "file_name", "file_type", "read_config", "request_id")
FILE_CONTENT_FIELD_NUMBER: _ClassVar[int]
FILE_NAME_FIELD_NUMBER: _ClassVar[int]
FILE_TYPE_FIELD_NUMBER: _ClassVar[int]
READ_CONFIG_FIELD_NUMBER: _ClassVar[int]
REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
file_content: bytes
file_name: str
file_type: str
read_config: ReadConfig
request_id: str
def __init__(self, file_content: _Optional[bytes] = ..., file_name: _Optional[str] = ..., file_type: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ...
class ReadFromURLRequest(_message.Message):
__slots__ = ("url", "title", "read_config", "request_id")
URL_FIELD_NUMBER: _ClassVar[int]
TITLE_FIELD_NUMBER: _ClassVar[int]
READ_CONFIG_FIELD_NUMBER: _ClassVar[int]
REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
url: str
title: str
read_config: ReadConfig
request_id: str
def __init__(self, url: _Optional[str] = ..., title: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ...
class Image(_message.Message):
__slots__ = ("url", "caption", "ocr_text", "original_url", "start", "end")
URL_FIELD_NUMBER: _ClassVar[int]
CAPTION_FIELD_NUMBER: _ClassVar[int]
OCR_TEXT_FIELD_NUMBER: _ClassVar[int]
ORIGINAL_URL_FIELD_NUMBER: _ClassVar[int]
START_FIELD_NUMBER: _ClassVar[int]
END_FIELD_NUMBER: _ClassVar[int]
url: str
caption: str
ocr_text: str
original_url: str
start: int
end: int
def __init__(self, url: _Optional[str] = ..., caption: _Optional[str] = ..., ocr_text: _Optional[str] = ..., original_url: _Optional[str] = ..., start: _Optional[int] = ..., end: _Optional[int] = ...) -> None: ...
class Chunk(_message.Message):
__slots__ = ("content", "seq", "start", "end", "images")
CONTENT_FIELD_NUMBER: _ClassVar[int]
SEQ_FIELD_NUMBER: _ClassVar[int]
START_FIELD_NUMBER: _ClassVar[int]
END_FIELD_NUMBER: _ClassVar[int]
IMAGES_FIELD_NUMBER: _ClassVar[int]
content: str
seq: int
start: int
end: int
images: _containers.RepeatedCompositeFieldContainer[Image]
def __init__(self, content: _Optional[str] = ..., seq: _Optional[int] = ..., start: _Optional[int] = ..., end: _Optional[int] = ..., images: _Optional[_Iterable[_Union[Image, _Mapping]]] = ...) -> None: ...
class ReadResponse(_message.Message):
__slots__ = ("chunks", "error")
CHUNKS_FIELD_NUMBER: _ClassVar[int]
ERROR_FIELD_NUMBER: _ClassVar[int]
chunks: _containers.RepeatedCompositeFieldContainer[Chunk]
error: str
def __init__(self, chunks: _Optional[_Iterable[_Union[Chunk, _Mapping]]] = ..., error: _Optional[str] = ...) -> None: ...

View File

@@ -3,7 +3,7 @@
import grpc
import warnings
from . import docreader_pb2 as docreader__pb2
import docreader_pb2 as docreader__pb2
GRPC_GENERATED_VERSION = '1.76.0'
GRPC_VERSION = grpc.__version__

View File

@@ -16,6 +16,7 @@ dependencies = [
"lxml>=6.0.2",
"markdown>=3.10",
"markdownify>=1.2.0",
"markitdown[docx,pdf,xls,xlsx]>=0.1.3",
"minio>=7.2.18",
"mistletoe>=1.5.0",
"ollama>=0.6.0",
@@ -26,6 +27,7 @@ dependencies = [
"pillow>=12.0.0",
"playwright>=1.55.0",
"protobuf>=6.33.0",
"pydantic>=2.12.3",
"pypdf>=6.1.3",
"pypdf2>=3.0.1",
"python-docx>=1.2.0",

View File

@@ -2,13 +2,14 @@
set -x
# Set directories
PROTO_DIR="proto"
PYTHON_OUT="proto"
GO_OUT="proto"
PROTO_DIR="docreader/proto"
PYTHON_OUT="docreader/proto"
GO_OUT="docreader/proto"
# Generate Python code
python3 -m grpc_tools.protoc -I${PROTO_DIR} \
--python_out=${PYTHON_OUT} \
--pyi_out=${PYTHON_OUT} \
--grpc_python_out=${PYTHON_OUT} \
${PROTO_DIR}/docreader.proto
@@ -22,10 +23,10 @@ protoc -I${PROTO_DIR} --go_out=${GO_OUT} \
# Fix Python import paths (macOS-compatible version)
if [ "$(uname)" == "Darwin" ]; then
# macOS version
sed -i '' 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
sed -i '' 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
else
# Linux version
sed -i 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
sed -i 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
fi
echo "Proto files generated successfully!"

View File

@@ -0,0 +1,112 @@
import re
from typing import Callable, Dict, List, Match, Pattern, Union
from pydantic import BaseModel, Field
class HeaderTrackerHook(BaseModel):
"""表头追踪Hook的配置类支持多种场景的表头识别"""
start_pattern: Pattern[str] = Field(
description="表头开始匹配(正则表达式或字符串)"
)
end_pattern: Pattern[str] = Field(description="表头结束匹配(正则表达式或字符串)")
extract_header_fn: Callable[[Match[str]], str] = Field(
default=lambda m: m.group(0),
description="从开始匹配结果中提取表头内容的函数(默认取匹配到的整个内容)",
)
priority: int = Field(default=0, description="优先级(多个配置时,高优先级先匹配)")
case_sensitive: bool = Field(
default=True, description="是否大小写敏感仅当传入字符串pattern时生效"
)
def __init__(
self,
start_pattern: Union[str, Pattern[str]],
end_pattern: Union[str, Pattern[str]],
**kwargs,
):
flags = 0 if kwargs.get("case_sensitive", True) else re.IGNORECASE
if isinstance(start_pattern, str):
start_pattern = re.compile(start_pattern, flags | re.DOTALL)
if isinstance(end_pattern, str):
end_pattern = re.compile(end_pattern, flags | re.DOTALL)
super().__init__(
start_pattern=start_pattern,
end_pattern=end_pattern,
**kwargs,
)
# Default header-hook configs: Markdown tables are enabled; code-block support is kept commented out
DEFAULT_CONFIGS = [
# Code block config (opens with ```, closes with ```)
# HeaderTrackerHook(
# # Code block start (language tag supported)
# start_pattern=r"^\s*```(\w+).*(?!```$)",
# # Code block end
# end_pattern=r"^\s*```.*$",
# extract_header_fn=lambda m: f"```{m.group(1)}" if m.group(1) else "```",
# priority=20, # code blocks take precedence over tables
# case_sensitive=True,
# ),
# Markdown table config (header row followed by a separator row)
HeaderTrackerHook(
# Header row + separator row
start_pattern=r"^\s*(?:\|[^|\n]*)+[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|?[\r\n]+$",
# Blank line or non-table content
end_pattern=r"^\s*$|^\s*[^|\s].*$",
priority=15,
case_sensitive=False,
),
]
DEFAULT_CONFIGS.sort(key=lambda x: -x.priority)
# Hook state data structure
class HeaderTracker(BaseModel):
"""表头追踪 Hook 的状态类"""
header_hook_configs: List[HeaderTrackerHook] = Field(default=DEFAULT_CONFIGS)
active_headers: Dict[int, str] = Field(default_factory=dict)
ended_headers: set[int] = Field(default_factory=set)
def update(self, split: str) -> Dict[int, str]:
"""检测当前split中的表头开始/结束更新Hook状态"""
new_headers: Dict[int, str] = {}
# 1. Check for header end markers
for config in self.header_hook_configs:
if config.priority in self.active_headers and config.end_pattern.search(
split
):
self.ended_headers.add(config.priority)
del self.active_headers[config.priority]
# 2. Check for new header start markers (only configs that are neither active nor already ended)
for config in self.header_hook_configs:
if (
config.priority not in self.active_headers
and config.priority not in self.ended_headers
):
match = config.start_pattern.search(split)
if match:
header = config.extract_header_fn(match)
self.active_headers[config.priority] = header
new_headers[config.priority] = header
# 3. If no headers remain active, clear the ended markers
if not self.active_headers:
self.ended_headers.clear()
return new_headers
def get_headers(self) -> str:
"""获取当前所有活跃表头的拼接文本(按优先级排序)"""
# 按优先级降序排列表头
sorted_headers = sorted(self.active_headers.items(), key=lambda x: -x[0])
return (
"\n".join([header for _, header in sorted_headers])
if sorted_headers
else ""
)
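A small sketch of the intended flow, assuming the default Markdown-table config (exact matches depend on how the splitter cuts the text):

from docreader.splitter.header_hook import HeaderTracker

tracker = HeaderTracker()

# A split containing only a Markdown header row plus its separator row
tracker.update("| Name | Age |\n| --- | --- |\n")
print(tracker.get_headers())  # the table header is now active and will be re-injected

# A split that starts with plain prose ends the active table header
tracker.update("Regular paragraph after the table.")
print(tracker.get_headers())  # -> "" (no active headers)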

View File

@@ -0,0 +1,313 @@
"""Token splitter."""
import itertools
import logging
import re
from typing import Callable, Generic, List, Pattern, Tuple, TypeVar
from pydantic import BaseModel, Field, PrivateAttr
from docreader.splitter.header_hook import (
HeaderTracker,
)
from docreader.utils.split import split_by_char, split_by_sep
DEFAULT_CHUNK_OVERLAP = 100
DEFAULT_CHUNK_SIZE = 512
T = TypeVar("T")
logger = logging.getLogger(__name__)
class TextSplitter(BaseModel, Generic[T]):
chunk_size: int = Field(description="The token chunk size for each chunk.")
chunk_overlap: int = Field(
description="The token overlap of each chunk when splitting."
)
separators: List[str] = Field(
description="Default separators for splitting into words"
)
# Try to keep the matched characters as a whole.
# If it's too long, the content will be further segmented.
protected_regex: List[str] = Field(
description="Protected regex for splitting into words"
)
len_function: Callable[[str], int] = Field(description="The length function.")
# Header tracking Hook related attributes
header_hook: HeaderTracker = Field(default_factory=HeaderTracker, exclude=True)
_protected_fns: List[Pattern] = PrivateAttr()
_split_fns: List[Callable] = PrivateAttr()
def __init__(
self,
chunk_size: int = DEFAULT_CHUNK_SIZE,
chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
separators: List[str] = ["\n", "", " "],
protected_regex: List[str] = [
# math formula
r"\$\$[\s\S]*?\$\$",
# image
r"!\[.*?\]\(.*?\)",
# link
r"\[.*?\]\(.*?\)",
# table header
r"(?:\|[^|\n]*)+\|[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|[\r\n]+",
# table body
r"(?:\|[^|\n]*)+\|[\r\n]+",
# code header
r"```(?:\w+)[\r\n]+[^\r\n]*",
],
length_function: Callable[[str], int] = lambda x: len(x),
):
"""Initialize with parameters."""
if chunk_overlap > chunk_size:
raise ValueError(
f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
f"({chunk_size}), should be smaller."
)
super().__init__(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=separators,
protected_regex=protected_regex,
len_function=length_function,
)
self._protected_fns = [re.compile(reg) for reg in protected_regex]
self._split_fns = [split_by_sep(sep) for sep in separators] + [split_by_char()]
def split_text(self, text: str) -> List[Tuple[int, int, str]]:
"""Split text into chunks."""
if text == "":
return []
splits = self._split(text)
protect = self._split_protected(text)
splits = self._join(splits, protect)
assert "".join(splits) == text
chunks = self._merge(splits)
return chunks
def _split(self, text: str) -> List[str]:
"""Break text into splits that are smaller than chunk size.
NOTE: the splits contain the separators.
"""
if self.len_function(text) <= self.chunk_size:
return [text]
splits = []
for split_fn in self._split_fns:
splits = split_fn(text)
if len(splits) > 1:
break
new_splits = []
for split in splits:
split_len = self.len_function(split)
if split_len <= self.chunk_size:
new_splits.append(split)
else:
# recursively split
new_splits.extend(self._split(split))
return new_splits
def _merge(self, splits: List[str]) -> List[Tuple[int, int, str]]:
"""Merge splits into chunks.
The high-level idea is to keep adding splits to a chunk until we
exceed the chunk size, then we start a new chunk with overlap.
When we start a new chunk, we pop off the first element of the previous
chunk until the total length is less than the chunk size.
"""
chunks: List[Tuple[int, int, str]] = []
cur_chunk: List[Tuple[int, int, str]] = []
cur_headers, cur_len = "", 0
cur_start, cur_end = 0, 0
for split in splits:
cur_end = cur_start + len(split)
split_len = self.len_function(split)
if split_len > self.chunk_size:
logger.error(
f"Got a split of size {split_len}, "
f"larger than chunk size {self.chunk_size}."
)
self.header_hook.update(split)
cur_headers = self.header_hook.get_headers()
cur_headers_len = self.len_function(cur_headers)
if cur_headers_len > self.chunk_size:
logger.error(
f"Got headers of size {cur_headers_len}, "
f"larger than chunk size {self.chunk_size}."
)
cur_headers, cur_headers_len = "", 0
# if we exceed the chunk size after adding the new split, then
# we need to end the current chunk and start a new one
if cur_len + split_len + cur_headers_len > self.chunk_size:
# end the previous chunk
if len(cur_chunk) > 0:
chunks.append(
(
cur_chunk[0][0],
cur_chunk[-1][1],
"".join([c[2] for c in cur_chunk]),
)
)
# start a new chunk with overlap
# keep popping off the first element of the previous chunk until:
# 1. the current chunk length is less than chunk overlap
# 2. the total length is less than chunk size
while cur_chunk and (
cur_len > self.chunk_overlap
or cur_len + split_len + cur_headers_len > self.chunk_size
):
# pop off the first element
first_chunk = cur_chunk.pop(0)
cur_len -= self.len_function(first_chunk[2])
if (
cur_headers
and split_len + cur_headers_len < self.chunk_size
and cur_headers not in split
):
cur_chunk.insert(
0,
(
cur_chunk[0][0] if cur_chunk else cur_start,
cur_chunk[0][1] if cur_chunk else cur_end,
cur_headers,
),
)
cur_len += cur_headers_len
cur_chunk.append((cur_start, cur_end, split))
cur_len += split_len
cur_start = cur_end
# handle the last chunk
assert cur_chunk
if cur_headers and cur_len < self.chunk_size:
cur_chunk.insert(0, (cur_chunk[0][0], cur_chunk[0][1], cur_headers))
chunks.append(
(
cur_chunk[0][0],
cur_chunk[-1][1],
"".join([c[2] for c in cur_chunk]),
)
)
return chunks
def _split_protected(self, text: str) -> List[Tuple[int, str]]:
matches = [
(match.start(), match.end())
for pattern in self._protected_fns
for match in pattern.finditer(text)
]
matches.sort(key=lambda x: (x[0], -x[1]))
res = []
def fold(initial: int, current: Tuple[int, int]) -> int:
if current[0] >= initial:
if current[1] - current[0] < self.chunk_size:
res.append((current[0], text[current[0] : current[1]]))
else:
logger.warning(f"Protected text ignore: {current}")
return max(initial, current[1])
# filter overlapping matches
list(itertools.accumulate(matches, fold, initial=-1))
return res
def _join(self, splits: List[str], protect: List[Tuple[int, str]]) -> List[str]:
"""
Merges and splits elements in splits array based on protected substrings.
The function processes the input splits to ensure all protected substrings
remain as single items. If a protected substring is concatenated with preceding
or following content in any split element, it will be separated from
the adjacent content. The final result maintains the original order of content
while enforcing the integrity of protected substrings.
Key behaviors:
1. Preserves the complete structure of each protected substring
2. Separates protected substrings from any adjacent non-protected content
3. Maintains the original sequence of all content except for necessary
4. Handles cases where protected substrings are partially concatenated
"""
j = 0
point, start = 0, 0
res = []
for split in splits:
end = start + len(split)
cur = split[point - start :]
while j < len(protect):
p_start, p_content = protect[j]
p_end = p_start + len(p_content)
if end <= p_start:
break
if point < p_start:
local_end = p_start - point
res.append(cur[:local_end])
cur = cur[local_end:]
point = p_start
res.append(p_content)
j += 1
if point < p_end:
local_start = p_end - point
cur = cur[local_start:]
point = p_end
if not cur:
break
if cur:
res.append(cur)
point = end
start = end
return res
if __name__ == "__main__":
s = """
这是一些普通文本。
| 姓名 | 年龄 | 城市 |
|------|------|------|
| 张三 | 25 | 北京 |
| 李四 | 30 | 上海 |
| 王五 | 28 | 广州 |
| 张三 | 25 | 北京 |
| 李四 | 30 | 上海 |
| 王五 | 28 | 广州 |
这是文本结束。
"""
sp = TextSplitter(chunk_size=200, chunk_overlap=2)
ck = sp.split_text(s)
for c in ck:
print("------", len(c))
print(c)
pass
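Continuing the demo above, the protected_regex behaviour can be seen with a tighter chunk size (a sketch; chunk boundaries depend on the chosen sizes and length function):

splitter = TextSplitter(chunk_size=40, chunk_overlap=0)
sample = "intro paragraph ![](http://example.com/figure-1.png) closing remark"
for start, end, chunk in splitter.split_text(sample):
    print(start, end, repr(chunk))
# The image link matches a protected regex, so it is never cut in half.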

103
docreader/utils/endecode.py Normal file
View File

@@ -0,0 +1,103 @@
import base64
import binascii
import io
import logging
from typing import List, Union
import numpy as np
from PIL import Image
logger = logging.getLogger(__name__)
def decode_image(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
"""Convert image to base64 encoded string
Args:
image: Image file path, bytes, PIL Image object, or numpy array
Returns:
Base64 encoded image string, or empty string if conversion fails
"""
if isinstance(image, str):
# It's a file path
with open(image, "rb") as image_file:
return base64.b64encode(image_file.read()).decode()
elif isinstance(image, bytes):
# It's bytes data
return base64.b64encode(image).decode()
elif isinstance(image, Image.Image):
# It's a PIL Image
buffer = io.BytesIO()
image.save(buffer, format=image.format)
return base64.b64encode(buffer.getvalue()).decode()
elif isinstance(image, np.ndarray):
# It's a numpy array
pil_image = Image.fromarray(image)
buffer = io.BytesIO()
pil_image.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode()
raise ValueError(f"Unsupported image type: {type(image)}")
def encode_image(image: str, errors="strict") -> bytes:
"""
Decode a base64-encoded image string into raw bytes.
errors
The error handling scheme to use for decoding errors.
The default is 'strict', meaning invalid base64 input raises a
binascii.Error; 'ignore' returns empty bytes instead.
"""
try:
image_bytes = base64.b64decode(image)
except binascii.Error as e:
if errors == "ignore":
return b""
else:
raise e
return image_bytes
def encode_bytes(content: str) -> bytes:
return content.encode()
def decode_bytes(
content: bytes,
encodings: List[str] = [
"utf-8",
"gb18030",
"gb2312",
"gbk",
"big5",
"ascii",
"latin-1",
],
) -> str:
# Try decoding with each encoding format
for encoding in encodings:
try:
text = content.decode(encoding)
logger.debug(f"Decode content with {encoding}: {len(text)} characters")
return text
except UnicodeDecodeError:
continue
text = content.decode(encoding="latin-1", errors="replace")
logger.warning(
"Unable to determine correct encoding, using latin-1 as fallback. "
"This may cause character issues."
)
return text
if __name__ == "__main__":
img = "test![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMgA)test"
encode_image(img, errors="ignore")
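A quick sanity check for the multi-encoding fallback in decode_bytes (the GBK sample is only an illustration):

from docreader.utils.endecode import decode_bytes

# UTF-8 decoding fails on these bytes, so the gb18030 fallback is used
gbk_bytes = "文档解析".encode("gbk")
assert decode_bytes(gbk_bytes) == "文档解析"

# Plain ASCII succeeds on the first (utf-8) attempt
assert decode_bytes(b"hello") == "hello"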

View File

@@ -1,10 +1,10 @@
from contextvars import ContextVar
import logging
import uuid
import contextlib
import logging
import time
from typing import Optional
import uuid
from contextvars import ContextVar
from logging import LogRecord
from typing import Optional
# Configure logging
logger = logging.getLogger(__name__)
@@ -26,21 +26,21 @@ def get_request_id() -> Optional[str]:
class MillisecondFormatter(logging.Formatter):
"""自定义日志格式化器,只显示毫秒级时间戳(3位数字)而不是微秒(6位)"""
def formatTime(self, record, datefmt=None):
"""重写formatTime方法将微秒格式化为毫秒"""
# Get the standard formatted time first
result = super().formatTime(record, datefmt)
# If the format contains .%f, truncate the 6-digit microseconds to 3-digit milliseconds
if datefmt and ".%f" in datefmt:
# The formatted time string should end with a 6-digit microsecond value
parts = result.split('.')
parts = result.split(".")
if len(parts) > 1 and len(parts[1]) >= 6:
# Keep only the first 3 digits as milliseconds
millis = parts[1][:3]
result = f"{parts[0]}.{millis}"
return result

34
docreader/utils/split.py Normal file
View File

@@ -0,0 +1,34 @@
import re
from typing import Callable, List
def split_text_keep_separator(text: str, separator: str) -> List[str]:
"""Split text with separator and keep the separator at the end of each split."""
parts = text.split(separator)
result = [separator + s if i > 0 else s for i, s in enumerate(parts)]
return [s for s in result if s]
def split_by_sep(sep: str, keep_sep: bool = True) -> Callable[[str], List[str]]:
"""Split text by separator."""
if keep_sep:
return lambda text: split_text_keep_separator(text, sep)
else:
return lambda text: text.split(sep)
def split_by_char() -> Callable[[str], List[str]]:
"""Split text by character."""
return lambda text: list(text)
def split_by_regex(regex: str) -> Callable[[str], List[str]]:
"""Split text by regex."""
pattern = re.compile(f"({regex})")
return lambda text: list(filter(None, pattern.split(text)))
def match_by_regex(regex: str) -> Callable[[str], bool]:
"""Split text by regex."""
pattern = re.compile(regex)
return lambda text: bool(pattern.match(text))
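For reference, a short sketch of how these helpers behave:

from docreader.utils.split import match_by_regex, split_by_char, split_by_regex, split_by_sep

split = split_by_sep("\n")            # keeps the separator on the following piece
print(split("a\nb\nc"))               # ['a', '\nb', '\nc']
print(split_by_char()("abc"))         # ['a', 'b', 'c']
by_digits = split_by_regex(r"\d+")    # the matched run stays as its own element
print(by_digits("ab12cd"))            # ['ab', '12', 'cd']
print(match_by_regex(r"^\s*\|")("| a | b |"))  # True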

View File

@@ -0,0 +1,77 @@
import logging
import os
import tempfile
logger = logging.getLogger(__name__)
class TempFileContext:
def __init__(self, file_content: bytes, suffix: str):
"""
Initialize the context
:param file_content: Byte data to write to file
:param suffix: File suffix
"""
self.file_content = file_content
self.suffix = suffix
self.temp_file = None
def __enter__(self):
"""
Create file when entering context
"""
self.temp_file = tempfile.NamedTemporaryFile(suffix=self.suffix, delete=False)
self.temp_file.write(self.file_content)
self.temp_file.flush()
logger.info(
f"Saved {self.suffix} content to temporary file: {self.temp_file.name}"
)
return self.temp_file.name
def __exit__(self, exc_type, exc_val, exc_tb):
"""
Delete file when exiting context
"""
if self.temp_file:
self.temp_file.close()
if os.path.exists(self.temp_file.name):
os.remove(self.temp_file.name)
logger.info(f"File {self.temp_file.name} has been deleted.")
# Return False to propagate exception (if any exception occurred)
return False
class TempDirContext:
def __init__(self):
"""
Initialize the context
"""
self.temp_dir = None
def __enter__(self):
"""
Create directory when entering context
"""
self.temp_dir = tempfile.TemporaryDirectory()
logger.info(f"Created temporary directory: {self.temp_dir.name}")
return self.temp_dir.name
def __exit__(self, exc_type, exc_val, exc_tb):
"""
Delete directory when exiting context
"""
if self.temp_dir and os.path.exists(self.temp_dir.name):
self.temp_dir.cleanup()
logger.info(f"Directory {self.temp_dir.name} has been deleted.")
# Return False to propagate exception (if any exception occurred)
return False
if __name__ == "__main__":
example_bytes = b"Hello, this is a test file."
file_name = "test_file.txt"
# Using with statement
with TempFileContext(example_bytes, file_name) as temp_file:
# File operations can be performed within the context
print(f"Does file {file_name} exist: {os.path.exists(file_name)}")

438
docreader/uv.lock generated
View File

@@ -6,17 +6,22 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
[[package]]
@@ -423,6 +428,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/db/d3/9dcc0f5797f070ec8edf30fbadfb200e71d9db6b84d211e3b2085a7589a0/click-8.3.0-py3-none-any.whl", hash = "sha256:9b9f285302c6e3064f4330c05f05b81945b2a39544279343e6e7c5f27a9baddc", size = 107295, upload-time = "2025-09-18T17:32:22.42Z" },
]
[[package]]
name = "cobble"
version = "0.1.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/54/7a/a507c709be2c96e1bb6102eb7b7f4026c5e5e223ef7d745a17d239e9d844/cobble-0.1.4.tar.gz", hash = "sha256:de38be1539992c8a06e569630717c485a5f91be2192c461ea2b220607dfa78aa", size = 3805, upload-time = "2024-06-01T18:11:09.528Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d5/e1/3714a2f371985215c219c2a70953d38e3eed81ef165aed061d21de0e998b/cobble-0.1.4-py3-none-any.whl", hash = "sha256:36c91b1655e599fd428e2b95fdd5f0da1ca2e9f1abb0bc871dec21a0e78a2b44", size = 3984, upload-time = "2024-06-01T18:11:07.911Z" },
]
[[package]]
name = "colorama"
version = "0.4.6"
@@ -432,6 +446,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
]
[[package]]
name = "coloredlogs"
version = "15.0.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "humanfriendly" },
]
sdist = { url = "https://files.pythonhosted.org/packages/cc/c7/eed8f27100517e8c0e6b923d5f0845d0cb99763da6fdee00478f91db7325/coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0", size = 278520, upload-time = "2021-06-11T10:22:45.202Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018, upload-time = "2021-06-11T10:22:42.561Z" },
]
[[package]]
name = "cos-python-sdk-v5"
version = "1.9.38"
@@ -587,6 +613,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/18/d5/7a04640bf559bb890455ffb28978daf7d44f667c3f04a4d422c655c1ba92/cython-3.1.6-py3-none-any.whl", hash = "sha256:91dcf7eb9b6a089ce4e9e1140e571d84c3bca834afb77ec269be7aa9d31a8157", size = 1223550, upload-time = "2025-10-23T12:38:16.732Z" },
]
[[package]]
name = "defusedxml"
version = "0.7.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/0f/d5/c66da9b79e5bdb124974bfe172b4daf3c984ebd9c2a06e2b8a4dc7331c72/defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69", size = 75520, upload-time = "2021-03-08T10:59:26.269Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" },
]
[[package]]
name = "distro"
version = "1.9.0"
@@ -612,6 +647,7 @@ dependencies = [
{ name = "lxml" },
{ name = "markdown" },
{ name = "markdownify" },
{ name = "markitdown", extra = ["docx", "pdf", "xls", "xlsx"] },
{ name = "minio" },
{ name = "mistletoe" },
{ name = "ollama" },
@@ -622,6 +658,7 @@ dependencies = [
{ name = "pillow" },
{ name = "playwright" },
{ name = "protobuf" },
{ name = "pydantic" },
{ name = "pypdf" },
{ name = "pypdf2" },
{ name = "python-docx" },
@@ -643,6 +680,7 @@ requires-dist = [
{ name = "lxml", specifier = ">=6.0.2" },
{ name = "markdown", specifier = ">=3.10" },
{ name = "markdownify", specifier = ">=1.2.0" },
{ name = "markitdown", extras = ["docx", "pdf", "xls", "xlsx"], specifier = ">=0.1.3" },
{ name = "minio", specifier = ">=7.2.18" },
{ name = "mistletoe", specifier = ">=1.5.0" },
{ name = "ollama", specifier = ">=0.6.0" },
@@ -653,6 +691,7 @@ requires-dist = [
{ name = "pillow", specifier = ">=12.0.0" },
{ name = "playwright", specifier = ">=1.55.0" },
{ name = "protobuf", specifier = ">=6.33.0" },
{ name = "pydantic", specifier = ">=2.12.3" },
{ name = "pypdf", specifier = ">=6.1.3" },
{ name = "pypdf2", specifier = ">=3.0.1" },
{ name = "python-docx", specifier = ">=1.2.0" },
@@ -683,6 +722,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/bf/ee/aa015c5de8b0dc42a8e507eae8c2de5d1c0e068c896858fec6d502402ed6/ebooklib-0.20-py3-none-any.whl", hash = "sha256:fff5322517a37e31c972d27be7d982cc3928c16b3dcc5fd7e8f7c0f5d7bcf42b", size = 40995, upload-time = "2025-10-26T20:56:19.104Z" },
]
[[package]]
name = "et-xmlfile"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" },
]
[[package]]
name = "exceptiongroup"
version = "1.3.0"
@@ -707,6 +755,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e5/4c/93d0f85318da65923e4b91c1c2ff03d8a458cbefebe3bc612a6693c7906d/fire-0.7.1-py3-none-any.whl", hash = "sha256:e43fd8a5033a9001e7e2973bab96070694b9f12f2e0ecf96d4683971b5ab1882", size = 115945, upload-time = "2025-08-16T20:20:22.87Z" },
]
[[package]]
name = "flatbuffers"
version = "25.9.23"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/9d/1f/3ee70b0a55137442038f2a33469cc5fddd7e0ad2abf83d7497c18a2b6923/flatbuffers-25.9.23.tar.gz", hash = "sha256:676f9fa62750bb50cf531b42a0a2a118ad8f7f797a511eda12881c016f093b12", size = 22067, upload-time = "2025-09-24T05:25:30.106Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ee/1b/00a78aa2e8fbd63f9af08c9c19e6deb3d5d66b4dda677a0f61654680ee89/flatbuffers-25.9.23-py2.py3-none-any.whl", hash = "sha256:255538574d6cb6d0a79a17ec8bc0d30985913b87513a01cce8bcdb6b4c44d0e2", size = 30869, upload-time = "2025-09-24T05:25:28.912Z" },
]
[[package]]
name = "fonttools"
version = "4.60.1"
@@ -850,6 +907,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/7f/91/ae2eb6b7979e2f9b035a9f612cf70f1bf54aad4e1d125129bef1eae96f19/greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d", size = 584358, upload-time = "2025-08-07T13:18:23.708Z" },
{ url = "https://files.pythonhosted.org/packages/f7/85/433de0c9c0252b22b16d413c9407e6cb3b41df7389afc366ca204dbc1393/greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5", size = 1113550, upload-time = "2025-08-07T13:42:37.467Z" },
{ url = "https://files.pythonhosted.org/packages/a1/8d/88f3ebd2bc96bf7747093696f4335a0a8a4c5acfcf1b757717c0d2474ba3/greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f", size = 1137126, upload-time = "2025-08-07T13:18:20.239Z" },
{ url = "https://files.pythonhosted.org/packages/f1/29/74242b7d72385e29bcc5563fba67dad94943d7cd03552bac320d597f29b2/greenlet-3.2.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f47617f698838ba98f4ff4189aef02e7343952df3a615f847bb575c3feb177a7", size = 1544904, upload-time = "2025-11-04T12:42:04.763Z" },
{ url = "https://files.pythonhosted.org/packages/c8/e2/1572b8eeab0f77df5f6729d6ab6b141e4a84ee8eb9bc8c1e7918f94eda6d/greenlet-3.2.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af41be48a4f60429d5cad9d22175217805098a9ef7c40bfef44f7669fb9d74d8", size = 1611228, upload-time = "2025-11-04T12:42:08.423Z" },
{ url = "https://files.pythonhosted.org/packages/d6/6f/b60b0291d9623c496638c582297ead61f43c4b72eef5e9c926ef4565ec13/greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c", size = 298654, upload-time = "2025-08-07T13:50:00.469Z" },
{ url = "https://files.pythonhosted.org/packages/a4/de/f28ced0a67749cac23fecb02b694f6473f47686dff6afaa211d186e2ef9c/greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2", size = 272305, upload-time = "2025-08-07T13:15:41.288Z" },
{ url = "https://files.pythonhosted.org/packages/09/16/2c3792cba130000bf2a31c5272999113f4764fd9d874fb257ff588ac779a/greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246", size = 632472, upload-time = "2025-08-07T13:42:55.044Z" },
@@ -859,6 +918,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684, upload-time = "2025-08-07T13:18:25.164Z" },
{ url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647, upload-time = "2025-08-07T13:42:38.655Z" },
{ url = "https://files.pythonhosted.org/packages/3f/cc/b07000438a29ac5cfb2194bfc128151d52f333cee74dd7dfe3fb733fc16c/greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa", size = 1142073, upload-time = "2025-08-07T13:18:21.737Z" },
{ url = "https://files.pythonhosted.org/packages/67/24/28a5b2fa42d12b3d7e5614145f0bd89714c34c08be6aabe39c14dd52db34/greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c", size = 1548385, upload-time = "2025-11-04T12:42:11.067Z" },
{ url = "https://files.pythonhosted.org/packages/6a/05/03f2f0bdd0b0ff9a4f7b99333d57b53a7709c27723ec8123056b084e69cd/greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5", size = 1613329, upload-time = "2025-11-04T12:42:12.928Z" },
{ url = "https://files.pythonhosted.org/packages/d8/0f/30aef242fcab550b0b3520b8e3561156857c94288f0332a79928c31a52cf/greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9", size = 299100, upload-time = "2025-08-07T13:44:12.287Z" },
{ url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" },
{ url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" },
@@ -868,6 +929,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" },
{ url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" },
{ url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" },
{ url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" },
{ url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" },
{ url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" },
{ url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" },
{ url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" },
@@ -877,6 +940,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" },
{ url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" },
{ url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" },
{ url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" },
{ url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" },
{ url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" },
{ url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" },
{ url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" },
@@ -884,6 +949,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" },
{ url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" },
{ url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" },
{ url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" },
{ url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760, upload-time = "2025-11-04T12:42:25.341Z" },
{ url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" },
]
@@ -1061,6 +1128,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
]
[[package]]
name = "humanfriendly"
version = "10.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyreadline3", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/cc/3f/2c29224acb2e2df4d2046e4c73ee2662023c58ff5b113c4c1adac0886c43/humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc", size = 360702, upload-time = "2021-09-17T21:40:43.31Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", size = 86794, upload-time = "2021-09-17T21:40:39.897Z" },
]
[[package]]
name = "idna"
version = "3.11"
@@ -1386,6 +1465,38 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6c/77/d7f491cbc05303ac6801651aabeb262d43f319288c1ea96c66b1d2692ff3/lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e", size = 3518768, upload-time = "2025-09-22T04:04:57.097Z" },
]
[[package]]
name = "magika"
version = "0.6.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "onnxruntime", version = "1.20.1", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "onnxruntime", version = "1.23.2", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "python-dotenv" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a3/f3/3d1dcdd7b9c41d589f5cff252d32ed91cdf86ba84391cfc81d9d8773571d/magika-0.6.3.tar.gz", hash = "sha256:7cc52aa7359af861957043e2bf7265ed4741067251c104532765cd668c0c0cb1", size = 3042784, upload-time = "2025-10-30T15:22:34.499Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a2/e4/35c323beb3280482c94299d61626116856ac2d4ec16ecef50afc4fdd4291/magika-0.6.3-py3-none-any.whl", hash = "sha256:eda443d08006ee495e02083b32e51b98cb3696ab595a7d13900d8e2ef506ec9d", size = 2969474, upload-time = "2025-10-30T15:22:25.298Z" },
{ url = "https://files.pythonhosted.org/packages/25/8f/132b0d7cd51c02c39fd52658a5896276c30c8cc2fd453270b19db8c40f7e/magika-0.6.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:86901e64b05dde5faff408c9b8245495b2e1fd4c226e3393d3d2a3fee65c504b", size = 13358841, upload-time = "2025-10-30T15:22:27.413Z" },
{ url = "https://files.pythonhosted.org/packages/c4/03/5ed859be502903a68b7b393b17ae0283bf34195cfcca79ce2dc25b9290e7/magika-0.6.3-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:3d9661eedbdf445ac9567e97e7ceefb93545d77a6a32858139ea966b5806fb64", size = 15367335, upload-time = "2025-10-30T15:22:29.907Z" },
{ url = "https://files.pythonhosted.org/packages/7b/9e/f8ee7d644affa3b80efdd623a3d75865c8f058f3950cb87fb0c48e3559bc/magika-0.6.3-py3-none-win_amd64.whl", hash = "sha256:e57f75674447b20cab4db928ae58ab264d7d8582b55183a0b876711c2b2787f3", size = 12692831, upload-time = "2025-10-30T15:22:32.063Z" },
]
[[package]]
name = "mammoth"
version = "1.11.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "cobble" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ed/3c/a58418d2af00f2da60d4a51e18cd0311307b72d48d2fffec36a97b4a5e44/mammoth-1.11.0.tar.gz", hash = "sha256:a0f59e442f34d5b6447f4b0999306cbf3e67aaabfa8cb516f878fb1456744637", size = 53142, upload-time = "2025-09-19T10:35:20.373Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ca/54/2e39566a131b13f6d8d193f974cb6a34e81bb7cc2fa6f7e03de067b36588/mammoth-1.11.0-py2.py3-none-any.whl", hash = "sha256:c077ab0d450bd7c0c6ecd529a23bf7e0fa8190c929e28998308ff4eada3f063b", size = 54752, upload-time = "2025-09-19T10:35:18.699Z" },
]
[[package]]
name = "markdown"
version = "3.10"
@@ -1408,6 +1519,41 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6a/e2/7af643acb4cae0741dffffaa7f3f7c9e7ab4046724543ba1777c401d821c/markdownify-1.2.0-py3-none-any.whl", hash = "sha256:48e150a1c4993d4d50f282f725c0111bd9eb25645d41fa2f543708fd44161351", size = 15561, upload-time = "2025-08-09T17:44:14.074Z" },
]
[[package]]
name = "markitdown"
version = "0.1.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "charset-normalizer" },
{ name = "defusedxml" },
{ name = "magika" },
{ name = "markdownify" },
{ name = "onnxruntime", version = "1.20.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'win32'" },
{ name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/87/31/90cef2bc8ecd85c200ed3b3d1e20fc7a724213502685c4b05b5431e02668/markitdown-0.1.3.tar.gz", hash = "sha256:b0d9127c3373a68274dede6af6c9bb0684b78ce364c727c4c304da97a20d6fd9", size = 40039, upload-time = "2025-08-26T22:37:04.4Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/97/83/7b47d2ecbf58650a03aeeb21ba2d59175f202bf4fb81d44f40f1deb82bc0/markitdown-0.1.3-py3-none-any.whl", hash = "sha256:08d9a25770979d78f60dcc0afcb868de6799608e4db65342b2e03304fb091251", size = 58391, upload-time = "2025-08-26T22:37:02.924Z" },
]
[package.optional-dependencies]
docx = [
{ name = "lxml" },
{ name = "mammoth" },
]
pdf = [
{ name = "pdfminer-six" },
]
xls = [
{ name = "pandas" },
{ name = "xlrd" },
]
xlsx = [
{ name = "openpyxl" },
{ name = "pandas" },
]
[[package]]
name = "minio"
version = "7.2.18"
@@ -1433,6 +1579,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3f/c2/bd0fc48dc323035cd65d21c45a3bb5c7b054001bf4ea75e461beb7656e07/mistletoe-1.5.0-py3-none-any.whl", hash = "sha256:d4e77b991b998c5efe3c4eab4e1b472263b5349688acd50d79bd9a6c317a9df1", size = 55262, upload-time = "2025-10-18T16:37:08.376Z" },
]
[[package]]
name = "mpmath"
version = "1.3.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" },
]
[[package]]
name = "networkx"
version = "3.4.2"
@@ -1440,7 +1595,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" }
wheels = [
@@ -1456,14 +1612,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" }
wheels = [
@@ -1492,7 +1652,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" }
wheels = [
@@ -1561,14 +1722,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
sdist = { url = "https://files.pythonhosted.org/packages/b5/f4/098d2270d52b41f1bd7db9fc288aaa0400cb48c2a3e2af6fa365d9720947/numpy-2.3.4.tar.gz", hash = "sha256:a7d018bfedb375a8d979ac758b120ba846a7fe764911a64465fd87b8729f4a6a", size = 20582187, upload-time = "2025-10-15T16:18:11.77Z" }
wheels = [
@@ -1660,6 +1825,97 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b5/c1/edc9f41b425ca40b26b7c104c5f6841a4537bb2552bfa6ca66e81405bb95/ollama-0.6.0-py3-none-any.whl", hash = "sha256:534511b3ccea2dff419ae06c3b58d7f217c55be7897c8ce5868dfb6b219cf7a0", size = 14130, upload-time = "2025-09-24T22:46:01.19Z" },
]
[[package]]
name = "onnxruntime"
version = "1.20.1"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "coloredlogs", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "flatbuffers", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "packaging", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "protobuf", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "sympy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/6d/c6/c4c0860bee2fde6037bdd9dcd12d323f6e38cf00fcc9a5065b394337fc55/onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7b2908b50101a19e99c4d4e97ebb9905561daf61829403061c1adc1b588bc0de", size = 11954010, upload-time = "2024-11-21T00:48:35.254Z" },
{ url = "https://files.pythonhosted.org/packages/63/47/3dc0b075ab539f16b3d8b09df6b504f51836086ee709690a6278d791737d/onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d82daaec24045a2e87598b8ac2b417b1cce623244e80e663882e9fe1aae86410", size = 13330452, upload-time = "2024-11-21T00:48:40.02Z" },
{ url = "https://files.pythonhosted.org/packages/27/ef/80fab86289ecc01a734b7ddf115dfb93d8b2e004bd1e1977e12881c72b12/onnxruntime-1.20.1-cp310-cp310-win32.whl", hash = "sha256:4c4b251a725a3b8cf2aab284f7d940c26094ecd9d442f07dd81ab5470e99b83f", size = 9813849, upload-time = "2024-11-21T00:48:43.569Z" },
{ url = "https://files.pythonhosted.org/packages/a9/e6/33ab10066c9875a29d55e66ae97c3bf91b9b9b987179455d67c32261a49c/onnxruntime-1.20.1-cp310-cp310-win_amd64.whl", hash = "sha256:d3b616bb53a77a9463707bb313637223380fc327f5064c9a782e8ec69c22e6a2", size = 11329702, upload-time = "2024-11-21T00:48:46.599Z" },
{ url = "https://files.pythonhosted.org/packages/a5/da/c44bf9bd66cd6d9018a921f053f28d819445c4d84b4dd4777271b0fe52a2/onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f6243e34d74423bdd1edf0ae9596dd61023b260f546ee17d701723915f06a9f7", size = 11955227, upload-time = "2024-11-21T00:48:54.556Z" },
{ url = "https://files.pythonhosted.org/packages/11/ac/4120dfb74c8e45cce1c664fc7f7ce010edd587ba67ac41489f7432eb9381/onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5eec64c0269dcdb8d9a9a53dc4d64f87b9e0c19801d9321246a53b7eb5a7d1bc", size = 13331703, upload-time = "2024-11-21T00:48:57.97Z" },
{ url = "https://files.pythonhosted.org/packages/12/f1/cefacac137f7bb7bfba57c50c478150fcd3c54aca72762ac2c05ce0532c1/onnxruntime-1.20.1-cp311-cp311-win32.whl", hash = "sha256:a19bc6e8c70e2485a1725b3d517a2319603acc14c1f1a017dda0afe6d4665b41", size = 9813977, upload-time = "2024-11-21T00:49:00.519Z" },
{ url = "https://files.pythonhosted.org/packages/2c/2d/2d4d202c0bcfb3a4cc2b171abb9328672d7f91d7af9ea52572722c6d8d96/onnxruntime-1.20.1-cp311-cp311-win_amd64.whl", hash = "sha256:8508887eb1c5f9537a4071768723ec7c30c28eb2518a00d0adcd32c89dea3221", size = 11329895, upload-time = "2024-11-21T00:49:03.845Z" },
{ url = "https://files.pythonhosted.org/packages/c5/9d/a42a84e10f1744dd27c6f2f9280cc3fb98f869dd19b7cd042e391ee2ab61/onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1f56e898815963d6dc4ee1c35fc6c36506466eff6d16f3cb9848cea4e8c8172", size = 11952833, upload-time = "2024-11-21T00:49:10.563Z" },
{ url = "https://files.pythonhosted.org/packages/47/42/2f71f5680834688a9c81becbe5c5bb996fd33eaed5c66ae0606c3b1d6a02/onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bb71a814f66517a65628c9e4a2bb530a6edd2cd5d87ffa0af0f6f773a027d99e", size = 13333903, upload-time = "2024-11-21T00:49:12.984Z" },
{ url = "https://files.pythonhosted.org/packages/c8/f1/aabfdf91d013320aa2fc46cf43c88ca0182860ff15df872b4552254a9680/onnxruntime-1.20.1-cp312-cp312-win32.whl", hash = "sha256:bd386cc9ee5f686ee8a75ba74037750aca55183085bf1941da8efcfe12d5b120", size = 9814562, upload-time = "2024-11-21T00:49:15.453Z" },
{ url = "https://files.pythonhosted.org/packages/dd/80/76979e0b744307d488c79e41051117634b956612cc731f1028eb17ee7294/onnxruntime-1.20.1-cp312-cp312-win_amd64.whl", hash = "sha256:19c2d843eb074f385e8bbb753a40df780511061a63f9def1b216bf53860223fb", size = 11331482, upload-time = "2024-11-21T00:49:19.412Z" },
{ url = "https://files.pythonhosted.org/packages/81/0d/13bbd9489be2a6944f4a940084bfe388f1100472f38c07080a46fbd4ab96/onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fb44b08e017a648924dbe91b82d89b0c105b1adcfe31e90d1dc06b8677ad37be", size = 11951459, upload-time = "2024-11-21T00:49:26.269Z" },
{ url = "https://files.pythonhosted.org/packages/c0/ea/4454ae122874fd52bbb8a961262de81c5f932edeb1b72217f594c700d6ef/onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bda6aebdf7917c1d811f21d41633df00c58aff2bef2f598f69289c1f1dabc4b3", size = 13331620, upload-time = "2024-11-21T00:49:28.875Z" },
{ url = "https://files.pythonhosted.org/packages/d8/e0/50db43188ca1c945decaa8fc2a024c33446d31afed40149897d4f9de505f/onnxruntime-1.20.1-cp313-cp313-win_amd64.whl", hash = "sha256:d30367df7e70f1d9fc5a6a68106f5961686d39b54d3221f760085524e8d38e16", size = 11331758, upload-time = "2024-11-21T00:49:31.417Z" },
{ url = "https://files.pythonhosted.org/packages/d8/55/3821c5fd60b52a6c82a00bba18531793c93c4addfe64fbf061e235c5617a/onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9158465745423b2b5d97ed25aa7740c7d38d2993ee2e5c3bfacb0c4145c49d8", size = 11950342, upload-time = "2024-11-21T00:49:34.164Z" },
{ url = "https://files.pythonhosted.org/packages/14/56/fd990ca222cef4f9f4a9400567b9a15b220dee2eafffb16b2adbc55c8281/onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0df6f2df83d61f46e842dbcde610ede27218947c33e994545a22333491e72a3b", size = 13337040, upload-time = "2024-11-21T00:49:37.271Z" },
]
[[package]]
name = "onnxruntime"
version = "1.23.2"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version >= '3.14' and sys_platform == 'darwin'",
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
]
dependencies = [
{ name = "coloredlogs", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "flatbuffers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform == 'darwin')" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.11' and sys_platform == 'darwin')" },
{ name = "packaging", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "protobuf", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "sympy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/35/d6/311b1afea060015b56c742f3531168c1644650767f27ef40062569960587/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:a7730122afe186a784660f6ec5807138bf9d792fa1df76556b27307ea9ebcbe3", size = 17195934, upload-time = "2025-10-27T23:06:14.143Z" },
{ url = "https://files.pythonhosted.org/packages/db/db/81bf3d7cecfbfed9092b6b4052e857a769d62ed90561b410014e0aae18db/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:b28740f4ecef1738ea8f807461dd541b8287d5650b5be33bca7b474e3cbd1f36", size = 19153079, upload-time = "2025-10-27T23:05:57.686Z" },
{ url = "https://files.pythonhosted.org/packages/2e/4d/a382452b17cf70a2313153c520ea4c96ab670c996cb3a95cc5d5ac7bfdac/onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8f7d1fe034090a1e371b7f3ca9d3ccae2fabae8c1d8844fb7371d1ea38e8e8d2", size = 15219883, upload-time = "2025-10-22T03:46:21.66Z" },
{ url = "https://files.pythonhosted.org/packages/fb/56/179bf90679984c85b417664c26aae4f427cba7514bd2d65c43b181b7b08b/onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ca88747e708e5c67337b0f65eed4b7d0dd70d22ac332038c9fc4635760018f7", size = 17370357, upload-time = "2025-10-22T03:46:57.968Z" },
{ url = "https://files.pythonhosted.org/packages/44/be/467b00f09061572f022ffd17e49e49e5a7a789056bad95b54dfd3bee73ff/onnxruntime-1.23.2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:6f91d2c9b0965e86827a5ba01531d5b669770b01775b23199565d6c1f136616c", size = 17196113, upload-time = "2025-10-22T03:47:33.526Z" },
{ url = "https://files.pythonhosted.org/packages/9f/a8/3c23a8f75f93122d2b3410bfb74d06d0f8da4ac663185f91866b03f7da1b/onnxruntime-1.23.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:87d8b6eaf0fbeb6835a60a4265fde7a3b60157cf1b2764773ac47237b4d48612", size = 19153857, upload-time = "2025-10-22T03:46:37.578Z" },
{ url = "https://files.pythonhosted.org/packages/3f/d8/506eed9af03d86f8db4880a4c47cd0dffee973ef7e4f4cff9f1d4bcf7d22/onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bbfd2fca76c855317568c1b36a885ddea2272c13cb0e395002c402f2360429a6", size = 15220095, upload-time = "2025-10-22T03:46:24.769Z" },
{ url = "https://files.pythonhosted.org/packages/e9/80/113381ba832d5e777accedc6cb41d10f9eca82321ae31ebb6bcede530cea/onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da44b99206e77734c5819aa2142c69e64f3b46edc3bd314f6a45a932defc0b3e", size = 17372080, upload-time = "2025-10-22T03:47:00.265Z" },
{ url = "https://files.pythonhosted.org/packages/1b/9e/f748cd64161213adeef83d0cb16cb8ace1e62fa501033acdd9f9341fff57/onnxruntime-1.23.2-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:b8f029a6b98d3cf5be564d52802bb50a8489ab73409fa9db0bf583eabb7c2321", size = 17195929, upload-time = "2025-10-22T03:47:36.24Z" },
{ url = "https://files.pythonhosted.org/packages/91/9d/a81aafd899b900101988ead7fb14974c8a58695338ab6a0f3d6b0100f30b/onnxruntime-1.23.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:218295a8acae83905f6f1aed8cacb8e3eb3bd7513a13fe4ba3b2664a19fc4a6b", size = 19157705, upload-time = "2025-10-22T03:46:40.415Z" },
{ url = "https://files.pythonhosted.org/packages/3c/35/4e40f2fba272a6698d62be2cd21ddc3675edfc1a4b9ddefcc4648f115315/onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76ff670550dc23e58ea9bc53b5149b99a44e63b34b524f7b8547469aaa0dcb8c", size = 15226915, upload-time = "2025-10-22T03:46:27.773Z" },
{ url = "https://files.pythonhosted.org/packages/ef/88/9cc25d2bafe6bc0d4d3c1db3ade98196d5b355c0b273e6a5dc09c5d5d0d5/onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f9b4ae77f8e3c9bee50c27bc1beede83f786fe1d52e99ac85aa8d65a01e9b77", size = 17382649, upload-time = "2025-10-22T03:47:02.782Z" },
{ url = "https://files.pythonhosted.org/packages/3d/41/fba0cabccecefe4a1b5fc8020c44febb334637f133acefc7ec492029dd2c/onnxruntime-1.23.2-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:2ff531ad8496281b4297f32b83b01cdd719617e2351ffe0dba5684fb283afa1f", size = 17196337, upload-time = "2025-10-22T03:46:35.168Z" },
{ url = "https://files.pythonhosted.org/packages/fe/f9/2d49ca491c6a986acce9f1d1d5fc2099108958cc1710c28e89a032c9cfe9/onnxruntime-1.23.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:162f4ca894ec3de1a6fd53589e511e06ecdc3ff646849b62a9da7489dee9ce95", size = 19157691, upload-time = "2025-10-22T03:46:43.518Z" },
{ url = "https://files.pythonhosted.org/packages/1c/a1/428ee29c6eaf09a6f6be56f836213f104618fb35ac6cc586ff0f477263eb/onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45d127d6e1e9b99d1ebeae9bcd8f98617a812f53f46699eafeb976275744826b", size = 15226898, upload-time = "2025-10-22T03:46:30.039Z" },
{ url = "https://files.pythonhosted.org/packages/f2/2b/b57c8a2466a3126dbe0a792f56ad7290949b02f47b86216cd47d857e4b77/onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8bace4e0d46480fbeeb7bbe1ffe1f080e6663a42d1086ff95c1551f2d39e7872", size = 17382518, upload-time = "2025-10-22T03:47:05.407Z" },
{ url = "https://files.pythonhosted.org/packages/7c/3d/6830fa61c69ca8e905f237001dbfc01689a4e4ab06147020a4518318881f/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9d2385e774f46ac38f02b3a91a91e30263d41b2f1f4f26ae34805b2a9ddef466", size = 15229610, upload-time = "2025-10-22T03:46:32.239Z" },
{ url = "https://files.pythonhosted.org/packages/b6/ca/862b1e7a639460f0ca25fd5b6135fb42cf9deea86d398a92e44dfda2279d/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e2b9233c4947907fd1818d0e581c049c41ccc39b2856cc942ff6d26317cee145", size = 17394184, upload-time = "2025-10-22T03:47:08.127Z" },
]
[[package]]
name = "openai"
version = "2.7.1"
@@ -1733,6 +1989,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/86/8a/69176a64335aed183529207ba8bc3d329c2999d852b4f3818027203f50e6/opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca", size = 39402386, upload-time = "2025-01-16T13:52:56.418Z" },
]
[[package]]
name = "openpyxl"
version = "3.1.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "et-xmlfile" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" },
]
[[package]]
name = "opt-einsum"
version = "3.3.0"
@@ -1821,6 +2089,68 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/a3/6f/042d73453ad01a2e3dd4adeae4870e25679d9eaa340d70bd54cd409cb982/paddlepaddle-3.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:d16fd322246c4a580ca512971840ac8c16126a77d59a7bceee165d8f2196a6b2", size = 101708504, upload-time = "2025-10-30T13:21:18.125Z" },
]
[[package]]
name = "pandas"
version = "2.3.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "python-dateutil" },
{ name = "pytz" },
{ name = "tzdata" },
]
sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/3d/f7/f425a00df4fcc22b292c6895c6831c0c8ae1d9fac1e024d16f98a9ce8749/pandas-2.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c", size = 11555763, upload-time = "2025-09-29T23:16:53.287Z" },
{ url = "https://files.pythonhosted.org/packages/13/4f/66d99628ff8ce7857aca52fed8f0066ce209f96be2fede6cef9f84e8d04f/pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a", size = 10801217, upload-time = "2025-09-29T23:17:04.522Z" },
{ url = "https://files.pythonhosted.org/packages/1d/03/3fc4a529a7710f890a239cc496fc6d50ad4a0995657dccc1d64695adb9f4/pandas-2.3.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5caf26f64126b6c7aec964f74266f435afef1c1b13da3b0636c7518a1fa3e2b1", size = 12148791, upload-time = "2025-09-29T23:17:18.444Z" },
{ url = "https://files.pythonhosted.org/packages/40/a8/4dac1f8f8235e5d25b9955d02ff6f29396191d4e665d71122c3722ca83c5/pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd7478f1463441ae4ca7308a70e90b33470fa593429f9d4c578dd00d1fa78838", size = 12769373, upload-time = "2025-09-29T23:17:35.846Z" },
{ url = "https://files.pythonhosted.org/packages/df/91/82cc5169b6b25440a7fc0ef3a694582418d875c8e3ebf796a6d6470aa578/pandas-2.3.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4793891684806ae50d1288c9bae9330293ab4e083ccd1c5e383c34549c6e4250", size = 13200444, upload-time = "2025-09-29T23:17:49.341Z" },
{ url = "https://files.pythonhosted.org/packages/10/ae/89b3283800ab58f7af2952704078555fa60c807fff764395bb57ea0b0dbd/pandas-2.3.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:28083c648d9a99a5dd035ec125d42439c6c1c525098c58af0fc38dd1a7a1b3d4", size = 13858459, upload-time = "2025-09-29T23:18:03.722Z" },
{ url = "https://files.pythonhosted.org/packages/85/72/530900610650f54a35a19476eca5104f38555afccda1aa11a92ee14cb21d/pandas-2.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:503cf027cf9940d2ceaa1a93cfb5f8c8c7e6e90720a2850378f0b3f3b1e06826", size = 11346086, upload-time = "2025-09-29T23:18:18.505Z" },
{ url = "https://files.pythonhosted.org/packages/c1/fa/7ac648108144a095b4fb6aa3de1954689f7af60a14cf25583f4960ecb878/pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523", size = 11578790, upload-time = "2025-09-29T23:18:30.065Z" },
{ url = "https://files.pythonhosted.org/packages/9b/35/74442388c6cf008882d4d4bdfc4109be87e9b8b7ccd097ad1e7f006e2e95/pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45", size = 10833831, upload-time = "2025-09-29T23:38:56.071Z" },
{ url = "https://files.pythonhosted.org/packages/fe/e4/de154cbfeee13383ad58d23017da99390b91d73f8c11856f2095e813201b/pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66", size = 12199267, upload-time = "2025-09-29T23:18:41.627Z" },
{ url = "https://files.pythonhosted.org/packages/bf/c9/63f8d545568d9ab91476b1818b4741f521646cbdd151c6efebf40d6de6f7/pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b", size = 12789281, upload-time = "2025-09-29T23:18:56.834Z" },
{ url = "https://files.pythonhosted.org/packages/f2/00/a5ac8c7a0e67fd1a6059e40aa08fa1c52cc00709077d2300e210c3ce0322/pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791", size = 13240453, upload-time = "2025-09-29T23:19:09.247Z" },
{ url = "https://files.pythonhosted.org/packages/27/4d/5c23a5bc7bd209231618dd9e606ce076272c9bc4f12023a70e03a86b4067/pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151", size = 13890361, upload-time = "2025-09-29T23:19:25.342Z" },
{ url = "https://files.pythonhosted.org/packages/8e/59/712db1d7040520de7a4965df15b774348980e6df45c129b8c64d0dbe74ef/pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c", size = 11348702, upload-time = "2025-09-29T23:19:38.296Z" },
{ url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" },
{ url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" },
{ url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" },
{ url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" },
{ url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" },
{ url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" },
{ url = "https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" },
{ url = "https://files.pythonhosted.org/packages/cd/4b/18b035ee18f97c1040d94debd8f2e737000ad70ccc8f5513f4eefad75f4b/pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713", size = 11544671, upload-time = "2025-09-29T23:21:05.024Z" },
{ url = "https://files.pythonhosted.org/packages/31/94/72fac03573102779920099bcac1c3b05975c2cb5f01eac609faf34bed1ca/pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8", size = 10680807, upload-time = "2025-09-29T23:21:15.979Z" },
{ url = "https://files.pythonhosted.org/packages/16/87/9472cf4a487d848476865321de18cc8c920b8cab98453ab79dbbc98db63a/pandas-2.3.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d", size = 11709872, upload-time = "2025-09-29T23:21:27.165Z" },
{ url = "https://files.pythonhosted.org/packages/15/07/284f757f63f8a8d69ed4472bfd85122bd086e637bf4ed09de572d575a693/pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac", size = 12306371, upload-time = "2025-09-29T23:21:40.532Z" },
{ url = "https://files.pythonhosted.org/packages/33/81/a3afc88fca4aa925804a27d2676d22dcd2031c2ebe08aabd0ae55b9ff282/pandas-2.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c", size = 12765333, upload-time = "2025-09-29T23:21:55.77Z" },
{ url = "https://files.pythonhosted.org/packages/8d/0f/b4d4ae743a83742f1153464cf1a8ecfafc3ac59722a0b5c8602310cb7158/pandas-2.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493", size = 13418120, upload-time = "2025-09-29T23:22:10.109Z" },
{ url = "https://files.pythonhosted.org/packages/4f/c7/e54682c96a895d0c808453269e0b5928a07a127a15704fedb643e9b0a4c8/pandas-2.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee", size = 10993991, upload-time = "2025-09-29T23:25:04.889Z" },
{ url = "https://files.pythonhosted.org/packages/f9/ca/3f8d4f49740799189e1395812f3bf23b5e8fc7c190827d55a610da72ce55/pandas-2.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5", size = 12048227, upload-time = "2025-09-29T23:22:24.343Z" },
{ url = "https://files.pythonhosted.org/packages/0e/5a/f43efec3e8c0cc92c4663ccad372dbdff72b60bdb56b2749f04aa1d07d7e/pandas-2.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21", size = 11411056, upload-time = "2025-09-29T23:22:37.762Z" },
{ url = "https://files.pythonhosted.org/packages/46/b1/85331edfc591208c9d1a63a06baa67b21d332e63b7a591a5ba42a10bb507/pandas-2.3.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78", size = 11645189, upload-time = "2025-09-29T23:22:51.688Z" },
{ url = "https://files.pythonhosted.org/packages/44/23/78d645adc35d94d1ac4f2a3c4112ab6f5b8999f4898b8cdf01252f8df4a9/pandas-2.3.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110", size = 12121912, upload-time = "2025-09-29T23:23:05.042Z" },
{ url = "https://files.pythonhosted.org/packages/53/da/d10013df5e6aaef6b425aa0c32e1fc1f3e431e4bcabd420517dceadce354/pandas-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86", size = 12712160, upload-time = "2025-09-29T23:23:28.57Z" },
{ url = "https://files.pythonhosted.org/packages/bd/17/e756653095a083d8a37cbd816cb87148debcfcd920129b25f99dd8d04271/pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc", size = 13199233, upload-time = "2025-09-29T23:24:24.876Z" },
{ url = "https://files.pythonhosted.org/packages/04/fd/74903979833db8390b73b3a8a7d30d146d710bd32703724dd9083950386f/pandas-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0", size = 11540635, upload-time = "2025-09-29T23:25:52.486Z" },
{ url = "https://files.pythonhosted.org/packages/21/00/266d6b357ad5e6d3ad55093a7e8efc7dd245f5a842b584db9f30b0f0a287/pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593", size = 10759079, upload-time = "2025-09-29T23:26:33.204Z" },
{ url = "https://files.pythonhosted.org/packages/ca/05/d01ef80a7a3a12b2f8bbf16daba1e17c98a2f039cbc8e2f77a2c5a63d382/pandas-2.3.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c", size = 11814049, upload-time = "2025-09-29T23:27:15.384Z" },
{ url = "https://files.pythonhosted.org/packages/15/b2/0e62f78c0c5ba7e3d2c5945a82456f4fac76c480940f805e0b97fcbc2f65/pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b", size = 12332638, upload-time = "2025-09-29T23:27:51.625Z" },
{ url = "https://files.pythonhosted.org/packages/c5/33/dd70400631b62b9b29c3c93d2feee1d0964dc2bae2e5ad7a6c73a7f25325/pandas-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6", size = 12886834, upload-time = "2025-09-29T23:28:21.289Z" },
{ url = "https://files.pythonhosted.org/packages/d3/18/b5d48f55821228d0d2692b34fd5034bb185e854bdb592e9c640f6290e012/pandas-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3", size = 13409925, upload-time = "2025-09-29T23:28:58.261Z" },
{ url = "https://files.pythonhosted.org/packages/a6/3d/124ac75fcd0ecc09b8fdccb0246ef65e35b012030defb0e0eba2cbbbe948/pandas-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5", size = 11109071, upload-time = "2025-09-29T23:32:27.484Z" },
{ url = "https://files.pythonhosted.org/packages/89/9c/0e21c895c38a157e0faa1fb64587a9226d6dd46452cac4532d80c3c4a244/pandas-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec", size = 12048504, upload-time = "2025-09-29T23:29:31.47Z" },
{ url = "https://files.pythonhosted.org/packages/d7/82/b69a1c95df796858777b68fbe6a81d37443a33319761d7c652ce77797475/pandas-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7", size = 11410702, upload-time = "2025-09-29T23:29:54.591Z" },
{ url = "https://files.pythonhosted.org/packages/f9/88/702bde3ba0a94b8c73a0181e05144b10f13f29ebfc2150c3a79062a8195d/pandas-2.3.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450", size = 11634535, upload-time = "2025-09-29T23:30:21.003Z" },
{ url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" },
{ url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" },
{ url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" },
]
[[package]]
name = "pdfminer-six"
version = "20250506"
@@ -2266,6 +2596,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ae/43/2b0607ef7f16d63fbe00de728151a090397ef5b3b9147b4aefe975d17106/pypdfium2-5.0.0-py3-none-win_arm64.whl", hash = "sha256:0a2a473fe95802e7a5f4140f25e5cd036cf17f060f27ee2d28c3977206add763", size = 2939015, upload-time = "2025-10-26T13:31:40.531Z" },
]
[[package]]
name = "pyreadline3"
version = "3.5.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/0f/49/4cea918a08f02817aabae639e3d0ac046fef9f9180518a3ad394e22da148/pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7", size = 99839, upload-time = "2024-09-19T02:40:10.062Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5a/dc/491b7661614ab97483abf2056be1deee4dc2490ecbf7bff9ab5cdbac86e1/pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6", size = 83178, upload-time = "2024-09-19T02:40:08.598Z" },
]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@@ -2291,6 +2630,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7", size = 252987, upload-time = "2025-06-16T20:46:22.506Z" },
]
[[package]]
name = "python-dotenv"
version = "1.2.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" },
]
[[package]]
name = "python-pptx"
version = "1.0.2"
@@ -2306,6 +2654,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" },
]
[[package]]
name = "pytz"
version = "2025.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" },
]

[[package]]
name = "pyyaml"
version = "6.0.3"
@@ -2654,7 +3011,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -2717,14 +3075,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -3083,6 +3445,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/97/45/1097a0eb3ad2a1cb5a29d493aefffe4170e454b2726cfe2a76f6652e91af/stringzilla-4.2.3-cp313-cp313-win_arm64.whl", hash = "sha256:1000a8df547fb3b194def48cc3ed6494715fe1c8ac25c6444ed3cfb0b52942c7", size = 99815, upload-time = "2025-10-27T18:35:56.555Z" },
]

[[package]]
name = "sympy"
version = "1.14.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "mpmath" },
]
sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" },
]

[[package]]
name = "termcolor"
version = "3.2.0"
@@ -3116,7 +3490,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -3135,14 +3510,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -3185,6 +3564,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
]

[[package]]
name = "tzdata"
version = "2025.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
]

[[package]]
name = "unidic-lite"
version = "1.0.8"