feat: add document model classes, adjust config and parsing logic, improve logging and imports

Remove logging setup and redundant code; improve imports, type hints, and OCR backend management
Switch module imports across all files to absolute import paths
Adjust import paths, drop some imports, improve logging and comments
Upgrade the document parser to Docx2Parser; improve timeout and image handling logic
begoniezhao
2025-11-07 10:30:02 +08:00
committed by lyingbug
parent af620806e0
commit 2d66abedf0
39 changed files with 2676 additions and 1570 deletions

7
.gitignore vendored
View File

@@ -24,17 +24,14 @@ node_modules/
tmp/ tmp/
temp/ temp/
# Docker compose файл (локальные настройки)
# docker-compose.yml
WeKnora WeKnora
/models/ /models/
**/__pycache__
test/data/mswag.txt test/data/mswag.txt
data/files/ data/files/
.python-version
.venv/ .venv/
**/__pycache__
.python-version
### macOS ### macOS
# General # General

View File

@@ -127,6 +127,7 @@ services:
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME:-} - MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME:-}
- MINIO_USE_SSL=${MINIO_USE_SSL:-} - MINIO_USE_SSL=${MINIO_USE_SSL:-}
- WEB_PROXY=${WEB_PROXY:-} - WEB_PROXY=${WEB_PROXY:-}
- MINERU_ENDPOINT=${MINERU_ENDPOINT:-}
healthcheck: healthcheck:
test: ["CMD", "grpc_health_probe", "-addr=:50051"] test: ["CMD", "grpc_health_probe", "-addr=:50051"]
interval: 30s interval: 30s

View File

@@ -80,10 +80,11 @@ RUN pip install uv --break-system-packages && \
python -m uv sync --locked --no-dev python -m uv sync --locked --no-dev
# 复制源代码和生成脚本 # 复制源代码和生成脚本
COPY docreader . COPY docreader docreader
# 生成 protobuf 代码 # 生成 protobuf 代码
RUN chmod +x scripts/generate_proto.sh && bash scripts/generate_proto.sh RUN chmod +x docreader/scripts/generate_proto.sh && \
bash docreader/scripts/generate_proto.sh
# 确保模型目录存在 # 确保模型目录存在
RUN ls -la /root/.paddleocr/whl/ RUN ls -la /root/.paddleocr/whl/
@@ -150,10 +151,11 @@ RUN python -m playwright install-deps webkit
# COPY docreader/scripts/download_deps.py download_deps.py # COPY docreader/scripts/download_deps.py download_deps.py
# RUN python -m download_deps # RUN python -m download_deps
COPY --from=builder /app/ ./ COPY docreader/pyproject.toml docreader/uv.lock ./
COPY --from=builder /app/docreader docreader
# 暴露 gRPC 端口 # 暴露 gRPC 端口
EXPOSE 50051 EXPOSE 50051
# 直接运行 Python 服务(日志输出到 stdout/stderr # 直接运行 Python 服务(日志输出到 stdout/stderr
CMD ["uv", "run", "main.py"] CMD ["uv", "run", "-m", "docreader.main"]

5
docreader/.pylintrc Normal file
View File

@@ -0,0 +1,5 @@
[LOGGING]
logging-format-style=fstr
[MESSAGES CONTROL]
; disable=W1203
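The new .pylintrc switches pylint's logging checker to f-string style, so the f-string logging calls used throughout the codebase are no longer flagged as W1203 (logging-fstring-interpolation). The two styles side by side:

```python
import logging

logger = logging.getLogger(__name__)
file_name = "report.docx"

# Accepted under logging-format-style=fstr (pylint's default flags it as W1203):
logger.info(f"Parsing file: {file_name}")

# The lazy %-style call that pylint prefers out of the box:
logger.info("Parsing file: %s", file_name)
```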

View File

@@ -1,37 +1,25 @@
import os
import sys
import logging import logging
from concurrent import futures import os
import re
import sys
import traceback import traceback
import grpc
import uuid import uuid
import atexit from concurrent import futures
from typing import Optional
import grpc
from grpc_health.v1 import health_pb2_grpc from grpc_health.v1 import health_pb2_grpc
from grpc_health.v1.health import HealthServicer from grpc_health.v1.health import HealthServicer
# Add parent directory to Python path from docreader.models.read_config import ChunkingConfig
current_dir = os.path.dirname(os.path.abspath(__file__)) from docreader.parser import Parser
parent_dir = os.path.dirname(current_dir) from docreader.parser.ocr_engine import OCREngine
if parent_dir not in sys.path: from docreader.proto import docreader_pb2_grpc
sys.path.insert(0, parent_dir) from docreader.proto.docreader_pb2 import Chunk, Image, ReadResponse
from docreader.utils.request import init_logging_request_id, request_id_context
from proto.docreader_pb2 import ReadResponse, Chunk, Image # Surrogate range U+D800..U+DFFF are invalid Unicode scalar values
from proto import docreader_pb2_grpc # cannot be encoded to UTF-8
from parser import Parser, OCREngine
from parser.config import ChunkingConfig
from utils.request import request_id_context, init_logging_request_id
# --- Encoding utilities: sanitize strings to valid UTF-8 and (optionally) multi-encoding read ---
import re
from typing import Optional
try:
# Optional dependency for charset detection; install via `pip install charset-normalizer`
from charset_normalizer import from_bytes as _cn_from_bytes # type: ignore
except Exception: # pragma: no cover
_cn_from_bytes = None # type: ignore
# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values and cannot be encoded to UTF-8
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]") _SURROGATE_RE = re.compile(r"[\ud800-\udfff]")
@@ -47,29 +35,6 @@ def to_valid_utf8_text(s: Optional[str]) -> str:
return s.encode("utf-8", errors="replace").decode("utf-8") return s.encode("utf-8", errors="replace").decode("utf-8")
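A quick sketch of what the sanitizer guards against: lone surrogates in the U+D800..U+DFFF range cannot be encoded to UTF-8, so to_valid_utf8_text replaces them before text is sent over gRPC.

```python
# Lone surrogates do not survive a round trip through UTF-8:
bad = "ok\ud800ok"
clean = bad.encode("utf-8", errors="replace").decode("utf-8")
print(clean)  # "ok?ok" -- the unencodable surrogate is replaced
```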
def read_text_with_fallback(file_path: str) -> str:
"""Read text from file supporting multiple encodings with graceful fallback.
This server currently receives bytes over gRPC and delegates decoding to the parser.
This helper is provided for future local-file reads if needed.
"""
with open(file_path, "rb") as f:
raw = f.read()
if _cn_from_bytes is not None:
try:
result = _cn_from_bytes(raw).best()
if result:
return str(result)
except Exception:
pass
for enc in ("utf-8", "gb18030", "latin-1"):
try:
return raw.decode(enc, errors="replace")
except UnicodeDecodeError:
continue
return raw.decode("utf-8", errors="replace")
# Ensure no existing handlers # Ensure no existing handlers
for handler in logging.root.handlers[:]: for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler) logging.root.removeHandler(handler)
@@ -113,7 +78,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
request.file_type or os.path.splitext(request.file_name)[1][1:] request.file_type or os.path.splitext(request.file_name)[1][1:]
) )
logger.info( logger.info(
f"Received ReadFromFile request for file: {request.file_name}, type: {file_type}" f"ReadFromFile for file: {request.file_name}, type: {file_type}"
) )
logger.info(f"File content size: {len(request.file_content)} bytes") logger.info(f"File content size: {len(request.file_content)} bytes")
@@ -124,8 +89,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
enable_multimodal = request.read_config.enable_multimodal or False enable_multimodal = request.read_config.enable_multimodal or False
logger.info( logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, " f"Using chunking config: size={chunk_size}, "
f"multimodal={enable_multimodal}" f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
) )
# Get Storage and VLM config from request # Get Storage and VLM config from request
@@ -144,7 +109,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
"path_prefix": sc.path_prefix, "path_prefix": sc.path_prefix,
} }
logger.info( logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}" f"Using Storage config: provider={storage_config.get('provider')}, "
f"bucket={storage_config['bucket_name']}"
) )
vlm_config = { vlm_config = {
@@ -170,7 +136,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
) )
# Parse file # Parse file
logger.info(f"Starting file parsing process") logger.info("Starting file parsing process")
result = self.parser.parse_file( result = self.parser.parse_file(
request.file_name, file_type, request.file_content, chunking_config request.file_name, file_type, request.file_content, chunking_config
) )
@@ -184,7 +150,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
# Convert to protobuf message # Convert to protobuf message
logger.info( logger.info(
f"Successfully parsed file {request.file_name}, returning {len(result.chunks)} chunks" f"Parsed file {request.file_name}, with {len(result.chunks)} chunks"
) )
# Build response, including image info # Build response, including image info
@@ -224,8 +190,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
enable_multimodal = request.read_config.enable_multimodal or False enable_multimodal = request.read_config.enable_multimodal or False
logger.info( logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, " f"Using chunking config: size={chunk_size}, "
f"multimodal={enable_multimodal}" f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
) )
# Get Storage and VLM config from request # Get Storage and VLM config from request
@@ -243,7 +209,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
"path_prefix": sc.path_prefix, "path_prefix": sc.path_prefix,
} }
logger.info( logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}" f"Using Storage config: provider={storage_config.get('provider')}, "
f"bucket={storage_config['bucket_name']}"
) )
vlm_config = { vlm_config = {
@@ -269,7 +236,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
) )
# Parse URL # Parse URL
logger.info(f"Starting URL parsing process") logger.info("Starting URL parsing process")
result = self.parser.parse_url( result = self.parser.parse_url(
request.url, request.title, chunking_config request.url, request.title, chunking_config
) )
@@ -282,7 +249,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
# Convert to protobuf message, including image info # Convert to protobuf message, including image info
logger.info( logger.info(
f"Successfully parsed URL {request.url}, returning {len(result.chunks)} chunks" f"Parsed URL {request.url}, returning {len(result.chunks)} chunks"
) )
response = ReadResponse( response = ReadResponse(
@@ -335,29 +302,15 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
return proto_chunk return proto_chunk
def init_ocr_engine(ocr_backend, ocr_config): def init_ocr_engine(ocr_backend: Optional[str] = None, **kwargs):
"""Initialize OCR engine""" """Initialize OCR engine"""
try: backend_type = ocr_backend or os.getenv("OCR_BACKEND", "paddle")
logger.info(f"Initializing OCR engine with backend: {ocr_backend}") logger.info(f"Initializing OCR engine with backend: {backend_type}")
ocr_engine = OCREngine.get_instance(backend_type=ocr_backend, **ocr_config) OCREngine.get_instance(backend_type=backend_type, **kwargs)
if ocr_engine:
logger.info("OCR engine initialized successfully")
return True
else:
logger.error("OCR engine initialization failed")
return False
except Exception as e:
logger.error(f"Error initializing OCR engine: {str(e)}")
return False
def main(): def main():
init_ocr_engine( init_ocr_engine()
os.getenv("OCR_BACKEND", "paddle"),
{
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
},
)
# Set max number of worker threads # Set max number of worker threads
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4")) max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
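The simplified initializer resolves the backend from its argument, then the OCR_BACKEND environment variable (defaulting to paddle), and forwards any keyword arguments straight to OCREngine.get_instance() instead of returning a success flag. A hedged usage sketch:

```python
import os

# The path main() takes: backend resolved from the environment.
os.environ.setdefault("OCR_BACKEND", "paddle")
init_ocr_engine()

# Or name the backend explicitly; extra kwargs go to OCREngine.get_instance().
init_ocr_engine("paddle")
```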

View File

View File

@@ -0,0 +1,87 @@
"""Chunk document schema."""
import json
from typing import Any, Dict, List
from pydantic import BaseModel, Field
class Chunk(BaseModel):
"""Document Chunk including chunk content, chunk metadata."""
content: str = Field(default="", description="chunk text content")
seq: int = Field(default=0, description="Chunk sequence number")
start: int = Field(default=0, description="Chunk start position")
end: int = Field(description="Chunk end position")
images: List[Dict[str, Any]] = Field(
default_factory=list, description="Images in the chunk"
)
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="metadata fields",
)
def to_dict(self, **kwargs: Any) -> Dict[str, Any]:
"""Convert Chunk to dict."""
data = self.model_dump()
data.update(kwargs)
data["class_name"] = self.__class__.__name__
return data
def to_json(self, **kwargs: Any) -> str:
"""Convert Chunk to json."""
data = self.to_dict(**kwargs)
return json.dumps(data)
def __hash__(self):
"""Hash function."""
return hash((self.content,))
def __eq__(self, other):
"""Equal function."""
return self.content == other.content
@classmethod
def from_dict(cls, data: Dict[str, Any], **kwargs: Any): # type: ignore
"""Create Chunk from dict."""
if isinstance(kwargs, dict):
data.update(kwargs)
data.pop("class_name", None)
return cls(**data)
@classmethod
def from_json(cls, data_str: str, **kwargs: Any): # type: ignore
"""Create Chunk from json."""
data = json.loads(data_str)
return cls.from_dict(data, **kwargs)
class Document(BaseModel):
"""Document including document content, document metadata."""
model_config = {"arbitrary_types_allowed": True}
content: str = Field(default="", description="document text content")
images: Dict[str, str] = Field(
default_factory=dict, description="Images in the document"
)
chunks: List[Chunk] = Field(default_factory=list, description="document chunks")
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="metadata fields",
)
def set_content(self, content: str) -> None:
"""Set document content."""
self.content = content
def get_content(self) -> str:
"""Get document content."""
return self.content
def is_valid(self) -> bool:
return self.content != ""
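A short round-trip sketch for the new pydantic models, assuming the docreader package is importable (e.g. when the service runs via `uv run -m docreader.main`):

```python
from docreader.models.document import Chunk, Document

chunk = Chunk(content="Hello", seq=0, start=0, end=5)
payload = chunk.to_json()            # serializes with "class_name": "Chunk"
restored = Chunk.from_json(payload)  # from_dict() pops "class_name" again
assert restored == chunk             # equality and hashing use content only

doc = Document(content="Hello world", chunks=[chunk])
assert doc.is_valid()                # valid iff content is non-empty
```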

View File

@@ -0,0 +1,27 @@
from dataclasses import dataclass, field
@dataclass
class ChunkingConfig:
"""
Configuration for text chunking process.
Controls how documents are split into smaller pieces for processing.
"""
# Maximum size of each chunk in tokens/chars
chunk_size: int = 512
# Number of tokens/chars to overlap between chunks
chunk_overlap: int = 50
# Text separators in order of priority
separators: list = field(default_factory=lambda: ["\n\n", "\n", ""])
# Whether to enable multimodal processing (text + images)
enable_multimodal: bool = False
# Preferred field name going forward
storage_config: dict[str, str] = field(default_factory=dict)
# VLM configuration for image captioning
vlm_config: dict[str, str] = field(default_factory=dict)
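Building a ChunkingConfig mirrors what the gRPC handlers assemble from each request; the storage_config and vlm_config keys below are those read in main.py, with placeholder values:

```python
from docreader.models.read_config import ChunkingConfig

config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=50,
    enable_multimodal=True,
    storage_config={
        "provider": "minio",       # placeholder values; keys match main.py
        "bucket_name": "docs",
        "path_prefix": "images",
    },
    vlm_config={
        "base_url": "http://vlm:8000/v1",  # assumed endpoint
        "model_name": "qwen2-vl",          # assumed model name
    },
)
```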

View File

@@ -13,22 +13,18 @@ The parsers extract content from documents and can split them into
meaningful chunks for further processing and indexing. meaningful chunks for further processing and indexing.
""" """
from .base_parser import BaseParser, ParseResult
from .docx_parser import DocxParser
from .doc_parser import DocParser from .doc_parser import DocParser
from .pdf_parser import PDFParser from .docx2_parser import Docx2Parser
from .markdown_parser import MarkdownParser
from .text_parser import TextParser
from .image_parser import ImageParser from .image_parser import ImageParser
from .web_parser import WebParser from .markdown_parser import MarkdownParser
from .parser import Parser from .parser import Parser
from .config import ChunkingConfig from .pdf_parser import PDFParser
from .ocr_engine import OCREngine from .text_parser import TextParser
from .web_parser import WebParser
# Export public classes and modules # Export public classes and modules
__all__ = [ __all__ = [
"BaseParser", # Base parser class that all format parsers inherit from "Docx2Parser", # Parser for .docx files (modern Word documents)
"DocxParser", # Parser for .docx files (modern Word documents)
"DocParser", # Parser for .doc files (legacy Word documents) "DocParser", # Parser for .doc files (legacy Word documents)
"PDFParser", # Parser for PDF documents "PDFParser", # Parser for PDF documents
"MarkdownParser", # Parser for Markdown text files "MarkdownParser", # Parser for Markdown text files
@@ -36,7 +32,4 @@ __all__ = [
"ImageParser", # Parser for images with text content "ImageParser", # Parser for images with text content
"WebParser", # Parser for web pages "WebParser", # Parser for web pages
"Parser", # Main parser factory that selects the appropriate parser "Parser", # Main parser factory that selects the appropriate parser
"ChunkingConfig", # Configuration for text chunking behavior
"ParseResult", # Standard result format returned by all parsers
"OCREngine", # OCR engine for extracting text from images
] ]
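With the path hacks gone, consumers rely on the package's absolute imports. A sketch of the call the gRPC servicer makes (the no-argument Parser() construction is an assumption):

```python
from docreader.models.read_config import ChunkingConfig
from docreader.parser import Parser

parser = Parser()  # assumed no-arg construction, as in DocReaderServicer
result = parser.parse_file(
    "report.docx", "docx", b"...file bytes...", ChunkingConfig()
)
```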

View File

@@ -1,65 +1,28 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import re
import os
import asyncio import asyncio
from typing import List, Dict, Any, Optional, Tuple, Union
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
import logging
import sys
import traceback
import numpy as np
import time
import io import io
import json import logging
from .ocr_engine import OCREngine import os
from .image_utils import image_to_base64 import re
from .config import ChunkingConfig import time
from .storage import create_storage from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Tuple
import requests
from PIL import Image from PIL import Image
# Add parent directory to Python path for src imports from docreader.models.document import Chunk, Document
current_dir = os.path.dirname(os.path.abspath(__file__)) from docreader.models.read_config import ChunkingConfig
parent_dir = os.path.dirname(current_dir) from docreader.parser.caption import Caption
if parent_dir not in sys.path: from docreader.parser.ocr_engine import OCREngine
sys.path.insert(0, parent_dir) from docreader.parser.storage import create_storage
from docreader.splitter.splitter import TextSplitter
try: from docreader.utils import endecode
from services.docreader.src.parser.caption import Caption
except ImportError:
# Fallback: try relative import
try:
from .caption import Caption
except ImportError:
# If both imports fail, set to None
Caption = None
logging.warning(
"Failed to import Caption, image captioning will be unavailable"
)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
@dataclass
class Chunk:
"""Chunk result"""
content: str # Chunk content
seq: int # Chunk sequence number
start: int # Chunk start position
end: int # Chunk end position
images: List[Dict[str, Any]] = field(default_factory=list) # Images in the chunk
@dataclass
class ParseResult:
"""Parse result"""
text: str # Extracted text content
chunks: Optional[List[Chunk]] = None # Chunk results
class BaseParser(ABC): class BaseParser(ABC):
"""Base parser interface""" """Base parser interface"""
@@ -97,17 +60,17 @@ class BaseParser(ABC):
def __init__( def __init__(
self, self,
file_name: str = "", file_name: str = "",
file_type: str = None, file_type: Optional[str] = None,
enable_multimodal: bool = True, enable_multimodal: bool = True,
chunk_size: int = 1000, chunk_size: int = 1000,
chunk_overlap: int = 200, chunk_overlap: int = 200,
separators: list = ["\n\n", "\n", ""], separators: list[str] = ["\n\n", "\n", ""],
ocr_backend: str = "paddle", ocr_backend: str = "paddle",
ocr_config: dict = None, ocr_config: dict = {},
max_image_size: int = 1920, # Maximum image size max_image_size: int = 1920, # Maximum image size
max_concurrent_tasks: int = 5, # Max concurrent tasks max_concurrent_tasks: int = 5, # Max concurrent tasks
max_chunks: int = 1000, # Max number of returned chunks max_chunks: int = 1000, # Max number of returned chunks
chunking_config: ChunkingConfig = None, # Chunking configuration object chunking_config: Optional[ChunkingConfig] = None,
): ):
"""Initialize parser """Initialize parser
@@ -125,7 +88,6 @@ class BaseParser(ABC):
max_chunks: Max number of returned chunks max_chunks: Max number of returned chunks
""" """
# Storage client instance # Storage client instance
self._storage = None
self.file_name = file_name self.file_name = file_name
self.file_type = file_type or os.path.splitext(file_name)[1] self.file_type = file_type or os.path.splitext(file_name)[1]
self.enable_multimodal = enable_multimodal self.enable_multimodal = enable_multimodal
@@ -133,15 +95,16 @@ class BaseParser(ABC):
self.chunk_overlap = chunk_overlap self.chunk_overlap = chunk_overlap
self.separators = separators self.separators = separators
self.ocr_backend = os.getenv("OCR_BACKEND", ocr_backend) self.ocr_backend = os.getenv("OCR_BACKEND", ocr_backend)
self.ocr_config = ocr_config or {} self.ocr_config = ocr_config
self.max_image_size = max_image_size self.max_image_size = max_image_size
self.max_concurrent_tasks = max_concurrent_tasks self.max_concurrent_tasks = max_concurrent_tasks
self.max_chunks = max_chunks self.max_chunks = max_chunks
self.chunking_config = chunking_config self.chunking_config = chunking_config
self.storage = create_storage(
logger.info( self.chunking_config.storage_config if self.chunking_config else None
f"Initializing {self.__class__.__name__} for file: {file_name}, type: {self.file_type}"
) )
logger.info(f"Initializing parser for file: {file_name}, type: {file_type}")
logger.info( logger.info(
f"Parser config: chunk_size={chunk_size}, " f"Parser config: chunk_size={chunk_size}, "
f"overlap={chunk_overlap}, " f"overlap={chunk_overlap}, "
@@ -150,16 +113,24 @@ class BaseParser(ABC):
f"max_chunks={max_chunks}" f"max_chunks={max_chunks}"
) )
# Only initialize Caption service if multimodal is enabled # Only initialize Caption service if multimodal is enabled
if self.enable_multimodal: vlm_config = self.chunking_config.vlm_config if self.chunking_config else None
try: self.caption_parser = (
self.caption_parser = Caption(self.chunking_config.vlm_config) Caption(vlm_config=vlm_config) if self.enable_multimodal else None
except Exception as e: )
logger.warning(f"Failed to initialize Caption service: {str(e)}")
self.caption_parser = None
else:
self.caption_parser = None
def perform_ocr(self, image): @abstractmethod
def parse_into_text(self, content: bytes) -> Document:
"""Parse document content
Args:
content: Document content
Returns:
Either a string containing the parsed text, or a tuple of (text, image_map)
where image_map is a dict mapping image URLs to Image objects
"""
def perform_ocr(self, image: Image.Image):
"""Execute OCR recognition on the image """Execute OCR recognition on the image
Args: Args:
@@ -170,53 +141,23 @@ class BaseParser(ABC):
""" """
start_time = time.time() start_time = time.time()
logger.info("Starting OCR recognition") logger.info("Starting OCR recognition")
resized_image = None
try: # Resize image to avoid processing large images
# Resize image to avoid processing large images resized_image = self._resize_image_if_needed(image)
resized_image = self._resize_image_if_needed(image)
# Get OCR engine # Get OCR engine
ocr_engine = self.get_ocr_engine( ocr_engine = OCREngine.get_instance(self.ocr_backend)
backend_type=self.ocr_backend, **self.ocr_config
)
if ocr_engine is None:
logger.error(
f"OCR engine ({self.ocr_backend}) initialization failed or unavailable, "
"skipping OCR recognition"
)
return ""
# Execute OCR prediction # Execute OCR prediction
logger.info(f"Executing OCR prediction (using {self.ocr_backend} engine)") logger.info(f"Executing OCR prediction (using {self.ocr_backend} engine)")
# Add extra exception handling ocr_result = ocr_engine.predict(resized_image)
try:
ocr_result = ocr_engine.predict(resized_image)
except RuntimeError as e:
# Handle common CUDA memory issues or other runtime errors
logger.error(f"OCR prediction runtime error: {str(e)}")
return ""
except Exception as e:
# Handle other prediction errors
logger.error(f"Unexpected OCR prediction error: {str(e)}")
return ""
process_time = time.time() - start_time process_time = time.time() - start_time
logger.info(f"OCR recognition completed, time: {process_time:.2f} seconds") logger.info(f"OCR recognition completed, time: {process_time:.2f} seconds")
return ocr_result
except Exception as e:
process_time = time.time() - start_time
logger.error(
f"OCR recognition error: {str(e)}, time: {process_time:.2f} seconds"
)
return ""
finally:
# Release image resources
if resized_image is not image and hasattr(resized_image, "close"):
# Only close the new image we created, not the original image
resized_image.close()
def _resize_image_if_needed(self, image): return ocr_result
def _resize_image_if_needed(self, image: Image.Image) -> Image.Image:
"""Resize image if it exceeds maximum size limit """Resize image if it exceeds maximum size limit
Args: Args:
@@ -225,102 +166,21 @@ class BaseParser(ABC):
Returns: Returns:
Resized image object Resized image object
""" """
try: width, height = image.size
# If it's a PIL Image if width > self.max_image_size or height > self.max_image_size:
if hasattr(image, "size"): logger.info(f"Resizing PIL image, original size: {width}x{height}")
width, height = image.size scale = min(self.max_image_size / width, self.max_image_size / height)
if width > self.max_image_size or height > self.max_image_size: new_width = int(width * scale)
logger.info(f"Resizing PIL image, original size: {width}x{height}") new_height = int(height * scale)
scale = min( resized_image = image.resize((new_width, new_height))
self.max_image_size / width, self.max_image_size / height logger.info(f"Resized to: {new_width}x{new_height}")
) return resized_image
new_width = int(width * scale)
new_height = int(height * scale)
resized_image = image.resize((new_width, new_height))
logger.info(f"Resized to: {new_width}x{new_height}")
return resized_image
else:
logger.info(
f"PIL image size {width}x{height} is within limits, no resizing needed"
)
return image
# If it's a numpy array
elif hasattr(image, "shape"):
height, width = image.shape[:2]
if width > self.max_image_size or height > self.max_image_size:
logger.info(
f"Resizing numpy image, original size: {width}x{height}"
)
scale = min(
self.max_image_size / width, self.max_image_size / height
)
new_width = int(width * scale)
new_height = int(height * scale)
# Use PIL for resizing numpy arrays
pil_image = Image.fromarray(image)
resized_pil = pil_image.resize((new_width, new_height))
resized_image = np.array(resized_pil)
logger.info(f"Resized to: {new_width}x{new_height}")
return resized_image
else:
logger.info(
f"Numpy image size {width}x{height} is within limits, no resizing needed"
)
return image
else:
logger.warning(f"Unknown image type: {type(image)}, cannot resize")
return image
except Exception as e:
logger.error(f"Error resizing image: {str(e)}")
return image
def process_image(self, image, image_url=None): logger.info(f"PIL image size is {width}x{height}, no resizing needed")
"""Process image: first perform OCR, then get caption if text is available return image
Args: async def process_image_async(self, image: Image.Image, image_url: str):
image: Image object (PIL.Image or numpy array) """Asynchronously process image: first perform OCR, then get caption
image_url: Image URL (if uploaded)
Returns:
tuple: (ocr_text, caption, image_url)
- ocr_text: OCR extracted text
- caption: Image description (if OCR has text) or empty string
- image_url: Image URL (if provided)
"""
logger.info("Starting image processing (OCR + optional caption)")
# Resize image
image = self._resize_image_if_needed(image)
# Perform OCR recognition
ocr_text = self.perform_ocr(image)
caption = ""
if self.caption_parser:
logger.info(
f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption"
)
# Convert image to base64 for caption generation
img_base64 = image_to_base64(image)
if img_base64:
caption = self.get_image_caption(img_base64)
if caption:
logger.info(f"Successfully obtained image caption: {caption}")
else:
logger.warning("Failed to get caption")
else:
logger.warning("Failed to convert image to base64")
caption = ""
else:
logger.info("Caption service not initialized, skipping caption retrieval")
# Release image resources
del image
return ocr_text, caption, image_url
async def process_image_async(self, image, image_url=None):
"""Asynchronously process image: first perform OCR, then get caption if text is available
Args: Args:
image: Image object (PIL.Image or numpy array) image: Image object (PIL.Image or numpy array)
@@ -333,84 +193,47 @@ class BaseParser(ABC):
- image_url: Image URL (if provided) - image_url: Image URL (if provided)
""" """
logger.info("Starting asynchronous image processing (OCR + optional caption)") logger.info("Starting asynchronous image processing (OCR + optional caption)")
resized_image = None
# Resize image
resized_image = self._resize_image_if_needed(image)
try: try:
# Resize image # Perform OCR recognition
resized_image = self._resize_image_if_needed(image)
# Perform OCR recognition (using run_in_executor to execute synchronous operations in the event loop)
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
try: try:
# Add timeout mechanism to avoid infinite blocking (30 seconds timeout) # Add timeout mechanism to avoid infinite blocking (30 seconds timeout)
ocr_task = loop.run_in_executor(None, self.perform_ocr, resized_image) ocr_task = loop.run_in_executor(None, self.perform_ocr, resized_image)
ocr_text = await asyncio.wait_for(ocr_task, timeout=30.0) ocr_text = await asyncio.wait_for(ocr_task, timeout=30.0)
except asyncio.TimeoutError:
logger.error(
"OCR processing timed out (30 seconds), skipping this image"
)
ocr_text = ""
except Exception as e: except Exception as e:
logger.error(f"OCR processing error: {str(e)}") logger.error(f"OCR processing error, skipping this image: {str(e)}")
ocr_text = "" ocr_text = ""
logger.info( logger.info(f"Successfully obtained image ocr: {ocr_text}")
f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption" img_base64 = endecode.decode_image(resized_image)
) caption = self.get_image_caption(img_base64)
caption = "" logger.info(f"Successfully obtained image caption: {caption}")
if self.caption_parser:
try:
# Convert image to base64 for caption generation
img_base64 = image_to_base64(resized_image)
if img_base64:
# Add timeout to avoid blocking caption retrieval (30 seconds timeout)
caption_task = self.get_image_caption_async(img_base64)
image_data, caption = await asyncio.wait_for(
caption_task, timeout=30.0
)
if caption:
logger.info(
f"Successfully obtained image caption: {caption}"
)
else:
logger.warning("Failed to get caption")
else:
logger.warning("Failed to convert image to base64")
caption = ""
except asyncio.TimeoutError:
logger.warning("Caption retrieval timed out, skipping")
except Exception as e:
logger.error(f"Failed to get caption: {str(e)}")
else:
logger.info(
"Caption service not initialized, skipping caption retrieval"
)
return ocr_text, caption, image_url return ocr_text, caption, image_url
finally: finally:
# Release image resources resized_image.close()
if resized_image is not image and hasattr(resized_image, "close"):
# Only close the new image we created, not the original image
resized_image.close()
async def process_with_limit(self, idx, image, url, semaphore): async def process_with_limit(
self, idx: int, image: Image.Image, url: str, semaphore: asyncio.Semaphore
):
"""Function to process a single image using a semaphore""" """Function to process a single image using a semaphore"""
try: try:
logger.info(f"Waiting to process image {idx+1}") logger.info(f"Waiting to process image {idx + 1}")
async with semaphore: # Use semaphore to control concurrency async with semaphore: # Use semaphore to control concurrency
logger.info(f"Starting to process image {idx+1}") logger.info(f"Starting to process image {idx + 1}")
result = await self.process_image_async(image, url) result = await self.process_image_async(image, url)
logger.info(f"Completed processing image {idx+1}") logger.info(f"Completed processing image {idx + 1}")
return result return result
except Exception as e: except Exception as e:
logger.error(f"Error processing image {idx+1}: {str(e)}") logger.error(f"Error processing image {idx + 1}: {str(e)}")
return ("", "", url) # Return empty result to avoid overall failure return ("", "", url) # Return empty result to avoid overall failure
finally: finally:
# Manually release image resources # Manually release image resources
if hasattr(image, "close"): image.close()
image.close()
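process_with_limit pairs an asyncio.Semaphore with gather so that at most max_concurrent_tasks images are in flight at once. The pattern in isolation, with a sleep standing in for OCR and captioning:

```python
import asyncio

async def process_all(images, max_concurrent_tasks=5):
    semaphore = asyncio.Semaphore(max_concurrent_tasks)

    async def worker(idx, image):
        async with semaphore:       # at most N workers enter this block
            await asyncio.sleep(0)  # stand-in for OCR + caption work
            return idx

    tasks = [worker(i, img) for i, img in enumerate(images)]
    # return_exceptions=True keeps one failure from sinking the whole batch
    return await asyncio.gather(*tasks, return_exceptions=True)

print(asyncio.run(process_all(["a", "b", "c"])))  # [0, 1, 2]
```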
async def process_multiple_images(self, images_data): async def process_multiple_images(self, images_data: List[Tuple[Image.Image, str]]):
"""Process multiple images concurrently """Process multiple images concurrently
Args: Args:
@@ -450,7 +273,7 @@ class BaseParser(ABC):
for i, result in enumerate(completed_results): for i, result in enumerate(completed_results):
if isinstance(result, Exception): if isinstance(result, Exception):
logger.error( logger.error(
f"Image {i+1} processing returned an exception: {str(result)}" f"Image {i + 1} processing returned an exception: {str(result)}"
) )
# For exceptions, add empty results # For exceptions, add empty results
if i < len(images_data): if i < len(images_data):
@@ -467,47 +290,10 @@ class BaseParser(ABC):
logger.info("Image processing resource cleanup complete") logger.info("Image processing resource cleanup complete")
logger.info( logger.info(
f"Completed concurrent processing of {len(results)}/{len(images_data)} images" f"Concurrent processing of {len(results)}/{len(images_data)} images"
) )
return results return results
def decode_bytes(self, content: bytes) -> str:
"""Intelligently decode byte stream, supports multiple encodings
Tries to decode in common encodings, if all fail, uses latin-1 as fallback
Args:
content: Byte stream to decode
Returns:
Decoded string
"""
logger.info(f"Attempting to decode bytes of length: {len(content)}")
# Common encodings, sorted by priority
encodings = ["utf-8", "gb18030", "gb2312", "gbk", "big5", "ascii", "latin-1"]
text = None
# Try decoding with each encoding format
for encoding in encodings:
try:
text = content.decode(encoding)
logger.info(f"Successfully decoded content using {encoding} encoding")
break
except UnicodeDecodeError:
logger.info(f"Failed to decode using {encoding} encoding")
continue
# If all encodings fail, use latin-1 as fallback
if text is None:
text = content.decode("latin-1")
logger.warning(
f"Unable to determine correct encoding, using latin-1 as fallback. "
f"This may cause character issues."
)
logger.info(f"Decoded text length: {len(text)} characters")
return text
def get_image_caption(self, image_data: str) -> str: def get_image_caption(self, image_data: str) -> str:
"""Get image description """Get image description
@@ -517,6 +303,9 @@ class BaseParser(ABC):
Returns: Returns:
Image description Image description
""" """
if not self.caption_parser:
logger.warning("Caption parser not initialized")
return ""
start_time = time.time() start_time = time.time()
logger.info( logger.info(
f"Getting caption for image: {image_data[:250]}..." f"Getting caption for image: {image_data[:250]}..."
@@ -533,80 +322,7 @@ class BaseParser(ABC):
logger.warning("Failed to get caption for image") logger.warning("Failed to get caption for image")
return caption return caption
async def get_image_caption_async(self, image_data: str) -> Tuple[str, str]: def parse(self, content: bytes) -> Document:
"""Asynchronously get image description
Args:
image_data: Image data (base64 encoded string or URL)
Returns:
Tuple[str, str]: Image data and corresponding description
"""
caption = self.get_image_caption(image_data)
return image_data, caption
def __init_storage(self):
"""Initialize storage client based on configuration"""
if self._storage is None:
storage_config = (
self.chunking_config.storage_config if self.chunking_config else None
)
self._storage = create_storage(storage_config)
logger.info(
f"Initialized storage client: {self._storage.__class__.__name__}"
)
return self._storage
def upload_file(self, file_path: str) -> str:
"""Upload file to object storage
Args:
file_path: File path
Returns:
File URL
"""
logger.info(f"Uploading file: {file_path}")
try:
storage = self.__init_storage()
return storage.upload_file(file_path)
except Exception as e:
logger.error(f"Failed to upload file: {str(e)}")
return ""
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to object storage
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
logger.info(f"Uploading bytes content, size: {len(content)} bytes")
try:
storage = self.__init_storage()
return storage.upload_bytes(content, file_ext)
except Exception as e:
logger.error(f"Failed to upload bytes to storage: {str(e)}")
traceback.print_exc()
return ""
@abstractmethod
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""Parse document content
Args:
content: Document content
Returns:
Either a string containing the parsed text, or a tuple of (text, image_map)
where image_map is a dict mapping image URLs to Image objects
"""
pass
def parse(self, content: bytes) -> ParseResult:
"""Parse document content """Parse document content
Args: Args:
@@ -616,17 +332,19 @@ class BaseParser(ABC):
Parse result Parse result
""" """
logger.info( logger.info(
f"Parsing document with {self.__class__.__name__}, content size: {len(content)} bytes" f"Parsing document with {self.__class__.__name__}, bytes: {len(content)}"
) )
parse_result = self.parse_into_text(content) document = self.parse_into_text(content)
if isinstance(parse_result, tuple): logger.info(
text, image_map = parse_result f"Extracted {len(document.content)} characters from {self.file_name}"
else: )
text = parse_result splitter = TextSplitter(
image_map = {} chunk_size=self.chunk_size,
logger.info(f"Extracted {len(text)} characters of text from {self.file_name}") chunk_overlap=self.chunk_overlap,
logger.info(f"Beginning chunking process for text") separators=self.separators,
chunks = self.chunk_text(text) )
chunk_str = splitter.split_text(document.content)
chunks = self._str_to_chunk(chunk_str)
logger.info(f"Created {len(chunks)} chunks from document") logger.info(f"Created {len(chunks)} chunks from document")
# Limit the number of returned chunks # Limit the number of returned chunks
@@ -636,7 +354,7 @@ class BaseParser(ABC):
) )
chunks = chunks[: self.max_chunks] chunks = chunks[: self.max_chunks]
# If multimodal is enabled and file type is supported, process images in each chunk # If multimodal is enabled and file type is supported, process images
if self.enable_multimodal: if self.enable_multimodal:
# Get file extension and convert to lowercase # Get file extension and convert to lowercase
file_ext = ( file_ext = (
@@ -647,11 +365,12 @@ class BaseParser(ABC):
# Define allowed file types for image processing # Define allowed file types for image processing
allowed_types = [ allowed_types = [
".pdf", # PDF files # Text files
".pdf",
".md", ".md",
".markdown", # Markdown files ".markdown",
".doc", ".doc",
".docx", # Word documents ".docx",
# Image files # Image files
".jpg", ".jpg",
".jpeg", ".jpeg",
@@ -666,13 +385,21 @@ class BaseParser(ABC):
logger.info( logger.info(
f"Processing images in each chunk for file type: {file_ext}" f"Processing images in each chunk for file type: {file_ext}"
) )
chunks = self.process_chunks_images(chunks, image_map) chunks = self.process_chunks_images(chunks, document.images)
else: else:
logger.info( logger.info(
f"Skipping image processing for unsupported file type: {file_ext}" f"Skipping image processing for unsupported file type: {file_ext}"
) )
return ParseResult(text=text, chunks=chunks) document.chunks = chunks
return document
def _str_to_chunk(self, text: List[Tuple[int, int, str]]) -> List[Chunk]:
"""Convert string to Chunk object"""
return [
Chunk(seq=i, content=t, start=start, end=end)
for i, (start, end, t) in enumerate(text)
]
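parse() now delegates splitting to TextSplitter and converts the result with _str_to_chunk; judging from that helper, split_text() is assumed to yield (start, end, text) tuples:

```python
from docreader.models.document import Chunk
from docreader.splitter.splitter import TextSplitter

splitter = TextSplitter(
    chunk_size=1000, chunk_overlap=200, separators=["\n\n", "\n", ""]
)
spans = splitter.split_text("First paragraph.\n\nSecond paragraph.")
chunks = [
    Chunk(seq=i, content=t, start=start, end=end)
    for i, (start, end, t) in enumerate(spans)
]
```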
def _split_into_units(self, text: str) -> List[str]: def _split_into_units(self, text: str) -> List[str]:
""" """
@@ -682,9 +409,7 @@ class BaseParser(ABC):
Returns: Returns:
基本单元的列表 基本单元的列表
""" """
logger.info( logger.info(f"Splitting text into basic units, text length: {len(text)}")
f"Splitting text into basic units with robust structure protection, text length: {len(text)}"
)
# 定义所有需要作为整体保护的结构模式 --- # 定义所有需要作为整体保护的结构模式 ---
table_pattern = r"(?m)(^\|.*\|[ \t]*\r?\n(?:[ \t]*\r?\n)?^\|\s*:?--+.*\r?\n(?:^\|.*\|\r?\n?)*)" table_pattern = r"(?m)(^\|.*\|[ \t]*\r?\n(?:[ \t]*\r?\n)?^\|\s*:?--+.*\r?\n(?:^\|.*\|\r?\n?)*)"
@@ -710,7 +435,8 @@ class BaseParser(ABC):
# 按起始位置排序 # 按起始位置排序
protected_ranges.sort(key=lambda x: x[0]) protected_ranges.sort(key=lambda x: x[0])
logger.info( logger.info(
f"Found {len(protected_ranges)} protected structures (tables, code, formulas, images, links)." f"Found {len(protected_ranges)} protected structures "
"(tables, code, formulas, images, links)."
) )
# 合并可能重叠的保护范围 --- # 合并可能重叠的保护范围 ---
@@ -731,7 +457,7 @@ class BaseParser(ABC):
merged_ranges.append((current_start, current_end)) merged_ranges.append((current_start, current_end))
protected_ranges = merged_ranges protected_ranges = merged_ranges
logger.info( logger.info(
f"After merging overlaps, {len(protected_ranges)} protected ranges remain." f"After overlaps, {len(protected_ranges)} protected ranges remain."
) )
# 根据保护范围和分隔符来分割文本 --- # 根据保护范围和分隔符来分割文本 ---
@@ -749,7 +475,7 @@ class BaseParser(ABC):
segments = re.split(separator_pattern, pre_text) segments = re.split(separator_pattern, pre_text)
units.extend([s for s in segments if s]) # 添加所有非空部分 units.extend([s for s in segments if s]) # 添加所有非空部分
# b. 将整个受保护的块(例如,一个完整的表格)作为一个单独的、不可分割的单元添加 # b. 将整个受保护的块(例如,一个完整的表格)作为一个不可分割的单元添加
protected_text = text[start:end] protected_text = text[start:end]
units.append(protected_text) units.append(protected_text)
@@ -764,38 +490,6 @@ class BaseParser(ABC):
logger.info(f"Text splitting complete, created {len(units)} final basic units.") logger.info(f"Text splitting complete, created {len(units)} final basic units.")
return units return units
def _find_complete_units(self, units: List[str], target_size: int) -> List[str]:
"""Find a list of complete units that do not exceed the target size
Args:
units: List of units
target_size: Target size
Returns:
List of complete units
"""
logger.info(f"Finding complete units with target size: {target_size}")
result = []
current_size = 0
for unit in units:
unit_size = len(unit)
if current_size + unit_size > target_size and result:
logger.info(
f"Reached target size limit at {current_size} characters, stopping"
)
break
result.append(unit)
current_size += unit_size
logger.info(
f"Added unit of size {unit_size}, current total: {current_size}/{target_size}"
)
logger.info(
f"Found {len(result)} complete units totaling {current_size} characters"
)
return result
def chunk_text(self, text: str) -> List[Chunk]: def chunk_text(self, text: str) -> List[Chunk]:
"""Chunk text, preserving Markdown structure """Chunk text, preserving Markdown structure
@@ -825,7 +519,7 @@ class BaseParser(ABC):
for i, unit in enumerate(units): for i, unit in enumerate(units):
unit_size = len(unit) unit_size = len(unit)
logger.info(f"Processing unit {i+1}/{len(units)}, size: {unit_size}") logger.info(f"Processing unit {i + 1}/{len(units)}, size: {unit_size}")
# If current chunk plus new unit exceeds size limit, create new chunk # If current chunk plus new unit exceeds size limit, create new chunk
if current_size + unit_size > self.chunk_size and current_chunk: if current_size + unit_size > self.chunk_size and current_chunk:
@@ -855,14 +549,12 @@ class BaseParser(ABC):
for u in reversed(current_chunk): for u in reversed(current_chunk):
if overlap_size + len(u) > overlap_target: if overlap_size + len(u) > overlap_target:
logger.info( logger.info(
f"Reached overlap target ({overlap_size}/{overlap_target})" f"Overlap target ({overlap_size}/{overlap_target})"
) )
break break
overlap_units.insert(0, u) overlap_units.insert(0, u)
overlap_size += len(u) overlap_size += len(u)
logger.info( logger.info(f"Added unit to overlap, size: {overlap_size}")
f"Added unit to overlap, current overlap size: {overlap_size}"
)
# Remove elements from overlap that are included in separators # Remove elements from overlap that are included in separators
start_index = 0 start_index = 0
@@ -883,7 +575,7 @@ class BaseParser(ABC):
overlap_units = overlap_units[start_index:] overlap_units = overlap_units[start_index:]
logger.info( logger.info(
f"Final overlap: {len(overlap_units)} units, {overlap_size} characters" f"Overlap: {len(overlap_units)} units, {overlap_size} size"
) )
current_chunk = overlap_units current_chunk = overlap_units
@@ -899,7 +591,7 @@ class BaseParser(ABC):
current_chunk.append(unit) current_chunk.append(unit)
current_size += unit_size current_size += unit_size
logger.info( logger.info(
f"Added unit to current chunk, now at {current_size}/{self.chunk_size} characters" f"Added unit to current chunk, at {current_size}/{self.chunk_size}"
) )
# Add the last chunk # Add the last chunk
@@ -925,12 +617,13 @@ class BaseParser(ABC):
chunk: Document chunk chunk: Document chunk
Returns: Returns:
List of image information, each element contains image URL and match position List of image information
""" """
logger.info(f"Extracting image information from Chunk #{chunk.seq}") logger.info(f"Extracting image information from Chunk #{chunk.seq}")
text = chunk.content text = chunk.content
# Regex to extract image information from text, supporting Markdown images and HTML images # Regex to extract image information from text,
# support: Markdown images, HTML images
img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)|<img [^>]*src="([^"]+)" [^>]*>' img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)|<img [^>]*src="([^"]+)" [^>]*>'
# Extract image information # Extract image information
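The pattern matches both Markdown and HTML image references: group 2 carries the Markdown URL, group 3 the HTML src. A quick check:

```python
import re

img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)|<img [^>]*src="([^"]+)" [^>]*>'
text = '![logo](https://example.com/a.png) and <img class="c" src="b.png" alt="x">'
for match in re.finditer(img_pattern, text):
    print(match.group(2) or match.group(3))
# https://example.com/a.png
# b.png
```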
@@ -954,28 +647,28 @@ class BaseParser(ABC):
images_info.append(image_info) images_info.append(image_info)
logger.info( logger.info(
f"Image in Chunk #{chunk.seq} {match_idx+1}: " f"URL={img_url[:50]}..." f"Image in Chunk #{chunk.seq} {match_idx + 1}: URL={img_url[:50]}..."
if len(img_url) > 50 if len(img_url) > 50
else f"Image in Chunk #{chunk.seq} {match_idx+1}: URL={img_url}" else f"Image in Chunk #{chunk.seq} {match_idx + 1}: URL={img_url}"
) )
return images_info return images_info
async def download_and_upload_image(self, img_url: str): async def download_and_upload_image(
"""Download image and upload to object storage, if it's already an object storage path or local path, use directly self, img_url: str
) -> Tuple[str, str, Image.Image | None]:
"""Download image and upload to object storage,
if it's already an object storage path or local path, use directly
Args: Args:
img_url: Image URL or local path img_url: Image URL or local path
Returns: Returns:
tuple: (original URL, storage URL, image object), if failed returns (original URL, None, None) tuple: (original URL, storage URL, image object),
if failed returns (original URL, None, None)
""" """
try: try:
import requests
from PIL import Image
import io
# Check if it's already a storage URL (COS or MinIO) # Check if it's already a storage URL (COS or MinIO)
is_storage_url = any( is_storage_url = any(
pattern in img_url pattern in img_url
@@ -997,12 +690,7 @@ class BaseParser(ABC):
response = requests.get(img_url, timeout=5, proxies=proxies) response = requests.get(img_url, timeout=5, proxies=proxies)
if response.status_code == 200: if response.status_code == 200:
image = Image.open(io.BytesIO(response.content)) image = Image.open(io.BytesIO(response.content))
try: return img_url, img_url, image
return img_url, img_url, image
finally:
# Ensure image resources are also released after the function returns
# Image will be closed by the caller
pass
else: else:
logger.warning( logger.warning(
f"Failed to get storage image: {response.status_code}" f"Failed to get storage image: {response.status_code}"
@@ -1022,7 +710,7 @@ class BaseParser(ABC):
# Upload to storage # Upload to storage
with open(img_url, "rb") as f: with open(img_url, "rb") as f:
content = f.read() content = f.read()
storage_url = self.upload_bytes(content) storage_url = self.storage.upload_bytes(content)
logger.info( logger.info(
f"Successfully uploaded local image to storage: {storage_url}" f"Successfully uploaded local image to storage: {storage_url}"
) )
@@ -1031,7 +719,7 @@ class BaseParser(ABC):
logger.error(f"Error processing local image: {str(e)}") logger.error(f"Error processing local image: {str(e)}")
if image and hasattr(image, "close"): if image and hasattr(image, "close"):
image.close() image.close()
return img_url, None, None return img_url, img_url, None
# Normal remote URL download handling # Normal remote URL download handling
else: else:
@@ -1044,9 +732,7 @@ class BaseParser(ABC):
if https_proxy: if https_proxy:
proxies["https"] = https_proxy proxies["https"] = https_proxy
logger.info( logger.info(f"Downloading image {img_url}, using proxy: {proxies}")
f"Downloading image {img_url}, using proxy: {proxies if proxies else 'None'}"
)
response = requests.get(img_url, timeout=5, proxies=proxies) response = requests.get(img_url, timeout=5, proxies=proxies)
if response.status_code == 200: if response.status_code == 200:
@@ -1054,7 +740,7 @@ class BaseParser(ABC):
image = Image.open(io.BytesIO(response.content)) image = Image.open(io.BytesIO(response.content))
try: try:
# Upload to storage using the method in BaseParser # Upload to storage using the method in BaseParser
storage_url = self.upload_bytes(response.content) storage_url = self.storage.upload_bytes(response.content)
logger.info( logger.info(
f"Successfully uploaded image to storage: {storage_url}" f"Successfully uploaded image to storage: {storage_url}"
) )
@@ -1064,11 +750,11 @@ class BaseParser(ABC):
pass pass
else: else:
logger.warning(f"Failed to download image: {response.status_code}") logger.warning(f"Failed to download image: {response.status_code}")
return img_url, None, None return img_url, img_url, None
except Exception as e: except Exception as e:
logger.error(f"Error downloading or processing image: {str(e)}") logger.error(f"Error downloading or processing image: {str(e)}")
return img_url, None, None return img_url, img_url, None
async def process_chunk_images_async( async def process_chunk_images_async(
self, chunk, chunk_idx, total_chunks, image_map=None self, chunk, chunk_idx, total_chunks, image_map=None
@@ -1086,18 +772,19 @@ class BaseParser(ABC):
""" """
logger.info( logger.info(
f"Starting to process images in Chunk #{chunk_idx+1}/{total_chunks}" f"Starting to process images in Chunk #{chunk_idx + 1}/{total_chunks}"
) )
# Extract image information from the Chunk # Extract image information from the Chunk
images_info = self.extract_images_from_chunk(chunk) images_info = self.extract_images_from_chunk(chunk)
if not images_info: if not images_info:
logger.info(f"Chunk #{chunk_idx+1} found no images") logger.info(f"Chunk #{chunk_idx + 1} found no images")
return chunk return chunk
# Prepare images that need to be downloaded and processed # Prepare images that need to be downloaded and processed
images_to_process = [] images_to_process = []
url_to_info_map = {} # Map URL to image information # Map URL to image information
url_to_info_map = {}
# Record all image URLs that need to be processed # Record all image URLs that need to be processed
for img_info in images_info: for img_info in images_info:
@@ -1106,14 +793,21 @@ class BaseParser(ABC):
results = [] results = []
download_tasks = [] download_tasks = []
for img_url in url_to_info_map.keys(): # Check if image is already in the image_map # Check if image is already in the image_map
for img_url in url_to_info_map.keys():
if image_map and img_url in image_map: if image_map and img_url in image_map:
logger.info(f"Image already in image_map: {img_url}, using cached object") logger.info(
results.append((img_url, img_url, image_map[img_url])) f"Image already in image_map: {img_url}, using cached object"
)
image = Image.open(
io.BytesIO(endecode.encode_image(image_map[img_url]))
)
results.append((img_url, img_url, image))
else: else:
download_task = self.download_and_upload_image(img_url) download_task = self.download_and_upload_image(img_url)
download_tasks.append(download_task) download_tasks.append(download_task)
# Concurrent download and upload of images, ignore images that are already in the image_map # Concurrent download and upload of images,
# ignore images that are already in the image_map
results.extend(await asyncio.gather(*download_tasks)) results.extend(await asyncio.gather(*download_tasks))
# Process download results, prepare for OCR processing # Process download results, prepare for OCR processing
@@ -1123,16 +817,17 @@ class BaseParser(ABC):
img_info["cos_url"] = cos_url img_info["cos_url"] = cos_url
images_to_process.append((image, cos_url)) images_to_process.append((image, cos_url))
# If no images were successfully downloaded and uploaded, return the original Chunk # If no images were successfully downloaded and uploaded,
# return the original Chunk
if not images_to_process: if not images_to_process:
logger.info( logger.info(
f"Chunk #{chunk_idx+1} found no successfully downloaded and uploaded images" f"Chunk #{chunk_idx + 1} not found downloaded and uploaded images"
) )
return chunk return chunk
# Concurrent processing of all images (OCR + caption) # Concurrent processing of all images (OCR + caption)
logger.info( logger.info(
f"Processing {len(images_to_process)} images in Chunk #{chunk_idx+1}" f"Processing {len(images_to_process)} images in Chunk #{chunk_idx + 1}"
) )
# Concurrent processing of all images # Concurrent processing of all images
@@ -1163,10 +858,12 @@ class BaseParser(ABC):
# Update image information in the Chunk # Update image information in the Chunk
chunk.images = processed_images chunk.images = processed_images
logger.info(f"Completed image processing in Chunk #{chunk_idx+1}") logger.info(f"Completed image processing in Chunk #{chunk_idx + 1}")
return chunk return chunk
def process_chunks_images(self, chunks: List[Chunk], image_map=None) -> List[Chunk]: def process_chunks_images(
self, chunks: List[Chunk], image_map: Dict[str, str] = {}
) -> List[Chunk]:
"""Concurrent processing of images in all Chunks """Concurrent processing of images in all Chunks
Args: Args:
@@ -1210,7 +907,7 @@ class BaseParser(ABC):
processed_chunks = [] processed_chunks = []
for i, result in enumerate(results): for i, result in enumerate(results):
if isinstance(result, Exception): if isinstance(result, Exception):
logger.error(f"Error processing Chunk {i+1}: {str(result)}") logger.error(f"Error processing Chunk {i + 1}: {str(result)}")
# Keep original Chunk # Keep original Chunk
if i < len(chunks): if i < len(chunks):
processed_chunks.append(chunks[i]) processed_chunks.append(chunks[i])
@@ -1235,7 +932,7 @@ class BaseParser(ABC):
# Execute processing for all Chunks # Execute processing for all Chunks
processed_chunks = loop.run_until_complete(process_all_chunks()) processed_chunks = loop.run_until_complete(process_all_chunks())
logger.info( logger.info(
f"Successfully completed concurrent processing of {len(processed_chunks)}/{len(chunks)} chunks" f"Completed processing of {len(processed_chunks)}/{len(chunks)} chunks"
) )
return processed_chunks return processed_chunks

View File

@@ -3,11 +3,10 @@ import logging
 import os
 import time
 from dataclasses import dataclass, field
-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union

-import requests
 import ollama
+import requests

 logger = logging.getLogger(__name__)

@@ -158,11 +157,16 @@ class CaptionChatResp:
        Returns:
            The content string from the first choice, or empty string if no choices
        """
-        if self.choices:
-            logger.info("Retrieving content from first choice")
-            return self.choices[0].message.content
-        logger.warning("No choices available in response")
-        return ""
+        if (
+            not self.choices
+            or not self.choices[0]
+            or not self.choices[0].message
+            or not self.choices[0].message.content
+        ):
+            logger.warning("No choices available in response")
+            return ""
+        logger.info("Retrieving content from first choice")
+        return self.choices[0].message.content

 class Caption:
@@ -171,33 +175,43 @@ class Caption:
    Uses an external API to process images and return textual descriptions.
    """

-    def __init__(self, vlm_config=None):
-        """Initialize the Caption service with configuration from parameters or environment variables."""
+    def __init__(self, vlm_config: Optional[Dict[str, str]] = None):
+        """
+        Initialize the Caption service with configuration
+        from parameters or environment variables.
+        """
        logger.info("Initializing Caption service")
        self.prompt = """简单凝炼的描述图片的主要内容"""
+        self.timeout = 30

-        # Use provided VLM config if available, otherwise fall back to environment variables
+        # Use provided VLM config if available,
+        # otherwise fall back to environment variables
        if vlm_config and vlm_config.get("base_url") and vlm_config.get("model_name"):
            self.completion_url = vlm_config.get("base_url", "") + "/chat/completions"
            self.model = vlm_config.get("model_name", "")
            self.api_key = vlm_config.get("api_key", "")
            self.interface_type = vlm_config.get("interface_type", "openai").lower()
        else:
-            if os.getenv("VLM_MODEL_BASE_URL") == "" or os.getenv("VLM_MODEL_NAME") == "":
+            base_url = os.getenv("VLM_MODEL_BASE_URL")
+            model_name = os.getenv("VLM_MODEL_NAME")
+            if not base_url or not model_name:
                logger.error("VLM_MODEL_BASE_URL or VLM_MODEL_NAME is not set")
                return
-            self.completion_url = os.getenv("VLM_MODEL_BASE_URL") + "/chat/completions"
-            self.model = os.getenv("VLM_MODEL_NAME")
-            self.api_key = os.getenv("VLM_MODEL_API_KEY")
+            self.completion_url = base_url + "/chat/completions"
+            self.model = model_name
+            self.api_key = os.getenv("VLM_MODEL_API_KEY", "")
            self.interface_type = os.getenv("VLM_INTERFACE_TYPE", "openai").lower()

        # Validate the interface type
        if self.interface_type not in ["ollama", "openai"]:
-            logger.warning(f"Unknown interface type: {self.interface_type}, defaulting to openai")
+            logger.warning(
+                f"Unknown interface type: {self.interface_type}, defaulting to openai"
+            )
            self.interface_type = "openai"

        logger.info(
-            f"Service configured with model: {self.model}, endpoint: {self.completion_url}, interface: {self.interface_type}"
+            f"Configured with model: {self.model}, "
+            f"endpoint: {self.completion_url}, interface: {self.interface_type}"
        )

    def _call_caption_api(self, image_data: str) -> Optional[CaptionChatResp]:
@@ -210,8 +224,8 @@ class Caption:
        Returns:
            CaptionChatResp object if successful, None otherwise
        """
-        logger.info(f"Calling Caption API for image captioning")
-        logger.info(f"Processing image data: {image_data[:50] if len(image_data) > 50 else image_data}")
+        logger.info("Calling Caption API for image captioning")
+        logger.info(f"Processing image data: {image_data[:50]}...")

        # Choose the call path based on the interface type
        if self.interface_type == "ollama":
@@ -226,39 +240,35 @@ class Caption:
        client = ollama.Client(
            host=host,
+            timeout=self.timeout,
        )

        try:
            logger.info(f"Calling Ollama API with model: {self.model}")
            # Call the Ollama API, passing the base64-encoded image via `images`
            response = client.generate(
                model=self.model,
                prompt="简单凝炼的描述图片的主要内容",
                images=[image_base64],  # image_base64 is base64-encoded image data
                options={"temperature": 0.1},
                stream=False,
            )

            # Build the response object
            caption_resp = CaptionChatResp(
                id="ollama_response",
                created=int(time.time()),
-                model=self.model,
+                model=Model(id=self.model),
                object="chat.completion",
                choices=[
-                    Choice(
-                        message=Message(
-                            role="assistant",
-                            content=response.response
-                        )
-                    )
-                ]
+                    Choice(message=Message(role="assistant", content=response.response))
+                ],
            )
            logger.info("Successfully received response from Ollama API")
            return caption_resp
        except Exception as e:
            logger.error(f"Error calling Ollama API: {e}")
            return None
@@ -266,13 +276,16 @@ class Caption:
    def _call_openai_api(self, image_base64: str) -> Optional[CaptionChatResp]:
        """Call OpenAI-compatible API for image captioning."""
        logger.info(f"Calling OpenAI-compatible API with model: {self.model}")

        user_msg = UserMessage(
            role="user",
            content=[
                Content(type="text", text=self.prompt),
                Content(
-                    type="image_url", image_url=ImageUrl(url="data:image/png;base64," + image_base64, detail="auto")
+                    type="image_url",
+                    image_url=ImageUrl(
+                        url="data:image/png;base64," + image_base64, detail="auto"
+                    ),
                ),
            ],
        )
@@ -295,23 +308,23 @@ class Caption:
            headers["Authorization"] = f"Bearer {self.api_key}"

        try:
-            logger.info(f"Sending request to OpenAI-compatible API with model: {self.model}")
+            logger.info(
+                f"Sending request to OpenAI-compatible API with model: {self.model}"
+            )
            response = requests.post(
                self.completion_url,
                data=json.dumps(gpt_req, default=lambda o: o.__dict__, indent=4),
                headers=headers,
-                timeout=30,
+                timeout=self.timeout,
            )
            if response.status_code != 200:
                logger.error(
-                    f"OpenAI-compatible API returned non-200 status code: {response.status_code}"
+                    f"OpenAI API returned non-200 status code: {response.status_code}"
                )
                response.raise_for_status()

-            logger.info(
-                f"Successfully received response from OpenAI-compatible API with status: {response.status_code}"
-            )
-            logger.info(f"Converting response to CaptionChatResp object")
+            logger.info(f"Received from OpenAI with status: {response.status_code}")
+            logger.info("Converting response to CaptionChatResp object")
            caption_resp = CaptionChatResp.from_json(response.json())

            if caption_resp.usage:
@@ -322,7 +335,7 @@ class Caption:
            return caption_resp
        except requests.exceptions.Timeout:
-            logger.error(f"Timeout while calling OpenAI-compatible API after 30 seconds")
+            logger.error("Timeout while calling OpenAI-compatible API after 30 seconds")
            return None
        except requests.exceptions.RequestException as e:
            logger.error(f"Request error calling OpenAI-compatible API: {e}")


@@ -0,0 +1,70 @@
import logging
from typing import List, Tuple, Type
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
logger = logging.getLogger(__name__)
class FirstParser(BaseParser):
_parser_cls: Tuple[Type["BaseParser"], ...] = ()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._parsers: List[BaseParser] = []
for parser_cls in self._parser_cls:
try:
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
except Exception as e:
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
def parse_into_text(self, content: bytes) -> Document:
for p in self._parsers:
document = p.parse_into_text(content)
if document.is_valid():
return document
return Document()
@classmethod
def create(cls, *parser_classes: Type["BaseParser"]) -> Type["FirstParser"]:
names = "_".join([p.__name__ for p in parser_classes])
return type(f"FirstParser_{names}", (cls,), {"_parser_cls": parser_classes})
class PipelineParser(BaseParser):
_parser_cls: Tuple[Type["BaseParser"], ...] = ()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._parsers: List[BaseParser] = []
for parser_cls in self._parser_cls:
try:
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
except Exception as e:
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
def parse_into_text(self, content: bytes) -> Document:
document = Document()
for p in self._parsers:
document = p.parse_into_text(content)
content = endecode.encode_bytes(document.content)
return document
@classmethod
def create(cls, *parser_classes: Type["BaseParser"]) -> Type["PipelineParser"]:
names = "_".join([p.__name__ for p in parser_classes])
return type(f"PipelineParser_{names}", (cls,), {"_parser_cls": parser_classes})
if __name__ == "__main__":
from docreader.parser.markdown_parser import MarkdownParser
cls = FirstParser.create(MarkdownParser)
parser = cls()
print(parser.parse_into_text(b"aaa"))
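
Note: FirstParser's fallback hinges on Document.is_valid(); the model class lives outside this diff, but a minimal sketch of the assumed contract would be:

from dataclasses import dataclass, field
from typing import Dict

@dataclass
class Document:
    # sketch of the model assumed in docreader/models/document.py
    content: str = ""
    images: Dict[str, str] = field(default_factory=dict)

    def is_valid(self) -> bool:
        # a parse attempt counts as successful once it produced any text
        return bool(self.content.strip())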


@@ -1,21 +0,0 @@
from dataclasses import dataclass, field
@dataclass
class ChunkingConfig:
"""
Configuration for text chunking process.
Controls how documents are split into smaller pieces for processing.
"""
chunk_size: int = 512 # Maximum size of each chunk in tokens/chars
chunk_overlap: int = 50 # Number of tokens/chars to overlap between chunks
separators: list = field(
default_factory=lambda: ["\n\n", "\n", ""]
) # Text separators in order of priority
enable_multimodal: bool = (
False # Whether to enable multimodal processing (text + images)
)
storage_config: dict = None # Preferred field name going forward
vlm_config: dict = None # VLM configuration for image captioning
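
Note: the dataclass is deleted here rather than fixed in place. Wherever its replacement lives, an equivalent definition with explicit Optional defaults (the old storage_config: dict = None relied on an untyped None default) might look like this sketch:

from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class ChunkingConfig:
    chunk_size: int = 512
    chunk_overlap: int = 50
    separators: List[str] = field(default_factory=lambda: ["\n\n", "\n", ""])
    enable_multimodal: bool = False
    storage_config: Optional[dict] = None
    vlm_config: Optional[dict] = None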


@@ -1,134 +1,88 @@
-import asyncio
 import logging
-import re
-import tempfile
 import os
 import subprocess
-import shutil
-from io import BytesIO
-from typing import Optional, List, Tuple
-
-import textract
-from PIL import Image
-import zipfile
-import xml.etree.ElementTree as ET
-
-from .base_parser import BaseParser
-from .docx_parser import DocxParser, Docx
+from typing import List, Optional
+
+import textract
+
+from docreader.models.document import Document
+from docreader.parser.docx2_parser import Docx2Parser
+from docreader.utils.tempfile import TempDirContext, TempFileContext

 logger = logging.getLogger(__name__)

-class DocParser(BaseParser):
+class DocParser(Docx2Parser):
    """DOC document parser"""

-    def parse_into_text(self, content: bytes) -> str:
-        """Parse DOC document
-
-        Args:
-            content: DOC document content
-
-        Returns:
-            Parse result
-        """
+    def parse_into_text(self, content: bytes) -> Document:
        logger.info(f"Parsing DOC document, content size: {len(content)} bytes")
+        handle_chain = [
+            # 1. Try to convert to docx format to extract images
+            self._parse_with_docx,
+            # 2. If image extraction is not needed or conversion failed,
+            # try using antiword to extract text
+            self._parse_with_antiword,
+            # 3. If antiword extraction fails, use textract
+            self._parse_with_textract,
+        ]

        # Save byte content as a temporary file
-        with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as temp_file:
-            temp_file_path = temp_file.name
-            temp_file.write(content)
-            temp_file.flush()
-            logger.info(f"Saved DOC content to temporary file: {temp_file_path}")
-
-        try:
-            # First try to convert to docx format to extract images
-            if self.enable_multimodal:
-                logger.info("Multimodal enabled, attempting to extract images from DOC")
-                docx_content = self._convert_doc_to_docx(temp_file_path)
-                if docx_content:
-                    logger.info("Successfully converted DOC to DOCX, using DocxParser")
-                    # Use existing DocxParser to parse the converted docx
-                    docx_parser = DocxParser(
-                        file_name=self.file_name,
-                        file_type="docx",
-                        enable_multimodal=self.enable_multimodal,
-                        chunk_size=self.chunk_size,
-                        chunk_overlap=self.chunk_overlap,
-                        chunking_config=self.chunking_config,
-                        separators=self.separators,
-                    )
-                    text = docx_parser.parse_into_text(docx_content)
-                    logger.info(f"Extracted {len(text)} characters using DocxParser")
-
-                    # Clean up temporary file
-                    os.unlink(temp_file_path)
-                    logger.info(f"Deleted temporary file: {temp_file_path}")
-
-                    return text
-                else:
-                    logger.warning(
-                        "Failed to convert DOC to DOCX, falling back to text-only extraction"
-                    )
-
-            # If image extraction is not needed or conversion failed, try using antiword to extract text
-            try:
-                logger.info("Attempting to parse DOC file with antiword")
-                # Check if antiword is installed
-                antiword_path = self._find_antiword_path()
-
-                if antiword_path:
-                    # Use antiword to extract text directly
-                    logger.info(f"Using antiword at {antiword_path} to extract text")
-                    process = subprocess.Popen(
-                        [antiword_path, temp_file_path],
-                        stdout=subprocess.PIPE,
-                        stderr=subprocess.PIPE,
-                    )
-                    stdout, stderr = process.communicate()
-
-                    if process.returncode == 0:
-                        text = stdout.decode("utf-8", errors="ignore")
-                        logger.info(
-                            f"Successfully extracted {len(text)} characters using antiword"
-                        )
-
-                        # Clean up temporary file
-                        os.unlink(temp_file_path)
-                        logger.info(f"Deleted temporary file: {temp_file_path}")
-
-                        return text
-                    else:
-                        logger.warning(
-                            f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
-                        )
-                else:
-                    logger.warning("antiword not found, falling back to textract")
-            except Exception as e:
-                logger.warning(
-                    f"Error using antiword: {str(e)}, falling back to textract"
-                )
-
-            # If antiword fails, try using textract
-            logger.info("Parsing DOC file with textract")
-            text = textract.process(temp_file_path, method="antiword").decode("utf-8")
-            logger.info(
-                f"Successfully extracted {len(text)} characters of text from DOC document using textract"
-            )
-
-            # Clean up temporary file
-            os.unlink(temp_file_path)
-            logger.info(f"Deleted temporary file: {temp_file_path}")
-
-            return text
-        except Exception as e:
-            logger.error(f"Error parsing DOC document: {str(e)}")
-            # Ensure temporary file is cleaned up
-            if os.path.exists(temp_file_path):
-                os.unlink(temp_file_path)
-                logger.info(f"Deleted temporary file after error: {temp_file_path}")
-            return ""
-
-    def _convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
+        with TempFileContext(content, ".doc") as temp_file_path:
+            for handle in handle_chain:
+                try:
+                    document = handle(temp_file_path)
+                    if document:
+                        return document
+                except Exception as e:
+                    logger.warning(f"Failed to parse DOC with {handle.__name__} {e}")
+        return Document(content="")
+
+    def _parse_with_docx(self, temp_file_path: str) -> Document:
+        logger.info("Multimodal enabled, attempting to extract images from DOC")
+        docx_content = self._try_convert_doc_to_docx(temp_file_path)
+        if not docx_content:
+            raise RuntimeError("Failed to convert DOC to DOCX")
+        logger.info("Successfully converted DOC to DOCX, using DocxParser")
+        # Use existing DocxParser to parse the converted docx
+        document = super(Docx2Parser, self).parse_into_text(docx_content)
+        logger.info(f"Extracted {len(document.content)} characters using DocxParser")
+        return document
+
+    def _parse_with_antiword(self, temp_file_path: str) -> Document:
+        logger.info("Attempting to parse DOC file with antiword")
+        # Check if antiword is installed
+        antiword_path = self._try_find_antiword()
+        if not antiword_path:
+            raise RuntimeError("antiword not found in PATH")
+        # Use antiword to extract text directly
+        process = subprocess.Popen(
+            [antiword_path, temp_file_path],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        stdout, stderr = process.communicate()
+        if process.returncode != 0:
+            raise RuntimeError(
+                f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
+            )
+        text = stdout.decode("utf-8", errors="ignore")
+        logger.info(f"Successfully extracted {len(text)} characters using antiword")
+        return Document(content=text)
+
+    def _parse_with_textract(self, temp_file_path: str) -> Document:
+        logger.info(f"Parsing DOC file with textract: {temp_file_path}")
+        text = textract.process(temp_file_path, method="antiword").decode("utf-8")
+        logger.info(f"Successfully extracted {len(text)} bytes of DOC using textract")
+        return Document(content=str(text))
+
+    def _try_convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
        """Convert DOC file to DOCX format

        Uses LibreOffice/OpenOffice for conversion
@@ -141,21 +95,16 @@ class DocParser(BaseParser):
        """
        logger.info(f"Converting DOC to DOCX: {doc_path}")

+        # Check if LibreOffice or OpenOffice is installed
+        soffice_path = self._try_find_soffice()
+        if not soffice_path:
+            return None
+
+        # Execute conversion command
+        logger.info(f"Using {soffice_path} to convert DOC to DOCX")
+
        # Create a temporary directory to store the converted file
-        temp_dir = tempfile.mkdtemp()
-        docx_path = os.path.join(temp_dir, "converted.docx")
-
-        try:
-            # Check if LibreOffice or OpenOffice is installed
-            soffice_path = self._find_soffice_path()
-            if not soffice_path:
-                logger.error(
-                    "LibreOffice/OpenOffice not found, cannot convert DOC to DOCX"
-                )
-                return None
-
-            # Execute conversion command
-            logger.info(f"Using {soffice_path} to convert DOC to DOCX")
+        with TempDirContext() as temp_dir:
            cmd = [
                soffice_path,
                "--headless",
@@ -165,7 +114,6 @@ class DocParser(BaseParser):
                temp_dir,
                doc_path,
            ]
-
            logger.info(f"Running command: {' '.join(cmd)}")
            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
@@ -173,41 +121,68 @@ class DocParser(BaseParser):
            stdout, stderr = process.communicate()

            if process.returncode != 0:
-                logger.error(
-                    f"Error converting DOC to DOCX: {stderr.decode('utf-8', errors='ignore')}"
+                logger.warning(
+                    f"Error converting DOC to DOCX: {stderr.decode('utf-8')}"
                )
                return None

            # Find the converted file
-            for file in os.listdir(temp_dir):
-                if file.endswith(".docx"):
-                    converted_file = os.path.join(temp_dir, file)
-                    logger.info(f"Found converted file: {converted_file}")
-
-                    # Read the converted file content
-                    with open(converted_file, "rb") as f:
-                        docx_content = f.read()
-
-                    logger.info(
-                        f"Successfully read converted DOCX file, size: {len(docx_content)} bytes"
-                    )
-                    return docx_content
-
-            logger.error("No DOCX file found after conversion")
-            return None
-
-        except Exception as e:
-            logger.error(f"Error during DOC to DOCX conversion: {str(e)}")
-            return None
-        finally:
-            # Clean up temporary directory
-            try:
-                shutil.rmtree(temp_dir)
-                logger.info(f"Cleaned up temporary directory: {temp_dir}")
-            except Exception as e:
-                logger.warning(f"Failed to clean up temporary directory: {str(e)}")
-
-    def _find_soffice_path(self) -> Optional[str]:
+            docx_file = [
+                file for file in os.listdir(temp_dir) if file.endswith(".docx")
+            ]
+            logger.info(f"Found {len(docx_file)} DOCX file(s) in temporary directory")
+            for file in docx_file:
+                converted_file = os.path.join(temp_dir, file)
+                logger.info(f"Found converted file: {converted_file}")
+
+                # Read the converted file content
+                with open(converted_file, "rb") as f:
+                    docx_content = f.read()
+
+                logger.info(
+                    f"Successfully read DOCX file, size: {len(docx_content)}"
+                )
+                return docx_content
+        return None
+
+    def _try_find_executable_path(
+        self,
+        executable_name: str,
+        possible_path: List[str] = [],
+        environment_variable: List[str] = [],
+    ) -> Optional[str]:
+        """Find executable path
+
+        Args:
+            executable_name: Executable name
+            possible_path: List of possible paths
+            environment_variable: List of environment variables to check
+
+        Returns:
+            Executable path, or None if not found
+        """
+        # Common executable paths
+        paths: List[str] = []
+        paths.extend(possible_path)
+        paths.extend(os.environ.get(env_var, "") for env_var in environment_variable)
+        paths = list(set(paths))
+
+        # Check if path is set in environment variable
+        for path in paths:
+            if os.path.exists(path):
+                logger.info(f"Found {executable_name} at {path}")
+                return path
+
+        # Try to find in PATH
+        result = subprocess.run(
+            ["which", executable_name], capture_output=True, text=True
+        )
+        if result.returncode == 0 and result.stdout.strip():
+            path = result.stdout.strip()
+            logger.info(f"Found {executable_name} at {path}")
+            return path
+        logger.warning(f"Failed to find {executable_name}")
+        return None
+
+    def _try_find_soffice(self) -> Optional[str]:
        """Find LibreOffice/OpenOffice executable path

        Returns:
@@ -225,32 +200,13 @@ class DocParser(BaseParser):
            "C:\\Program Files\\LibreOffice\\program\\soffice.exe",
            "C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe",
        ]
+        return self._try_find_executable_path(
+            executable_name="soffice",
+            possible_path=possible_paths,
+            environment_variable=["LIBREOFFICE_PATH"],
+        )

-        # Check if path is set in environment variable
-        if os.environ.get("LIBREOFFICE_PATH"):
-            possible_paths.insert(0, os.environ.get("LIBREOFFICE_PATH"))
-
-        for path in possible_paths:
-            if os.path.exists(path):
-                logger.info(f"Found LibreOffice/OpenOffice at: {path}")
-                return path
-
-        # Try to find in PATH
-        try:
-            result = subprocess.run(
-                ["which", "soffice"], capture_output=True, text=True
-            )
-            if result.returncode == 0 and result.stdout.strip():
-                path = result.stdout.strip()
-                logger.info(f"Found LibreOffice/OpenOffice in PATH: {path}")
-                return path
-        except Exception:
-            pass
-
-        logger.warning("LibreOffice/OpenOffice not found")
-        return None
-
-    def _find_antiword_path(self) -> Optional[str]:
+    def _try_find_antiword(self) -> Optional[str]:
        """Find antiword executable path

        Returns:
@@ -265,51 +221,27 @@ class DocParser(BaseParser):
            "C:\\Program Files\\Antiword\\antiword.exe",
            "C:\\Program Files (x86)\\Antiword\\antiword.exe",
        ]
-
-        # Check if path is set in environment variable
-        if os.environ.get("ANTIWORD_PATH"):
-            possible_paths.insert(0, os.environ.get("ANTIWORD_PATH"))
-
-        for path in possible_paths:
-            if os.path.exists(path):
-                logger.info(f"Found antiword at: {path}")
-                return path
-
-        # Try to find in PATH
-        try:
-            result = subprocess.run(
-                ["which", "antiword"], capture_output=True, text=True
-            )
-            if result.returncode == 0 and result.stdout.strip():
-                path = result.stdout.strip()
-                logger.info(f"Found antiword in PATH: {path}")
-                return path
-        except Exception:
-            pass
-
-        logger.warning("antiword not found")
-        return None
+        return self._try_find_executable_path(
+            executable_name="antiword",
+            possible_path=possible_paths,
+            environment_variable=["ANTIWORD_PATH"],
+        )

 if __name__ == "__main__":
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-        datefmt="%Y-%m-%d %H:%M:%S",
-    )
-    logger.info("Running DocParser in standalone mode")
+    logging.basicConfig(level=logging.DEBUG)

    file_name = "/path/to/your/test.doc"
    logger.info(f"Processing file: {file_name}")
    doc_parser = DocParser(
-        file_name, enable_multimodal=True, chunk_size=512, chunk_overlap=60
+        file_name=file_name,
+        enable_multimodal=True,
+        chunk_size=512,
+        chunk_overlap=60,
    )
-    logger.info("Parser initialized, starting processing")
    with open(file_name, "rb") as f:
        content = f.read()
-    text = doc_parser.parse_into_text(content)
-    logger.info(f"Processing complete, extracted text length: {len(text)}")
-    logger.info(f"Sample text: {text[:200]}...")
+    document = doc_parser.parse_into_text(content)
+    logger.info(f"Processing complete, extracted text length: {len(document.content)}")
+    logger.info(f"Sample text: {document.content[:200]}...")


@@ -0,0 +1,28 @@
import logging
from docreader.parser.chain_parser import FirstParser
from docreader.parser.docx_parser import DocxParser
from docreader.parser.markitdown_parser import MarkitdownParser
logger = logging.getLogger(__name__)
class Docx2Parser(FirstParser):
_parser_cls = (MarkitdownParser, DocxParser)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
your_file = "/path/to/your/file.docx"
parser = Docx2Parser(separators=[".", "?", "!", "。", "?", "!"])
with open(your_file, "rb") as f:
content = f.read()
document = parser.parse(content)
for cc in document.chunks:
logger.info(f"chunk: {cc}")
# document = parser.parse_into_text(content)
# logger.info(f"docx content: {document.content}")
# logger.info(f"find images {document.images.keys()}")


@@ -1,37 +1,36 @@
 import logging
-import tempfile
 import os
-import sys
-import time
-from io import BytesIO
-from typing import Optional, Dict, Any, Tuple, List, Union
-from dataclasses import dataclass, field
-from PIL import Image
-from docx import Document
-from docx.image.exceptions import (
-    UnrecognizedImageError,
-    UnexpectedEndOfFileError,
-    InvalidImageStreamError,
-)
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
+import re
 import tempfile
 import threading
+import time
 import traceback
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from dataclasses import dataclass, field
+from io import BytesIO
 from multiprocessing import Manager
-import re
-
-from .base_parser import BaseParser
+from typing import Any, Dict, List, Optional, Tuple
+
+from docx import Document
+from docx.image.exceptions import (
+    InvalidImageStreamError,
+    UnexpectedEndOfFileError,
+    UnrecognizedImageError,
+)
+from PIL import Image
+
+from docreader.models.document import Document as DocumentModel
+from docreader.parser.base_parser import BaseParser
+from docreader.utils import endecode

 logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)

 # Add thread local storage to track the processing status of each thread
 thread_local = threading.local()

 class ImageData:
    """Represents a processed image of document content"""

    local_path: str = ""
-    object: Image.Image = None
+    object: Optional[Image.Image] = None
    url: str = ""

@@ -40,7 +39,9 @@ class LineData:
    """Represents a processed line of document content with associated images"""

    text: str = ""  # Extracted text content
-    images: List[ImageData] = field(default_factory=list)  # List of images or image paths
+    images: List[ImageData] = field(
+        default_factory=list
+    )  # List of images or image paths
    extra_info: str = ""  # Placeholder for additional info (currently unused)
    page_num: int = 0  # Page number
    content_sequence: List[Tuple[str, Any]] = field(
@@ -53,18 +54,8 @@ class DocxParser(BaseParser):

    def __init__(
        self,
-        file_name: str = "",
-        file_type: str = None,
-        enable_multimodal: bool = True,
-        chunk_size: int = 1000,
-        chunk_overlap: int = 200,
-        separators: list = ["\n\n", "\n", ""],
-        ocr_backend: str = "paddle",
-        ocr_config: dict = None,
-        max_image_size: int = 1920,
-        max_concurrent_tasks: int = 5,
-        max_pages: int = 100,  # Maximum number of pages to process, default to 50 pages
-        chunking_config=None,
+        max_pages: int = 100,  # Maximum number of pages to process
+        **kwargs,
    ):
        """Initialize DOCX document parser

@@ -79,37 +70,16 @@ class DocxParser(BaseParser):
            ocr_config: OCR engine configuration
            max_image_size: Maximum image size limit
            max_concurrent_tasks: Maximum number of concurrent tasks
-            max_pages: Maximum number of pages to process, if more than this, only process the first max_pages pages
+            max_pages: Maximum number of pages to process
        """
-        super().__init__(
-            file_name=file_name,
-            file_type=file_type,
-            enable_multimodal=enable_multimodal,
-            chunk_size=chunk_size,
-            chunk_overlap=chunk_overlap,
-            separators=separators,
-            ocr_backend=ocr_backend,
-            ocr_config=ocr_config,
-            max_image_size=max_image_size,
-            max_concurrent_tasks=max_concurrent_tasks,
-            chunking_config=chunking_config,
-        )
+        super().__init__(**kwargs)
        self.max_pages = max_pages
        logger.info(f"DocxParser initialized with max_pages={max_pages}")

-    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
-        """Parse DOCX document, extract text content and image Markdown links
-
-        Args:
-            content: DOCX document content
-
-        Returns:
-            Tuple of (parsed_text, image_map) where image_map maps image URLs to Image objects
-            All LineData objects are used internally but not returned directly through this interface
-        """
+    def parse_into_text(self, content: bytes) -> DocumentModel:
+        """Parse DOCX document, extract text content and image Markdown links"""
        logger.info(f"Parsing DOCX document, content size: {len(content)} bytes")
        logger.info(f"Max pages limit set to: {self.max_pages}")
-        logger.info("Converting DOCX content to sections and tables")

        start_time = time.time()

        # Use concurrent processing to handle the document
@@ -123,7 +93,7 @@ class DocxParser(BaseParser):
            docx_processor = Docx(
                max_image_size=self.max_image_size,
                enable_multimodal=self.enable_multimodal,
-                upload_file=self.upload_file,
+                upload_file=self.storage.upload_file,
            )
            all_lines, tables = docx_processor(
                binary=content,
@@ -140,7 +110,7 @@ class DocxParser(BaseParser):
            section_start_time = time.time()

            text_parts = []
-            image_parts = {}
+            image_parts: Dict[str, str] = {}

            for sec_idx, line in enumerate(all_lines):
                try:
@@ -148,16 +118,19 @@ class DocxParser(BaseParser):
                        text_parts.append(line.text)
                        if sec_idx < 3 or sec_idx % 50 == 0:
                            logger.info(
-                                f"Added section {sec_idx+1} text: {line.text[:50]}..."
+                                f"Added section {sec_idx + 1} text: {line.text[:50]}..."
                                if len(line.text) > 50
-                                else f"Added section {sec_idx+1} text: {line.text}"
+                                else f"Added section {sec_idx + 1} text: {line.text}"
                            )
                    if line.images:
                        for image_data in line.images:
-                            if image_data.url:
-                                image_parts[image_data.url] = image_data.object
+                            if image_data.url and image_data.object:
+                                image_parts[image_data.url] = endecode.decode_image(
+                                    image_data.object
+                                )
+                                image_data.object.close()
                except Exception as e:
-                    logger.error(f"Error processing section {sec_idx+1}: {str(e)}")
+                    logger.error(f"Error processing section {sec_idx + 1}: {str(e)}")
                    logger.error(f"Detailed stack trace: {traceback.format_exc()}")
                    continue
@@ -176,17 +149,17 @@ class DocxParser(BaseParser):

            total_processing_time = time.time() - start_time
            logger.info(
-                f"Parsing complete in {total_processing_time:.2f}s, generated {len(text)} characters of text "
+                f"Parsing complete in {total_processing_time:.2f}s, "
+                f"generated {len(text)} characters of text"
            )

-            return text, image_parts
+            return DocumentModel(content=text, images=image_parts)
        except Exception as e:
            logger.error(f"Error parsing DOCX document: {str(e)}")
            logger.error(f"Detailed stack trace: {traceback.format_exc()}")
-            fallback_text = self._parse_using_simple_method(content)
-            return fallback_text, {}
+            return self._parse_using_simple_method(content)

-    def _parse_using_simple_method(self, content: bytes) -> str:
+    def _parse_using_simple_method(self, content: bytes) -> DocumentModel:
        """Parse document using a simplified method, as a fallback

        Args:
@@ -201,7 +174,8 @@ class DocxParser(BaseParser):
            doc = Document(BytesIO(content))
            logger.info(
                f"Successfully loaded document in simplified method, "
-                f"contains {len(doc.paragraphs)} paragraphs and {len(doc.tables)} tables"
+                f"contains {len(doc.paragraphs)} paragraphs "
+                f"and {len(doc.tables)} tables"
            )

            text_parts = []
@@ -211,7 +185,7 @@ class DocxParser(BaseParser):
            para_with_text = 0
            for i, para in enumerate(doc.paragraphs):
                if i % 100 == 0:
-                    logger.info(f"Processing paragraph {i+1}/{para_count}")
+                    logger.info(f"Processing paragraph {i + 1}/{para_count}")
                if para.text.strip():
                    text_parts.append(para.text.strip())
                    para_with_text += 1
@@ -225,7 +199,7 @@ class DocxParser(BaseParser):
            rows_processed = 0
            for i, table in enumerate(doc.tables):
                if i % 10 == 0:
-                    logger.info(f"Processing table {i+1}/{table_count}")
+                    logger.info(f"Processing table {i + 1}/{table_count}")

                table_has_content = False
                for row in table.rows:
@@ -256,25 +230,24 @@ class DocxParser(BaseParser):
            # If the result is still empty, return an error message
            if not result_text:
                logger.warning("No text extracted using simplified method")
-                return "", {}
+                return DocumentModel()

-            return result_text, {}
+            return DocumentModel(content=result_text)
        except Exception as backup_error:
            processing_time = time.time() - start_time
            logger.error(
-                f"Simplified parsing failed after {processing_time:.2f}s: {str(backup_error)}"
+                f"Simplified parsing failed {processing_time:.2f}s: {backup_error}"
            )
            logger.error(f"Detailed traceback: {traceback.format_exc()}")
-            return "", {}
+            return DocumentModel()

 class Docx:
    def __init__(self, max_image_size=1920, enable_multimodal=False, upload_file=None):
        logger.info("Initializing DOCX processor")
        self.max_image_size = max_image_size  # Maximum image size limit
-        self.picture_cache = (
-            {}
-        )  # Image cache to avoid processing the same image repeatedly
+        # Image cache to avoid processing the same image repeatedly
+        self.picture_cache = {}
        self.enable_multimodal = enable_multimodal
        self.upload_file = upload_file

@@ -454,7 +427,6 @@ class Docx:
        return page_to_paragraphs

-
    def __call__(
        self,
        binary: Optional[bytes] = None,
@@ -611,7 +583,6 @@ class Docx:
        return pages_to_process

-
    def _process_document(
        self,
        binary,
@@ -806,7 +777,9 @@ class Docx:
                    # Collect temporary image paths for later cleanup
                    for line in page_lines:
                        for image_data in line.images:
-                            if image_data.local_path and image_data.local_path.startswith("/tmp/docx_img_"):
+                            if image_data.local_path and image_data.local_path.startswith(
+                                "/tmp/docx_img_"
+                            ):
                                temp_img_paths.add(image_data.local_path)

                    results.extend(page_lines)
@@ -876,7 +849,11 @@ class Docx:
            # Process all image data objects
            for image_data in image_paths:
-                if image_data.local_path and os.path.exists(image_data.local_path) and image_data.local_path not in image_url_map:
+                if (
+                    image_data.local_path
+                    and os.path.exists(image_data.local_path)
+                    and image_data.local_path not in image_url_map
+                ):
                    try:
                        # Upload the image if it doesn't have a URL yet
                        if not image_data.url:
@@ -886,12 +863,16 @@ class Docx:
                                image_data.url = image_url
                                # Add image URL as Markdown format
                                markdown_image = f"![]({image_url})"
-                                image_url_map[image_data.local_path] = markdown_image
+                                image_url_map[image_data.local_path] = (
+                                    markdown_image
+                                )
                                logger.info(
                                    f"Added image URL for {image_data.local_path}: {image_url}"
                                )
                            else:
-                                logger.warning(f"Failed to upload image: {image_data.local_path}")
+                                logger.warning(
+                                    f"Failed to upload image: {image_data.local_path}"
+                                )
                        else:
                            # Already has a URL, use it
                            markdown_image = f"![]({image_data.url})"
@@ -925,12 +906,19 @@ class Docx:
                        # For ImageData objects, use the URL
                        if isinstance(content, str) and content in image_url_map:
                            combined_parts.append(image_url_map[content])
-                        elif hasattr(content, 'local_path') and content.local_path in image_url_map:
+                        elif (
+                            hasattr(content, "local_path")
+                            and content.local_path in image_url_map
+                        ):
                            combined_parts.append(image_url_map[content.local_path])

                    # Create the final text with proper ordering
                    final_text = "\n\n".join(part for part in combined_parts if part)
-                    processed_lines.append(LineData(text=final_text, page_num=page_num, images=line_data.images))
+                    processed_lines.append(
+                        LineData(
+                            text=final_text, page_num=page_num, images=line_data.images
+                        )
+                    )
            else:
                processed_lines = lines

@@ -1003,11 +991,11 @@ class Docx:
        logger.info(f"Processing {table_count} tables")
        for tb_idx, tb in enumerate(self.doc.tables):
            if tb_idx % 10 == 0:  # Log only every 10 tables to reduce log volume
-                logger.info(f"Processing table {tb_idx+1}/{table_count}")
+                logger.info(f"Processing table {tb_idx + 1}/{table_count}")

            # Optimize: Check if table is empty
            if len(tb.rows) == 0 or all(len(r.cells) == 0 for r in tb.rows):
-                logger.info(f"Skipping empty table {tb_idx+1}")
+                logger.info(f"Skipping empty table {tb_idx + 1}")
                continue

            table_html = self._convert_table_to_html(tb)
@@ -1111,8 +1099,8 @@ def _save_image_to_temp(logger, image, page_num, img_idx):
    if not image:
        return None

-    import tempfile
    import os
+    import tempfile

    try:
        # Create a temporary file
@@ -1187,8 +1175,15 @@ def process_page_multiprocess(
        return []

    # Extract page content
-    combined_text, image_objects, content_sequence = _extract_page_content_in_process(
-        process_logger, doc, page_num, paragraphs, enable_multimodal, max_image_size
+    combined_text, image_objects, content_sequence = (
+        _extract_page_content_in_process(
+            process_logger,
+            doc,
+            page_num,
+            paragraphs,
+            enable_multimodal,
+            max_image_size,
+        )
    )

    # Process content sequence to maintain order between processes
@@ -1199,7 +1194,9 @@ def process_page_multiprocess(
    if enable_multimodal:
        # First pass: save all images to temporary files
        for i, image_object in enumerate(image_objects):
-            img_path = _save_image_to_temp(process_logger, image_object, page_num, i)
+            img_path = _save_image_to_temp(
+                process_logger, image_object, page_num, i
+            )
            if img_path:
                # Create ImageData object
                image_data = ImageData()
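
Note: the hunks above fan pages out to worker processes and collect them as they finish. A condensed sketch of that dispatch pattern (process_page_multiprocess's real parameter list is longer than shown here):

import logging
from concurrent.futures import ProcessPoolExecutor, as_completed

logger = logging.getLogger(__name__)

def dispatch_pages(pages, worker, max_workers=4):
    # `worker` stands in for process_page_multiprocess; each future yields
    # the list of LineData objects produced for one page
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(worker, page): page for page in pages}
        for future in as_completed(futures):
            page_num = futures[future]
            try:
                results.extend(future.result())
            except Exception as e:
                logger.error(f"Page {page_num} failed: {e}")
    return results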


@@ -1,15 +1,13 @@
+import base64
 import logging
 import os
-import asyncio
-from PIL import Image
-import io
-from typing import Dict, Any, Tuple, Union
-from .base_parser import BaseParser, ParseResult
-import numpy as np
+
+from docreader.models.document import Document
+from docreader.parser.base_parser import BaseParser

 # Set up logger for this module
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)

 class ImageParser(BaseParser):
    """
@@ -23,46 +21,24 @@ class ImageParser(BaseParser):
    4. Returning a combined result with both text and image reference
    """

-    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
+    def parse_into_text(self, content: bytes) -> Document:
        """
-        Parse image content, upload the image and return Markdown reference along with image map.
-
-        Args:
-            content: Raw image data (bytes)
-
-        Returns:
-            Tuple of (markdown_text, image_map) where image_map maps image URLs to PIL Image objects
+        Parse image content into markdown text
+        :param content: bytes content of the image
+        :return: Document object
        """
        logger.info(f"Parsing image content, size: {len(content)} bytes")
-        image_map = {}
-        try:
-            # Upload image to storage service
-            logger.info("Uploading image to storage")
-            _, ext = os.path.splitext(self.file_name)
-            image_url = self.upload_bytes(content, file_ext=ext)
-            if not image_url:
-                logger.error("Failed to upload image to storage")
-                return "", {}
-            logger.info(
-                f"Successfully uploaded image, URL: {image_url[:50]}..."
-                if len(image_url) > 50
-                else f"Successfully uploaded image, URL: {image_url}"
-            )
-
-            # Create image object and add to map
-            try:
-                from PIL import Image
-                import io
-
-                image = Image.open(io.BytesIO(content))
-                image_map[image_url] = image
-                logger.info(f"Added image to image_map for URL: {image_url}")
-            except Exception as img_err:
-                logger.error(f"Error creating image object: {str(img_err)}")
-
-            markdown_text = f"![{self.file_name}]({image_url})"
-            return markdown_text, image_map
-
-        except Exception as e:
-            logger.error(f"Error parsing image: {str(e)}")
-            return "", {}
+
+        # Get file extension
+        ext = os.path.splitext(self.file_name)[1].lower()
+
+        # Upload image to storage
+        image_url = self.storage.upload_bytes(content, file_ext=ext)
+        logger.info(f"Successfully uploaded image, URL: {image_url[:50]}...")
+
+        # Generate markdown text
+        text = f"![{self.file_name}]({image_url})"
+        images = {image_url: base64.b64encode(content).decode()}
+
+        # Create image object and add to map
+        return Document(content=text, images=images)
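
A short usage sketch (the file name is a placeholder, and the storage backend behind self.storage is assumed to be configured):

parser = ImageParser(file_name="diagram.png")
with open("diagram.png", "rb") as f:
    doc = parser.parse_into_text(f.read())
# doc.content is a Markdown reference such as "![diagram.png](<uploaded-url>)"
# doc.images maps that URL to the base64-encoded original bytes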


@@ -1,43 +0,0 @@
import base64
import io
import logging
from typing import Union
from PIL import Image
import numpy as np
logger = logging.getLogger(__name__)
def image_to_base64(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
"""Convert image to base64 encoded string
Args:
image: Image file path, bytes, PIL Image object, or numpy array
Returns:
Base64 encoded image string, or empty string if conversion fails
"""
try:
if isinstance(image, str):
# It's a file path
with open(image, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
elif isinstance(image, bytes):
# It's bytes data
return base64.b64encode(image).decode("utf-8")
elif isinstance(image, Image.Image):
# It's a PIL Image
buffer = io.BytesIO()
image.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("utf-8")
elif isinstance(image, np.ndarray):
# It's a numpy array
pil_image = Image.fromarray(image)
buffer = io.BytesIO()
pil_image.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("utf-8")
else:
logger.error(f"Unsupported image type: {type(image)}")
return ""
except Exception as e:
logger.error(f"Error converting image to base64: {str(e)}")
return ""


@@ -0,0 +1,111 @@
import logging
import re
import uuid
from typing import Dict, List, Match, Optional, Tuple
from docreader.utils import endecode
# Get logger object
logger = logging.getLogger(__name__)
class MarkdownImageUtil:
def __init__(self):
self.b64_pattern = re.compile(
r"!\[([^\]]*)\]\(data:image/(\w+);base64,([^\)]+)\)"
)
self.image_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
self.replace_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
def extract_image(
self,
content: str,
path_prefix: Optional[str] = None,
replace: bool = True,
) -> Tuple[str, List[str]]:
"""Extract base64 encoded images from Markdown content"""
# image_path => base64 bytes
images: List[str] = []
def repl(match: Match[str]) -> str:
title = match.group(1)
image_path = match.group(2)
if path_prefix:
image_path = f"{path_prefix}/{image_path}"
images.append(image_path)
if not replace:
return match.group(0)
# Replace image path with URL
return f"![{title}]({image_path})"
text = self.image_pattern.sub(repl, content)
logger.debug(f"Extracted {len(images)} images from markdown")
return text, images
def extract_base64(
self,
content: str,
path_prefix: Optional[str] = None,
replace: bool = True,
) -> Tuple[str, Dict[str, bytes]]:
"""Extract base64 encoded images from Markdown content"""
# image_path => base64 bytes
images: Dict[str, bytes] = {}
def repl(match: Match[str]) -> str:
title = match.group(1)
img_ext = match.group(2)
img_b64 = match.group(3)
image_byte = endecode.encode_image(img_b64, errors="ignore")
if not image_byte:
logger.error(f"Failed to decode base64 image skip it: {img_b64}")
return title
image_path = f"{uuid.uuid4()}.{img_ext}"
if path_prefix:
image_path = f"{path_prefix}/{image_path}"
images[image_path] = image_byte
if not replace:
return match.group(0)
# Replace image path with URL
return f"![{title}]({image_path})"
text = self.b64_pattern.sub(repl, content)
logger.debug(f"Extracted {len(images)} base64 images from markdown")
return text, images
def replace_path(self, content: str, images: Dict[str, str]) -> str:
content_replace: set = set()
def repl(match: Match[str]) -> str:
title = match.group(1)
image_path = match.group(2)
if image_path not in images:
return match.group(0)
content_replace.add(image_path)
image_path = images[image_path]
return f"![{title}]({image_path})"
text = self.replace_pattern.sub(repl, content)
logger.debug(f"Replaced {len(content_replace)} images in markdown")
return text
if __name__ == "__main__":
your_content = "test![](data:image/png;base64,iVBORw0KGgoAAAA)test"
image_handle = MarkdownImageUtil()
text, images = image_handle.extract_base64(your_content)
print(text)
for image_url, image_byte in images.items():
with open(image_url, "wb") as f:
f.write(image_byte)
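
A short round-trip sketch of how the parsers below use extract_base64 together with replace_path (the remote URL is a placeholder):

util = MarkdownImageUtil()
text, images = util.extract_base64(
    "![fig](data:image/png;base64,iVBORw0KGgo=)", path_prefix="images"
)
# `text` now references a generated local path such as images/<uuid>.png;
# after uploading, point the same reference at the hosted URL
local_path = next(iter(images))
text = util.replace_path(text, {local_path: "https://example.com/fig.png"})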


@@ -1,33 +1,53 @@
-import asyncio
-import re
+import base64
 import logging
-import numpy as np
-import os  # Import os module to get environment variables
-from typing import Dict, List, Optional, Tuple, Union, Any
-
-from .base_parser import BaseParser
+import os
+from typing import Dict
+
+from docreader.models.document import Document
+from docreader.parser.base_parser import BaseParser
+from docreader.parser.chain_parser import PipelineParser
+from docreader.parser.markdown_image_util import MarkdownImageUtil
+from docreader.utils import endecode

 # Get logger object
 logger = logging.getLogger(__name__)

-class MarkdownParser(BaseParser):
-    """Markdown document parser"""
-
-    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
-        """Parse Markdown document, only extract text content, do not process images
-
-        Args:
-            content: Markdown document content
-
-        Returns:
-            Parsed text result
-        """
-        logger.info(f"Parsing Markdown document, content size: {len(content)} bytes")
+class MarkdownImageBase64(BaseParser):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.image_helper = MarkdownImageUtil()
+
+    def parse_into_text(self, content: bytes) -> Document:
        # Convert byte content to string using universal decoding method
-        text = self.decode_bytes(content)
-        logger.info(f"Decoded Markdown content, text length: {len(text)} characters")
-        logger.info(f"Markdown parsing complete, extracted {len(text)} characters of text")
-        return text
+        text = endecode.decode_bytes(content)
+        text, img_b64 = self.image_helper.extract_base64(text, path_prefix="images")
+        images: Dict[str, str] = {}
+        image_replace: Dict[str, str] = {}
+        logger.debug(f"Uploading {len(img_b64)} images from markdown")
+        for ipath, b64_bytes in img_b64.items():
+            ext = os.path.splitext(ipath)[1].lower()
+            image_url = self.storage.upload_bytes(b64_bytes, ext)
+            image_replace[ipath] = image_url
+            images[image_url] = base64.b64encode(b64_bytes).decode()
+        text = self.image_helper.replace_path(text, image_replace)
+        return Document(content=text, images=images)
+
+class MarkdownParser(PipelineParser):
+    _parser_cls = (MarkdownImageBase64,)
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+
+    your_content = "test![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMgA)test"
+    parser = MarkdownParser()
+    document = parser.parse_into_text(your_content.encode())
+    logger.info(document.content)
+    logger.info(f"Images: {len(document.images)}, name: {document.images.keys()}")


@@ -0,0 +1,31 @@
import io
import logging
from markitdown import MarkItDown
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownParser
logger = logging.getLogger(__name__)
class StdMarkitdownParser(BaseParser):
"""
PDF Document Parser
This parser handles PDF documents by extracting text content.
It uses the markitdown library for simple text extraction.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.markitdown = MarkItDown()
def parse_into_text(self, content: bytes) -> Document:
result = self.markitdown.convert(io.BytesIO(content), keep_data_uris=True)
return Document(content=result.text_content)
class MarkitdownParser(PipelineParser):
_parser_cls = (StdMarkitdownParser, MarkdownParser)
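
Note: the pipeline order matters here. StdMarkitdownParser keeps embedded images as data URIs (keep_data_uris=True), and the downstream MarkdownParser stage is what extracts those base64 payloads, uploads them, and rewrites the references, so the final Document carries hosted URLs instead of inline data.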


@@ -0,0 +1,124 @@
import logging
import os
import re
from typing import Dict
import markdownify
import requests
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.markdown_parser import MarkdownImageUtil
from docreader.utils import endecode
logger = logging.getLogger(__name__)
class MinerUParser(BaseParser):
def __init__(
self,
enable_markdownify: bool = True,
mineru_endpoint: str = "",
**kwargs,
):
super().__init__(**kwargs)
self.minerU = os.getenv("MINERU_ENDPOINT", mineru_endpoint)
self.enable_markdownify = enable_markdownify
self.image_helper = MarkdownImageUtil()
self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
self.enable = self.ping()
assert self.enable, "MinerU API is not reachable"
def ping(self, timeout: int = 5) -> bool:
try:
response = requests.get(
self.minerU + "/docs", timeout=timeout, allow_redirects=True
)
response.raise_for_status()
return True
except Exception:
return False
def parse_into_text(self, content: bytes) -> Document:
logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)")
md_content: str = ""
images_b64: Dict[str, str] = {}
try:
response = requests.post(
url=self.minerU + "/file_parse",
data={
"return_md": True,
"return_images": True,
"lang_list": ["ch", "en"],
"table_enable": True,
"formula_enable": True,
"parse_method": "auto",
"start_page_id": 0,
"end_page_id": 99999,
"backend": "pipeline",
"response_format_zip": False,
"return_middle_json": False,
"return_model_output": False,
"return_content_list": False,
},
files={"files": content},
timeout=1000,
)
response.raise_for_status()
result = response.json()["results"]["files"]
md_content = result["md_content"]
images_b64 = result.get("images", {})
except Exception as e:
logger.error(f"MinerU parsing failed: {e}", exc_info=True)
return Document()
# convert table(HTML) in markdown to markdown table
if self.enable_markdownify:
logger.debug("Converting HTML to Markdown")
md_content = markdownify.markdownify(md_content)
images = {}
image_replace = {}
# images in images_b64 may not be used in md_content
# such as: table ...
# so we need to filter them
for ipath, b64_str in images_b64.items():
if f"images/{ipath}" not in md_content:
logger.debug(f"Image {ipath} not used in markdown")
continue
match = self.base64_pattern.match(b64_str)
if match:
file_ext = match.group(1)
b64_str = match.group(2)
image_bytes = endecode.encode_image(b64_str, errors="ignore")
if not image_bytes:
logger.error("Failed to decode base64 image skip it")
continue
image_url = self.storage.upload_bytes(
image_bytes, file_ext=f".{file_ext}"
)
images[image_url] = b64_str
image_replace[f"images/{ipath}"] = image_url
logger.info(f"Replaced {len(image_replace)} images in markdown")
text = self.image_helper.replace_path(md_content, image_replace)
logger.info(
f"Successfully parsed PDF, text: {len(text)}, images: {len(images)}"
)
return Document(content=text, images=images)
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
your_file = "/path/to/your/file.pdf"
your_mineru = "http://host.docker.internal:9987"
parser = MinerUParser(mineru_endpoint=your_mineru)
with open(your_file, "rb") as f:
content = f.read()
document = parser.parse_into_text(content)
logger.error(document.content)
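
Note: the endpoint can also come from the MINERU_ENDPOINT environment variable, which takes precedence over the constructor argument when set (see __init__ above); the URL below is a placeholder:

import os

os.environ["MINERU_ENDPOINT"] = "http://localhost:9987"  # placeholder
parser = MinerUParser()  # the assert in __init__ fails fast if unreachable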


@@ -1,71 +1,96 @@
import os
import logging
import base64
from typing import Optional, Union, Dict, Any
from abc import ABC, abstractmethod
from PIL import Image
import io import io
import logging
import os
import platform
import subprocess
from abc import ABC, abstractmethod
from typing import Dict, Union
import numpy as np import numpy as np
from .image_utils import image_to_base64 from openai import OpenAI
from PIL import Image
from docreader.utils import endecode
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class OCRBackend(ABC): class OCRBackend(ABC):
"""Base class for OCR backends""" """Base class for OCR backends"""
@abstractmethod @abstractmethod
def predict(self, image: Union[str, bytes, Image.Image]) -> str: def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image """Extract text from an image
Args: Args:
image: Image file path, bytes, or PIL Image object image: Image file path, bytes, or PIL Image object
Returns: Returns:
Extracted text Extracted text
""" """
pass pass
class DummyOCRBackend(OCRBackend):
"""Dummy OCR backend implementation"""
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
logger.warning("Dummy OCR backend is used")
return ""
class PaddleOCRBackend(OCRBackend): class PaddleOCRBackend(OCRBackend):
"""PaddleOCR backend implementation""" """PaddleOCR backend implementation"""
def __init__(self, **kwargs): def __init__(self):
"""Initialize PaddleOCR backend""" """Initialize PaddleOCR backend"""
self.ocr = None self.ocr = None
try: try:
import os
import paddle import paddle
# Set PaddlePaddle to use CPU and disable GPU # Set PaddlePaddle to use CPU and disable GPU
os.environ['CUDA_VISIBLE_DEVICES'] = '' os.environ["CUDA_VISIBLE_DEVICES"] = ""
paddle.set_device('cpu') paddle.device.set_device("cpu")
# 尝试检测CPU是否支持AVX指令集 # 尝试检测CPU是否支持AVX指令集
try: try:
import subprocess
import platform
# 检测CPU是否支持AVX # 检测CPU是否支持AVX
if platform.system() == "Linux": if platform.system() == "Linux":
try: try:
result = subprocess.run(['grep', '-o', 'avx', '/proc/cpuinfo'], result = subprocess.run(
capture_output=True, text=True, timeout=5) ["grep", "-o", "avx", "/proc/cpuinfo"],
has_avx = 'avx' in result.stdout.lower() capture_output=True,
text=True,
timeout=5,
)
has_avx = "avx" in result.stdout.lower()
if not has_avx: if not has_avx:
logger.warning("CPU does not support AVX instructions, using compatibility mode") logger.warning(
"CPU does not support AVX instructions, "
"using compatibility mode"
)
# 进一步限制指令集使用 # 进一步限制指令集使用
os.environ['FLAGS_use_avx2'] = '0' os.environ["FLAGS_use_avx2"] = "0"
os.environ['FLAGS_use_avx'] = '1' os.environ["FLAGS_use_avx"] = "1"
except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError): except (
logger.warning("Could not detect AVX support, using compatibility mode") subprocess.TimeoutExpired,
os.environ['FLAGS_use_avx2'] = '0' FileNotFoundError,
os.environ['FLAGS_use_avx'] = '1' subprocess.SubprocessError,
):
logger.warning(
"Could not detect AVX support, using compatibility mode"
)
os.environ["FLAGS_use_avx2"] = "0"
os.environ["FLAGS_use_avx"] = "1"
except Exception as e: except Exception as e:
logger.warning(f"Error detecting CPU capabilities: {e}, using compatibility mode") logger.warning(
os.environ['FLAGS_use_avx2'] = '0' f"Error detecting CPU capabilities: {e}, using compatibility mode"
os.environ['FLAGS_use_avx'] = '1' )
os.environ["FLAGS_use_avx2"] = "0"
os.environ["FLAGS_use_avx"] = "1"
from paddleocr import PaddleOCR from paddleocr import PaddleOCR
# OCR configuration with text orientation classification enabled # OCR configuration with text orientation classification enabled
ocr_config = { ocr_config = {
"use_gpu": False, "use_gpu": False,
@@ -86,23 +111,53 @@ class PaddleOCRBackend(OCRBackend):
"use_dilation": True, # improves accuracy "use_dilation": True, # improves accuracy
"det_db_score_mode": "slow", # improves accuracy "det_db_score_mode": "slow", # improves accuracy
} }
self.ocr = PaddleOCR(**ocr_config) self.ocr = PaddleOCR(**ocr_config)
logger.info("PaddleOCR engine initialized successfully") logger.info("PaddleOCR engine initialized successfully")
except ImportError as e: except ImportError as e:
logger.error(f"Failed to import paddleocr: {str(e)}. Please install it with 'pip install paddleocr'") logger.error(
f"Failed to import paddleocr: {str(e)}. "
"Please install it with 'pip install paddleocr'"
)
except OSError as e: except OSError as e:
if "Illegal instruction" in str(e) or "core dumped" in str(e): if "Illegal instruction" in str(e) or "core dumped" in str(e):
logger.error(f"PaddlePaddle crashed due to CPU instruction set incompatibility: {str(e)}") logger.error(
logger.error("This usually happens when the CPU doesn't support AVX instructions.") f"PaddlePaddle crashed due to CPU instruction set incompatibility: "
logger.error("Please try installing a CPU-only version of PaddlePaddle or use a different OCR backend.") f"{e}"
)
logger.error(
"This happens when the CPU doesn't support AVX instructions. "
"Try install CPU-only version of PaddlePaddle, "
"or use a different OCR backend."
)
else: else:
logger.error(f"Failed to initialize PaddleOCR due to OS error: {str(e)}") logger.error(
f"Failed to initialize PaddleOCR due to OS error: {str(e)}"
)
except Exception as e: except Exception as e:
logger.error(f"Failed to initialize PaddleOCR: {str(e)}") logger.error(f"Failed to initialize PaddleOCR: {str(e)}")
def predict(self, image): def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image
Args:
image: Image file path, bytes, or PIL Image object
Returns:
Extracted text
"""
if isinstance(image, str):
image = Image.open(image)
elif isinstance(image, bytes):
image = Image.open(io.BytesIO(image))
if not isinstance(image, Image.Image):
raise TypeError("image must be a string, bytes, or PIL Image object")
return self._predict(image)
def _predict(self, image: Image.Image) -> str:
"""Perform OCR recognition on the image """Perform OCR recognition on the image
Args: Args:
@@ -111,63 +166,59 @@ class PaddleOCRBackend(OCRBackend):
Returns: Returns:
Extracted text string Extracted text string
""" """
if self.ocr is None:
logger.error("PaddleOCR engine not initialized")
return ""
try: try:
# Ensure image is in RGB format # Ensure image is in RGB format
if hasattr(image, "convert") and image.mode != "RGB": if image.mode != "RGB":
image = image.convert("RGB") image = image.convert("RGB")
# Convert to numpy array if needed # Convert to numpy array if needed
if hasattr(image, "convert"): image_array = np.array(image)
image_array = np.array(image)
else:
image_array = image
# Perform OCR # Perform OCR
ocr_result = self.ocr.ocr(image_array, cls=False) ocr_result = self.ocr.ocr(image_array, cls=False)
# Extract text # Extract text
ocr_text = "" ocr_text = ""
if ocr_result and ocr_result[0]: if ocr_result and ocr_result[0]:
for line in ocr_result[0]: text = [
if line and len(line) >= 2: line[1][0] if line and len(line) >= 2 and line[1] else ""
text = line[1][0] if line[1] else "" for line in ocr_result[0]
if text: ]
ocr_text += text + " " text = [t.strip() for t in text if t]
ocr_text = " ".join(text)
text_length = len(ocr_text.strip())
if text_length > 0: logger.info(f"OCR extracted {len(ocr_text)} characters")
logger.info(f"OCR extracted {text_length} characters") return ocr_text
return ocr_text.strip()
else:
logger.warning("OCR returned empty result")
return ""
except Exception as e: except Exception as e:
logger.error(f"OCR recognition error: {str(e)}") logger.error(f"OCR recognition error: {str(e)}")
return "" return ""
class NanonetsOCRBackend(OCRBackend): class NanonetsOCRBackend(OCRBackend):
"""Nanonets OCR backend implementation using OpenAI API format""" """Nanonets OCR backend implementation using OpenAI API format"""
def __init__(self, **kwargs): def __init__(self):
"""Initialize Nanonets OCR backend """Initialize Nanonets OCR backend
Args: Args:
api_key: API key for OpenAI API api_key: API key for OpenAI API
base_url: Base URL for OpenAI API base_url: Base URL for OpenAI API
model: Model name model: Model name
""" """
try: base_url = os.getenv("OCR_API_BASE_URL", "http://localhost:8000/v1")
from openai import OpenAI api_key = os.getenv("OCR_API_KEY", "123")
self.api_key = kwargs.get("api_key", "123") timeout = 30
self.base_url = kwargs.get("base_url", "http://localhost:8000/v1") self.client = OpenAI(api_key=api_key, base_url=base_url, timeout=timeout)
self.model = kwargs.get("model", "nanonets/Nanonets-OCR-s")
self.temperature = kwargs.get("temperature", 0.0) self.model = os.getenv("OCR_MODEL", "nanonets/Nanonets-OCR-s")
self.max_tokens = kwargs.get("max_tokens", 15000) logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
self.temperature = 0.0
self.client = OpenAI(api_key=self.api_key, base_url=self.base_url) self.max_tokens = 15000
self.prompt = """ self.prompt = """## 任务说明
## 任务说明
请从上传的文档中提取文字内容,严格按自然阅读顺序(从上到下,从左到右)输出,并遵循以下格式规范。 请从上传的文档中提取文字内容,严格按自然阅读顺序(从上到下,从左到右)输出,并遵循以下格式规范。
@@ -192,33 +243,26 @@ class NanonetsOCRBackend(OCRBackend):
* 不要猜测或补全不确定的链接地址。 * 不要猜测或补全不确定的链接地址。
""" """
logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
except ImportError:
logger.error("Failed to import openai. Please install it with 'pip install openai'")
self.client = None
except Exception as e:
logger.error(f"Failed to initialize Nanonets OCR: {str(e)}")
self.client = None
def predict(self, image: Union[str, bytes, Image.Image]) -> str: def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image using Nanonets OCR """Extract text from an image using Nanonets OCR
Args: Args:
image: Image file path, bytes, or PIL Image object image: Image file path, bytes, or PIL Image object
Returns: Returns:
Extracted text Extracted text
""" """
if self.client is None: if self.client is None:
logger.error("Nanonets OCR client not initialized") logger.error("Nanonets OCR client not initialized")
return "" return ""
try: try:
# Encode image to base64 # Encode image to base64
img_base64 = image_to_base64(image) img_base64 = endecode.decode_image(image)
if not img_base64: if not img_base64:
return "" return ""
# Call Nanonets OCR API # Call Nanonets OCR API
logger.info(f"Calling Nanonets OCR API with model: {self.model}") logger.info(f"Calling Nanonets OCR API with model: {self.model}")
response = self.client.chat.completions.create( response = self.client.chat.completions.create(
@@ -229,7 +273,9 @@ class NanonetsOCRBackend(OCRBackend):
"content": [ "content": [
{ {
"type": "image_url", "type": "image_url",
"image_url": {"url": f"data:image/png;base64,{img_base64}"}, "image_url": {
"url": f"data:image/png;base64,{img_base64}"
},
}, },
{ {
"type": "text", "type": "text",
@@ -239,40 +285,43 @@ class NanonetsOCRBackend(OCRBackend):
} }
], ],
temperature=self.temperature, temperature=self.temperature,
max_tokens=self.max_tokens max_tokens=self.max_tokens,
) )
return response.choices[0].message.content or ""
return response.choices[0].message.content
except Exception as e: except Exception as e:
logger.error(f"Nanonets OCR prediction error: {str(e)}") logger.error(f"Nanonets OCR prediction error: {str(e)}")
return "" return ""
class OCREngine: class OCREngine:
"""OCR Engine factory class""" """OCR Engine factory class"""
_instance = None _instance: Dict[str, OCRBackend] = {}
@classmethod @classmethod
def get_instance(cls, backend_type="paddle", **kwargs) -> Optional[OCRBackend]: def get_instance(cls, backend_type: str) -> OCRBackend:
"""Get OCR engine instance """Get OCR engine instance
Args: Args:
backend_type: OCR backend type, one of: "paddle", "nanonets" backend_type: OCR backend type, one of: "paddle", "nanonets"
**kwargs: Additional arguments for the backend **kwargs: Additional arguments for the backend
Returns: Returns:
OCR engine instance or None if initialization fails OCR engine instance or None if initialization fails
""" """
if cls._instance is None: backend_type = backend_type.lower()
logger.info(f"Initializing OCR engine with backend: {backend_type}") if cls._instance.get(backend_type):
return cls._instance[backend_type]
if backend_type.lower() == "paddle":
cls._instance = PaddleOCRBackend(**kwargs) logger.info(f"Initializing OCR engine with backend: {backend_type}")
elif backend_type.lower() == "nanonets":
cls._instance = NanonetsOCRBackend(**kwargs) if backend_type == "paddle":
else: cls._instance[backend_type] = PaddleOCRBackend()
logger.error(f"Unknown OCR backend type: {backend_type}")
return None elif backend_type == "nanonets":
cls._instance[backend_type] = NanonetsOCRBackend()
return cls._instance
else:
cls._instance[backend_type] = DummyOCRBackend()
return cls._instance[backend_type]
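With the factory refactor, callers fetch a backend by name and the instance is cached per type. A minimal usage sketch, assuming an image on disk (the path is a placeholder):

from docreader.parser.ocr_engine import OCREngine

ocr = OCREngine.get_instance("paddle")  # cached: repeated calls return the same backend
text = ocr.predict("/tmp/sample.png")   # accepts a file path, raw bytes, or a PIL Image
print(text)

Unknown backend names no longer yield None; they fall through to DummyOCRBackend, so callers always receive an object with a working predict().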


@@ -1,30 +1,19 @@
import logging import logging
from dataclasses import dataclass, field from typing import Dict, Type
from typing import Dict, Any, Optional, Type
from .base_parser import BaseParser, ParseResult from docreader.models.document import Document
from .docx_parser import DocxParser from docreader.models.read_config import ChunkingConfig
from .doc_parser import DocParser from docreader.parser.base_parser import BaseParser
from .pdf_parser import PDFParser from docreader.parser.doc_parser import DocParser
from .markdown_parser import MarkdownParser from docreader.parser.docx2_parser import Docx2Parser
from .text_parser import TextParser from docreader.parser.image_parser import ImageParser
from .image_parser import ImageParser from docreader.parser.markdown_parser import MarkdownParser
from .web_parser import WebParser from docreader.parser.pdf_parser import PDFParser
from .config import ChunkingConfig from docreader.parser.text_parser import TextParser
import traceback from docreader.parser.web_parser import WebParser
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@dataclass
class Chunk:
"""
Represents a single text chunk with associated metadata.
Basic unit for document processing and embedding.
"""
content: str # Text content of the chunk
metadata: Dict[str, Any] = None # Associated metadata (source, page number, etc.)
class Parser: class Parser:
""" """
@@ -33,10 +22,9 @@ class Parser:
""" """
def __init__(self): def __init__(self):
logger.info("Initializing document parser")
# Initialize all parser types # Initialize all parser types
self.parsers: Dict[str, Type[BaseParser]] = { self.parsers: Dict[str, Type[BaseParser]] = {
"docx": DocxParser, "docx": Docx2Parser,
"doc": DocParser, "doc": DocParser,
"pdf": PDFParser, "pdf": PDFParser,
"md": MarkdownParser, "md": MarkdownParser,
@@ -56,8 +44,7 @@ class Parser:
", ".join(self.parsers.keys()), ", ".join(self.parsers.keys()),
) )
def get_parser(self, file_type: str) -> Type[BaseParser]:
def get_parser(self, file_type: str) -> Optional[Type[BaseParser]]:
""" """
Get parser class for the specified file type. Get parser class for the specified file type.
@@ -67,12 +54,9 @@ class Parser:
Returns: Returns:
Parser class for the file type, or None if unsupported Parser class for the file type, or None if unsupported
""" """
file_type = file_type.lower() parser = self.parsers.get(file_type.lower())
parser = self.parsers.get(file_type) if not parser:
if parser: raise ValueError(f"Unsupported file type: {file_type}")
logger.info(f"Found parser for file type: {file_type}")
else:
logger.warning(f"No parser found for file type: {file_type}")
return parser return parser
def parse_file( def parse_file(
@@ -81,7 +65,7 @@ class Parser:
file_type: str, file_type: str,
content: bytes, content: bytes,
config: ChunkingConfig, config: ChunkingConfig,
) -> Optional[ParseResult]: ) -> Document:
""" """
Parse file content using appropriate parser based on file type. Parse file content using appropriate parser based on file type.
@@ -96,60 +80,41 @@ class Parser:
""" """
logger.info(f"Parsing file: {file_name} with type: {file_type}") logger.info(f"Parsing file: {file_name} with type: {file_type}")
logger.info( logger.info(
f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, " f"Chunking config: size={config.chunk_size}, "
f"overlap={config.chunk_overlap}, "
f"multimodal={config.enable_multimodal}" f"multimodal={config.enable_multimodal}"
) )
parser_instance = None
try:
# Get appropriate parser for file type
cls = self.get_parser(file_type)
if cls is None:
logger.error(f"Unsupported file type: {file_type}")
return None
# Parse file content # Get appropriate parser for file type
logger.info(f"Creating parser instance for {file_type} file") cls = self.get_parser(file_type)
parser_instance = cls(
file_name=file_name,
file_type=file_type,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
separators=config.separators,
enable_multimodal=config.enable_multimodal,
max_image_size=1920, # Limit image size to 1920px
max_concurrent_tasks=5, # Limit concurrent tasks to 5
chunking_config=config, # Pass the entire chunking config
)
logger.info(f"Starting to parse file content, size: {len(content)} bytes") # Parse file content
result = parser_instance.parse(content) logger.info(f"Creating parser instance for {file_type} file")
parser = cls(
file_name=file_name,
file_type=file_type,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
separators=config.separators,
enable_multimodal=config.enable_multimodal,
max_image_size=1920, # Limit image size to 1920px
max_concurrent_tasks=5, # Limit concurrent tasks to 5
chunking_config=config, # Pass the entire chunking config
)
if result: logger.info(f"Starting to parse file content, size: {len(content)} bytes")
logger.info( result = parser.parse(content)
f"Successfully parsed file {file_name}, generated {len(result.chunks)} chunks"
)
if result.chunks and len(result.chunks) > 0:
logger.info(
f"First chunk content length: {len(result.chunks[0].content)}"
)
else:
logger.warning(f"Parser returned empty chunks for file: {file_name}")
else:
logger.warning(f"Parser returned None result for file: {file_name}")
# Return parse results if not result.content:
return result logger.warning(f"Parser returned empty content for file: {file_name}")
elif not result.chunks:
logger.warning(f"Parser returned empty chunks for file: {file_name}")
elif result.chunks[0]:
logger.info(f"First chunk content length: {len(result.chunks[0].content)}")
logger.info(f"Parsed file {file_name}, with {len(result.chunks)} chunks")
return result
except Exception as e: def parse_url(self, url: str, title: str, config: ChunkingConfig) -> Document:
logger.error(f"Error parsing file {file_name}: {str(e)}")
logger.info(f"Detailed traceback: {traceback.format_exc()}")
return None
def parse_url(
self, url: str, title: str, config: ChunkingConfig
) -> Optional[ParseResult]:
""" """
Parse content from a URL using the WebParser. Parse content from a URL using the WebParser.
@@ -163,44 +128,31 @@ class Parser:
""" """
logger.info(f"Parsing URL: {url}, title: {title}") logger.info(f"Parsing URL: {url}, title: {title}")
logger.info( logger.info(
f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, " f"Chunking config: size={config.chunk_size}, "
f"multimodal={config.enable_multimodal}" f"overlap={config.chunk_overlap}, multimodal={config.enable_multimodal}"
) )
parser_instance = None
try: # Create web parser instance
# Create web parser instance logger.info("Creating WebParser instance")
logger.info("Creating WebParser instance") parser = WebParser(
parser_instance = WebParser( title=title,
title=title, chunk_size=config.chunk_size,
chunk_size=config.chunk_size, chunk_overlap=config.chunk_overlap,
chunk_overlap=config.chunk_overlap, separators=config.separators,
separators=config.separators, enable_multimodal=config.enable_multimodal,
enable_multimodal=config.enable_multimodal, max_image_size=1920, # Limit image size
max_image_size=1920, # Limit image size max_concurrent_tasks=5, # Limit concurrent tasks
max_concurrent_tasks=5, # Limit concurrent tasks chunking_config=config,
chunking_config=config, )
)
logger.info(f"Starting to parse URL content") logger.info("Starting to parse URL content")
result = parser_instance.parse(url) result = parser.parse(url.encode())
if result:
logger.info(
f"Successfully parsed URL, generated {len(result.chunks)} chunks"
)
logger.info(
f"First chunk content length: {len(result.chunks[0].content) if result.chunks else 0}"
)
else:
logger.warning(f"Parser returned empty result for URL: {url}")
# Return parse results
return result
except Exception as e:
logger.error(f"Error parsing URL {url}: {str(e)}")
logger.info(f"Detailed traceback: {traceback.format_exc()}")
return None
if not result.content:
logger.warning(f"Parser returned empty content for url: {url}")
elif not result.chunks:
logger.warning(f"Parser returned empty chunks for url: {url}")
elif result.chunks[0]:
logger.info(f"First chunk content length: {len(result.chunks[0].content)}")
logger.info(f"Parsed url {url}, with {len(result.chunks)} chunks")
return result
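Note the changed error contract: get_parser now raises ValueError for unsupported types instead of returning None, so callers should catch the exception rather than test for None. A minimal sketch of the new call path, assuming ChunkingConfig accepts these keyword fields (file name and sizes are illustrative):

from docreader.models.read_config import ChunkingConfig
from docreader.parser import Parser

config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=100,
    separators=["\n\n", "\n"],
    enable_multimodal=False,
)
with open("report.pdf", "rb") as f:
    doc = Parser().parse_file("report.pdf", "pdf", f.read(), config)
for chunk in doc.chunks:
    print(chunk.content[:80])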


@@ -1,113 +1,7 @@
import logging from docreader.parser.chain_parser import FirstParser
import os from docreader.parser.markitdown_parser import MarkitdownParser
import io from docreader.parser.mineru_parser import MinerUParser
from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union
import pdfplumber
import tempfile
from .base_parser import BaseParser
logger = logging.getLogger(__name__) class PDFParser(FirstParser):
_parser_cls = (MinerUParser, MarkitdownParser)
class PDFParser(BaseParser):
"""
PDF Document Parser
This parser handles PDF documents by extracting text content.
It uses the pypdf library for simple text extraction.
"""
def _convert_table_to_markdown(self, table_data: list) -> str:
if not table_data or not table_data[0]: return ""
def clean_cell(cell):
if cell is None: return ""
return str(cell).replace("\n", " <br> ")
try:
markdown = ""
header = [clean_cell(cell) for cell in table_data[0]]
markdown += "| " + " | ".join(header) + " |\n"
markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
for row in table_data[1:]:
if not row: continue
body_row = [clean_cell(cell) for cell in row]
if len(body_row) != len(header):
logger.warning(f"Skipping malformed table row: {body_row}")
continue
markdown += "| " + " | ".join(body_row) + " |\n"
return markdown
except Exception as e:
logger.error(f"Error converting table to markdown: {e}")
return ""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
logger.info(f"Parsing PDF with pdfplumber, content size: {len(content)} bytes")
all_page_content = []
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
temp_pdf_path = temp_pdf.name
try:
temp_pdf.write(content)
temp_pdf.close()
logger.info(f"PDF content written to temporary file: {temp_pdf_path}")
with pdfplumber.open(temp_pdf_path) as pdf:
logger.info(f"PDF has {len(pdf.pages)} pages")
for page_num, page in enumerate(pdf.pages):
page_content_parts = []
# Try-fallback strategy for table detection
default_settings = { "vertical_strategy": "lines", "horizontal_strategy": "lines" }
found_tables = page.find_tables(default_settings)
if not found_tables:
logger.info(f"Page {page_num+1}: Default strategy found no tables. Trying fallback strategy.")
fallback_settings = { "vertical_strategy": "text", "horizontal_strategy": "lines" }
found_tables = page.find_tables(fallback_settings)
table_bboxes = [table.bbox for table in found_tables]
# Define a filter function that keeps objects NOT inside any table bbox.
def not_within_bboxes(obj):
"""Check if an object is outside all table bounding boxes."""
for bbox in table_bboxes:
# Check if the object's vertical center is within a bbox
if bbox[1] <= (obj["top"] + obj["bottom"]) / 2 <= bbox[3]:
return False # It's inside a table, so we DON'T keep it
return True # It's outside all tables, so we DO keep it
                    # Create a filtered version of the page that contains only the non-table text.
non_table_page = page.filter(not_within_bboxes)
# Now, extract text from this filtered page view.
text = non_table_page.extract_text(x_tolerance=2)
if text:
page_content_parts.append(text)
# Process and append the structured Markdown tables
if found_tables:
logger.info(f"Found {len(found_tables)} tables on page {page_num + 1}")
for table in found_tables:
markdown_table = self._convert_table_to_markdown(table.extract())
page_content_parts.append(f"\n\n{markdown_table}\n\n")
all_page_content.append("".join(page_content_parts))
final_text = "\n\n--- Page Break ---\n\n".join(all_page_content)
logger.info(f"PDF parsing complete. Extracted {len(final_text)} text chars.")
return final_text
except Exception as e:
logger.error(f"Failed to parse PDF document: {str(e)}")
return ""
finally:
# This block is GUARANTEED to execute, preventing resource leaks.
if os.path.exists(temp_pdf_path):
try:
os.remove(temp_pdf_path)
logger.info(f"Temporary file cleaned up: {temp_pdf_path}")
except OSError as e:
logger.error(f"Error removing temporary file {temp_pdf_path}: {e}")


@@ -1,64 +1,68 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os
import uuid
import logging
import io import io
import logging
import os
import traceback import traceback
import uuid
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Tuple, Optional from typing import Dict
from qcloud_cos import CosConfig, CosS3Client
from minio import Minio from minio import Minio
from qcloud_cos import CosConfig, CosS3Client
from docreader.utils import endecode
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class Storage(ABC): class Storage(ABC):
"""Abstract base class for object storage operations""" """Abstract base class for object storage operations"""
@abstractmethod @abstractmethod
def upload_file(self, file_path: str) -> str: def upload_file(self, file_path: str) -> str:
"""Upload file to object storage """Upload file to object storage
Args: Args:
file_path: File path file_path: File path
Returns:
File URL
"""
pass
@abstractmethod
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to object storage
Args:
content: Byte content to upload
file_ext: File extension
Returns: Returns:
File URL File URL
""" """
pass pass
@abstractmethod
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to object storage
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
pass
class CosStorage(Storage): class CosStorage(Storage):
"""Tencent Cloud COS storage implementation""" """Tencent Cloud COS storage implementation"""
def __init__(self, storage_config=None): def __init__(self, storage_config=None):
"""Initialize COS storage """Initialize COS storage
Args: Args:
storage_config: Storage configuration storage_config: Storage configuration
""" """
self.storage_config = storage_config self.storage_config = storage_config
self.client, self.bucket_name, self.region, self.prefix = self._init_cos_client() self.client, self.bucket_name, self.region, self.prefix = (
self._init_cos_client()
)
def _init_cos_client(self): def _init_cos_client(self):
"""Initialize Tencent Cloud COS client""" """Initialize Tencent Cloud COS client"""
try: try:
# Use provided COS config if available, otherwise fall back to environment variables # Use provided COS config if available,
# otherwise fall back to environment variables
if self.storage_config and self.storage_config.get("access_key_id") != "": if self.storage_config and self.storage_config.get("access_key_id") != "":
cos_config = self.storage_config cos_config = self.storage_config
secret_id = cos_config.get("access_key_id") secret_id = cos_config.get("access_key_id")
@@ -75,15 +79,16 @@ class CosStorage(Storage):
bucket_name = os.getenv("COS_BUCKET_NAME") bucket_name = os.getenv("COS_BUCKET_NAME")
appid = os.getenv("COS_APP_ID") appid = os.getenv("COS_APP_ID")
prefix = os.getenv("COS_PATH_PREFIX") prefix = os.getenv("COS_PATH_PREFIX")
enable_old_domain = ( enable_old_domain = (
os.getenv("COS_ENABLE_OLD_DOMAIN", "true").lower() == "true" os.getenv("COS_ENABLE_OLD_DOMAIN", "true").lower() == "true"
) )
if not all([secret_id, secret_key, region, bucket_name, appid]): if not all([secret_id, secret_key, region, bucket_name, appid]):
logger.error( logger.error(
"Incomplete COS configuration, missing required environment variables" "Incomplete COS configuration, missing environment variables"
f"secret_id: {secret_id}, secret_key: {secret_key}, region: {region}, bucket_name: {bucket_name}, appid: {appid}" f"secret_id: {secret_id}, secret_key: {secret_key}, "
f"region: {region}, bucket_name: {bucket_name}, appid: {appid}"
) )
return None, None, None, None return None, None, None, None
@@ -105,27 +110,26 @@ class CosStorage(Storage):
except Exception as e: except Exception as e:
logger.error(f"Failed to initialize COS client: {str(e)}") logger.error(f"Failed to initialize COS client: {str(e)}")
return None, None, None, None return None, None, None, None
def _get_download_url(self, bucket_name, region, object_key): def _get_download_url(self, bucket_name, region, object_key):
"""Generate COS object URL """Generate COS object URL
Args: Args:
bucket_name: Bucket name bucket_name: Bucket name
region: Region region: Region
object_key: Object key object_key: Object key
Returns: Returns:
File URL File URL
""" """
return f"https://{bucket_name}.cos.{region}.myqcloud.com/{object_key}" return f"https://{bucket_name}.cos.{region}.myqcloud.com/{object_key}"
def upload_file(self, file_path: str) -> str: def upload_file(self, file_path: str) -> str:
"""Upload file to Tencent Cloud COS """Upload file to Tencent Cloud COS
Args: Args:
file_path: File path file_path: File path
Returns: Returns:
File URL File URL
""" """
@@ -135,16 +139,16 @@ class CosStorage(Storage):
return "" return ""
# Generate object key, use UUID to avoid conflicts # Generate object key, use UUID to avoid conflicts
file_name = os.path.basename(file_path) file_ext = os.path.splitext(file_path)[1]
object_key = ( object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
f"{self.prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
)
logger.info(f"Generated object key: {object_key}") logger.info(f"Generated object key: {object_key}")
# Upload file # Upload file
logger.info("Attempting to upload file to COS") logger.info("Attempting to upload file to COS")
response = self.client.upload_file( self.client.upload_file(
Bucket=self.bucket_name, LocalFilePath=file_path, Key=object_key Bucket=self.bucket_name,
LocalFilePath=file_path,
Key=object_key,
) )
# Get file URL # Get file URL
@@ -156,14 +160,14 @@ class CosStorage(Storage):
except Exception as e: except Exception as e:
logger.error(f"Failed to upload file to COS: {str(e)}") logger.error(f"Failed to upload file to COS: {str(e)}")
return "" return ""
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str: def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to Tencent Cloud COS """Upload bytes to Tencent Cloud COS
Args: Args:
content: Byte content to upload content: Byte content to upload
file_ext: File extension file_ext: File extension
Returns: Returns:
File URL File URL
""" """
@@ -171,10 +175,16 @@ class CosStorage(Storage):
logger.info(f"Uploading bytes content to COS, size: {len(content)} bytes") logger.info(f"Uploading bytes content to COS, size: {len(content)} bytes")
if not self.client: if not self.client:
return "" return ""
object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.prefix else f"images/{uuid.uuid4().hex}{file_ext}" object_key = (
f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
if self.prefix
else f"images/{uuid.uuid4().hex}{file_ext}"
)
logger.info(f"Generated object key: {object_key}") logger.info(f"Generated object key: {object_key}")
self.client.put_object(Bucket=self.bucket_name, Body=content, Key=object_key) self.client.put_object(
Bucket=self.bucket_name, Body=content, Key=object_key
)
file_url = self._get_download_url(self.bucket_name, self.region, object_key) file_url = self._get_download_url(self.bucket_name, self.region, object_key)
logger.info(f"Successfully uploaded bytes to COS: {file_url}") logger.info(f"Successfully uploaded bytes to COS: {file_url}")
return file_url return file_url
@@ -186,16 +196,18 @@ class CosStorage(Storage):
class MinioStorage(Storage): class MinioStorage(Storage):
"""MinIO storage implementation""" """MinIO storage implementation"""
def __init__(self, storage_config=None): def __init__(self, storage_config=None):
"""Initialize MinIO storage """Initialize MinIO storage
Args: Args:
storage_config: Storage configuration storage_config: Storage configuration
""" """
self.storage_config = storage_config self.storage_config = storage_config
self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = self._init_minio_client() self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = (
self._init_minio_client()
)
def _init_minio_client(self): def _init_minio_client(self):
"""Initialize MinIO client from environment variables or injected config. """Initialize MinIO client from environment variables or injected config.
@@ -203,58 +215,69 @@ class MinioStorage(Storage):
prefer those values to override envs. prefer those values to override envs.
""" """
try: try:
endpoint = os.getenv("MINIO_ENDPOINT") endpoint = os.getenv("MINIO_ENDPOINT", "")
use_ssl = os.getenv("MINIO_USE_SSL", "false").lower() == "true" use_ssl = os.getenv("MINIO_USE_SSL", "false").lower() == "true"
if self.storage_config and self.storage_config.get("bucket_name"): if self.storage_config and self.storage_config.get("bucket_name"):
storage_config = self.storage_config storage_config = self.storage_config
bucket_name = storage_config.get("bucket_name") bucket_name = storage_config.get("bucket_name", "")
path_prefix = storage_config.get("path_prefix").strip().strip("/") path_prefix = storage_config.get("path_prefix").strip().strip("/")
access_key = storage_config.get("access_key_id") access_key = storage_config.get("access_key_id")
secret_key = storage_config.get("secret_access_key") secret_key = storage_config.get("secret_access_key")
else: else:
access_key = os.getenv("MINIO_ACCESS_KEY_ID") access_key = os.getenv("MINIO_ACCESS_KEY_ID")
secret_key = os.getenv("MINIO_SECRET_ACCESS_KEY") secret_key = os.getenv("MINIO_SECRET_ACCESS_KEY")
bucket_name = os.getenv("MINIO_BUCKET_NAME") bucket_name = os.getenv("MINIO_BUCKET_NAME", "")
path_prefix = os.getenv("MINIO_PATH_PREFIX", "").strip().strip("/") path_prefix = os.getenv("MINIO_PATH_PREFIX", "").strip().strip("/")
if not all([endpoint, access_key, secret_key, bucket_name]): if not all([endpoint, access_key, secret_key, bucket_name]):
logger.error("Incomplete MinIO configuration, missing required environment variables") logger.error(
"Incomplete MinIO configuration, missing environment variables"
)
return None, None, None, None, None return None, None, None, None, None
# Initialize client # Initialize client
client = Minio(endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl) client = Minio(
endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl
)
# Ensure bucket exists # Ensure bucket exists
found = client.bucket_exists(bucket_name) found = client.bucket_exists(bucket_name)
if not found: if not found:
client.make_bucket(bucket_name) client.make_bucket(bucket_name)
policy = '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}' % (bucket_name, bucket_name) policy = (
'{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}'
% (bucket_name, bucket_name)
)
client.set_bucket_policy(bucket_name, policy) client.set_bucket_policy(bucket_name, policy)
return client, bucket_name, use_ssl, endpoint, path_prefix return client, bucket_name, use_ssl, endpoint, path_prefix
except Exception as e: except Exception as e:
logger.error(f"Failed to initialize MinIO client: {str(e)}") logger.error(f"Failed to initialize MinIO client: {str(e)}")
return None, None, None, None, None return None, None, None, None, None
def _get_download_url(self, bucket_name: str, object_key: str, use_ssl: bool, endpoint: str, public_endpoint: str = None): def _get_download_url(self, object_key: str):
"""Construct a public URL for MinIO object. """Construct a public URL for MinIO object.
If MINIO_PUBLIC_ENDPOINT is provided, use it; otherwise fallback to endpoint. If MINIO_PUBLIC_ENDPOINT is provided, use it; otherwise fallback to endpoint.
""" """
if public_endpoint: # 1. Use public endpoint if provided
base = public_endpoint endpoint = os.getenv("MINIO_PUBLIC_ENDPOINT")
else: if endpoint:
scheme = "https" if use_ssl else "http" return f"{endpoint}/{self.bucket_name}/{object_key}"
base = f"{scheme}://{endpoint}"
# Path-style URL for MinIO # 2. Use SSL if enabled
return f"{base}/{bucket_name}/{object_key}" if self.use_ssl:
return f"https://{self.endpoint}/{self.bucket_name}/{object_key}"
# 3. Use HTTP default
return f"http://{self.endpoint}/{self.bucket_name}/{object_key}"
def upload_file(self, file_path: str) -> str: def upload_file(self, file_path: str) -> str:
"""Upload file to MinIO """Upload file to MinIO
Args: Args:
file_path: File path file_path: File path
Returns: Returns:
File URL File URL
""" """
@@ -265,29 +288,27 @@ class MinioStorage(Storage):
# Generate object key, use UUID to avoid conflicts # Generate object key, use UUID to avoid conflicts
file_name = os.path.basename(file_path) file_name = os.path.basename(file_path)
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" if self.path_prefix else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" object_key = (
f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
if self.path_prefix
else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
)
logger.info(f"Generated MinIO object key: {object_key}") logger.info(f"Generated MinIO object key: {object_key}")
# Upload file # Upload file
logger.info("Attempting to upload file to MinIO") logger.info("Attempting to upload file to MinIO")
with open(file_path, 'rb') as file_data: with open(file_path, "rb") as file_data:
file_size = os.path.getsize(file_path) file_size = os.path.getsize(file_path)
self.client.put_object( self.client.put_object(
bucket_name=self.bucket_name, bucket_name=self.bucket_name or "",
object_name=object_key, object_name=object_key,
data=file_data, data=file_data,
length=file_size, length=file_size,
content_type='application/octet-stream' content_type="application/octet-stream",
) )
# Get file URL # Get file URL
file_url = self._get_download_url( file_url = self._get_download_url(object_key)
self.bucket_name,
object_key,
self.use_ssl,
self.endpoint,
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
)
logger.info(f"Successfully uploaded file to MinIO: {file_url}") logger.info(f"Successfully uploaded file to MinIO: {file_url}")
return file_url return file_url
@@ -295,14 +316,14 @@ class MinioStorage(Storage):
except Exception as e: except Exception as e:
logger.error(f"Failed to upload file to MinIO: {str(e)}") logger.error(f"Failed to upload file to MinIO: {str(e)}")
return "" return ""
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str: def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to MinIO """Upload bytes to MinIO
Args: Args:
content: Byte content to upload content: Byte content to upload
file_ext: File extension file_ext: File extension
Returns: Returns:
File URL File URL
""" """
@@ -310,23 +331,21 @@ class MinioStorage(Storage):
logger.info(f"Uploading bytes content to MinIO, size: {len(content)} bytes") logger.info(f"Uploading bytes content to MinIO, size: {len(content)} bytes")
if not self.client: if not self.client:
return "" return ""
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.path_prefix else f"images/{uuid.uuid4().hex}{file_ext}" object_key = (
f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}"
if self.path_prefix
else f"images/{uuid.uuid4().hex}{file_ext}"
)
logger.info(f"Generated MinIO object key: {object_key}") logger.info(f"Generated MinIO object key: {object_key}")
self.client.put_object( self.client.put_object(
self.bucket_name, self.bucket_name or "",
object_key, object_key,
data=io.BytesIO(content), data=io.BytesIO(content),
length=len(content), length=len(content),
content_type="application/octet-stream" content_type="application/octet-stream",
)
file_url = self._get_download_url(
self.bucket_name,
object_key,
self.use_ssl,
self.endpoint,
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
) )
file_url = self._get_download_url(object_key)
logger.info(f"Successfully uploaded bytes to MinIO: {file_url}") logger.info(f"Successfully uploaded bytes to MinIO: {file_url}")
return file_url return file_url
except Exception as e: except Exception as e:
@@ -335,26 +354,61 @@ class MinioStorage(Storage):
return "" return ""
def create_storage(storage_config=None) -> Storage: class LocalStorage(Storage):
"""Local file system storage implementation"""
def __init__(self, storage_config: Dict[str, str] = {}):
self.storage_config = storage_config
base_dir = storage_config.get(
"base_dir", os.getenv("LOCAL_STORAGE_BASE_DIR", "")
)
self.image_dir = os.path.join(base_dir, "images")
os.makedirs(self.image_dir, exist_ok=True)
def upload_file(self, file_path: str) -> str:
logger.info(f"Uploading file to local storage: {file_path}")
return file_path
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
logger.info(f"Uploading file to local storage: {len(content)} bytes")
fname = os.path.join(self.image_dir, f"{uuid.uuid4()}{file_ext}")
with open(fname, "wb") as f:
f.write(content)
return fname
class Base64Storage(Storage):
def upload_file(self, file_path: str) -> str:
logger.info(f"Uploading file to base64 storage: {file_path}")
return file_path
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
logger.info(f"Uploading file to base64 storage: {len(content)} bytes")
file_ext = file_ext.lstrip(".")
return f"data:image/{file_ext};base64,{endecode.decode_image(content)}"
def create_storage(storage_config: Dict[str, str] | None = None) -> Storage:
"""Create a storage instance based on configuration or environment variables """Create a storage instance based on configuration or environment variables
Args: Args:
storage_config: Storage configuration dictionary storage_config: Storage configuration dictionary
Returns: Returns:
Storage instance Storage instance
""" """
storage_type = os.getenv("STORAGE_TYPE", "cos").lower() storage_type = os.getenv("STORAGE_TYPE", "cos").lower()
if storage_config: if storage_config:
storage_type = str(storage_config.get("provider", storage_type)).lower() storage_type = str(storage_config.get("provider", storage_type)).lower()
logger.info(f"Creating {storage_type} storage instance") logger.info(f"Creating {storage_type} storage instance")
if storage_type == "minio": if storage_type == "minio":
return MinioStorage(storage_config) return MinioStorage(storage_config)
elif storage_type == "cos": elif storage_type == "cos":
# Default to COS
return CosStorage(storage_config) return CosStorage(storage_config)
else: elif storage_type == "local":
return None return LocalStorage(storage_config or {})
elif storage_type == "base64":
return Base64Storage()
raise ValueError(f"Invalid storage type: {storage_type}")
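The two new providers allow running without COS or MinIO credentials. An illustrative sketch; the module import path is assumed, and the byte payloads are placeholders:

import os
from docreader.storage import create_storage

os.environ["STORAGE_TYPE"] = "base64"
inline = create_storage()  # Base64Storage: returns data:image/...;base64,... URIs, no network
print(inline.upload_bytes(b"<png bytes>")[:40])

local = create_storage({"provider": "local", "base_dir": "/tmp/docreader"})
print(local.upload_bytes(b"hello", file_ext=".txt"))  # /tmp/docreader/images/<uuid>.txt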


@@ -1,6 +1,8 @@
import logging import logging
from .base_parser import BaseParser
from typing import Dict, Any, Tuple, Union from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -11,7 +13,7 @@ class TextParser(BaseParser):
This parser handles text extraction and chunking from plain text documents. This parser handles text extraction and chunking from plain text documents.
""" """
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]: def parse_into_text(self, content: bytes) -> Document:
""" """
Parse text document content by decoding bytes to string. Parse text document content by decoding bytes to string.
@@ -25,20 +27,15 @@ class TextParser(BaseParser):
Parsed text content as string Parsed text content as string
""" """
logger.info(f"Parsing text document, content size: {len(content)} bytes") logger.info(f"Parsing text document, content size: {len(content)} bytes")
text = self.decode_bytes(content) text = endecode.decode_bytes(content)
logger.info( logger.info(
f"Successfully parsed text document, extracted {len(text)} characters" f"Successfully parsed text document, extracted {len(text)} characters"
) )
return text return Document(content=text)
if __name__ == "__main__": if __name__ == "__main__":
logging.basicConfig( logger = logging.getLogger(__name__)
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger.info("Running TextParser in standalone mode")
# Sample text for testing # Sample text for testing
text = """## 标题1 text = """## 标题1


@@ -1,11 +1,14 @@
from typing import Any, Optional, Tuple, Dict, Union
import os
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from .base_parser import BaseParser, ParseResult
import logging
import asyncio import asyncio
import logging
import os
from typing import Any
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -59,7 +62,7 @@ class WebParser(BaseParser):
# Return empty BeautifulSoup object on error # Return empty BeautifulSoup object on error
return BeautifulSoup("", "html.parser") return BeautifulSoup("", "html.parser")
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]: def parse_into_text(self, content: bytes) -> Document:
"""Parse web page """Parse web page
Args: Args:
@@ -78,10 +81,10 @@ class WebParser(BaseParser):
# Run async method # Run async method
# Handle content possibly being a string # Handle content possibly being a string
if isinstance(content, bytes): if isinstance(content, bytes):
url = self.decode_bytes(content) url = endecode.decode_bytes(content)
logger.info(f"Decoded URL from bytes: {url}") logger.info(f"Decoded URL from bytes: {url}")
else: else:
url = content url = str(content)
logger.info(f"Using content as URL directly: {url}") logger.info(f"Using content as URL directly: {url}")
logger.info(f"Scraping web page: {url}") logger.info(f"Scraping web page: {url}")
@@ -118,11 +121,11 @@ class WebParser(BaseParser):
logger.info( logger.info(
f"Web page parsing complete, total content: {len(result)} characters" f"Web page parsing complete, total content: {len(result)} characters"
) )
return result return Document(content=result)
except Exception as e: except Exception as e:
logger.error(f"Error parsing web page: {str(e)}") logger.error(f"Error parsing web page: {str(e)}")
return f"Error parsing web page: {str(e)}" return Document(content=f"Error parsing web page: {str(e)}")
finally: finally:
# Close event loop # Close event loop
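For reference, parse() now takes the URL as bytes and returns a Document, matching the parse_url path in parser.py. A small sketch; the constructor arguments mirror those used there, and the URL is a placeholder:

from docreader.parser.web_parser import WebParser

parser = WebParser(title="Example", chunk_size=512, chunk_overlap=100)
doc = parser.parse("https://example.com/post".encode())
print(doc.content[:200])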


@@ -0,0 +1,127 @@
from google.protobuf.internal import containers as _containers
from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from collections.abc import Iterable as _Iterable, Mapping as _Mapping
from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
DESCRIPTOR: _descriptor.FileDescriptor
class StorageProvider(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
__slots__ = ()
STORAGE_PROVIDER_UNSPECIFIED: _ClassVar[StorageProvider]
COS: _ClassVar[StorageProvider]
MINIO: _ClassVar[StorageProvider]
STORAGE_PROVIDER_UNSPECIFIED: StorageProvider
COS: StorageProvider
MINIO: StorageProvider
class StorageConfig(_message.Message):
__slots__ = ("provider", "region", "bucket_name", "access_key_id", "secret_access_key", "app_id", "path_prefix")
PROVIDER_FIELD_NUMBER: _ClassVar[int]
REGION_FIELD_NUMBER: _ClassVar[int]
BUCKET_NAME_FIELD_NUMBER: _ClassVar[int]
ACCESS_KEY_ID_FIELD_NUMBER: _ClassVar[int]
SECRET_ACCESS_KEY_FIELD_NUMBER: _ClassVar[int]
APP_ID_FIELD_NUMBER: _ClassVar[int]
PATH_PREFIX_FIELD_NUMBER: _ClassVar[int]
provider: StorageProvider
region: str
bucket_name: str
access_key_id: str
secret_access_key: str
app_id: str
path_prefix: str
def __init__(self, provider: _Optional[_Union[StorageProvider, str]] = ..., region: _Optional[str] = ..., bucket_name: _Optional[str] = ..., access_key_id: _Optional[str] = ..., secret_access_key: _Optional[str] = ..., app_id: _Optional[str] = ..., path_prefix: _Optional[str] = ...) -> None: ...
class VLMConfig(_message.Message):
__slots__ = ("model_name", "base_url", "api_key", "interface_type")
MODEL_NAME_FIELD_NUMBER: _ClassVar[int]
BASE_URL_FIELD_NUMBER: _ClassVar[int]
API_KEY_FIELD_NUMBER: _ClassVar[int]
INTERFACE_TYPE_FIELD_NUMBER: _ClassVar[int]
model_name: str
base_url: str
api_key: str
interface_type: str
def __init__(self, model_name: _Optional[str] = ..., base_url: _Optional[str] = ..., api_key: _Optional[str] = ..., interface_type: _Optional[str] = ...) -> None: ...
class ReadConfig(_message.Message):
__slots__ = ("chunk_size", "chunk_overlap", "separators", "enable_multimodal", "storage_config", "vlm_config")
CHUNK_SIZE_FIELD_NUMBER: _ClassVar[int]
CHUNK_OVERLAP_FIELD_NUMBER: _ClassVar[int]
SEPARATORS_FIELD_NUMBER: _ClassVar[int]
ENABLE_MULTIMODAL_FIELD_NUMBER: _ClassVar[int]
STORAGE_CONFIG_FIELD_NUMBER: _ClassVar[int]
VLM_CONFIG_FIELD_NUMBER: _ClassVar[int]
chunk_size: int
chunk_overlap: int
separators: _containers.RepeatedScalarFieldContainer[str]
enable_multimodal: bool
storage_config: StorageConfig
vlm_config: VLMConfig
def __init__(self, chunk_size: _Optional[int] = ..., chunk_overlap: _Optional[int] = ..., separators: _Optional[_Iterable[str]] = ..., enable_multimodal: bool = ..., storage_config: _Optional[_Union[StorageConfig, _Mapping]] = ..., vlm_config: _Optional[_Union[VLMConfig, _Mapping]] = ...) -> None: ...
class ReadFromFileRequest(_message.Message):
__slots__ = ("file_content", "file_name", "file_type", "read_config", "request_id")
FILE_CONTENT_FIELD_NUMBER: _ClassVar[int]
FILE_NAME_FIELD_NUMBER: _ClassVar[int]
FILE_TYPE_FIELD_NUMBER: _ClassVar[int]
READ_CONFIG_FIELD_NUMBER: _ClassVar[int]
REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
file_content: bytes
file_name: str
file_type: str
read_config: ReadConfig
request_id: str
def __init__(self, file_content: _Optional[bytes] = ..., file_name: _Optional[str] = ..., file_type: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ...
class ReadFromURLRequest(_message.Message):
__slots__ = ("url", "title", "read_config", "request_id")
URL_FIELD_NUMBER: _ClassVar[int]
TITLE_FIELD_NUMBER: _ClassVar[int]
READ_CONFIG_FIELD_NUMBER: _ClassVar[int]
REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
url: str
title: str
read_config: ReadConfig
request_id: str
def __init__(self, url: _Optional[str] = ..., title: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ...
class Image(_message.Message):
__slots__ = ("url", "caption", "ocr_text", "original_url", "start", "end")
URL_FIELD_NUMBER: _ClassVar[int]
CAPTION_FIELD_NUMBER: _ClassVar[int]
OCR_TEXT_FIELD_NUMBER: _ClassVar[int]
ORIGINAL_URL_FIELD_NUMBER: _ClassVar[int]
START_FIELD_NUMBER: _ClassVar[int]
END_FIELD_NUMBER: _ClassVar[int]
url: str
caption: str
ocr_text: str
original_url: str
start: int
end: int
def __init__(self, url: _Optional[str] = ..., caption: _Optional[str] = ..., ocr_text: _Optional[str] = ..., original_url: _Optional[str] = ..., start: _Optional[int] = ..., end: _Optional[int] = ...) -> None: ...
class Chunk(_message.Message):
__slots__ = ("content", "seq", "start", "end", "images")
CONTENT_FIELD_NUMBER: _ClassVar[int]
SEQ_FIELD_NUMBER: _ClassVar[int]
START_FIELD_NUMBER: _ClassVar[int]
END_FIELD_NUMBER: _ClassVar[int]
IMAGES_FIELD_NUMBER: _ClassVar[int]
content: str
seq: int
start: int
end: int
images: _containers.RepeatedCompositeFieldContainer[Image]
def __init__(self, content: _Optional[str] = ..., seq: _Optional[int] = ..., start: _Optional[int] = ..., end: _Optional[int] = ..., images: _Optional[_Iterable[_Union[Image, _Mapping]]] = ...) -> None: ...
class ReadResponse(_message.Message):
__slots__ = ("chunks", "error")
CHUNKS_FIELD_NUMBER: _ClassVar[int]
ERROR_FIELD_NUMBER: _ClassVar[int]
chunks: _containers.RepeatedCompositeFieldContainer[Chunk]
error: str
def __init__(self, chunks: _Optional[_Iterable[_Union[Chunk, _Mapping]]] = ..., error: _Optional[str] = ...) -> None: ...
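These stubs restore static type checking for the generated messages. A sketch of building a request with them; the server address is a placeholder, and the stub class and method names are assumptions, since the service definition is not part of this diff:

import grpc
from docreader.proto import docreader_pb2, docreader_pb2_grpc

req = docreader_pb2.ReadFromFileRequest(
    file_content=b"hello world",
    file_name="note.txt",
    file_type="txt",
    read_config=docreader_pb2.ReadConfig(chunk_size=512, chunk_overlap=100),
)
channel = grpc.insecure_channel("localhost:50051")
stub = docreader_pb2_grpc.DocReaderStub(channel)  # class name assumed
resp = stub.ReadFromFile(req)                     # method name assumed from the request type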


@@ -3,7 +3,7 @@
import grpc import grpc
import warnings import warnings
from . import docreader_pb2 as docreader__pb2 import docreader_pb2 as docreader__pb2
GRPC_GENERATED_VERSION = '1.76.0' GRPC_GENERATED_VERSION = '1.76.0'
GRPC_VERSION = grpc.__version__ GRPC_VERSION = grpc.__version__


@@ -16,6 +16,7 @@ dependencies = [
"lxml>=6.0.2", "lxml>=6.0.2",
"markdown>=3.10", "markdown>=3.10",
"markdownify>=1.2.0", "markdownify>=1.2.0",
"markitdown[docx,pdf,xls,xlsx]>=0.1.3",
"minio>=7.2.18", "minio>=7.2.18",
"mistletoe>=1.5.0", "mistletoe>=1.5.0",
"ollama>=0.6.0", "ollama>=0.6.0",
@@ -26,6 +27,7 @@ dependencies = [
"pillow>=12.0.0", "pillow>=12.0.0",
"playwright>=1.55.0", "playwright>=1.55.0",
"protobuf>=6.33.0", "protobuf>=6.33.0",
"pydantic>=2.12.3",
"pypdf>=6.1.3", "pypdf>=6.1.3",
"pypdf2>=3.0.1", "pypdf2>=3.0.1",
"python-docx>=1.2.0", "python-docx>=1.2.0",


@@ -2,13 +2,14 @@
set -x set -x
# Set directories # Set directories
PROTO_DIR="proto" PROTO_DIR="docreader/proto"
PYTHON_OUT="proto" PYTHON_OUT="docreader/proto"
GO_OUT="proto" GO_OUT="docreader/proto"
# Generate Python code # Generate Python code
python3 -m grpc_tools.protoc -I${PROTO_DIR} \ python3 -m grpc_tools.protoc -I${PROTO_DIR} \
--python_out=${PYTHON_OUT} \ --python_out=${PYTHON_OUT} \
--pyi_out=${PYTHON_OUT} \
--grpc_python_out=${PYTHON_OUT} \ --grpc_python_out=${PYTHON_OUT} \
${PROTO_DIR}/docreader.proto ${PROTO_DIR}/docreader.proto
@@ -22,10 +23,10 @@ protoc -I${PROTO_DIR} --go_out=${GO_OUT} \
# Fix Python imports (macOS-compatible version) # Fix Python imports (macOS-compatible version)
if [ "$(uname)" == "Darwin" ]; then if [ "$(uname)" == "Darwin" ]; then
# macOS version # macOS version
sed -i '' 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py sed -i '' 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
else else
# Linux version # Linux version
sed -i 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py sed -i 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
fi fi
echo "Proto files generated successfully!" echo "Proto files generated successfully!"


@@ -0,0 +1,112 @@
import re
from typing import Callable, Dict, List, Match, Pattern, Union
from pydantic import BaseModel, Field
class HeaderTrackerHook(BaseModel):
    """Config class for the header-tracking hook; supports header detection in multiple scenarios"""
    start_pattern: Pattern[str] = Field(
        description="Match for the start of a header (regex or string)"
    )
    end_pattern: Pattern[str] = Field(description="Match for the end of a header (regex or string)")
    extract_header_fn: Callable[[Match[str]], str] = Field(
        default=lambda m: m.group(0),
        description="Function extracting the header text from the start match (defaults to the whole match)",
    )
    priority: int = Field(default=0, description="Priority (with multiple configs, higher priority matches first)")
    case_sensitive: bool = Field(
        default=True, description="Case sensitivity (only effective when a string pattern is passed in)"
    )
def __init__(
self,
start_pattern: Union[str, Pattern[str]],
end_pattern: Union[str, Pattern[str]],
**kwargs,
):
flags = 0 if kwargs.get("case_sensitive", True) else re.IGNORECASE
if isinstance(start_pattern, str):
start_pattern = re.compile(start_pattern, flags | re.DOTALL)
if isinstance(end_pattern, str):
end_pattern = re.compile(end_pattern, flags | re.DOTALL)
super().__init__(
start_pattern=start_pattern,
end_pattern=end_pattern,
**kwargs,
)
# Initialize header hook configs (defaults support Markdown tables and code blocks)
DEFAULT_CONFIGS = [
    # Code block config (starts with ``` and ends with ```)
    # HeaderTrackerHook(
    #     # Code block start (language tag supported)
    #     start_pattern=r"^\s*```(\w+).*(?!```$)",
    #     # Code block end
    #     end_pattern=r"^\s*```.*$",
    #     extract_header_fn=lambda m: f"```{m.group(1)}" if m.group(1) else "```",
    #     priority=20,  # code blocks take priority over tables
    #     case_sensitive=True,
    # ),
    # Markdown table config (header row with separator underneath)
    HeaderTrackerHook(
        # Header row + separator row
        start_pattern=r"^\s*(?:\|[^|\n]*)+[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|?[\r\n]+$",
        # Blank line or non-table content
        end_pattern=r"^\s*$|^\s*[^|\s].*$",
        priority=15,
        case_sensitive=False,
    ),
]
DEFAULT_CONFIGS.sort(key=lambda x: -x.priority)
# Hook state data structure
class HeaderTracker(BaseModel):
    """State class for the header-tracking hook"""
header_hook_configs: List[HeaderTrackerHook] = Field(default=DEFAULT_CONFIGS)
active_headers: Dict[int, str] = Field(default_factory=dict)
ended_headers: set[int] = Field(default_factory=set)
    def update(self, split: str) -> Dict[int, str]:
        """Detect header starts/ends in the current split and update hook state"""
new_headers: Dict[int, str] = {}
        # 1. Check whether any header end marker is present
for config in self.header_hook_configs:
if config.priority in self.active_headers and config.end_pattern.search(
split
):
self.ended_headers.add(config.priority)
del self.active_headers[config.priority]
        # 2. Check for new header start markers (only configs that are neither active nor ended)
for config in self.header_hook_configs:
if (
config.priority not in self.active_headers
and config.priority not in self.ended_headers
):
match = config.start_pattern.search(split)
if match:
header = config.extract_header_fn(match)
self.active_headers[config.priority] = header
new_headers[config.priority] = header
        # 3. If no headers remain active, clear the ended markers
if not self.active_headers:
self.ended_headers.clear()
return new_headers
    def get_headers(self) -> str:
        """Get the concatenated text of all active headers (sorted by priority)"""
        # Sort headers by priority in descending order
sorted_headers = sorted(self.active_headers.items(), key=lambda x: -x[0])
return (
"\n".join([header for _, header in sorted_headers])
if sorted_headers
else ""
)
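A short walkthrough of the hook's lifecycle, assuming splits arrive in document order; the table content is illustrative:

from docreader.splitter.header_hook import HeaderTracker

tracker = HeaderTracker()
tracker.update("| name | age |\n| --- | --- |\n")  # header + separator rows activate the table hook
tracker.update("| alice | 30 |\n")                 # body rows keep it active
print(tracker.get_headers())                       # header text, ready to prefix the next chunk
tracker.update("\n")                               # a blank line ends the table
print(tracker.get_headers())                       # now an empty string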


@@ -0,0 +1,313 @@
"""Token splitter."""
import itertools
import logging
import re
from typing import Callable, Generic, List, Pattern, Tuple, TypeVar
from pydantic import BaseModel, Field, PrivateAttr
from docreader.splitter.header_hook import (
HeaderTracker,
)
from docreader.utils.split import split_by_char, split_by_sep
DEFAULT_CHUNK_OVERLAP = 100
DEFAULT_CHUNK_SIZE = 512
T = TypeVar("T")
logger = logging.getLogger(__name__)
class TextSplitter(BaseModel, Generic[T]):
chunk_size: int = Field(description="The token chunk size for each chunk.")
chunk_overlap: int = Field(
description="The token overlap of each chunk when splitting."
)
separators: List[str] = Field(
description="Default separators for splitting into words"
)
# Try to keep the matched characters as a whole.
# If it's too long, the content will be further segmented.
protected_regex: List[str] = Field(
description="Protected regex for splitting into words"
)
len_function: Callable[[str], int] = Field(description="The length function.")
# Header tracking Hook related attributes
header_hook: HeaderTracker = Field(default_factory=HeaderTracker, exclude=True)
_protected_fns: List[Pattern] = PrivateAttr()
_split_fns: List[Callable] = PrivateAttr()
def __init__(
self,
chunk_size: int = DEFAULT_CHUNK_SIZE,
chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
separators: List[str] = ["\n", "", " "],
protected_regex: List[str] = [
# math formula
r"\$\$[\s\S]*?\$\$",
# image
r"!\[.*?\]\(.*?\)",
# link
r"\[.*?\]\(.*?\)",
# table header
r"(?:\|[^|\n]*)+\|[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|[\r\n]+",
# table body
r"(?:\|[^|\n]*)+\|[\r\n]+",
# code header
r"```(?:\w+)[\r\n]+[^\r\n]*",
],
length_function: Callable[[str], int] = lambda x: len(x),
):
"""Initialize with parameters."""
if chunk_overlap > chunk_size:
raise ValueError(
f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
f"({chunk_size}), should be smaller."
)
super().__init__(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=separators,
protected_regex=protected_regex,
len_function=length_function,
)
self._protected_fns = [re.compile(reg) for reg in protected_regex]
self._split_fns = [split_by_sep(sep) for sep in separators] + [split_by_char()]
    def split_text(self, text: str) -> List[Tuple[int, int, str]]:
        """Split text into chunks."""
        if text == "":
            return []
        splits = self._split(text)
        protect = self._split_protected(text)
        splits = self._join(splits, protect)
        assert "".join(splits) == text
        chunks = self._merge(splits)
        return chunks

    def _split(self, text: str) -> List[str]:
        """Break text into splits that are smaller than chunk size.

        NOTE: the splits contain the separators.
        """
        if self.len_function(text) <= self.chunk_size:
            return [text]

        splits = []
        for split_fn in self._split_fns:
            splits = split_fn(text)
            if len(splits) > 1:
                break

        new_splits = []
        for split in splits:
            split_len = self.len_function(split)
            if split_len <= self.chunk_size:
                new_splits.append(split)
            else:
                # recursively split
                new_splits.extend(self._split(split))
        return new_splits
    def _merge(self, splits: List[str]) -> List[Tuple[int, int, str]]:
        """Merge splits into chunks.

        The high-level idea is to keep adding splits to a chunk until we
        exceed the chunk size, then we start a new chunk with overlap.

        When we start a new chunk, we pop off the first element of the previous
        chunk until the total length is less than the chunk size.
        """
        chunks: List[Tuple[int, int, str]] = []
        cur_chunk: List[Tuple[int, int, str]] = []
        cur_headers, cur_len = "", 0
        cur_start, cur_end = 0, 0
        for split in splits:
            cur_end = cur_start + len(split)
            split_len = self.len_function(split)
            if split_len > self.chunk_size:
                logger.error(
                    f"Got a split of size {split_len}, "
                    f"larger than chunk size {self.chunk_size}."
                )

            self.header_hook.update(split)
            cur_headers = self.header_hook.get_headers()
            cur_headers_len = self.len_function(cur_headers)
            if cur_headers_len > self.chunk_size:
                logger.error(
                    f"Got headers of size {cur_headers_len}, "
                    f"larger than chunk size {self.chunk_size}."
                )
                cur_headers, cur_headers_len = "", 0

            # if we exceed the chunk size after adding the new split, then
            # we need to end the current chunk and start a new one
            if cur_len + split_len + cur_headers_len > self.chunk_size:
                # end the previous chunk
                if len(cur_chunk) > 0:
                    chunks.append(
                        (
                            cur_chunk[0][0],
                            cur_chunk[-1][1],
                            "".join([c[2] for c in cur_chunk]),
                        )
                    )
                # start a new chunk with overlap
                # keep popping off the first element of the previous chunk until:
                #   1. the current chunk length is less than chunk overlap
                #   2. the total length is less than chunk size
                while cur_chunk and (
                    cur_len > self.chunk_overlap
                    or cur_len + split_len + cur_headers_len > self.chunk_size
                ):
                    # pop off the first element
                    first_chunk = cur_chunk.pop(0)
                    cur_len -= self.len_function(first_chunk[2])
                if (
                    cur_headers
                    and split_len + cur_headers_len < self.chunk_size
                    and cur_headers not in split
                ):
                    cur_chunk.insert(
                        0,
                        (
                            cur_chunk[0][0] if cur_chunk else cur_start,
                            cur_chunk[0][1] if cur_chunk else cur_end,
                            cur_headers,
                        ),
                    )
                    cur_len += cur_headers_len
            cur_chunk.append((cur_start, cur_end, split))
            cur_len += split_len
            cur_start = cur_end

        # handle the last chunk
        assert cur_chunk
        if cur_headers and cur_len < self.chunk_size:
            cur_chunk.insert(0, (cur_chunk[0][0], cur_chunk[0][1], cur_headers))
        chunks.append(
            (
                cur_chunk[0][0],
                cur_chunk[-1][1],
                "".join([c[2] for c in cur_chunk]),
            )
        )
        return chunks
    def _split_protected(self, text: str) -> List[Tuple[int, str]]:
        matches = [
            (match.start(), match.end())
            for pattern in self._protected_fns
            for match in pattern.finditer(text)
        ]
        matches.sort(key=lambda x: (x[0], -x[1]))
        res = []

        def fold(initial: int, current: Tuple[int, int]) -> int:
            if current[0] >= initial:
                if current[1] - current[0] < self.chunk_size:
                    res.append((current[0], text[current[0] : current[1]]))
                else:
                    logger.warning(f"Protected text ignored (too long): {current}")
            return max(initial, current[1])

        # filter overlapping matches
        list(itertools.accumulate(matches, fold, initial=-1))
        return res
    def _join(self, splits: List[str], protect: List[Tuple[int, str]]) -> List[str]:
        """
        Merges and splits elements in the splits array based on protected substrings.

        The function processes the input splits to ensure all protected substrings
        remain as single items. If a protected substring is concatenated with preceding
        or following content in any split element, it will be separated from
        the adjacent content. The final result maintains the original order of content
        while enforcing the integrity of protected substrings.

        Key behaviors:
        1. Preserves the complete structure of each protected substring
        2. Separates protected substrings from any adjacent non-protected content
        3. Maintains the original sequence of all content, except for the
           separations needed around protected substrings
        4. Handles cases where protected substrings are partially concatenated
        """
        j = 0
        point, start = 0, 0
        res = []
        for split in splits:
            end = start + len(split)
            cur = split[point - start :]
            while j < len(protect):
                p_start, p_content = protect[j]
                p_end = p_start + len(p_content)
                if end <= p_start:
                    break
                if point < p_start:
                    local_end = p_start - point
                    res.append(cur[:local_end])
                    cur = cur[local_end:]
                    point = p_start
                res.append(p_content)
                j += 1
                if point < p_end:
                    local_start = p_end - point
                    cur = cur[local_start:]
                    point = p_end
                if not cur:
                    break
            if cur:
                res.append(cur)
            point = end
            start = end
        return res
if __name__ == "__main__":
    s = """
这是一些普通文本。
| 姓名 | 年龄 | 城市 |
|------|------|------|
| 张三 | 25 | 北京 |
| 李四 | 30 | 上海 |
| 王五 | 28 | 广州 |
| 张三 | 25 | 北京 |
| 李四 | 30 | 上海 |
| 王五 | 28 | 广州 |
这是文本结束。
"""
    sp = TextSplitter(chunk_size=200, chunk_overlap=2)
    ck = sp.split_text(s)
    for c in ck:
        print("------", len(c))
        print(c)
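The protected patterns are easiest to see in action; a small sketch (the module path of TextSplitter is assumed here):

from docreader.splitter.splitter import TextSplitter  # module path assumed

sp = TextSplitter(chunk_size=20, chunk_overlap=5)
for start, end, chunk in sp.split_text("intro text ![](img.png) trailing tail"):
    print(start, end, repr(chunk))
# The image token ![](img.png) matches a protected regex, so it is kept as
# one unbroken unit even though the chunk size is tiny.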

docreader/utils/endecode.py Normal file

@@ -0,0 +1,103 @@
import base64
import binascii
import io
import logging
from typing import List, Union

import numpy as np
from PIL import Image

logger = logging.getLogger(__name__)


def decode_image(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
    """Convert image to base64 encoded string

    Args:
        image: Image file path, bytes, PIL Image object, or numpy array

    Returns:
        Base64 encoded image string

    Raises:
        ValueError: If the image type is unsupported
    """
    if isinstance(image, str):
        # It's a file path
        with open(image, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode()
    elif isinstance(image, bytes):
        # It's bytes data
        return base64.b64encode(image).decode()
    elif isinstance(image, Image.Image):
        # It's a PIL Image
        buffer = io.BytesIO()
        image.save(buffer, format=image.format)
        return base64.b64encode(buffer.getvalue()).decode()
    elif isinstance(image, np.ndarray):
        # It's a numpy array
        pil_image = Image.fromarray(image)
        buffer = io.BytesIO()
        pil_image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode()
    raise ValueError(f"Unsupported image type: {type(image)}")
def encode_image(image: str, errors="strict") -> bytes:
    """
    Decode a base64-encoded image string into raw bytes.

    errors
        The error handling scheme to use for decoding errors.
        The default is 'strict', meaning that invalid base64 input raises
        binascii.Error. 'ignore' returns empty bytes instead of raising.
    """
    try:
        image_bytes = base64.b64decode(image)
    except binascii.Error as e:
        if errors == "ignore":
            return b""
        else:
            raise e
    return image_bytes
def encode_bytes(content: str) -> bytes:
    return content.encode()


def decode_bytes(
    content: bytes,
    encodings: List[str] = [
        "utf-8",
        "gb18030",
        "gb2312",
        "gbk",
        "big5",
        "ascii",
        "latin-1",
    ],
) -> str:
    # Try decoding with each encoding format
    for encoding in encodings:
        try:
            text = content.decode(encoding)
            logger.debug(f"Decode content with {encoding}: {len(text)} characters")
            return text
        except UnicodeDecodeError:
            continue
    text = content.decode(encoding="latin-1", errors="replace")
    logger.warning(
        "Unable to determine correct encoding, using latin-1 as fallback. "
        "This may cause character issues."
    )
    return text
if __name__ == "__main__":
    img = "test![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMgA)test"
    # The string as a whole is not valid base64, so with errors="ignore"
    # this returns b"" instead of raising binascii.Error
    encode_image(img, errors="ignore")
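A quick round trip through these helpers:

from docreader.utils.endecode import decode_bytes, decode_image, encode_image

b64 = decode_image(b"\x89PNG fake bytes")          # bytes -> base64 string
assert encode_image(b64) == b"\x89PNG fake bytes"  # base64 string -> bytes
# decode_bytes walks the encoding list until one decodes cleanly
print(decode_bytes("你好".encode("gb18030")))  # -> 你好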


@@ -1,10 +1,10 @@
-from contextvars import ContextVar
-import logging
-import uuid
import contextlib
+import logging
import time
-from typing import Optional
+import uuid
+from contextvars import ContextVar
from logging import LogRecord
+from typing import Optional

# Configure logging
logger = logging.getLogger(__name__)
@@ -26,21 +26,21 @@ def get_request_id() -> Optional[str]:

class MillisecondFormatter(logging.Formatter):
    """Custom formatter that shows millisecond timestamps (3 digits) instead of microseconds (6)."""

    def formatTime(self, record, datefmt=None):
        """Override formatTime to truncate microseconds down to milliseconds."""
        # Get the standard formatted time first
        result = super().formatTime(record, datefmt)
        # If the format contains .%f, truncate the 6-digit microseconds to 3-digit milliseconds
        if datefmt and ".%f" in datefmt:
            # The formatted time string should end with 6 microsecond digits
-            parts = result.split('.')
+            parts = result.split(".")
            if len(parts) > 1 and len(parts[1]) >= 6:
                # Keep only the first 3 digits as milliseconds
                millis = parts[1][:3]
                result = f"{parts[0]}.{millis}"
        return result
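The truncation rule from the hunk above, shown in isolation:

result = "10:30:02.123456"  # a formatted timestamp with 6 microsecond digits
parts = result.split(".")
if len(parts) > 1 and len(parts[1]) >= 6:
    result = f"{parts[0]}.{parts[1][:3]}"
print(result)  # 10:30:02.123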

docreader/utils/split.py Normal file

@@ -0,0 +1,34 @@
import re
from typing import Callable, List


def split_text_keep_separator(text: str, separator: str) -> List[str]:
    """Split text by separator, keeping the separator attached to the following split."""
    parts = text.split(separator)
    result = [separator + s if i > 0 else s for i, s in enumerate(parts)]
    return [s for s in result if s]


def split_by_sep(sep: str, keep_sep: bool = True) -> Callable[[str], List[str]]:
    """Split text by separator."""
    if keep_sep:
        return lambda text: split_text_keep_separator(text, sep)
    else:
        return lambda text: text.split(sep)


def split_by_char() -> Callable[[str], List[str]]:
    """Split text by character."""
    return lambda text: list(text)


def split_by_regex(regex: str) -> Callable[[str], List[str]]:
    """Split text by regex, keeping the matched separators as items."""
    pattern = re.compile(f"({regex})")
    return lambda text: list(filter(None, pattern.split(text)))


def match_by_regex(regex: str) -> Callable[[str], bool]:
    """Match text against a regex."""
    pattern = re.compile(regex)
    return lambda text: bool(pattern.match(text))
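These helpers form the splitter's fallback chain; for instance:

from docreader.utils.split import split_by_char, split_by_regex, split_by_sep

print(split_by_sep("\n")("a\nb"))      # ['a', '\nb']: separator stays with the next piece
print(split_by_char()("ab"))           # ['a', 'b']
print(split_by_regex(r"\d+")("a12b"))  # ['a', '12', 'b']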


@@ -0,0 +1,77 @@
import logging
import os
import tempfile

logger = logging.getLogger(__name__)


class TempFileContext:
    def __init__(self, file_content: bytes, suffix: str):
        """
        Initialize the context
        :param file_content: Byte data to write to file
        :param suffix: File suffix
        """
        self.file_content = file_content
        self.suffix = suffix
        self.temp_file = None

    def __enter__(self):
        """
        Create file when entering context
        """
        self.temp_file = tempfile.NamedTemporaryFile(suffix=self.suffix, delete=False)
        self.temp_file.write(self.file_content)
        self.temp_file.flush()
        logger.info(
            f"Saved {self.suffix} content to temporary file: {self.temp_file.name}"
        )
        return self.temp_file.name

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Delete file when exiting context
        """
        if self.temp_file:
            self.temp_file.close()
            if os.path.exists(self.temp_file.name):
                os.remove(self.temp_file.name)
                logger.info(f"File {self.temp_file.name} has been deleted.")
        # Return False to propagate exception (if any exception occurred)
        return False
class TempDirContext:
    def __init__(self):
        """
        Initialize the context
        """
        self.temp_dir = None

    def __enter__(self):
        """
        Create directory when entering context
        """
        self.temp_dir = tempfile.TemporaryDirectory()
        logger.info(f"Created temporary directory: {self.temp_dir.name}")
        return self.temp_dir.name

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Delete directory when exiting context
        """
        if self.temp_dir and os.path.exists(self.temp_dir.name):
            self.temp_dir.cleanup()
            logger.info(f"Directory {self.temp_dir.name} has been deleted.")
        # Return False to propagate exception (if any exception occurred)
        return False


if __name__ == "__main__":
    example_bytes = b"Hello, this is a test file."
    suffix = ".txt"
    # Using with statement
    with TempFileContext(example_bytes, suffix) as temp_file:
        # File operations can be performed within the context
        print(f"Does file {temp_file} exist: {os.path.exists(temp_file)}")

docreader/uv.lock generated

@@ -6,17 +6,22 @@
resolution-markers = [
    "python_full_version == '3.13.*' and sys_platform == 'darwin'",
    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
+    "python_full_version == '3.13.*' and sys_platform == 'win32'",
+    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
    "python_full_version == '3.12.*' and sys_platform == 'darwin'",
    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
+    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
    "python_full_version == '3.11.*' and sys_platform == 'darwin'",
    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
    "python_full_version < '3.11' and sys_platform == 'darwin'",
    "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version < '3.11' and sys_platform == 'win32'",
+    "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]

[[package]]
@@ -423,6 +428,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/db/d3/9dcc0f5797f070ec8edf30fbadfb200e71d9db6b84d211e3b2085a7589a0/click-8.3.0-py3-none-any.whl", hash = "sha256:9b9f285302c6e3064f4330c05f05b81945b2a39544279343e6e7c5f27a9baddc", size = 107295, upload-time = "2025-09-18T17:32:22.42Z" }, { url = "https://files.pythonhosted.org/packages/db/d3/9dcc0f5797f070ec8edf30fbadfb200e71d9db6b84d211e3b2085a7589a0/click-8.3.0-py3-none-any.whl", hash = "sha256:9b9f285302c6e3064f4330c05f05b81945b2a39544279343e6e7c5f27a9baddc", size = 107295, upload-time = "2025-09-18T17:32:22.42Z" },
] ]
[[package]]
name = "cobble"
version = "0.1.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/54/7a/a507c709be2c96e1bb6102eb7b7f4026c5e5e223ef7d745a17d239e9d844/cobble-0.1.4.tar.gz", hash = "sha256:de38be1539992c8a06e569630717c485a5f91be2192c461ea2b220607dfa78aa", size = 3805, upload-time = "2024-06-01T18:11:09.528Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d5/e1/3714a2f371985215c219c2a70953d38e3eed81ef165aed061d21de0e998b/cobble-0.1.4-py3-none-any.whl", hash = "sha256:36c91b1655e599fd428e2b95fdd5f0da1ca2e9f1abb0bc871dec21a0e78a2b44", size = 3984, upload-time = "2024-06-01T18:11:07.911Z" },
]
[[package]]
name = "colorama"
version = "0.4.6"
@@ -432,6 +446,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
] ]
[[package]]
name = "coloredlogs"
version = "15.0.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "humanfriendly" },
]
sdist = { url = "https://files.pythonhosted.org/packages/cc/c7/eed8f27100517e8c0e6b923d5f0845d0cb99763da6fdee00478f91db7325/coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0", size = 278520, upload-time = "2021-06-11T10:22:45.202Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018, upload-time = "2021-06-11T10:22:42.561Z" },
]
[[package]]
name = "cos-python-sdk-v5"
version = "1.9.38"
@@ -587,6 +613,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/18/d5/7a04640bf559bb890455ffb28978daf7d44f667c3f04a4d422c655c1ba92/cython-3.1.6-py3-none-any.whl", hash = "sha256:91dcf7eb9b6a089ce4e9e1140e571d84c3bca834afb77ec269be7aa9d31a8157", size = 1223550, upload-time = "2025-10-23T12:38:16.732Z" }, { url = "https://files.pythonhosted.org/packages/18/d5/7a04640bf559bb890455ffb28978daf7d44f667c3f04a4d422c655c1ba92/cython-3.1.6-py3-none-any.whl", hash = "sha256:91dcf7eb9b6a089ce4e9e1140e571d84c3bca834afb77ec269be7aa9d31a8157", size = 1223550, upload-time = "2025-10-23T12:38:16.732Z" },
] ]
[[package]]
name = "defusedxml"
version = "0.7.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/0f/d5/c66da9b79e5bdb124974bfe172b4daf3c984ebd9c2a06e2b8a4dc7331c72/defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69", size = 75520, upload-time = "2021-03-08T10:59:26.269Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" },
]
[[package]]
name = "distro"
version = "1.9.0"
@@ -612,6 +647,7 @@ dependencies = [
{ name = "lxml" }, { name = "lxml" },
{ name = "markdown" }, { name = "markdown" },
{ name = "markdownify" }, { name = "markdownify" },
{ name = "markitdown", extra = ["docx", "pdf", "xls", "xlsx"] },
{ name = "minio" }, { name = "minio" },
{ name = "mistletoe" }, { name = "mistletoe" },
{ name = "ollama" }, { name = "ollama" },
@@ -622,6 +658,7 @@ dependencies = [
{ name = "pillow" }, { name = "pillow" },
{ name = "playwright" }, { name = "playwright" },
{ name = "protobuf" }, { name = "protobuf" },
{ name = "pydantic" },
{ name = "pypdf" }, { name = "pypdf" },
{ name = "pypdf2" }, { name = "pypdf2" },
{ name = "python-docx" }, { name = "python-docx" },
@@ -643,6 +680,7 @@ requires-dist = [
{ name = "lxml", specifier = ">=6.0.2" }, { name = "lxml", specifier = ">=6.0.2" },
{ name = "markdown", specifier = ">=3.10" }, { name = "markdown", specifier = ">=3.10" },
{ name = "markdownify", specifier = ">=1.2.0" }, { name = "markdownify", specifier = ">=1.2.0" },
{ name = "markitdown", extras = ["docx", "pdf", "xls", "xlsx"], specifier = ">=0.1.3" },
{ name = "minio", specifier = ">=7.2.18" }, { name = "minio", specifier = ">=7.2.18" },
{ name = "mistletoe", specifier = ">=1.5.0" }, { name = "mistletoe", specifier = ">=1.5.0" },
{ name = "ollama", specifier = ">=0.6.0" }, { name = "ollama", specifier = ">=0.6.0" },
@@ -653,6 +691,7 @@ requires-dist = [
{ name = "pillow", specifier = ">=12.0.0" }, { name = "pillow", specifier = ">=12.0.0" },
{ name = "playwright", specifier = ">=1.55.0" }, { name = "playwright", specifier = ">=1.55.0" },
{ name = "protobuf", specifier = ">=6.33.0" }, { name = "protobuf", specifier = ">=6.33.0" },
{ name = "pydantic", specifier = ">=2.12.3" },
{ name = "pypdf", specifier = ">=6.1.3" }, { name = "pypdf", specifier = ">=6.1.3" },
{ name = "pypdf2", specifier = ">=3.0.1" }, { name = "pypdf2", specifier = ">=3.0.1" },
{ name = "python-docx", specifier = ">=1.2.0" }, { name = "python-docx", specifier = ">=1.2.0" },
@@ -683,6 +722,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/bf/ee/aa015c5de8b0dc42a8e507eae8c2de5d1c0e068c896858fec6d502402ed6/ebooklib-0.20-py3-none-any.whl", hash = "sha256:fff5322517a37e31c972d27be7d982cc3928c16b3dcc5fd7e8f7c0f5d7bcf42b", size = 40995, upload-time = "2025-10-26T20:56:19.104Z" }, { url = "https://files.pythonhosted.org/packages/bf/ee/aa015c5de8b0dc42a8e507eae8c2de5d1c0e068c896858fec6d502402ed6/ebooklib-0.20-py3-none-any.whl", hash = "sha256:fff5322517a37e31c972d27be7d982cc3928c16b3dcc5fd7e8f7c0f5d7bcf42b", size = 40995, upload-time = "2025-10-26T20:56:19.104Z" },
] ]
[[package]]
name = "et-xmlfile"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" },
]
[[package]]
name = "exceptiongroup"
version = "1.3.0"
@@ -707,6 +755,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e5/4c/93d0f85318da65923e4b91c1c2ff03d8a458cbefebe3bc612a6693c7906d/fire-0.7.1-py3-none-any.whl", hash = "sha256:e43fd8a5033a9001e7e2973bab96070694b9f12f2e0ecf96d4683971b5ab1882", size = 115945, upload-time = "2025-08-16T20:20:22.87Z" }, { url = "https://files.pythonhosted.org/packages/e5/4c/93d0f85318da65923e4b91c1c2ff03d8a458cbefebe3bc612a6693c7906d/fire-0.7.1-py3-none-any.whl", hash = "sha256:e43fd8a5033a9001e7e2973bab96070694b9f12f2e0ecf96d4683971b5ab1882", size = 115945, upload-time = "2025-08-16T20:20:22.87Z" },
] ]
[[package]]
name = "flatbuffers"
version = "25.9.23"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/9d/1f/3ee70b0a55137442038f2a33469cc5fddd7e0ad2abf83d7497c18a2b6923/flatbuffers-25.9.23.tar.gz", hash = "sha256:676f9fa62750bb50cf531b42a0a2a118ad8f7f797a511eda12881c016f093b12", size = 22067, upload-time = "2025-09-24T05:25:30.106Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ee/1b/00a78aa2e8fbd63f9af08c9c19e6deb3d5d66b4dda677a0f61654680ee89/flatbuffers-25.9.23-py2.py3-none-any.whl", hash = "sha256:255538574d6cb6d0a79a17ec8bc0d30985913b87513a01cce8bcdb6b4c44d0e2", size = 30869, upload-time = "2025-09-24T05:25:28.912Z" },
]
[[package]]
name = "fonttools"
version = "4.60.1"
@@ -850,6 +907,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/7f/91/ae2eb6b7979e2f9b035a9f612cf70f1bf54aad4e1d125129bef1eae96f19/greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d", size = 584358, upload-time = "2025-08-07T13:18:23.708Z" }, { url = "https://files.pythonhosted.org/packages/7f/91/ae2eb6b7979e2f9b035a9f612cf70f1bf54aad4e1d125129bef1eae96f19/greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d", size = 584358, upload-time = "2025-08-07T13:18:23.708Z" },
{ url = "https://files.pythonhosted.org/packages/f7/85/433de0c9c0252b22b16d413c9407e6cb3b41df7389afc366ca204dbc1393/greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5", size = 1113550, upload-time = "2025-08-07T13:42:37.467Z" }, { url = "https://files.pythonhosted.org/packages/f7/85/433de0c9c0252b22b16d413c9407e6cb3b41df7389afc366ca204dbc1393/greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5", size = 1113550, upload-time = "2025-08-07T13:42:37.467Z" },
{ url = "https://files.pythonhosted.org/packages/a1/8d/88f3ebd2bc96bf7747093696f4335a0a8a4c5acfcf1b757717c0d2474ba3/greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f", size = 1137126, upload-time = "2025-08-07T13:18:20.239Z" }, { url = "https://files.pythonhosted.org/packages/a1/8d/88f3ebd2bc96bf7747093696f4335a0a8a4c5acfcf1b757717c0d2474ba3/greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f", size = 1137126, upload-time = "2025-08-07T13:18:20.239Z" },
{ url = "https://files.pythonhosted.org/packages/f1/29/74242b7d72385e29bcc5563fba67dad94943d7cd03552bac320d597f29b2/greenlet-3.2.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f47617f698838ba98f4ff4189aef02e7343952df3a615f847bb575c3feb177a7", size = 1544904, upload-time = "2025-11-04T12:42:04.763Z" },
{ url = "https://files.pythonhosted.org/packages/c8/e2/1572b8eeab0f77df5f6729d6ab6b141e4a84ee8eb9bc8c1e7918f94eda6d/greenlet-3.2.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af41be48a4f60429d5cad9d22175217805098a9ef7c40bfef44f7669fb9d74d8", size = 1611228, upload-time = "2025-11-04T12:42:08.423Z" },
{ url = "https://files.pythonhosted.org/packages/d6/6f/b60b0291d9623c496638c582297ead61f43c4b72eef5e9c926ef4565ec13/greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c", size = 298654, upload-time = "2025-08-07T13:50:00.469Z" }, { url = "https://files.pythonhosted.org/packages/d6/6f/b60b0291d9623c496638c582297ead61f43c4b72eef5e9c926ef4565ec13/greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c", size = 298654, upload-time = "2025-08-07T13:50:00.469Z" },
{ url = "https://files.pythonhosted.org/packages/a4/de/f28ced0a67749cac23fecb02b694f6473f47686dff6afaa211d186e2ef9c/greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2", size = 272305, upload-time = "2025-08-07T13:15:41.288Z" }, { url = "https://files.pythonhosted.org/packages/a4/de/f28ced0a67749cac23fecb02b694f6473f47686dff6afaa211d186e2ef9c/greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2", size = 272305, upload-time = "2025-08-07T13:15:41.288Z" },
{ url = "https://files.pythonhosted.org/packages/09/16/2c3792cba130000bf2a31c5272999113f4764fd9d874fb257ff588ac779a/greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246", size = 632472, upload-time = "2025-08-07T13:42:55.044Z" }, { url = "https://files.pythonhosted.org/packages/09/16/2c3792cba130000bf2a31c5272999113f4764fd9d874fb257ff588ac779a/greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246", size = 632472, upload-time = "2025-08-07T13:42:55.044Z" },
@@ -859,6 +918,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684, upload-time = "2025-08-07T13:18:25.164Z" }, { url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684, upload-time = "2025-08-07T13:18:25.164Z" },
{ url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647, upload-time = "2025-08-07T13:42:38.655Z" }, { url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647, upload-time = "2025-08-07T13:42:38.655Z" },
{ url = "https://files.pythonhosted.org/packages/3f/cc/b07000438a29ac5cfb2194bfc128151d52f333cee74dd7dfe3fb733fc16c/greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa", size = 1142073, upload-time = "2025-08-07T13:18:21.737Z" }, { url = "https://files.pythonhosted.org/packages/3f/cc/b07000438a29ac5cfb2194bfc128151d52f333cee74dd7dfe3fb733fc16c/greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa", size = 1142073, upload-time = "2025-08-07T13:18:21.737Z" },
{ url = "https://files.pythonhosted.org/packages/67/24/28a5b2fa42d12b3d7e5614145f0bd89714c34c08be6aabe39c14dd52db34/greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c", size = 1548385, upload-time = "2025-11-04T12:42:11.067Z" },
{ url = "https://files.pythonhosted.org/packages/6a/05/03f2f0bdd0b0ff9a4f7b99333d57b53a7709c27723ec8123056b084e69cd/greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5", size = 1613329, upload-time = "2025-11-04T12:42:12.928Z" },
{ url = "https://files.pythonhosted.org/packages/d8/0f/30aef242fcab550b0b3520b8e3561156857c94288f0332a79928c31a52cf/greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9", size = 299100, upload-time = "2025-08-07T13:44:12.287Z" }, { url = "https://files.pythonhosted.org/packages/d8/0f/30aef242fcab550b0b3520b8e3561156857c94288f0332a79928c31a52cf/greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9", size = 299100, upload-time = "2025-08-07T13:44:12.287Z" },
{ url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" },
{ url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" },
@@ -868,6 +929,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" },
{ url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" },
{ url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" }, { url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" },
{ url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" },
{ url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" },
{ url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" }, { url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" },
{ url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" },
{ url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" },
@@ -877,6 +940,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" },
{ url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" },
{ url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" }, { url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" },
{ url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" },
{ url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" },
{ url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" }, { url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" },
{ url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" },
{ url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" },
@@ -884,6 +949,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" },
{ url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" },
{ url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" },
{ url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" },
{ url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760, upload-time = "2025-11-04T12:42:25.341Z" },
{ url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" }, { url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" },
] ]
@@ -1061,6 +1128,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
] ]
[[package]]
name = "humanfriendly"
version = "10.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyreadline3", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/cc/3f/2c29224acb2e2df4d2046e4c73ee2662023c58ff5b113c4c1adac0886c43/humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc", size = 360702, upload-time = "2021-09-17T21:40:43.31Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", size = 86794, upload-time = "2021-09-17T21:40:39.897Z" },
]
[[package]]
name = "idna"
version = "3.11"
@@ -1386,6 +1465,38 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6c/77/d7f491cbc05303ac6801651aabeb262d43f319288c1ea96c66b1d2692ff3/lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e", size = 3518768, upload-time = "2025-09-22T04:04:57.097Z" }, { url = "https://files.pythonhosted.org/packages/6c/77/d7f491cbc05303ac6801651aabeb262d43f319288c1ea96c66b1d2692ff3/lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e", size = 3518768, upload-time = "2025-09-22T04:04:57.097Z" },
] ]
[[package]]
name = "magika"
version = "0.6.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "onnxruntime", version = "1.20.1", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "onnxruntime", version = "1.23.2", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "python-dotenv" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a3/f3/3d1dcdd7b9c41d589f5cff252d32ed91cdf86ba84391cfc81d9d8773571d/magika-0.6.3.tar.gz", hash = "sha256:7cc52aa7359af861957043e2bf7265ed4741067251c104532765cd668c0c0cb1", size = 3042784, upload-time = "2025-10-30T15:22:34.499Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a2/e4/35c323beb3280482c94299d61626116856ac2d4ec16ecef50afc4fdd4291/magika-0.6.3-py3-none-any.whl", hash = "sha256:eda443d08006ee495e02083b32e51b98cb3696ab595a7d13900d8e2ef506ec9d", size = 2969474, upload-time = "2025-10-30T15:22:25.298Z" },
{ url = "https://files.pythonhosted.org/packages/25/8f/132b0d7cd51c02c39fd52658a5896276c30c8cc2fd453270b19db8c40f7e/magika-0.6.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:86901e64b05dde5faff408c9b8245495b2e1fd4c226e3393d3d2a3fee65c504b", size = 13358841, upload-time = "2025-10-30T15:22:27.413Z" },
{ url = "https://files.pythonhosted.org/packages/c4/03/5ed859be502903a68b7b393b17ae0283bf34195cfcca79ce2dc25b9290e7/magika-0.6.3-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:3d9661eedbdf445ac9567e97e7ceefb93545d77a6a32858139ea966b5806fb64", size = 15367335, upload-time = "2025-10-30T15:22:29.907Z" },
{ url = "https://files.pythonhosted.org/packages/7b/9e/f8ee7d644affa3b80efdd623a3d75865c8f058f3950cb87fb0c48e3559bc/magika-0.6.3-py3-none-win_amd64.whl", hash = "sha256:e57f75674447b20cab4db928ae58ab264d7d8582b55183a0b876711c2b2787f3", size = 12692831, upload-time = "2025-10-30T15:22:32.063Z" },
]
[[package]]
name = "mammoth"
version = "1.11.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "cobble" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ed/3c/a58418d2af00f2da60d4a51e18cd0311307b72d48d2fffec36a97b4a5e44/mammoth-1.11.0.tar.gz", hash = "sha256:a0f59e442f34d5b6447f4b0999306cbf3e67aaabfa8cb516f878fb1456744637", size = 53142, upload-time = "2025-09-19T10:35:20.373Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ca/54/2e39566a131b13f6d8d193f974cb6a34e81bb7cc2fa6f7e03de067b36588/mammoth-1.11.0-py2.py3-none-any.whl", hash = "sha256:c077ab0d450bd7c0c6ecd529a23bf7e0fa8190c929e28998308ff4eada3f063b", size = 54752, upload-time = "2025-09-19T10:35:18.699Z" },
]
[[package]]
name = "markdown"
version = "3.10"
@@ -1408,6 +1519,41 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6a/e2/7af643acb4cae0741dffffaa7f3f7c9e7ab4046724543ba1777c401d821c/markdownify-1.2.0-py3-none-any.whl", hash = "sha256:48e150a1c4993d4d50f282f725c0111bd9eb25645d41fa2f543708fd44161351", size = 15561, upload-time = "2025-08-09T17:44:14.074Z" }, { url = "https://files.pythonhosted.org/packages/6a/e2/7af643acb4cae0741dffffaa7f3f7c9e7ab4046724543ba1777c401d821c/markdownify-1.2.0-py3-none-any.whl", hash = "sha256:48e150a1c4993d4d50f282f725c0111bd9eb25645d41fa2f543708fd44161351", size = 15561, upload-time = "2025-08-09T17:44:14.074Z" },
] ]
[[package]]
name = "markitdown"
version = "0.1.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "charset-normalizer" },
{ name = "defusedxml" },
{ name = "magika" },
{ name = "markdownify" },
{ name = "onnxruntime", version = "1.20.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'win32'" },
{ name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/87/31/90cef2bc8ecd85c200ed3b3d1e20fc7a724213502685c4b05b5431e02668/markitdown-0.1.3.tar.gz", hash = "sha256:b0d9127c3373a68274dede6af6c9bb0684b78ce364c727c4c304da97a20d6fd9", size = 40039, upload-time = "2025-08-26T22:37:04.4Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/97/83/7b47d2ecbf58650a03aeeb21ba2d59175f202bf4fb81d44f40f1deb82bc0/markitdown-0.1.3-py3-none-any.whl", hash = "sha256:08d9a25770979d78f60dcc0afcb868de6799608e4db65342b2e03304fb091251", size = 58391, upload-time = "2025-08-26T22:37:02.924Z" },
]
[package.optional-dependencies]
docx = [
{ name = "lxml" },
{ name = "mammoth" },
]
pdf = [
{ name = "pdfminer-six" },
]
xls = [
{ name = "pandas" },
{ name = "xlrd" },
]
xlsx = [
{ name = "openpyxl" },
{ name = "pandas" },
]
[[package]]
name = "minio"
version = "7.2.18"
@@ -1433,6 +1579,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3f/c2/bd0fc48dc323035cd65d21c45a3bb5c7b054001bf4ea75e461beb7656e07/mistletoe-1.5.0-py3-none-any.whl", hash = "sha256:d4e77b991b998c5efe3c4eab4e1b472263b5349688acd50d79bd9a6c317a9df1", size = 55262, upload-time = "2025-10-18T16:37:08.376Z" }, { url = "https://files.pythonhosted.org/packages/3f/c2/bd0fc48dc323035cd65d21c45a3bb5c7b054001bf4ea75e461beb7656e07/mistletoe-1.5.0-py3-none-any.whl", hash = "sha256:d4e77b991b998c5efe3c4eab4e1b472263b5349688acd50d79bd9a6c317a9df1", size = 55262, upload-time = "2025-10-18T16:37:08.376Z" },
] ]
[[package]]
name = "mpmath"
version = "1.3.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" },
]
[[package]]
name = "networkx"
version = "3.4.2"
@@ -1440,7 +1595,8 @@
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
    "python_full_version < '3.11' and sys_platform == 'darwin'",
    "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version < '3.11' and sys_platform == 'win32'",
+    "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" }
wheels = [
@@ -1456,14 +1612,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'", "python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'", "python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'", "python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
] ]
sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" } sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" }
wheels = [ wheels = [
@@ -1492,7 +1652,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" }
wheels = [
@@ -1561,14 +1722,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'", "python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'", "python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'", "python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
] ]
sdist = { url = "https://files.pythonhosted.org/packages/b5/f4/098d2270d52b41f1bd7db9fc288aaa0400cb48c2a3e2af6fa365d9720947/numpy-2.3.4.tar.gz", hash = "sha256:a7d018bfedb375a8d979ac758b120ba846a7fe764911a64465fd87b8729f4a6a", size = 20582187, upload-time = "2025-10-15T16:18:11.77Z" } sdist = { url = "https://files.pythonhosted.org/packages/b5/f4/098d2270d52b41f1bd7db9fc288aaa0400cb48c2a3e2af6fa365d9720947/numpy-2.3.4.tar.gz", hash = "sha256:a7d018bfedb375a8d979ac758b120ba846a7fe764911a64465fd87b8729f4a6a", size = 20582187, upload-time = "2025-10-15T16:18:11.77Z" }
wheels = [ wheels = [
@@ -1660,6 +1825,97 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b5/c1/edc9f41b425ca40b26b7c104c5f6841a4537bb2552bfa6ca66e81405bb95/ollama-0.6.0-py3-none-any.whl", hash = "sha256:534511b3ccea2dff419ae06c3b58d7f217c55be7897c8ce5868dfb6b219cf7a0", size = 14130, upload-time = "2025-09-24T22:46:01.19Z" }, { url = "https://files.pythonhosted.org/packages/b5/c1/edc9f41b425ca40b26b7c104c5f6841a4537bb2552bfa6ca66e81405bb95/ollama-0.6.0-py3-none-any.whl", hash = "sha256:534511b3ccea2dff419ae06c3b58d7f217c55be7897c8ce5868dfb6b219cf7a0", size = 14130, upload-time = "2025-09-24T22:46:01.19Z" },
] ]
[[package]]
name = "onnxruntime"
version = "1.20.1"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "coloredlogs", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "flatbuffers", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "packaging", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "protobuf", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
{ name = "sympy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/6d/c6/c4c0860bee2fde6037bdd9dcd12d323f6e38cf00fcc9a5065b394337fc55/onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7b2908b50101a19e99c4d4e97ebb9905561daf61829403061c1adc1b588bc0de", size = 11954010, upload-time = "2024-11-21T00:48:35.254Z" },
{ url = "https://files.pythonhosted.org/packages/63/47/3dc0b075ab539f16b3d8b09df6b504f51836086ee709690a6278d791737d/onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d82daaec24045a2e87598b8ac2b417b1cce623244e80e663882e9fe1aae86410", size = 13330452, upload-time = "2024-11-21T00:48:40.02Z" },
{ url = "https://files.pythonhosted.org/packages/27/ef/80fab86289ecc01a734b7ddf115dfb93d8b2e004bd1e1977e12881c72b12/onnxruntime-1.20.1-cp310-cp310-win32.whl", hash = "sha256:4c4b251a725a3b8cf2aab284f7d940c26094ecd9d442f07dd81ab5470e99b83f", size = 9813849, upload-time = "2024-11-21T00:48:43.569Z" },
{ url = "https://files.pythonhosted.org/packages/a9/e6/33ab10066c9875a29d55e66ae97c3bf91b9b9b987179455d67c32261a49c/onnxruntime-1.20.1-cp310-cp310-win_amd64.whl", hash = "sha256:d3b616bb53a77a9463707bb313637223380fc327f5064c9a782e8ec69c22e6a2", size = 11329702, upload-time = "2024-11-21T00:48:46.599Z" },
{ url = "https://files.pythonhosted.org/packages/a5/da/c44bf9bd66cd6d9018a921f053f28d819445c4d84b4dd4777271b0fe52a2/onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f6243e34d74423bdd1edf0ae9596dd61023b260f546ee17d701723915f06a9f7", size = 11955227, upload-time = "2024-11-21T00:48:54.556Z" },
{ url = "https://files.pythonhosted.org/packages/11/ac/4120dfb74c8e45cce1c664fc7f7ce010edd587ba67ac41489f7432eb9381/onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5eec64c0269dcdb8d9a9a53dc4d64f87b9e0c19801d9321246a53b7eb5a7d1bc", size = 13331703, upload-time = "2024-11-21T00:48:57.97Z" },
{ url = "https://files.pythonhosted.org/packages/12/f1/cefacac137f7bb7bfba57c50c478150fcd3c54aca72762ac2c05ce0532c1/onnxruntime-1.20.1-cp311-cp311-win32.whl", hash = "sha256:a19bc6e8c70e2485a1725b3d517a2319603acc14c1f1a017dda0afe6d4665b41", size = 9813977, upload-time = "2024-11-21T00:49:00.519Z" },
{ url = "https://files.pythonhosted.org/packages/2c/2d/2d4d202c0bcfb3a4cc2b171abb9328672d7f91d7af9ea52572722c6d8d96/onnxruntime-1.20.1-cp311-cp311-win_amd64.whl", hash = "sha256:8508887eb1c5f9537a4071768723ec7c30c28eb2518a00d0adcd32c89dea3221", size = 11329895, upload-time = "2024-11-21T00:49:03.845Z" },
{ url = "https://files.pythonhosted.org/packages/c5/9d/a42a84e10f1744dd27c6f2f9280cc3fb98f869dd19b7cd042e391ee2ab61/onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1f56e898815963d6dc4ee1c35fc6c36506466eff6d16f3cb9848cea4e8c8172", size = 11952833, upload-time = "2024-11-21T00:49:10.563Z" },
{ url = "https://files.pythonhosted.org/packages/47/42/2f71f5680834688a9c81becbe5c5bb996fd33eaed5c66ae0606c3b1d6a02/onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bb71a814f66517a65628c9e4a2bb530a6edd2cd5d87ffa0af0f6f773a027d99e", size = 13333903, upload-time = "2024-11-21T00:49:12.984Z" },
{ url = "https://files.pythonhosted.org/packages/c8/f1/aabfdf91d013320aa2fc46cf43c88ca0182860ff15df872b4552254a9680/onnxruntime-1.20.1-cp312-cp312-win32.whl", hash = "sha256:bd386cc9ee5f686ee8a75ba74037750aca55183085bf1941da8efcfe12d5b120", size = 9814562, upload-time = "2024-11-21T00:49:15.453Z" },
{ url = "https://files.pythonhosted.org/packages/dd/80/76979e0b744307d488c79e41051117634b956612cc731f1028eb17ee7294/onnxruntime-1.20.1-cp312-cp312-win_amd64.whl", hash = "sha256:19c2d843eb074f385e8bbb753a40df780511061a63f9def1b216bf53860223fb", size = 11331482, upload-time = "2024-11-21T00:49:19.412Z" },
{ url = "https://files.pythonhosted.org/packages/81/0d/13bbd9489be2a6944f4a940084bfe388f1100472f38c07080a46fbd4ab96/onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fb44b08e017a648924dbe91b82d89b0c105b1adcfe31e90d1dc06b8677ad37be", size = 11951459, upload-time = "2024-11-21T00:49:26.269Z" },
{ url = "https://files.pythonhosted.org/packages/c0/ea/4454ae122874fd52bbb8a961262de81c5f932edeb1b72217f594c700d6ef/onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bda6aebdf7917c1d811f21d41633df00c58aff2bef2f598f69289c1f1dabc4b3", size = 13331620, upload-time = "2024-11-21T00:49:28.875Z" },
{ url = "https://files.pythonhosted.org/packages/d8/e0/50db43188ca1c945decaa8fc2a024c33446d31afed40149897d4f9de505f/onnxruntime-1.20.1-cp313-cp313-win_amd64.whl", hash = "sha256:d30367df7e70f1d9fc5a6a68106f5961686d39b54d3221f760085524e8d38e16", size = 11331758, upload-time = "2024-11-21T00:49:31.417Z" },
{ url = "https://files.pythonhosted.org/packages/d8/55/3821c5fd60b52a6c82a00bba18531793c93c4addfe64fbf061e235c5617a/onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9158465745423b2b5d97ed25aa7740c7d38d2993ee2e5c3bfacb0c4145c49d8", size = 11950342, upload-time = "2024-11-21T00:49:34.164Z" },
{ url = "https://files.pythonhosted.org/packages/14/56/fd990ca222cef4f9f4a9400567b9a15b220dee2eafffb16b2adbc55c8281/onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0df6f2df83d61f46e842dbcde610ede27218947c33e994545a22333491e72a3b", size = 13337040, upload-time = "2024-11-21T00:49:37.271Z" },
]
[[package]]
name = "onnxruntime"
version = "1.23.2"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version >= '3.14' and sys_platform == 'darwin'",
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
]
dependencies = [
{ name = "coloredlogs", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "flatbuffers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform == 'darwin')" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.11' and sys_platform == 'darwin')" },
{ name = "packaging", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "protobuf", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "sympy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/35/d6/311b1afea060015b56c742f3531168c1644650767f27ef40062569960587/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:a7730122afe186a784660f6ec5807138bf9d792fa1df76556b27307ea9ebcbe3", size = 17195934, upload-time = "2025-10-27T23:06:14.143Z" },
{ url = "https://files.pythonhosted.org/packages/db/db/81bf3d7cecfbfed9092b6b4052e857a769d62ed90561b410014e0aae18db/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:b28740f4ecef1738ea8f807461dd541b8287d5650b5be33bca7b474e3cbd1f36", size = 19153079, upload-time = "2025-10-27T23:05:57.686Z" },
{ url = "https://files.pythonhosted.org/packages/2e/4d/a382452b17cf70a2313153c520ea4c96ab670c996cb3a95cc5d5ac7bfdac/onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8f7d1fe034090a1e371b7f3ca9d3ccae2fabae8c1d8844fb7371d1ea38e8e8d2", size = 15219883, upload-time = "2025-10-22T03:46:21.66Z" },
{ url = "https://files.pythonhosted.org/packages/fb/56/179bf90679984c85b417664c26aae4f427cba7514bd2d65c43b181b7b08b/onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ca88747e708e5c67337b0f65eed4b7d0dd70d22ac332038c9fc4635760018f7", size = 17370357, upload-time = "2025-10-22T03:46:57.968Z" },
{ url = "https://files.pythonhosted.org/packages/44/be/467b00f09061572f022ffd17e49e49e5a7a789056bad95b54dfd3bee73ff/onnxruntime-1.23.2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:6f91d2c9b0965e86827a5ba01531d5b669770b01775b23199565d6c1f136616c", size = 17196113, upload-time = "2025-10-22T03:47:33.526Z" },
{ url = "https://files.pythonhosted.org/packages/9f/a8/3c23a8f75f93122d2b3410bfb74d06d0f8da4ac663185f91866b03f7da1b/onnxruntime-1.23.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:87d8b6eaf0fbeb6835a60a4265fde7a3b60157cf1b2764773ac47237b4d48612", size = 19153857, upload-time = "2025-10-22T03:46:37.578Z" },
{ url = "https://files.pythonhosted.org/packages/3f/d8/506eed9af03d86f8db4880a4c47cd0dffee973ef7e4f4cff9f1d4bcf7d22/onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bbfd2fca76c855317568c1b36a885ddea2272c13cb0e395002c402f2360429a6", size = 15220095, upload-time = "2025-10-22T03:46:24.769Z" },
{ url = "https://files.pythonhosted.org/packages/e9/80/113381ba832d5e777accedc6cb41d10f9eca82321ae31ebb6bcede530cea/onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da44b99206e77734c5819aa2142c69e64f3b46edc3bd314f6a45a932defc0b3e", size = 17372080, upload-time = "2025-10-22T03:47:00.265Z" },
{ url = "https://files.pythonhosted.org/packages/1b/9e/f748cd64161213adeef83d0cb16cb8ace1e62fa501033acdd9f9341fff57/onnxruntime-1.23.2-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:b8f029a6b98d3cf5be564d52802bb50a8489ab73409fa9db0bf583eabb7c2321", size = 17195929, upload-time = "2025-10-22T03:47:36.24Z" },
{ url = "https://files.pythonhosted.org/packages/91/9d/a81aafd899b900101988ead7fb14974c8a58695338ab6a0f3d6b0100f30b/onnxruntime-1.23.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:218295a8acae83905f6f1aed8cacb8e3eb3bd7513a13fe4ba3b2664a19fc4a6b", size = 19157705, upload-time = "2025-10-22T03:46:40.415Z" },
{ url = "https://files.pythonhosted.org/packages/3c/35/4e40f2fba272a6698d62be2cd21ddc3675edfc1a4b9ddefcc4648f115315/onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76ff670550dc23e58ea9bc53b5149b99a44e63b34b524f7b8547469aaa0dcb8c", size = 15226915, upload-time = "2025-10-22T03:46:27.773Z" },
{ url = "https://files.pythonhosted.org/packages/ef/88/9cc25d2bafe6bc0d4d3c1db3ade98196d5b355c0b273e6a5dc09c5d5d0d5/onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f9b4ae77f8e3c9bee50c27bc1beede83f786fe1d52e99ac85aa8d65a01e9b77", size = 17382649, upload-time = "2025-10-22T03:47:02.782Z" },
{ url = "https://files.pythonhosted.org/packages/3d/41/fba0cabccecefe4a1b5fc8020c44febb334637f133acefc7ec492029dd2c/onnxruntime-1.23.2-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:2ff531ad8496281b4297f32b83b01cdd719617e2351ffe0dba5684fb283afa1f", size = 17196337, upload-time = "2025-10-22T03:46:35.168Z" },
{ url = "https://files.pythonhosted.org/packages/fe/f9/2d49ca491c6a986acce9f1d1d5fc2099108958cc1710c28e89a032c9cfe9/onnxruntime-1.23.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:162f4ca894ec3de1a6fd53589e511e06ecdc3ff646849b62a9da7489dee9ce95", size = 19157691, upload-time = "2025-10-22T03:46:43.518Z" },
{ url = "https://files.pythonhosted.org/packages/1c/a1/428ee29c6eaf09a6f6be56f836213f104618fb35ac6cc586ff0f477263eb/onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45d127d6e1e9b99d1ebeae9bcd8f98617a812f53f46699eafeb976275744826b", size = 15226898, upload-time = "2025-10-22T03:46:30.039Z" },
{ url = "https://files.pythonhosted.org/packages/f2/2b/b57c8a2466a3126dbe0a792f56ad7290949b02f47b86216cd47d857e4b77/onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8bace4e0d46480fbeeb7bbe1ffe1f080e6663a42d1086ff95c1551f2d39e7872", size = 17382518, upload-time = "2025-10-22T03:47:05.407Z" },
{ url = "https://files.pythonhosted.org/packages/7c/3d/6830fa61c69ca8e905f237001dbfc01689a4e4ab06147020a4518318881f/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9d2385e774f46ac38f02b3a91a91e30263d41b2f1f4f26ae34805b2a9ddef466", size = 15229610, upload-time = "2025-10-22T03:46:32.239Z" },
{ url = "https://files.pythonhosted.org/packages/b6/ca/862b1e7a639460f0ca25fd5b6135fb42cf9deea86d398a92e44dfda2279d/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e2b9233c4947907fd1818d0e581c049c41ccc39b2856cc942ff6d26317cee145", size = 17394184, upload-time = "2025-10-22T03:47:08.127Z" },
]
[[package]]
name = "openai"
version = "2.7.1"
@@ -1733,6 +1989,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/86/8a/69176a64335aed183529207ba8bc3d329c2999d852b4f3818027203f50e6/opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca", size = 39402386, upload-time = "2025-01-16T13:52:56.418Z" }, { url = "https://files.pythonhosted.org/packages/86/8a/69176a64335aed183529207ba8bc3d329c2999d852b4f3818027203f50e6/opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca", size = 39402386, upload-time = "2025-01-16T13:52:56.418Z" },
] ]
[[package]]
name = "openpyxl"
version = "3.1.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "et-xmlfile" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" },
]
[[package]]
name = "opt-einsum"
version = "3.3.0"
@@ -1821,6 +2089,68 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/a3/6f/042d73453ad01a2e3dd4adeae4870e25679d9eaa340d70bd54cd409cb982/paddlepaddle-3.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:d16fd322246c4a580ca512971840ac8c16126a77d59a7bceee165d8f2196a6b2", size = 101708504, upload-time = "2025-10-30T13:21:18.125Z" }, { url = "https://files.pythonhosted.org/packages/a3/6f/042d73453ad01a2e3dd4adeae4870e25679d9eaa340d70bd54cd409cb982/paddlepaddle-3.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:d16fd322246c4a580ca512971840ac8c16126a77d59a7bceee165d8f2196a6b2", size = 101708504, upload-time = "2025-10-30T13:21:18.125Z" },
] ]
[[package]]
name = "pandas"
version = "2.3.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "python-dateutil" },
{ name = "pytz" },
{ name = "tzdata" },
]
sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/3d/f7/f425a00df4fcc22b292c6895c6831c0c8ae1d9fac1e024d16f98a9ce8749/pandas-2.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c", size = 11555763, upload-time = "2025-09-29T23:16:53.287Z" },
{ url = "https://files.pythonhosted.org/packages/13/4f/66d99628ff8ce7857aca52fed8f0066ce209f96be2fede6cef9f84e8d04f/pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a", size = 10801217, upload-time = "2025-09-29T23:17:04.522Z" },
{ url = "https://files.pythonhosted.org/packages/1d/03/3fc4a529a7710f890a239cc496fc6d50ad4a0995657dccc1d64695adb9f4/pandas-2.3.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5caf26f64126b6c7aec964f74266f435afef1c1b13da3b0636c7518a1fa3e2b1", size = 12148791, upload-time = "2025-09-29T23:17:18.444Z" },
{ url = "https://files.pythonhosted.org/packages/40/a8/4dac1f8f8235e5d25b9955d02ff6f29396191d4e665d71122c3722ca83c5/pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd7478f1463441ae4ca7308a70e90b33470fa593429f9d4c578dd00d1fa78838", size = 12769373, upload-time = "2025-09-29T23:17:35.846Z" },
{ url = "https://files.pythonhosted.org/packages/df/91/82cc5169b6b25440a7fc0ef3a694582418d875c8e3ebf796a6d6470aa578/pandas-2.3.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4793891684806ae50d1288c9bae9330293ab4e083ccd1c5e383c34549c6e4250", size = 13200444, upload-time = "2025-09-29T23:17:49.341Z" },
{ url = "https://files.pythonhosted.org/packages/10/ae/89b3283800ab58f7af2952704078555fa60c807fff764395bb57ea0b0dbd/pandas-2.3.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:28083c648d9a99a5dd035ec125d42439c6c1c525098c58af0fc38dd1a7a1b3d4", size = 13858459, upload-time = "2025-09-29T23:18:03.722Z" },
{ url = "https://files.pythonhosted.org/packages/85/72/530900610650f54a35a19476eca5104f38555afccda1aa11a92ee14cb21d/pandas-2.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:503cf027cf9940d2ceaa1a93cfb5f8c8c7e6e90720a2850378f0b3f3b1e06826", size = 11346086, upload-time = "2025-09-29T23:18:18.505Z" },
{ url = "https://files.pythonhosted.org/packages/c1/fa/7ac648108144a095b4fb6aa3de1954689f7af60a14cf25583f4960ecb878/pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523", size = 11578790, upload-time = "2025-09-29T23:18:30.065Z" },
{ url = "https://files.pythonhosted.org/packages/9b/35/74442388c6cf008882d4d4bdfc4109be87e9b8b7ccd097ad1e7f006e2e95/pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45", size = 10833831, upload-time = "2025-09-29T23:38:56.071Z" },
{ url = "https://files.pythonhosted.org/packages/fe/e4/de154cbfeee13383ad58d23017da99390b91d73f8c11856f2095e813201b/pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66", size = 12199267, upload-time = "2025-09-29T23:18:41.627Z" },
{ url = "https://files.pythonhosted.org/packages/bf/c9/63f8d545568d9ab91476b1818b4741f521646cbdd151c6efebf40d6de6f7/pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b", size = 12789281, upload-time = "2025-09-29T23:18:56.834Z" },
{ url = "https://files.pythonhosted.org/packages/f2/00/a5ac8c7a0e67fd1a6059e40aa08fa1c52cc00709077d2300e210c3ce0322/pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791", size = 13240453, upload-time = "2025-09-29T23:19:09.247Z" },
{ url = "https://files.pythonhosted.org/packages/27/4d/5c23a5bc7bd209231618dd9e606ce076272c9bc4f12023a70e03a86b4067/pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151", size = 13890361, upload-time = "2025-09-29T23:19:25.342Z" },
{ url = "https://files.pythonhosted.org/packages/8e/59/712db1d7040520de7a4965df15b774348980e6df45c129b8c64d0dbe74ef/pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c", size = 11348702, upload-time = "2025-09-29T23:19:38.296Z" },
{ url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" },
{ url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" },
{ url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" },
{ url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" },
{ url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" },
{ url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" },
{ url = "https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" },
{ url = "https://files.pythonhosted.org/packages/cd/4b/18b035ee18f97c1040d94debd8f2e737000ad70ccc8f5513f4eefad75f4b/pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713", size = 11544671, upload-time = "2025-09-29T23:21:05.024Z" },
{ url = "https://files.pythonhosted.org/packages/31/94/72fac03573102779920099bcac1c3b05975c2cb5f01eac609faf34bed1ca/pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8", size = 10680807, upload-time = "2025-09-29T23:21:15.979Z" },
{ url = "https://files.pythonhosted.org/packages/16/87/9472cf4a487d848476865321de18cc8c920b8cab98453ab79dbbc98db63a/pandas-2.3.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d", size = 11709872, upload-time = "2025-09-29T23:21:27.165Z" },
{ url = "https://files.pythonhosted.org/packages/15/07/284f757f63f8a8d69ed4472bfd85122bd086e637bf4ed09de572d575a693/pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac", size = 12306371, upload-time = "2025-09-29T23:21:40.532Z" },
{ url = "https://files.pythonhosted.org/packages/33/81/a3afc88fca4aa925804a27d2676d22dcd2031c2ebe08aabd0ae55b9ff282/pandas-2.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c", size = 12765333, upload-time = "2025-09-29T23:21:55.77Z" },
{ url = "https://files.pythonhosted.org/packages/8d/0f/b4d4ae743a83742f1153464cf1a8ecfafc3ac59722a0b5c8602310cb7158/pandas-2.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493", size = 13418120, upload-time = "2025-09-29T23:22:10.109Z" },
{ url = "https://files.pythonhosted.org/packages/4f/c7/e54682c96a895d0c808453269e0b5928a07a127a15704fedb643e9b0a4c8/pandas-2.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee", size = 10993991, upload-time = "2025-09-29T23:25:04.889Z" },
{ url = "https://files.pythonhosted.org/packages/f9/ca/3f8d4f49740799189e1395812f3bf23b5e8fc7c190827d55a610da72ce55/pandas-2.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5", size = 12048227, upload-time = "2025-09-29T23:22:24.343Z" },
{ url = "https://files.pythonhosted.org/packages/0e/5a/f43efec3e8c0cc92c4663ccad372dbdff72b60bdb56b2749f04aa1d07d7e/pandas-2.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21", size = 11411056, upload-time = "2025-09-29T23:22:37.762Z" },
{ url = "https://files.pythonhosted.org/packages/46/b1/85331edfc591208c9d1a63a06baa67b21d332e63b7a591a5ba42a10bb507/pandas-2.3.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78", size = 11645189, upload-time = "2025-09-29T23:22:51.688Z" },
{ url = "https://files.pythonhosted.org/packages/44/23/78d645adc35d94d1ac4f2a3c4112ab6f5b8999f4898b8cdf01252f8df4a9/pandas-2.3.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110", size = 12121912, upload-time = "2025-09-29T23:23:05.042Z" },
{ url = "https://files.pythonhosted.org/packages/53/da/d10013df5e6aaef6b425aa0c32e1fc1f3e431e4bcabd420517dceadce354/pandas-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86", size = 12712160, upload-time = "2025-09-29T23:23:28.57Z" },
{ url = "https://files.pythonhosted.org/packages/bd/17/e756653095a083d8a37cbd816cb87148debcfcd920129b25f99dd8d04271/pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc", size = 13199233, upload-time = "2025-09-29T23:24:24.876Z" },
{ url = "https://files.pythonhosted.org/packages/04/fd/74903979833db8390b73b3a8a7d30d146d710bd32703724dd9083950386f/pandas-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0", size = 11540635, upload-time = "2025-09-29T23:25:52.486Z" },
{ url = "https://files.pythonhosted.org/packages/21/00/266d6b357ad5e6d3ad55093a7e8efc7dd245f5a842b584db9f30b0f0a287/pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593", size = 10759079, upload-time = "2025-09-29T23:26:33.204Z" },
{ url = "https://files.pythonhosted.org/packages/ca/05/d01ef80a7a3a12b2f8bbf16daba1e17c98a2f039cbc8e2f77a2c5a63d382/pandas-2.3.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c", size = 11814049, upload-time = "2025-09-29T23:27:15.384Z" },
{ url = "https://files.pythonhosted.org/packages/15/b2/0e62f78c0c5ba7e3d2c5945a82456f4fac76c480940f805e0b97fcbc2f65/pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b", size = 12332638, upload-time = "2025-09-29T23:27:51.625Z" },
{ url = "https://files.pythonhosted.org/packages/c5/33/dd70400631b62b9b29c3c93d2feee1d0964dc2bae2e5ad7a6c73a7f25325/pandas-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6", size = 12886834, upload-time = "2025-09-29T23:28:21.289Z" },
{ url = "https://files.pythonhosted.org/packages/d3/18/b5d48f55821228d0d2692b34fd5034bb185e854bdb592e9c640f6290e012/pandas-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3", size = 13409925, upload-time = "2025-09-29T23:28:58.261Z" },
{ url = "https://files.pythonhosted.org/packages/a6/3d/124ac75fcd0ecc09b8fdccb0246ef65e35b012030defb0e0eba2cbbbe948/pandas-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5", size = 11109071, upload-time = "2025-09-29T23:32:27.484Z" },
{ url = "https://files.pythonhosted.org/packages/89/9c/0e21c895c38a157e0faa1fb64587a9226d6dd46452cac4532d80c3c4a244/pandas-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec", size = 12048504, upload-time = "2025-09-29T23:29:31.47Z" },
{ url = "https://files.pythonhosted.org/packages/d7/82/b69a1c95df796858777b68fbe6a81d37443a33319761d7c652ce77797475/pandas-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7", size = 11410702, upload-time = "2025-09-29T23:29:54.591Z" },
{ url = "https://files.pythonhosted.org/packages/f9/88/702bde3ba0a94b8c73a0181e05144b10f13f29ebfc2150c3a79062a8195d/pandas-2.3.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450", size = 11634535, upload-time = "2025-09-29T23:30:21.003Z" },
{ url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" },
{ url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" },
{ url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" },
]
[[package]]
name = "pdfminer-six"
version = "20250506"
@@ -2266,6 +2596,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ae/43/2b0607ef7f16d63fbe00de728151a090397ef5b3b9147b4aefe975d17106/pypdfium2-5.0.0-py3-none-win_arm64.whl", hash = "sha256:0a2a473fe95802e7a5f4140f25e5cd036cf17f060f27ee2d28c3977206add763", size = 2939015, upload-time = "2025-10-26T13:31:40.531Z" }, { url = "https://files.pythonhosted.org/packages/ae/43/2b0607ef7f16d63fbe00de728151a090397ef5b3b9147b4aefe975d17106/pypdfium2-5.0.0-py3-none-win_arm64.whl", hash = "sha256:0a2a473fe95802e7a5f4140f25e5cd036cf17f060f27ee2d28c3977206add763", size = 2939015, upload-time = "2025-10-26T13:31:40.531Z" },
] ]
[[package]]
name = "pyreadline3"
version = "3.5.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/0f/49/4cea918a08f02817aabae639e3d0ac046fef9f9180518a3ad394e22da148/pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7", size = 99839, upload-time = "2024-09-19T02:40:10.062Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5a/dc/491b7661614ab97483abf2056be1deee4dc2490ecbf7bff9ab5cdbac86e1/pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6", size = 83178, upload-time = "2024-09-19T02:40:08.598Z" },
]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@@ -2291,6 +2630,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7", size = 252987, upload-time = "2025-06-16T20:46:22.506Z" }, { url = "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7", size = 252987, upload-time = "2025-06-16T20:46:22.506Z" },
] ]
[[package]]
name = "python-dotenv"
version = "1.2.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" },
]
[[package]]
name = "python-pptx"
version = "1.0.2"
@@ -2306,6 +2654,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" }, { url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" },
] ]
[[package]]
name = "pytz"
version = "2025.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" },
]
[[package]]
name = "pyyaml"
version = "6.0.3"
@@ -2654,7 +3011,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -2717,14 +3075,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'", "python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'", "python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'", "python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
] ]
dependencies = [ dependencies = [
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -3083,6 +3445,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/97/45/1097a0eb3ad2a1cb5a29d493aefffe4170e454b2726cfe2a76f6652e91af/stringzilla-4.2.3-cp313-cp313-win_arm64.whl", hash = "sha256:1000a8df547fb3b194def48cc3ed6494715fe1c8ac25c6444ed3cfb0b52942c7", size = 99815, upload-time = "2025-10-27T18:35:56.555Z" }, { url = "https://files.pythonhosted.org/packages/97/45/1097a0eb3ad2a1cb5a29d493aefffe4170e454b2726cfe2a76f6652e91af/stringzilla-4.2.3-cp313-cp313-win_arm64.whl", hash = "sha256:1000a8df547fb3b194def48cc3ed6494715fe1c8ac25c6444ed3cfb0b52942c7", size = 99815, upload-time = "2025-10-27T18:35:56.555Z" },
] ]
[[package]]
name = "sympy"
version = "1.14.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "mpmath" },
]
sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" },
]
[[package]]
name = "termcolor"
version = "3.2.0"
@@ -3116,7 +3490,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -3135,14 +3510,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'", "python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'", "python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'", "python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", "python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -3185,6 +3564,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
] ]
[[package]]
name = "tzdata"
version = "2025.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
]
[[package]]
name = "unidic-lite"
version = "1.0.8"