Mirror of https://github.com/Tencent/WeKnora.git (synced 2025-11-25 03:15:00 +08:00)
feat: add document model classes, adjust config and parsing logic, improve logging and imports
Remove logging setup and redundant code; improve imports, type hints, and OCR backend management. Switch module imports across files to absolute imports. Adjust import paths, remove some imports, and tidy logging and comments. Upgrade the document parser to Docx2Parser and improve timeout and image handling.
7 .gitignore vendored
@@ -24,17 +24,14 @@ node_modules/
tmp/
temp/

# Docker compose файл (локальные настройки)
# docker-compose.yml

WeKnora
/models/
**/__pycache__
test/data/mswag.txt
data/files/

.python-version
.venv/
**/__pycache__
.python-version

### macOS
# General

@@ -127,6 +127,7 @@ services:
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME:-}
- MINIO_USE_SSL=${MINIO_USE_SSL:-}
- WEB_PROXY=${WEB_PROXY:-}
- MINERU_ENDPOINT=${MINERU_ENDPOINT:-}
healthcheck:
test: ["CMD", "grpc_health_probe", "-addr=:50051"]
interval: 30s

@@ -80,10 +80,11 @@ RUN pip install uv --break-system-packages && \
python -m uv sync --locked --no-dev

# 复制源代码和生成脚本
COPY docreader .
COPY docreader docreader

# 生成 protobuf 代码
RUN chmod +x scripts/generate_proto.sh && bash scripts/generate_proto.sh
RUN chmod +x docreader/scripts/generate_proto.sh && \
bash docreader/scripts/generate_proto.sh

# 确保模型目录存在
RUN ls -la /root/.paddleocr/whl/

@@ -150,10 +151,11 @@ RUN python -m playwright install-deps webkit

# COPY docreader/scripts/download_deps.py download_deps.py
# RUN python -m download_deps

COPY --from=builder /app/ ./
COPY docreader/pyproject.toml docreader/uv.lock ./
COPY --from=builder /app/docreader docreader

# 暴露 gRPC 端口
EXPOSE 50051

# 直接运行 Python 服务(日志输出到 stdout/stderr)
CMD ["uv", "run", "main.py"]
CMD ["uv", "run", "-m", "docreader.main"]

5 docreader/.pylintrc Normal file
@@ -0,0 +1,5 @@
[LOGGING]
logging-format-style=fstr

[MESSAGES CONTROL]
; disable=W1203

@@ -1,37 +1,25 @@
import os
import sys
import logging
from concurrent import futures
import os
import re
import sys
import traceback
import grpc
import uuid
import atexit
from concurrent import futures
from typing import Optional

import grpc
from grpc_health.v1 import health_pb2_grpc
from grpc_health.v1.health import HealthServicer

# Add parent directory to Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
from docreader.models.read_config import ChunkingConfig
from docreader.parser import Parser
from docreader.parser.ocr_engine import OCREngine
from docreader.proto import docreader_pb2_grpc
from docreader.proto.docreader_pb2 import Chunk, Image, ReadResponse
from docreader.utils.request import init_logging_request_id, request_id_context

from proto.docreader_pb2 import ReadResponse, Chunk, Image
from proto import docreader_pb2_grpc
from parser import Parser, OCREngine
from parser.config import ChunkingConfig
from utils.request import request_id_context, init_logging_request_id

# --- Encoding utilities: sanitize strings to valid UTF-8 and (optionally) multi-encoding read ---
import re
from typing import Optional

try:
# Optional dependency for charset detection; install via `pip install charset-normalizer`
from charset_normalizer import from_bytes as _cn_from_bytes # type: ignore
except Exception: # pragma: no cover
_cn_from_bytes = None # type: ignore

# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values and cannot be encoded to UTF-8
# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values
# cannot be encoded to UTF-8
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]")

@@ -47,29 +35,6 @@ def to_valid_utf8_text(s: Optional[str]) -> str:
return s.encode("utf-8", errors="replace").decode("utf-8")


def read_text_with_fallback(file_path: str) -> str:
"""Read text from file supporting multiple encodings with graceful fallback.

This server currently receives bytes over gRPC and delegates decoding to the parser.
This helper is provided for future local-file reads if needed.
"""
with open(file_path, "rb") as f:
raw = f.read()
if _cn_from_bytes is not None:
try:
result = _cn_from_bytes(raw).best()
if result:
return str(result)
except Exception:
pass
for enc in ("utf-8", "gb18030", "latin-1"):
try:
return raw.decode(enc, errors="replace")
except UnicodeDecodeError:
continue
return raw.decode("utf-8", errors="replace")


# Ensure no existing handlers
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)

@@ -113,7 +78,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
request.file_type or os.path.splitext(request.file_name)[1][1:]
)
logger.info(
f"Received ReadFromFile request for file: {request.file_name}, type: {file_type}"
f"ReadFromFile for file: {request.file_name}, type: {file_type}"
)
logger.info(f"File content size: {len(request.file_content)} bytes")

@@ -124,8 +89,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
enable_multimodal = request.read_config.enable_multimodal or False

logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
f"multimodal={enable_multimodal}"
f"Using chunking config: size={chunk_size}, "
f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
)

# Get Storage and VLM config from request

@@ -144,7 +109,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
"path_prefix": sc.path_prefix,
}
logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
f"Using Storage config: provider={storage_config.get('provider')}, "
f"bucket={storage_config['bucket_name']}"
)

vlm_config = {

@@ -170,7 +136,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
)

# Parse file
logger.info(f"Starting file parsing process")
logger.info("Starting file parsing process")
result = self.parser.parse_file(
request.file_name, file_type, request.file_content, chunking_config
)

@@ -184,7 +150,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):

# Convert to protobuf message
logger.info(
f"Successfully parsed file {request.file_name}, returning {len(result.chunks)} chunks"
f"Parsed file {request.file_name}, with {len(result.chunks)} chunks"
)

# Build response, including image info

@@ -224,8 +190,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
enable_multimodal = request.read_config.enable_multimodal or False

logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
f"multimodal={enable_multimodal}"
f"Using chunking config: size={chunk_size}, "
f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
)

# Get Storage and VLM config from request

@@ -243,7 +209,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
"path_prefix": sc.path_prefix,
}
logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
f"Using Storage config: provider={storage_config.get('provider')}, "
f"bucket={storage_config['bucket_name']}"
)

vlm_config = {

@@ -269,7 +236,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
)

# Parse URL
logger.info(f"Starting URL parsing process")
logger.info("Starting URL parsing process")
result = self.parser.parse_url(
request.url, request.title, chunking_config
)

@@ -282,7 +249,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):

# Convert to protobuf message, including image info
logger.info(
f"Successfully parsed URL {request.url}, returning {len(result.chunks)} chunks"
f"Parsed URL {request.url}, returning {len(result.chunks)} chunks"
)

response = ReadResponse(

@@ -335,29 +302,15 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
return proto_chunk


def init_ocr_engine(ocr_backend, ocr_config):
def init_ocr_engine(ocr_backend: Optional[str] = None, **kwargs):
"""Initialize OCR engine"""
try:
logger.info(f"Initializing OCR engine with backend: {ocr_backend}")
ocr_engine = OCREngine.get_instance(backend_type=ocr_backend, **ocr_config)
if ocr_engine:
logger.info("OCR engine initialized successfully")
return True
else:
logger.error("OCR engine initialization failed")
return False
except Exception as e:
logger.error(f"Error initializing OCR engine: {str(e)}")
return False
backend_type = ocr_backend or os.getenv("OCR_BACKEND", "paddle")
logger.info(f"Initializing OCR engine with backend: {backend_type}")
OCREngine.get_instance(backend_type=backend_type, **kwargs)


def main():
init_ocr_engine(
os.getenv("OCR_BACKEND", "paddle"),
{
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
},
)
init_ocr_engine()

# Set max number of worker threads
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))

0 docreader/models/__init__.py Normal file
87 docreader/models/document.py Normal file
@@ -0,0 +1,87 @@
"""Chunk document schema."""

import json
from typing import Any, Dict, List

from pydantic import BaseModel, Field


class Chunk(BaseModel):
"""Document Chunk including chunk content, chunk metadata."""

content: str = Field(default="", description="chunk text content")
seq: int = Field(default=0, description="Chunk sequence number")
start: int = Field(default=0, description="Chunk start position")
end: int = Field(description="Chunk end position")
images: List[Dict[str, Any]] = Field(
default_factory=list, description="Images in the chunk"
)

metadata: Dict[str, Any] = Field(
default_factory=dict,
description="metadata fields",
)

def to_dict(self, **kwargs: Any) -> Dict[str, Any]:
"""Convert Chunk to dict."""

data = self.model_dump()
data.update(kwargs)
data["class_name"] = self.__class__.__name__
return data

def to_json(self, **kwargs: Any) -> str:
"""Convert Chunk to json."""
data = self.to_dict(**kwargs)
return json.dumps(data)

def __hash__(self):
"""Hash function."""
return hash((self.content,))

def __eq__(self, other):
"""Equal function."""
return self.content == other.content

@classmethod
def from_dict(cls, data: Dict[str, Any], **kwargs: Any): # type: ignore
"""Create Chunk from dict."""
if isinstance(kwargs, dict):
data.update(kwargs)

data.pop("class_name", None)
return cls(**data)

@classmethod
def from_json(cls, data_str: str, **kwargs: Any): # type: ignore
"""Create Chunk from json."""
data = json.loads(data_str)
return cls.from_dict(data, **kwargs)


class Document(BaseModel):
"""Document including document content, document metadata."""

model_config = {"arbitrary_types_allowed": True}

content: str = Field(default="", description="document text content")
images: Dict[str, str] = Field(
default_factory=dict, description="Images in the document"
)

chunks: List[Chunk] = Field(default_factory=list, description="document chunks")
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="metadata fields",
)

def set_content(self, content: str) -> None:
"""Set document content."""
self.content = content

def get_content(self) -> str:
"""Get document content."""
return self.content

def is_valid(self) -> bool:
return self.content != ""
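For reference, a minimal usage sketch of the new models (assuming the docreader package and pydantic are importable; the call sites below are illustrative, not taken from the repository):

    from docreader.models.document import Chunk, Document

    # end has no default, so it must be supplied explicitly
    chunk = Chunk(content="hello", seq=0, start=0, end=5)
    payload = chunk.to_json()             # serialized dict also carries class_name
    restored = Chunk.from_json(payload)   # class_name is dropped before reconstruction
    assert restored == chunk              # equality compares content only

    doc = Document(content="hello world", chunks=[chunk])
    print(doc.is_valid())                 # True while content is non-empty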
27 docreader/models/read_config.py Normal file
@@ -0,0 +1,27 @@
from dataclasses import dataclass, field


@dataclass
class ChunkingConfig:
"""
Configuration for text chunking process.
Controls how documents are split into smaller pieces for processing.
"""

# Maximum size of each chunk in tokens/chars
chunk_size: int = 512

# Number of tokens/chars to overlap between chunks
chunk_overlap: int = 50

# Text separators in order of priority
separators: list = field(default_factory=lambda: ["\n\n", "\n", "。"])

# Whether to enable multimodal processing (text + images)
enable_multimodal: bool = False

# Preferred field name going forward
storage_config: dict[str, str] = field(default_factory=dict)

# VLM configuration for image captioning
vlm_config: dict[str, str] = field(default_factory=dict)
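A short sketch of constructing the relocated ChunkingConfig (field names as defined above; the dict contents shown are placeholders, not values from the repository):

    from docreader.models.read_config import ChunkingConfig

    config = ChunkingConfig(
        chunk_size=512,
        chunk_overlap=50,
        enable_multimodal=False,
        storage_config={},  # e.g. provider/bucket settings forwarded from the request
        vlm_config={},      # e.g. base_url/model_name for image captioning
    )
    print(config.chunk_size, config.separators)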
@@ -13,22 +13,18 @@ The parsers extract content from documents and can split them into
meaningful chunks for further processing and indexing.
"""

from .base_parser import BaseParser, ParseResult
from .docx_parser import DocxParser
from .doc_parser import DocParser
from .pdf_parser import PDFParser
from .markdown_parser import MarkdownParser
from .text_parser import TextParser
from .docx2_parser import Docx2Parser
from .image_parser import ImageParser
from .web_parser import WebParser
from .markdown_parser import MarkdownParser
from .parser import Parser
from .config import ChunkingConfig
from .ocr_engine import OCREngine
from .pdf_parser import PDFParser
from .text_parser import TextParser
from .web_parser import WebParser

# Export public classes and modules
__all__ = [
"BaseParser", # Base parser class that all format parsers inherit from
"DocxParser", # Parser for .docx files (modern Word documents)
"Docx2Parser", # Parser for .docx files (modern Word documents)
"DocParser", # Parser for .doc files (legacy Word documents)
"PDFParser", # Parser for PDF documents
"MarkdownParser", # Parser for Markdown text files

@@ -36,7 +32,4 @@ __all__ = [
"ImageParser", # Parser for images with text content
"WebParser", # Parser for web pages
"Parser", # Main parser factory that selects the appropriate parser
"ChunkingConfig", # Configuration for text chunking behavior
"ParseResult", # Standard result format returned by all parsers
"OCREngine", # OCR engine for extracting text from images
]

@@ -1,65 +1,28 @@
# -*- coding: utf-8 -*-
import re
import os
import asyncio
from typing import List, Dict, Any, Optional, Tuple, Union
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
import logging
import sys
import traceback
import numpy as np
import time
import io
import json
from .ocr_engine import OCREngine
from .image_utils import image_to_base64
from .config import ChunkingConfig
from .storage import create_storage
import logging
import os
import re
import time
from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Tuple

import requests
from PIL import Image

# Add parent directory to Python path for src imports
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)

try:
from services.docreader.src.parser.caption import Caption
except ImportError:
# Fallback: try relative import
try:
from .caption import Caption
except ImportError:
# If both imports fail, set to None
Caption = None
logging.warning(
"Failed to import Caption, image captioning will be unavailable"
)
from docreader.models.document import Chunk, Document
from docreader.models.read_config import ChunkingConfig
from docreader.parser.caption import Caption
from docreader.parser.ocr_engine import OCREngine
from docreader.parser.storage import create_storage
from docreader.splitter.splitter import TextSplitter
from docreader.utils import endecode

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


@dataclass
class Chunk:
"""Chunk result"""

content: str # Chunk content
seq: int # Chunk sequence number
start: int # Chunk start position
end: int # Chunk end position
images: List[Dict[str, Any]] = field(default_factory=list) # Images in the chunk


@dataclass
class ParseResult:
"""Parse result"""

text: str # Extracted text content
chunks: Optional[List[Chunk]] = None # Chunk results


class BaseParser(ABC):
"""Base parser interface"""

@@ -97,17 +60,17 @@ class BaseParser(ABC):
def __init__(
self,
file_name: str = "",
file_type: str = None,
file_type: Optional[str] = None,
enable_multimodal: bool = True,
chunk_size: int = 1000,
chunk_overlap: int = 200,
separators: list = ["\n\n", "\n", "。"],
separators: list[str] = ["\n\n", "\n", "。"],
ocr_backend: str = "paddle",
ocr_config: dict = None,
ocr_config: dict = {},
max_image_size: int = 1920, # Maximum image size
max_concurrent_tasks: int = 5, # Max concurrent tasks
max_chunks: int = 1000, # Max number of returned chunks
chunking_config: ChunkingConfig = None, # Chunking configuration object
chunking_config: Optional[ChunkingConfig] = None,
):
"""Initialize parser

@@ -125,7 +88,6 @@ class BaseParser(ABC):
max_chunks: Max number of returned chunks
"""
# Storage client instance
self._storage = None
self.file_name = file_name
self.file_type = file_type or os.path.splitext(file_name)[1]
self.enable_multimodal = enable_multimodal

@@ -133,15 +95,16 @@ class BaseParser(ABC):
self.chunk_overlap = chunk_overlap
self.separators = separators
self.ocr_backend = os.getenv("OCR_BACKEND", ocr_backend)
self.ocr_config = ocr_config or {}
self.ocr_config = ocr_config
self.max_image_size = max_image_size
self.max_concurrent_tasks = max_concurrent_tasks
self.max_chunks = max_chunks
self.chunking_config = chunking_config

logger.info(
f"Initializing {self.__class__.__name__} for file: {file_name}, type: {self.file_type}"
self.storage = create_storage(
self.chunking_config.storage_config if self.chunking_config else None
)

logger.info(f"Initializing parser for file: {file_name}, type: {file_type}")
logger.info(
f"Parser config: chunk_size={chunk_size}, "
f"overlap={chunk_overlap}, "

@@ -150,16 +113,24 @@ class BaseParser(ABC):
f"max_chunks={max_chunks}"
)
# Only initialize Caption service if multimodal is enabled
if self.enable_multimodal:
try:
self.caption_parser = Caption(self.chunking_config.vlm_config)
except Exception as e:
logger.warning(f"Failed to initialize Caption service: {str(e)}")
self.caption_parser = None
else:
self.caption_parser = None
vlm_config = self.chunking_config.vlm_config if self.chunking_config else None
self.caption_parser = (
Caption(vlm_config=vlm_config) if self.enable_multimodal else None
)

def perform_ocr(self, image):
@abstractmethod
def parse_into_text(self, content: bytes) -> Document:
"""Parse document content

Args:
content: Document content

Returns:
Either a string containing the parsed text, or a tuple of (text, image_map)
where image_map is a dict mapping image URLs to Image objects
"""

def perform_ocr(self, image: Image.Image):
"""Execute OCR recognition on the image

Args:

@@ -170,53 +141,23 @@ class BaseParser(ABC):
"""
start_time = time.time()
logger.info("Starting OCR recognition")
resized_image = None

try:
# Resize image to avoid processing large images
resized_image = self._resize_image_if_needed(image)

# Get OCR engine
ocr_engine = self.get_ocr_engine(
backend_type=self.ocr_backend, **self.ocr_config
)
if ocr_engine is None:
logger.error(
f"OCR engine ({self.ocr_backend}) initialization failed or unavailable, "
"skipping OCR recognition"
)
return ""
ocr_engine = OCREngine.get_instance(self.ocr_backend)

# Execute OCR prediction
logger.info(f"Executing OCR prediction (using {self.ocr_backend} engine)")
# Add extra exception handling
try:
ocr_result = ocr_engine.predict(resized_image)
except RuntimeError as e:
# Handle common CUDA memory issues or other runtime errors
logger.error(f"OCR prediction runtime error: {str(e)}")
return ""
except Exception as e:
# Handle other prediction errors
logger.error(f"Unexpected OCR prediction error: {str(e)}")
return ""

process_time = time.time() - start_time
logger.info(f"OCR recognition completed, time: {process_time:.2f} seconds")
return ocr_result
except Exception as e:
process_time = time.time() - start_time
logger.error(
f"OCR recognition error: {str(e)}, time: {process_time:.2f} seconds"
)
return ""
finally:
# Release image resources
if resized_image is not image and hasattr(resized_image, "close"):
# Only close the new image we created, not the original image
resized_image.close()

def _resize_image_if_needed(self, image):
return ocr_result

def _resize_image_if_needed(self, image: Image.Image) -> Image.Image:
"""Resize image if it exceeds maximum size limit

Args:

@@ -225,102 +166,21 @@ class BaseParser(ABC):
Returns:
Resized image object
"""
try:
# If it's a PIL Image
if hasattr(image, "size"):
width, height = image.size
if width > self.max_image_size or height > self.max_image_size:
logger.info(f"Resizing PIL image, original size: {width}x{height}")
scale = min(
self.max_image_size / width, self.max_image_size / height
)
scale = min(self.max_image_size / width, self.max_image_size / height)
new_width = int(width * scale)
new_height = int(height * scale)
resized_image = image.resize((new_width, new_height))
logger.info(f"Resized to: {new_width}x{new_height}")
return resized_image
else:
logger.info(
f"PIL image size {width}x{height} is within limits, no resizing needed"
)
return image
# If it's a numpy array
elif hasattr(image, "shape"):
height, width = image.shape[:2]
if width > self.max_image_size or height > self.max_image_size:
logger.info(
f"Resizing numpy image, original size: {width}x{height}"
)
scale = min(
self.max_image_size / width, self.max_image_size / height
)
new_width = int(width * scale)
new_height = int(height * scale)
# Use PIL for resizing numpy arrays
pil_image = Image.fromarray(image)
resized_pil = pil_image.resize((new_width, new_height))
resized_image = np.array(resized_pil)
logger.info(f"Resized to: {new_width}x{new_height}")
return resized_image
else:
logger.info(
f"Numpy image size {width}x{height} is within limits, no resizing needed"
)
return image
else:
logger.warning(f"Unknown image type: {type(image)}, cannot resize")
return image
except Exception as e:
logger.error(f"Error resizing image: {str(e)}")

logger.info(f"PIL image size is {width}x{height}, no resizing needed")
return image

def process_image(self, image, image_url=None):
"""Process image: first perform OCR, then get caption if text is available

Args:
image: Image object (PIL.Image or numpy array)
image_url: Image URL (if uploaded)

Returns:
tuple: (ocr_text, caption, image_url)
- ocr_text: OCR extracted text
- caption: Image description (if OCR has text) or empty string
- image_url: Image URL (if provided)
"""
logger.info("Starting image processing (OCR + optional caption)")

# Resize image
image = self._resize_image_if_needed(image)

# Perform OCR recognition
ocr_text = self.perform_ocr(image)
caption = ""

if self.caption_parser:
logger.info(
f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption"
)
# Convert image to base64 for caption generation
img_base64 = image_to_base64(image)
if img_base64:
caption = self.get_image_caption(img_base64)
if caption:
logger.info(f"Successfully obtained image caption: {caption}")
else:
logger.warning("Failed to get caption")
else:
logger.warning("Failed to convert image to base64")
caption = ""
else:
logger.info("Caption service not initialized, skipping caption retrieval")

# Release image resources
del image

return ocr_text, caption, image_url

async def process_image_async(self, image, image_url=None):
"""Asynchronously process image: first perform OCR, then get caption if text is available
async def process_image_async(self, image: Image.Image, image_url: str):
"""Asynchronously process image: first perform OCR, then get caption

Args:
image: Image object (PIL.Image or numpy array)

@@ -333,67 +193,31 @@ class BaseParser(ABC):
- image_url: Image URL (if provided)
"""
logger.info("Starting asynchronous image processing (OCR + optional caption)")
resized_image = None

try:
# Resize image
resized_image = self._resize_image_if_needed(image)

# Perform OCR recognition (using run_in_executor to execute synchronous operations in the event loop)
try:
# Perform OCR recognition
loop = asyncio.get_event_loop()
try:
# Add timeout mechanism to avoid infinite blocking (30 seconds timeout)
ocr_task = loop.run_in_executor(None, self.perform_ocr, resized_image)
ocr_text = await asyncio.wait_for(ocr_task, timeout=30.0)
except asyncio.TimeoutError:
logger.error(
"OCR processing timed out (30 seconds), skipping this image"
)
ocr_text = ""
except Exception as e:
logger.error(f"OCR processing error: {str(e)}")
logger.error(f"OCR processing error, skipping this image: {str(e)}")
ocr_text = ""

logger.info(
f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption"
)
caption = ""
if self.caption_parser:
try:
# Convert image to base64 for caption generation
img_base64 = image_to_base64(resized_image)
if img_base64:
# Add timeout to avoid blocking caption retrieval (30 seconds timeout)
caption_task = self.get_image_caption_async(img_base64)
image_data, caption = await asyncio.wait_for(
caption_task, timeout=30.0
)
if caption:
logger.info(
f"Successfully obtained image caption: {caption}"
)
else:
logger.warning("Failed to get caption")
else:
logger.warning("Failed to convert image to base64")
caption = ""
except asyncio.TimeoutError:
logger.warning("Caption retrieval timed out, skipping")
except Exception as e:
logger.error(f"Failed to get caption: {str(e)}")
else:
logger.info(
"Caption service not initialized, skipping caption retrieval"
)

logger.info(f"Successfully obtained image ocr: {ocr_text}")
img_base64 = endecode.decode_image(resized_image)
caption = self.get_image_caption(img_base64)
logger.info(f"Successfully obtained image caption: {caption}")
return ocr_text, caption, image_url
finally:
# Release image resources
if resized_image is not image and hasattr(resized_image, "close"):
# Only close the new image we created, not the original image
resized_image.close()

async def process_with_limit(self, idx, image, url, semaphore):
async def process_with_limit(
self, idx: int, image: Image.Image, url: str, semaphore: asyncio.Semaphore
):
"""Function to process a single image using a semaphore"""
try:
logger.info(f"Waiting to process image {idx + 1}")

@@ -407,10 +231,9 @@ class BaseParser(ABC):
return ("", "", url) # Return empty result to avoid overall failure
finally:
# Manually release image resources
if hasattr(image, "close"):
image.close()

async def process_multiple_images(self, images_data):
async def process_multiple_images(self, images_data: List[Tuple[Image.Image, str]]):
"""Process multiple images concurrently

Args:

@@ -467,47 +290,10 @@ class BaseParser(ABC):
logger.info("Image processing resource cleanup complete")

logger.info(
f"Completed concurrent processing of {len(results)}/{len(images_data)} images"
f"Concurrent processing of {len(results)}/{len(images_data)} images"
)
return results

def decode_bytes(self, content: bytes) -> str:
"""Intelligently decode byte stream, supports multiple encodings

Tries to decode in common encodings, if all fail, uses latin-1 as fallback

Args:
content: Byte stream to decode

Returns:
Decoded string
"""
logger.info(f"Attempting to decode bytes of length: {len(content)}")
# Common encodings, sorted by priority
encodings = ["utf-8", "gb18030", "gb2312", "gbk", "big5", "ascii", "latin-1"]
text = None

# Try decoding with each encoding format
for encoding in encodings:
try:
text = content.decode(encoding)
logger.info(f"Successfully decoded content using {encoding} encoding")
break
except UnicodeDecodeError:
logger.info(f"Failed to decode using {encoding} encoding")
continue

# If all encodings fail, use latin-1 as fallback
if text is None:
text = content.decode("latin-1")
logger.warning(
f"Unable to determine correct encoding, using latin-1 as fallback. "
f"This may cause character issues."
)

logger.info(f"Decoded text length: {len(text)} characters")
return text

def get_image_caption(self, image_data: str) -> str:
"""Get image description

@@ -517,6 +303,9 @@ class BaseParser(ABC):
Returns:
Image description
"""
if not self.caption_parser:
logger.warning("Caption parser not initialized")
return ""
start_time = time.time()
logger.info(
f"Getting caption for image: {image_data[:250]}..."

@@ -533,80 +322,7 @@ class BaseParser(ABC):
logger.warning("Failed to get caption for image")
return caption

async def get_image_caption_async(self, image_data: str) -> Tuple[str, str]:
"""Asynchronously get image description

Args:
image_data: Image data (base64 encoded string or URL)

Returns:
Tuple[str, str]: Image data and corresponding description
"""
caption = self.get_image_caption(image_data)
return image_data, caption

def __init_storage(self):
"""Initialize storage client based on configuration"""
if self._storage is None:
storage_config = (
self.chunking_config.storage_config if self.chunking_config else None
)
self._storage = create_storage(storage_config)
logger.info(
f"Initialized storage client: {self._storage.__class__.__name__}"
)
return self._storage

def upload_file(self, file_path: str) -> str:
"""Upload file to object storage

Args:
file_path: File path

Returns:
File URL
"""
logger.info(f"Uploading file: {file_path}")
try:
storage = self.__init_storage()
return storage.upload_file(file_path)
except Exception as e:
logger.error(f"Failed to upload file: {str(e)}")
return ""

def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to object storage

Args:
content: Byte content to upload
file_ext: File extension

Returns:
File URL
"""
logger.info(f"Uploading bytes content, size: {len(content)} bytes")
try:
storage = self.__init_storage()
return storage.upload_bytes(content, file_ext)
except Exception as e:
logger.error(f"Failed to upload bytes to storage: {str(e)}")
traceback.print_exc()
return ""

@abstractmethod
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""Parse document content

Args:
content: Document content

Returns:
Either a string containing the parsed text, or a tuple of (text, image_map)
where image_map is a dict mapping image URLs to Image objects
"""
pass

def parse(self, content: bytes) -> ParseResult:
def parse(self, content: bytes) -> Document:
"""Parse document content

Args:

@@ -616,17 +332,19 @@ class BaseParser(ABC):
Parse result
"""
logger.info(
f"Parsing document with {self.__class__.__name__}, content size: {len(content)} bytes"
f"Parsing document with {self.__class__.__name__}, bytes: {len(content)}"
)
parse_result = self.parse_into_text(content)
if isinstance(parse_result, tuple):
text, image_map = parse_result
else:
text = parse_result
image_map = {}
logger.info(f"Extracted {len(text)} characters of text from {self.file_name}")
logger.info(f"Beginning chunking process for text")
chunks = self.chunk_text(text)
document = self.parse_into_text(content)
logger.info(
f"Extracted {len(document.content)} characters from {self.file_name}"
)
splitter = TextSplitter(
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
separators=self.separators,
)
chunk_str = splitter.split_text(document.content)
chunks = self._str_to_chunk(chunk_str)
logger.info(f"Created {len(chunks)} chunks from document")

# Limit the number of returned chunks

@@ -636,7 +354,7 @@ class BaseParser(ABC):
)
chunks = chunks[: self.max_chunks]

# If multimodal is enabled and file type is supported, process images in each chunk
# If multimodal is enabled and file type is supported, process images
if self.enable_multimodal:
# Get file extension and convert to lowercase
file_ext = (

@@ -647,11 +365,12 @@ class BaseParser(ABC):

# Define allowed file types for image processing
allowed_types = [
".pdf", # PDF files
# Text files
".pdf",
".md",
".markdown", # Markdown files
".markdown",
".doc",
".docx", # Word documents
".docx",
# Image files
".jpg",
".jpeg",

@@ -666,13 +385,21 @@ class BaseParser(ABC):
logger.info(
f"Processing images in each chunk for file type: {file_ext}"
)
chunks = self.process_chunks_images(chunks, image_map)
chunks = self.process_chunks_images(chunks, document.images)
else:
logger.info(
f"Skipping image processing for unsupported file type: {file_ext}"
)

return ParseResult(text=text, chunks=chunks)
document.chunks = chunks
return document

def _str_to_chunk(self, text: List[Tuple[int, int, str]]) -> List[Chunk]:
"""Convert string to Chunk object"""
return [
Chunk(seq=i, content=t, start=start, end=end)
for i, (start, end, t) in enumerate(text)
]

def _split_into_units(self, text: str) -> List[str]:
"""

@@ -682,9 +409,7 @@ class BaseParser(ABC):
Returns:
基本单元的列表
"""
logger.info(
f"Splitting text into basic units with robust structure protection, text length: {len(text)}"
)
logger.info(f"Splitting text into basic units, text length: {len(text)}")

# 定义所有需要作为整体保护的结构模式 ---
table_pattern = r"(?m)(^\|.*\|[ \t]*\r?\n(?:[ \t]*\r?\n)?^\|\s*:?--+.*\r?\n(?:^\|.*\|\r?\n?)*)"

@@ -710,7 +435,8 @@ class BaseParser(ABC):
# 按起始位置排序
protected_ranges.sort(key=lambda x: x[0])
logger.info(
f"Found {len(protected_ranges)} protected structures (tables, code, formulas, images, links)."
f"Found {len(protected_ranges)} protected structures "
"(tables, code, formulas, images, links)."
)

# 合并可能重叠的保护范围 ---

@@ -731,7 +457,7 @@ class BaseParser(ABC):
merged_ranges.append((current_start, current_end))
protected_ranges = merged_ranges
logger.info(
f"After merging overlaps, {len(protected_ranges)} protected ranges remain."
f"After overlaps, {len(protected_ranges)} protected ranges remain."
)

# 根据保护范围和分隔符来分割文本 ---

@@ -749,7 +475,7 @@ class BaseParser(ABC):
segments = re.split(separator_pattern, pre_text)
units.extend([s for s in segments if s]) # 添加所有非空部分

# b. 将整个受保护的块(例如,一个完整的表格)作为一个单独的、不可分割的单元添加
# b. 将整个受保护的块(例如,一个完整的表格)作为一个不可分割的单元添加
protected_text = text[start:end]
units.append(protected_text)

@@ -764,38 +490,6 @@ class BaseParser(ABC):
logger.info(f"Text splitting complete, created {len(units)} final basic units.")
return units

def _find_complete_units(self, units: List[str], target_size: int) -> List[str]:
"""Find a list of complete units that do not exceed the target size

Args:
units: List of units
target_size: Target size

Returns:
List of complete units
"""
logger.info(f"Finding complete units with target size: {target_size}")
result = []
current_size = 0

for unit in units:
unit_size = len(unit)
if current_size + unit_size > target_size and result:
logger.info(
f"Reached target size limit at {current_size} characters, stopping"
)
break
result.append(unit)
current_size += unit_size
logger.info(
f"Added unit of size {unit_size}, current total: {current_size}/{target_size}"
)

logger.info(
f"Found {len(result)} complete units totaling {current_size} characters"
)
return result

def chunk_text(self, text: str) -> List[Chunk]:
"""Chunk text, preserving Markdown structure

@@ -855,14 +549,12 @@ class BaseParser(ABC):
for u in reversed(current_chunk):
if overlap_size + len(u) > overlap_target:
logger.info(
f"Reached overlap target ({overlap_size}/{overlap_target})"
f"Overlap target ({overlap_size}/{overlap_target})"
)
break
overlap_units.insert(0, u)
overlap_size += len(u)
logger.info(
f"Added unit to overlap, current overlap size: {overlap_size}"
)
logger.info(f"Added unit to overlap, size: {overlap_size}")

# Remove elements from overlap that are included in separators
start_index = 0

@@ -883,7 +575,7 @@ class BaseParser(ABC):

overlap_units = overlap_units[start_index:]
logger.info(
f"Final overlap: {len(overlap_units)} units, {overlap_size} characters"
f"Overlap: {len(overlap_units)} units, {overlap_size} size"
)

current_chunk = overlap_units

@@ -899,7 +591,7 @@ class BaseParser(ABC):
current_chunk.append(unit)
current_size += unit_size
logger.info(
f"Added unit to current chunk, now at {current_size}/{self.chunk_size} characters"
f"Added unit to current chunk, at {current_size}/{self.chunk_size}"
)

# Add the last chunk

@@ -925,12 +617,13 @@ class BaseParser(ABC):
chunk: Document chunk

Returns:
List of image information, each element contains image URL and match position
List of image information
"""
logger.info(f"Extracting image information from Chunk #{chunk.seq}")
text = chunk.content

# Regex to extract image information from text, supporting Markdown images and HTML images
# Regex to extract image information from text,
# support: Markdown images, HTML images
img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)|<img [^>]*src="([^"]+)" [^>]*>'

# Extract image information

@@ -954,28 +647,28 @@ class BaseParser(ABC):
images_info.append(image_info)

logger.info(
f"Image in Chunk #{chunk.seq} {match_idx+1}: " f"URL={img_url[:50]}..."
f"Image in Chunk #{chunk.seq} {match_idx + 1}: URL={img_url[:50]}..."
if len(img_url) > 50
else f"Image in Chunk #{chunk.seq} {match_idx + 1}: URL={img_url}"
)

return images_info

async def download_and_upload_image(self, img_url: str):
"""Download image and upload to object storage, if it's already an object storage path or local path, use directly
async def download_and_upload_image(
self, img_url: str
) -> Tuple[str, str, Image.Image | None]:
"""Download image and upload to object storage,
if it's already an object storage path or local path, use directly

Args:
img_url: Image URL or local path

Returns:
tuple: (original URL, storage URL, image object), if failed returns (original URL, None, None)
tuple: (original URL, storage URL, image object),
if failed returns (original URL, None, None)
"""

try:
import requests
from PIL import Image
import io

# Check if it's already a storage URL (COS or MinIO)
is_storage_url = any(
pattern in img_url

@@ -997,12 +690,7 @@ class BaseParser(ABC):
response = requests.get(img_url, timeout=5, proxies=proxies)
if response.status_code == 200:
image = Image.open(io.BytesIO(response.content))
try:
return img_url, img_url, image
finally:
# Ensure image resources are also released after the function returns
# Image will be closed by the caller
pass
else:
logger.warning(
f"Failed to get storage image: {response.status_code}"

@@ -1022,7 +710,7 @@ class BaseParser(ABC):
# Upload to storage
with open(img_url, "rb") as f:
content = f.read()
storage_url = self.upload_bytes(content)
storage_url = self.storage.upload_bytes(content)
logger.info(
f"Successfully uploaded local image to storage: {storage_url}"
)

@@ -1031,7 +719,7 @@ class BaseParser(ABC):
logger.error(f"Error processing local image: {str(e)}")
if image and hasattr(image, "close"):
image.close()
return img_url, None, None
return img_url, img_url, None

# Normal remote URL download handling
else:

@@ -1044,9 +732,7 @@ class BaseParser(ABC):
if https_proxy:
proxies["https"] = https_proxy

logger.info(
f"Downloading image {img_url}, using proxy: {proxies if proxies else 'None'}"
)
logger.info(f"Downloading image {img_url}, using proxy: {proxies}")
response = requests.get(img_url, timeout=5, proxies=proxies)

if response.status_code == 200:

@@ -1054,7 +740,7 @@ class BaseParser(ABC):
image = Image.open(io.BytesIO(response.content))
try:
# Upload to storage using the method in BaseParser
storage_url = self.upload_bytes(response.content)
storage_url = self.storage.upload_bytes(response.content)
logger.info(
f"Successfully uploaded image to storage: {storage_url}"
)

@@ -1064,11 +750,11 @@ class BaseParser(ABC):
pass
else:
logger.warning(f"Failed to download image: {response.status_code}")
return img_url, None, None
return img_url, img_url, None

except Exception as e:
logger.error(f"Error downloading or processing image: {str(e)}")
return img_url, None, None
return img_url, img_url, None

async def process_chunk_images_async(
self, chunk, chunk_idx, total_chunks, image_map=None

@@ -1097,7 +783,8 @@ class BaseParser(ABC):

# Prepare images that need to be downloaded and processed
images_to_process = []
url_to_info_map = {} # Map URL to image information
# Map URL to image information
url_to_info_map = {}

# Record all image URLs that need to be processed
for img_info in images_info:

@@ -1106,14 +793,21 @@ class BaseParser(ABC):

results = []
download_tasks = []
for img_url in url_to_info_map.keys(): # Check if image is already in the image_map
# Check if image is already in the image_map
for img_url in url_to_info_map.keys():
if image_map and img_url in image_map:
logger.info(f"Image already in image_map: {img_url}, using cached object")
results.append((img_url, img_url, image_map[img_url]))
logger.info(
f"Image already in image_map: {img_url}, using cached object"
)
image = Image.open(
io.BytesIO(endecode.encode_image(image_map[img_url]))
)
results.append((img_url, img_url, image))
else:
download_task = self.download_and_upload_image(img_url)
download_tasks.append(download_task)
# Concurrent download and upload of images, ignore images that are already in the image_map
# Concurrent download and upload of images,
# ignore images that are already in the image_map
results.extend(await asyncio.gather(*download_tasks))

# Process download results, prepare for OCR processing

@@ -1123,10 +817,11 @@ class BaseParser(ABC):
img_info["cos_url"] = cos_url
images_to_process.append((image, cos_url))

# If no images were successfully downloaded and uploaded, return the original Chunk
# If no images were successfully downloaded and uploaded,
# return the original Chunk
if not images_to_process:
logger.info(
f"Chunk #{chunk_idx+1} found no successfully downloaded and uploaded images"
f"Chunk #{chunk_idx + 1} not found downloaded and uploaded images"
)
return chunk

@@ -1166,7 +861,9 @@ class BaseParser(ABC):
logger.info(f"Completed image processing in Chunk #{chunk_idx + 1}")
return chunk

def process_chunks_images(self, chunks: List[Chunk], image_map=None) -> List[Chunk]:
def process_chunks_images(
self, chunks: List[Chunk], image_map: Dict[str, str] = {}
) -> List[Chunk]:
"""Concurrent processing of images in all Chunks

Args:

@@ -1235,7 +932,7 @@ class BaseParser(ABC):
# Execute processing for all Chunks
processed_chunks = loop.run_until_complete(process_all_chunks())
logger.info(
f"Successfully completed concurrent processing of {len(processed_chunks)}/{len(chunks)} chunks"
f"Completed processing of {len(processed_chunks)}/{len(chunks)} chunks"
)

return processed_chunks

@@ -3,11 +3,10 @@ import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Union
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import requests
|
||||
import ollama
|
||||
|
||||
import requests
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -158,11 +157,16 @@ class CaptionChatResp:
|
||||
Returns:
|
||||
The content string from the first choice, or empty string if no choices
|
||||
"""
|
||||
if self.choices:
|
||||
logger.info("Retrieving content from first choice")
|
||||
return self.choices[0].message.content
|
||||
if (
|
||||
not self.choices
|
||||
or not self.choices[0]
|
||||
or not self.choices[0].message
|
||||
or not self.choices[0].message.content
|
||||
):
|
||||
logger.warning("No choices available in response")
|
||||
return ""
|
||||
logger.info("Retrieving content from first choice")
|
||||
return self.choices[0].message.content
|
||||
|
||||
|
||||
class Caption:
|
||||
@@ -171,33 +175,43 @@ class Caption:
|
||||
Uses an external API to process images and return textual descriptions.
|
||||
"""
|
||||
|
||||
def __init__(self, vlm_config=None):
|
||||
"""Initialize the Caption service with configuration from parameters or environment variables."""
|
||||
def __init__(self, vlm_config: Optional[Dict[str, str]] = None):
|
||||
"""
|
||||
Initialize the Caption service with configuration
|
||||
from parameters or environment variables.
|
||||
"""
|
||||
logger.info("Initializing Caption service")
|
||||
self.prompt = """简单凝炼的描述图片的主要内容"""
|
||||
self.timeout = 30
|
||||
|
||||
# Use provided VLM config if available, otherwise fall back to environment variables
|
||||
# Use provided VLM config if available,
|
||||
# otherwise fall back to environment variables
|
||||
if vlm_config and vlm_config.get("base_url") and vlm_config.get("model_name"):
|
||||
self.completion_url = vlm_config.get("base_url", "") + "/chat/completions"
|
||||
self.model = vlm_config.get("model_name", "")
|
||||
self.api_key = vlm_config.get("api_key", "")
|
||||
self.interface_type = vlm_config.get("interface_type", "openai").lower()
|
||||
else:
|
||||
if os.getenv("VLM_MODEL_BASE_URL") == "" or os.getenv("VLM_MODEL_NAME") == "":
|
||||
base_url = os.getenv("VLM_MODEL_BASE_URL")
|
||||
model_name = os.getenv("VLM_MODEL_NAME")
|
||||
if not base_url or not model_name:
|
||||
logger.error("VLM_MODEL_BASE_URL or VLM_MODEL_NAME is not set")
|
||||
return
|
||||
self.completion_url = os.getenv("VLM_MODEL_BASE_URL") + "/chat/completions"
|
||||
self.model = os.getenv("VLM_MODEL_NAME")
|
||||
self.api_key = os.getenv("VLM_MODEL_API_KEY")
|
||||
self.completion_url = base_url + "/chat/completions"
|
||||
self.model = model_name
|
||||
self.api_key = os.getenv("VLM_MODEL_API_KEY", "")
|
||||
self.interface_type = os.getenv("VLM_INTERFACE_TYPE", "openai").lower()
|
||||
|
||||
# 验证接口类型
|
||||
if self.interface_type not in ["ollama", "openai"]:
|
||||
logger.warning(f"Unknown interface type: {self.interface_type}, defaulting to openai")
|
||||
logger.warning(
|
||||
f"Unknown interface type: {self.interface_type}, defaulting to openai"
|
||||
)
|
||||
self.interface_type = "openai"
|
||||
|
||||
logger.info(
|
||||
f"Service configured with model: {self.model}, endpoint: {self.completion_url}, interface: {self.interface_type}"
|
||||
f"Configured with model: {self.model}, "
|
||||
f"endpoint: {self.completion_url}, interface: {self.interface_type}"
|
||||
)
|
||||
|
||||
def _call_caption_api(self, image_data: str) -> Optional[CaptionChatResp]:
|
||||
@@ -210,8 +224,8 @@ class Caption:
|
||||
Returns:
|
||||
CaptionChatResp object if successful, None otherwise
|
||||
"""
|
||||
logger.info(f"Calling Caption API for image captioning")
|
||||
logger.info(f"Processing image data: {image_data[:50] if len(image_data) > 50 else image_data}")
|
||||
logger.info("Calling Caption API for image captioning")
|
||||
logger.info(f"Processing image data: {image_data[:50]}...")
|
||||
|
||||
# 根据接口类型选择调用方式
|
||||
if self.interface_type == "ollama":
|
||||
@@ -226,6 +240,7 @@ class Caption:
|
||||
|
||||
client = ollama.Client(
|
||||
host=host,
|
||||
timeout=self.timeout,
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -244,16 +259,11 @@ class Caption:
|
||||
caption_resp = CaptionChatResp(
|
||||
id="ollama_response",
|
||||
created=int(time.time()),
|
||||
model=self.model,
|
||||
model=Model(id=self.model),
|
||||
object="chat.completion",
|
||||
choices=[
|
||||
Choice(
|
||||
message=Message(
|
||||
role="assistant",
|
||||
content=response.response
|
||||
)
|
||||
)
|
||||
]
|
||||
Choice(message=Message(role="assistant", content=response.response))
|
||||
],
|
||||
)
|
||||
|
||||
logger.info("Successfully received response from Ollama API")
|
||||
@@ -272,7 +282,10 @@ class Caption:
|
||||
content=[
|
||||
Content(type="text", text=self.prompt),
|
||||
Content(
|
||||
type="image_url", image_url=ImageUrl(url="data:image/png;base64," + image_base64, detail="auto")
|
||||
type="image_url",
|
||||
image_url=ImageUrl(
|
||||
url="data:image/png;base64," + image_base64, detail="auto"
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
@@ -295,23 +308,23 @@ class Caption:
|
||||
headers["Authorization"] = f"Bearer {self.api_key}"
|
||||
|
||||
try:
|
||||
logger.info(f"Sending request to OpenAI-compatible API with model: {self.model}")
|
||||
logger.info(
|
||||
f"Sending request to OpenAI-compatible API with model: {self.model}"
|
||||
)
|
||||
response = requests.post(
|
||||
self.completion_url,
|
||||
data=json.dumps(gpt_req, default=lambda o: o.__dict__, indent=4),
|
||||
headers=headers,
|
||||
timeout=30,
|
||||
timeout=self.timeout,
|
||||
)
|
||||
if response.status_code != 200:
|
||||
logger.error(
|
||||
f"OpenAI-compatible API returned non-200 status code: {response.status_code}"
|
||||
f"OpenAI API returned non-200 status code: {response.status_code}"
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
logger.info(
|
||||
f"Successfully received response from OpenAI-compatible API with status: {response.status_code}"
|
||||
)
|
||||
logger.info(f"Converting response to CaptionChatResp object")
|
||||
logger.info(f"Received from OpenAI with status: {response.status_code}")
|
||||
logger.info("Converting response to CaptionChatResp object")
|
||||
caption_resp = CaptionChatResp.from_json(response.json())
|
||||
|
||||
if caption_resp.usage:
|
||||
@@ -322,7 +335,7 @@ class Caption:
|
||||
|
||||
return caption_resp
|
||||
except requests.exceptions.Timeout:
|
||||
logger.error(f"Timeout while calling OpenAI-compatible API after 30 seconds")
|
||||
logger.error("Timeout while calling OpenAI-compatible API after 30 seconds")
|
||||
return None
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.error(f"Request error calling OpenAI-compatible API: {e}")
|
||||
|
||||
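For reference, a minimal sketch of how the caption backend above is configured purely through the environment; the values are examples, and the Caption instantiation is only illustrative since its full constructor signature is not part of this hunk.

import os

os.environ.setdefault("VLM_MODEL_BASE_URL", "http://localhost:8000/v1")  # example endpoint
os.environ.setdefault("VLM_MODEL_NAME", "qwen2-vl-7b-instruct")          # example model name
os.environ.setdefault("VLM_MODEL_API_KEY", "")                           # optional for local deployments
os.environ.setdefault("VLM_INTERFACE_TYPE", "openai")                    # "openai" or "ollama"; anything else falls back to "openai"

# Hypothetical instantiation; base_url/model_name mirror the constructor arguments used above.
# caption = Caption(base_url=os.environ["VLM_MODEL_BASE_URL"], model_name=os.environ["VLM_MODEL_NAME"])
# resp = caption._call_caption_api(image_base64)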
new file: docreader/parser/chain_parser.py (70 lines)
@@ -0,0 +1,70 @@
|
||||
import logging
|
||||
from typing import List, Tuple, Type
|
||||
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FirstParser(BaseParser):
|
||||
_parser_cls: Tuple[Type["BaseParser"], ...] = ()
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
self._parsers: List[BaseParser] = []
|
||||
for parser_cls in self._parser_cls:
|
||||
try:
|
||||
parser = parser_cls(*args, **kwargs)
|
||||
self._parsers.append(parser)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
for p in self._parsers:
|
||||
document = p.parse_into_text(content)
|
||||
if document.is_valid():
|
||||
return document
|
||||
return Document()
|
||||
|
||||
@classmethod
|
||||
def create(cls, *parser_classes: Type["BaseParser"]) -> Type["FirstParser"]:
|
||||
names = "_".join([p.__name__ for p in parser_classes])
|
||||
return type(f"FirstParser_{names}", (cls,), {"_parser_cls": parser_classes})
|
||||
|
||||
|
||||
class PipelineParser(BaseParser):
|
||||
_parser_cls: Tuple[Type["BaseParser"], ...] = ()
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
self._parsers: List[BaseParser] = []
|
||||
for parser_cls in self._parser_cls:
|
||||
try:
|
||||
parser = parser_cls(*args, **kwargs)
|
||||
self._parsers.append(parser)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
document = Document()
|
||||
for p in self._parsers:
|
||||
document = p.parse_into_text(content)
|
||||
content = endecode.encode_bytes(document.content)
|
||||
return document
|
||||
|
||||
@classmethod
|
||||
def create(cls, *parser_classes: Type["BaseParser"]) -> Type["PipelineParser"]:
|
||||
names = "_".join([p.__name__ for p in parser_classes])
|
||||
return type(f"PipelineParser_{names}", (cls,), {"_parser_cls": parser_classes})
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from docreader.parser.markdown_parser import MarkdownParser
|
||||
|
||||
cls = FirstParser.create(MarkdownParser)
|
||||
parser = cls()
|
||||
print(parser.parse_into_text(b"aaa"))
|
||||
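A short sketch of how the two combinators above differ in practice: FirstParser.create returns the first valid Document, while PipelineParser.create feeds each stage's output (re-encoded to bytes) back into the next stage. MarkdownParser is only used here as an example stage.

from docreader.parser.chain_parser import FirstParser, PipelineParser
from docreader.parser.markdown_parser import MarkdownParser

# Fallback chain: later parsers only run if the earlier ones return an invalid Document.
FallbackParser = FirstParser.create(MarkdownParser)

# Pipeline chain: each stage consumes the previous stage's content as bytes.
ChainedParser = PipelineParser.create(MarkdownParser)

doc = FallbackParser().parse_into_text(b"# title\n\nbody")
doc = ChainedParser().parse_into_text(b"# title\n\nbody")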
@@ -1,21 +0,0 @@
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
|
||||
class ChunkingConfig:
|
||||
"""
|
||||
Configuration for text chunking process.
|
||||
Controls how documents are split into smaller pieces for processing.
|
||||
"""
|
||||
|
||||
chunk_size: int = 512 # Maximum size of each chunk in tokens/chars
|
||||
chunk_overlap: int = 50 # Number of tokens/chars to overlap between chunks
|
||||
separators: list = field(
|
||||
default_factory=lambda: ["\n\n", "\n", "。"]
|
||||
) # Text separators in order of priority
|
||||
enable_multimodal: bool = (
|
||||
False # Whether to enable multimodal processing (text + images)
|
||||
)
|
||||
storage_config: dict = None # Preferred field name going forward
|
||||
vlm_config: dict = None # VLM configuration for image captioning
|
||||
|
||||
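The dataclass removed above appears to move to docreader.models.read_config; assuming the fields stay the same, constructing a chunking configuration still looks like this:

from docreader.models.read_config import ChunkingConfig

config = ChunkingConfig(
    chunk_size=512,                   # max tokens/chars per chunk
    chunk_overlap=50,                 # overlap between neighbouring chunks
    separators=["\n\n", "\n", "。"],  # split points in priority order
    enable_multimodal=True,           # also process images alongside text
)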
@@ -1,134 +1,88 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
import os
|
||||
import subprocess
|
||||
import shutil
|
||||
from io import BytesIO
|
||||
from typing import Optional, List, Tuple
|
||||
import textract
|
||||
from PIL import Image
|
||||
import zipfile
|
||||
import xml.etree.ElementTree as ET
|
||||
from typing import List, Optional
|
||||
|
||||
from .base_parser import BaseParser
|
||||
from .docx_parser import DocxParser, Docx
|
||||
import textract
|
||||
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.docx2_parser import Docx2Parser
|
||||
from docreader.utils.tempfile import TempDirContext, TempFileContext
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocParser(BaseParser):
|
||||
class DocParser(Docx2Parser):
|
||||
"""DOC document parser"""
|
||||
|
||||
def parse_into_text(self, content: bytes) -> str:
|
||||
"""Parse DOC document
|
||||
|
||||
Args:
|
||||
content: DOC document content
|
||||
|
||||
Returns:
|
||||
Parse result
|
||||
"""
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
logger.info(f"Parsing DOC document, content size: {len(content)} bytes")
|
||||
|
||||
handle_chain = [
|
||||
# 1. Try to convert to docx format to extract images
|
||||
self._parse_with_docx,
|
||||
# 2. If image extraction is not needed or conversion failed,
|
||||
# try using antiword to extract text
|
||||
self._parse_with_antiword,
|
||||
# 3. If antiword extraction fails, use textract
|
||||
self._parse_with_textract,
|
||||
]
|
||||
|
||||
# Save byte content as a temporary file
|
||||
with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as temp_file:
|
||||
temp_file_path = temp_file.name
|
||||
temp_file.write(content)
|
||||
temp_file.flush()
|
||||
logger.info(f"Saved DOC content to temporary file: {temp_file_path}")
|
||||
|
||||
with TempFileContext(content, ".doc") as temp_file_path:
|
||||
for handle in handle_chain:
|
||||
try:
|
||||
# First try to convert to docx format to extract images
|
||||
if self.enable_multimodal:
|
||||
logger.info("Multimodal enabled, attempting to extract images from DOC")
|
||||
docx_content = self._convert_doc_to_docx(temp_file_path)
|
||||
document = handle(temp_file_path)
|
||||
if document:
|
||||
return document
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse DOC with {handle.__name__} {e}")
|
||||
|
||||
return Document(content="")
|
||||
|
||||
def _parse_with_docx(self, temp_file_path: str) -> Document:
|
||||
logger.info("Multimodal enabled, attempting to extract images from DOC")
|
||||
|
||||
docx_content = self._try_convert_doc_to_docx(temp_file_path)
|
||||
if not docx_content:
|
||||
raise RuntimeError("Failed to convert DOC to DOCX")
|
||||
|
||||
if docx_content:
|
||||
logger.info("Successfully converted DOC to DOCX, using DocxParser")
|
||||
# Use existing DocxParser to parse the converted docx
|
||||
docx_parser = DocxParser(
|
||||
file_name=self.file_name,
|
||||
file_type="docx",
|
||||
enable_multimodal=self.enable_multimodal,
|
||||
chunk_size=self.chunk_size,
|
||||
chunk_overlap=self.chunk_overlap,
|
||||
chunking_config=self.chunking_config,
|
||||
separators=self.separators,
|
||||
)
|
||||
text = docx_parser.parse_into_text(docx_content)
|
||||
logger.info(f"Extracted {len(text)} characters using DocxParser")
|
||||
document = super(Docx2Parser, self).parse_into_text(docx_content)
|
||||
logger.info(f"Extracted {len(document.content)} characters using DocxParser")
|
||||
return document
|
||||
|
||||
# Clean up temporary file
|
||||
os.unlink(temp_file_path)
|
||||
logger.info(f"Deleted temporary file: {temp_file_path}")
|
||||
|
||||
return text
|
||||
else:
|
||||
logger.warning(
|
||||
"Failed to convert DOC to DOCX, falling back to text-only extraction"
|
||||
)
|
||||
|
||||
# If image extraction is not needed or conversion failed, try using antiword to extract text
|
||||
try:
|
||||
def _parse_with_antiword(self, temp_file_path: str) -> Document:
|
||||
logger.info("Attempting to parse DOC file with antiword")
|
||||
# Check if antiword is installed
|
||||
antiword_path = self._find_antiword_path()
|
||||
|
||||
if antiword_path:
|
||||
# Check if antiword is installed
|
||||
antiword_path = self._try_find_antiword()
|
||||
if not antiword_path:
|
||||
raise RuntimeError("antiword not found in PATH")
|
||||
|
||||
# Use antiword to extract text directly
|
||||
logger.info(f"Using antiword at {antiword_path} to extract text")
|
||||
process = subprocess.Popen(
|
||||
[antiword_path, temp_file_path],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
stdout, stderr = process.communicate()
|
||||
|
||||
if process.returncode == 0:
|
||||
text = stdout.decode("utf-8", errors="ignore")
|
||||
logger.info(
|
||||
f"Successfully extracted {len(text)} characters using antiword"
|
||||
)
|
||||
|
||||
# Clean up temporary file
|
||||
os.unlink(temp_file_path)
|
||||
logger.info(f"Deleted temporary file: {temp_file_path}")
|
||||
|
||||
return text
|
||||
else:
|
||||
logger.warning(
|
||||
if process.returncode != 0:
|
||||
raise RuntimeError(
|
||||
f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
|
||||
)
|
||||
else:
|
||||
logger.warning("antiword not found, falling back to textract")
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Error using antiword: {str(e)}, falling back to textract"
|
||||
)
|
||||
text = stdout.decode("utf-8", errors="ignore")
|
||||
logger.info(f"Successfully extracted {len(text)} characters using antiword")
|
||||
return Document(content=text)
|
||||
|
||||
# If antiword fails, try using textract
|
||||
logger.info("Parsing DOC file with textract")
|
||||
def _parse_with_textract(self, temp_file_path: str) -> Document:
|
||||
logger.info(f"Parsing DOC file with textract: {temp_file_path}")
|
||||
text = textract.process(temp_file_path, method="antiword").decode("utf-8")
|
||||
logger.info(
|
||||
f"Successfully extracted {len(text)} characters of text from DOC document using textract"
|
||||
)
|
||||
logger.info(f"Successfully extracted {len(text)} bytes of DOC using textract")
|
||||
return Document(content=str(text))
|
||||
|
||||
# Clean up temporary file
|
||||
os.unlink(temp_file_path)
|
||||
logger.info(f"Deleted temporary file: {temp_file_path}")
|
||||
|
||||
return text
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing DOC document: {str(e)}")
|
||||
# Ensure temporary file is cleaned up
|
||||
if os.path.exists(temp_file_path):
|
||||
os.unlink(temp_file_path)
|
||||
logger.info(f"Deleted temporary file after error: {temp_file_path}")
|
||||
return ""
|
||||
|
||||
def _convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
|
||||
def _try_convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
|
||||
"""Convert DOC file to DOCX format
|
||||
|
||||
Uses LibreOffice/OpenOffice for conversion
|
||||
@@ -141,21 +95,16 @@ class DocParser(BaseParser):
|
||||
"""
|
||||
logger.info(f"Converting DOC to DOCX: {doc_path}")
|
||||
|
||||
# Create a temporary directory to store the converted file
|
||||
temp_dir = tempfile.mkdtemp()
|
||||
docx_path = os.path.join(temp_dir, "converted.docx")
|
||||
|
||||
try:
|
||||
# Check if LibreOffice or OpenOffice is installed
|
||||
soffice_path = self._find_soffice_path()
|
||||
soffice_path = self._try_find_soffice()
|
||||
if not soffice_path:
|
||||
logger.error(
|
||||
"LibreOffice/OpenOffice not found, cannot convert DOC to DOCX"
|
||||
)
|
||||
return None
|
||||
|
||||
# Execute conversion command
|
||||
logger.info(f"Using {soffice_path} to convert DOC to DOCX")
|
||||
|
||||
# Create a temporary directory to store the converted file
|
||||
with TempDirContext() as temp_dir:
|
||||
cmd = [
|
||||
soffice_path,
|
||||
"--headless",
|
||||
@@ -165,7 +114,6 @@ class DocParser(BaseParser):
|
||||
temp_dir,
|
||||
doc_path,
|
||||
]
|
||||
|
||||
logger.info(f"Running command: {' '.join(cmd)}")
|
||||
process = subprocess.Popen(
|
||||
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
@@ -173,41 +121,68 @@ class DocParser(BaseParser):
|
||||
stdout, stderr = process.communicate()
|
||||
|
||||
if process.returncode != 0:
|
||||
logger.error(
|
||||
f"Error converting DOC to DOCX: {stderr.decode('utf-8', errors='ignore')}"
|
||||
logger.warning(
|
||||
f"Error converting DOC to DOCX: {stderr.decode('utf-8')}"
|
||||
)
|
||||
return None
|
||||
|
||||
# Find the converted file
|
||||
for file in os.listdir(temp_dir):
|
||||
if file.endswith(".docx"):
|
||||
docx_file = [
|
||||
file for file in os.listdir(temp_dir) if file.endswith(".docx")
|
||||
]
|
||||
logger.info(f"Found {len(docx_file)} DOCX file(s) in temporary directory")
|
||||
for file in docx_file:
|
||||
converted_file = os.path.join(temp_dir, file)
|
||||
logger.info(f"Found converted file: {converted_file}")
|
||||
|
||||
# Read the converted file content
|
||||
with open(converted_file, "rb") as f:
|
||||
docx_content = f.read()
|
||||
|
||||
logger.info(
|
||||
f"Successfully read converted DOCX file, size: {len(docx_content)} bytes"
|
||||
f"Successfully read DOCX file, size: {len(docx_content)}"
|
||||
)
|
||||
return docx_content
|
||||
|
||||
logger.error("No DOCX file found after conversion")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during DOC to DOCX conversion: {str(e)}")
|
||||
return None
|
||||
finally:
|
||||
# Clean up temporary directory
|
||||
try:
|
||||
shutil.rmtree(temp_dir)
|
||||
logger.info(f"Cleaned up temporary directory: {temp_dir}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to clean up temporary directory: {str(e)}")
|
||||
def _try_find_executable_path(
|
||||
self,
|
||||
executable_name: str,
|
||||
possible_path: List[str] = [],
|
||||
environment_variable: List[str] = [],
|
||||
) -> Optional[str]:
|
||||
"""Find executable path
|
||||
Args:
|
||||
executable_name: Executable name
|
||||
possible_path: List of possible paths
|
||||
environment_variable: List of environment variables to check
|
||||
Returns:
|
||||
Executable path, or None if not found
|
||||
"""
|
||||
# Common executable paths
|
||||
paths: List[str] = []
|
||||
paths.extend(possible_path)
|
||||
paths.extend(os.environ.get(env_var, "") for env_var in environment_variable)
|
||||
paths = list(set(paths))
|
||||
|
||||
def _find_soffice_path(self) -> Optional[str]:
|
||||
# Check if path is set in environment variable
|
||||
for path in paths:
|
||||
if os.path.exists(path):
|
||||
logger.info(f"Found {executable_name} at {path}")
|
||||
return path
|
||||
|
||||
# Try to find in PATH
|
||||
result = subprocess.run(
|
||||
["which", executable_name], capture_output=True, text=True
|
||||
)
|
||||
if result.returncode == 0 and result.stdout.strip():
|
||||
path = result.stdout.strip()
|
||||
logger.info(f"Found {executable_name} at {path}")
|
||||
return path
|
||||
|
||||
logger.warning(f"Failed to find {executable_name}")
|
||||
return None
|
||||
|
||||
def _try_find_soffice(self) -> Optional[str]:
|
||||
"""Find LibreOffice/OpenOffice executable path
|
||||
|
||||
Returns:
|
||||
@@ -225,32 +200,13 @@ class DocParser(BaseParser):
|
||||
"C:\\Program Files\\LibreOffice\\program\\soffice.exe",
|
||||
"C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe",
|
||||
]
|
||||
|
||||
# Check if path is set in environment variable
|
||||
if os.environ.get("LIBREOFFICE_PATH"):
|
||||
possible_paths.insert(0, os.environ.get("LIBREOFFICE_PATH"))
|
||||
|
||||
for path in possible_paths:
|
||||
if os.path.exists(path):
|
||||
logger.info(f"Found LibreOffice/OpenOffice at: {path}")
|
||||
return path
|
||||
|
||||
# Try to find in PATH
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["which", "soffice"], capture_output=True, text=True
|
||||
return self._try_find_executable_path(
|
||||
executable_name="soffice",
|
||||
possible_path=possible_paths,
|
||||
environment_variable=["LIBREOFFICE_PATH"],
|
||||
)
|
||||
if result.returncode == 0 and result.stdout.strip():
|
||||
path = result.stdout.strip()
|
||||
logger.info(f"Found LibreOffice/OpenOffice in PATH: {path}")
|
||||
return path
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
logger.warning("LibreOffice/OpenOffice not found")
|
||||
return None
|
||||
|
||||
def _find_antiword_path(self) -> Optional[str]:
|
||||
def _try_find_antiword(self) -> Optional[str]:
|
||||
"""Find antiword executable path
|
||||
|
||||
Returns:
|
||||
@@ -265,51 +221,27 @@ class DocParser(BaseParser):
|
||||
"C:\\Program Files\\Antiword\\antiword.exe",
|
||||
"C:\\Program Files (x86)\\Antiword\\antiword.exe",
|
||||
]
|
||||
|
||||
# Check if path is set in environment variable
|
||||
if os.environ.get("ANTIWORD_PATH"):
|
||||
possible_paths.insert(0, os.environ.get("ANTIWORD_PATH"))
|
||||
|
||||
for path in possible_paths:
|
||||
if os.path.exists(path):
|
||||
logger.info(f"Found antiword at: {path}")
|
||||
return path
|
||||
|
||||
# Try to find in PATH
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["which", "antiword"], capture_output=True, text=True
|
||||
return self._try_find_executable_path(
|
||||
executable_name="antiword",
|
||||
possible_path=possible_paths,
|
||||
environment_variable=["ANTIWORD_PATH"],
|
||||
)
|
||||
if result.returncode == 0 and result.stdout.strip():
|
||||
path = result.stdout.strip()
|
||||
logger.info(f"Found antiword in PATH: {path}")
|
||||
return path
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
logger.warning("antiword not found")
|
||||
return None
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
)
|
||||
logger.info("Running DocParser in standalone mode")
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
file_name = "/path/to/your/test.doc"
|
||||
logger.info(f"Processing file: {file_name}")
|
||||
|
||||
doc_parser = DocParser(
|
||||
file_name, enable_multimodal=True, chunk_size=512, chunk_overlap=60
|
||||
file_name=file_name,
|
||||
enable_multimodal=True,
|
||||
chunk_size=512,
|
||||
chunk_overlap=60,
|
||||
)
|
||||
logger.info("Parser initialized, starting processing")
|
||||
|
||||
with open(file_name, "rb") as f:
|
||||
content = f.read()
|
||||
|
||||
text = doc_parser.parse_into_text(content)
|
||||
logger.info(f"Processing complete, extracted text length: {len(text)}")
|
||||
logger.info(f"Sample text: {text[:200]}...")
|
||||
document = doc_parser.parse_into_text(content)
|
||||
logger.info(f"Processing complete, extracted text length: {len(document.content)}")
|
||||
logger.info(f"Sample text: {document.content[:200]}...")
|
||||
|
||||
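The fallback chain above shells out to external binaries; a small, assumed sketch of pointing it at non-standard install locations through the environment variables the finder helpers check (the module path and file names are placeholders):

import os

from docreader.parser.doc_parser import DocParser

# Optional overrides; otherwise PATH and the built-in candidate lists are searched.
os.environ["LIBREOFFICE_PATH"] = "/opt/libreoffice/program/soffice"
os.environ["ANTIWORD_PATH"] = "/usr/local/bin/antiword"

parser = DocParser(file_name="legacy.doc", enable_multimodal=True)
with open("legacy.doc", "rb") as f:
    document = parser.parse_into_text(f.read())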
new file: docreader/parser/docx2_parser.py (28 lines)
@@ -0,0 +1,28 @@
import logging

from docreader.parser.chain_parser import FirstParser
from docreader.parser.docx_parser import DocxParser
from docreader.parser.markitdown_parser import MarkitdownParser

logger = logging.getLogger(__name__)


class Docx2Parser(FirstParser):
    _parser_cls = (MarkitdownParser, DocxParser)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    your_file = "/path/to/your/file.docx"
    parser = Docx2Parser(separators=[".", "?", "!", "。", "?", "!"])
    with open(your_file, "rb") as f:
        content = f.read()

    document = parser.parse(content)
    for cc in document.chunks:
        logger.info(f"chunk: {cc}")

    # document = parser.parse_into_text(content)
    # logger.info(f"docx content: {document.content}")
    # logger.info(f"find images {document.images.keys()}")
|
||||
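A brief, assumed usage sketch of the fallback order defined above: MarkitdownParser is tried first, and DocxParser only runs when that result is not a valid Document. Keyword arguments are forwarded unchanged to both underlying parsers.

from docreader.parser.docx2_parser import Docx2Parser

parser = Docx2Parser(enable_multimodal=False)  # kwargs assumed to be accepted by BaseParser
with open("sample.docx", "rb") as f:           # placeholder path
    document = parser.parse_into_text(f.read())

# Empty content means both MarkitdownParser and DocxParser failed.
print(len(document.content), list(document.images))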
@@ -1,37 +1,36 @@
|
||||
import logging
|
||||
import tempfile
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from io import BytesIO
|
||||
from typing import Optional, Dict, Any, Tuple, List, Union
|
||||
from dataclasses import dataclass, field
|
||||
from PIL import Image
|
||||
from docx import Document
|
||||
from docx.image.exceptions import (
|
||||
UnrecognizedImageError,
|
||||
UnexpectedEndOfFileError,
|
||||
InvalidImageStreamError,
|
||||
)
|
||||
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
|
||||
import re
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
from dataclasses import dataclass, field
|
||||
from io import BytesIO
|
||||
from multiprocessing import Manager
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from .base_parser import BaseParser
|
||||
from docx import Document
|
||||
from docx.image.exceptions import (
|
||||
InvalidImageStreamError,
|
||||
UnexpectedEndOfFileError,
|
||||
UnrecognizedImageError,
|
||||
)
|
||||
from PIL import Image
|
||||
|
||||
from docreader.models.document import Document as DocumentModel
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
# Add thread local storage to track the processing status of each thread
|
||||
thread_local = threading.local()
|
||||
|
||||
|
||||
class ImageData:
|
||||
"""Represents a processed image of document content"""
|
||||
|
||||
local_path: str = ""
|
||||
object: Image.Image = None
|
||||
object: Optional[Image.Image] = None
|
||||
url: str = ""
|
||||
|
||||
|
||||
@@ -40,7 +39,9 @@ class LineData:
|
||||
"""Represents a processed line of document content with associated images"""
|
||||
|
||||
text: str = "" # Extracted text content
|
||||
images: List[ImageData] = field(default_factory=list) # List of images or image paths
|
||||
images: List[ImageData] = field(
|
||||
default_factory=list
|
||||
) # List of images or image paths
|
||||
extra_info: str = "" # Placeholder for additional info (currently unused)
|
||||
page_num: int = 0 # Page number
|
||||
content_sequence: List[Tuple[str, Any]] = field(
|
||||
@@ -53,18 +54,8 @@ class DocxParser(BaseParser):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_name: str = "",
|
||||
file_type: str = None,
|
||||
enable_multimodal: bool = True,
|
||||
chunk_size: int = 1000,
|
||||
chunk_overlap: int = 200,
|
||||
separators: list = ["\n\n", "\n", "。"],
|
||||
ocr_backend: str = "paddle",
|
||||
ocr_config: dict = None,
|
||||
max_image_size: int = 1920,
|
||||
max_concurrent_tasks: int = 5,
|
||||
max_pages: int = 100, # Maximum number of pages to process, default to 50 pages
|
||||
chunking_config=None,
|
||||
max_pages: int = 100, # Maximum number of pages to process
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize DOCX document parser
|
||||
|
||||
@@ -79,37 +70,16 @@ class DocxParser(BaseParser):
|
||||
ocr_config: OCR engine configuration
|
||||
max_image_size: Maximum image size limit
|
||||
max_concurrent_tasks: Maximum number of concurrent tasks
|
||||
max_pages: Maximum number of pages to process, if more than this, only process the first max_pages pages
|
||||
max_pages: Maximum number of pages to process
|
||||
"""
|
||||
super().__init__(
|
||||
file_name=file_name,
|
||||
file_type=file_type,
|
||||
enable_multimodal=enable_multimodal,
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
separators=separators,
|
||||
ocr_backend=ocr_backend,
|
||||
ocr_config=ocr_config,
|
||||
max_image_size=max_image_size,
|
||||
max_concurrent_tasks=max_concurrent_tasks,
|
||||
chunking_config=chunking_config,
|
||||
)
|
||||
super().__init__(**kwargs)
|
||||
self.max_pages = max_pages
|
||||
logger.info(f"DocxParser initialized with max_pages={max_pages}")
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
|
||||
"""Parse DOCX document, extract text content and image Markdown links
|
||||
|
||||
Args:
|
||||
content: DOCX document content
|
||||
|
||||
Returns:
|
||||
Tuple of (parsed_text, image_map) where image_map maps image URLs to Image objects
|
||||
All LineData objects are used internally but not returned directly through this interface
|
||||
"""
|
||||
def parse_into_text(self, content: bytes) -> DocumentModel:
|
||||
"""Parse DOCX document, extract text content and image Markdown links"""
|
||||
logger.info(f"Parsing DOCX document, content size: {len(content)} bytes")
|
||||
logger.info(f"Max pages limit set to: {self.max_pages}")
|
||||
logger.info("Converting DOCX content to sections and tables")
|
||||
|
||||
start_time = time.time()
|
||||
# Use concurrent processing to handle the document
|
||||
@@ -123,7 +93,7 @@ class DocxParser(BaseParser):
|
||||
docx_processor = Docx(
|
||||
max_image_size=self.max_image_size,
|
||||
enable_multimodal=self.enable_multimodal,
|
||||
upload_file=self.upload_file,
|
||||
upload_file=self.storage.upload_file,
|
||||
)
|
||||
all_lines, tables = docx_processor(
|
||||
binary=content,
|
||||
@@ -140,7 +110,7 @@ class DocxParser(BaseParser):
|
||||
section_start_time = time.time()
|
||||
|
||||
text_parts = []
|
||||
image_parts = {}
|
||||
image_parts: Dict[str, str] = {}
|
||||
|
||||
for sec_idx, line in enumerate(all_lines):
|
||||
try:
|
||||
@@ -154,8 +124,11 @@ class DocxParser(BaseParser):
|
||||
)
|
||||
if line.images:
|
||||
for image_data in line.images:
|
||||
if image_data.url:
|
||||
image_parts[image_data.url] = image_data.object
|
||||
if image_data.url and image_data.object:
|
||||
image_parts[image_data.url] = endecode.decode_image(
|
||||
image_data.object
|
||||
)
|
||||
image_data.object.close()
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing section {sec_idx + 1}: {str(e)}")
|
||||
logger.error(f"Detailed stack trace: {traceback.format_exc()}")
|
||||
@@ -176,17 +149,17 @@ class DocxParser(BaseParser):
|
||||
|
||||
total_processing_time = time.time() - start_time
|
||||
logger.info(
|
||||
f"Parsing complete in {total_processing_time:.2f}s, generated {len(text)} characters of text "
|
||||
f"Parsing complete in {total_processing_time:.2f}s, "
|
||||
f"generated {len(text)} characters of text"
|
||||
)
|
||||
|
||||
return text, image_parts
|
||||
return DocumentModel(content=text, images=image_parts)
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing DOCX document: {str(e)}")
|
||||
logger.error(f"Detailed stack trace: {traceback.format_exc()}")
|
||||
fallback_text = self._parse_using_simple_method(content)
|
||||
return fallback_text, {}
|
||||
return self._parse_using_simple_method(content)
|
||||
|
||||
def _parse_using_simple_method(self, content: bytes) -> str:
|
||||
def _parse_using_simple_method(self, content: bytes) -> DocumentModel:
|
||||
"""Parse document using a simplified method, as a fallback
|
||||
|
||||
Args:
|
||||
@@ -201,7 +174,8 @@ class DocxParser(BaseParser):
|
||||
doc = Document(BytesIO(content))
|
||||
logger.info(
|
||||
f"Successfully loaded document in simplified method, "
|
||||
f"contains {len(doc.paragraphs)} paragraphs and {len(doc.tables)} tables"
|
||||
f"contains {len(doc.paragraphs)} paragraphs "
|
||||
f"and {len(doc.tables)} tables"
|
||||
)
|
||||
text_parts = []
|
||||
|
||||
@@ -256,25 +230,24 @@ class DocxParser(BaseParser):
|
||||
# If the result is still empty, return an error message
|
||||
if not result_text:
|
||||
logger.warning("No text extracted using simplified method")
|
||||
return "", {}
|
||||
return DocumentModel()
|
||||
|
||||
return result_text, {}
|
||||
return DocumentModel(content=result_text)
|
||||
except Exception as backup_error:
|
||||
processing_time = time.time() - start_time
|
||||
logger.error(
|
||||
f"Simplified parsing failed after {processing_time:.2f}s: {str(backup_error)}"
|
||||
f"Simplified parsing failed {processing_time:.2f}s: {backup_error}"
|
||||
)
|
||||
logger.error(f"Detailed traceback: {traceback.format_exc()}")
|
||||
return "", {}
|
||||
return DocumentModel()
|
||||
|
||||
|
||||
class Docx:
|
||||
def __init__(self, max_image_size=1920, enable_multimodal=False, upload_file=None):
|
||||
logger.info("Initializing DOCX processor")
|
||||
self.max_image_size = max_image_size # Maximum image size limit
|
||||
self.picture_cache = (
|
||||
{}
|
||||
) # Image cache to avoid processing the same image repeatedly
|
||||
# Image cache to avoid processing the same image repeatedly
|
||||
self.picture_cache = {}
|
||||
self.enable_multimodal = enable_multimodal
|
||||
self.upload_file = upload_file
|
||||
|
||||
@@ -454,7 +427,6 @@ class Docx:
|
||||
|
||||
return page_to_paragraphs
|
||||
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
binary: Optional[bytes] = None,
|
||||
@@ -611,7 +583,6 @@ class Docx:
|
||||
|
||||
return pages_to_process
|
||||
|
||||
|
||||
def _process_document(
|
||||
self,
|
||||
binary,
|
||||
@@ -806,7 +777,9 @@ class Docx:
|
||||
# Collect temporary image paths for later cleanup
|
||||
for line in page_lines:
|
||||
for image_data in line.images:
|
||||
if image_data.local_path and image_data.local_path.startswith("/tmp/docx_img_"):
|
||||
if image_data.local_path and image_data.local_path.startswith(
|
||||
"/tmp/docx_img_"
|
||||
):
|
||||
temp_img_paths.add(image_data.local_path)
|
||||
|
||||
results.extend(page_lines)
|
||||
@@ -876,7 +849,11 @@ class Docx:
|
||||
|
||||
# Process all image data objects
|
||||
for image_data in image_paths:
|
||||
if image_data.local_path and os.path.exists(image_data.local_path) and image_data.local_path not in image_url_map:
|
||||
if (
|
||||
image_data.local_path
|
||||
and os.path.exists(image_data.local_path)
|
||||
and image_data.local_path not in image_url_map
|
||||
):
|
||||
try:
|
||||
# Upload the image if it doesn't have a URL yet
|
||||
if not image_data.url:
|
||||
@@ -886,12 +863,16 @@ class Docx:
|
||||
image_data.url = image_url
|
||||
# Add image URL as Markdown format
|
||||
markdown_image = f""
|
||||
image_url_map[image_data.local_path] = markdown_image
|
||||
image_url_map[image_data.local_path] = (
|
||||
markdown_image
|
||||
)
|
||||
logger.info(
|
||||
f"Added image URL for {image_data.local_path}: {image_url}"
|
||||
)
|
||||
else:
|
||||
logger.warning(f"Failed to upload image: {image_data.local_path}")
|
||||
logger.warning(
|
||||
f"Failed to upload image: {image_data.local_path}"
|
||||
)
|
||||
else:
|
||||
# Already has a URL, use it
|
||||
markdown_image = f""
|
||||
@@ -925,12 +906,19 @@ class Docx:
|
||||
# For ImageData objects, use the URL
|
||||
if isinstance(content, str) and content in image_url_map:
|
||||
combined_parts.append(image_url_map[content])
|
||||
elif hasattr(content, 'local_path') and content.local_path in image_url_map:
|
||||
elif (
|
||||
hasattr(content, "local_path")
|
||||
and content.local_path in image_url_map
|
||||
):
|
||||
combined_parts.append(image_url_map[content.local_path])
|
||||
|
||||
# Create the final text with proper ordering
|
||||
final_text = "\n\n".join(part for part in combined_parts if part)
|
||||
processed_lines.append(LineData(text=final_text, page_num=page_num, images=line_data.images))
|
||||
processed_lines.append(
|
||||
LineData(
|
||||
text=final_text, page_num=page_num, images=line_data.images
|
||||
)
|
||||
)
|
||||
else:
|
||||
processed_lines = lines
|
||||
|
||||
@@ -1111,8 +1099,8 @@ def _save_image_to_temp(logger, image, page_num, img_idx):
|
||||
if not image:
|
||||
return None
|
||||
|
||||
import tempfile
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
try:
|
||||
# Create a temporary file
|
||||
@@ -1187,8 +1175,15 @@ def process_page_multiprocess(
|
||||
return []
|
||||
|
||||
# Extract page content
|
||||
combined_text, image_objects, content_sequence = _extract_page_content_in_process(
|
||||
process_logger, doc, page_num, paragraphs, enable_multimodal, max_image_size
|
||||
combined_text, image_objects, content_sequence = (
|
||||
_extract_page_content_in_process(
|
||||
process_logger,
|
||||
doc,
|
||||
page_num,
|
||||
paragraphs,
|
||||
enable_multimodal,
|
||||
max_image_size,
|
||||
)
|
||||
)
|
||||
|
||||
# Process content sequence to maintain order between processes
|
||||
@@ -1199,7 +1194,9 @@ def process_page_multiprocess(
|
||||
if enable_multimodal:
|
||||
# First pass: save all images to temporary files
|
||||
for i, image_object in enumerate(image_objects):
|
||||
img_path = _save_image_to_temp(process_logger, image_object, page_num, i)
|
||||
img_path = _save_image_to_temp(
|
||||
process_logger, image_object, page_num, i
|
||||
)
|
||||
if img_path:
|
||||
# Create ImageData object
|
||||
image_data = ImageData()
|
||||
|
||||
@@ -1,15 +1,13 @@
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
import asyncio
|
||||
from PIL import Image
|
||||
import io
|
||||
from typing import Dict, Any, Tuple, Union
|
||||
from .base_parser import BaseParser, ParseResult
|
||||
import numpy as np
|
||||
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
|
||||
# Set up logger for this module
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
class ImageParser(BaseParser):
|
||||
"""
|
||||
@@ -23,46 +21,24 @@ class ImageParser(BaseParser):
|
||||
4. Returning a combined result with both text and image reference
|
||||
"""
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
"""
|
||||
Parse image content, upload the image and return Markdown reference along with image map.
|
||||
|
||||
Args:
|
||||
content: Raw image data (bytes)
|
||||
|
||||
Returns:
|
||||
Tuple of (markdown_text, image_map) where image_map maps image URLs to PIL Image objects
|
||||
Parse image content into markdown text
|
||||
:param content: bytes content of the image
|
||||
:return: Document object
|
||||
"""
|
||||
logger.info(f"Parsing image content, size: {len(content)} bytes")
|
||||
image_map = {}
|
||||
|
||||
try:
|
||||
# Upload image to storage service
|
||||
logger.info("Uploading image to storage")
|
||||
_, ext = os.path.splitext(self.file_name)
|
||||
image_url = self.upload_bytes(content, file_ext=ext)
|
||||
if not image_url:
|
||||
logger.error("Failed to upload image to storage")
|
||||
return "", {}
|
||||
logger.info(
|
||||
f"Successfully uploaded image, URL: {image_url[:50]}..."
|
||||
if len(image_url) > 50
|
||||
else f"Successfully uploaded image, URL: {image_url}"
|
||||
)
|
||||
# Get file extension
|
||||
ext = os.path.splitext(self.file_name)[1].lower()
|
||||
|
||||
# Upload image to storage
|
||||
image_url = self.storage.upload_bytes(content, file_ext=ext)
|
||||
logger.info(f"Successfully uploaded image, URL: {image_url[:50]}...")
|
||||
|
||||
# Generate markdown text
|
||||
text = f""
|
||||
images = {image_url: base64.b64encode(content).decode()}
|
||||
|
||||
# Create image object and add to map
|
||||
try:
|
||||
from PIL import Image
|
||||
import io
|
||||
image = Image.open(io.BytesIO(content))
|
||||
image_map[image_url] = image
|
||||
logger.info(f"Added image to image_map for URL: {image_url}")
|
||||
except Exception as img_err:
|
||||
logger.error(f"Error creating image object: {str(img_err)}")
|
||||
|
||||
markdown_text = f""
|
||||
return markdown_text, image_map
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing image: {str(e)}")
|
||||
return "", {}
|
||||
return Document(content=text, images=images)
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
import base64
|
||||
import io
|
||||
import logging
|
||||
from typing import Union
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def image_to_base64(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
|
||||
"""Convert image to base64 encoded string
|
||||
|
||||
Args:
|
||||
image: Image file path, bytes, PIL Image object, or numpy array
|
||||
|
||||
Returns:
|
||||
Base64 encoded image string, or empty string if conversion fails
|
||||
"""
|
||||
try:
|
||||
if isinstance(image, str):
|
||||
# It's a file path
|
||||
with open(image, "rb") as image_file:
|
||||
return base64.b64encode(image_file.read()).decode("utf-8")
|
||||
elif isinstance(image, bytes):
|
||||
# It's bytes data
|
||||
return base64.b64encode(image).decode("utf-8")
|
||||
elif isinstance(image, Image.Image):
|
||||
# It's a PIL Image
|
||||
buffer = io.BytesIO()
|
||||
image.save(buffer, format="PNG")
|
||||
return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
elif isinstance(image, np.ndarray):
|
||||
# It's a numpy array
|
||||
pil_image = Image.fromarray(image)
|
||||
buffer = io.BytesIO()
|
||||
pil_image.save(buffer, format="PNG")
|
||||
return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
else:
|
||||
logger.error(f"Unsupported image type: {type(image)}")
|
||||
return ""
|
||||
except Exception as e:
|
||||
logger.error(f"Error converting image to base64: {str(e)}")
|
||||
return ""
|
||||
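The helper removed above is superseded by docreader.utils.endecode, whose implementation is not part of this diff; the stand-ins below only illustrate the conversions the callers rely on (decode_image returns a base64 string, encode_image returns raw bytes) and are not the real module.

import base64
import io

from PIL import Image


def decode_image(image: Image.Image) -> str:
    """Illustrative stand-in: PIL image -> base64 string."""
    buf = io.BytesIO()
    image.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode()


def encode_image(b64_str: str, errors: str = "strict") -> bytes:
    """Illustrative stand-in: base64 string -> raw bytes, b'' on failure when errors='ignore'."""
    try:
        return base64.b64decode(b64_str)
    except Exception:
        if errors == "ignore":
            return b""
        raise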
new file: docreader/parser/markdown_image_util.py (111 lines)
@@ -0,0 +1,111 @@
|
||||
import logging
|
||||
import re
|
||||
import uuid
|
||||
from typing import Dict, List, Match, Optional, Tuple
|
||||
|
||||
from docreader.utils import endecode
|
||||
|
||||
# Get logger object
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownImageUtil:
|
||||
def __init__(self):
|
||||
self.b64_pattern = re.compile(
|
||||
r"!\[([^\]]*)\]\(data:image/(\w+);base64,([^\)]+)\)"
|
||||
)
|
||||
self.image_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
|
||||
self.replace_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
|
||||
|
||||
def extract_image(
|
||||
self,
|
||||
content: str,
|
||||
path_prefix: Optional[str] = None,
|
||||
replace: bool = True,
|
||||
) -> Tuple[str, List[str]]:
|
||||
"""Extract base64 encoded images from Markdown content"""
|
||||
|
||||
# image_path => base64 bytes
|
||||
images: List[str] = []
|
||||
|
||||
def repl(match: Match[str]) -> str:
|
||||
title = match.group(1)
|
||||
image_path = match.group(2)
|
||||
if path_prefix:
|
||||
image_path = f"{path_prefix}/{image_path}"
|
||||
|
||||
images.append(image_path)
|
||||
|
||||
if not replace:
|
||||
return match.group(0)
|
||||
|
||||
# Replace image path with URL
|
||||
return f""
|
||||
|
||||
text = self.image_pattern.sub(repl, content)
|
||||
logger.debug(f"Extracted {len(images)} images from markdown")
|
||||
return text, images
|
||||
|
||||
def extract_base64(
|
||||
self,
|
||||
content: str,
|
||||
path_prefix: Optional[str] = None,
|
||||
replace: bool = True,
|
||||
) -> Tuple[str, Dict[str, bytes]]:
|
||||
"""Extract base64 encoded images from Markdown content"""
|
||||
|
||||
# image_path => base64 bytes
|
||||
images: Dict[str, bytes] = {}
|
||||
|
||||
def repl(match: Match[str]) -> str:
|
||||
title = match.group(1)
|
||||
img_ext = match.group(2)
|
||||
img_b64 = match.group(3)
|
||||
|
||||
image_byte = endecode.encode_image(img_b64, errors="ignore")
|
||||
if not image_byte:
|
||||
logger.error(f"Failed to decode base64 image skip it: {img_b64}")
|
||||
return title
|
||||
|
||||
image_path = f"{uuid.uuid4()}.{img_ext}"
|
||||
if path_prefix:
|
||||
image_path = f"{path_prefix}/{image_path}"
|
||||
images[image_path] = image_byte
|
||||
|
||||
if not replace:
|
||||
return match.group(0)
|
||||
|
||||
# Replace image path with URL
|
||||
return f""
|
||||
|
||||
text = self.b64_pattern.sub(repl, content)
|
||||
logger.debug(f"Extracted {len(images)} base64 images from markdown")
|
||||
return text, images
|
||||
|
||||
def replace_path(self, content: str, images: Dict[str, str]) -> str:
|
||||
content_replace: set = set()
|
||||
|
||||
def repl(match: Match[str]) -> str:
|
||||
title = match.group(1)
|
||||
image_path = match.group(2)
|
||||
if image_path not in images:
|
||||
return match.group(0)
|
||||
|
||||
content_replace.add(image_path)
|
||||
image_path = images[image_path]
|
||||
return f""
|
||||
|
||||
text = self.replace_pattern.sub(repl, content)
|
||||
logger.debug(f"Replaced {len(content_replace)} images in markdown")
|
||||
return text
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
your_content = "testtest"
|
||||
image_handle = MarkdownImageUtil()
|
||||
text, images = image_handle.extract_base64(your_content)
|
||||
print(text)
|
||||
|
||||
for image_url, image_byte in images.items():
|
||||
with open(image_url, "wb") as f:
|
||||
f.write(image_byte)
|
||||
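An assumed end-to-end use of the utility above: pull base64 images out of the Markdown, upload them somewhere, then point the references at the new URLs. The uploader is a placeholder.

from docreader.parser.markdown_image_util import MarkdownImageUtil

util = MarkdownImageUtil()
md = "intro ![chart](data:image/png;base64,iVBORw0KGgo=) outro"

text, images = util.extract_base64(md, path_prefix="images")
# `images` maps generated paths such as "images/<uuid>.png" to raw bytes.

uploaded = {path: f"https://cdn.example.com/{path}" for path in images}
final_md = util.replace_path(text, uploaded)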
@@ -1,33 +1,53 @@
|
||||
import asyncio
|
||||
import re
|
||||
import base64
|
||||
import logging
|
||||
import numpy as np
|
||||
import os # Import os module to get environment variables
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any
|
||||
from .base_parser import BaseParser
|
||||
import os
|
||||
from typing import Dict
|
||||
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.parser.chain_parser import PipelineParser
|
||||
from docreader.parser.markdown_image_util import MarkdownImageUtil
|
||||
from docreader.utils import endecode
|
||||
|
||||
# Get logger object
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownParser(BaseParser):
|
||||
"""Markdown document parser"""
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
|
||||
"""Parse Markdown document, only extract text content, do not process images
|
||||
|
||||
Args:
|
||||
content: Markdown document content
|
||||
|
||||
Returns:
|
||||
Parsed text result
|
||||
"""
|
||||
logger.info(f"Parsing Markdown document, content size: {len(content)} bytes")
|
||||
class MarkdownImageBase64(BaseParser):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.image_helper = MarkdownImageUtil()
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
# Convert byte content to string using universal decoding method
|
||||
text = self.decode_bytes(content)
|
||||
logger.info(f"Decoded Markdown content, text length: {len(text)} characters")
|
||||
text = endecode.decode_bytes(content)
|
||||
text, img_b64 = self.image_helper.extract_base64(text, path_prefix="images")
|
||||
|
||||
logger.info(f"Markdown parsing complete, extracted {len(text)} characters of text")
|
||||
return text
|
||||
images: Dict[str, str] = {}
|
||||
image_replace: Dict[str, str] = {}
|
||||
|
||||
logger.debug(f"Uploading {len(img_b64)} images from markdown")
|
||||
for ipath, b64_bytes in img_b64.items():
|
||||
ext = os.path.splitext(ipath)[1].lower()
|
||||
image_url = self.storage.upload_bytes(b64_bytes, ext)
|
||||
|
||||
image_replace[ipath] = image_url
|
||||
images[image_url] = base64.b64encode(b64_bytes).decode()
|
||||
|
||||
text = self.image_helper.replace_path(text, image_replace)
|
||||
return Document(content=text, images=images)
|
||||
|
||||
|
||||
class MarkdownParser(PipelineParser):
|
||||
_parser_cls = (MarkdownImageBase64,)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
your_content = "testtest"
|
||||
parser = MarkdownParser()
|
||||
|
||||
document = parser.parse_into_text(your_content.encode())
|
||||
logger.info(document.content)
|
||||
logger.info(f"Images: {len(document.images)}, name: {document.images.keys()}")
|
||||
|
||||
new file: docreader/parser/markitdown_parser.py (31 lines)
@@ -0,0 +1,31 @@
import io
import logging

from markitdown import MarkItDown

from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownParser

logger = logging.getLogger(__name__)


class StdMarkitdownParser(BaseParser):
    """
    Markitdown-based document parser

    This parser extracts text content from documents supported by the
    markitdown library, keeping embedded images as data URIs.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.markitdown = MarkItDown()

    def parse_into_text(self, content: bytes) -> Document:
        result = self.markitdown.convert(io.BytesIO(content), keep_data_uris=True)
        return Document(content=result.text_content)


class MarkitdownParser(PipelineParser):
    _parser_cls = (StdMarkitdownParser, MarkdownParser)
|
||||
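One way to read the two-stage pipeline above: markitdown converts the raw document to Markdown with embedded data URIs, and the MarkdownParser stage then extracts, uploads, and re-links those images. A rough usage sketch, with the file path as a placeholder:

from docreader.parser.markitdown_parser import MarkitdownParser

parser = MarkitdownParser(enable_multimodal=True)  # kwargs forwarded to both stages
with open("report.docx", "rb") as f:
    document = parser.parse_into_text(f.read())

# Stage 1 (StdMarkitdownParser): bytes -> Markdown, keep_data_uris=True
# Stage 2 (MarkdownParser):      data URIs -> uploaded URLs + Document.images
print(document.content[:200], len(document.images))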
new file: docreader/parser/mineru_parser.py (124 lines)
@@ -0,0 +1,124 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from typing import Dict
|
||||
|
||||
import markdownify
|
||||
import requests
|
||||
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.parser.markdown_parser import MarkdownImageUtil
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MinerUParser(BaseParser):
|
||||
def __init__(
|
||||
self,
|
||||
enable_markdownify: bool = True,
|
||||
mineru_endpoint: str = "",
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.minerU = os.getenv("MINERU_ENDPOINT", mineru_endpoint)
|
||||
self.enable_markdownify = enable_markdownify
|
||||
self.image_helper = MarkdownImageUtil()
|
||||
self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
|
||||
self.enable = self.ping()
|
||||
assert self.ping(), "MinerU API is not reachable"
|
||||
|
||||
def ping(self, timeout: int = 5) -> bool:
|
||||
try:
|
||||
response = requests.get(
|
||||
self.minerU + "/docs", timeout=timeout, allow_redirects=True
|
||||
)
|
||||
response.raise_for_status()
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)")
|
||||
md_content: str = ""
|
||||
images_b64: Dict[str, str] = {}
|
||||
try:
|
||||
response = requests.post(
|
||||
url=self.minerU + "/file_parse",
|
||||
data={
|
||||
"return_md": True,
|
||||
"return_images": True,
|
||||
"lang_list": ["ch", "en"],
|
||||
"table_enable": True,
|
||||
"formula_enable": True,
|
||||
"parse_method": "auto",
|
||||
"start_page_id": 0,
|
||||
"end_page_id": 99999,
|
||||
"backend": "pipeline",
|
||||
"response_format_zip": False,
|
||||
"return_middle_json": False,
|
||||
"return_model_output": False,
|
||||
"return_content_list": False,
|
||||
},
|
||||
files={"files": content},
|
||||
timeout=1000,
|
||||
)
|
||||
response.raise_for_status()
|
||||
result = response.json()["results"]["files"]
|
||||
md_content = result["md_content"]
|
||||
images_b64 = result.get("images", {})
|
||||
except Exception as e:
|
||||
logger.error(f"MinerU parsing failed: {e}", exc_info=True)
|
||||
return Document()
|
||||
|
||||
# convert table(HTML) in markdown to markdown table
|
||||
if self.enable_markdownify:
|
||||
logger.debug("Converting HTML to Markdown")
|
||||
md_content = markdownify.markdownify(md_content)
|
||||
|
||||
images = {}
|
||||
image_replace = {}
|
||||
# Images in images_b64 may not be referenced in md_content
# (e.g. images that only appear inside converted tables),
# so filter out the unused ones.
|
||||
for ipath, b64_str in images_b64.items():
|
||||
if f"images/{ipath}" not in md_content:
|
||||
logger.debug(f"Image {ipath} not used in markdown")
|
||||
continue
|
||||
match = self.base64_pattern.match(b64_str)
|
||||
if match:
|
||||
file_ext = match.group(1)
|
||||
b64_str = match.group(2)
|
||||
|
||||
image_bytes = endecode.encode_image(b64_str, errors="ignore")
|
||||
if not image_bytes:
|
||||
logger.error("Failed to decode base64 image skip it")
|
||||
continue
|
||||
|
||||
image_url = self.storage.upload_bytes(
|
||||
image_bytes, file_ext=f".{file_ext}"
|
||||
)
|
||||
|
||||
images[image_url] = b64_str
|
||||
image_replace[f"images/{ipath}"] = image_url
|
||||
|
||||
logger.info(f"Replaced {len(image_replace)} images in markdown")
|
||||
text = self.image_helper.replace_path(md_content, image_replace)
|
||||
|
||||
logger.info(
|
||||
f"Successfully parsed PDF, text: {len(text)}, images: {len(images)}"
|
||||
)
|
||||
return Document(content=text, images=images)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
your_file = "/path/to/your/file.pdf"
|
||||
your_mineru = "http://host.docker.internal:9987"
|
||||
parser = MinerUParser(mineru_endpoint=your_mineru)
|
||||
with open(your_file, "rb") as f:
|
||||
content = f.read()
|
||||
document = parser.parse_into_text(content)
|
||||
logger.error(document.content)
|
||||
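A minimal sketch of wiring the parser above to a running MinerU service; the endpoint value is an example, and MINERU_ENDPOINT (also exposed through docker-compose) takes precedence over the constructor argument.

import os

from docreader.parser.mineru_parser import MinerUParser

os.environ["MINERU_ENDPOINT"] = "http://host.docker.internal:9987"

parser = MinerUParser(enable_markdownify=True)   # raises if the endpoint is unreachable
if parser.ping():                                # GET <endpoint>/docs as a cheap health check
    with open("scan.pdf", "rb") as f:            # placeholder file
        document = parser.parse_into_text(f.read())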
@@ -1,15 +1,19 @@
|
||||
import os
|
||||
import logging
|
||||
import base64
|
||||
from typing import Optional, Union, Dict, Any
|
||||
from abc import ABC, abstractmethod
|
||||
from PIL import Image
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Union
|
||||
|
||||
import numpy as np
|
||||
from .image_utils import image_to_base64
|
||||
from openai import OpenAI
|
||||
from PIL import Image
|
||||
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
class OCRBackend(ABC):
|
||||
"""Base class for OCR backends"""
|
||||
@@ -26,46 +30,67 @@ class OCRBackend(ABC):
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class DummyOCRBackend(OCRBackend):
|
||||
"""Dummy OCR backend implementation"""
|
||||
|
||||
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
|
||||
logger.warning("Dummy OCR backend is used")
|
||||
return ""
|
||||
|
||||
|
||||
class PaddleOCRBackend(OCRBackend):
|
||||
"""PaddleOCR backend implementation"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
def __init__(self):
|
||||
"""Initialize PaddleOCR backend"""
|
||||
self.ocr = None
|
||||
try:
|
||||
import os
|
||||
import paddle
|
||||
|
||||
# Set PaddlePaddle to use CPU and disable GPU
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = ''
|
||||
paddle.set_device('cpu')
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
||||
paddle.device.set_device("cpu")
|
||||
|
||||
# Try to detect whether the CPU supports the AVX instruction set
|
||||
try:
|
||||
import subprocess
|
||||
import platform
|
||||
|
||||
# Check for AVX support
|
||||
if platform.system() == "Linux":
|
||||
try:
|
||||
result = subprocess.run(['grep', '-o', 'avx', '/proc/cpuinfo'],
|
||||
capture_output=True, text=True, timeout=5)
|
||||
has_avx = 'avx' in result.stdout.lower()
|
||||
result = subprocess.run(
|
||||
["grep", "-o", "avx", "/proc/cpuinfo"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
has_avx = "avx" in result.stdout.lower()
|
||||
if not has_avx:
|
||||
logger.warning("CPU does not support AVX instructions, using compatibility mode")
|
||||
logger.warning(
|
||||
"CPU does not support AVX instructions, "
|
||||
"using compatibility mode"
|
||||
)
|
||||
# Further restrict instruction set usage
|
||||
os.environ['FLAGS_use_avx2'] = '0'
|
||||
os.environ['FLAGS_use_avx'] = '1'
|
||||
except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError):
|
||||
logger.warning("Could not detect AVX support, using compatibility mode")
|
||||
os.environ['FLAGS_use_avx2'] = '0'
|
||||
os.environ['FLAGS_use_avx'] = '1'
|
||||
os.environ["FLAGS_use_avx2"] = "0"
|
||||
os.environ["FLAGS_use_avx"] = "1"
|
||||
except (
|
||||
subprocess.TimeoutExpired,
|
||||
FileNotFoundError,
|
||||
subprocess.SubprocessError,
|
||||
):
|
||||
logger.warning(
|
||||
"Could not detect AVX support, using compatibility mode"
|
||||
)
|
||||
os.environ["FLAGS_use_avx2"] = "0"
|
||||
os.environ["FLAGS_use_avx"] = "1"
|
||||
except Exception as e:
|
||||
logger.warning(f"Error detecting CPU capabilities: {e}, using compatibility mode")
|
||||
os.environ['FLAGS_use_avx2'] = '0'
|
||||
os.environ['FLAGS_use_avx'] = '1'
|
||||
logger.warning(
|
||||
f"Error detecting CPU capabilities: {e}, using compatibility mode"
|
||||
)
|
||||
os.environ["FLAGS_use_avx2"] = "0"
|
||||
os.environ["FLAGS_use_avx"] = "1"
|
||||
|
||||
from paddleocr import PaddleOCR
|
||||
|
||||
# OCR configuration with text orientation classification enabled
|
||||
ocr_config = {
|
||||
"use_gpu": False,
|
||||
@@ -91,18 +116,48 @@ class PaddleOCRBackend(OCRBackend):
|
||||
logger.info("PaddleOCR engine initialized successfully")
|
||||
|
||||
except ImportError as e:
|
||||
logger.error(f"Failed to import paddleocr: {str(e)}. Please install it with 'pip install paddleocr'")
|
||||
logger.error(
|
||||
f"Failed to import paddleocr: {str(e)}. "
|
||||
"Please install it with 'pip install paddleocr'"
|
||||
)
|
||||
except OSError as e:
|
||||
if "Illegal instruction" in str(e) or "core dumped" in str(e):
|
||||
logger.error(f"PaddlePaddle crashed due to CPU instruction set incompatibility: {str(e)}")
|
||||
logger.error("This usually happens when the CPU doesn't support AVX instructions.")
|
||||
logger.error("Please try installing a CPU-only version of PaddlePaddle or use a different OCR backend.")
|
||||
logger.error(
|
||||
f"PaddlePaddle crashed due to CPU instruction set incompatibility:"
|
||||
f"{e}"
|
||||
)
|
||||
logger.error(
|
||||
"This happens when the CPU doesn't support AVX instructions. "
|
||||
"Try install CPU-only version of PaddlePaddle, "
|
||||
"or use a different OCR backend."
|
||||
)
|
||||
else:
|
||||
logger.error(f"Failed to initialize PaddleOCR due to OS error: {str(e)}")
|
||||
logger.error(
|
||||
f"Failed to initialize PaddleOCR due to OS error: {str(e)}"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize PaddleOCR: {str(e)}")
|
||||
|
||||
def predict(self, image):
|
||||
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
|
||||
"""Extract text from an image
|
||||
|
||||
Args:
|
||||
image: Image file path, bytes, or PIL Image object
|
||||
|
||||
Returns:
|
||||
Extracted text
|
||||
"""
|
||||
if isinstance(image, str):
|
||||
image = Image.open(image)
|
||||
elif isinstance(image, bytes):
|
||||
image = Image.open(io.BytesIO(image))
|
||||
|
||||
if not isinstance(image, Image.Image):
|
||||
raise TypeError("image must be a string, bytes, or PIL Image object")
|
||||
|
||||
return self._predict(image)
|
||||
|
||||
def _predict(self, image: Image.Image) -> str:
|
||||
"""Perform OCR recognition on the image
|
||||
|
||||
Args:
|
||||
@@ -111,16 +166,16 @@ class PaddleOCRBackend(OCRBackend):
|
||||
Returns:
|
||||
Extracted text string
|
||||
"""
|
||||
if self.ocr is None:
|
||||
logger.error("PaddleOCR engine not initialized")
|
||||
return ""
|
||||
try:
|
||||
# Ensure image is in RGB format
|
||||
if hasattr(image, "convert") and image.mode != "RGB":
|
||||
if image.mode != "RGB":
|
||||
image = image.convert("RGB")
|
||||
|
||||
# Convert to numpy array if needed
|
||||
if hasattr(image, "convert"):
|
||||
image_array = np.array(image)
|
||||
else:
|
||||
image_array = image
|
||||
|
||||
# Perform OCR
|
||||
ocr_result = self.ocr.ocr(image_array, cls=False)
|
||||
@@ -128,28 +183,25 @@ class PaddleOCRBackend(OCRBackend):
|
||||
# Extract text
|
||||
ocr_text = ""
|
||||
if ocr_result and ocr_result[0]:
|
||||
for line in ocr_result[0]:
|
||||
if line and len(line) >= 2:
|
||||
text = line[1][0] if line[1] else ""
|
||||
if text:
|
||||
ocr_text += text + " "
|
||||
text = [
|
||||
line[1][0] if line and len(line) >= 2 and line[1] else ""
|
||||
for line in ocr_result[0]
|
||||
]
|
||||
text = [t.strip() for t in text if t]
|
||||
ocr_text = " ".join(text)
|
||||
|
||||
text_length = len(ocr_text.strip())
|
||||
if text_length > 0:
|
||||
logger.info(f"OCR extracted {text_length} characters")
|
||||
return ocr_text.strip()
|
||||
else:
|
||||
logger.warning("OCR returned empty result")
|
||||
return ""
|
||||
logger.info(f"OCR extracted {len(ocr_text)} characters")
|
||||
return ocr_text
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"OCR recognition error: {str(e)}")
|
||||
return ""
|
||||
|
||||
|
||||
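For reference, the normalisation in predict() above means any of the three accepted input forms ends up as an RGB PIL image before OCR; a short usage sketch with a placeholder image path:

from PIL import Image

from docreader.parser.ocr_engine import PaddleOCRBackend  # module path assumed from the surrounding imports

ocr = PaddleOCRBackend()

text_from_path = ocr.predict("page.png")                 # file path
with open("page.png", "rb") as f:
    text_from_bytes = ocr.predict(f.read())              # raw bytes
text_from_image = ocr.predict(Image.open("page.png"))    # PIL image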
class NanonetsOCRBackend(OCRBackend):
|
||||
"""Nanonets OCR backend implementation using OpenAI API format"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
def __init__(self):
|
||||
"""Initialize Nanonets OCR backend
|
||||
|
||||
Args:
|
||||
@@ -157,17 +209,16 @@ class NanonetsOCRBackend(OCRBackend):
|
||||
base_url: Base URL for OpenAI API
|
||||
model: Model name
|
||||
"""
|
||||
try:
|
||||
from openai import OpenAI
|
||||
self.api_key = kwargs.get("api_key", "123")
|
||||
self.base_url = kwargs.get("base_url", "http://localhost:8000/v1")
|
||||
self.model = kwargs.get("model", "nanonets/Nanonets-OCR-s")
|
||||
self.temperature = kwargs.get("temperature", 0.0)
|
||||
self.max_tokens = kwargs.get("max_tokens", 15000)
|
||||
base_url = os.getenv("OCR_API_BASE_URL", "http://localhost:8000/v1")
|
||||
api_key = os.getenv("OCR_API_KEY", "123")
|
||||
timeout = 30
|
||||
self.client = OpenAI(api_key=api_key, base_url=base_url, timeout=timeout)
|
||||
|
||||
self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
|
||||
self.prompt = """
|
||||
## 任务说明
|
||||
self.model = os.getenv("OCR_MODEL", "nanonets/Nanonets-OCR-s")
|
||||
logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
|
||||
self.temperature = 0.0
|
||||
self.max_tokens = 15000
|
||||
self.prompt = """## 任务说明
|
||||
|
||||
请从上传的文档中提取文字内容,严格按自然阅读顺序(从上到下,从左到右)输出,并遵循以下格式规范。
|
||||
|
||||
@@ -192,13 +243,6 @@ class NanonetsOCRBackend(OCRBackend):
|
||||
|
||||
* 不要猜测或补全不确定的链接地址。
|
||||
"""
|
||||
logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
|
||||
except ImportError:
|
||||
logger.error("Failed to import openai. Please install it with 'pip install openai'")
|
||||
self.client = None
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize Nanonets OCR: {str(e)}")
|
||||
self.client = None
|
||||
|
||||
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
|
||||
"""Extract text from an image using Nanonets OCR
|
||||
@@ -215,7 +259,7 @@ class NanonetsOCRBackend(OCRBackend):
|
||||
|
||||
try:
|
||||
# Encode image to base64
|
||||
img_base64 = image_to_base64(image)
|
||||
img_base64 = endecode.decode_image(image)
|
||||
if not img_base64:
|
||||
return ""
|
||||
|
||||
@@ -229,7 +273,9 @@ class NanonetsOCRBackend(OCRBackend):
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{img_base64}"},
|
||||
"image_url": {
|
||||
"url": f"data:image/png;base64,{img_base64}"
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
@@ -239,21 +285,21 @@ class NanonetsOCRBackend(OCRBackend):
|
||||
}
|
||||
],
|
||||
temperature=self.temperature,
|
||||
max_tokens=self.max_tokens
|
||||
max_tokens=self.max_tokens,
|
||||
)
|
||||
|
||||
return response.choices[0].message.content
|
||||
return response.choices[0].message.content or ""
|
||||
except Exception as e:
|
||||
logger.error(f"Nanonets OCR prediction error: {str(e)}")
|
||||
return ""
|
||||
|
||||
|
||||
class OCREngine:
|
||||
"""OCR Engine factory class"""
|
||||
|
||||
_instance = None
|
||||
_instance: Dict[str, OCRBackend] = {}
|
||||
|
||||
@classmethod
|
||||
def get_instance(cls, backend_type="paddle", **kwargs) -> Optional[OCRBackend]:
|
||||
def get_instance(cls, backend_type: str) -> OCRBackend:
|
||||
"""Get OCR engine instance
|
||||
|
||||
Args:
|
||||
@@ -263,16 +309,19 @@ class OCREngine:
|
||||
Returns:
|
||||
OCR engine instance or None if initialization fails
|
||||
"""
|
||||
if cls._instance is None:
|
||||
backend_type = backend_type.lower()
|
||||
if cls._instance.get(backend_type):
|
||||
return cls._instance[backend_type]
|
||||
|
||||
logger.info(f"Initializing OCR engine with backend: {backend_type}")
|
||||
|
||||
if backend_type.lower() == "paddle":
|
||||
cls._instance = PaddleOCRBackend(**kwargs)
|
||||
elif backend_type.lower() == "nanonets":
|
||||
cls._instance = NanonetsOCRBackend(**kwargs)
|
||||
if backend_type == "paddle":
|
||||
cls._instance[backend_type] = PaddleOCRBackend()
|
||||
|
||||
elif backend_type == "nanonets":
|
||||
cls._instance[backend_type] = NanonetsOCRBackend()
|
||||
|
||||
else:
|
||||
logger.error(f"Unknown OCR backend type: {backend_type}")
|
||||
return None
|
||||
|
||||
return cls._instance
|
||||
cls._instance[backend_type] = DummyOCRBackend()
|
||||
|
||||
return cls._instance[backend_type]
|
||||
|
||||
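A minimal usage sketch of the backend registry above; the sample image path is a placeholder, and unknown backend types fall back to the dummy backend:

from docreader.parser.ocr_engine import OCREngine

engine = OCREngine.get_instance("paddle")   # instances are cached per backend type; "nanonets" also works
text = engine.predict("sample_page.png")    # accepts a file path, raw bytes, or a PIL Image
print(text)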
@@ -1,30 +1,19 @@
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, Any, Optional, Type
|
||||
from typing import Dict, Type
|
||||
|
||||
from .base_parser import BaseParser, ParseResult
|
||||
from .docx_parser import DocxParser
|
||||
from .doc_parser import DocParser
|
||||
from .pdf_parser import PDFParser
|
||||
from .markdown_parser import MarkdownParser
|
||||
from .text_parser import TextParser
|
||||
from .image_parser import ImageParser
|
||||
from .web_parser import WebParser
|
||||
from .config import ChunkingConfig
|
||||
import traceback
|
||||
from docreader.models.document import Document
|
||||
from docreader.models.read_config import ChunkingConfig
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.parser.doc_parser import DocParser
|
||||
from docreader.parser.docx2_parser import Docx2Parser
|
||||
from docreader.parser.image_parser import ImageParser
|
||||
from docreader.parser.markdown_parser import MarkdownParser
|
||||
from docreader.parser.pdf_parser import PDFParser
|
||||
from docreader.parser.text_parser import TextParser
|
||||
from docreader.parser.web_parser import WebParser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@dataclass
|
||||
class Chunk:
|
||||
"""
|
||||
Represents a single text chunk with associated metadata.
|
||||
Basic unit for document processing and embedding.
|
||||
"""
|
||||
|
||||
content: str # Text content of the chunk
|
||||
metadata: Dict[str, Any] = None # Associated metadata (source, page number, etc.)
|
||||
|
||||
|
||||
class Parser:
|
||||
"""
|
||||
@@ -33,10 +22,9 @@ class Parser:
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
logger.info("Initializing document parser")
|
||||
# Initialize all parser types
|
||||
self.parsers: Dict[str, Type[BaseParser]] = {
|
||||
"docx": DocxParser,
|
||||
"docx": Docx2Parser,
|
||||
"doc": DocParser,
|
||||
"pdf": PDFParser,
|
||||
"md": MarkdownParser,
|
||||
@@ -56,8 +44,7 @@ class Parser:
|
||||
", ".join(self.parsers.keys()),
|
||||
)
|
||||
|
||||
|
||||
def get_parser(self, file_type: str) -> Optional[Type[BaseParser]]:
|
||||
def get_parser(self, file_type: str) -> Type[BaseParser]:
|
||||
"""
|
||||
Get parser class for the specified file type.
|
||||
|
||||
@@ -67,12 +54,9 @@ class Parser:
|
||||
Returns:
|
||||
Parser class for the file type, or None if unsupported
|
||||
"""
|
||||
file_type = file_type.lower()
|
||||
parser = self.parsers.get(file_type)
|
||||
if parser:
|
||||
logger.info(f"Found parser for file type: {file_type}")
|
||||
else:
|
||||
logger.warning(f"No parser found for file type: {file_type}")
|
||||
parser = self.parsers.get(file_type.lower())
|
||||
if not parser:
|
||||
raise ValueError(f"Unsupported file type: {file_type}")
|
||||
return parser
|
||||
|
||||
def parse_file(
|
||||
@@ -81,7 +65,7 @@ class Parser:
|
||||
file_type: str,
|
||||
content: bytes,
|
||||
config: ChunkingConfig,
|
||||
) -> Optional[ParseResult]:
|
||||
) -> Document:
|
||||
"""
|
||||
Parse file content using appropriate parser based on file type.
|
||||
|
||||
@@ -96,22 +80,17 @@ class Parser:
|
||||
"""
|
||||
logger.info(f"Parsing file: {file_name} with type: {file_type}")
|
||||
logger.info(
|
||||
f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, "
|
||||
f"Chunking config: size={config.chunk_size}, "
|
||||
f"overlap={config.chunk_overlap}, "
|
||||
f"multimodal={config.enable_multimodal}"
|
||||
)
|
||||
|
||||
parser_instance = None
|
||||
|
||||
try:
|
||||
# Get appropriate parser for file type
|
||||
cls = self.get_parser(file_type)
|
||||
if cls is None:
|
||||
logger.error(f"Unsupported file type: {file_type}")
|
||||
return None
|
||||
|
||||
# Parse file content
|
||||
logger.info(f"Creating parser instance for {file_type} file")
|
||||
parser_instance = cls(
|
||||
parser = cls(
|
||||
file_name=file_name,
|
||||
file_type=file_type,
|
||||
chunk_size=config.chunk_size,
|
||||
@@ -124,32 +103,18 @@ class Parser:
|
||||
)
|
||||
|
||||
logger.info(f"Starting to parse file content, size: {len(content)} bytes")
|
||||
result = parser_instance.parse(content)
|
||||
result = parser.parse(content)
|
||||
|
||||
if result:
|
||||
logger.info(
|
||||
f"Successfully parsed file {file_name}, generated {len(result.chunks)} chunks"
|
||||
)
|
||||
if result.chunks and len(result.chunks) > 0:
|
||||
logger.info(
|
||||
f"First chunk content length: {len(result.chunks[0].content)}"
|
||||
)
|
||||
else:
|
||||
if not result.content:
|
||||
logger.warning(f"Parser returned empty content for file: {file_name}")
|
||||
elif not result.chunks:
|
||||
logger.warning(f"Parser returned empty chunks for file: {file_name}")
|
||||
else:
|
||||
logger.warning(f"Parser returned None result for file: {file_name}")
|
||||
|
||||
# Return parse results
|
||||
elif result.chunks[0]:
|
||||
logger.info(f"First chunk content length: {len(result.chunks[0].content)}")
|
||||
logger.info(f"Parsed file {file_name}, with {len(result.chunks)} chunks")
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing file {file_name}: {str(e)}")
|
||||
logger.info(f"Detailed traceback: {traceback.format_exc()}")
|
||||
return None
|
||||
|
||||
def parse_url(
|
||||
self, url: str, title: str, config: ChunkingConfig
|
||||
) -> Optional[ParseResult]:
|
||||
def parse_url(self, url: str, title: str, config: ChunkingConfig) -> Document:
|
||||
"""
|
||||
Parse content from a URL using the WebParser.
|
||||
|
||||
@@ -163,16 +128,13 @@ class Parser:
|
||||
"""
|
||||
logger.info(f"Parsing URL: {url}, title: {title}")
|
||||
logger.info(
|
||||
f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, "
|
||||
f"multimodal={config.enable_multimodal}"
|
||||
f"Chunking config: size={config.chunk_size}, "
|
||||
f"overlap={config.chunk_overlap}, multimodal={config.enable_multimodal}"
|
||||
)
|
||||
|
||||
parser_instance = None
|
||||
|
||||
try:
|
||||
# Create web parser instance
|
||||
logger.info("Creating WebParser instance")
|
||||
parser_instance = WebParser(
|
||||
parser = WebParser(
|
||||
title=title,
|
||||
chunk_size=config.chunk_size,
|
||||
chunk_overlap=config.chunk_overlap,
|
||||
@@ -183,24 +145,14 @@ class Parser:
|
||||
chunking_config=config,
|
||||
)
|
||||
|
||||
logger.info(f"Starting to parse URL content")
|
||||
result = parser_instance.parse(url)
|
||||
logger.info("Starting to parse URL content")
|
||||
result = parser.parse(url.encode())
|
||||
|
||||
if result:
|
||||
logger.info(
|
||||
f"Successfully parsed URL, generated {len(result.chunks)} chunks"
|
||||
)
|
||||
logger.info(
|
||||
f"First chunk content length: {len(result.chunks[0].content) if result.chunks else 0}"
|
||||
)
|
||||
else:
|
||||
logger.warning(f"Parser returned empty result for URL: {url}")
|
||||
|
||||
# Return parse results
|
||||
if not result.content:
|
||||
logger.warning(f"Parser returned empty content for url: {url}")
|
||||
elif not result.chunks:
|
||||
logger.warning(f"Parser returned empty chunks for url: {url}")
|
||||
elif result.chunks[0]:
|
||||
logger.info(f"First chunk content length: {len(result.chunks[0].content)}")
|
||||
logger.info(f"Parsed url {url}, with {len(result.chunks)} chunks")
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing URL {url}: {str(e)}")
|
||||
logger.info(f"Detailed traceback: {traceback.format_exc()}")
|
||||
return None
|
||||
|
||||
|
||||
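A rough usage sketch of the refactored Parser; the file name, file bytes, and the ChunkingConfig keyword arguments are assumptions for illustration only:

from docreader.models.read_config import ChunkingConfig
from docreader.parser import Parser

parser = Parser()
config = ChunkingConfig(chunk_size=512, chunk_overlap=100, enable_multimodal=False)
with open("report.pdf", "rb") as f:
    document = parser.parse_file("report.pdf", "pdf", f.read(), config)
for chunk in document.chunks:
    print(len(chunk.content))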
@@ -1,113 +1,7 @@
|
||||
import logging
|
||||
import os
|
||||
import io
|
||||
from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union
|
||||
|
||||
import pdfplumber
|
||||
import tempfile
|
||||
from .base_parser import BaseParser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class PDFParser(BaseParser):
|
||||
"""
|
||||
PDF Document Parser
|
||||
|
||||
This parser handles PDF documents by extracting text content.
|
||||
It uses the pypdf library for simple text extraction.
|
||||
"""
|
||||
def _convert_table_to_markdown(self, table_data: list) -> str:
|
||||
|
||||
if not table_data or not table_data[0]: return ""
|
||||
def clean_cell(cell):
|
||||
if cell is None: return ""
|
||||
return str(cell).replace("\n", " <br> ")
|
||||
try:
|
||||
markdown = ""
|
||||
header = [clean_cell(cell) for cell in table_data[0]]
|
||||
markdown += "| " + " | ".join(header) + " |\n"
|
||||
markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
|
||||
for row in table_data[1:]:
|
||||
if not row: continue
|
||||
body_row = [clean_cell(cell) for cell in row]
|
||||
if len(body_row) != len(header):
|
||||
logger.warning(f"Skipping malformed table row: {body_row}")
|
||||
continue
|
||||
markdown += "| " + " | ".join(body_row) + " |\n"
|
||||
return markdown
|
||||
except Exception as e:
|
||||
logger.error(f"Error converting table to markdown: {e}")
|
||||
return ""
|
||||
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
|
||||
|
||||
logger.info(f"Parsing PDF with pdfplumber, content size: {len(content)} bytes")
|
||||
|
||||
all_page_content = []
|
||||
from docreader.parser.chain_parser import FirstParser
|
||||
from docreader.parser.markitdown_parser import MarkitdownParser
|
||||
from docreader.parser.mineru_parser import MinerUParser
|
||||
|
||||
|
||||
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
|
||||
temp_pdf_path = temp_pdf.name
|
||||
|
||||
try:
|
||||
temp_pdf.write(content)
|
||||
temp_pdf.close()
|
||||
logger.info(f"PDF content written to temporary file: {temp_pdf_path}")
|
||||
|
||||
with pdfplumber.open(temp_pdf_path) as pdf:
|
||||
logger.info(f"PDF has {len(pdf.pages)} pages")
|
||||
|
||||
for page_num, page in enumerate(pdf.pages):
|
||||
page_content_parts = []
|
||||
|
||||
# Try-fallback strategy for table detection
|
||||
default_settings = { "vertical_strategy": "lines", "horizontal_strategy": "lines" }
|
||||
found_tables = page.find_tables(default_settings)
|
||||
if not found_tables:
|
||||
logger.info(f"Page {page_num+1}: Default strategy found no tables. Trying fallback strategy.")
|
||||
fallback_settings = { "vertical_strategy": "text", "horizontal_strategy": "lines" }
|
||||
found_tables = page.find_tables(fallback_settings)
|
||||
|
||||
table_bboxes = [table.bbox for table in found_tables]
|
||||
# Define a filter function that keeps objects NOT inside any table bbox.
|
||||
def not_within_bboxes(obj):
|
||||
"""Check if an object is outside all table bounding boxes."""
|
||||
for bbox in table_bboxes:
|
||||
# Check if the object's vertical center is within a bbox
|
||||
if bbox[1] <= (obj["top"] + obj["bottom"]) / 2 <= bbox[3]:
|
||||
return False # It's inside a table, so we DON'T keep it
|
||||
return True # It's outside all tables, so we DO keep it
|
||||
|
||||
# Build a filtered view of the page that contains only the non-table text.
|
||||
non_table_page = page.filter(not_within_bboxes)
|
||||
|
||||
# Now, extract text from this filtered page view.
|
||||
text = non_table_page.extract_text(x_tolerance=2)
|
||||
if text:
|
||||
page_content_parts.append(text)
|
||||
|
||||
# Process and append the structured Markdown tables
|
||||
if found_tables:
|
||||
logger.info(f"Found {len(found_tables)} tables on page {page_num + 1}")
|
||||
for table in found_tables:
|
||||
markdown_table = self._convert_table_to_markdown(table.extract())
|
||||
page_content_parts.append(f"\n\n{markdown_table}\n\n")
|
||||
|
||||
|
||||
all_page_content.append("".join(page_content_parts))
|
||||
|
||||
final_text = "\n\n--- Page Break ---\n\n".join(all_page_content)
|
||||
logger.info(f"PDF parsing complete. Extracted {len(final_text)} text chars.")
|
||||
|
||||
return final_text
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to parse PDF document: {str(e)}")
|
||||
return ""
|
||||
finally:
|
||||
# This block is GUARANTEED to execute, preventing resource leaks.
|
||||
if os.path.exists(temp_pdf_path):
|
||||
try:
|
||||
os.remove(temp_pdf_path)
|
||||
logger.info(f"Temporary file cleaned up: {temp_pdf_path}")
|
||||
except OSError as e:
|
||||
logger.error(f"Error removing temporary file {temp_pdf_path}: {e}")
|
||||
class PDFParser(FirstParser):
|
||||
_parser_cls = (MinerUParser, MarkitdownParser)
|
||||
|
||||
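FirstParser itself is not shown in this hunk; conceptually the new PDFParser tries MinerU first and falls back to Markitdown. A standalone sketch of that "first parser wins" idea (names and signatures here are illustrative, not the repo's actual interface):

from typing import Callable, Sequence


def parse_with_first(parsers: Sequence[Callable[[bytes], str]], content: bytes) -> str:
    """Try each parser in order and return the first non-empty result."""
    for parse in parsers:
        try:
            text = parse(content)
        except Exception:
            continue  # a failing backend should not break the chain
        if text:
            return text
    return ""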
@@ -1,17 +1,18 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import uuid
|
||||
import logging
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import traceback
|
||||
import uuid
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Tuple, Optional
|
||||
from typing import Dict
|
||||
|
||||
from qcloud_cos import CosConfig, CosS3Client
|
||||
from minio import Minio
|
||||
from qcloud_cos import CosConfig, CosS3Client
|
||||
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
class Storage(ABC):
|
||||
@@ -53,12 +54,15 @@ class CosStorage(Storage):
|
||||
storage_config: Storage configuration
|
||||
"""
|
||||
self.storage_config = storage_config
|
||||
self.client, self.bucket_name, self.region, self.prefix = self._init_cos_client()
|
||||
self.client, self.bucket_name, self.region, self.prefix = (
|
||||
self._init_cos_client()
|
||||
)
|
||||
|
||||
def _init_cos_client(self):
|
||||
"""Initialize Tencent Cloud COS client"""
|
||||
try:
|
||||
# Use provided COS config if available, otherwise fall back to environment variables
|
||||
# Use provided COS config if available,
|
||||
# otherwise fall back to environment variables
|
||||
if self.storage_config and self.storage_config.get("access_key_id") != "":
|
||||
cos_config = self.storage_config
|
||||
secret_id = cos_config.get("access_key_id")
|
||||
@@ -82,8 +86,9 @@ class CosStorage(Storage):
|
||||
|
||||
if not all([secret_id, secret_key, region, bucket_name, appid]):
|
||||
logger.error(
|
||||
"Incomplete COS configuration, missing required environment variables"
|
||||
f"secret_id: {secret_id}, secret_key: {secret_key}, region: {region}, bucket_name: {bucket_name}, appid: {appid}"
|
||||
"Incomplete COS configuration, missing environment variables"
|
||||
f"secret_id: {secret_id}, secret_key: {secret_key}, "
|
||||
f"region: {region}, bucket_name: {bucket_name}, appid: {appid}"
|
||||
)
|
||||
return None, None, None, None
|
||||
|
||||
@@ -119,7 +124,6 @@ class CosStorage(Storage):
|
||||
"""
|
||||
return f"https://{bucket_name}.cos.{region}.myqcloud.com/{object_key}"
|
||||
|
||||
|
||||
def upload_file(self, file_path: str) -> str:
|
||||
"""Upload file to Tencent Cloud COS
|
||||
|
||||
@@ -135,16 +139,16 @@ class CosStorage(Storage):
|
||||
return ""
|
||||
|
||||
# Generate object key, use UUID to avoid conflicts
|
||||
file_name = os.path.basename(file_path)
|
||||
object_key = (
|
||||
f"{self.prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
|
||||
)
|
||||
file_ext = os.path.splitext(file_path)[1]
|
||||
object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
|
||||
logger.info(f"Generated object key: {object_key}")
|
||||
|
||||
# Upload file
|
||||
logger.info("Attempting to upload file to COS")
|
||||
response = self.client.upload_file(
|
||||
Bucket=self.bucket_name, LocalFilePath=file_path, Key=object_key
|
||||
self.client.upload_file(
|
||||
Bucket=self.bucket_name,
|
||||
LocalFilePath=file_path,
|
||||
Key=object_key,
|
||||
)
|
||||
|
||||
# Get file URL
|
||||
@@ -172,9 +176,15 @@ class CosStorage(Storage):
|
||||
if not self.client:
|
||||
return ""
|
||||
|
||||
object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.prefix else f"images/{uuid.uuid4().hex}{file_ext}"
|
||||
object_key = (
|
||||
f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
|
||||
if self.prefix
|
||||
else f"images/{uuid.uuid4().hex}{file_ext}"
|
||||
)
|
||||
logger.info(f"Generated object key: {object_key}")
|
||||
self.client.put_object(Bucket=self.bucket_name, Body=content, Key=object_key)
|
||||
self.client.put_object(
|
||||
Bucket=self.bucket_name, Body=content, Key=object_key
|
||||
)
|
||||
file_url = self._get_download_url(self.bucket_name, self.region, object_key)
|
||||
logger.info(f"Successfully uploaded bytes to COS: {file_url}")
|
||||
return file_url
|
||||
@@ -194,7 +204,9 @@ class MinioStorage(Storage):
|
||||
storage_config: Storage configuration
|
||||
"""
|
||||
self.storage_config = storage_config
|
||||
self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = self._init_minio_client()
|
||||
self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = (
|
||||
self._init_minio_client()
|
||||
)
|
||||
|
||||
def _init_minio_client(self):
|
||||
"""Initialize MinIO client from environment variables or injected config.
|
||||
@@ -203,32 +215,39 @@ class MinioStorage(Storage):
|
||||
prefer those values to override envs.
|
||||
"""
|
||||
try:
|
||||
endpoint = os.getenv("MINIO_ENDPOINT")
|
||||
endpoint = os.getenv("MINIO_ENDPOINT", "")
|
||||
use_ssl = os.getenv("MINIO_USE_SSL", "false").lower() == "true"
|
||||
if self.storage_config and self.storage_config.get("bucket_name"):
|
||||
storage_config = self.storage_config
|
||||
bucket_name = storage_config.get("bucket_name")
|
||||
bucket_name = storage_config.get("bucket_name", "")
|
||||
path_prefix = storage_config.get("path_prefix").strip().strip("/")
|
||||
access_key = storage_config.get("access_key_id")
|
||||
secret_key = storage_config.get("secret_access_key")
|
||||
else:
|
||||
access_key = os.getenv("MINIO_ACCESS_KEY_ID")
|
||||
secret_key = os.getenv("MINIO_SECRET_ACCESS_KEY")
|
||||
bucket_name = os.getenv("MINIO_BUCKET_NAME")
|
||||
bucket_name = os.getenv("MINIO_BUCKET_NAME", "")
|
||||
path_prefix = os.getenv("MINIO_PATH_PREFIX", "").strip().strip("/")
|
||||
|
||||
if not all([endpoint, access_key, secret_key, bucket_name]):
|
||||
logger.error("Incomplete MinIO configuration, missing required environment variables")
|
||||
logger.error(
|
||||
"Incomplete MinIO configuration, missing environment variables"
|
||||
)
|
||||
return None, None, None, None, None
|
||||
|
||||
# Initialize client
|
||||
client = Minio(endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl)
|
||||
client = Minio(
|
||||
endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl
|
||||
)
|
||||
|
||||
# Ensure bucket exists
|
||||
found = client.bucket_exists(bucket_name)
|
||||
if not found:
|
||||
client.make_bucket(bucket_name)
|
||||
policy = '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}' % (bucket_name, bucket_name)
|
||||
policy = (
|
||||
'{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}'
|
||||
% (bucket_name, bucket_name)
|
||||
)
|
||||
client.set_bucket_policy(bucket_name, policy)
|
||||
|
||||
return client, bucket_name, use_ssl, endpoint, path_prefix
|
||||
@@ -236,18 +255,22 @@ class MinioStorage(Storage):
|
||||
logger.error(f"Failed to initialize MinIO client: {str(e)}")
|
||||
return None, None, None, None, None
|
||||
|
||||
def _get_download_url(self, bucket_name: str, object_key: str, use_ssl: bool, endpoint: str, public_endpoint: str = None):
|
||||
def _get_download_url(self, object_key: str):
|
||||
"""Construct a public URL for MinIO object.
|
||||
|
||||
If MINIO_PUBLIC_ENDPOINT is provided, use it; otherwise fallback to endpoint.
|
||||
"""
|
||||
if public_endpoint:
|
||||
base = public_endpoint
|
||||
else:
|
||||
scheme = "https" if use_ssl else "http"
|
||||
base = f"{scheme}://{endpoint}"
|
||||
# Path-style URL for MinIO
|
||||
return f"{base}/{bucket_name}/{object_key}"
|
||||
# 1. Use public endpoint if provided
|
||||
endpoint = os.getenv("MINIO_PUBLIC_ENDPOINT")
|
||||
if endpoint:
|
||||
return f"{endpoint}/{self.bucket_name}/{object_key}"
|
||||
|
||||
# 2. Use SSL if enabled
|
||||
if self.use_ssl:
|
||||
return f"https://{self.endpoint}/{self.bucket_name}/{object_key}"
|
||||
|
||||
# 3. Use HTTP default
|
||||
return f"http://{self.endpoint}/{self.bucket_name}/{object_key}"
|
||||
|
||||
def upload_file(self, file_path: str) -> str:
|
||||
"""Upload file to MinIO
|
||||
@@ -265,29 +288,27 @@ class MinioStorage(Storage):
|
||||
|
||||
# Generate object key, use UUID to avoid conflicts
|
||||
file_name = os.path.basename(file_path)
|
||||
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" if self.path_prefix else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
|
||||
object_key = (
|
||||
f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
|
||||
if self.path_prefix
|
||||
else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
|
||||
)
|
||||
logger.info(f"Generated MinIO object key: {object_key}")
|
||||
|
||||
# Upload file
|
||||
logger.info("Attempting to upload file to MinIO")
|
||||
with open(file_path, 'rb') as file_data:
|
||||
with open(file_path, "rb") as file_data:
|
||||
file_size = os.path.getsize(file_path)
|
||||
self.client.put_object(
|
||||
bucket_name=self.bucket_name,
|
||||
bucket_name=self.bucket_name or "",
|
||||
object_name=object_key,
|
||||
data=file_data,
|
||||
length=file_size,
|
||||
content_type='application/octet-stream'
|
||||
content_type="application/octet-stream",
|
||||
)
|
||||
|
||||
# Get file URL
|
||||
file_url = self._get_download_url(
|
||||
self.bucket_name,
|
||||
object_key,
|
||||
self.use_ssl,
|
||||
self.endpoint,
|
||||
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
|
||||
)
|
||||
file_url = self._get_download_url(object_key)
|
||||
|
||||
logger.info(f"Successfully uploaded file to MinIO: {file_url}")
|
||||
return file_url
|
||||
@@ -311,22 +332,20 @@ class MinioStorage(Storage):
|
||||
if not self.client:
|
||||
return ""
|
||||
|
||||
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.path_prefix else f"images/{uuid.uuid4().hex}{file_ext}"
|
||||
object_key = (
|
||||
f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}"
|
||||
if self.path_prefix
|
||||
else f"images/{uuid.uuid4().hex}{file_ext}"
|
||||
)
|
||||
logger.info(f"Generated MinIO object key: {object_key}")
|
||||
self.client.put_object(
|
||||
self.bucket_name,
|
||||
self.bucket_name or "",
|
||||
object_key,
|
||||
data=io.BytesIO(content),
|
||||
length=len(content),
|
||||
content_type="application/octet-stream"
|
||||
)
|
||||
file_url = self._get_download_url(
|
||||
self.bucket_name,
|
||||
object_key,
|
||||
self.use_ssl,
|
||||
self.endpoint,
|
||||
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
|
||||
content_type="application/octet-stream",
|
||||
)
|
||||
file_url = self._get_download_url(object_key)
|
||||
logger.info(f"Successfully uploaded bytes to MinIO: {file_url}")
|
||||
return file_url
|
||||
except Exception as e:
|
||||
@@ -335,7 +354,41 @@ class MinioStorage(Storage):
|
||||
return ""
|
||||
|
||||
|
||||
def create_storage(storage_config=None) -> Storage:
|
||||
class LocalStorage(Storage):
|
||||
"""Local file system storage implementation"""
|
||||
|
||||
def __init__(self, storage_config: Dict[str, str] = {}):
|
||||
self.storage_config = storage_config
|
||||
base_dir = storage_config.get(
|
||||
"base_dir", os.getenv("LOCAL_STORAGE_BASE_DIR", "")
|
||||
)
|
||||
self.image_dir = os.path.join(base_dir, "images")
|
||||
os.makedirs(self.image_dir, exist_ok=True)
|
||||
|
||||
def upload_file(self, file_path: str) -> str:
|
||||
logger.info(f"Uploading file to local storage: {file_path}")
|
||||
return file_path
|
||||
|
||||
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
|
||||
logger.info(f"Uploading file to local storage: {len(content)} bytes")
|
||||
fname = os.path.join(self.image_dir, f"{uuid.uuid4()}{file_ext}")
|
||||
with open(fname, "wb") as f:
|
||||
f.write(content)
|
||||
return fname
|
||||
|
||||
|
||||
class Base64Storage(Storage):
|
||||
def upload_file(self, file_path: str) -> str:
|
||||
logger.info(f"Uploading file to base64 storage: {file_path}")
|
||||
return file_path
|
||||
|
||||
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
|
||||
logger.info(f"Uploading file to base64 storage: {len(content)} bytes")
|
||||
file_ext = file_ext.lstrip(".")
|
||||
return f"data:image/{file_ext};base64,{endecode.decode_image(content)}"
|
||||
|
||||
|
||||
def create_storage(storage_config: Dict[str, str] | None = None) -> Storage:
|
||||
"""Create a storage instance based on configuration or environment variables
|
||||
|
||||
Args:
|
||||
@@ -345,16 +398,17 @@ def create_storage(storage_config=None) -> Storage:
|
||||
Storage instance
|
||||
"""
|
||||
storage_type = os.getenv("STORAGE_TYPE", "cos").lower()
|
||||
|
||||
if storage_config:
|
||||
storage_type = str(storage_config.get("provider", storage_type)).lower()
|
||||
|
||||
logger.info(f"Creating {storage_type} storage instance")
|
||||
|
||||
if storage_type == "minio":
|
||||
return MinioStorage(storage_config)
|
||||
elif storage_type == "cos":
|
||||
# Default to COS
|
||||
return CosStorage(storage_config)
|
||||
else:
|
||||
return None
|
||||
elif storage_type == "local":
|
||||
return LocalStorage(storage_config or {})
|
||||
elif storage_type == "base64":
|
||||
return Base64Storage()
|
||||
|
||||
raise ValueError(f"Invalid storage type: {storage_type}")
|
||||
|
||||
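A quick usage sketch of the storage factory above; the import path and environment values are illustrative only:

import os

from docreader.parser.storage import create_storage  # import path assumed

os.environ.setdefault("STORAGE_TYPE", "local")
os.environ.setdefault("LOCAL_STORAGE_BASE_DIR", "/tmp/docreader")

storage = create_storage()                  # or create_storage({"provider": "minio", ...})
url = storage.upload_bytes(b"fake image bytes", ".png")
print(url)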
@@ -1,6 +1,8 @@
|
||||
import logging
|
||||
from .base_parser import BaseParser
|
||||
from typing import Dict, Any, Tuple, Union
|
||||
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -11,7 +13,7 @@ class TextParser(BaseParser):
|
||||
This parser handles text extraction and chunking from plain text documents.
|
||||
"""
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
"""
|
||||
Parse text document content by decoding bytes to string.
|
||||
|
||||
@@ -25,20 +27,15 @@ class TextParser(BaseParser):
|
||||
Parsed text content as string
|
||||
"""
|
||||
logger.info(f"Parsing text document, content size: {len(content)} bytes")
|
||||
text = self.decode_bytes(content)
|
||||
text = endecode.decode_bytes(content)
|
||||
logger.info(
|
||||
f"Successfully parsed text document, extracted {len(text)} characters"
|
||||
)
|
||||
return text
|
||||
return Document(content=text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
)
|
||||
logger.info("Running TextParser in standalone mode")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Sample text for testing
|
||||
text = """## 标题1
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
from typing import Any, Optional, Tuple, Dict, Union
|
||||
import os
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
from bs4 import BeautifulSoup
|
||||
from .base_parser import BaseParser, ParseResult
|
||||
import logging
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -59,7 +62,7 @@ class WebParser(BaseParser):
|
||||
# Return empty BeautifulSoup object on error
|
||||
return BeautifulSoup("", "html.parser")
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
"""Parse web page
|
||||
|
||||
Args:
|
||||
@@ -78,10 +81,10 @@ class WebParser(BaseParser):
|
||||
# Run async method
|
||||
# Handle content possibly being a string
|
||||
if isinstance(content, bytes):
|
||||
url = self.decode_bytes(content)
|
||||
url = endecode.decode_bytes(content)
|
||||
logger.info(f"Decoded URL from bytes: {url}")
|
||||
else:
|
||||
url = content
|
||||
url = str(content)
|
||||
logger.info(f"Using content as URL directly: {url}")
|
||||
|
||||
logger.info(f"Scraping web page: {url}")
|
||||
@@ -118,11 +121,11 @@ class WebParser(BaseParser):
|
||||
logger.info(
|
||||
f"Web page parsing complete, total content: {len(result)} characters"
|
||||
)
|
||||
return result
|
||||
return Document(content=result)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing web page: {str(e)}")
|
||||
return f"Error parsing web page: {str(e)}"
|
||||
return Document(content=f"Error parsing web page: {str(e)}")
|
||||
|
||||
finally:
|
||||
# Close event loop
|
||||
|
||||
127
docreader/proto/docreader_pb2.pyi
Normal file
@@ -0,0 +1,127 @@
|
||||
from google.protobuf.internal import containers as _containers
|
||||
from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import message as _message
|
||||
from collections.abc import Iterable as _Iterable, Mapping as _Mapping
|
||||
from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
|
||||
|
||||
DESCRIPTOR: _descriptor.FileDescriptor
|
||||
|
||||
class StorageProvider(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
|
||||
__slots__ = ()
|
||||
STORAGE_PROVIDER_UNSPECIFIED: _ClassVar[StorageProvider]
|
||||
COS: _ClassVar[StorageProvider]
|
||||
MINIO: _ClassVar[StorageProvider]
|
||||
STORAGE_PROVIDER_UNSPECIFIED: StorageProvider
|
||||
COS: StorageProvider
|
||||
MINIO: StorageProvider
|
||||
|
||||
class StorageConfig(_message.Message):
|
||||
__slots__ = ("provider", "region", "bucket_name", "access_key_id", "secret_access_key", "app_id", "path_prefix")
|
||||
PROVIDER_FIELD_NUMBER: _ClassVar[int]
|
||||
REGION_FIELD_NUMBER: _ClassVar[int]
|
||||
BUCKET_NAME_FIELD_NUMBER: _ClassVar[int]
|
||||
ACCESS_KEY_ID_FIELD_NUMBER: _ClassVar[int]
|
||||
SECRET_ACCESS_KEY_FIELD_NUMBER: _ClassVar[int]
|
||||
APP_ID_FIELD_NUMBER: _ClassVar[int]
|
||||
PATH_PREFIX_FIELD_NUMBER: _ClassVar[int]
|
||||
provider: StorageProvider
|
||||
region: str
|
||||
bucket_name: str
|
||||
access_key_id: str
|
||||
secret_access_key: str
|
||||
app_id: str
|
||||
path_prefix: str
|
||||
def __init__(self, provider: _Optional[_Union[StorageProvider, str]] = ..., region: _Optional[str] = ..., bucket_name: _Optional[str] = ..., access_key_id: _Optional[str] = ..., secret_access_key: _Optional[str] = ..., app_id: _Optional[str] = ..., path_prefix: _Optional[str] = ...) -> None: ...
|
||||
|
||||
class VLMConfig(_message.Message):
|
||||
__slots__ = ("model_name", "base_url", "api_key", "interface_type")
|
||||
MODEL_NAME_FIELD_NUMBER: _ClassVar[int]
|
||||
BASE_URL_FIELD_NUMBER: _ClassVar[int]
|
||||
API_KEY_FIELD_NUMBER: _ClassVar[int]
|
||||
INTERFACE_TYPE_FIELD_NUMBER: _ClassVar[int]
|
||||
model_name: str
|
||||
base_url: str
|
||||
api_key: str
|
||||
interface_type: str
|
||||
def __init__(self, model_name: _Optional[str] = ..., base_url: _Optional[str] = ..., api_key: _Optional[str] = ..., interface_type: _Optional[str] = ...) -> None: ...
|
||||
|
||||
class ReadConfig(_message.Message):
|
||||
__slots__ = ("chunk_size", "chunk_overlap", "separators", "enable_multimodal", "storage_config", "vlm_config")
|
||||
CHUNK_SIZE_FIELD_NUMBER: _ClassVar[int]
|
||||
CHUNK_OVERLAP_FIELD_NUMBER: _ClassVar[int]
|
||||
SEPARATORS_FIELD_NUMBER: _ClassVar[int]
|
||||
ENABLE_MULTIMODAL_FIELD_NUMBER: _ClassVar[int]
|
||||
STORAGE_CONFIG_FIELD_NUMBER: _ClassVar[int]
|
||||
VLM_CONFIG_FIELD_NUMBER: _ClassVar[int]
|
||||
chunk_size: int
|
||||
chunk_overlap: int
|
||||
separators: _containers.RepeatedScalarFieldContainer[str]
|
||||
enable_multimodal: bool
|
||||
storage_config: StorageConfig
|
||||
vlm_config: VLMConfig
|
||||
def __init__(self, chunk_size: _Optional[int] = ..., chunk_overlap: _Optional[int] = ..., separators: _Optional[_Iterable[str]] = ..., enable_multimodal: bool = ..., storage_config: _Optional[_Union[StorageConfig, _Mapping]] = ..., vlm_config: _Optional[_Union[VLMConfig, _Mapping]] = ...) -> None: ...
|
||||
|
||||
class ReadFromFileRequest(_message.Message):
|
||||
__slots__ = ("file_content", "file_name", "file_type", "read_config", "request_id")
|
||||
FILE_CONTENT_FIELD_NUMBER: _ClassVar[int]
|
||||
FILE_NAME_FIELD_NUMBER: _ClassVar[int]
|
||||
FILE_TYPE_FIELD_NUMBER: _ClassVar[int]
|
||||
READ_CONFIG_FIELD_NUMBER: _ClassVar[int]
|
||||
REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
|
||||
file_content: bytes
|
||||
file_name: str
|
||||
file_type: str
|
||||
read_config: ReadConfig
|
||||
request_id: str
|
||||
def __init__(self, file_content: _Optional[bytes] = ..., file_name: _Optional[str] = ..., file_type: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ...
|
||||
|
||||
class ReadFromURLRequest(_message.Message):
|
||||
__slots__ = ("url", "title", "read_config", "request_id")
|
||||
URL_FIELD_NUMBER: _ClassVar[int]
|
||||
TITLE_FIELD_NUMBER: _ClassVar[int]
|
||||
READ_CONFIG_FIELD_NUMBER: _ClassVar[int]
|
||||
REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
|
||||
url: str
|
||||
title: str
|
||||
read_config: ReadConfig
|
||||
request_id: str
|
||||
def __init__(self, url: _Optional[str] = ..., title: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ...
|
||||
|
||||
class Image(_message.Message):
|
||||
__slots__ = ("url", "caption", "ocr_text", "original_url", "start", "end")
|
||||
URL_FIELD_NUMBER: _ClassVar[int]
|
||||
CAPTION_FIELD_NUMBER: _ClassVar[int]
|
||||
OCR_TEXT_FIELD_NUMBER: _ClassVar[int]
|
||||
ORIGINAL_URL_FIELD_NUMBER: _ClassVar[int]
|
||||
START_FIELD_NUMBER: _ClassVar[int]
|
||||
END_FIELD_NUMBER: _ClassVar[int]
|
||||
url: str
|
||||
caption: str
|
||||
ocr_text: str
|
||||
original_url: str
|
||||
start: int
|
||||
end: int
|
||||
def __init__(self, url: _Optional[str] = ..., caption: _Optional[str] = ..., ocr_text: _Optional[str] = ..., original_url: _Optional[str] = ..., start: _Optional[int] = ..., end: _Optional[int] = ...) -> None: ...
|
||||
|
||||
class Chunk(_message.Message):
|
||||
__slots__ = ("content", "seq", "start", "end", "images")
|
||||
CONTENT_FIELD_NUMBER: _ClassVar[int]
|
||||
SEQ_FIELD_NUMBER: _ClassVar[int]
|
||||
START_FIELD_NUMBER: _ClassVar[int]
|
||||
END_FIELD_NUMBER: _ClassVar[int]
|
||||
IMAGES_FIELD_NUMBER: _ClassVar[int]
|
||||
content: str
|
||||
seq: int
|
||||
start: int
|
||||
end: int
|
||||
images: _containers.RepeatedCompositeFieldContainer[Image]
|
||||
def __init__(self, content: _Optional[str] = ..., seq: _Optional[int] = ..., start: _Optional[int] = ..., end: _Optional[int] = ..., images: _Optional[_Iterable[_Union[Image, _Mapping]]] = ...) -> None: ...
|
||||
|
||||
class ReadResponse(_message.Message):
|
||||
__slots__ = ("chunks", "error")
|
||||
CHUNKS_FIELD_NUMBER: _ClassVar[int]
|
||||
ERROR_FIELD_NUMBER: _ClassVar[int]
|
||||
chunks: _containers.RepeatedCompositeFieldContainer[Chunk]
|
||||
error: str
|
||||
def __init__(self, chunks: _Optional[_Iterable[_Union[Chunk, _Mapping]]] = ..., error: _Optional[str] = ...) -> None: ...
|
||||
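The stub above mirrors docreader.proto; for reference, a hedged example of assembling a request from these generated messages (all field values are placeholders):

from docreader.proto.docreader_pb2 import ReadConfig, ReadFromFileRequest, StorageConfig

request = ReadFromFileRequest(
    file_name="report.pdf",
    file_type="pdf",
    file_content=b"%PDF-1.7 ...",
    request_id="demo-request",
    read_config=ReadConfig(
        chunk_size=512,
        chunk_overlap=100,
        separators=["\n"],
        enable_multimodal=False,
        storage_config=StorageConfig(bucket_name="docs", path_prefix="demo"),
    ),
)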
@@ -3,7 +3,7 @@
|
||||
import grpc
|
||||
import warnings
|
||||
|
||||
from . import docreader_pb2 as docreader__pb2
|
||||
import docreader_pb2 as docreader__pb2
|
||||
|
||||
GRPC_GENERATED_VERSION = '1.76.0'
|
||||
GRPC_VERSION = grpc.__version__
|
||||
|
||||
@@ -16,6 +16,7 @@ dependencies = [
|
||||
"lxml>=6.0.2",
|
||||
"markdown>=3.10",
|
||||
"markdownify>=1.2.0",
|
||||
"markitdown[docx,pdf,xls,xlsx]>=0.1.3",
|
||||
"minio>=7.2.18",
|
||||
"mistletoe>=1.5.0",
|
||||
"ollama>=0.6.0",
|
||||
@@ -26,6 +27,7 @@ dependencies = [
|
||||
"pillow>=12.0.0",
|
||||
"playwright>=1.55.0",
|
||||
"protobuf>=6.33.0",
|
||||
"pydantic>=2.12.3",
|
||||
"pypdf>=6.1.3",
|
||||
"pypdf2>=3.0.1",
|
||||
"python-docx>=1.2.0",
|
||||
|
||||
@@ -2,13 +2,14 @@
|
||||
set -x
|
||||
|
||||
# 设置目录
|
||||
PROTO_DIR="proto"
|
||||
PYTHON_OUT="proto"
|
||||
GO_OUT="proto"
|
||||
PROTO_DIR="docreader/proto"
|
||||
PYTHON_OUT="docreader/proto"
|
||||
GO_OUT="docreader/proto"
|
||||
|
||||
# 生成Python代码
|
||||
python3 -m grpc_tools.protoc -I${PROTO_DIR} \
|
||||
--python_out=${PYTHON_OUT} \
|
||||
--pyi_out=${PYTHON_OUT} \
|
||||
--grpc_python_out=${PYTHON_OUT} \
|
||||
${PROTO_DIR}/docreader.proto
|
||||
|
||||
@@ -22,10 +23,10 @@ protoc -I${PROTO_DIR} --go_out=${GO_OUT} \
|
||||
# 修复Python导入问题(MacOS兼容版本)
|
||||
if [ "$(uname)" == "Darwin" ]; then
|
||||
# MacOS版本
|
||||
sed -i '' 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
|
||||
sed -i '' 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
|
||||
else
|
||||
# Linux版本
|
||||
sed -i 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
|
||||
sed -i 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
|
||||
fi
|
||||
|
||||
echo "Proto files generated successfully!"
|
||||
112
docreader/splitter/header_hook.py
Normal file
@@ -0,0 +1,112 @@
|
||||
import re
|
||||
from typing import Callable, Dict, List, Match, Pattern, Union
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class HeaderTrackerHook(BaseModel):
|
||||
"""表头追踪Hook的配置类,支持多种场景的表头识别"""
|
||||
|
||||
start_pattern: Pattern[str] = Field(
|
||||
description="表头开始匹配(正则表达式或字符串)"
|
||||
)
|
||||
end_pattern: Pattern[str] = Field(description="表头结束匹配(正则表达式或字符串)")
|
||||
extract_header_fn: Callable[[Match[str]], str] = Field(
|
||||
default=lambda m: m.group(0),
|
||||
description="从开始匹配结果中提取表头内容的函数(默认取匹配到的整个内容)",
|
||||
)
|
||||
priority: int = Field(default=0, description="优先级(多个配置时,高优先级先匹配)")
|
||||
case_sensitive: bool = Field(
|
||||
default=True, description="是否大小写敏感(仅当传入字符串pattern时生效)"
|
||||
)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
start_pattern: Union[str, Pattern[str]],
|
||||
end_pattern: Union[str, Pattern[str]],
|
||||
**kwargs,
|
||||
):
|
||||
flags = 0 if kwargs.get("case_sensitive", True) else re.IGNORECASE
|
||||
if isinstance(start_pattern, str):
|
||||
start_pattern = re.compile(start_pattern, flags | re.DOTALL)
|
||||
if isinstance(end_pattern, str):
|
||||
end_pattern = re.compile(end_pattern, flags | re.DOTALL)
|
||||
super().__init__(
|
||||
start_pattern=start_pattern,
|
||||
end_pattern=end_pattern,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
# 初始化表头Hook配置(提供默认配置:支持Markdown表格、代码块)
|
||||
DEFAULT_CONFIGS = [
|
||||
# 代码块配置(```开头,```结尾)
|
||||
# HeaderTrackerHook(
|
||||
# # 代码块开始(支持语言指定)
|
||||
# start_pattern=r"^\s*```(\w+).*(?!```$)",
|
||||
# # 代码块结束
|
||||
# end_pattern=r"^\s*```.*$",
|
||||
# extract_header_fn=lambda m: f"```{m.group(1)}" if m.group(1) else "```",
|
||||
# priority=20, # 代码块优先级高于表格
|
||||
# case_sensitive=True,
|
||||
# ),
|
||||
# Markdown表格配置(表头带下划线)
|
||||
HeaderTrackerHook(
|
||||
# 表头行 + 分隔行
|
||||
start_pattern=r"^\s*(?:\|[^|\n]*)+[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|?[\r\n]+$",
|
||||
# 空行或非表格内容
|
||||
end_pattern=r"^\s*$|^\s*[^|\s].*$",
|
||||
priority=15,
|
||||
case_sensitive=False,
|
||||
),
|
||||
]
|
||||
DEFAULT_CONFIGS.sort(key=lambda x: -x.priority)
|
||||
|
||||
|
||||
# 定义Hook状态数据结构
|
||||
class HeaderTracker(BaseModel):
|
||||
"""表头追踪 Hook 的状态类"""
|
||||
|
||||
header_hook_configs: List[HeaderTrackerHook] = Field(default=DEFAULT_CONFIGS)
|
||||
active_headers: Dict[int, str] = Field(default_factory=dict)
|
||||
ended_headers: set[int] = Field(default_factory=set)
|
||||
|
||||
def update(self, split: str) -> Dict[int, str]:
|
||||
"""检测当前split中的表头开始/结束,更新Hook状态"""
|
||||
new_headers: Dict[int, str] = {}
|
||||
|
||||
# 1. 检查是否有表头结束标记
|
||||
for config in self.header_hook_configs:
|
||||
if config.priority in self.active_headers and config.end_pattern.search(
|
||||
split
|
||||
):
|
||||
self.ended_headers.add(config.priority)
|
||||
del self.active_headers[config.priority]
|
||||
|
||||
# 2. 检查是否有新的表头开始标记(只处理未活跃且未结束的)
|
||||
for config in self.header_hook_configs:
|
||||
if (
|
||||
config.priority not in self.active_headers
|
||||
and config.priority not in self.ended_headers
|
||||
):
|
||||
match = config.start_pattern.search(split)
|
||||
if match:
|
||||
header = config.extract_header_fn(match)
|
||||
self.active_headers[config.priority] = header
|
||||
new_headers[config.priority] = header
|
||||
|
||||
# 3. 检查是否所有活跃表头都已结束(清空结束标记)
|
||||
if not self.active_headers:
|
||||
self.ended_headers.clear()
|
||||
|
||||
return new_headers
|
||||
|
||||
def get_headers(self) -> str:
|
||||
"""获取当前所有活跃表头的拼接文本(按优先级排序)"""
|
||||
# 按优先级降序排列表头
|
||||
sorted_headers = sorted(self.active_headers.items(), key=lambda x: -x[0])
|
||||
return (
|
||||
"\n".join([header for _, header in sorted_headers])
|
||||
if sorted_headers
|
||||
else ""
|
||||
)
|
||||
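For illustration, how the splitter drives this tracker while walking over splits (a minimal sketch; the table rows are made up):

from docreader.splitter.header_hook import HeaderTracker

tracker = HeaderTracker()
splits = [
    "| 姓名 | 年龄 |\n|------|------|\n",  # header row plus separator row opens a table
    "| 张三 | 25 |\n",                      # body rows keep the header active
    "\n",                                    # a blank split ends the table
]
for s in splits:
    tracker.update(s)
    print(repr(tracker.get_headers()))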
313
docreader/splitter/splitter.py
Normal file
@@ -0,0 +1,313 @@
|
||||
"""Token splitter."""
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
import re
|
||||
from typing import Callable, Generic, List, Pattern, Tuple, TypeVar
|
||||
|
||||
from pydantic import BaseModel, Field, PrivateAttr
|
||||
|
||||
from docreader.splitter.header_hook import (
|
||||
HeaderTracker,
|
||||
)
|
||||
from docreader.utils.split import split_by_char, split_by_sep
|
||||
|
||||
DEFAULT_CHUNK_OVERLAP = 100
|
||||
DEFAULT_CHUNK_SIZE = 512
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextSplitter(BaseModel, Generic[T]):
|
||||
chunk_size: int = Field(description="The token chunk size for each chunk.")
|
||||
chunk_overlap: int = Field(
|
||||
description="The token overlap of each chunk when splitting."
|
||||
)
|
||||
separators: List[str] = Field(
|
||||
description="Default separators for splitting into words"
|
||||
)
|
||||
|
||||
# Try to keep the matched characters as a whole.
|
||||
# If it's too long, the content will be further segmented.
|
||||
protected_regex: List[str] = Field(
|
||||
description="Protected regex for splitting into words"
|
||||
)
|
||||
len_function: Callable[[str], int] = Field(description="The length function.")
|
||||
# Header tracking Hook related attributes
|
||||
header_hook: HeaderTracker = Field(default_factory=HeaderTracker, exclude=True)
|
||||
|
||||
_protected_fns: List[Pattern] = PrivateAttr()
|
||||
_split_fns: List[Callable] = PrivateAttr()
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
chunk_size: int = DEFAULT_CHUNK_SIZE,
|
||||
chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
|
||||
separators: List[str] = ["\n", "。", " "],
|
||||
protected_regex: List[str] = [
|
||||
# math formula
|
||||
r"\$\$[\s\S]*?\$\$",
|
||||
# image
|
||||
r"!\[.*?\]\(.*?\)",
|
||||
# link
|
||||
r"\[.*?\]\(.*?\)",
|
||||
# table header
|
||||
r"(?:\|[^|\n]*)+\|[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|[\r\n]+",
|
||||
# table body
|
||||
r"(?:\|[^|\n]*)+\|[\r\n]+",
|
||||
# code header
|
||||
r"```(?:\w+)[\r\n]+[^\r\n]*",
|
||||
],
|
||||
length_function: Callable[[str], int] = lambda x: len(x),
|
||||
):
|
||||
"""Initialize with parameters."""
|
||||
if chunk_overlap > chunk_size:
|
||||
raise ValueError(
|
||||
f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
|
||||
f"({chunk_size}), should be smaller."
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
separators=separators,
|
||||
protected_regex=protected_regex,
|
||||
len_function=length_function,
|
||||
)
|
||||
self._protected_fns = [re.compile(reg) for reg in protected_regex]
|
||||
self._split_fns = [split_by_sep(sep) for sep in separators] + [split_by_char()]
|
||||
|
||||
def split_text(self, text: str) -> List[Tuple[int, int, str]]:
|
||||
"""Split text into chunks."""
|
||||
if text == "":
|
||||
return []
|
||||
|
||||
splits = self._split(text)
|
||||
protect = self._split_protected(text)
|
||||
splits = self._join(splits, protect)
|
||||
|
||||
assert "".join(splits) == text
|
||||
|
||||
chunks = self._merge(splits)
|
||||
return chunks
|
||||
|
||||
def _split(self, text: str) -> List[str]:
|
||||
"""Break text into splits that are smaller than chunk size.
|
||||
|
||||
NOTE: the splits contain the separators.
|
||||
"""
|
||||
if self.len_function(text) <= self.chunk_size:
|
||||
return [text]
|
||||
|
||||
splits = []
|
||||
for split_fn in self._split_fns:
|
||||
splits = split_fn(text)
|
||||
if len(splits) > 1:
|
||||
break
|
||||
|
||||
new_splits = []
|
||||
for split in splits:
|
||||
split_len = self.len_function(split)
|
||||
if split_len <= self.chunk_size:
|
||||
new_splits.append(split)
|
||||
else:
|
||||
# recursively split
|
||||
new_splits.extend(self._split(split))
|
||||
return new_splits
|
||||
|
||||
def _merge(self, splits: List[str]) -> List[Tuple[int, int, str]]:
|
||||
"""Merge splits into chunks.
|
||||
|
||||
The high-level idea is to keep adding splits to a chunk until we
|
||||
exceed the chunk size, then we start a new chunk with overlap.
|
||||
|
||||
When we start a new chunk, we pop off the first element of the previous
|
||||
chunk until the total length is less than the chunk size.
|
||||
"""
|
||||
chunks: List[Tuple[int, int, str]] = []
|
||||
|
||||
cur_chunk: List[Tuple[int, int, str]] = []
|
||||
|
||||
cur_headers, cur_len = "", 0
|
||||
cur_start, cur_end = 0, 0
|
||||
for split in splits:
|
||||
cur_end = cur_start + len(split)
|
||||
split_len = self.len_function(split)
|
||||
if split_len > self.chunk_size:
|
||||
logger.error(
|
||||
f"Got a split of size {split_len}, ",
|
||||
f"larger than chunk size {self.chunk_size}.",
|
||||
)
|
||||
|
||||
self.header_hook.update(split)
|
||||
cur_headers = self.header_hook.get_headers()
|
||||
cur_headers_len = self.len_function(cur_headers)
|
||||
|
||||
if cur_headers_len > self.chunk_size:
|
||||
logger.error(
|
||||
f"Got headers of size {cur_headers_len}, ",
|
||||
f"larger than chunk size {self.chunk_size}.",
|
||||
)
|
||||
cur_headers, cur_headers_len = "", 0
|
||||
|
||||
# if we exceed the chunk size after adding the new split, then
|
||||
# we need to end the current chunk and start a new one
|
||||
if cur_len + split_len + cur_headers_len > self.chunk_size:
|
||||
# end the previous chunk
|
||||
if len(cur_chunk) > 0:
|
||||
chunks.append(
|
||||
(
|
||||
cur_chunk[0][0],
|
||||
cur_chunk[-1][1],
|
||||
"".join([c[2] for c in cur_chunk]),
|
||||
)
|
||||
)
|
||||
|
||||
# start a new chunk with overlap
|
||||
# keep popping off the first element of the previous chunk until:
|
||||
# 1. the current chunk length is less than chunk overlap
|
||||
# 2. the total length is less than chunk size
|
||||
while cur_chunk and (
|
||||
cur_len > self.chunk_overlap
|
||||
or cur_len + split_len + cur_headers_len > self.chunk_size
|
||||
):
|
||||
# pop off the first element
|
||||
first_chunk = cur_chunk.pop(0)
|
||||
cur_len -= self.len_function(first_chunk[2])
|
||||
|
||||
if (
|
||||
cur_headers
|
||||
and split_len + cur_headers_len < self.chunk_size
|
||||
and cur_headers not in split
|
||||
):
|
||||
cur_chunk.insert(
|
||||
0,
|
||||
(
|
||||
cur_chunk[0][0] if cur_chunk else cur_start,
|
||||
cur_chunk[0][1] if cur_chunk else cur_end,
|
||||
cur_headers,
|
||||
),
|
||||
)
|
||||
cur_len += cur_headers_len
|
||||
|
||||
cur_chunk.append((cur_start, cur_end, split))
|
||||
cur_len += split_len
|
||||
cur_start = cur_end
|
||||
|
||||
# handle the last chunk
|
||||
assert cur_chunk
|
||||
if cur_headers and cur_len < self.chunk_size:
|
||||
cur_chunk.insert(0, (cur_chunk[0][0], cur_chunk[0][1], cur_headers))
|
||||
chunks.append(
|
||||
(
|
||||
cur_chunk[0][0],
|
||||
cur_chunk[-1][1],
|
||||
"".join([c[2] for c in cur_chunk]),
|
||||
)
|
||||
)
|
||||
|
||||
return chunks
|
||||
|
||||
def _split_protected(self, text: str) -> List[Tuple[int, str]]:
|
||||
matches = [
|
||||
(match.start(), match.end())
|
||||
for pattern in self._protected_fns
|
||||
for match in pattern.finditer(text)
|
||||
]
|
||||
matches.sort(key=lambda x: (x[0], -x[1]))
|
||||
|
||||
res = []
|
||||
|
||||
def fold(initial: int, current: Tuple[int, int]) -> int:
|
||||
if current[0] >= initial:
|
||||
if current[1] - current[0] < self.chunk_size:
|
||||
res.append((current[0], text[current[0] : current[1]]))
|
||||
else:
|
||||
logger.warning(f"Protected text ignore: {current}")
|
||||
return max(initial, current[1])
|
||||
|
||||
# filter overlapping matches
|
||||
list(itertools.accumulate(matches, fold, initial=-1))
|
||||
return res
|
||||
|
||||
def _join(self, splits: List[str], protect: List[Tuple[int, str]]) -> List[str]:
|
||||
"""
|
||||
Merges and splits elements in splits array based on protected substrings.
|
||||
|
||||
The function processes the input splits to ensure all protected substrings
|
||||
remain as single items. If a protected substring is concatenated with preceding
|
||||
or following content in any split element, it will be separated from
|
||||
the adjacent content. The final result maintains the original order of content
|
||||
while enforcing the integrity of protected substrings.
|
||||
|
||||
Key behaviors:
|
||||
1. Preserves the complete structure of each protected substring
|
||||
2. Separates protected substrings from any adjacent non-protected content
|
||||
3. Maintains the original sequence of all content except for necessary
|
||||
4. Handles cases where protected substrings are partially concatenated
|
||||
"""
|
||||
j = 0
|
||||
point, start = 0, 0
|
||||
res = []
|
||||
|
||||
for split in splits:
|
||||
end = start + len(split)
|
||||
|
||||
cur = split[point - start :]
|
||||
while j < len(protect):
|
||||
p_start, p_content = protect[j]
|
||||
p_end = p_start + len(p_content)
|
||||
|
||||
if end <= p_start:
|
||||
break
|
||||
|
||||
if point < p_start:
|
||||
local_end = p_start - point
|
||||
res.append(cur[:local_end])
|
||||
cur = cur[local_end:]
|
||||
point = p_start
|
||||
|
||||
res.append(p_content)
|
||||
j += 1
|
||||
|
||||
if point < p_end:
|
||||
local_start = p_end - point
|
||||
cur = cur[local_start:]
|
||||
point = p_end
|
||||
|
||||
if not cur:
|
||||
break
|
||||
|
||||
if cur:
|
||||
res.append(cur)
|
||||
point = end
|
||||
|
||||
start = end
|
||||
return res
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
s = """
|
||||
这是一些普通文本。
|
||||
|
||||
| 姓名 | 年龄 | 城市 |
|
||||
|------|------|------|
|
||||
| 张三 | 25 | 北京 |
|
||||
| 李四 | 30 | 上海 |
|
||||
| 王五 | 28 | 广州 |
|
||||
| 张三 | 25 | 北京 |
|
||||
| 李四 | 30 | 上海 |
|
||||
| 王五 | 28 | 广州 |
|
||||
|
||||
这是文本结束。
|
||||
|
||||
"""
|
||||
|
||||
sp = TextSplitter(chunk_size=200, chunk_overlap=2)
|
||||
ck = sp.split_text(s)
|
||||
for c in ck:
|
||||
print("------", len(c))
|
||||
print(c)
|
||||
pass
|
||||
103
docreader/utils/endecode.py
Normal file
@@ -0,0 +1,103 @@
import base64
import binascii
import io
import logging
from typing import List, Union

import numpy as np
from PIL import Image

logger = logging.getLogger(__name__)


def decode_image(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
    """Convert image to base64 encoded string

    Args:
        image: Image file path, bytes, PIL Image object, or numpy array

    Returns:
        Base64 encoded image string

    Raises:
        ValueError: If the image type is not supported
    """
    if isinstance(image, str):
        # It's a file path
        with open(image, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode()

    elif isinstance(image, bytes):
        # It's bytes data
        return base64.b64encode(image).decode()

    elif isinstance(image, Image.Image):
        # It's a PIL Image
        buffer = io.BytesIO()
        image.save(buffer, format=image.format)
        return base64.b64encode(buffer.getvalue()).decode()

    elif isinstance(image, np.ndarray):
        # It's a numpy array
        pil_image = Image.fromarray(image)
        buffer = io.BytesIO()
        pil_image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode()

    raise ValueError(f"Unsupported image type: {type(image)}")


def encode_image(image: str, errors="strict") -> bytes:
    """
    Decode a base64-encoded image string back into bytes.

    errors
        The error handling scheme to use for decoding errors.
        The default is 'strict', meaning that invalid base64 input raises
        binascii.Error. 'ignore' returns empty bytes instead.
    """
    try:
        image_bytes = base64.b64decode(image)
    except binascii.Error as e:
        if errors == "ignore":
            return b""
        else:
            raise e
    return image_bytes


def encode_bytes(content: str) -> bytes:
    return content.encode()


def decode_bytes(
    content: bytes,
    encodings: List[str] = [
        "utf-8",
        "gb18030",
        "gb2312",
        "gbk",
        "big5",
        "ascii",
        "latin-1",
    ],
) -> str:
    # Try decoding with each encoding format
    for encoding in encodings:
        try:
            text = content.decode(encoding)
            logger.debug(f"Decode content with {encoding}: {len(text)} characters")
            return text
        except UnicodeDecodeError:
            continue

    text = content.decode(encoding="latin-1", errors="replace")
    logger.warning(
        "Unable to determine correct encoding, using latin-1 as fallback. "
        "This may cause character issues."
    )
    return text


if __name__ == "__main__":
    img = "testtest"
    encode_image(img, errors="ignore")
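
decode_bytes simply walks the candidate encodings in order and returns the first successful decode, so text of mixed or unknown origin degrades gracefully. A short usage sketch (the sample bytes are illustrative):

from docreader.utils.endecode import decode_bytes

# GBK bytes fail the utf-8 attempt and are recovered by the gb18030 fallback
raw = "文档解析".encode("gbk")
assert decode_bytes(raw) == "文档解析"

# latin-1 accepts any byte sequence, so the loop always returns some string,
# possibly lossy for truly unknown encodings
print(decode_bytes(b"\xff\xfe\x00abc"))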
@@ -1,10 +1,10 @@
from contextvars import ContextVar
import logging
import uuid
import contextlib
import logging
import time
from typing import Optional
import uuid
from contextvars import ContextVar
from logging import LogRecord
from typing import Optional

# Configure logging
logger = logging.getLogger(__name__)
@@ -35,7 +35,7 @@ class MillisecondFormatter(logging.Formatter):
        # If the date format contains .%f, truncate the microseconds (6 digits) to milliseconds (3 digits)
        if datefmt and ".%f" in datefmt:
            # The formatted time string should end with 6 microsecond digits
            parts = result.split('.')
            parts = result.split(".")
            if len(parts) > 1 and len(parts[1]) >= 6:
                # Keep only the first 3 digits as milliseconds
                millis = parts[1][:3]
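
The hunk above only changes the quoting style, but for reference the surrounding logic reduces strftime's 6-digit %f field to milliseconds. A standalone sketch of that step (shown outside the formatter class; variable names follow the hunk):

from datetime import datetime

result = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")  # 6-digit microseconds
parts = result.split(".")
if len(parts) > 1 and len(parts[1]) >= 6:
    # keep only the first 3 digits as milliseconds
    result = parts[0] + "." + parts[1][:3]
print(result)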
34
docreader/utils/split.py
Normal file
@@ -0,0 +1,34 @@
import re
from typing import Callable, List


def split_text_keep_separator(text: str, separator: str) -> List[str]:
    """Split text by separator, keeping the separator attached to the start of each subsequent split."""
    parts = text.split(separator)
    result = [separator + s if i > 0 else s for i, s in enumerate(parts)]
    return [s for s in result if s]


def split_by_sep(sep: str, keep_sep: bool = True) -> Callable[[str], List[str]]:
    """Split text by separator."""
    if keep_sep:
        return lambda text: split_text_keep_separator(text, sep)
    else:
        return lambda text: text.split(sep)


def split_by_char() -> Callable[[str], List[str]]:
    """Split text by character."""
    return lambda text: list(text)


def split_by_regex(regex: str) -> Callable[[str], List[str]]:
    """Split text by regex."""
    pattern = re.compile(f"({regex})")
    return lambda text: list(filter(None, pattern.split(text)))


def match_by_regex(regex: str) -> Callable[[str], bool]:
    """Match text against a regex."""
    pattern = re.compile(regex)
    return lambda text: bool(pattern.match(text))
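
These helpers return callables, so different split strategies can be composed and passed around uniformly. A small usage sketch (the sample strings are illustrative):

from docreader.utils.split import match_by_regex, split_by_regex, split_by_sep

split_para = split_by_sep("\n\n")        # separator stays attached to the following part
split_sent = split_by_regex(r"[,。!?]")  # the punctuation survives as its own item
is_header = match_by_regex(r"^#{1,6} ")

print(split_para("a\n\nb"))       # ['a', '\n\nb']
print(split_sent("你好,世界。"))   # ['你好', ',', '世界', '。']
print(is_header("## 标题"))        # True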
77
docreader/utils/tempfile.py
Normal file
@@ -0,0 +1,77 @@
import logging
import os
import tempfile

logger = logging.getLogger(__name__)


class TempFileContext:
    def __init__(self, file_content: bytes, suffix: str):
        """
        Initialize the context
        :param file_content: Byte data to write to file
        :param suffix: File suffix
        """
        self.file_content = file_content
        self.suffix = suffix
        self.temp_file = None

    def __enter__(self):
        """
        Create file when entering context
        """
        self.temp_file = tempfile.NamedTemporaryFile(suffix=self.suffix, delete=False)
        self.temp_file.write(self.file_content)
        self.temp_file.flush()
        logger.info(
            f"Saved {self.suffix} content to temporary file: {self.temp_file.name}"
        )
        return self.temp_file.name

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Delete file when exiting context
        """
        if self.temp_file:
            self.temp_file.close()
            if os.path.exists(self.temp_file.name):
                os.remove(self.temp_file.name)
                logger.info(f"File {self.temp_file.name} has been deleted.")
        # Return False to propagate exception (if any exception occurred)
        return False


class TempDirContext:
    def __init__(self):
        """
        Initialize the context
        """
        self.temp_dir = None

    def __enter__(self):
        """
        Create directory when entering context
        """
        self.temp_dir = tempfile.TemporaryDirectory()
        logger.info(f"Created temporary directory: {self.temp_dir.name}")
        return self.temp_dir.name

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Delete directory when exiting context
        """
        if self.temp_dir and os.path.exists(self.temp_dir.name):
            self.temp_dir.cleanup()
            logger.info(f"Directory {self.temp_dir.name} has been deleted.")
        # Return False to propagate exception (if any exception occurred)
        return False


if __name__ == "__main__":
    example_bytes = b"Hello, this is a test file."
    file_name = "test_file.txt"

    # Using with statement
    with TempFileContext(example_bytes, file_name) as temp_file:
        # File operations can be performed within the context
        print(f"Does file {temp_file} exist: {os.path.exists(temp_file)}")
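
TempDirContext follows the same pattern for directories, and both contexts clean up on exit even when the body raises. A minimal usage sketch (the byte payload and suffix are illustrative):

import os

from docreader.utils.tempfile import TempDirContext, TempFileContext

with TempDirContext() as tmp_dir:
    print(os.path.isdir(tmp_dir))  # True while inside the block

with TempFileContext(b"%PDF-1.4 ...", suffix=".pdf") as tmp_path:
    print(tmp_path.endswith(".pdf"))  # True
# the temporary file and directory are removed once the blocks exit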
438
docreader/uv.lock
generated
@@ -6,17 +6,22 @@ resolution-markers = [
|
||||
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
|
||||
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
|
||||
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
|
||||
"python_full_version >= '3.14' and sys_platform == 'win32'",
|
||||
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
"python_full_version == '3.13.*' and sys_platform == 'win32'",
|
||||
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
|
||||
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
|
||||
"python_full_version == '3.12.*' and sys_platform == 'win32'",
|
||||
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
|
||||
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
|
||||
"python_full_version == '3.11.*' and sys_platform == 'win32'",
|
||||
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
"python_full_version < '3.11' and sys_platform == 'darwin'",
|
||||
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
|
||||
"python_full_version < '3.11' and sys_platform == 'win32'",
|
||||
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -423,6 +428,15 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/db/d3/9dcc0f5797f070ec8edf30fbadfb200e71d9db6b84d211e3b2085a7589a0/click-8.3.0-py3-none-any.whl", hash = "sha256:9b9f285302c6e3064f4330c05f05b81945b2a39544279343e6e7c5f27a9baddc", size = 107295, upload-time = "2025-09-18T17:32:22.42Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cobble"
|
||||
version = "0.1.4"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/54/7a/a507c709be2c96e1bb6102eb7b7f4026c5e5e223ef7d745a17d239e9d844/cobble-0.1.4.tar.gz", hash = "sha256:de38be1539992c8a06e569630717c485a5f91be2192c461ea2b220607dfa78aa", size = 3805, upload-time = "2024-06-01T18:11:09.528Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/d5/e1/3714a2f371985215c219c2a70953d38e3eed81ef165aed061d21de0e998b/cobble-0.1.4-py3-none-any.whl", hash = "sha256:36c91b1655e599fd428e2b95fdd5f0da1ca2e9f1abb0bc871dec21a0e78a2b44", size = 3984, upload-time = "2024-06-01T18:11:07.911Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "colorama"
|
||||
version = "0.4.6"
|
||||
@@ -432,6 +446,18 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "coloredlogs"
|
||||
version = "15.0.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "humanfriendly" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/cc/c7/eed8f27100517e8c0e6b923d5f0845d0cb99763da6fdee00478f91db7325/coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0", size = 278520, upload-time = "2021-06-11T10:22:45.202Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018, upload-time = "2021-06-11T10:22:42.561Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cos-python-sdk-v5"
|
||||
version = "1.9.38"
|
||||
@@ -587,6 +613,15 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/18/d5/7a04640bf559bb890455ffb28978daf7d44f667c3f04a4d422c655c1ba92/cython-3.1.6-py3-none-any.whl", hash = "sha256:91dcf7eb9b6a089ce4e9e1140e571d84c3bca834afb77ec269be7aa9d31a8157", size = 1223550, upload-time = "2025-10-23T12:38:16.732Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "defusedxml"
|
||||
version = "0.7.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/0f/d5/c66da9b79e5bdb124974bfe172b4daf3c984ebd9c2a06e2b8a4dc7331c72/defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69", size = 75520, upload-time = "2021-03-08T10:59:26.269Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "distro"
|
||||
version = "1.9.0"
|
||||
@@ -612,6 +647,7 @@ dependencies = [
|
||||
{ name = "lxml" },
|
||||
{ name = "markdown" },
|
||||
{ name = "markdownify" },
|
||||
{ name = "markitdown", extra = ["docx", "pdf", "xls", "xlsx"] },
|
||||
{ name = "minio" },
|
||||
{ name = "mistletoe" },
|
||||
{ name = "ollama" },
|
||||
@@ -622,6 +658,7 @@ dependencies = [
|
||||
{ name = "pillow" },
|
||||
{ name = "playwright" },
|
||||
{ name = "protobuf" },
|
||||
{ name = "pydantic" },
|
||||
{ name = "pypdf" },
|
||||
{ name = "pypdf2" },
|
||||
{ name = "python-docx" },
|
||||
@@ -643,6 +680,7 @@ requires-dist = [
|
||||
{ name = "lxml", specifier = ">=6.0.2" },
|
||||
{ name = "markdown", specifier = ">=3.10" },
|
||||
{ name = "markdownify", specifier = ">=1.2.0" },
|
||||
{ name = "markitdown", extras = ["docx", "pdf", "xls", "xlsx"], specifier = ">=0.1.3" },
|
||||
{ name = "minio", specifier = ">=7.2.18" },
|
||||
{ name = "mistletoe", specifier = ">=1.5.0" },
|
||||
{ name = "ollama", specifier = ">=0.6.0" },
|
||||
@@ -653,6 +691,7 @@ requires-dist = [
|
||||
{ name = "pillow", specifier = ">=12.0.0" },
|
||||
{ name = "playwright", specifier = ">=1.55.0" },
|
||||
{ name = "protobuf", specifier = ">=6.33.0" },
|
||||
{ name = "pydantic", specifier = ">=2.12.3" },
|
||||
{ name = "pypdf", specifier = ">=6.1.3" },
|
||||
{ name = "pypdf2", specifier = ">=3.0.1" },
|
||||
{ name = "python-docx", specifier = ">=1.2.0" },
|
||||
@@ -683,6 +722,15 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/bf/ee/aa015c5de8b0dc42a8e507eae8c2de5d1c0e068c896858fec6d502402ed6/ebooklib-0.20-py3-none-any.whl", hash = "sha256:fff5322517a37e31c972d27be7d982cc3928c16b3dcc5fd7e8f7c0f5d7bcf42b", size = 40995, upload-time = "2025-10-26T20:56:19.104Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "et-xmlfile"
|
||||
version = "2.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "exceptiongroup"
|
||||
version = "1.3.0"
|
||||
@@ -707,6 +755,15 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/e5/4c/93d0f85318da65923e4b91c1c2ff03d8a458cbefebe3bc612a6693c7906d/fire-0.7.1-py3-none-any.whl", hash = "sha256:e43fd8a5033a9001e7e2973bab96070694b9f12f2e0ecf96d4683971b5ab1882", size = 115945, upload-time = "2025-08-16T20:20:22.87Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "flatbuffers"
|
||||
version = "25.9.23"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/9d/1f/3ee70b0a55137442038f2a33469cc5fddd7e0ad2abf83d7497c18a2b6923/flatbuffers-25.9.23.tar.gz", hash = "sha256:676f9fa62750bb50cf531b42a0a2a118ad8f7f797a511eda12881c016f093b12", size = 22067, upload-time = "2025-09-24T05:25:30.106Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/ee/1b/00a78aa2e8fbd63f9af08c9c19e6deb3d5d66b4dda677a0f61654680ee89/flatbuffers-25.9.23-py2.py3-none-any.whl", hash = "sha256:255538574d6cb6d0a79a17ec8bc0d30985913b87513a01cce8bcdb6b4c44d0e2", size = 30869, upload-time = "2025-09-24T05:25:28.912Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fonttools"
|
||||
version = "4.60.1"
|
||||
@@ -850,6 +907,8 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/7f/91/ae2eb6b7979e2f9b035a9f612cf70f1bf54aad4e1d125129bef1eae96f19/greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d", size = 584358, upload-time = "2025-08-07T13:18:23.708Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f7/85/433de0c9c0252b22b16d413c9407e6cb3b41df7389afc366ca204dbc1393/greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5", size = 1113550, upload-time = "2025-08-07T13:42:37.467Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a1/8d/88f3ebd2bc96bf7747093696f4335a0a8a4c5acfcf1b757717c0d2474ba3/greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f", size = 1137126, upload-time = "2025-08-07T13:18:20.239Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f1/29/74242b7d72385e29bcc5563fba67dad94943d7cd03552bac320d597f29b2/greenlet-3.2.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f47617f698838ba98f4ff4189aef02e7343952df3a615f847bb575c3feb177a7", size = 1544904, upload-time = "2025-11-04T12:42:04.763Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c8/e2/1572b8eeab0f77df5f6729d6ab6b141e4a84ee8eb9bc8c1e7918f94eda6d/greenlet-3.2.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af41be48a4f60429d5cad9d22175217805098a9ef7c40bfef44f7669fb9d74d8", size = 1611228, upload-time = "2025-11-04T12:42:08.423Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d6/6f/b60b0291d9623c496638c582297ead61f43c4b72eef5e9c926ef4565ec13/greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c", size = 298654, upload-time = "2025-08-07T13:50:00.469Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a4/de/f28ced0a67749cac23fecb02b694f6473f47686dff6afaa211d186e2ef9c/greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2", size = 272305, upload-time = "2025-08-07T13:15:41.288Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/09/16/2c3792cba130000bf2a31c5272999113f4764fd9d874fb257ff588ac779a/greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246", size = 632472, upload-time = "2025-08-07T13:42:55.044Z" },
|
||||
@@ -859,6 +918,8 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684, upload-time = "2025-08-07T13:18:25.164Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647, upload-time = "2025-08-07T13:42:38.655Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3f/cc/b07000438a29ac5cfb2194bfc128151d52f333cee74dd7dfe3fb733fc16c/greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa", size = 1142073, upload-time = "2025-08-07T13:18:21.737Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/67/24/28a5b2fa42d12b3d7e5614145f0bd89714c34c08be6aabe39c14dd52db34/greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c", size = 1548385, upload-time = "2025-11-04T12:42:11.067Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6a/05/03f2f0bdd0b0ff9a4f7b99333d57b53a7709c27723ec8123056b084e69cd/greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5", size = 1613329, upload-time = "2025-11-04T12:42:12.928Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d8/0f/30aef242fcab550b0b3520b8e3561156857c94288f0332a79928c31a52cf/greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9", size = 299100, upload-time = "2025-08-07T13:44:12.287Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" },
|
||||
@@ -868,6 +929,8 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" },
|
||||
@@ -877,6 +940,8 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" },
|
||||
@@ -884,6 +949,8 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760, upload-time = "2025-11-04T12:42:25.341Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" },
|
||||
]
|
||||
|
||||
@@ -1061,6 +1128,18 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "humanfriendly"
|
||||
version = "10.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "pyreadline3", marker = "sys_platform == 'win32'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/cc/3f/2c29224acb2e2df4d2046e4c73ee2662023c58ff5b113c4c1adac0886c43/humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc", size = 360702, upload-time = "2021-09-17T21:40:43.31Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", size = 86794, upload-time = "2021-09-17T21:40:39.897Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "3.11"
|
||||
@@ -1386,6 +1465,38 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/6c/77/d7f491cbc05303ac6801651aabeb262d43f319288c1ea96c66b1d2692ff3/lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e", size = 3518768, upload-time = "2025-09-22T04:04:57.097Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "magika"
|
||||
version = "0.6.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "click" },
|
||||
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
|
||||
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
|
||||
{ name = "onnxruntime", version = "1.20.1", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
|
||||
{ name = "onnxruntime", version = "1.23.2", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
|
||||
{ name = "python-dotenv" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/a3/f3/3d1dcdd7b9c41d589f5cff252d32ed91cdf86ba84391cfc81d9d8773571d/magika-0.6.3.tar.gz", hash = "sha256:7cc52aa7359af861957043e2bf7265ed4741067251c104532765cd668c0c0cb1", size = 3042784, upload-time = "2025-10-30T15:22:34.499Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/a2/e4/35c323beb3280482c94299d61626116856ac2d4ec16ecef50afc4fdd4291/magika-0.6.3-py3-none-any.whl", hash = "sha256:eda443d08006ee495e02083b32e51b98cb3696ab595a7d13900d8e2ef506ec9d", size = 2969474, upload-time = "2025-10-30T15:22:25.298Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/25/8f/132b0d7cd51c02c39fd52658a5896276c30c8cc2fd453270b19db8c40f7e/magika-0.6.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:86901e64b05dde5faff408c9b8245495b2e1fd4c226e3393d3d2a3fee65c504b", size = 13358841, upload-time = "2025-10-30T15:22:27.413Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c4/03/5ed859be502903a68b7b393b17ae0283bf34195cfcca79ce2dc25b9290e7/magika-0.6.3-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:3d9661eedbdf445ac9567e97e7ceefb93545d77a6a32858139ea966b5806fb64", size = 15367335, upload-time = "2025-10-30T15:22:29.907Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7b/9e/f8ee7d644affa3b80efdd623a3d75865c8f058f3950cb87fb0c48e3559bc/magika-0.6.3-py3-none-win_amd64.whl", hash = "sha256:e57f75674447b20cab4db928ae58ab264d7d8582b55183a0b876711c2b2787f3", size = 12692831, upload-time = "2025-10-30T15:22:32.063Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mammoth"
|
||||
version = "1.11.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "cobble" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/ed/3c/a58418d2af00f2da60d4a51e18cd0311307b72d48d2fffec36a97b4a5e44/mammoth-1.11.0.tar.gz", hash = "sha256:a0f59e442f34d5b6447f4b0999306cbf3e67aaabfa8cb516f878fb1456744637", size = 53142, upload-time = "2025-09-19T10:35:20.373Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/ca/54/2e39566a131b13f6d8d193f974cb6a34e81bb7cc2fa6f7e03de067b36588/mammoth-1.11.0-py2.py3-none-any.whl", hash = "sha256:c077ab0d450bd7c0c6ecd529a23bf7e0fa8190c929e28998308ff4eada3f063b", size = 54752, upload-time = "2025-09-19T10:35:18.699Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "markdown"
|
||||
version = "3.10"
|
||||
@@ -1408,6 +1519,41 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/6a/e2/7af643acb4cae0741dffffaa7f3f7c9e7ab4046724543ba1777c401d821c/markdownify-1.2.0-py3-none-any.whl", hash = "sha256:48e150a1c4993d4d50f282f725c0111bd9eb25645d41fa2f543708fd44161351", size = 15561, upload-time = "2025-08-09T17:44:14.074Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "markitdown"
|
||||
version = "0.1.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "beautifulsoup4" },
|
||||
{ name = "charset-normalizer" },
|
||||
{ name = "defusedxml" },
|
||||
{ name = "magika" },
|
||||
{ name = "markdownify" },
|
||||
{ name = "onnxruntime", version = "1.20.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'win32'" },
|
||||
{ name = "requests" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/87/31/90cef2bc8ecd85c200ed3b3d1e20fc7a724213502685c4b05b5431e02668/markitdown-0.1.3.tar.gz", hash = "sha256:b0d9127c3373a68274dede6af6c9bb0684b78ce364c727c4c304da97a20d6fd9", size = 40039, upload-time = "2025-08-26T22:37:04.4Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/97/83/7b47d2ecbf58650a03aeeb21ba2d59175f202bf4fb81d44f40f1deb82bc0/markitdown-0.1.3-py3-none-any.whl", hash = "sha256:08d9a25770979d78f60dcc0afcb868de6799608e4db65342b2e03304fb091251", size = 58391, upload-time = "2025-08-26T22:37:02.924Z" },
|
||||
]
|
||||
|
||||
[package.optional-dependencies]
|
||||
docx = [
|
||||
{ name = "lxml" },
|
||||
{ name = "mammoth" },
|
||||
]
|
||||
pdf = [
|
||||
{ name = "pdfminer-six" },
|
||||
]
|
||||
xls = [
|
||||
{ name = "pandas" },
|
||||
{ name = "xlrd" },
|
||||
]
|
||||
xlsx = [
|
||||
{ name = "openpyxl" },
|
||||
{ name = "pandas" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "minio"
|
||||
version = "7.2.18"
|
||||
@@ -1433,6 +1579,15 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/3f/c2/bd0fc48dc323035cd65d21c45a3bb5c7b054001bf4ea75e461beb7656e07/mistletoe-1.5.0-py3-none-any.whl", hash = "sha256:d4e77b991b998c5efe3c4eab4e1b472263b5349688acd50d79bd9a6c317a9df1", size = 55262, upload-time = "2025-10-18T16:37:08.376Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mpmath"
|
||||
version = "1.3.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "networkx"
|
||||
version = "3.4.2"
|
||||
@@ -1440,7 +1595,8 @@ source = { registry = "https://pypi.org/simple" }
|
||||
resolution-markers = [
|
||||
"python_full_version < '3.11' and sys_platform == 'darwin'",
|
||||
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
|
||||
"python_full_version < '3.11' and sys_platform == 'win32'",
|
||||
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" }
|
||||
wheels = [
|
||||
@@ -1456,14 +1612,18 @@ resolution-markers = [
|
||||
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
|
||||
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
|
||||
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
|
||||
"python_full_version >= '3.14' and sys_platform == 'win32'",
|
||||
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
"python_full_version == '3.13.*' and sys_platform == 'win32'",
|
||||
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
|
||||
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
|
||||
"python_full_version == '3.12.*' and sys_platform == 'win32'",
|
||||
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
|
||||
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
|
||||
"python_full_version == '3.11.*' and sys_platform == 'win32'",
|
||||
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" }
|
||||
wheels = [
|
||||
@@ -1492,7 +1652,8 @@ source = { registry = "https://pypi.org/simple" }
|
||||
resolution-markers = [
|
||||
"python_full_version < '3.11' and sys_platform == 'darwin'",
|
||||
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
|
||||
"python_full_version < '3.11' and sys_platform == 'win32'",
|
||||
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" }
|
||||
wheels = [
|
||||
@@ -1561,14 +1722,18 @@ resolution-markers = [
|
||||
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
|
||||
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
|
||||
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
|
||||
"python_full_version >= '3.14' and sys_platform == 'win32'",
|
||||
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
"python_full_version == '3.13.*' and sys_platform == 'win32'",
|
||||
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
|
||||
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
|
||||
"python_full_version == '3.12.*' and sys_platform == 'win32'",
|
||||
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
|
||||
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
|
||||
"python_full_version == '3.11.*' and sys_platform == 'win32'",
|
||||
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/b5/f4/098d2270d52b41f1bd7db9fc288aaa0400cb48c2a3e2af6fa365d9720947/numpy-2.3.4.tar.gz", hash = "sha256:a7d018bfedb375a8d979ac758b120ba846a7fe764911a64465fd87b8729f4a6a", size = 20582187, upload-time = "2025-10-15T16:18:11.77Z" }
|
||||
wheels = [
|
||||
@@ -1660,6 +1825,97 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/b5/c1/edc9f41b425ca40b26b7c104c5f6841a4537bb2552bfa6ca66e81405bb95/ollama-0.6.0-py3-none-any.whl", hash = "sha256:534511b3ccea2dff419ae06c3b58d7f217c55be7897c8ce5868dfb6b219cf7a0", size = 14130, upload-time = "2025-09-24T22:46:01.19Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "onnxruntime"
|
||||
version = "1.20.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
resolution-markers = [
|
||||
"python_full_version >= '3.14' and sys_platform == 'win32'",
|
||||
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
"python_full_version == '3.13.*' and sys_platform == 'win32'",
|
||||
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
"python_full_version == '3.12.*' and sys_platform == 'win32'",
|
||||
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
"python_full_version == '3.11.*' and sys_platform == 'win32'",
|
||||
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
"python_full_version < '3.11' and sys_platform == 'win32'",
|
||||
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
|
||||
]
|
||||
dependencies = [
|
||||
{ name = "coloredlogs", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
|
||||
{ name = "flatbuffers", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
|
||||
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')" },
|
||||
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')" },
|
||||
{ name = "packaging", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
|
||||
{ name = "protobuf", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
|
||||
{ name = "sympy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" },
|
||||
]
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/6d/c6/c4c0860bee2fde6037bdd9dcd12d323f6e38cf00fcc9a5065b394337fc55/onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7b2908b50101a19e99c4d4e97ebb9905561daf61829403061c1adc1b588bc0de", size = 11954010, upload-time = "2024-11-21T00:48:35.254Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/63/47/3dc0b075ab539f16b3d8b09df6b504f51836086ee709690a6278d791737d/onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d82daaec24045a2e87598b8ac2b417b1cce623244e80e663882e9fe1aae86410", size = 13330452, upload-time = "2024-11-21T00:48:40.02Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/27/ef/80fab86289ecc01a734b7ddf115dfb93d8b2e004bd1e1977e12881c72b12/onnxruntime-1.20.1-cp310-cp310-win32.whl", hash = "sha256:4c4b251a725a3b8cf2aab284f7d940c26094ecd9d442f07dd81ab5470e99b83f", size = 9813849, upload-time = "2024-11-21T00:48:43.569Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a9/e6/33ab10066c9875a29d55e66ae97c3bf91b9b9b987179455d67c32261a49c/onnxruntime-1.20.1-cp310-cp310-win_amd64.whl", hash = "sha256:d3b616bb53a77a9463707bb313637223380fc327f5064c9a782e8ec69c22e6a2", size = 11329702, upload-time = "2024-11-21T00:48:46.599Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a5/da/c44bf9bd66cd6d9018a921f053f28d819445c4d84b4dd4777271b0fe52a2/onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f6243e34d74423bdd1edf0ae9596dd61023b260f546ee17d701723915f06a9f7", size = 11955227, upload-time = "2024-11-21T00:48:54.556Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/11/ac/4120dfb74c8e45cce1c664fc7f7ce010edd587ba67ac41489f7432eb9381/onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5eec64c0269dcdb8d9a9a53dc4d64f87b9e0c19801d9321246a53b7eb5a7d1bc", size = 13331703, upload-time = "2024-11-21T00:48:57.97Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/12/f1/cefacac137f7bb7bfba57c50c478150fcd3c54aca72762ac2c05ce0532c1/onnxruntime-1.20.1-cp311-cp311-win32.whl", hash = "sha256:a19bc6e8c70e2485a1725b3d517a2319603acc14c1f1a017dda0afe6d4665b41", size = 9813977, upload-time = "2024-11-21T00:49:00.519Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2c/2d/2d4d202c0bcfb3a4cc2b171abb9328672d7f91d7af9ea52572722c6d8d96/onnxruntime-1.20.1-cp311-cp311-win_amd64.whl", hash = "sha256:8508887eb1c5f9537a4071768723ec7c30c28eb2518a00d0adcd32c89dea3221", size = 11329895, upload-time = "2024-11-21T00:49:03.845Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c5/9d/a42a84e10f1744dd27c6f2f9280cc3fb98f869dd19b7cd042e391ee2ab61/onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1f56e898815963d6dc4ee1c35fc6c36506466eff6d16f3cb9848cea4e8c8172", size = 11952833, upload-time = "2024-11-21T00:49:10.563Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/47/42/2f71f5680834688a9c81becbe5c5bb996fd33eaed5c66ae0606c3b1d6a02/onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bb71a814f66517a65628c9e4a2bb530a6edd2cd5d87ffa0af0f6f773a027d99e", size = 13333903, upload-time = "2024-11-21T00:49:12.984Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c8/f1/aabfdf91d013320aa2fc46cf43c88ca0182860ff15df872b4552254a9680/onnxruntime-1.20.1-cp312-cp312-win32.whl", hash = "sha256:bd386cc9ee5f686ee8a75ba74037750aca55183085bf1941da8efcfe12d5b120", size = 9814562, upload-time = "2024-11-21T00:49:15.453Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/dd/80/76979e0b744307d488c79e41051117634b956612cc731f1028eb17ee7294/onnxruntime-1.20.1-cp312-cp312-win_amd64.whl", hash = "sha256:19c2d843eb074f385e8bbb753a40df780511061a63f9def1b216bf53860223fb", size = 11331482, upload-time = "2024-11-21T00:49:19.412Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/81/0d/13bbd9489be2a6944f4a940084bfe388f1100472f38c07080a46fbd4ab96/onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fb44b08e017a648924dbe91b82d89b0c105b1adcfe31e90d1dc06b8677ad37be", size = 11951459, upload-time = "2024-11-21T00:49:26.269Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c0/ea/4454ae122874fd52bbb8a961262de81c5f932edeb1b72217f594c700d6ef/onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bda6aebdf7917c1d811f21d41633df00c58aff2bef2f598f69289c1f1dabc4b3", size = 13331620, upload-time = "2024-11-21T00:49:28.875Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d8/e0/50db43188ca1c945decaa8fc2a024c33446d31afed40149897d4f9de505f/onnxruntime-1.20.1-cp313-cp313-win_amd64.whl", hash = "sha256:d30367df7e70f1d9fc5a6a68106f5961686d39b54d3221f760085524e8d38e16", size = 11331758, upload-time = "2024-11-21T00:49:31.417Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d8/55/3821c5fd60b52a6c82a00bba18531793c93c4addfe64fbf061e235c5617a/onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9158465745423b2b5d97ed25aa7740c7d38d2993ee2e5c3bfacb0c4145c49d8", size = 11950342, upload-time = "2024-11-21T00:49:34.164Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/14/56/fd990ca222cef4f9f4a9400567b9a15b220dee2eafffb16b2adbc55c8281/onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0df6f2df83d61f46e842dbcde610ede27218947c33e994545a22333491e72a3b", size = 13337040, upload-time = "2024-11-21T00:49:37.271Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "onnxruntime"
|
||||
version = "1.23.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
resolution-markers = [
|
||||
"python_full_version >= '3.14' and sys_platform == 'darwin'",
|
||||
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
|
||||
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
|
||||
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
|
||||
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
"python_full_version < '3.11' and sys_platform == 'darwin'",
|
||||
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
|
||||
]
|
||||
dependencies = [
|
||||
{ name = "coloredlogs", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "flatbuffers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform == 'darwin')" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.11' and sys_platform == 'darwin')" },
{ name = "packaging", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "protobuf", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "sympy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/35/d6/311b1afea060015b56c742f3531168c1644650767f27ef40062569960587/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:a7730122afe186a784660f6ec5807138bf9d792fa1df76556b27307ea9ebcbe3", size = 17195934, upload-time = "2025-10-27T23:06:14.143Z" },
{ url = "https://files.pythonhosted.org/packages/db/db/81bf3d7cecfbfed9092b6b4052e857a769d62ed90561b410014e0aae18db/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:b28740f4ecef1738ea8f807461dd541b8287d5650b5be33bca7b474e3cbd1f36", size = 19153079, upload-time = "2025-10-27T23:05:57.686Z" },
{ url = "https://files.pythonhosted.org/packages/2e/4d/a382452b17cf70a2313153c520ea4c96ab670c996cb3a95cc5d5ac7bfdac/onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8f7d1fe034090a1e371b7f3ca9d3ccae2fabae8c1d8844fb7371d1ea38e8e8d2", size = 15219883, upload-time = "2025-10-22T03:46:21.66Z" },
{ url = "https://files.pythonhosted.org/packages/fb/56/179bf90679984c85b417664c26aae4f427cba7514bd2d65c43b181b7b08b/onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ca88747e708e5c67337b0f65eed4b7d0dd70d22ac332038c9fc4635760018f7", size = 17370357, upload-time = "2025-10-22T03:46:57.968Z" },
{ url = "https://files.pythonhosted.org/packages/44/be/467b00f09061572f022ffd17e49e49e5a7a789056bad95b54dfd3bee73ff/onnxruntime-1.23.2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:6f91d2c9b0965e86827a5ba01531d5b669770b01775b23199565d6c1f136616c", size = 17196113, upload-time = "2025-10-22T03:47:33.526Z" },
{ url = "https://files.pythonhosted.org/packages/9f/a8/3c23a8f75f93122d2b3410bfb74d06d0f8da4ac663185f91866b03f7da1b/onnxruntime-1.23.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:87d8b6eaf0fbeb6835a60a4265fde7a3b60157cf1b2764773ac47237b4d48612", size = 19153857, upload-time = "2025-10-22T03:46:37.578Z" },
{ url = "https://files.pythonhosted.org/packages/3f/d8/506eed9af03d86f8db4880a4c47cd0dffee973ef7e4f4cff9f1d4bcf7d22/onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bbfd2fca76c855317568c1b36a885ddea2272c13cb0e395002c402f2360429a6", size = 15220095, upload-time = "2025-10-22T03:46:24.769Z" },
{ url = "https://files.pythonhosted.org/packages/e9/80/113381ba832d5e777accedc6cb41d10f9eca82321ae31ebb6bcede530cea/onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da44b99206e77734c5819aa2142c69e64f3b46edc3bd314f6a45a932defc0b3e", size = 17372080, upload-time = "2025-10-22T03:47:00.265Z" },
{ url = "https://files.pythonhosted.org/packages/1b/9e/f748cd64161213adeef83d0cb16cb8ace1e62fa501033acdd9f9341fff57/onnxruntime-1.23.2-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:b8f029a6b98d3cf5be564d52802bb50a8489ab73409fa9db0bf583eabb7c2321", size = 17195929, upload-time = "2025-10-22T03:47:36.24Z" },
{ url = "https://files.pythonhosted.org/packages/91/9d/a81aafd899b900101988ead7fb14974c8a58695338ab6a0f3d6b0100f30b/onnxruntime-1.23.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:218295a8acae83905f6f1aed8cacb8e3eb3bd7513a13fe4ba3b2664a19fc4a6b", size = 19157705, upload-time = "2025-10-22T03:46:40.415Z" },
{ url = "https://files.pythonhosted.org/packages/3c/35/4e40f2fba272a6698d62be2cd21ddc3675edfc1a4b9ddefcc4648f115315/onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76ff670550dc23e58ea9bc53b5149b99a44e63b34b524f7b8547469aaa0dcb8c", size = 15226915, upload-time = "2025-10-22T03:46:27.773Z" },
{ url = "https://files.pythonhosted.org/packages/ef/88/9cc25d2bafe6bc0d4d3c1db3ade98196d5b355c0b273e6a5dc09c5d5d0d5/onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f9b4ae77f8e3c9bee50c27bc1beede83f786fe1d52e99ac85aa8d65a01e9b77", size = 17382649, upload-time = "2025-10-22T03:47:02.782Z" },
{ url = "https://files.pythonhosted.org/packages/3d/41/fba0cabccecefe4a1b5fc8020c44febb334637f133acefc7ec492029dd2c/onnxruntime-1.23.2-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:2ff531ad8496281b4297f32b83b01cdd719617e2351ffe0dba5684fb283afa1f", size = 17196337, upload-time = "2025-10-22T03:46:35.168Z" },
{ url = "https://files.pythonhosted.org/packages/fe/f9/2d49ca491c6a986acce9f1d1d5fc2099108958cc1710c28e89a032c9cfe9/onnxruntime-1.23.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:162f4ca894ec3de1a6fd53589e511e06ecdc3ff646849b62a9da7489dee9ce95", size = 19157691, upload-time = "2025-10-22T03:46:43.518Z" },
{ url = "https://files.pythonhosted.org/packages/1c/a1/428ee29c6eaf09a6f6be56f836213f104618fb35ac6cc586ff0f477263eb/onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45d127d6e1e9b99d1ebeae9bcd8f98617a812f53f46699eafeb976275744826b", size = 15226898, upload-time = "2025-10-22T03:46:30.039Z" },
{ url = "https://files.pythonhosted.org/packages/f2/2b/b57c8a2466a3126dbe0a792f56ad7290949b02f47b86216cd47d857e4b77/onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8bace4e0d46480fbeeb7bbe1ffe1f080e6663a42d1086ff95c1551f2d39e7872", size = 17382518, upload-time = "2025-10-22T03:47:05.407Z" },
{ url = "https://files.pythonhosted.org/packages/7c/3d/6830fa61c69ca8e905f237001dbfc01689a4e4ab06147020a4518318881f/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9d2385e774f46ac38f02b3a91a91e30263d41b2f1f4f26ae34805b2a9ddef466", size = 15229610, upload-time = "2025-10-22T03:46:32.239Z" },
{ url = "https://files.pythonhosted.org/packages/b6/ca/862b1e7a639460f0ca25fd5b6135fb42cf9deea86d398a92e44dfda2279d/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e2b9233c4947907fd1818d0e581c049c41ccc39b2856cc942ff6d26317cee145", size = 17394184, upload-time = "2025-10-22T03:47:08.127Z" },
]

[[package]]
name = "openai"
version = "2.7.1"
@@ -1733,6 +1989,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/86/8a/69176a64335aed183529207ba8bc3d329c2999d852b4f3818027203f50e6/opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca", size = 39402386, upload-time = "2025-01-16T13:52:56.418Z" },
]

[[package]]
name = "openpyxl"
version = "3.1.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "et-xmlfile" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" },
]

[[package]]
name = "opt-einsum"
version = "3.3.0"
@@ -1821,6 +2089,68 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/a3/6f/042d73453ad01a2e3dd4adeae4870e25679d9eaa340d70bd54cd409cb982/paddlepaddle-3.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:d16fd322246c4a580ca512971840ac8c16126a77d59a7bceee165d8f2196a6b2", size = 101708504, upload-time = "2025-10-30T13:21:18.125Z" },
]

[[package]]
name = "pandas"
version = "2.3.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "python-dateutil" },
{ name = "pytz" },
{ name = "tzdata" },
]
sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/3d/f7/f425a00df4fcc22b292c6895c6831c0c8ae1d9fac1e024d16f98a9ce8749/pandas-2.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c", size = 11555763, upload-time = "2025-09-29T23:16:53.287Z" },
{ url = "https://files.pythonhosted.org/packages/13/4f/66d99628ff8ce7857aca52fed8f0066ce209f96be2fede6cef9f84e8d04f/pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a", size = 10801217, upload-time = "2025-09-29T23:17:04.522Z" },
{ url = "https://files.pythonhosted.org/packages/1d/03/3fc4a529a7710f890a239cc496fc6d50ad4a0995657dccc1d64695adb9f4/pandas-2.3.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5caf26f64126b6c7aec964f74266f435afef1c1b13da3b0636c7518a1fa3e2b1", size = 12148791, upload-time = "2025-09-29T23:17:18.444Z" },
{ url = "https://files.pythonhosted.org/packages/40/a8/4dac1f8f8235e5d25b9955d02ff6f29396191d4e665d71122c3722ca83c5/pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd7478f1463441ae4ca7308a70e90b33470fa593429f9d4c578dd00d1fa78838", size = 12769373, upload-time = "2025-09-29T23:17:35.846Z" },
{ url = "https://files.pythonhosted.org/packages/df/91/82cc5169b6b25440a7fc0ef3a694582418d875c8e3ebf796a6d6470aa578/pandas-2.3.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4793891684806ae50d1288c9bae9330293ab4e083ccd1c5e383c34549c6e4250", size = 13200444, upload-time = "2025-09-29T23:17:49.341Z" },
{ url = "https://files.pythonhosted.org/packages/10/ae/89b3283800ab58f7af2952704078555fa60c807fff764395bb57ea0b0dbd/pandas-2.3.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:28083c648d9a99a5dd035ec125d42439c6c1c525098c58af0fc38dd1a7a1b3d4", size = 13858459, upload-time = "2025-09-29T23:18:03.722Z" },
{ url = "https://files.pythonhosted.org/packages/85/72/530900610650f54a35a19476eca5104f38555afccda1aa11a92ee14cb21d/pandas-2.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:503cf027cf9940d2ceaa1a93cfb5f8c8c7e6e90720a2850378f0b3f3b1e06826", size = 11346086, upload-time = "2025-09-29T23:18:18.505Z" },
{ url = "https://files.pythonhosted.org/packages/c1/fa/7ac648108144a095b4fb6aa3de1954689f7af60a14cf25583f4960ecb878/pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523", size = 11578790, upload-time = "2025-09-29T23:18:30.065Z" },
{ url = "https://files.pythonhosted.org/packages/9b/35/74442388c6cf008882d4d4bdfc4109be87e9b8b7ccd097ad1e7f006e2e95/pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45", size = 10833831, upload-time = "2025-09-29T23:38:56.071Z" },
{ url = "https://files.pythonhosted.org/packages/fe/e4/de154cbfeee13383ad58d23017da99390b91d73f8c11856f2095e813201b/pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66", size = 12199267, upload-time = "2025-09-29T23:18:41.627Z" },
{ url = "https://files.pythonhosted.org/packages/bf/c9/63f8d545568d9ab91476b1818b4741f521646cbdd151c6efebf40d6de6f7/pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b", size = 12789281, upload-time = "2025-09-29T23:18:56.834Z" },
{ url = "https://files.pythonhosted.org/packages/f2/00/a5ac8c7a0e67fd1a6059e40aa08fa1c52cc00709077d2300e210c3ce0322/pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791", size = 13240453, upload-time = "2025-09-29T23:19:09.247Z" },
{ url = "https://files.pythonhosted.org/packages/27/4d/5c23a5bc7bd209231618dd9e606ce076272c9bc4f12023a70e03a86b4067/pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151", size = 13890361, upload-time = "2025-09-29T23:19:25.342Z" },
{ url = "https://files.pythonhosted.org/packages/8e/59/712db1d7040520de7a4965df15b774348980e6df45c129b8c64d0dbe74ef/pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c", size = 11348702, upload-time = "2025-09-29T23:19:38.296Z" },
{ url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" },
{ url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" },
{ url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" },
{ url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" },
{ url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" },
{ url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" },
{ url = "https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" },
{ url = "https://files.pythonhosted.org/packages/cd/4b/18b035ee18f97c1040d94debd8f2e737000ad70ccc8f5513f4eefad75f4b/pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713", size = 11544671, upload-time = "2025-09-29T23:21:05.024Z" },
{ url = "https://files.pythonhosted.org/packages/31/94/72fac03573102779920099bcac1c3b05975c2cb5f01eac609faf34bed1ca/pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8", size = 10680807, upload-time = "2025-09-29T23:21:15.979Z" },
{ url = "https://files.pythonhosted.org/packages/16/87/9472cf4a487d848476865321de18cc8c920b8cab98453ab79dbbc98db63a/pandas-2.3.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d", size = 11709872, upload-time = "2025-09-29T23:21:27.165Z" },
{ url = "https://files.pythonhosted.org/packages/15/07/284f757f63f8a8d69ed4472bfd85122bd086e637bf4ed09de572d575a693/pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac", size = 12306371, upload-time = "2025-09-29T23:21:40.532Z" },
{ url = "https://files.pythonhosted.org/packages/33/81/a3afc88fca4aa925804a27d2676d22dcd2031c2ebe08aabd0ae55b9ff282/pandas-2.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c", size = 12765333, upload-time = "2025-09-29T23:21:55.77Z" },
{ url = "https://files.pythonhosted.org/packages/8d/0f/b4d4ae743a83742f1153464cf1a8ecfafc3ac59722a0b5c8602310cb7158/pandas-2.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493", size = 13418120, upload-time = "2025-09-29T23:22:10.109Z" },
{ url = "https://files.pythonhosted.org/packages/4f/c7/e54682c96a895d0c808453269e0b5928a07a127a15704fedb643e9b0a4c8/pandas-2.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee", size = 10993991, upload-time = "2025-09-29T23:25:04.889Z" },
{ url = "https://files.pythonhosted.org/packages/f9/ca/3f8d4f49740799189e1395812f3bf23b5e8fc7c190827d55a610da72ce55/pandas-2.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5", size = 12048227, upload-time = "2025-09-29T23:22:24.343Z" },
{ url = "https://files.pythonhosted.org/packages/0e/5a/f43efec3e8c0cc92c4663ccad372dbdff72b60bdb56b2749f04aa1d07d7e/pandas-2.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21", size = 11411056, upload-time = "2025-09-29T23:22:37.762Z" },
{ url = "https://files.pythonhosted.org/packages/46/b1/85331edfc591208c9d1a63a06baa67b21d332e63b7a591a5ba42a10bb507/pandas-2.3.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78", size = 11645189, upload-time = "2025-09-29T23:22:51.688Z" },
{ url = "https://files.pythonhosted.org/packages/44/23/78d645adc35d94d1ac4f2a3c4112ab6f5b8999f4898b8cdf01252f8df4a9/pandas-2.3.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110", size = 12121912, upload-time = "2025-09-29T23:23:05.042Z" },
{ url = "https://files.pythonhosted.org/packages/53/da/d10013df5e6aaef6b425aa0c32e1fc1f3e431e4bcabd420517dceadce354/pandas-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86", size = 12712160, upload-time = "2025-09-29T23:23:28.57Z" },
{ url = "https://files.pythonhosted.org/packages/bd/17/e756653095a083d8a37cbd816cb87148debcfcd920129b25f99dd8d04271/pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc", size = 13199233, upload-time = "2025-09-29T23:24:24.876Z" },
{ url = "https://files.pythonhosted.org/packages/04/fd/74903979833db8390b73b3a8a7d30d146d710bd32703724dd9083950386f/pandas-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0", size = 11540635, upload-time = "2025-09-29T23:25:52.486Z" },
{ url = "https://files.pythonhosted.org/packages/21/00/266d6b357ad5e6d3ad55093a7e8efc7dd245f5a842b584db9f30b0f0a287/pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593", size = 10759079, upload-time = "2025-09-29T23:26:33.204Z" },
{ url = "https://files.pythonhosted.org/packages/ca/05/d01ef80a7a3a12b2f8bbf16daba1e17c98a2f039cbc8e2f77a2c5a63d382/pandas-2.3.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c", size = 11814049, upload-time = "2025-09-29T23:27:15.384Z" },
{ url = "https://files.pythonhosted.org/packages/15/b2/0e62f78c0c5ba7e3d2c5945a82456f4fac76c480940f805e0b97fcbc2f65/pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b", size = 12332638, upload-time = "2025-09-29T23:27:51.625Z" },
{ url = "https://files.pythonhosted.org/packages/c5/33/dd70400631b62b9b29c3c93d2feee1d0964dc2bae2e5ad7a6c73a7f25325/pandas-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6", size = 12886834, upload-time = "2025-09-29T23:28:21.289Z" },
{ url = "https://files.pythonhosted.org/packages/d3/18/b5d48f55821228d0d2692b34fd5034bb185e854bdb592e9c640f6290e012/pandas-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3", size = 13409925, upload-time = "2025-09-29T23:28:58.261Z" },
{ url = "https://files.pythonhosted.org/packages/a6/3d/124ac75fcd0ecc09b8fdccb0246ef65e35b012030defb0e0eba2cbbbe948/pandas-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5", size = 11109071, upload-time = "2025-09-29T23:32:27.484Z" },
{ url = "https://files.pythonhosted.org/packages/89/9c/0e21c895c38a157e0faa1fb64587a9226d6dd46452cac4532d80c3c4a244/pandas-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec", size = 12048504, upload-time = "2025-09-29T23:29:31.47Z" },
{ url = "https://files.pythonhosted.org/packages/d7/82/b69a1c95df796858777b68fbe6a81d37443a33319761d7c652ce77797475/pandas-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7", size = 11410702, upload-time = "2025-09-29T23:29:54.591Z" },
{ url = "https://files.pythonhosted.org/packages/f9/88/702bde3ba0a94b8c73a0181e05144b10f13f29ebfc2150c3a79062a8195d/pandas-2.3.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450", size = 11634535, upload-time = "2025-09-29T23:30:21.003Z" },
{ url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" },
{ url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" },
{ url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" },
]

[[package]]
name = "pdfminer-six"
version = "20250506"
@@ -2266,6 +2596,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ae/43/2b0607ef7f16d63fbe00de728151a090397ef5b3b9147b4aefe975d17106/pypdfium2-5.0.0-py3-none-win_arm64.whl", hash = "sha256:0a2a473fe95802e7a5f4140f25e5cd036cf17f060f27ee2d28c3977206add763", size = 2939015, upload-time = "2025-10-26T13:31:40.531Z" },
]

[[package]]
name = "pyreadline3"
version = "3.5.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/0f/49/4cea918a08f02817aabae639e3d0ac046fef9f9180518a3ad394e22da148/pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7", size = 99839, upload-time = "2024-09-19T02:40:10.062Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5a/dc/491b7661614ab97483abf2056be1deee4dc2490ecbf7bff9ab5cdbac86e1/pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6", size = 83178, upload-time = "2024-09-19T02:40:08.598Z" },
]

[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@@ -2291,6 +2630,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7", size = 252987, upload-time = "2025-06-16T20:46:22.506Z" },
]

[[package]]
name = "python-dotenv"
version = "1.2.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" },
]

[[package]]
name = "python-pptx"
version = "1.0.2"
@@ -2306,6 +2654,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" },
]

[[package]]
name = "pytz"
version = "2025.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" },
]

[[package]]
name = "pyyaml"
version = "6.0.3"
@@ -2654,7 +3011,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -2717,14 +3075,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -3083,6 +3445,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/97/45/1097a0eb3ad2a1cb5a29d493aefffe4170e454b2726cfe2a76f6652e91af/stringzilla-4.2.3-cp313-cp313-win_arm64.whl", hash = "sha256:1000a8df547fb3b194def48cc3ed6494715fe1c8ac25c6444ed3cfb0b52942c7", size = 99815, upload-time = "2025-10-27T18:35:56.555Z" },
]

[[package]]
name = "sympy"
version = "1.14.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "mpmath" },
]
sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" },
]

[[package]]
name = "termcolor"
version = "3.2.0"
@@ -3116,7 +3490,8 @@ source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.11' and sys_platform == 'darwin'",
"python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version < '3.11' and sys_platform == 'win32'",
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -3135,14 +3510,18 @@ resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'darwin'",
"python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version >= '3.14' and sys_platform == 'win32'",
"(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.13.*' and sys_platform == 'win32'",
"(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.12.*' and sys_platform == 'darwin'",
"python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.12.*' and sys_platform == 'win32'",
"(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
"python_full_version == '3.11.*' and sys_platform == 'darwin'",
"python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')",
"python_full_version == '3.11.*' and sys_platform == 'win32'",
"(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -3185,6 +3564,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
]

[[package]]
name = "tzdata"
version = "2025.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
]

[[package]]
name = "unidic-lite"
version = "1.0.8"