From 2d66abedf0f805dffc6d0dd4faaaec63b882d182 Mon Sep 17 00:00:00 2001 From: begoniezhao Date: Fri, 7 Nov 2025 10:30:02 +0800 Subject: [PATCH] feat: add document model classes, adjust configuration and parsing logic, and improve logging and imports; remove logging setup and redundant code, improve imports, type hints, and OCR backend management; switch module imports across files to absolute imports; adjust import paths, remove some imports, and improve logging and comments; upgrade the document parser to Docx2Parser and improve timeout and image-handling logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 7 +- docker-compose.yml | 1 + docker/Dockerfile.docreader | 10 +- docreader/.pylintrc | 5 + docreader/main.py | 113 ++--- docreader/models/__init__.py | 0 docreader/models/document.py | 87 ++++ docreader/models/read_config.py | 27 + docreader/parser/__init__.py | 19 +- docreader/parser/base_parser.py | 645 +++++++----------------- docreader/parser/caption.py | 105 ++-- docreader/parser/chain_parser.py | 70 +++ docreader/parser/config.py | 21 - docreader/parser/doc_parser.py | 350 ++++++------- docreader/parser/docx2_parser.py | 28 + docreader/parser/docx_parser.py | 185 ++++--- docreader/parser/image_parser.py | 64 +-- docreader/parser/image_utils.py | 43 -- docreader/parser/markdown_image_util.py | 111 ++++ docreader/parser/markdown_parser.py | 66 ++- docreader/parser/markitdown_parser.py | 31 ++ docreader/parser/mineru_parser.py | 124 +++++ docreader/parser/ocr_engine.py | 279 +++++----- docreader/parser/parser.py | 186 +++---- docreader/parser/pdf_parser.py | 116 +---- docreader/parser/storage.py | 272 ++++++---- docreader/parser/text_parser.py | 19 +- docreader/parser/web_parser.py | 27 +- docreader/proto/docreader_pb2.pyi | 127 +++++ docreader/proto/docreader_pb2_grpc.py | 2 +- docreader/pyproject.toml | 2 + docreader/scripts/generate_proto.sh | 11 +- docreader/splitter/header_hook.py | 112 ++++ docreader/splitter/splitter.py | 313 ++++++++++++ docreader/utils/endecode.py | 103 ++++ docreader/utils/request.py | 16 +- docreader/utils/split.py | 34 ++ docreader/utils/tempfile.py | 77 +++ docreader/uv.lock | 438 +++++++++++++++- 39 files changed, 2676 insertions(+), 1570 deletions(-) create mode 100644 docreader/.pylintrc create mode 100644 docreader/models/__init__.py create mode 100644 docreader/models/document.py create mode 100644 docreader/models/read_config.py create mode 100644 docreader/parser/chain_parser.py delete mode 100644 docreader/parser/config.py create mode 100644 docreader/parser/docx2_parser.py delete mode 100644 docreader/parser/image_utils.py create mode 100644 
docreader/parser/markdown_image_util.py create mode 100644 docreader/parser/markitdown_parser.py create mode 100644 docreader/parser/mineru_parser.py create mode 100644 docreader/proto/docreader_pb2.pyi create mode 100644 docreader/splitter/header_hook.py create mode 100644 docreader/splitter/splitter.py create mode 100644 docreader/utils/endecode.py create mode 100644 docreader/utils/split.py create mode 100644 docreader/utils/tempfile.py diff --git a/.gitignore b/.gitignore index 8de2b3c..5c4420c 100644 --- a/.gitignore +++ b/.gitignore @@ -24,17 +24,14 @@ node_modules/ tmp/ temp/ -# Docker compose файл (локальные настройки) -# docker-compose.yml - WeKnora /models/ -**/__pycache__ test/data/mswag.txt data/files/ -.python-version .venv/ +**/__pycache__ +.python-version ### macOS # General diff --git a/docker-compose.yml b/docker-compose.yml index 0713397..4c210b3 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -127,6 +127,7 @@ services: - MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME:-} - MINIO_USE_SSL=${MINIO_USE_SSL:-} - WEB_PROXY=${WEB_PROXY:-} + - MINERU_ENDPOINT=${MINERU_ENDPOINT:-} healthcheck: test: ["CMD", "grpc_health_probe", "-addr=:50051"] interval: 30s diff --git a/docker/Dockerfile.docreader b/docker/Dockerfile.docreader index 67d413f..f73476f 100644 --- a/docker/Dockerfile.docreader +++ b/docker/Dockerfile.docreader @@ -80,10 +80,11 @@ RUN pip install uv --break-system-packages && \ python -m uv sync --locked --no-dev # 复制源代码和生成脚本 -COPY docreader . +COPY docreader docreader # 生成 protobuf 代码 -RUN chmod +x scripts/generate_proto.sh && bash scripts/generate_proto.sh +RUN chmod +x docreader/scripts/generate_proto.sh && \ + bash docreader/scripts/generate_proto.sh # 确保模型目录存在 RUN ls -la /root/.paddleocr/whl/ @@ -150,10 +151,11 @@ RUN python -m playwright install-deps webkit # COPY docreader/scripts/download_deps.py download_deps.py # RUN python -m download_deps -COPY --from=builder /app/ ./ +COPY docreader/pyproject.toml docreader/uv.lock ./ +COPY --from=builder /app/docreader docreader # 暴露 gRPC 端口 EXPOSE 50051 # 直接运行 Python 服务(日志输出到 stdout/stderr) -CMD ["uv", "run", "main.py"] \ No newline at end of file +CMD ["uv", "run", "-m", "docreader.main"] \ No newline at end of file diff --git a/docreader/.pylintrc b/docreader/.pylintrc new file mode 100644 index 0000000..0f446b4 --- /dev/null +++ b/docreader/.pylintrc @@ -0,0 +1,5 @@ +[LOGGING] +logging-format-style=fstr + +[MESSAGES CONTROL] +; disable=W1203 diff --git a/docreader/main.py b/docreader/main.py index bba5256..1a0e2e7 100644 --- a/docreader/main.py +++ b/docreader/main.py @@ -1,37 +1,25 @@ -import os -import sys import logging -from concurrent import futures +import os +import re +import sys import traceback -import grpc import uuid -import atexit +from concurrent import futures +from typing import Optional + +import grpc from grpc_health.v1 import health_pb2_grpc from grpc_health.v1.health import HealthServicer -# Add parent directory to Python path -current_dir = os.path.dirname(os.path.abspath(__file__)) -parent_dir = os.path.dirname(current_dir) -if parent_dir not in sys.path: - sys.path.insert(0, parent_dir) +from docreader.models.read_config import ChunkingConfig +from docreader.parser import Parser +from docreader.parser.ocr_engine import OCREngine +from docreader.proto import docreader_pb2_grpc +from docreader.proto.docreader_pb2 import Chunk, Image, ReadResponse +from docreader.utils.request import init_logging_request_id, request_id_context -from proto.docreader_pb2 import ReadResponse, Chunk, Image -from 
proto import docreader_pb2_grpc -from parser import Parser, OCREngine -from parser.config import ChunkingConfig -from utils.request import request_id_context, init_logging_request_id - -# --- Encoding utilities: sanitize strings to valid UTF-8 and (optionally) multi-encoding read --- -import re -from typing import Optional - -try: - # Optional dependency for charset detection; install via `pip install charset-normalizer` - from charset_normalizer import from_bytes as _cn_from_bytes # type: ignore -except Exception: # pragma: no cover - _cn_from_bytes = None # type: ignore - -# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values and cannot be encoded to UTF-8 +# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values +# cannot be encoded to UTF-8 _SURROGATE_RE = re.compile(r"[\ud800-\udfff]") @@ -47,29 +35,6 @@ def to_valid_utf8_text(s: Optional[str]) -> str: return s.encode("utf-8", errors="replace").decode("utf-8") -def read_text_with_fallback(file_path: str) -> str: - """Read text from file supporting multiple encodings with graceful fallback. - - This server currently receives bytes over gRPC and delegates decoding to the parser. - This helper is provided for future local-file reads if needed. - """ - with open(file_path, "rb") as f: - raw = f.read() - if _cn_from_bytes is not None: - try: - result = _cn_from_bytes(raw).best() - if result: - return str(result) - except Exception: - pass - for enc in ("utf-8", "gb18030", "latin-1"): - try: - return raw.decode(enc, errors="replace") - except UnicodeDecodeError: - continue - return raw.decode("utf-8", errors="replace") - - # Ensure no existing handlers for handler in logging.root.handlers[:]: logging.root.removeHandler(handler) @@ -113,7 +78,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer): request.file_type or os.path.splitext(request.file_name)[1][1:] ) logger.info( - f"Received ReadFromFile request for file: {request.file_name}, type: {file_type}" + f"ReadFromFile for file: {request.file_name}, type: {file_type}" ) logger.info(f"File content size: {len(request.file_content)} bytes") @@ -124,8 +89,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer): enable_multimodal = request.read_config.enable_multimodal or False logger.info( - f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, " - f"multimodal={enable_multimodal}" + f"Using chunking config: size={chunk_size}, " + f"overlap={chunk_overlap}, multimodal={enable_multimodal}" ) # Get Storage and VLM config from request @@ -144,7 +109,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer): "path_prefix": sc.path_prefix, } logger.info( - f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}" + f"Using Storage config: provider={storage_config.get('provider')}, " + f"bucket={storage_config['bucket_name']}" ) vlm_config = { @@ -170,7 +136,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer): ) # Parse file - logger.info(f"Starting file parsing process") + logger.info("Starting file parsing process") result = self.parser.parse_file( request.file_name, file_type, request.file_content, chunking_config ) @@ -184,7 +150,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer): # Convert to protobuf message logger.info( - f"Successfully parsed file {request.file_name}, returning {len(result.chunks)} chunks" + f"Parsed file {request.file_name}, with {len(result.chunks)} chunks" ) # Build response, including image info @@ -224,8 +190,8 @@ class 
DocReaderServicer(docreader_pb2_grpc.DocReaderServicer): enable_multimodal = request.read_config.enable_multimodal or False logger.info( - f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, " - f"multimodal={enable_multimodal}" + f"Using chunking config: size={chunk_size}, " + f"overlap={chunk_overlap}, multimodal={enable_multimodal}" ) # Get Storage and VLM config from request @@ -243,7 +209,8 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer): "path_prefix": sc.path_prefix, } logger.info( - f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}" + f"Using Storage config: provider={storage_config.get('provider')}, " + f"bucket={storage_config['bucket_name']}" ) vlm_config = { @@ -269,7 +236,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer): ) # Parse URL - logger.info(f"Starting URL parsing process") + logger.info("Starting URL parsing process") result = self.parser.parse_url( request.url, request.title, chunking_config ) @@ -282,7 +249,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer): # Convert to protobuf message, including image info logger.info( - f"Successfully parsed URL {request.url}, returning {len(result.chunks)} chunks" + f"Parsed URL {request.url}, returning {len(result.chunks)} chunks" ) response = ReadResponse( @@ -335,29 +302,15 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer): return proto_chunk -def init_ocr_engine(ocr_backend, ocr_config): +def init_ocr_engine(ocr_backend: Optional[str] = None, **kwargs): """Initialize OCR engine""" - try: - logger.info(f"Initializing OCR engine with backend: {ocr_backend}") - ocr_engine = OCREngine.get_instance(backend_type=ocr_backend, **ocr_config) - if ocr_engine: - logger.info("OCR engine initialized successfully") - return True - else: - logger.error("OCR engine initialization failed") - return False - except Exception as e: - logger.error(f"Error initializing OCR engine: {str(e)}") - return False + backend_type = ocr_backend or os.getenv("OCR_BACKEND", "paddle") + logger.info(f"Initializing OCR engine with backend: {backend_type}") + OCREngine.get_instance(backend_type=backend_type, **kwargs) def main(): - init_ocr_engine( - os.getenv("OCR_BACKEND", "paddle"), - { - "OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""), - }, - ) + init_ocr_engine() # Set max number of worker threads max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4")) diff --git a/docreader/models/__init__.py b/docreader/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docreader/models/document.py b/docreader/models/document.py new file mode 100644 index 0000000..1ab8c46 --- /dev/null +++ b/docreader/models/document.py @@ -0,0 +1,87 @@ +"""Chunk document schema.""" + +import json +from typing import Any, Dict, List + +from pydantic import BaseModel, Field + + +class Chunk(BaseModel): + """Document Chunk including chunk content, chunk metadata.""" + + content: str = Field(default="", description="chunk text content") + seq: int = Field(default=0, description="Chunk sequence number") + start: int = Field(default=0, description="Chunk start position") + end: int = Field(description="Chunk end position") + images: List[Dict[str, Any]] = Field( + default_factory=list, description="Images in the chunk" + ) + + metadata: Dict[str, Any] = Field( + default_factory=dict, + description="metadata fields", + ) + + def to_dict(self, **kwargs: Any) -> Dict[str, Any]: + """Convert Chunk to dict.""" + + data = 
self.model_dump() + data.update(kwargs) + data["class_name"] = self.__class__.__name__ + return data + + def to_json(self, **kwargs: Any) -> str: + """Convert Chunk to json.""" + data = self.to_dict(**kwargs) + return json.dumps(data) + + def __hash__(self): + """Hash function.""" + return hash((self.content,)) + + def __eq__(self, other): + """Equal function.""" + return self.content == other.content + + @classmethod + def from_dict(cls, data: Dict[str, Any], **kwargs: Any): # type: ignore + """Create Chunk from dict.""" + if isinstance(kwargs, dict): + data.update(kwargs) + + data.pop("class_name", None) + return cls(**data) + + @classmethod + def from_json(cls, data_str: str, **kwargs: Any): # type: ignore + """Create Chunk from json.""" + data = json.loads(data_str) + return cls.from_dict(data, **kwargs) + + +class Document(BaseModel): + """Document including document content, document metadata.""" + + model_config = {"arbitrary_types_allowed": True} + + content: str = Field(default="", description="document text content") + images: Dict[str, str] = Field( + default_factory=dict, description="Images in the document" + ) + + chunks: List[Chunk] = Field(default_factory=list, description="document chunks") + metadata: Dict[str, Any] = Field( + default_factory=dict, + description="metadata fields", + ) + + def set_content(self, content: str) -> None: + """Set document content.""" + self.content = content + + def get_content(self) -> str: + """Get document content.""" + return self.content + + def is_valid(self) -> bool: + return self.content != "" diff --git a/docreader/models/read_config.py b/docreader/models/read_config.py new file mode 100644 index 0000000..c2c95d8 --- /dev/null +++ b/docreader/models/read_config.py @@ -0,0 +1,27 @@ +from dataclasses import dataclass, field + + +@dataclass +class ChunkingConfig: + """ + Configuration for text chunking process. + Controls how documents are split into smaller pieces for processing. + """ + + # Maximum size of each chunk in tokens/chars + chunk_size: int = 512 + + # Number of tokens/chars to overlap between chunks + chunk_overlap: int = 50 + + # Text separators in order of priority + separators: list = field(default_factory=lambda: ["\n\n", "\n", "。"]) + + # Whether to enable multimodal processing (text + images) + enable_multimodal: bool = False + + # Preferred field name going forward + storage_config: dict[str, str] = field(default_factory=dict) + + # VLM configuration for image captioning + vlm_config: dict[str, str] = field(default_factory=dict) diff --git a/docreader/parser/__init__.py b/docreader/parser/__init__.py index c60a018..085b09f 100644 --- a/docreader/parser/__init__.py +++ b/docreader/parser/__init__.py @@ -13,22 +13,18 @@ The parsers extract content from documents and can split them into meaningful chunks for further processing and indexing. 
""" -from .base_parser import BaseParser, ParseResult -from .docx_parser import DocxParser from .doc_parser import DocParser -from .pdf_parser import PDFParser -from .markdown_parser import MarkdownParser -from .text_parser import TextParser +from .docx2_parser import Docx2Parser from .image_parser import ImageParser -from .web_parser import WebParser +from .markdown_parser import MarkdownParser from .parser import Parser -from .config import ChunkingConfig -from .ocr_engine import OCREngine +from .pdf_parser import PDFParser +from .text_parser import TextParser +from .web_parser import WebParser # Export public classes and modules __all__ = [ - "BaseParser", # Base parser class that all format parsers inherit from - "DocxParser", # Parser for .docx files (modern Word documents) + "Docx2Parser", # Parser for .docx files (modern Word documents) "DocParser", # Parser for .doc files (legacy Word documents) "PDFParser", # Parser for PDF documents "MarkdownParser", # Parser for Markdown text files @@ -36,7 +32,4 @@ __all__ = [ "ImageParser", # Parser for images with text content "WebParser", # Parser for web pages "Parser", # Main parser factory that selects the appropriate parser - "ChunkingConfig", # Configuration for text chunking behavior - "ParseResult", # Standard result format returned by all parsers - "OCREngine", # OCR engine for extracting text from images ] diff --git a/docreader/parser/base_parser.py b/docreader/parser/base_parser.py index 052bc82..8dab374 100644 --- a/docreader/parser/base_parser.py +++ b/docreader/parser/base_parser.py @@ -1,65 +1,28 @@ # -*- coding: utf-8 -*- -import re -import os import asyncio -from typing import List, Dict, Any, Optional, Tuple, Union -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -import logging -import sys -import traceback -import numpy as np -import time import io -import json -from .ocr_engine import OCREngine -from .image_utils import image_to_base64 -from .config import ChunkingConfig -from .storage import create_storage +import logging +import os +import re +import time +from abc import ABC, abstractmethod +from typing import Dict, List, Optional, Tuple + +import requests from PIL import Image -# Add parent directory to Python path for src imports -current_dir = os.path.dirname(os.path.abspath(__file__)) -parent_dir = os.path.dirname(current_dir) -if parent_dir not in sys.path: - sys.path.insert(0, parent_dir) - -try: - from services.docreader.src.parser.caption import Caption -except ImportError: - # Fallback: try relative import - try: - from .caption import Caption - except ImportError: - # If both imports fail, set to None - Caption = None - logging.warning( - "Failed to import Caption, image captioning will be unavailable" - ) +from docreader.models.document import Chunk, Document +from docreader.models.read_config import ChunkingConfig +from docreader.parser.caption import Caption +from docreader.parser.ocr_engine import OCREngine +from docreader.parser.storage import create_storage +from docreader.splitter.splitter import TextSplitter +from docreader.utils import endecode logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -@dataclass -class Chunk: - """Chunk result""" - - content: str # Chunk content - seq: int # Chunk sequence number - start: int # Chunk start position - end: int # Chunk end position - images: List[Dict[str, Any]] = field(default_factory=list) # Images in the chunk - - -@dataclass -class ParseResult: - """Parse result""" - - text: str # Extracted text content - 
chunks: Optional[List[Chunk]] = None # Chunk results - - class BaseParser(ABC): """Base parser interface""" @@ -97,17 +60,17 @@ class BaseParser(ABC): def __init__( self, file_name: str = "", - file_type: str = None, + file_type: Optional[str] = None, enable_multimodal: bool = True, chunk_size: int = 1000, chunk_overlap: int = 200, - separators: list = ["\n\n", "\n", "。"], + separators: list[str] = ["\n\n", "\n", "。"], ocr_backend: str = "paddle", - ocr_config: dict = None, + ocr_config: dict = {}, max_image_size: int = 1920, # Maximum image size max_concurrent_tasks: int = 5, # Max concurrent tasks max_chunks: int = 1000, # Max number of returned chunks - chunking_config: ChunkingConfig = None, # Chunking configuration object + chunking_config: Optional[ChunkingConfig] = None, ): """Initialize parser @@ -125,7 +88,6 @@ class BaseParser(ABC): max_chunks: Max number of returned chunks """ # Storage client instance - self._storage = None self.file_name = file_name self.file_type = file_type or os.path.splitext(file_name)[1] self.enable_multimodal = enable_multimodal @@ -133,15 +95,16 @@ class BaseParser(ABC): self.chunk_overlap = chunk_overlap self.separators = separators self.ocr_backend = os.getenv("OCR_BACKEND", ocr_backend) - self.ocr_config = ocr_config or {} + self.ocr_config = ocr_config self.max_image_size = max_image_size self.max_concurrent_tasks = max_concurrent_tasks self.max_chunks = max_chunks self.chunking_config = chunking_config - - logger.info( - f"Initializing {self.__class__.__name__} for file: {file_name}, type: {self.file_type}" + self.storage = create_storage( + self.chunking_config.storage_config if self.chunking_config else None ) + + logger.info(f"Initializing parser for file: {file_name}, type: {file_type}") logger.info( f"Parser config: chunk_size={chunk_size}, " f"overlap={chunk_overlap}, " @@ -150,16 +113,24 @@ class BaseParser(ABC): f"max_chunks={max_chunks}" ) # Only initialize Caption service if multimodal is enabled - if self.enable_multimodal: - try: - self.caption_parser = Caption(self.chunking_config.vlm_config) - except Exception as e: - logger.warning(f"Failed to initialize Caption service: {str(e)}") - self.caption_parser = None - else: - self.caption_parser = None + vlm_config = self.chunking_config.vlm_config if self.chunking_config else None + self.caption_parser = ( + Caption(vlm_config=vlm_config) if self.enable_multimodal else None + ) - def perform_ocr(self, image): + @abstractmethod + def parse_into_text(self, content: bytes) -> Document: + """Parse document content + + Args: + content: Document content + + Returns: + Either a string containing the parsed text, or a tuple of (text, image_map) + where image_map is a dict mapping image URLs to Image objects + """ + + def perform_ocr(self, image: Image.Image): """Execute OCR recognition on the image Args: @@ -170,53 +141,23 @@ class BaseParser(ABC): """ start_time = time.time() logger.info("Starting OCR recognition") - resized_image = None - try: - # Resize image to avoid processing large images - resized_image = self._resize_image_if_needed(image) + # Resize image to avoid processing large images + resized_image = self._resize_image_if_needed(image) - # Get OCR engine - ocr_engine = self.get_ocr_engine( - backend_type=self.ocr_backend, **self.ocr_config - ) - if ocr_engine is None: - logger.error( - f"OCR engine ({self.ocr_backend}) initialization failed or unavailable, " - "skipping OCR recognition" - ) - return "" + # Get OCR engine + ocr_engine = OCREngine.get_instance(self.ocr_backend) - # 
Execute OCR prediction - logger.info(f"Executing OCR prediction (using {self.ocr_backend} engine)") - # Add extra exception handling - try: - ocr_result = ocr_engine.predict(resized_image) - except RuntimeError as e: - # Handle common CUDA memory issues or other runtime errors - logger.error(f"OCR prediction runtime error: {str(e)}") - return "" - except Exception as e: - # Handle other prediction errors - logger.error(f"Unexpected OCR prediction error: {str(e)}") - return "" + # Execute OCR prediction + logger.info(f"Executing OCR prediction (using {self.ocr_backend} engine)") + ocr_result = ocr_engine.predict(resized_image) - process_time = time.time() - start_time - logger.info(f"OCR recognition completed, time: {process_time:.2f} seconds") - return ocr_result - except Exception as e: - process_time = time.time() - start_time - logger.error( - f"OCR recognition error: {str(e)}, time: {process_time:.2f} seconds" - ) - return "" - finally: - # Release image resources - if resized_image is not image and hasattr(resized_image, "close"): - # Only close the new image we created, not the original image - resized_image.close() + process_time = time.time() - start_time + logger.info(f"OCR recognition completed, time: {process_time:.2f} seconds") - def _resize_image_if_needed(self, image): + return ocr_result + + def _resize_image_if_needed(self, image: Image.Image) -> Image.Image: """Resize image if it exceeds maximum size limit Args: @@ -225,102 +166,21 @@ class BaseParser(ABC): Returns: Resized image object """ - try: - # If it's a PIL Image - if hasattr(image, "size"): - width, height = image.size - if width > self.max_image_size or height > self.max_image_size: - logger.info(f"Resizing PIL image, original size: {width}x{height}") - scale = min( - self.max_image_size / width, self.max_image_size / height - ) - new_width = int(width * scale) - new_height = int(height * scale) - resized_image = image.resize((new_width, new_height)) - logger.info(f"Resized to: {new_width}x{new_height}") - return resized_image - else: - logger.info( - f"PIL image size {width}x{height} is within limits, no resizing needed" - ) - return image - # If it's a numpy array - elif hasattr(image, "shape"): - height, width = image.shape[:2] - if width > self.max_image_size or height > self.max_image_size: - logger.info( - f"Resizing numpy image, original size: {width}x{height}" - ) - scale = min( - self.max_image_size / width, self.max_image_size / height - ) - new_width = int(width * scale) - new_height = int(height * scale) - # Use PIL for resizing numpy arrays - pil_image = Image.fromarray(image) - resized_pil = pil_image.resize((new_width, new_height)) - resized_image = np.array(resized_pil) - logger.info(f"Resized to: {new_width}x{new_height}") - return resized_image - else: - logger.info( - f"Numpy image size {width}x{height} is within limits, no resizing needed" - ) - return image - else: - logger.warning(f"Unknown image type: {type(image)}, cannot resize") - return image - except Exception as e: - logger.error(f"Error resizing image: {str(e)}") - return image + width, height = image.size + if width > self.max_image_size or height > self.max_image_size: + logger.info(f"Resizing PIL image, original size: {width}x{height}") + scale = min(self.max_image_size / width, self.max_image_size / height) + new_width = int(width * scale) + new_height = int(height * scale) + resized_image = image.resize((new_width, new_height)) + logger.info(f"Resized to: {new_width}x{new_height}") + return resized_image - def 
process_image(self, image, image_url=None): - """Process image: first perform OCR, then get caption if text is available + logger.info(f"PIL image size is {width}x{height}, no resizing needed") + return image - Args: - image: Image object (PIL.Image or numpy array) - image_url: Image URL (if uploaded) - - Returns: - tuple: (ocr_text, caption, image_url) - - ocr_text: OCR extracted text - - caption: Image description (if OCR has text) or empty string - - image_url: Image URL (if provided) - """ - logger.info("Starting image processing (OCR + optional caption)") - - # Resize image - image = self._resize_image_if_needed(image) - - # Perform OCR recognition - ocr_text = self.perform_ocr(image) - caption = "" - - if self.caption_parser: - logger.info( - f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption" - ) - # Convert image to base64 for caption generation - img_base64 = image_to_base64(image) - if img_base64: - caption = self.get_image_caption(img_base64) - if caption: - logger.info(f"Successfully obtained image caption: {caption}") - else: - logger.warning("Failed to get caption") - else: - logger.warning("Failed to convert image to base64") - caption = "" - else: - logger.info("Caption service not initialized, skipping caption retrieval") - - # Release image resources - del image - - return ocr_text, caption, image_url - - async def process_image_async(self, image, image_url=None): - """Asynchronously process image: first perform OCR, then get caption if text is available + async def process_image_async(self, image: Image.Image, image_url: str): + """Asynchronously process image: first perform OCR, then get caption Args: image: Image object (PIL.Image or numpy array) @@ -333,84 +193,47 @@ class BaseParser(ABC): - image_url: Image URL (if provided) """ logger.info("Starting asynchronous image processing (OCR + optional caption)") - resized_image = None + # Resize image + resized_image = self._resize_image_if_needed(image) try: - # Resize image - resized_image = self._resize_image_if_needed(image) - - # Perform OCR recognition (using run_in_executor to execute synchronous operations in the event loop) + # Perform OCR recognition loop = asyncio.get_event_loop() try: # Add timeout mechanism to avoid infinite blocking (30 seconds timeout) ocr_task = loop.run_in_executor(None, self.perform_ocr, resized_image) ocr_text = await asyncio.wait_for(ocr_task, timeout=30.0) - except asyncio.TimeoutError: - logger.error( - "OCR processing timed out (30 seconds), skipping this image" - ) - ocr_text = "" except Exception as e: - logger.error(f"OCR processing error: {str(e)}") + logger.error(f"OCR processing error, skipping this image: {str(e)}") ocr_text = "" - logger.info( - f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption" - ) - caption = "" - if self.caption_parser: - try: - # Convert image to base64 for caption generation - img_base64 = image_to_base64(resized_image) - if img_base64: - # Add timeout to avoid blocking caption retrieval (30 seconds timeout) - caption_task = self.get_image_caption_async(img_base64) - image_data, caption = await asyncio.wait_for( - caption_task, timeout=30.0 - ) - if caption: - logger.info( - f"Successfully obtained image caption: {caption}" - ) - else: - logger.warning("Failed to get caption") - else: - logger.warning("Failed to convert image to base64") - caption = "" - except asyncio.TimeoutError: - logger.warning("Caption retrieval timed out, skipping") - except Exception as e: - logger.error(f"Failed to 
get caption: {str(e)}") - else: - logger.info( - "Caption service not initialized, skipping caption retrieval" - ) - + logger.info(f"Successfully obtained image ocr: {ocr_text}") + img_base64 = endecode.decode_image(resized_image) + caption = self.get_image_caption(img_base64) + logger.info(f"Successfully obtained image caption: {caption}") return ocr_text, caption, image_url finally: - # Release image resources - if resized_image is not image and hasattr(resized_image, "close"): - # Only close the new image we created, not the original image - resized_image.close() + resized_image.close() - async def process_with_limit(self, idx, image, url, semaphore): + async def process_with_limit( + self, idx: int, image: Image.Image, url: str, semaphore: asyncio.Semaphore + ): """Function to process a single image using a semaphore""" try: - logger.info(f"Waiting to process image {idx+1}") + logger.info(f"Waiting to process image {idx + 1}") async with semaphore: # Use semaphore to control concurrency - logger.info(f"Starting to process image {idx+1}") + logger.info(f"Starting to process image {idx + 1}") result = await self.process_image_async(image, url) - logger.info(f"Completed processing image {idx+1}") + logger.info(f"Completed processing image {idx + 1}") return result except Exception as e: - logger.error(f"Error processing image {idx+1}: {str(e)}") + logger.error(f"Error processing image {idx + 1}: {str(e)}") return ("", "", url) # Return empty result to avoid overall failure finally: # Manually release image resources - if hasattr(image, "close"): - image.close() + image.close() - async def process_multiple_images(self, images_data): + async def process_multiple_images(self, images_data: List[Tuple[Image.Image, str]]): """Process multiple images concurrently Args: @@ -450,7 +273,7 @@ class BaseParser(ABC): for i, result in enumerate(completed_results): if isinstance(result, Exception): logger.error( - f"Image {i+1} processing returned an exception: {str(result)}" + f"Image {i + 1} processing returned an exception: {str(result)}" ) # For exceptions, add empty results if i < len(images_data): @@ -467,47 +290,10 @@ class BaseParser(ABC): logger.info("Image processing resource cleanup complete") logger.info( - f"Completed concurrent processing of {len(results)}/{len(images_data)} images" + f"Concurrent processing of {len(results)}/{len(images_data)} images" ) return results - def decode_bytes(self, content: bytes) -> str: - """Intelligently decode byte stream, supports multiple encodings - - Tries to decode in common encodings, if all fail, uses latin-1 as fallback - - Args: - content: Byte stream to decode - - Returns: - Decoded string - """ - logger.info(f"Attempting to decode bytes of length: {len(content)}") - # Common encodings, sorted by priority - encodings = ["utf-8", "gb18030", "gb2312", "gbk", "big5", "ascii", "latin-1"] - text = None - - # Try decoding with each encoding format - for encoding in encodings: - try: - text = content.decode(encoding) - logger.info(f"Successfully decoded content using {encoding} encoding") - break - except UnicodeDecodeError: - logger.info(f"Failed to decode using {encoding} encoding") - continue - - # If all encodings fail, use latin-1 as fallback - if text is None: - text = content.decode("latin-1") - logger.warning( - f"Unable to determine correct encoding, using latin-1 as fallback. " - f"This may cause character issues." 
- ) - - logger.info(f"Decoded text length: {len(text)} characters") - return text - def get_image_caption(self, image_data: str) -> str: """Get image description @@ -517,6 +303,9 @@ class BaseParser(ABC): Returns: Image description """ + if not self.caption_parser: + logger.warning("Caption parser not initialized") + return "" start_time = time.time() logger.info( f"Getting caption for image: {image_data[:250]}..." @@ -533,80 +322,7 @@ class BaseParser(ABC): logger.warning("Failed to get caption for image") return caption - async def get_image_caption_async(self, image_data: str) -> Tuple[str, str]: - """Asynchronously get image description - - Args: - image_data: Image data (base64 encoded string or URL) - - Returns: - Tuple[str, str]: Image data and corresponding description - """ - caption = self.get_image_caption(image_data) - return image_data, caption - - def __init_storage(self): - """Initialize storage client based on configuration""" - if self._storage is None: - storage_config = ( - self.chunking_config.storage_config if self.chunking_config else None - ) - self._storage = create_storage(storage_config) - logger.info( - f"Initialized storage client: {self._storage.__class__.__name__}" - ) - return self._storage - - def upload_file(self, file_path: str) -> str: - """Upload file to object storage - - Args: - file_path: File path - - Returns: - File URL - """ - logger.info(f"Uploading file: {file_path}") - try: - storage = self.__init_storage() - return storage.upload_file(file_path) - except Exception as e: - logger.error(f"Failed to upload file: {str(e)}") - return "" - - def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str: - """Upload bytes to object storage - - Args: - content: Byte content to upload - file_ext: File extension - - Returns: - File URL - """ - logger.info(f"Uploading bytes content, size: {len(content)} bytes") - try: - storage = self.__init_storage() - return storage.upload_bytes(content, file_ext) - except Exception as e: - logger.error(f"Failed to upload bytes to storage: {str(e)}") - traceback.print_exc() - return "" - - @abstractmethod - def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]: - """Parse document content - - Args: - content: Document content - - Returns: - Either a string containing the parsed text, or a tuple of (text, image_map) - where image_map is a dict mapping image URLs to Image objects - """ - pass - - def parse(self, content: bytes) -> ParseResult: + def parse(self, content: bytes) -> Document: """Parse document content Args: @@ -616,17 +332,19 @@ class BaseParser(ABC): Parse result """ logger.info( - f"Parsing document with {self.__class__.__name__}, content size: {len(content)} bytes" + f"Parsing document with {self.__class__.__name__}, bytes: {len(content)}" ) - parse_result = self.parse_into_text(content) - if isinstance(parse_result, tuple): - text, image_map = parse_result - else: - text = parse_result - image_map = {} - logger.info(f"Extracted {len(text)} characters of text from {self.file_name}") - logger.info(f"Beginning chunking process for text") - chunks = self.chunk_text(text) + document = self.parse_into_text(content) + logger.info( + f"Extracted {len(document.content)} characters from {self.file_name}" + ) + splitter = TextSplitter( + chunk_size=self.chunk_size, + chunk_overlap=self.chunk_overlap, + separators=self.separators, + ) + chunk_str = splitter.split_text(document.content) + chunks = self._str_to_chunk(chunk_str) logger.info(f"Created {len(chunks)} chunks from 
document") # Limit the number of returned chunks @@ -636,7 +354,7 @@ class BaseParser(ABC): ) chunks = chunks[: self.max_chunks] - # If multimodal is enabled and file type is supported, process images in each chunk + # If multimodal is enabled and file type is supported, process images if self.enable_multimodal: # Get file extension and convert to lowercase file_ext = ( @@ -647,11 +365,12 @@ class BaseParser(ABC): # Define allowed file types for image processing allowed_types = [ - ".pdf", # PDF files + # Text files + ".pdf", ".md", - ".markdown", # Markdown files + ".markdown", ".doc", - ".docx", # Word documents + ".docx", # Image files ".jpg", ".jpeg", @@ -666,13 +385,21 @@ class BaseParser(ABC): logger.info( f"Processing images in each chunk for file type: {file_ext}" ) - chunks = self.process_chunks_images(chunks, image_map) + chunks = self.process_chunks_images(chunks, document.images) else: logger.info( f"Skipping image processing for unsupported file type: {file_ext}" ) - return ParseResult(text=text, chunks=chunks) + document.chunks = chunks + return document + + def _str_to_chunk(self, text: List[Tuple[int, int, str]]) -> List[Chunk]: + """Convert string to Chunk object""" + return [ + Chunk(seq=i, content=t, start=start, end=end) + for i, (start, end, t) in enumerate(text) + ] def _split_into_units(self, text: str) -> List[str]: """ @@ -682,9 +409,7 @@ class BaseParser(ABC): Returns: 基本单元的列表 """ - logger.info( - f"Splitting text into basic units with robust structure protection, text length: {len(text)}" - ) + logger.info(f"Splitting text into basic units, text length: {len(text)}") # 定义所有需要作为整体保护的结构模式 --- table_pattern = r"(?m)(^\|.*\|[ \t]*\r?\n(?:[ \t]*\r?\n)?^\|\s*:?--+.*\r?\n(?:^\|.*\|\r?\n?)*)" @@ -710,7 +435,8 @@ class BaseParser(ABC): # 按起始位置排序 protected_ranges.sort(key=lambda x: x[0]) logger.info( - f"Found {len(protected_ranges)} protected structures (tables, code, formulas, images, links)." + f"Found {len(protected_ranges)} protected structures " + "(tables, code, formulas, images, links)." ) # 合并可能重叠的保护范围 --- @@ -731,7 +457,7 @@ class BaseParser(ABC): merged_ranges.append((current_start, current_end)) protected_ranges = merged_ranges logger.info( - f"After merging overlaps, {len(protected_ranges)} protected ranges remain." + f"After overlaps, {len(protected_ranges)} protected ranges remain." ) # 根据保护范围和分隔符来分割文本 --- @@ -749,7 +475,7 @@ class BaseParser(ABC): segments = re.split(separator_pattern, pre_text) units.extend([s for s in segments if s]) # 添加所有非空部分 - # b. 将整个受保护的块(例如,一个完整的表格)作为一个单独的、不可分割的单元添加 + # b. 
将整个受保护的块(例如,一个完整的表格)作为一个不可分割的单元添加 protected_text = text[start:end] units.append(protected_text) @@ -764,38 +490,6 @@ class BaseParser(ABC): logger.info(f"Text splitting complete, created {len(units)} final basic units.") return units - def _find_complete_units(self, units: List[str], target_size: int) -> List[str]: - """Find a list of complete units that do not exceed the target size - - Args: - units: List of units - target_size: Target size - - Returns: - List of complete units - """ - logger.info(f"Finding complete units with target size: {target_size}") - result = [] - current_size = 0 - - for unit in units: - unit_size = len(unit) - if current_size + unit_size > target_size and result: - logger.info( - f"Reached target size limit at {current_size} characters, stopping" - ) - break - result.append(unit) - current_size += unit_size - logger.info( - f"Added unit of size {unit_size}, current total: {current_size}/{target_size}" - ) - - logger.info( - f"Found {len(result)} complete units totaling {current_size} characters" - ) - return result - def chunk_text(self, text: str) -> List[Chunk]: """Chunk text, preserving Markdown structure @@ -825,7 +519,7 @@ class BaseParser(ABC): for i, unit in enumerate(units): unit_size = len(unit) - logger.info(f"Processing unit {i+1}/{len(units)}, size: {unit_size}") + logger.info(f"Processing unit {i + 1}/{len(units)}, size: {unit_size}") # If current chunk plus new unit exceeds size limit, create new chunk if current_size + unit_size > self.chunk_size and current_chunk: @@ -855,14 +549,12 @@ class BaseParser(ABC): for u in reversed(current_chunk): if overlap_size + len(u) > overlap_target: logger.info( - f"Reached overlap target ({overlap_size}/{overlap_target})" + f"Overlap target ({overlap_size}/{overlap_target})" ) break overlap_units.insert(0, u) overlap_size += len(u) - logger.info( - f"Added unit to overlap, current overlap size: {overlap_size}" - ) + logger.info(f"Added unit to overlap, size: {overlap_size}") # Remove elements from overlap that are included in separators start_index = 0 @@ -883,7 +575,7 @@ class BaseParser(ABC): overlap_units = overlap_units[start_index:] logger.info( - f"Final overlap: {len(overlap_units)} units, {overlap_size} characters" + f"Overlap: {len(overlap_units)} units, {overlap_size} size" ) current_chunk = overlap_units @@ -899,7 +591,7 @@ class BaseParser(ABC): current_chunk.append(unit) current_size += unit_size logger.info( - f"Added unit to current chunk, now at {current_size}/{self.chunk_size} characters" + f"Added unit to current chunk, at {current_size}/{self.chunk_size}" ) # Add the last chunk @@ -925,12 +617,13 @@ class BaseParser(ABC): chunk: Document chunk Returns: - List of image information, each element contains image URL and match position + List of image information """ logger.info(f"Extracting image information from Chunk #{chunk.seq}") text = chunk.content - # Regex to extract image information from text, supporting Markdown images and HTML images + # Regex to extract image information from text, + # support: Markdown images, HTML images img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)|]*src="([^"]+)" [^>]*>' # Extract image information @@ -954,28 +647,28 @@ class BaseParser(ABC): images_info.append(image_info) logger.info( - f"Image in Chunk #{chunk.seq} {match_idx+1}: " f"URL={img_url[:50]}..." + f"Image in Chunk #{chunk.seq} {match_idx + 1}: URL={img_url[:50]}..." 
if len(img_url) > 50 - else f"Image in Chunk #{chunk.seq} {match_idx+1}: URL={img_url}" + else f"Image in Chunk #{chunk.seq} {match_idx + 1}: URL={img_url}" ) return images_info - async def download_and_upload_image(self, img_url: str): - """Download image and upload to object storage, if it's already an object storage path or local path, use directly + async def download_and_upload_image( + self, img_url: str + ) -> Tuple[str, str, Image.Image | None]: + """Download image and upload to object storage, + if it's already an object storage path or local path, use directly Args: img_url: Image URL or local path Returns: - tuple: (original URL, storage URL, image object), if failed returns (original URL, None, None) + tuple: (original URL, storage URL, image object), + if failed returns (original URL, None, None) """ try: - import requests - from PIL import Image - import io - # Check if it's already a storage URL (COS or MinIO) is_storage_url = any( pattern in img_url @@ -997,12 +690,7 @@ class BaseParser(ABC): response = requests.get(img_url, timeout=5, proxies=proxies) if response.status_code == 200: image = Image.open(io.BytesIO(response.content)) - try: - return img_url, img_url, image - finally: - # Ensure image resources are also released after the function returns - # Image will be closed by the caller - pass + return img_url, img_url, image else: logger.warning( f"Failed to get storage image: {response.status_code}" @@ -1022,7 +710,7 @@ class BaseParser(ABC): # Upload to storage with open(img_url, "rb") as f: content = f.read() - storage_url = self.upload_bytes(content) + storage_url = self.storage.upload_bytes(content) logger.info( f"Successfully uploaded local image to storage: {storage_url}" ) @@ -1031,7 +719,7 @@ class BaseParser(ABC): logger.error(f"Error processing local image: {str(e)}") if image and hasattr(image, "close"): image.close() - return img_url, None, None + return img_url, img_url, None # Normal remote URL download handling else: @@ -1044,9 +732,7 @@ class BaseParser(ABC): if https_proxy: proxies["https"] = https_proxy - logger.info( - f"Downloading image {img_url}, using proxy: {proxies if proxies else 'None'}" - ) + logger.info(f"Downloading image {img_url}, using proxy: {proxies}") response = requests.get(img_url, timeout=5, proxies=proxies) if response.status_code == 200: @@ -1054,7 +740,7 @@ class BaseParser(ABC): image = Image.open(io.BytesIO(response.content)) try: # Upload to storage using the method in BaseParser - storage_url = self.upload_bytes(response.content) + storage_url = self.storage.upload_bytes(response.content) logger.info( f"Successfully uploaded image to storage: {storage_url}" ) @@ -1064,11 +750,11 @@ class BaseParser(ABC): pass else: logger.warning(f"Failed to download image: {response.status_code}") - return img_url, None, None + return img_url, img_url, None except Exception as e: logger.error(f"Error downloading or processing image: {str(e)}") - return img_url, None, None + return img_url, img_url, None async def process_chunk_images_async( self, chunk, chunk_idx, total_chunks, image_map=None @@ -1086,18 +772,19 @@ class BaseParser(ABC): """ logger.info( - f"Starting to process images in Chunk #{chunk_idx+1}/{total_chunks}" + f"Starting to process images in Chunk #{chunk_idx + 1}/{total_chunks}" ) # Extract image information from the Chunk images_info = self.extract_images_from_chunk(chunk) if not images_info: - logger.info(f"Chunk #{chunk_idx+1} found no images") + logger.info(f"Chunk #{chunk_idx + 1} found no images") return chunk # 
Prepare images that need to be downloaded and processed images_to_process = [] - url_to_info_map = {} # Map URL to image information + # Map URL to image information + url_to_info_map = {} # Record all image URLs that need to be processed for img_info in images_info: @@ -1106,14 +793,21 @@ class BaseParser(ABC): results = [] download_tasks = [] - for img_url in url_to_info_map.keys(): # Check if image is already in the image_map + # Check if image is already in the image_map + for img_url in url_to_info_map.keys(): if image_map and img_url in image_map: - logger.info(f"Image already in image_map: {img_url}, using cached object") - results.append((img_url, img_url, image_map[img_url])) + logger.info( + f"Image already in image_map: {img_url}, using cached object" + ) + image = Image.open( + io.BytesIO(endecode.encode_image(image_map[img_url])) + ) + results.append((img_url, img_url, image)) else: download_task = self.download_and_upload_image(img_url) download_tasks.append(download_task) - # Concurrent download and upload of images, ignore images that are already in the image_map + # Concurrent download and upload of images, + # ignore images that are already in the image_map results.extend(await asyncio.gather(*download_tasks)) # Process download results, prepare for OCR processing @@ -1123,16 +817,17 @@ class BaseParser(ABC): img_info["cos_url"] = cos_url images_to_process.append((image, cos_url)) - # If no images were successfully downloaded and uploaded, return the original Chunk + # If no images were successfully downloaded and uploaded, + # return the original Chunk if not images_to_process: logger.info( - f"Chunk #{chunk_idx+1} found no successfully downloaded and uploaded images" + f"Chunk #{chunk_idx + 1} not found downloaded and uploaded images" ) return chunk # Concurrent processing of all images (OCR + caption) logger.info( - f"Processing {len(images_to_process)} images in Chunk #{chunk_idx+1}" + f"Processing {len(images_to_process)} images in Chunk #{chunk_idx + 1}" ) # Concurrent processing of all images @@ -1163,10 +858,12 @@ class BaseParser(ABC): # Update image information in the Chunk chunk.images = processed_images - logger.info(f"Completed image processing in Chunk #{chunk_idx+1}") + logger.info(f"Completed image processing in Chunk #{chunk_idx + 1}") return chunk - def process_chunks_images(self, chunks: List[Chunk], image_map=None) -> List[Chunk]: + def process_chunks_images( + self, chunks: List[Chunk], image_map: Dict[str, str] = {} + ) -> List[Chunk]: """Concurrent processing of images in all Chunks Args: @@ -1210,7 +907,7 @@ class BaseParser(ABC): processed_chunks = [] for i, result in enumerate(results): if isinstance(result, Exception): - logger.error(f"Error processing Chunk {i+1}: {str(result)}") + logger.error(f"Error processing Chunk {i + 1}: {str(result)}") # Keep original Chunk if i < len(chunks): processed_chunks.append(chunks[i]) @@ -1235,7 +932,7 @@ class BaseParser(ABC): # Execute processing for all Chunks processed_chunks = loop.run_until_complete(process_all_chunks()) logger.info( - f"Successfully completed concurrent processing of {len(processed_chunks)}/{len(chunks)} chunks" + f"Completed processing of {len(processed_chunks)}/{len(chunks)} chunks" ) return processed_chunks diff --git a/docreader/parser/caption.py b/docreader/parser/caption.py index d84bf6d..f3e6f69 100644 --- a/docreader/parser/caption.py +++ b/docreader/parser/caption.py @@ -3,11 +3,10 @@ import logging import os import time from dataclasses import dataclass, field -from typing 
import List, Optional, Union +from typing import Dict, List, Optional, Union -import requests import ollama - +import requests logger = logging.getLogger(__name__) @@ -158,11 +157,16 @@ class CaptionChatResp: Returns: The content string from the first choice, or empty string if no choices """ - if self.choices: - logger.info("Retrieving content from first choice") - return self.choices[0].message.content - logger.warning("No choices available in response") - return "" + if ( + not self.choices + or not self.choices[0] + or not self.choices[0].message + or not self.choices[0].message.content + ): + logger.warning("No choices available in response") + return "" + logger.info("Retrieving content from first choice") + return self.choices[0].message.content class Caption: @@ -171,33 +175,43 @@ class Caption: Uses an external API to process images and return textual descriptions. """ - def __init__(self, vlm_config=None): - """Initialize the Caption service with configuration from parameters or environment variables.""" + def __init__(self, vlm_config: Optional[Dict[str, str]] = None): + """ + Initialize the Caption service with configuration + from parameters or environment variables. + """ logger.info("Initializing Caption service") self.prompt = """简单凝炼的描述图片的主要内容""" - - # Use provided VLM config if available, otherwise fall back to environment variables + self.timeout = 30 + + # Use provided VLM config if available, + # otherwise fall back to environment variables if vlm_config and vlm_config.get("base_url") and vlm_config.get("model_name"): self.completion_url = vlm_config.get("base_url", "") + "/chat/completions" self.model = vlm_config.get("model_name", "") self.api_key = vlm_config.get("api_key", "") self.interface_type = vlm_config.get("interface_type", "openai").lower() else: - if os.getenv("VLM_MODEL_BASE_URL") == "" or os.getenv("VLM_MODEL_NAME") == "": + base_url = os.getenv("VLM_MODEL_BASE_URL") + model_name = os.getenv("VLM_MODEL_NAME") + if not base_url or not model_name: logger.error("VLM_MODEL_BASE_URL or VLM_MODEL_NAME is not set") return - self.completion_url = os.getenv("VLM_MODEL_BASE_URL") + "/chat/completions" - self.model = os.getenv("VLM_MODEL_NAME") - self.api_key = os.getenv("VLM_MODEL_API_KEY") + self.completion_url = base_url + "/chat/completions" + self.model = model_name + self.api_key = os.getenv("VLM_MODEL_API_KEY", "") self.interface_type = os.getenv("VLM_INTERFACE_TYPE", "openai").lower() - + # 验证接口类型 if self.interface_type not in ["ollama", "openai"]: - logger.warning(f"Unknown interface type: {self.interface_type}, defaulting to openai") + logger.warning( + f"Unknown interface type: {self.interface_type}, defaulting to openai" + ) self.interface_type = "openai" - + logger.info( - f"Service configured with model: {self.model}, endpoint: {self.completion_url}, interface: {self.interface_type}" + f"Configured with model: {self.model}, " + f"endpoint: {self.completion_url}, interface: {self.interface_type}" ) def _call_caption_api(self, image_data: str) -> Optional[CaptionChatResp]: @@ -210,8 +224,8 @@ class Caption: Returns: CaptionChatResp object if successful, None otherwise """ - logger.info(f"Calling Caption API for image captioning") - logger.info(f"Processing image data: {image_data[:50] if len(image_data) > 50 else image_data}") + logger.info("Calling Caption API for image captioning") + logger.info(f"Processing image data: {image_data[:50]}...") # 根据接口类型选择调用方式 if self.interface_type == "ollama": @@ -226,39 +240,35 @@ class Caption: client = 
ollama.Client( host=host, + timeout=self.timeout, ) - + try: logger.info(f"Calling Ollama API with model: {self.model}") - + # 调用Ollama API,使用images参数传递base64编码的图片 response = client.generate( model=self.model, prompt="简单凝炼的描述图片的主要内容", - images=[image_base64], # image_base64是base64编码的图片数据 + images=[image_base64], # image_base64是base64编码的图片数据 options={"temperature": 0.1}, stream=False, ) - + # 构造响应对象 caption_resp = CaptionChatResp( id="ollama_response", created=int(time.time()), - model=self.model, + model=Model(id=self.model), object="chat.completion", choices=[ - Choice( - message=Message( - role="assistant", - content=response.response - ) - ) - ] + Choice(message=Message(role="assistant", content=response.response)) + ], ) - + logger.info("Successfully received response from Ollama API") return caption_resp - + except Exception as e: logger.error(f"Error calling Ollama API: {e}") return None @@ -266,13 +276,16 @@ class Caption: def _call_openai_api(self, image_base64: str) -> Optional[CaptionChatResp]: """Call OpenAI-compatible API for image captioning.""" logger.info(f"Calling OpenAI-compatible API with model: {self.model}") - + user_msg = UserMessage( role="user", content=[ Content(type="text", text=self.prompt), Content( - type="image_url", image_url=ImageUrl(url="data:image/png;base64," + image_base64, detail="auto") + type="image_url", + image_url=ImageUrl( + url="data:image/png;base64," + image_base64, detail="auto" + ), ), ], ) @@ -295,23 +308,23 @@ class Caption: headers["Authorization"] = f"Bearer {self.api_key}" try: - logger.info(f"Sending request to OpenAI-compatible API with model: {self.model}") + logger.info( + f"Sending request to OpenAI-compatible API with model: {self.model}" + ) response = requests.post( self.completion_url, data=json.dumps(gpt_req, default=lambda o: o.__dict__, indent=4), headers=headers, - timeout=30, + timeout=self.timeout, ) if response.status_code != 200: logger.error( - f"OpenAI-compatible API returned non-200 status code: {response.status_code}" + f"OpenAI API returned non-200 status code: {response.status_code}" ) response.raise_for_status() - logger.info( - f"Successfully received response from OpenAI-compatible API with status: {response.status_code}" - ) - logger.info(f"Converting response to CaptionChatResp object") + logger.info(f"Received from OpenAI with status: {response.status_code}") + logger.info("Converting response to CaptionChatResp object") caption_resp = CaptionChatResp.from_json(response.json()) if caption_resp.usage: @@ -322,7 +335,7 @@ class Caption: return caption_resp except requests.exceptions.Timeout: - logger.error(f"Timeout while calling OpenAI-compatible API after 30 seconds") + logger.error("Timeout while calling OpenAI-compatible API after 30 seconds") return None except requests.exceptions.RequestException as e: logger.error(f"Request error calling OpenAI-compatible API: {e}") diff --git a/docreader/parser/chain_parser.py b/docreader/parser/chain_parser.py new file mode 100644 index 0000000..45fd1c9 --- /dev/null +++ b/docreader/parser/chain_parser.py @@ -0,0 +1,70 @@ +import logging +from typing import List, Tuple, Type + +from docreader.models.document import Document +from docreader.parser.base_parser import BaseParser +from docreader.utils import endecode + +logger = logging.getLogger(__name__) + + +class FirstParser(BaseParser): + _parser_cls: Tuple[Type["BaseParser"], ...] 
= () + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self._parsers: List[BaseParser] = [] + for parser_cls in self._parser_cls: + try: + parser = parser_cls(*args, **kwargs) + self._parsers.append(parser) + except Exception as e: + logger.error(f"Failed to create parser {parser_cls.__name__}: {e}") + + def parse_into_text(self, content: bytes) -> Document: + for p in self._parsers: + document = p.parse_into_text(content) + if document.is_valid(): + return document + return Document() + + @classmethod + def create(cls, *parser_classes: Type["BaseParser"]) -> Type["FirstParser"]: + names = "_".join([p.__name__ for p in parser_classes]) + return type(f"FirstParser_{names}", (cls,), {"_parser_cls": parser_classes}) + + +class PipelineParser(BaseParser): + _parser_cls: Tuple[Type["BaseParser"], ...] = () + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self._parsers: List[BaseParser] = [] + for parser_cls in self._parser_cls: + try: + parser = parser_cls(*args, **kwargs) + self._parsers.append(parser) + except Exception as e: + logger.error(f"Failed to create parser {parser_cls.__name__}: {e}") + + def parse_into_text(self, content: bytes) -> Document: + document = Document() + for p in self._parsers: + document = p.parse_into_text(content) + content = endecode.encode_bytes(document.content) + return document + + @classmethod + def create(cls, *parser_classes: Type["BaseParser"]) -> Type["PipelineParser"]: + names = "_".join([p.__name__ for p in parser_classes]) + return type(f"PipelineParser_{names}", (cls,), {"_parser_cls": parser_classes}) + + +if __name__ == "__main__": + from docreader.parser.markdown_parser import MarkdownParser + + cls = FirstParser.create(MarkdownParser) + parser = cls() + print(parser.parse_into_text(b"aaa")) diff --git a/docreader/parser/config.py b/docreader/parser/config.py deleted file mode 100644 index 85f9cb5..0000000 --- a/docreader/parser/config.py +++ /dev/null @@ -1,21 +0,0 @@ -from dataclasses import dataclass, field - - -@dataclass -class ChunkingConfig: - """ - Configuration for text chunking process. - Controls how documents are split into smaller pieces for processing. 
- """ - - chunk_size: int = 512 # Maximum size of each chunk in tokens/chars - chunk_overlap: int = 50 # Number of tokens/chars to overlap between chunks - separators: list = field( - default_factory=lambda: ["\n\n", "\n", "。"] - ) # Text separators in order of priority - enable_multimodal: bool = ( - False # Whether to enable multimodal processing (text + images) - ) - storage_config: dict = None # Preferred field name going forward - vlm_config: dict = None # VLM configuration for image captioning - diff --git a/docreader/parser/doc_parser.py b/docreader/parser/doc_parser.py index 71fc897..337ab85 100644 --- a/docreader/parser/doc_parser.py +++ b/docreader/parser/doc_parser.py @@ -1,134 +1,88 @@ -import asyncio import logging -import re -import tempfile import os import subprocess -import shutil -from io import BytesIO -from typing import Optional, List, Tuple -import textract -from PIL import Image -import zipfile -import xml.etree.ElementTree as ET +from typing import List, Optional -from .base_parser import BaseParser -from .docx_parser import DocxParser, Docx +import textract + +from docreader.models.document import Document +from docreader.parser.docx2_parser import Docx2Parser +from docreader.utils.tempfile import TempDirContext, TempFileContext logger = logging.getLogger(__name__) -class DocParser(BaseParser): +class DocParser(Docx2Parser): """DOC document parser""" - def parse_into_text(self, content: bytes) -> str: - """Parse DOC document - - Args: - content: DOC document content - - Returns: - Parse result - """ + def parse_into_text(self, content: bytes) -> Document: logger.info(f"Parsing DOC document, content size: {len(content)} bytes") + handle_chain = [ + # 1. Try to convert to docx format to extract images + self._parse_with_docx, + # 2. If image extraction is not needed or conversion failed, + # try using antiword to extract text + self._parse_with_antiword, + # 3. 
If antiword extraction fails, use textract + self._parse_with_textract, + ] + # Save byte content as a temporary file - with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as temp_file: - temp_file_path = temp_file.name - temp_file.write(content) - temp_file.flush() - logger.info(f"Saved DOC content to temporary file: {temp_file_path}") + with TempFileContext(content, ".doc") as temp_file_path: + for handle in handle_chain: + try: + document = handle(temp_file_path) + if document: + return document + except Exception as e: + logger.warning(f"Failed to parse DOC with {handle.__name__} {e}") - try: - # First try to convert to docx format to extract images - if self.enable_multimodal: - logger.info("Multimodal enabled, attempting to extract images from DOC") - docx_content = self._convert_doc_to_docx(temp_file_path) + return Document(content="") - if docx_content: - logger.info("Successfully converted DOC to DOCX, using DocxParser") - # Use existing DocxParser to parse the converted docx - docx_parser = DocxParser( - file_name=self.file_name, - file_type="docx", - enable_multimodal=self.enable_multimodal, - chunk_size=self.chunk_size, - chunk_overlap=self.chunk_overlap, - chunking_config=self.chunking_config, - separators=self.separators, - ) - text = docx_parser.parse_into_text(docx_content) - logger.info(f"Extracted {len(text)} characters using DocxParser") + def _parse_with_docx(self, temp_file_path: str) -> Document: + logger.info("Multimodal enabled, attempting to extract images from DOC") - # Clean up temporary file - os.unlink(temp_file_path) - logger.info(f"Deleted temporary file: {temp_file_path}") + docx_content = self._try_convert_doc_to_docx(temp_file_path) + if not docx_content: + raise RuntimeError("Failed to convert DOC to DOCX") - return text - else: - logger.warning( - "Failed to convert DOC to DOCX, falling back to text-only extraction" - ) + logger.info("Successfully converted DOC to DOCX, using DocxParser") + # Use existing DocxParser to parse the converted docx + document = super(Docx2Parser, self).parse_into_text(docx_content) + logger.info(f"Extracted {len(document.content)} characters using DocxParser") + return document - # If image extraction is not needed or conversion failed, try using antiword to extract text - try: - logger.info("Attempting to parse DOC file with antiword") - # Check if antiword is installed - antiword_path = self._find_antiword_path() + def _parse_with_antiword(self, temp_file_path: str) -> Document: + logger.info("Attempting to parse DOC file with antiword") - if antiword_path: - # Use antiword to extract text directly - logger.info(f"Using antiword at {antiword_path} to extract text") - process = subprocess.Popen( - [antiword_path, temp_file_path], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - stdout, stderr = process.communicate() + # Check if antiword is installed + antiword_path = self._try_find_antiword() + if not antiword_path: + raise RuntimeError("antiword not found in PATH") - if process.returncode == 0: - text = stdout.decode("utf-8", errors="ignore") - logger.info( - f"Successfully extracted {len(text)} characters using antiword" - ) - - # Clean up temporary file - os.unlink(temp_file_path) - logger.info(f"Deleted temporary file: {temp_file_path}") - - return text - else: - logger.warning( - f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}" - ) - else: - logger.warning("antiword not found, falling back to textract") - except Exception as e: - logger.warning( - f"Error using antiword: 
{str(e)}, falling back to textract" - ) - - # If antiword fails, try using textract - logger.info("Parsing DOC file with textract") - text = textract.process(temp_file_path, method="antiword").decode("utf-8") - logger.info( - f"Successfully extracted {len(text)} characters of text from DOC document using textract" + # Use antiword to extract text directly + process = subprocess.Popen( + [antiword_path, temp_file_path], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = process.communicate() + if process.returncode != 0: + raise RuntimeError( + f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}" ) + text = stdout.decode("utf-8", errors="ignore") + logger.info(f"Successfully extracted {len(text)} characters using antiword") + return Document(content=text) - # Clean up temporary file - os.unlink(temp_file_path) - logger.info(f"Deleted temporary file: {temp_file_path}") + def _parse_with_textract(self, temp_file_path: str) -> Document: + logger.info(f"Parsing DOC file with textract: {temp_file_path}") + text = textract.process(temp_file_path, method="antiword").decode("utf-8") + logger.info(f"Successfully extracted {len(text)} bytes of DOC using textract") + return Document(content=str(text)) - return text - except Exception as e: - logger.error(f"Error parsing DOC document: {str(e)}") - # Ensure temporary file is cleaned up - if os.path.exists(temp_file_path): - os.unlink(temp_file_path) - logger.info(f"Deleted temporary file after error: {temp_file_path}") - return "" - - def _convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]: + def _try_convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]: """Convert DOC file to DOCX format Uses LibreOffice/OpenOffice for conversion @@ -141,21 +95,16 @@ class DocParser(BaseParser): """ logger.info(f"Converting DOC to DOCX: {doc_path}") + # Check if LibreOffice or OpenOffice is installed + soffice_path = self._try_find_soffice() + if not soffice_path: + return None + + # Execute conversion command + logger.info(f"Using {soffice_path} to convert DOC to DOCX") + # Create a temporary directory to store the converted file - temp_dir = tempfile.mkdtemp() - docx_path = os.path.join(temp_dir, "converted.docx") - - try: - # Check if LibreOffice or OpenOffice is installed - soffice_path = self._find_soffice_path() - if not soffice_path: - logger.error( - "LibreOffice/OpenOffice not found, cannot convert DOC to DOCX" - ) - return None - - # Execute conversion command - logger.info(f"Using {soffice_path} to convert DOC to DOCX") + with TempDirContext() as temp_dir: cmd = [ soffice_path, "--headless", @@ -165,7 +114,6 @@ class DocParser(BaseParser): temp_dir, doc_path, ] - logger.info(f"Running command: {' '.join(cmd)}") process = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE @@ -173,41 +121,68 @@ class DocParser(BaseParser): stdout, stderr = process.communicate() if process.returncode != 0: - logger.error( - f"Error converting DOC to DOCX: {stderr.decode('utf-8', errors='ignore')}" + logger.warning( + f"Error converting DOC to DOCX: {stderr.decode('utf-8')}" ) return None # Find the converted file - for file in os.listdir(temp_dir): - if file.endswith(".docx"): - converted_file = os.path.join(temp_dir, file) - logger.info(f"Found converted file: {converted_file}") - - # Read the converted file content - with open(converted_file, "rb") as f: - docx_content = f.read() + docx_file = [ + file for file in os.listdir(temp_dir) if file.endswith(".docx") + ] + logger.info(f"Found 
{len(docx_file)} DOCX file(s) in temporary directory") + for file in docx_file: + converted_file = os.path.join(temp_dir, file) + logger.info(f"Found converted file: {converted_file}") + # Read the converted file content + with open(converted_file, "rb") as f: + docx_content = f.read() logger.info( - f"Successfully read converted DOCX file, size: {len(docx_content)} bytes" + f"Successfully read DOCX file, size: {len(docx_content)}" ) return docx_content + return None - logger.error("No DOCX file found after conversion") - return None + def _try_find_executable_path( + self, + executable_name: str, + possible_path: List[str] = [], + environment_variable: List[str] = [], + ) -> Optional[str]: + """Find executable path + Args: + executable_name: Executable name + possible_path: List of possible paths + environment_variable: List of environment variables to check + Returns: + Executable path, or None if not found + """ + # Common executable paths + paths: List[str] = [] + paths.extend(possible_path) + paths.extend(os.environ.get(env_var, "") for env_var in environment_variable) + paths = list(set(paths)) - except Exception as e: - logger.error(f"Error during DOC to DOCX conversion: {str(e)}") - return None - finally: - # Clean up temporary directory - try: - shutil.rmtree(temp_dir) - logger.info(f"Cleaned up temporary directory: {temp_dir}") - except Exception as e: - logger.warning(f"Failed to clean up temporary directory: {str(e)}") + # Check if path is set in environment variable + for path in paths: + if os.path.exists(path): + logger.info(f"Found {executable_name} at {path}") + return path - def _find_soffice_path(self) -> Optional[str]: + # Try to find in PATH + result = subprocess.run( + ["which", executable_name], capture_output=True, text=True + ) + if result.returncode == 0 and result.stdout.strip(): + path = result.stdout.strip() + logger.info(f"Found {executable_name} at {path}") + return path + + logger.warning(f"Failed to find {executable_name}") + return None + + def _try_find_soffice(self) -> Optional[str]: """Find LibreOffice/OpenOffice executable path Returns: @@ -225,32 +200,13 @@ class DocParser(BaseParser): "C:\\Program Files\\LibreOffice\\program\\soffice.exe", "C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe", ] + return self._try_find_executable_path( + executable_name="soffice", + possible_path=possible_paths, + environment_variable=["LIBREOFFICE_PATH"], + ) - # Check if path is set in environment variable - if os.environ.get("LIBREOFFICE_PATH"): - possible_paths.insert(0, os.environ.get("LIBREOFFICE_PATH")) - - for path in possible_paths: - if os.path.exists(path): - logger.info(f"Found LibreOffice/OpenOffice at: {path}") - return path - - # Try to find in PATH - try: - result = subprocess.run( - ["which", "soffice"], capture_output=True, text=True - ) - if result.returncode == 0 and result.stdout.strip(): - path = result.stdout.strip() - logger.info(f"Found LibreOffice/OpenOffice in PATH: {path}") - return path - except Exception: - pass - - logger.warning("LibreOffice/OpenOffice not found") - return None - - def _find_antiword_path(self) -> Optional[str]: + def _try_find_antiword(self) -> Optional[str]: """Find antiword executable path Returns: @@ -265,51 +221,27 @@ class DocParser(BaseParser): "C:\\Program Files\\Antiword\\antiword.exe", "C:\\Program Files (x86)\\Antiword\\antiword.exe", ] - - # Check if path is set in environment variable - if os.environ.get("ANTIWORD_PATH"): - possible_paths.insert(0, os.environ.get("ANTIWORD_PATH")) - - for path in 
possible_paths: - if os.path.exists(path): - logger.info(f"Found antiword at: {path}") - return path - - # Try to find in PATH - try: - result = subprocess.run( - ["which", "antiword"], capture_output=True, text=True - ) - if result.returncode == 0 and result.stdout.strip(): - path = result.stdout.strip() - logger.info(f"Found antiword in PATH: {path}") - return path - except Exception: - pass - - logger.warning("antiword not found") - return None + return self._try_find_executable_path( + executable_name="antiword", + possible_path=possible_paths, + environment_variable=["ANTIWORD_PATH"], + ) if __name__ == "__main__": - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - ) - logger.info("Running DocParser in standalone mode") + logging.basicConfig(level=logging.DEBUG) file_name = "/path/to/your/test.doc" logger.info(f"Processing file: {file_name}") - doc_parser = DocParser( - file_name, enable_multimodal=True, chunk_size=512, chunk_overlap=60 + file_name=file_name, + enable_multimodal=True, + chunk_size=512, + chunk_overlap=60, ) - logger.info("Parser initialized, starting processing") - with open(file_name, "rb") as f: content = f.read() - text = doc_parser.parse_into_text(content) - logger.info(f"Processing complete, extracted text length: {len(text)}") - logger.info(f"Sample text: {text[:200]}...") + document = doc_parser.parse_into_text(content) + logger.info(f"Processing complete, extracted text length: {len(document.content)}") + logger.info(f"Sample text: {document.content[:200]}...") diff --git a/docreader/parser/docx2_parser.py b/docreader/parser/docx2_parser.py new file mode 100644 index 0000000..872b3ef --- /dev/null +++ b/docreader/parser/docx2_parser.py @@ -0,0 +1,28 @@ +import logging + +from docreader.parser.chain_parser import FirstParser +from docreader.parser.docx_parser import DocxParser +from docreader.parser.markitdown_parser import MarkitdownParser + +logger = logging.getLogger(__name__) + + +class Docx2Parser(FirstParser): + _parser_cls = (MarkitdownParser, DocxParser) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) + + your_file = "/path/to/your/file.docx" + parser = Docx2Parser(separators=[".", "?", "!", "。", "?", "!"]) + with open(your_file, "rb") as f: + content = f.read() + + document = parser.parse(content) + for cc in document.chunks: + logger.info(f"chunk: {cc}") + + # document = parser.parse_into_text(content) + # logger.info(f"docx content: {document.content}") + # logger.info(f"find images {document.images.keys()}") diff --git a/docreader/parser/docx_parser.py b/docreader/parser/docx_parser.py index ee0cb90..979a4fc 100644 --- a/docreader/parser/docx_parser.py +++ b/docreader/parser/docx_parser.py @@ -1,37 +1,36 @@ import logging -import tempfile import os -import sys -import time -from io import BytesIO -from typing import Optional, Dict, Any, Tuple, List, Union -from dataclasses import dataclass, field -from PIL import Image -from docx import Document -from docx.image.exceptions import ( - UnrecognizedImageError, - UnexpectedEndOfFileError, - InvalidImageStreamError, -) -from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed +import re import tempfile import threading +import time import traceback +from concurrent.futures import ProcessPoolExecutor, as_completed +from dataclasses import dataclass, field +from io import BytesIO from multiprocessing import Manager -import re +from typing import Any, Dict, List, 
Optional, Tuple -from .base_parser import BaseParser +from docx import Document +from docx.image.exceptions import ( + InvalidImageStreamError, + UnexpectedEndOfFileError, + UnrecognizedImageError, +) +from PIL import Image + +from docreader.models.document import Document as DocumentModel +from docreader.parser.base_parser import BaseParser +from docreader.utils import endecode logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -# Add thread local storage to track the processing status of each thread -thread_local = threading.local() class ImageData: """Represents a processed image of document content""" + local_path: str = "" - object: Image.Image = None + object: Optional[Image.Image] = None url: str = "" @@ -40,7 +39,9 @@ class LineData: """Represents a processed line of document content with associated images""" text: str = "" # Extracted text content - images: List[ImageData] = field(default_factory=list) # List of images or image paths + images: List[ImageData] = field( + default_factory=list + ) # List of images or image paths extra_info: str = "" # Placeholder for additional info (currently unused) page_num: int = 0 # Page number content_sequence: List[Tuple[str, Any]] = field( @@ -53,18 +54,8 @@ class DocxParser(BaseParser): def __init__( self, - file_name: str = "", - file_type: str = None, - enable_multimodal: bool = True, - chunk_size: int = 1000, - chunk_overlap: int = 200, - separators: list = ["\n\n", "\n", "。"], - ocr_backend: str = "paddle", - ocr_config: dict = None, - max_image_size: int = 1920, - max_concurrent_tasks: int = 5, - max_pages: int = 100, # Maximum number of pages to process, default to 50 pages - chunking_config=None, + max_pages: int = 100, # Maximum number of pages to process + **kwargs, ): """Initialize DOCX document parser @@ -79,37 +70,16 @@ class DocxParser(BaseParser): ocr_config: OCR engine configuration max_image_size: Maximum image size limit max_concurrent_tasks: Maximum number of concurrent tasks - max_pages: Maximum number of pages to process, if more than this, only process the first max_pages pages + max_pages: Maximum number of pages to process """ - super().__init__( - file_name=file_name, - file_type=file_type, - enable_multimodal=enable_multimodal, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - separators=separators, - ocr_backend=ocr_backend, - ocr_config=ocr_config, - max_image_size=max_image_size, - max_concurrent_tasks=max_concurrent_tasks, - chunking_config=chunking_config, - ) + super().__init__(**kwargs) self.max_pages = max_pages logger.info(f"DocxParser initialized with max_pages={max_pages}") - def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]: - """Parse DOCX document, extract text content and image Markdown links - - Args: - content: DOCX document content - - Returns: - Tuple of (parsed_text, image_map) where image_map maps image URLs to Image objects - All LineData objects are used internally but not returned directly through this interface - """ + def parse_into_text(self, content: bytes) -> DocumentModel: + """Parse DOCX document, extract text content and image Markdown links""" logger.info(f"Parsing DOCX document, content size: {len(content)} bytes") logger.info(f"Max pages limit set to: {self.max_pages}") - logger.info("Converting DOCX content to sections and tables") start_time = time.time() # Use concurrent processing to handle the document @@ -123,7 +93,7 @@ class DocxParser(BaseParser): docx_processor = Docx( max_image_size=self.max_image_size, 
enable_multimodal=self.enable_multimodal, - upload_file=self.upload_file, + upload_file=self.storage.upload_file, ) all_lines, tables = docx_processor( binary=content, @@ -140,7 +110,7 @@ class DocxParser(BaseParser): section_start_time = time.time() text_parts = [] - image_parts = {} + image_parts: Dict[str, str] = {} for sec_idx, line in enumerate(all_lines): try: @@ -148,16 +118,19 @@ class DocxParser(BaseParser): text_parts.append(line.text) if sec_idx < 3 or sec_idx % 50 == 0: logger.info( - f"Added section {sec_idx+1} text: {line.text[:50]}..." + f"Added section {sec_idx + 1} text: {line.text[:50]}..." if len(line.text) > 50 - else f"Added section {sec_idx+1} text: {line.text}" + else f"Added section {sec_idx + 1} text: {line.text}" ) if line.images: for image_data in line.images: - if image_data.url: - image_parts[image_data.url] = image_data.object + if image_data.url and image_data.object: + image_parts[image_data.url] = endecode.decode_image( + image_data.object + ) + image_data.object.close() except Exception as e: - logger.error(f"Error processing section {sec_idx+1}: {str(e)}") + logger.error(f"Error processing section {sec_idx + 1}: {str(e)}") logger.error(f"Detailed stack trace: {traceback.format_exc()}") continue @@ -176,17 +149,17 @@ class DocxParser(BaseParser): total_processing_time = time.time() - start_time logger.info( - f"Parsing complete in {total_processing_time:.2f}s, generated {len(text)} characters of text " + f"Parsing complete in {total_processing_time:.2f}s, " + f"generated {len(text)} characters of text" ) - return text, image_parts + return DocumentModel(content=text, images=image_parts) except Exception as e: logger.error(f"Error parsing DOCX document: {str(e)}") logger.error(f"Detailed stack trace: {traceback.format_exc()}") - fallback_text = self._parse_using_simple_method(content) - return fallback_text, {} + return self._parse_using_simple_method(content) - def _parse_using_simple_method(self, content: bytes) -> str: + def _parse_using_simple_method(self, content: bytes) -> DocumentModel: """Parse document using a simplified method, as a fallback Args: @@ -201,7 +174,8 @@ class DocxParser(BaseParser): doc = Document(BytesIO(content)) logger.info( f"Successfully loaded document in simplified method, " - f"contains {len(doc.paragraphs)} paragraphs and {len(doc.tables)} tables" + f"contains {len(doc.paragraphs)} paragraphs " + f"and {len(doc.tables)} tables" ) text_parts = [] @@ -211,7 +185,7 @@ class DocxParser(BaseParser): para_with_text = 0 for i, para in enumerate(doc.paragraphs): if i % 100 == 0: - logger.info(f"Processing paragraph {i+1}/{para_count}") + logger.info(f"Processing paragraph {i + 1}/{para_count}") if para.text.strip(): text_parts.append(para.text.strip()) para_with_text += 1 @@ -225,7 +199,7 @@ class DocxParser(BaseParser): rows_processed = 0 for i, table in enumerate(doc.tables): if i % 10 == 0: - logger.info(f"Processing table {i+1}/{table_count}") + logger.info(f"Processing table {i + 1}/{table_count}") table_has_content = False for row in table.rows: @@ -256,25 +230,24 @@ class DocxParser(BaseParser): # If the result is still empty, return an error message if not result_text: logger.warning("No text extracted using simplified method") - return "", {} + return DocumentModel() - return result_text, {} + return DocumentModel(content=result_text) except Exception as backup_error: processing_time = time.time() - start_time logger.error( - f"Simplified parsing failed after {processing_time:.2f}s: {str(backup_error)}" + f"Simplified 
parsing failed {processing_time:.2f}s: {backup_error}" ) logger.error(f"Detailed traceback: {traceback.format_exc()}") - return "", {} + return DocumentModel() class Docx: def __init__(self, max_image_size=1920, enable_multimodal=False, upload_file=None): logger.info("Initializing DOCX processor") self.max_image_size = max_image_size # Maximum image size limit - self.picture_cache = ( - {} - ) # Image cache to avoid processing the same image repeatedly + # Image cache to avoid processing the same image repeatedly + self.picture_cache = {} self.enable_multimodal = enable_multimodal self.upload_file = upload_file @@ -454,7 +427,6 @@ class Docx: return page_to_paragraphs - def __call__( self, binary: Optional[bytes] = None, @@ -611,7 +583,6 @@ class Docx: return pages_to_process - def _process_document( self, binary, @@ -806,7 +777,9 @@ class Docx: # Collect temporary image paths for later cleanup for line in page_lines: for image_data in line.images: - if image_data.local_path and image_data.local_path.startswith("/tmp/docx_img_"): + if image_data.local_path and image_data.local_path.startswith( + "/tmp/docx_img_" + ): temp_img_paths.add(image_data.local_path) results.extend(page_lines) @@ -876,7 +849,11 @@ class Docx: # Process all image data objects for image_data in image_paths: - if image_data.local_path and os.path.exists(image_data.local_path) and image_data.local_path not in image_url_map: + if ( + image_data.local_path + and os.path.exists(image_data.local_path) + and image_data.local_path not in image_url_map + ): try: # Upload the image if it doesn't have a URL yet if not image_data.url: @@ -886,12 +863,16 @@ class Docx: image_data.url = image_url # Add image URL as Markdown format markdown_image = f"![]({image_url})" - image_url_map[image_data.local_path] = markdown_image + image_url_map[image_data.local_path] = ( + markdown_image + ) logger.info( f"Added image URL for {image_data.local_path}: {image_url}" ) else: - logger.warning(f"Failed to upload image: {image_data.local_path}") + logger.warning( + f"Failed to upload image: {image_data.local_path}" + ) else: # Already has a URL, use it markdown_image = f"![]({image_data.url})" @@ -925,12 +906,19 @@ class Docx: # For ImageData objects, use the URL if isinstance(content, str) and content in image_url_map: combined_parts.append(image_url_map[content]) - elif hasattr(content, 'local_path') and content.local_path in image_url_map: + elif ( + hasattr(content, "local_path") + and content.local_path in image_url_map + ): combined_parts.append(image_url_map[content.local_path]) # Create the final text with proper ordering final_text = "\n\n".join(part for part in combined_parts if part) - processed_lines.append(LineData(text=final_text, page_num=page_num, images=line_data.images)) + processed_lines.append( + LineData( + text=final_text, page_num=page_num, images=line_data.images + ) + ) else: processed_lines = lines @@ -1003,11 +991,11 @@ class Docx: logger.info(f"Processing {table_count} tables") for tb_idx, tb in enumerate(self.doc.tables): if tb_idx % 10 == 0: # Log only every 10 tables to reduce log volume - logger.info(f"Processing table {tb_idx+1}/{table_count}") + logger.info(f"Processing table {tb_idx + 1}/{table_count}") # Optimize: Check if table is empty if len(tb.rows) == 0 or all(len(r.cells) == 0 for r in tb.rows): - logger.info(f"Skipping empty table {tb_idx+1}") + logger.info(f"Skipping empty table {tb_idx + 1}") continue table_html = self._convert_table_to_html(tb) @@ -1111,8 +1099,8 @@ def _save_image_to_temp(logger, 
image, page_num, img_idx): if not image: return None - import tempfile import os + import tempfile try: # Create a temporary file @@ -1187,8 +1175,15 @@ def process_page_multiprocess( return [] # Extract page content - combined_text, image_objects, content_sequence = _extract_page_content_in_process( - process_logger, doc, page_num, paragraphs, enable_multimodal, max_image_size + combined_text, image_objects, content_sequence = ( + _extract_page_content_in_process( + process_logger, + doc, + page_num, + paragraphs, + enable_multimodal, + max_image_size, + ) ) # Process content sequence to maintain order between processes @@ -1199,7 +1194,9 @@ def process_page_multiprocess( if enable_multimodal: # First pass: save all images to temporary files for i, image_object in enumerate(image_objects): - img_path = _save_image_to_temp(process_logger, image_object, page_num, i) + img_path = _save_image_to_temp( + process_logger, image_object, page_num, i + ) if img_path: # Create ImageData object image_data = ImageData() diff --git a/docreader/parser/image_parser.py b/docreader/parser/image_parser.py index 4ebbcee..5c054bc 100644 --- a/docreader/parser/image_parser.py +++ b/docreader/parser/image_parser.py @@ -1,15 +1,13 @@ +import base64 import logging import os -import asyncio -from PIL import Image -import io -from typing import Dict, Any, Tuple, Union -from .base_parser import BaseParser, ParseResult -import numpy as np + +from docreader.models.document import Document +from docreader.parser.base_parser import BaseParser # Set up logger for this module logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) + class ImageParser(BaseParser): """ @@ -23,46 +21,24 @@ class ImageParser(BaseParser): 4. Returning a combined result with both text and image reference """ - def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]: + def parse_into_text(self, content: bytes) -> Document: """ - Parse image content, upload the image and return Markdown reference along with image map. - - Args: - content: Raw image data (bytes) - - Returns: - Tuple of (markdown_text, image_map) where image_map maps image URLs to PIL Image objects + Parse image content into markdown text + :param content: bytes content of the image + :return: Document object """ logger.info(f"Parsing image content, size: {len(content)} bytes") - image_map = {} - - try: - # Upload image to storage service - logger.info("Uploading image to storage") - _, ext = os.path.splitext(self.file_name) - image_url = self.upload_bytes(content, file_ext=ext) - if not image_url: - logger.error("Failed to upload image to storage") - return "", {} - logger.info( - f"Successfully uploaded image, URL: {image_url[:50]}..." 
- if len(image_url) > 50 - else f"Successfully uploaded image, URL: {image_url}" - ) - # Create image object and add to map - try: - from PIL import Image - import io - image = Image.open(io.BytesIO(content)) - image_map[image_url] = image - logger.info(f"Added image to image_map for URL: {image_url}") - except Exception as img_err: - logger.error(f"Error creating image object: {str(img_err)}") + # Get file extension + ext = os.path.splitext(self.file_name)[1].lower() - markdown_text = f"![{self.file_name}]({image_url})" - return markdown_text, image_map + # Upload image to storage + image_url = self.storage.upload_bytes(content, file_ext=ext) + logger.info(f"Successfully uploaded image, URL: {image_url[:50]}...") - except Exception as e: - logger.error(f"Error parsing image: {str(e)}") - return "", {} + # Generate markdown text + text = f"![{self.file_name}]({image_url})" + images = {image_url: base64.b64encode(content).decode()} + + # Create image object and add to map + return Document(content=text, images=images) diff --git a/docreader/parser/image_utils.py b/docreader/parser/image_utils.py deleted file mode 100644 index 55cb474..0000000 --- a/docreader/parser/image_utils.py +++ /dev/null @@ -1,43 +0,0 @@ -import base64 -import io -import logging -from typing import Union -from PIL import Image -import numpy as np - -logger = logging.getLogger(__name__) - -def image_to_base64(image: Union[str, bytes, Image.Image, np.ndarray]) -> str: - """Convert image to base64 encoded string - - Args: - image: Image file path, bytes, PIL Image object, or numpy array - - Returns: - Base64 encoded image string, or empty string if conversion fails - """ - try: - if isinstance(image, str): - # It's a file path - with open(image, "rb") as image_file: - return base64.b64encode(image_file.read()).decode("utf-8") - elif isinstance(image, bytes): - # It's bytes data - return base64.b64encode(image).decode("utf-8") - elif isinstance(image, Image.Image): - # It's a PIL Image - buffer = io.BytesIO() - image.save(buffer, format="PNG") - return base64.b64encode(buffer.getvalue()).decode("utf-8") - elif isinstance(image, np.ndarray): - # It's a numpy array - pil_image = Image.fromarray(image) - buffer = io.BytesIO() - pil_image.save(buffer, format="PNG") - return base64.b64encode(buffer.getvalue()).decode("utf-8") - else: - logger.error(f"Unsupported image type: {type(image)}") - return "" - except Exception as e: - logger.error(f"Error converting image to base64: {str(e)}") - return "" diff --git a/docreader/parser/markdown_image_util.py b/docreader/parser/markdown_image_util.py new file mode 100644 index 0000000..b748db1 --- /dev/null +++ b/docreader/parser/markdown_image_util.py @@ -0,0 +1,111 @@ +import logging +import re +import uuid +from typing import Dict, List, Match, Optional, Tuple + +from docreader.utils import endecode + +# Get logger object +logger = logging.getLogger(__name__) + + +class MarkdownImageUtil: + def __init__(self): + self.b64_pattern = re.compile( + r"!\[([^\]]*)\]\(data:image/(\w+);base64,([^\)]+)\)" + ) + self.image_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)") + self.replace_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)") + + def extract_image( + self, + content: str, + path_prefix: Optional[str] = None, + replace: bool = True, + ) -> Tuple[str, List[str]]: + """Extract base64 encoded images from Markdown content""" + + # image_path => base64 bytes + images: List[str] = [] + + def repl(match: Match[str]) -> str: + title = match.group(1) + image_path = match.group(2) + if 
path_prefix: + image_path = f"{path_prefix}/{image_path}" + + images.append(image_path) + + if not replace: + return match.group(0) + + # Replace image path with URL + return f"![{title}]({image_path})" + + text = self.image_pattern.sub(repl, content) + logger.debug(f"Extracted {len(images)} images from markdown") + return text, images + + def extract_base64( + self, + content: str, + path_prefix: Optional[str] = None, + replace: bool = True, + ) -> Tuple[str, Dict[str, bytes]]: + """Extract base64 encoded images from Markdown content""" + + # image_path => base64 bytes + images: Dict[str, bytes] = {} + + def repl(match: Match[str]) -> str: + title = match.group(1) + img_ext = match.group(2) + img_b64 = match.group(3) + + image_byte = endecode.encode_image(img_b64, errors="ignore") + if not image_byte: + logger.error(f"Failed to decode base64 image skip it: {img_b64}") + return title + + image_path = f"{uuid.uuid4()}.{img_ext}" + if path_prefix: + image_path = f"{path_prefix}/{image_path}" + images[image_path] = image_byte + + if not replace: + return match.group(0) + + # Replace image path with URL + return f"![{title}]({image_path})" + + text = self.b64_pattern.sub(repl, content) + logger.debug(f"Extracted {len(images)} base64 images from markdown") + return text, images + + def replace_path(self, content: str, images: Dict[str, str]) -> str: + content_replace: set = set() + + def repl(match: Match[str]) -> str: + title = match.group(1) + image_path = match.group(2) + if image_path not in images: + return match.group(0) + + content_replace.add(image_path) + image_path = images[image_path] + return f"![{title}]({image_path})" + + text = self.replace_pattern.sub(repl, content) + logger.debug(f"Replaced {len(content_replace)} images in markdown") + return text + + +if __name__ == "__main__": + your_content = "test![](data:image/png;base64,iVBORw0KGgoAAAA)test" + image_handle = MarkdownImageUtil() + text, images = image_handle.extract_base64(your_content) + print(text) + + for image_url, image_byte in images.items(): + with open(image_url, "wb") as f: + f.write(image_byte) diff --git a/docreader/parser/markdown_parser.py b/docreader/parser/markdown_parser.py index 330d5a0..1758dcd 100644 --- a/docreader/parser/markdown_parser.py +++ b/docreader/parser/markdown_parser.py @@ -1,33 +1,53 @@ -import asyncio -import re +import base64 import logging -import numpy as np -import os # Import os module to get environment variables -from typing import Dict, List, Optional, Tuple, Union, Any -from .base_parser import BaseParser +import os +from typing import Dict + +from docreader.models.document import Document +from docreader.parser.base_parser import BaseParser +from docreader.parser.chain_parser import PipelineParser +from docreader.parser.markdown_image_util import MarkdownImageUtil +from docreader.utils import endecode # Get logger object logger = logging.getLogger(__name__) -class MarkdownParser(BaseParser): - """Markdown document parser""" - - def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]: - """Parse Markdown document, only extract text content, do not process images - - Args: - content: Markdown document content - - Returns: - Parsed text result - """ - logger.info(f"Parsing Markdown document, content size: {len(content)} bytes") +class MarkdownImageBase64(BaseParser): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.image_helper = MarkdownImageUtil() + def parse_into_text(self, content: bytes) -> Document: # Convert byte content to 
string using universal decoding method -        text = self.decode_bytes(content) -        logger.info(f"Decoded Markdown content, text length: {len(text)} characters") +        text = endecode.decode_bytes(content) +        text, img_b64 = self.image_helper.extract_base64(text, path_prefix="images") -        logger.info(f"Markdown parsing complete, extracted {len(text)} characters of text") -        return text +        images: Dict[str, str] = {} +        image_replace: Dict[str, str] = {} +        logger.debug(f"Uploading {len(img_b64)} images from markdown") +        for ipath, b64_bytes in img_b64.items(): +            ext = os.path.splitext(ipath)[1].lower() +            image_url = self.storage.upload_bytes(b64_bytes, ext) + +            image_replace[ipath] = image_url +            images[image_url] = base64.b64encode(b64_bytes).decode() + +        text = self.image_helper.replace_path(text, image_replace) +        return Document(content=text, images=images) + + +class MarkdownParser(PipelineParser): +    _parser_cls = (MarkdownImageBase64,) + + +if __name__ == "__main__": +    logging.basicConfig(level=logging.DEBUG) + +    your_content = "test![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMgA)test" +    parser = MarkdownParser() + +    document = parser.parse_into_text(your_content.encode()) +    logger.info(document.content) +    logger.info(f"Images: {len(document.images)}, name: {document.images.keys()}") diff --git a/docreader/parser/markitdown_parser.py b/docreader/parser/markitdown_parser.py new file mode 100644 index 0000000..c067cd7 --- /dev/null +++ b/docreader/parser/markitdown_parser.py @@ -0,0 +1,31 @@ +import io +import logging + +from markitdown import MarkItDown + +from docreader.models.document import Document +from docreader.parser.base_parser import BaseParser +from docreader.parser.chain_parser import PipelineParser +from docreader.parser.markdown_parser import MarkdownParser + +logger = logging.getLogger(__name__) + + +class StdMarkitdownParser(BaseParser): +    """ +    Markitdown document parser + +    This parser converts documents (e.g. DOCX, PDF) to Markdown text content. +    It uses the markitdown library for simple text extraction. 
+    """ + +    def __init__(self, *args, **kwargs): +        self.markitdown = MarkItDown() + +    def parse_into_text(self, content: bytes) -> Document: +        result = self.markitdown.convert(io.BytesIO(content), keep_data_uris=True) +        return Document(content=result.text_content) + + +class MarkitdownParser(PipelineParser): +    _parser_cls = (StdMarkitdownParser, MarkdownParser) diff --git a/docreader/parser/mineru_parser.py b/docreader/parser/mineru_parser.py new file mode 100644 index 0000000..1e182de --- /dev/null +++ b/docreader/parser/mineru_parser.py @@ -0,0 +1,124 @@ +import logging +import os +import re +from typing import Dict + +import markdownify +import requests + +from docreader.models.document import Document +from docreader.parser.base_parser import BaseParser +from docreader.parser.markdown_parser import MarkdownImageUtil +from docreader.utils import endecode + +logger = logging.getLogger(__name__) + + +class MinerUParser(BaseParser): +    def __init__( +        self, +        enable_markdownify: bool = True, +        mineru_endpoint: str = "", +        **kwargs, +    ): +        super().__init__(**kwargs) +        self.minerU = os.getenv("MINERU_ENDPOINT", mineru_endpoint) +        self.enable_markdownify = enable_markdownify +        self.image_helper = MarkdownImageUtil() +        self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)") +        self.enable = self.ping() +        assert self.enable, "MinerU API is not reachable" + +    def ping(self, timeout: int = 5) -> bool: +        try: +            response = requests.get( +                self.minerU + "/docs", timeout=timeout, allow_redirects=True +            ) +            response.raise_for_status() +            return True +        except Exception: +            return False + +    def parse_into_text(self, content: bytes) -> Document: +        logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)") +        md_content: str = "" +        images_b64: Dict[str, str] = {} +        try: +            response = requests.post( +                url=self.minerU + "/file_parse", +                data={ +                    "return_md": True, +                    "return_images": True, +                    "lang_list": ["ch", "en"], +                    "table_enable": True, +                    "formula_enable": True, +                    "parse_method": "auto", +                    "start_page_id": 0, +                    "end_page_id": 99999, +                    "backend": "pipeline", +                    "response_format_zip": False, +                    "return_middle_json": False, +                    "return_model_output": False, +                    "return_content_list": False, +                }, +                files={"files": content}, +                timeout=1000, +            ) +            response.raise_for_status() +            result = response.json()["results"]["files"] +            md_content = result["md_content"] +            images_b64 = result.get("images", {}) +        except Exception as e: +            logger.error(f"MinerU parsing failed: {e}", exc_info=True) +            return Document() + +        # convert table(HTML) in markdown to markdown table +        if self.enable_markdownify: +            logger.debug("Converting HTML to Markdown") +            md_content = markdownify.markdownify(md_content) + +        images = {} +        image_replace = {} +        # image in images_b64 may not be used in md_content +        # such as: table ... 
+ # so we need to filter them + for ipath, b64_str in images_b64.items(): + if f"images/{ipath}" not in md_content: + logger.debug(f"Image {ipath} not used in markdown") + continue + match = self.base64_pattern.match(b64_str) + if match: + file_ext = match.group(1) + b64_str = match.group(2) + + image_bytes = endecode.encode_image(b64_str, errors="ignore") + if not image_bytes: + logger.error("Failed to decode base64 image skip it") + continue + + image_url = self.storage.upload_bytes( + image_bytes, file_ext=f".{file_ext}" + ) + + images[image_url] = b64_str + image_replace[f"images/{ipath}"] = image_url + + logger.info(f"Replaced {len(image_replace)} images in markdown") + text = self.image_helper.replace_path(md_content, image_replace) + + logger.info( + f"Successfully parsed PDF, text: {len(text)}, images: {len(images)}" + ) + return Document(content=text, images=images) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) + + your_file = "/path/to/your/file.pdf" + your_mineru = "http://host.docker.internal:9987" + parser = MinerUParser(mineru_endpoint=your_mineru) + with open(your_file, "rb") as f: + content = f.read() + document = parser.parse_into_text(content) + logger.error(document.content) diff --git a/docreader/parser/ocr_engine.py b/docreader/parser/ocr_engine.py index 13c3e88..0a999b9 100644 --- a/docreader/parser/ocr_engine.py +++ b/docreader/parser/ocr_engine.py @@ -1,71 +1,96 @@ -import os -import logging -import base64 -from typing import Optional, Union, Dict, Any -from abc import ABC, abstractmethod -from PIL import Image import io +import logging +import os +import platform +import subprocess +from abc import ABC, abstractmethod +from typing import Dict, Union + import numpy as np -from .image_utils import image_to_base64 +from openai import OpenAI +from PIL import Image + +from docreader.utils import endecode logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) + class OCRBackend(ABC): """Base class for OCR backends""" - + @abstractmethod def predict(self, image: Union[str, bytes, Image.Image]) -> str: """Extract text from an image - + Args: image: Image file path, bytes, or PIL Image object - + Returns: Extracted text """ pass + +class DummyOCRBackend(OCRBackend): + """Dummy OCR backend implementation""" + + def predict(self, image: Union[str, bytes, Image.Image]) -> str: + logger.warning("Dummy OCR backend is used") + return "" + + class PaddleOCRBackend(OCRBackend): """PaddleOCR backend implementation""" - - def __init__(self, **kwargs): + + def __init__(self): """Initialize PaddleOCR backend""" self.ocr = None try: - import os import paddle - + # Set PaddlePaddle to use CPU and disable GPU - os.environ['CUDA_VISIBLE_DEVICES'] = '' - paddle.set_device('cpu') - + os.environ["CUDA_VISIBLE_DEVICES"] = "" + paddle.device.set_device("cpu") + # 尝试检测CPU是否支持AVX指令集 try: - import subprocess - import platform - # 检测CPU是否支持AVX if platform.system() == "Linux": try: - result = subprocess.run(['grep', '-o', 'avx', '/proc/cpuinfo'], - capture_output=True, text=True, timeout=5) - has_avx = 'avx' in result.stdout.lower() + result = subprocess.run( + ["grep", "-o", "avx", "/proc/cpuinfo"], + capture_output=True, + text=True, + timeout=5, + ) + has_avx = "avx" in result.stdout.lower() if not has_avx: - logger.warning("CPU does not support AVX instructions, using compatibility mode") + logger.warning( + "CPU does not support AVX instructions, " + "using compatibility mode" + ) # 进一步限制指令集使用 - os.environ['FLAGS_use_avx2'] = '0' - 
os.environ['FLAGS_use_avx'] = '1' - except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError): - logger.warning("Could not detect AVX support, using compatibility mode") - os.environ['FLAGS_use_avx2'] = '0' - os.environ['FLAGS_use_avx'] = '1' + os.environ["FLAGS_use_avx2"] = "0" + os.environ["FLAGS_use_avx"] = "1" + except ( + subprocess.TimeoutExpired, + FileNotFoundError, + subprocess.SubprocessError, + ): + logger.warning( + "Could not detect AVX support, using compatibility mode" + ) + os.environ["FLAGS_use_avx2"] = "0" + os.environ["FLAGS_use_avx"] = "1" except Exception as e: - logger.warning(f"Error detecting CPU capabilities: {e}, using compatibility mode") - os.environ['FLAGS_use_avx2'] = '0' - os.environ['FLAGS_use_avx'] = '1' - + logger.warning( + f"Error detecting CPU capabilities: {e}, using compatibility mode" + ) + os.environ["FLAGS_use_avx2"] = "0" + os.environ["FLAGS_use_avx"] = "1" + from paddleocr import PaddleOCR + # OCR configuration with text orientation classification enabled ocr_config = { "use_gpu": False, @@ -86,23 +111,53 @@ class PaddleOCRBackend(OCRBackend): "use_dilation": True, # improves accuracy "det_db_score_mode": "slow", # improves accuracy } - + self.ocr = PaddleOCR(**ocr_config) logger.info("PaddleOCR engine initialized successfully") - + except ImportError as e: - logger.error(f"Failed to import paddleocr: {str(e)}. Please install it with 'pip install paddleocr'") + logger.error( + f"Failed to import paddleocr: {str(e)}. " + "Please install it with 'pip install paddleocr'" + ) except OSError as e: if "Illegal instruction" in str(e) or "core dumped" in str(e): - logger.error(f"PaddlePaddle crashed due to CPU instruction set incompatibility: {str(e)}") - logger.error("This usually happens when the CPU doesn't support AVX instructions.") - logger.error("Please try installing a CPU-only version of PaddlePaddle or use a different OCR backend.") + logger.error( + f"PaddlePaddle crashed due to CPU instruction set incompatibility:" + f"{e}" + ) + logger.error( + "This happens when the CPU doesn't support AVX instructions. " + "Try install CPU-only version of PaddlePaddle, " + "or use a different OCR backend." 
+ ) else: - logger.error(f"Failed to initialize PaddleOCR due to OS error: {str(e)}") + logger.error( + f"Failed to initialize PaddleOCR due to OS error: {str(e)}" + ) except Exception as e: logger.error(f"Failed to initialize PaddleOCR: {str(e)}") - - def predict(self, image): + + def predict(self, image: Union[str, bytes, Image.Image]) -> str: + """Extract text from an image + + Args: + image: Image file path, bytes, or PIL Image object + + Returns: + Extracted text + """ + if isinstance(image, str): + image = Image.open(image) + elif isinstance(image, bytes): + image = Image.open(io.BytesIO(image)) + + if not isinstance(image, Image.Image): + raise TypeError("image must be a string, bytes, or PIL Image object") + + return self._predict(image) + + def _predict(self, image: Image.Image) -> str: """Perform OCR recognition on the image Args: @@ -111,63 +166,59 @@ class PaddleOCRBackend(OCRBackend): Returns: Extracted text string """ + if self.ocr is None: + logger.error("PaddleOCR engine not initialized") + return "" try: # Ensure image is in RGB format - if hasattr(image, "convert") and image.mode != "RGB": + if image.mode != "RGB": image = image.convert("RGB") # Convert to numpy array if needed - if hasattr(image, "convert"): - image_array = np.array(image) - else: - image_array = image + image_array = np.array(image) # Perform OCR ocr_result = self.ocr.ocr(image_array, cls=False) - + # Extract text ocr_text = "" if ocr_result and ocr_result[0]: - for line in ocr_result[0]: - if line and len(line) >= 2: - text = line[1][0] if line[1] else "" - if text: - ocr_text += text + " " - - text_length = len(ocr_text.strip()) - if text_length > 0: - logger.info(f"OCR extracted {text_length} characters") - return ocr_text.strip() - else: - logger.warning("OCR returned empty result") - return "" - + text = [ + line[1][0] if line and len(line) >= 2 and line[1] else "" + for line in ocr_result[0] + ] + text = [t.strip() for t in text if t] + ocr_text = " ".join(text) + + logger.info(f"OCR extracted {len(ocr_text)} characters") + return ocr_text + except Exception as e: logger.error(f"OCR recognition error: {str(e)}") return "" - + + class NanonetsOCRBackend(OCRBackend): """Nanonets OCR backend implementation using OpenAI API format""" - - def __init__(self, **kwargs): + + def __init__(self): """Initialize Nanonets OCR backend - + Args: api_key: API key for OpenAI API base_url: Base URL for OpenAI API model: Model name """ - try: - from openai import OpenAI - self.api_key = kwargs.get("api_key", "123") - self.base_url = kwargs.get("base_url", "http://localhost:8000/v1") - self.model = kwargs.get("model", "nanonets/Nanonets-OCR-s") - self.temperature = kwargs.get("temperature", 0.0) - self.max_tokens = kwargs.get("max_tokens", 15000) - - self.client = OpenAI(api_key=self.api_key, base_url=self.base_url) - self.prompt = """ -## 任务说明 + base_url = os.getenv("OCR_API_BASE_URL", "http://localhost:8000/v1") + api_key = os.getenv("OCR_API_KEY", "123") + timeout = 30 + self.client = OpenAI(api_key=api_key, base_url=base_url, timeout=timeout) + + self.model = os.getenv("OCR_MODEL", "nanonets/Nanonets-OCR-s") + logger.info(f"Nanonets OCR engine initialized with model: {self.model}") + self.temperature = 0.0 + self.max_tokens = 15000 + self.prompt = """## 任务说明 请从上传的文档中提取文字内容,严格按自然阅读顺序(从上到下,从左到右)输出,并遵循以下格式规范。 @@ -192,33 +243,26 @@ class NanonetsOCRBackend(OCRBackend): * 不要猜测或补全不确定的链接地址。 """ - logger.info(f"Nanonets OCR engine initialized with model: {self.model}") - except ImportError: - logger.error("Failed to 
import openai. Please install it with 'pip install openai'") - self.client = None - except Exception as e: - logger.error(f"Failed to initialize Nanonets OCR: {str(e)}") - self.client = None - + def predict(self, image: Union[str, bytes, Image.Image]) -> str: """Extract text from an image using Nanonets OCR - + Args: image: Image file path, bytes, or PIL Image object - + Returns: Extracted text """ if self.client is None: logger.error("Nanonets OCR client not initialized") return "" - + try: # Encode image to base64 - img_base64 = image_to_base64(image) + img_base64 = endecode.decode_image(image) if not img_base64: return "" - + # Call Nanonets OCR API logger.info(f"Calling Nanonets OCR API with model: {self.model}") response = self.client.chat.completions.create( @@ -229,7 +273,9 @@ class NanonetsOCRBackend(OCRBackend): "content": [ { "type": "image_url", - "image_url": {"url": f"data:image/png;base64,{img_base64}"}, + "image_url": { + "url": f"data:image/png;base64,{img_base64}" + }, }, { "type": "text", @@ -239,40 +285,43 @@ class NanonetsOCRBackend(OCRBackend): } ], temperature=self.temperature, - max_tokens=self.max_tokens + max_tokens=self.max_tokens, ) - - return response.choices[0].message.content + return response.choices[0].message.content or "" except Exception as e: logger.error(f"Nanonets OCR prediction error: {str(e)}") return "" + class OCREngine: """OCR Engine factory class""" - - _instance = None - + + _instance: Dict[str, OCRBackend] = {} + @classmethod - def get_instance(cls, backend_type="paddle", **kwargs) -> Optional[OCRBackend]: + def get_instance(cls, backend_type: str) -> OCRBackend: """Get OCR engine instance - + Args: backend_type: OCR backend type, one of: "paddle", "nanonets" **kwargs: Additional arguments for the backend - + Returns: OCR engine instance or None if initialization fails """ - if cls._instance is None: - logger.info(f"Initializing OCR engine with backend: {backend_type}") - - if backend_type.lower() == "paddle": - cls._instance = PaddleOCRBackend(**kwargs) - elif backend_type.lower() == "nanonets": - cls._instance = NanonetsOCRBackend(**kwargs) - else: - logger.error(f"Unknown OCR backend type: {backend_type}") - return None - - return cls._instance - + backend_type = backend_type.lower() + if cls._instance.get(backend_type): + return cls._instance[backend_type] + + logger.info(f"Initializing OCR engine with backend: {backend_type}") + + if backend_type == "paddle": + cls._instance[backend_type] = PaddleOCRBackend() + + elif backend_type == "nanonets": + cls._instance[backend_type] = NanonetsOCRBackend() + + else: + cls._instance[backend_type] = DummyOCRBackend() + + return cls._instance[backend_type] diff --git a/docreader/parser/parser.py b/docreader/parser/parser.py index 8e1668d..b53448d 100644 --- a/docreader/parser/parser.py +++ b/docreader/parser/parser.py @@ -1,30 +1,19 @@ import logging -from dataclasses import dataclass, field -from typing import Dict, Any, Optional, Type +from typing import Dict, Type -from .base_parser import BaseParser, ParseResult -from .docx_parser import DocxParser -from .doc_parser import DocParser -from .pdf_parser import PDFParser -from .markdown_parser import MarkdownParser -from .text_parser import TextParser -from .image_parser import ImageParser -from .web_parser import WebParser -from .config import ChunkingConfig -import traceback +from docreader.models.document import Document +from docreader.models.read_config import ChunkingConfig +from docreader.parser.base_parser import BaseParser +from 
docreader.parser.doc_parser import DocParser +from docreader.parser.docx2_parser import Docx2Parser +from docreader.parser.image_parser import ImageParser +from docreader.parser.markdown_parser import MarkdownParser +from docreader.parser.pdf_parser import PDFParser +from docreader.parser.text_parser import TextParser +from docreader.parser.web_parser import WebParser logger = logging.getLogger(__name__) -@dataclass -class Chunk: - """ - Represents a single text chunk with associated metadata. - Basic unit for document processing and embedding. - """ - - content: str # Text content of the chunk - metadata: Dict[str, Any] = None # Associated metadata (source, page number, etc.) - class Parser: """ @@ -33,10 +22,9 @@ class Parser: """ def __init__(self): - logger.info("Initializing document parser") # Initialize all parser types self.parsers: Dict[str, Type[BaseParser]] = { - "docx": DocxParser, + "docx": Docx2Parser, "doc": DocParser, "pdf": PDFParser, "md": MarkdownParser, @@ -56,8 +44,7 @@ class Parser: ", ".join(self.parsers.keys()), ) - - def get_parser(self, file_type: str) -> Optional[Type[BaseParser]]: + def get_parser(self, file_type: str) -> Type[BaseParser]: """ Get parser class for the specified file type. @@ -67,12 +54,9 @@ class Parser: Returns: Parser class for the file type, or None if unsupported """ - file_type = file_type.lower() - parser = self.parsers.get(file_type) - if parser: - logger.info(f"Found parser for file type: {file_type}") - else: - logger.warning(f"No parser found for file type: {file_type}") + parser = self.parsers.get(file_type.lower()) + if not parser: + raise ValueError(f"Unsupported file type: {file_type}") return parser def parse_file( @@ -81,7 +65,7 @@ class Parser: file_type: str, content: bytes, config: ChunkingConfig, - ) -> Optional[ParseResult]: + ) -> Document: """ Parse file content using appropriate parser based on file type. 
@@ -96,60 +80,41 @@ class Parser: """ logger.info(f"Parsing file: {file_name} with type: {file_type}") logger.info( - f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, " + f"Chunking config: size={config.chunk_size}, " + f"overlap={config.chunk_overlap}, " f"multimodal={config.enable_multimodal}" ) - - parser_instance = None - - try: - # Get appropriate parser for file type - cls = self.get_parser(file_type) - if cls is None: - logger.error(f"Unsupported file type: {file_type}") - return None - # Parse file content - logger.info(f"Creating parser instance for {file_type} file") - parser_instance = cls( - file_name=file_name, - file_type=file_type, - chunk_size=config.chunk_size, - chunk_overlap=config.chunk_overlap, - separators=config.separators, - enable_multimodal=config.enable_multimodal, - max_image_size=1920, # Limit image size to 1920px - max_concurrent_tasks=5, # Limit concurrent tasks to 5 - chunking_config=config, # Pass the entire chunking config - ) + # Get appropriate parser for file type + cls = self.get_parser(file_type) - logger.info(f"Starting to parse file content, size: {len(content)} bytes") - result = parser_instance.parse(content) + # Parse file content + logger.info(f"Creating parser instance for {file_type} file") + parser = cls( + file_name=file_name, + file_type=file_type, + chunk_size=config.chunk_size, + chunk_overlap=config.chunk_overlap, + separators=config.separators, + enable_multimodal=config.enable_multimodal, + max_image_size=1920, # Limit image size to 1920px + max_concurrent_tasks=5, # Limit concurrent tasks to 5 + chunking_config=config, # Pass the entire chunking config + ) - if result: - logger.info( - f"Successfully parsed file {file_name}, generated {len(result.chunks)} chunks" - ) - if result.chunks and len(result.chunks) > 0: - logger.info( - f"First chunk content length: {len(result.chunks[0].content)}" - ) - else: - logger.warning(f"Parser returned empty chunks for file: {file_name}") - else: - logger.warning(f"Parser returned None result for file: {file_name}") + logger.info(f"Starting to parse file content, size: {len(content)} bytes") + result = parser.parse(content) - # Return parse results - return result + if not result.content: + logger.warning(f"Parser returned empty content for file: {file_name}") + elif not result.chunks: + logger.warning(f"Parser returned empty chunks for file: {file_name}") + elif result.chunks[0]: + logger.info(f"First chunk content length: {len(result.chunks[0].content)}") + logger.info(f"Parsed file {file_name}, with {len(result.chunks)} chunks") + return result - except Exception as e: - logger.error(f"Error parsing file {file_name}: {str(e)}") - logger.info(f"Detailed traceback: {traceback.format_exc()}") - return None - - def parse_url( - self, url: str, title: str, config: ChunkingConfig - ) -> Optional[ParseResult]: + def parse_url(self, url: str, title: str, config: ChunkingConfig) -> Document: """ Parse content from a URL using the WebParser. 
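Because `parse_file` no longer swallows exceptions or returns `None`, error handling moves to the caller. A hedged usage sketch, assuming `ChunkingConfig` accepts the fields logged above as keyword arguments (only chunk_size, chunk_overlap, separators and enable_multimodal appear in this patch):

```python
# Hedged sketch: ChunkingConfig field names are taken from the logging above;
# any parser-level error now propagates instead of being converted to None.
from docreader.models.read_config import ChunkingConfig
from docreader.parser.parser import Parser

config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=100,
    separators=["\n\n", "\n"],
    enable_multimodal=False,
)
try:
    doc = Parser().parse_file("demo.md", "md", b"# Title\n\nbody", config)
    print(len(doc.chunks), "chunks")   # Document exposes .content and .chunks
except ValueError as exc:              # unsupported file type
    print(f"Cannot parse: {exc}")
```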
@@ -163,44 +128,31 @@ class Parser: """ logger.info(f"Parsing URL: {url}, title: {title}") logger.info( - f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, " - f"multimodal={config.enable_multimodal}" + f"Chunking config: size={config.chunk_size}, " + f"overlap={config.chunk_overlap}, multimodal={config.enable_multimodal}" ) - - parser_instance = None - try: - # Create web parser instance - logger.info("Creating WebParser instance") - parser_instance = WebParser( - title=title, - chunk_size=config.chunk_size, - chunk_overlap=config.chunk_overlap, - separators=config.separators, - enable_multimodal=config.enable_multimodal, - max_image_size=1920, # Limit image size - max_concurrent_tasks=5, # Limit concurrent tasks - chunking_config=config, - ) + # Create web parser instance + logger.info("Creating WebParser instance") + parser = WebParser( + title=title, + chunk_size=config.chunk_size, + chunk_overlap=config.chunk_overlap, + separators=config.separators, + enable_multimodal=config.enable_multimodal, + max_image_size=1920, # Limit image size + max_concurrent_tasks=5, # Limit concurrent tasks + chunking_config=config, + ) - logger.info(f"Starting to parse URL content") - result = parser_instance.parse(url) - - if result: - logger.info( - f"Successfully parsed URL, generated {len(result.chunks)} chunks" - ) - logger.info( - f"First chunk content length: {len(result.chunks[0].content) if result.chunks else 0}" - ) - else: - logger.warning(f"Parser returned empty result for URL: {url}") - - # Return parse results - return result - - except Exception as e: - logger.error(f"Error parsing URL {url}: {str(e)}") - logger.info(f"Detailed traceback: {traceback.format_exc()}") - return None + logger.info("Starting to parse URL content") + result = parser.parse(url.encode()) + if not result.content: + logger.warning(f"Parser returned empty content for url: {url}") + elif not result.chunks: + logger.warning(f"Parser returned empty chunks for url: {url}") + elif result.chunks[0]: + logger.info(f"First chunk content length: {len(result.chunks[0].content)}") + logger.info(f"Parsed url {url}, with {len(result.chunks)} chunks") + return result diff --git a/docreader/parser/pdf_parser.py b/docreader/parser/pdf_parser.py index 94d9f9a..c17184d 100644 --- a/docreader/parser/pdf_parser.py +++ b/docreader/parser/pdf_parser.py @@ -1,113 +1,7 @@ -import logging -import os -import io -from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union +from docreader.parser.chain_parser import FirstParser +from docreader.parser.markitdown_parser import MarkitdownParser +from docreader.parser.mineru_parser import MinerUParser -import pdfplumber -import tempfile -from .base_parser import BaseParser -logger = logging.getLogger(__name__) - -class PDFParser(BaseParser): - """ - PDF Document Parser - - This parser handles PDF documents by extracting text content. - It uses the pypdf library for simple text extraction. - """ - def _convert_table_to_markdown(self, table_data: list) -> str: - - if not table_data or not table_data[0]: return "" - def clean_cell(cell): - if cell is None: return "" - return str(cell).replace("\n", "
") - try: - markdown = "" - header = [clean_cell(cell) for cell in table_data[0]] - markdown += "| " + " | ".join(header) + " |\n" - markdown += "| " + " | ".join(["---"] * len(header)) + " |\n" - for row in table_data[1:]: - if not row: continue - body_row = [clean_cell(cell) for cell in row] - if len(body_row) != len(header): - logger.warning(f"Skipping malformed table row: {body_row}") - continue - markdown += "| " + " | ".join(body_row) + " |\n" - return markdown - except Exception as e: - logger.error(f"Error converting table to markdown: {e}") - return "" - def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]: - - logger.info(f"Parsing PDF with pdfplumber, content size: {len(content)} bytes") - - all_page_content = [] - - - temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") - temp_pdf_path = temp_pdf.name - - try: - temp_pdf.write(content) - temp_pdf.close() - logger.info(f"PDF content written to temporary file: {temp_pdf_path}") - - with pdfplumber.open(temp_pdf_path) as pdf: - logger.info(f"PDF has {len(pdf.pages)} pages") - - for page_num, page in enumerate(pdf.pages): - page_content_parts = [] - - # Try-fallback strategy for table detection - default_settings = { "vertical_strategy": "lines", "horizontal_strategy": "lines" } - found_tables = page.find_tables(default_settings) - if not found_tables: - logger.info(f"Page {page_num+1}: Default strategy found no tables. Trying fallback strategy.") - fallback_settings = { "vertical_strategy": "text", "horizontal_strategy": "lines" } - found_tables = page.find_tables(fallback_settings) - - table_bboxes = [table.bbox for table in found_tables] - # Define a filter function that keeps objects NOT inside any table bbox. - def not_within_bboxes(obj): - """Check if an object is outside all table bounding boxes.""" - for bbox in table_bboxes: - # Check if the object's vertical center is within a bbox - if bbox[1] <= (obj["top"] + obj["bottom"]) / 2 <= bbox[3]: - return False # It's inside a table, so we DON'T keep it - return True # It's outside all tables, so we DO keep it - - # that contains only the non-table text. - non_table_page = page.filter(not_within_bboxes) - - # Now, extract text from this filtered page view. - text = non_table_page.extract_text(x_tolerance=2) - if text: - page_content_parts.append(text) - - # Process and append the structured Markdown tables - if found_tables: - logger.info(f"Found {len(found_tables)} tables on page {page_num + 1}") - for table in found_tables: - markdown_table = self._convert_table_to_markdown(table.extract()) - page_content_parts.append(f"\n\n{markdown_table}\n\n") - - - all_page_content.append("".join(page_content_parts)) - - final_text = "\n\n--- Page Break ---\n\n".join(all_page_content) - logger.info(f"PDF parsing complete. Extracted {len(final_text)} text chars.") - - return final_text - - except Exception as e: - logger.error(f"Failed to parse PDF document: {str(e)}") - return "" - finally: - # This block is GUARANTEED to execute, preventing resource leaks. 
- if os.path.exists(temp_pdf_path): - try: - os.remove(temp_pdf_path) - logger.info(f"Temporary file cleaned up: {temp_pdf_path}") - except OSError as e: - logger.error(f"Error removing temporary file {temp_pdf_path}: {e}") +class PDFParser(FirstParser): + _parser_cls = (MinerUParser, MarkitdownParser) diff --git a/docreader/parser/storage.py b/docreader/parser/storage.py index 33cb9a2..767ab0a 100644 --- a/docreader/parser/storage.py +++ b/docreader/parser/storage.py @@ -1,64 +1,68 @@ # -*- coding: utf-8 -*- -import os -import uuid -import logging import io +import logging +import os import traceback +import uuid from abc import ABC, abstractmethod -from typing import Tuple, Optional +from typing import Dict -from qcloud_cos import CosConfig, CosS3Client from minio import Minio +from qcloud_cos import CosConfig, CosS3Client + +from docreader.utils import endecode logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) class Storage(ABC): """Abstract base class for object storage operations""" - + @abstractmethod def upload_file(self, file_path: str) -> str: """Upload file to object storage - + Args: file_path: File path - - Returns: - File URL - """ - pass - - @abstractmethod - def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str: - """Upload bytes to object storage - - Args: - content: Byte content to upload - file_ext: File extension - + Returns: File URL """ pass - + @abstractmethod + def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str: + """Upload bytes to object storage + + Args: + content: Byte content to upload + file_ext: File extension + + Returns: + File URL + """ + pass + + class CosStorage(Storage): """Tencent Cloud COS storage implementation""" - + def __init__(self, storage_config=None): """Initialize COS storage - + Args: storage_config: Storage configuration """ self.storage_config = storage_config - self.client, self.bucket_name, self.region, self.prefix = self._init_cos_client() - + self.client, self.bucket_name, self.region, self.prefix = ( + self._init_cos_client() + ) + def _init_cos_client(self): """Initialize Tencent Cloud COS client""" try: - # Use provided COS config if available, otherwise fall back to environment variables + # Use provided COS config if available, + # otherwise fall back to environment variables if self.storage_config and self.storage_config.get("access_key_id") != "": cos_config = self.storage_config secret_id = cos_config.get("access_key_id") @@ -75,15 +79,16 @@ class CosStorage(Storage): bucket_name = os.getenv("COS_BUCKET_NAME") appid = os.getenv("COS_APP_ID") prefix = os.getenv("COS_PATH_PREFIX") - + enable_old_domain = ( os.getenv("COS_ENABLE_OLD_DOMAIN", "true").lower() == "true" ) if not all([secret_id, secret_key, region, bucket_name, appid]): logger.error( - "Incomplete COS configuration, missing required environment variables" - f"secret_id: {secret_id}, secret_key: {secret_key}, region: {region}, bucket_name: {bucket_name}, appid: {appid}" + "Incomplete COS configuration, missing environment variables" + f"secret_id: {secret_id}, secret_key: {secret_key}, " + f"region: {region}, bucket_name: {bucket_name}, appid: {appid}" ) return None, None, None, None @@ -105,27 +110,26 @@ class CosStorage(Storage): except Exception as e: logger.error(f"Failed to initialize COS client: {str(e)}") return None, None, None, None - + def _get_download_url(self, bucket_name, region, object_key): """Generate COS object URL - + Args: bucket_name: Bucket name region: Region object_key: Object key - + Returns: 
File URL """ return f"https://{bucket_name}.cos.{region}.myqcloud.com/{object_key}" - - + def upload_file(self, file_path: str) -> str: """Upload file to Tencent Cloud COS - + Args: file_path: File path - + Returns: File URL """ @@ -135,16 +139,16 @@ class CosStorage(Storage): return "" # Generate object key, use UUID to avoid conflicts - file_name = os.path.basename(file_path) - object_key = ( - f"{self.prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" - ) + file_ext = os.path.splitext(file_path)[1] + object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}" logger.info(f"Generated object key: {object_key}") # Upload file logger.info("Attempting to upload file to COS") - response = self.client.upload_file( - Bucket=self.bucket_name, LocalFilePath=file_path, Key=object_key + self.client.upload_file( + Bucket=self.bucket_name, + LocalFilePath=file_path, + Key=object_key, ) # Get file URL @@ -156,14 +160,14 @@ class CosStorage(Storage): except Exception as e: logger.error(f"Failed to upload file to COS: {str(e)}") return "" - + def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str: """Upload bytes to Tencent Cloud COS - + Args: content: Byte content to upload file_ext: File extension - + Returns: File URL """ @@ -171,10 +175,16 @@ class CosStorage(Storage): logger.info(f"Uploading bytes content to COS, size: {len(content)} bytes") if not self.client: return "" - - object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.prefix else f"images/{uuid.uuid4().hex}{file_ext}" + + object_key = ( + f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}" + if self.prefix + else f"images/{uuid.uuid4().hex}{file_ext}" + ) logger.info(f"Generated object key: {object_key}") - self.client.put_object(Bucket=self.bucket_name, Body=content, Key=object_key) + self.client.put_object( + Bucket=self.bucket_name, Body=content, Key=object_key + ) file_url = self._get_download_url(self.bucket_name, self.region, object_key) logger.info(f"Successfully uploaded bytes to COS: {file_url}") return file_url @@ -186,16 +196,18 @@ class CosStorage(Storage): class MinioStorage(Storage): """MinIO storage implementation""" - + def __init__(self, storage_config=None): """Initialize MinIO storage - + Args: storage_config: Storage configuration """ self.storage_config = storage_config - self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = self._init_minio_client() - + self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = ( + self._init_minio_client() + ) + def _init_minio_client(self): """Initialize MinIO client from environment variables or injected config. @@ -203,58 +215,69 @@ class MinioStorage(Storage): prefer those values to override envs. 
""" try: - endpoint = os.getenv("MINIO_ENDPOINT") + endpoint = os.getenv("MINIO_ENDPOINT", "") use_ssl = os.getenv("MINIO_USE_SSL", "false").lower() == "true" if self.storage_config and self.storage_config.get("bucket_name"): storage_config = self.storage_config - bucket_name = storage_config.get("bucket_name") + bucket_name = storage_config.get("bucket_name", "") path_prefix = storage_config.get("path_prefix").strip().strip("/") access_key = storage_config.get("access_key_id") secret_key = storage_config.get("secret_access_key") else: access_key = os.getenv("MINIO_ACCESS_KEY_ID") secret_key = os.getenv("MINIO_SECRET_ACCESS_KEY") - bucket_name = os.getenv("MINIO_BUCKET_NAME") + bucket_name = os.getenv("MINIO_BUCKET_NAME", "") path_prefix = os.getenv("MINIO_PATH_PREFIX", "").strip().strip("/") if not all([endpoint, access_key, secret_key, bucket_name]): - logger.error("Incomplete MinIO configuration, missing required environment variables") + logger.error( + "Incomplete MinIO configuration, missing environment variables" + ) return None, None, None, None, None # Initialize client - client = Minio(endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl) + client = Minio( + endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl + ) # Ensure bucket exists found = client.bucket_exists(bucket_name) if not found: client.make_bucket(bucket_name) - policy = '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}' % (bucket_name, bucket_name) + policy = ( + '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}' + % (bucket_name, bucket_name) + ) client.set_bucket_policy(bucket_name, policy) return client, bucket_name, use_ssl, endpoint, path_prefix except Exception as e: logger.error(f"Failed to initialize MinIO client: {str(e)}") return None, None, None, None, None - - def _get_download_url(self, bucket_name: str, object_key: str, use_ssl: bool, endpoint: str, public_endpoint: str = None): + + def _get_download_url(self, object_key: str): """Construct a public URL for MinIO object. If MINIO_PUBLIC_ENDPOINT is provided, use it; otherwise fallback to endpoint. """ - if public_endpoint: - base = public_endpoint - else: - scheme = "https" if use_ssl else "http" - base = f"{scheme}://{endpoint}" - # Path-style URL for MinIO - return f"{base}/{bucket_name}/{object_key}" - + # 1. Use public endpoint if provided + endpoint = os.getenv("MINIO_PUBLIC_ENDPOINT") + if endpoint: + return f"{endpoint}/{self.bucket_name}/{object_key}" + + # 2. Use SSL if enabled + if self.use_ssl: + return f"https://{self.endpoint}/{self.bucket_name}/{object_key}" + + # 3. 
Use HTTP default + return f"http://{self.endpoint}/{self.bucket_name}/{object_key}" + def upload_file(self, file_path: str) -> str: """Upload file to MinIO - + Args: file_path: File path - + Returns: File URL """ @@ -265,29 +288,27 @@ class MinioStorage(Storage): # Generate object key, use UUID to avoid conflicts file_name = os.path.basename(file_path) - object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" if self.path_prefix else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" + object_key = ( + f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" + if self.path_prefix + else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" + ) logger.info(f"Generated MinIO object key: {object_key}") # Upload file logger.info("Attempting to upload file to MinIO") - with open(file_path, 'rb') as file_data: + with open(file_path, "rb") as file_data: file_size = os.path.getsize(file_path) self.client.put_object( - bucket_name=self.bucket_name, + bucket_name=self.bucket_name or "", object_name=object_key, data=file_data, length=file_size, - content_type='application/octet-stream' + content_type="application/octet-stream", ) # Get file URL - file_url = self._get_download_url( - self.bucket_name, - object_key, - self.use_ssl, - self.endpoint, - os.getenv("MINIO_PUBLIC_ENDPOINT", None) - ) + file_url = self._get_download_url(object_key) logger.info(f"Successfully uploaded file to MinIO: {file_url}") return file_url @@ -295,14 +316,14 @@ class MinioStorage(Storage): except Exception as e: logger.error(f"Failed to upload file to MinIO: {str(e)}") return "" - + def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str: """Upload bytes to MinIO - + Args: content: Byte content to upload file_ext: File extension - + Returns: File URL """ @@ -310,23 +331,21 @@ class MinioStorage(Storage): logger.info(f"Uploading bytes content to MinIO, size: {len(content)} bytes") if not self.client: return "" - - object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.path_prefix else f"images/{uuid.uuid4().hex}{file_ext}" + + object_key = ( + f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}" + if self.path_prefix + else f"images/{uuid.uuid4().hex}{file_ext}" + ) logger.info(f"Generated MinIO object key: {object_key}") self.client.put_object( - self.bucket_name, - object_key, - data=io.BytesIO(content), - length=len(content), - content_type="application/octet-stream" - ) - file_url = self._get_download_url( - self.bucket_name, - object_key, - self.use_ssl, - self.endpoint, - os.getenv("MINIO_PUBLIC_ENDPOINT", None) + self.bucket_name or "", + object_key, + data=io.BytesIO(content), + length=len(content), + content_type="application/octet-stream", ) + file_url = self._get_download_url(object_key) logger.info(f"Successfully uploaded bytes to MinIO: {file_url}") return file_url except Exception as e: @@ -335,26 +354,61 @@ class MinioStorage(Storage): return "" -def create_storage(storage_config=None) -> Storage: +class LocalStorage(Storage): + """Local file system storage implementation""" + + def __init__(self, storage_config: Dict[str, str] = {}): + self.storage_config = storage_config + base_dir = storage_config.get( + "base_dir", os.getenv("LOCAL_STORAGE_BASE_DIR", "") + ) + self.image_dir = os.path.join(base_dir, "images") + os.makedirs(self.image_dir, exist_ok=True) + + def upload_file(self, file_path: str) -> str: + logger.info(f"Uploading file to local storage: {file_path}") + return file_path + + 
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str: + logger.info(f"Uploading file to local storage: {len(content)} bytes") + fname = os.path.join(self.image_dir, f"{uuid.uuid4()}{file_ext}") + with open(fname, "wb") as f: + f.write(content) + return fname + + +class Base64Storage(Storage): + def upload_file(self, file_path: str) -> str: + logger.info(f"Uploading file to base64 storage: {file_path}") + return file_path + + def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str: + logger.info(f"Uploading file to base64 storage: {len(content)} bytes") + file_ext = file_ext.lstrip(".") + return f"data:image/{file_ext};base64,{endecode.decode_image(content)}" + + +def create_storage(storage_config: Dict[str, str] | None = None) -> Storage: """Create a storage instance based on configuration or environment variables - + Args: storage_config: Storage configuration dictionary - + Returns: Storage instance """ storage_type = os.getenv("STORAGE_TYPE", "cos").lower() - if storage_config: storage_type = str(storage_config.get("provider", storage_type)).lower() - logger.info(f"Creating {storage_type} storage instance") - + if storage_type == "minio": return MinioStorage(storage_config) elif storage_type == "cos": - # Default to COS return CosStorage(storage_config) - else: - return None \ No newline at end of file + elif storage_type == "local": + return LocalStorage(storage_config or {}) + elif storage_type == "base64": + return Base64Storage() + + raise ValueError(f"Invalid storage type: {storage_type}") diff --git a/docreader/parser/text_parser.py b/docreader/parser/text_parser.py index 0bd0dd7..7675f17 100644 --- a/docreader/parser/text_parser.py +++ b/docreader/parser/text_parser.py @@ -1,6 +1,8 @@ import logging -from .base_parser import BaseParser -from typing import Dict, Any, Tuple, Union + +from docreader.models.document import Document +from docreader.parser.base_parser import BaseParser +from docreader.utils import endecode logger = logging.getLogger(__name__) @@ -11,7 +13,7 @@ class TextParser(BaseParser): This parser handles text extraction and chunking from plain text documents. """ - def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]: + def parse_into_text(self, content: bytes) -> Document: """ Parse text document content by decoding bytes to string. 
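The decode step in the hunk below now goes through the shared `docreader.utils.endecode` helper rather than a method on the parser. A small sketch of its fallback behaviour, with encodings tried in the order listed in `endecode.decode_bytes`:

```python
# Sketch of the shared decoder TextParser delegates to: utf-8 is tried first,
# then gb18030/gb2312/gbk/big5/ascii/latin-1, with a latin-1 "replace" fallback.
from docreader.utils import endecode

raw = "中文内容".encode("gb18030")   # not valid UTF-8, but decodable as GB18030
print(endecode.decode_bytes(raw))    # -> "中文内容"
```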
@@ -25,20 +27,15 @@ class TextParser(BaseParser): Parsed text content as string """ logger.info(f"Parsing text document, content size: {len(content)} bytes") - text = self.decode_bytes(content) + text = endecode.decode_bytes(content) logger.info( f"Successfully parsed text document, extracted {len(text)} characters" ) - return text + return Document(content=text) if __name__ == "__main__": - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - ) - logger.info("Running TextParser in standalone mode") + logger = logging.getLogger(__name__) # Sample text for testing text = """## 标题1 diff --git a/docreader/parser/web_parser.py b/docreader/parser/web_parser.py index 44c883a..e7291a0 100644 --- a/docreader/parser/web_parser.py +++ b/docreader/parser/web_parser.py @@ -1,11 +1,14 @@ -from typing import Any, Optional, Tuple, Dict, Union -import os - -from playwright.async_api import async_playwright -from bs4 import BeautifulSoup -from .base_parser import BaseParser, ParseResult -import logging import asyncio +import logging +import os +from typing import Any + +from bs4 import BeautifulSoup +from playwright.async_api import async_playwright + +from docreader.models.document import Document +from docreader.parser.base_parser import BaseParser +from docreader.utils import endecode logger = logging.getLogger(__name__) @@ -59,7 +62,7 @@ class WebParser(BaseParser): # Return empty BeautifulSoup object on error return BeautifulSoup("", "html.parser") - def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]: + def parse_into_text(self, content: bytes) -> Document: """Parse web page Args: @@ -78,10 +81,10 @@ class WebParser(BaseParser): # Run async method # Handle content possibly being a string if isinstance(content, bytes): - url = self.decode_bytes(content) + url = endecode.decode_bytes(content) logger.info(f"Decoded URL from bytes: {url}") else: - url = content + url = str(content) logger.info(f"Using content as URL directly: {url}") logger.info(f"Scraping web page: {url}") @@ -118,11 +121,11 @@ class WebParser(BaseParser): logger.info( f"Web page parsing complete, total content: {len(result)} characters" ) - return result + return Document(content=result) except Exception as e: logger.error(f"Error parsing web page: {str(e)}") - return f"Error parsing web page: {str(e)}" + return Document(content=f"Error parsing web page: {str(e)}") finally: # Close event loop diff --git a/docreader/proto/docreader_pb2.pyi b/docreader/proto/docreader_pb2.pyi new file mode 100644 index 0000000..9f7cdf6 --- /dev/null +++ b/docreader/proto/docreader_pb2.pyi @@ -0,0 +1,127 @@ +from google.protobuf.internal import containers as _containers +from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from collections.abc import Iterable as _Iterable, Mapping as _Mapping +from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union + +DESCRIPTOR: _descriptor.FileDescriptor + +class StorageProvider(int, metaclass=_enum_type_wrapper.EnumTypeWrapper): + __slots__ = () + STORAGE_PROVIDER_UNSPECIFIED: _ClassVar[StorageProvider] + COS: _ClassVar[StorageProvider] + MINIO: _ClassVar[StorageProvider] +STORAGE_PROVIDER_UNSPECIFIED: StorageProvider +COS: StorageProvider +MINIO: StorageProvider + +class StorageConfig(_message.Message): + __slots__ = ("provider", "region", 
"bucket_name", "access_key_id", "secret_access_key", "app_id", "path_prefix") + PROVIDER_FIELD_NUMBER: _ClassVar[int] + REGION_FIELD_NUMBER: _ClassVar[int] + BUCKET_NAME_FIELD_NUMBER: _ClassVar[int] + ACCESS_KEY_ID_FIELD_NUMBER: _ClassVar[int] + SECRET_ACCESS_KEY_FIELD_NUMBER: _ClassVar[int] + APP_ID_FIELD_NUMBER: _ClassVar[int] + PATH_PREFIX_FIELD_NUMBER: _ClassVar[int] + provider: StorageProvider + region: str + bucket_name: str + access_key_id: str + secret_access_key: str + app_id: str + path_prefix: str + def __init__(self, provider: _Optional[_Union[StorageProvider, str]] = ..., region: _Optional[str] = ..., bucket_name: _Optional[str] = ..., access_key_id: _Optional[str] = ..., secret_access_key: _Optional[str] = ..., app_id: _Optional[str] = ..., path_prefix: _Optional[str] = ...) -> None: ... + +class VLMConfig(_message.Message): + __slots__ = ("model_name", "base_url", "api_key", "interface_type") + MODEL_NAME_FIELD_NUMBER: _ClassVar[int] + BASE_URL_FIELD_NUMBER: _ClassVar[int] + API_KEY_FIELD_NUMBER: _ClassVar[int] + INTERFACE_TYPE_FIELD_NUMBER: _ClassVar[int] + model_name: str + base_url: str + api_key: str + interface_type: str + def __init__(self, model_name: _Optional[str] = ..., base_url: _Optional[str] = ..., api_key: _Optional[str] = ..., interface_type: _Optional[str] = ...) -> None: ... + +class ReadConfig(_message.Message): + __slots__ = ("chunk_size", "chunk_overlap", "separators", "enable_multimodal", "storage_config", "vlm_config") + CHUNK_SIZE_FIELD_NUMBER: _ClassVar[int] + CHUNK_OVERLAP_FIELD_NUMBER: _ClassVar[int] + SEPARATORS_FIELD_NUMBER: _ClassVar[int] + ENABLE_MULTIMODAL_FIELD_NUMBER: _ClassVar[int] + STORAGE_CONFIG_FIELD_NUMBER: _ClassVar[int] + VLM_CONFIG_FIELD_NUMBER: _ClassVar[int] + chunk_size: int + chunk_overlap: int + separators: _containers.RepeatedScalarFieldContainer[str] + enable_multimodal: bool + storage_config: StorageConfig + vlm_config: VLMConfig + def __init__(self, chunk_size: _Optional[int] = ..., chunk_overlap: _Optional[int] = ..., separators: _Optional[_Iterable[str]] = ..., enable_multimodal: bool = ..., storage_config: _Optional[_Union[StorageConfig, _Mapping]] = ..., vlm_config: _Optional[_Union[VLMConfig, _Mapping]] = ...) -> None: ... + +class ReadFromFileRequest(_message.Message): + __slots__ = ("file_content", "file_name", "file_type", "read_config", "request_id") + FILE_CONTENT_FIELD_NUMBER: _ClassVar[int] + FILE_NAME_FIELD_NUMBER: _ClassVar[int] + FILE_TYPE_FIELD_NUMBER: _ClassVar[int] + READ_CONFIG_FIELD_NUMBER: _ClassVar[int] + REQUEST_ID_FIELD_NUMBER: _ClassVar[int] + file_content: bytes + file_name: str + file_type: str + read_config: ReadConfig + request_id: str + def __init__(self, file_content: _Optional[bytes] = ..., file_name: _Optional[str] = ..., file_type: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ... + +class ReadFromURLRequest(_message.Message): + __slots__ = ("url", "title", "read_config", "request_id") + URL_FIELD_NUMBER: _ClassVar[int] + TITLE_FIELD_NUMBER: _ClassVar[int] + READ_CONFIG_FIELD_NUMBER: _ClassVar[int] + REQUEST_ID_FIELD_NUMBER: _ClassVar[int] + url: str + title: str + read_config: ReadConfig + request_id: str + def __init__(self, url: _Optional[str] = ..., title: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ... 
+ +class Image(_message.Message): + __slots__ = ("url", "caption", "ocr_text", "original_url", "start", "end") + URL_FIELD_NUMBER: _ClassVar[int] + CAPTION_FIELD_NUMBER: _ClassVar[int] + OCR_TEXT_FIELD_NUMBER: _ClassVar[int] + ORIGINAL_URL_FIELD_NUMBER: _ClassVar[int] + START_FIELD_NUMBER: _ClassVar[int] + END_FIELD_NUMBER: _ClassVar[int] + url: str + caption: str + ocr_text: str + original_url: str + start: int + end: int + def __init__(self, url: _Optional[str] = ..., caption: _Optional[str] = ..., ocr_text: _Optional[str] = ..., original_url: _Optional[str] = ..., start: _Optional[int] = ..., end: _Optional[int] = ...) -> None: ... + +class Chunk(_message.Message): + __slots__ = ("content", "seq", "start", "end", "images") + CONTENT_FIELD_NUMBER: _ClassVar[int] + SEQ_FIELD_NUMBER: _ClassVar[int] + START_FIELD_NUMBER: _ClassVar[int] + END_FIELD_NUMBER: _ClassVar[int] + IMAGES_FIELD_NUMBER: _ClassVar[int] + content: str + seq: int + start: int + end: int + images: _containers.RepeatedCompositeFieldContainer[Image] + def __init__(self, content: _Optional[str] = ..., seq: _Optional[int] = ..., start: _Optional[int] = ..., end: _Optional[int] = ..., images: _Optional[_Iterable[_Union[Image, _Mapping]]] = ...) -> None: ... + +class ReadResponse(_message.Message): + __slots__ = ("chunks", "error") + CHUNKS_FIELD_NUMBER: _ClassVar[int] + ERROR_FIELD_NUMBER: _ClassVar[int] + chunks: _containers.RepeatedCompositeFieldContainer[Chunk] + error: str + def __init__(self, chunks: _Optional[_Iterable[_Union[Chunk, _Mapping]]] = ..., error: _Optional[str] = ...) -> None: ... diff --git a/docreader/proto/docreader_pb2_grpc.py b/docreader/proto/docreader_pb2_grpc.py index 7cfcba5..b2c9f11 100644 --- a/docreader/proto/docreader_pb2_grpc.py +++ b/docreader/proto/docreader_pb2_grpc.py @@ -3,7 +3,7 @@ import grpc import warnings -from . import docreader_pb2 as docreader__pb2 +import docreader_pb2 as docreader__pb2 GRPC_GENERATED_VERSION = '1.76.0' GRPC_VERSION = grpc.__version__ diff --git a/docreader/pyproject.toml b/docreader/pyproject.toml index 3d24590..c8ccc36 100644 --- a/docreader/pyproject.toml +++ b/docreader/pyproject.toml @@ -16,6 +16,7 @@ dependencies = [ "lxml>=6.0.2", "markdown>=3.10", "markdownify>=1.2.0", + "markitdown[docx,pdf,xls,xlsx]>=0.1.3", "minio>=7.2.18", "mistletoe>=1.5.0", "ollama>=0.6.0", @@ -26,6 +27,7 @@ dependencies = [ "pillow>=12.0.0", "playwright>=1.55.0", "protobuf>=6.33.0", + "pydantic>=2.12.3", "pypdf>=6.1.3", "pypdf2>=3.0.1", "python-docx>=1.2.0", diff --git a/docreader/scripts/generate_proto.sh b/docreader/scripts/generate_proto.sh index 21516ae..4bef407 100755 --- a/docreader/scripts/generate_proto.sh +++ b/docreader/scripts/generate_proto.sh @@ -2,13 +2,14 @@ set -x # 设置目录 -PROTO_DIR="proto" -PYTHON_OUT="proto" -GO_OUT="proto" +PROTO_DIR="docreader/proto" +PYTHON_OUT="docreader/proto" +GO_OUT="docreader/proto" # 生成Python代码 python3 -m grpc_tools.protoc -I${PROTO_DIR} \ --python_out=${PYTHON_OUT} \ + --pyi_out=${PYTHON_OUT} \ --grpc_python_out=${PYTHON_OUT} \ ${PROTO_DIR}/docreader.proto @@ -22,10 +23,10 @@ protoc -I${PROTO_DIR} --go_out=${GO_OUT} \ # 修复Python导入问题(MacOS兼容版本) if [ "$(uname)" == "Darwin" ]; then # MacOS版本 - sed -i '' 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py + sed -i '' 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py else # Linux版本 - sed -i 's/from . 
import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py + sed -i 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py fi echo "Proto files generated successfully!" \ No newline at end of file diff --git a/docreader/splitter/header_hook.py b/docreader/splitter/header_hook.py new file mode 100644 index 0000000..a178e27 --- /dev/null +++ b/docreader/splitter/header_hook.py @@ -0,0 +1,112 @@ +import re +from typing import Callable, Dict, List, Match, Pattern, Union + +from pydantic import BaseModel, Field + + +class HeaderTrackerHook(BaseModel): + """表头追踪Hook的配置类,支持多种场景的表头识别""" + + start_pattern: Pattern[str] = Field( + description="表头开始匹配(正则表达式或字符串)" + ) + end_pattern: Pattern[str] = Field(description="表头结束匹配(正则表达式或字符串)") + extract_header_fn: Callable[[Match[str]], str] = Field( + default=lambda m: m.group(0), + description="从开始匹配结果中提取表头内容的函数(默认取匹配到的整个内容)", + ) + priority: int = Field(default=0, description="优先级(多个配置时,高优先级先匹配)") + case_sensitive: bool = Field( + default=True, description="是否大小写敏感(仅当传入字符串pattern时生效)" + ) + + def __init__( + self, + start_pattern: Union[str, Pattern[str]], + end_pattern: Union[str, Pattern[str]], + **kwargs, + ): + flags = 0 if kwargs.get("case_sensitive", True) else re.IGNORECASE + if isinstance(start_pattern, str): + start_pattern = re.compile(start_pattern, flags | re.DOTALL) + if isinstance(end_pattern, str): + end_pattern = re.compile(end_pattern, flags | re.DOTALL) + super().__init__( + start_pattern=start_pattern, + end_pattern=end_pattern, + **kwargs, + ) + + +# 初始化表头Hook配置(提供默认配置:支持Markdown表格、代码块) +DEFAULT_CONFIGS = [ + # 代码块配置(```开头,```结尾) + # HeaderTrackerHook( + # # 代码块开始(支持语言指定) + # start_pattern=r"^\s*```(\w+).*(?!```$)", + # # 代码块结束 + # end_pattern=r"^\s*```.*$", + # extract_header_fn=lambda m: f"```{m.group(1)}" if m.group(1) else "```", + # priority=20, # 代码块优先级高于表格 + # case_sensitive=True, + # ), + # Markdown表格配置(表头带下划线) + HeaderTrackerHook( + # 表头行 + 分隔行 + start_pattern=r"^\s*(?:\|[^|\n]*)+[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|?[\r\n]+$", + # 空行或非表格内容 + end_pattern=r"^\s*$|^\s*[^|\s].*$", + priority=15, + case_sensitive=False, + ), +] +DEFAULT_CONFIGS.sort(key=lambda x: -x.priority) + + +# 定义Hook状态数据结构 +class HeaderTracker(BaseModel): + """表头追踪 Hook 的状态类""" + + header_hook_configs: List[HeaderTrackerHook] = Field(default=DEFAULT_CONFIGS) + active_headers: Dict[int, str] = Field(default_factory=dict) + ended_headers: set[int] = Field(default_factory=set) + + def update(self, split: str) -> Dict[int, str]: + """检测当前split中的表头开始/结束,更新Hook状态""" + new_headers: Dict[int, str] = {} + + # 1. 检查是否有表头结束标记 + for config in self.header_hook_configs: + if config.priority in self.active_headers and config.end_pattern.search( + split + ): + self.ended_headers.add(config.priority) + del self.active_headers[config.priority] + + # 2. 检查是否有新的表头开始标记(只处理未活跃且未结束的) + for config in self.header_hook_configs: + if ( + config.priority not in self.active_headers + and config.priority not in self.ended_headers + ): + match = config.start_pattern.search(split) + if match: + header = config.extract_header_fn(match) + self.active_headers[config.priority] = header + new_headers[config.priority] = header + + # 3. 
检查是否所有活跃表头都已结束(清空结束标记) + if not self.active_headers: + self.ended_headers.clear() + + return new_headers + + def get_headers(self) -> str: + """获取当前所有活跃表头的拼接文本(按优先级排序)""" + # 按优先级降序排列表头 + sorted_headers = sorted(self.active_headers.items(), key=lambda x: -x[0]) + return ( + "\n".join([header for _, header in sorted_headers]) + if sorted_headers + else "" + ) diff --git a/docreader/splitter/splitter.py b/docreader/splitter/splitter.py new file mode 100644 index 0000000..abb4851 --- /dev/null +++ b/docreader/splitter/splitter.py @@ -0,0 +1,313 @@ +"""Token splitter.""" + +import itertools +import logging +import re +from typing import Callable, Generic, List, Pattern, Tuple, TypeVar + +from pydantic import BaseModel, Field, PrivateAttr + +from docreader.splitter.header_hook import ( + HeaderTracker, +) +from docreader.utils.split import split_by_char, split_by_sep + +DEFAULT_CHUNK_OVERLAP = 100 +DEFAULT_CHUNK_SIZE = 512 + +T = TypeVar("T") + +logger = logging.getLogger(__name__) + + +class TextSplitter(BaseModel, Generic[T]): + chunk_size: int = Field(description="The token chunk size for each chunk.") + chunk_overlap: int = Field( + description="The token overlap of each chunk when splitting." + ) + separators: List[str] = Field( + description="Default separators for splitting into words" + ) + + # Try to keep the matched characters as a whole. + # If it's too long, the content will be further segmented. + protected_regex: List[str] = Field( + description="Protected regex for splitting into words" + ) + len_function: Callable[[str], int] = Field(description="The length function.") + # Header tracking Hook related attributes + header_hook: HeaderTracker = Field(default_factory=HeaderTracker, exclude=True) + + _protected_fns: List[Pattern] = PrivateAttr() + _split_fns: List[Callable] = PrivateAttr() + + def __init__( + self, + chunk_size: int = DEFAULT_CHUNK_SIZE, + chunk_overlap: int = DEFAULT_CHUNK_OVERLAP, + separators: List[str] = ["\n", "。", " "], + protected_regex: List[str] = [ + # math formula + r"\$\$[\s\S]*?\$\$", + # image + r"!\[.*?\]\(.*?\)", + # link + r"\[.*?\]\(.*?\)", + # table header + r"(?:\|[^|\n]*)+\|[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|[\r\n]+", + # table body + r"(?:\|[^|\n]*)+\|[\r\n]+", + # code header + r"```(?:\w+)[\r\n]+[^\r\n]*", + ], + length_function: Callable[[str], int] = lambda x: len(x), + ): + """Initialize with parameters.""" + if chunk_overlap > chunk_size: + raise ValueError( + f"Got a larger chunk overlap ({chunk_overlap}) than chunk size " + f"({chunk_size}), should be smaller." + ) + + super().__init__( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + separators=separators, + protected_regex=protected_regex, + len_function=length_function, + ) + self._protected_fns = [re.compile(reg) for reg in protected_regex] + self._split_fns = [split_by_sep(sep) for sep in separators] + [split_by_char()] + + def split_text(self, text: str) -> List[Tuple[int, int, str]]: + """Split text into chunks.""" + if text == "": + return [] + + splits = self._split(text) + protect = self._split_protected(text) + splits = self._join(splits, protect) + + assert "".join(splits) == text + + chunks = self._merge(splits) + return chunks + + def _split(self, text: str) -> List[str]: + """Break text into splits that are smaller than chunk size. + + NOTE: the splits contain the separators. 
+ """ + if self.len_function(text) <= self.chunk_size: + return [text] + + splits = [] + for split_fn in self._split_fns: + splits = split_fn(text) + if len(splits) > 1: + break + + new_splits = [] + for split in splits: + split_len = self.len_function(split) + if split_len <= self.chunk_size: + new_splits.append(split) + else: + # recursively split + new_splits.extend(self._split(split)) + return new_splits + + def _merge(self, splits: List[str]) -> List[Tuple[int, int, str]]: + """Merge splits into chunks. + + The high-level idea is to keep adding splits to a chunk until we + exceed the chunk size, then we start a new chunk with overlap. + + When we start a new chunk, we pop off the first element of the previous + chunk until the total length is less than the chunk size. + """ + chunks: List[Tuple[int, int, str]] = [] + + cur_chunk: List[Tuple[int, int, str]] = [] + + cur_headers, cur_len = "", 0 + cur_start, cur_end = 0, 0 + for split in splits: + cur_end = cur_start + len(split) + split_len = self.len_function(split) + if split_len > self.chunk_size: + logger.error( + f"Got a split of size {split_len}, ", + f"larger than chunk size {self.chunk_size}.", + ) + + self.header_hook.update(split) + cur_headers = self.header_hook.get_headers() + cur_headers_len = self.len_function(cur_headers) + + if cur_headers_len > self.chunk_size: + logger.error( + f"Got headers of size {cur_headers_len}, ", + f"larger than chunk size {self.chunk_size}.", + ) + cur_headers, cur_headers_len = "", 0 + + # if we exceed the chunk size after adding the new split, then + # we need to end the current chunk and start a new one + if cur_len + split_len + cur_headers_len > self.chunk_size: + # end the previous chunk + if len(cur_chunk) > 0: + chunks.append( + ( + cur_chunk[0][0], + cur_chunk[-1][1], + "".join([c[2] for c in cur_chunk]), + ) + ) + + # start a new chunk with overlap + # keep popping off the first element of the previous chunk until: + # 1. the current chunk length is less than chunk overlap + # 2. 
the total length is less than chunk size + while cur_chunk and ( + cur_len > self.chunk_overlap + or cur_len + split_len + cur_headers_len > self.chunk_size + ): + # pop off the first element + first_chunk = cur_chunk.pop(0) + cur_len -= self.len_function(first_chunk[2]) + + if ( + cur_headers + and split_len + cur_headers_len < self.chunk_size + and cur_headers not in split + ): + cur_chunk.insert( + 0, + ( + cur_chunk[0][0] if cur_chunk else cur_start, + cur_chunk[0][1] if cur_chunk else cur_end, + cur_headers, + ), + ) + cur_len += cur_headers_len + + cur_chunk.append((cur_start, cur_end, split)) + cur_len += split_len + cur_start = cur_end + + # handle the last chunk + assert cur_chunk + if cur_headers and cur_len < self.chunk_size: + cur_chunk.insert(0, (cur_chunk[0][0], cur_chunk[0][1], cur_headers)) + chunks.append( + ( + cur_chunk[0][0], + cur_chunk[-1][1], + "".join([c[2] for c in cur_chunk]), + ) + ) + + return chunks + + def _split_protected(self, text: str) -> List[Tuple[int, str]]: + matches = [ + (match.start(), match.end()) + for pattern in self._protected_fns + for match in pattern.finditer(text) + ] + matches.sort(key=lambda x: (x[0], -x[1])) + + res = [] + + def fold(initial: int, current: Tuple[int, int]) -> int: + if current[0] >= initial: + if current[1] - current[0] < self.chunk_size: + res.append((current[0], text[current[0] : current[1]])) + else: + logger.warning(f"Protected text ignore: {current}") + return max(initial, current[1]) + + # filter overlapping matches + list(itertools.accumulate(matches, fold, initial=-1)) + return res + + def _join(self, splits: List[str], protect: List[Tuple[int, str]]) -> List[str]: + """ + Merges and splits elements in splits array based on protected substrings. + + The function processes the input splits to ensure all protected substrings + remain as single items. If a protected substring is concatenated with preceding + or following content in any split element, it will be separated from + the adjacent content. The final result maintains the original order of content + while enforcing the integrity of protected substrings. + + Key behaviors: + 1. Preserves the complete structure of each protected substring + 2. Separates protected substrings from any adjacent non-protected content + 3. Maintains the original sequence of all content except for necessary + 4. 
Handles cases where protected substrings are partially concatenated + """ + j = 0 + point, start = 0, 0 + res = [] + + for split in splits: + end = start + len(split) + + cur = split[point - start :] + while j < len(protect): + p_start, p_content = protect[j] + p_end = p_start + len(p_content) + + if end <= p_start: + break + + if point < p_start: + local_end = p_start - point + res.append(cur[:local_end]) + cur = cur[local_end:] + point = p_start + + res.append(p_content) + j += 1 + + if point < p_end: + local_start = p_end - point + cur = cur[local_start:] + point = p_end + + if not cur: + break + + if cur: + res.append(cur) + point = end + + start = end + return res + + +if __name__ == "__main__": + s = """ + 这是一些普通文本。 + + | 姓名 | 年龄 | 城市 | + |------|------|------| + | 张三 | 25 | 北京 | + | 李四 | 30 | 上海 | + | 王五 | 28 | 广州 | + | 张三 | 25 | 北京 | + | 李四 | 30 | 上海 | + | 王五 | 28 | 广州 | + + 这是文本结束。 + +""" + + sp = TextSplitter(chunk_size=200, chunk_overlap=2) + ck = sp.split_text(s) + for c in ck: + print("------", len(c)) + print(c) + pass diff --git a/docreader/utils/endecode.py b/docreader/utils/endecode.py new file mode 100644 index 0000000..2457d07 --- /dev/null +++ b/docreader/utils/endecode.py @@ -0,0 +1,103 @@ +import base64 +import binascii +import io +import logging +from typing import List, Union + +import numpy as np +from PIL import Image + +logger = logging.getLogger(__name__) + + +def decode_image(image: Union[str, bytes, Image.Image, np.ndarray]) -> str: + """Convert image to base64 encoded string + + Args: + image: Image file path, bytes, PIL Image object, or numpy array + + Returns: + Base64 encoded image string, or empty string if conversion fails + """ + if isinstance(image, str): + # It's a file path + with open(image, "rb") as image_file: + return base64.b64encode(image_file.read()).decode() + + elif isinstance(image, bytes): + # It's bytes data + return base64.b64encode(image).decode() + + elif isinstance(image, Image.Image): + # It's a PIL Image + buffer = io.BytesIO() + image.save(buffer, format=image.format) + return base64.b64encode(buffer.getvalue()).decode() + + elif isinstance(image, np.ndarray): + # It's a numpy array + pil_image = Image.fromarray(image) + buffer = io.BytesIO() + pil_image.save(buffer, format="PNG") + return base64.b64encode(buffer.getvalue()).decode() + + raise ValueError(f"Unsupported image type: {type(image)}") + + +def encode_image(image: str, errors="strict") -> bytes: + """ + Decode image bytes using base64. + + errors + The error handling scheme to use for the handling of decoding errors. + The default is 'strict' meaning that decoding errors raise a + UnicodeDecodeError. Other possible values are 'ignore' and '????' + as well as any other name registered with codecs.register_error that + can handle UnicodeDecodeErrors. 
+ """ + try: + image_bytes = base64.b64decode(image) + except binascii.Error as e: + if errors == "ignore": + return b"" + else: + raise e + return image_bytes + + +def encode_bytes(content: str) -> bytes: + return content.encode() + + +def decode_bytes( + content: bytes, + encodings: List[str] = [ + "utf-8", + "gb18030", + "gb2312", + "gbk", + "big5", + "ascii", + "latin-1", + ], +) -> str: + # Try decoding with each encoding format + for encoding in encodings: + try: + text = content.decode(encoding) + logger.debug(f"Decode content with {encoding}: {len(text)} characters") + return text + except UnicodeDecodeError: + continue + + text = content.decode(encoding="latin-1", errors="replace") + logger.warning( + "Unable to determine correct encoding, using latin-1 as fallback. " + "This may cause character issues." + ) + return text + + +if __name__ == "__main__": + img = "test![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMgA)test" + encode_image(img, errors="ignore") diff --git a/docreader/utils/request.py b/docreader/utils/request.py index 867892c..0a6af5b 100644 --- a/docreader/utils/request.py +++ b/docreader/utils/request.py @@ -1,10 +1,10 @@ -from contextvars import ContextVar -import logging -import uuid import contextlib +import logging import time -from typing import Optional +import uuid +from contextvars import ContextVar from logging import LogRecord +from typing import Optional # 配置日志 logger = logging.getLogger(__name__) @@ -26,21 +26,21 @@ def get_request_id() -> Optional[str]: class MillisecondFormatter(logging.Formatter): """自定义日志格式化器,只显示毫秒级时间戳(3位数字)而不是微秒(6位)""" - + def formatTime(self, record, datefmt=None): """重写formatTime方法,将微秒格式化为毫秒""" # 先获取标准的格式化时间 result = super().formatTime(record, datefmt) - + # 如果使用了包含.%f的格式,则将微秒(6位)截断为毫秒(3位) if datefmt and ".%f" in datefmt: # 格式化的时间字符串应该在最后有6位微秒数 - parts = result.split('.') + parts = result.split(".") if len(parts) > 1 and len(parts[1]) >= 6: # 只保留前3位作为毫秒 millis = parts[1][:3] result = f"{parts[0]}.{millis}" - + return result diff --git a/docreader/utils/split.py b/docreader/utils/split.py new file mode 100644 index 0000000..6442c4f --- /dev/null +++ b/docreader/utils/split.py @@ -0,0 +1,34 @@ +import re +from typing import Callable, List + + +def split_text_keep_separator(text: str, separator: str) -> List[str]: + """Split text with separator and keep the separator at the end of each split.""" + parts = text.split(separator) + result = [separator + s if i > 0 else s for i, s in enumerate(parts)] + return [s for s in result if s] + + +def split_by_sep(sep: str, keep_sep: bool = True) -> Callable[[str], List[str]]: + """Split text by separator.""" + if keep_sep: + return lambda text: split_text_keep_separator(text, sep) + else: + return lambda text: text.split(sep) + + +def split_by_char() -> Callable[[str], List[str]]: + """Split text by character.""" + return lambda text: list(text) + + +def split_by_regex(regex: str) -> Callable[[str], List[str]]: + """Split text by regex.""" + pattern = re.compile(f"({regex})") + return lambda text: list(filter(None, pattern.split(text))) + + +def match_by_regex(regex: str) -> Callable[[str], bool]: + """Split text by regex.""" + pattern = re.compile(regex) + return lambda text: bool(pattern.match(text)) diff --git a/docreader/utils/tempfile.py b/docreader/utils/tempfile.py new file mode 100644 index 0000000..ab61619 --- /dev/null +++ b/docreader/utils/tempfile.py @@ -0,0 +1,77 @@ +import logging +import os +import tempfile + +logger = logging.getLogger(__name__) + + +class TempFileContext: 
+ def __init__(self, file_content: bytes, suffix: str): + """ + Initialize the context + :param file_content: Byte data to write to file + :param suffix: File suffix + """ + self.file_content = file_content + self.suffix = suffix + self.file = None + + def __enter__(self): + """ + Create file when entering context + """ + self.temp_file = tempfile.NamedTemporaryFile(suffix=self.suffix, delete=False) + self.temp_file.write(self.file_content) + self.temp_file.flush() + logger.info( + f"Saved {self.suffix} content to temporary file: {self.temp_file.name}" + ) + return self.temp_file.name + + def __exit__(self, exc_type, exc_val, exc_tb): + """ + Delete file when exiting context + """ + if self.temp_file: + self.temp_file.close() + if os.path.exists(self.temp_file.name): + os.remove(self.temp_file.name) + logger.info(f"File {self.temp_file.name} has been deleted.") + # Return False to propagate exception (if any exception occurred) + return False + + +class TempDirContext: + def __init__(self): + """ + Initialize the context + """ + self.temp_dir = None + + def __enter__(self): + """ + Create directory when entering context + """ + self.temp_dir = tempfile.TemporaryDirectory() + logger.info(f"Created temporary directory: {self.temp_dir.name}") + return self.temp_dir.name + + def __exit__(self, exc_type, exc_val, exc_tb): + """ + Delete directory when exiting context + """ + if self.temp_dir and os.path.exists(self.temp_dir.name): + self.temp_dir.cleanup() + logger.info(f"Directory {self.temp_dir.name} has been deleted.") + # Return False to propagate exception (if any exception occurred) + return False + + +if __name__ == "__main__": + example_bytes = b"Hello, this is a test file." + file_name = "test_file.txt" + + # Using with statement + with TempFileContext(example_bytes, file_name) as temp_file: + # File operations can be performed within the context + print(f"Does file {file_name} exist: {os.path.exists(file_name)}") diff --git a/docreader/uv.lock b/docreader/uv.lock index c5bfad0..8e53ad9 100644 --- a/docreader/uv.lock +++ b/docreader/uv.lock @@ -6,17 +6,22 @@ resolution-markers = [ "python_full_version == '3.13.*' and sys_platform == 'darwin'", "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')", - "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", + "python_full_version == '3.13.*' and sys_platform == 'win32'", + "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", "python_full_version == '3.12.*' and sys_platform == 'darwin'", "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version == '3.12.*' and platform_machine != 
'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.12.*' and sys_platform == 'win32'", + "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", "python_full_version == '3.11.*' and sys_platform == 'darwin'", "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.11.*' and sys_platform == 'win32'", + "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", "python_full_version < '3.11' and sys_platform == 'darwin'", "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.11' and sys_platform == 'win32'", + "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", ] [[package]] @@ -423,6 +428,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/db/d3/9dcc0f5797f070ec8edf30fbadfb200e71d9db6b84d211e3b2085a7589a0/click-8.3.0-py3-none-any.whl", hash = "sha256:9b9f285302c6e3064f4330c05f05b81945b2a39544279343e6e7c5f27a9baddc", size = 107295, upload-time = "2025-09-18T17:32:22.42Z" }, ] +[[package]] +name = "cobble" +version = "0.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/7a/a507c709be2c96e1bb6102eb7b7f4026c5e5e223ef7d745a17d239e9d844/cobble-0.1.4.tar.gz", hash = "sha256:de38be1539992c8a06e569630717c485a5f91be2192c461ea2b220607dfa78aa", size = 3805, upload-time = "2024-06-01T18:11:09.528Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/e1/3714a2f371985215c219c2a70953d38e3eed81ef165aed061d21de0e998b/cobble-0.1.4-py3-none-any.whl", hash = "sha256:36c91b1655e599fd428e2b95fdd5f0da1ca2e9f1abb0bc871dec21a0e78a2b44", size = 3984, upload-time = "2024-06-01T18:11:07.911Z" }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -432,6 +446,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, ] +[[package]] +name = "coloredlogs" +version = "15.0.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "humanfriendly" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cc/c7/eed8f27100517e8c0e6b923d5f0845d0cb99763da6fdee00478f91db7325/coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0", size = 278520, upload-time = "2021-06-11T10:22:45.202Z" } +wheels = [ 
+ { url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018, upload-time = "2021-06-11T10:22:42.561Z" }, +] + [[package]] name = "cos-python-sdk-v5" version = "1.9.38" @@ -587,6 +613,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/18/d5/7a04640bf559bb890455ffb28978daf7d44f667c3f04a4d422c655c1ba92/cython-3.1.6-py3-none-any.whl", hash = "sha256:91dcf7eb9b6a089ce4e9e1140e571d84c3bca834afb77ec269be7aa9d31a8157", size = 1223550, upload-time = "2025-10-23T12:38:16.732Z" }, ] +[[package]] +name = "defusedxml" +version = "0.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0f/d5/c66da9b79e5bdb124974bfe172b4daf3c984ebd9c2a06e2b8a4dc7331c72/defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69", size = 75520, upload-time = "2021-03-08T10:59:26.269Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" }, +] + [[package]] name = "distro" version = "1.9.0" @@ -612,6 +647,7 @@ dependencies = [ { name = "lxml" }, { name = "markdown" }, { name = "markdownify" }, + { name = "markitdown", extra = ["docx", "pdf", "xls", "xlsx"] }, { name = "minio" }, { name = "mistletoe" }, { name = "ollama" }, @@ -622,6 +658,7 @@ dependencies = [ { name = "pillow" }, { name = "playwright" }, { name = "protobuf" }, + { name = "pydantic" }, { name = "pypdf" }, { name = "pypdf2" }, { name = "python-docx" }, @@ -643,6 +680,7 @@ requires-dist = [ { name = "lxml", specifier = ">=6.0.2" }, { name = "markdown", specifier = ">=3.10" }, { name = "markdownify", specifier = ">=1.2.0" }, + { name = "markitdown", extras = ["docx", "pdf", "xls", "xlsx"], specifier = ">=0.1.3" }, { name = "minio", specifier = ">=7.2.18" }, { name = "mistletoe", specifier = ">=1.5.0" }, { name = "ollama", specifier = ">=0.6.0" }, @@ -653,6 +691,7 @@ requires-dist = [ { name = "pillow", specifier = ">=12.0.0" }, { name = "playwright", specifier = ">=1.55.0" }, { name = "protobuf", specifier = ">=6.33.0" }, + { name = "pydantic", specifier = ">=2.12.3" }, { name = "pypdf", specifier = ">=6.1.3" }, { name = "pypdf2", specifier = ">=3.0.1" }, { name = "python-docx", specifier = ">=1.2.0" }, @@ -683,6 +722,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bf/ee/aa015c5de8b0dc42a8e507eae8c2de5d1c0e068c896858fec6d502402ed6/ebooklib-0.20-py3-none-any.whl", hash = "sha256:fff5322517a37e31c972d27be7d982cc3928c16b3dcc5fd7e8f7c0f5d7bcf42b", size = 40995, upload-time = "2025-10-26T20:56:19.104Z" }, ] +[[package]] +name = "et-xmlfile" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = 
"sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" }, +] + [[package]] name = "exceptiongroup" version = "1.3.0" @@ -707,6 +755,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/4c/93d0f85318da65923e4b91c1c2ff03d8a458cbefebe3bc612a6693c7906d/fire-0.7.1-py3-none-any.whl", hash = "sha256:e43fd8a5033a9001e7e2973bab96070694b9f12f2e0ecf96d4683971b5ab1882", size = 115945, upload-time = "2025-08-16T20:20:22.87Z" }, ] +[[package]] +name = "flatbuffers" +version = "25.9.23" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9d/1f/3ee70b0a55137442038f2a33469cc5fddd7e0ad2abf83d7497c18a2b6923/flatbuffers-25.9.23.tar.gz", hash = "sha256:676f9fa62750bb50cf531b42a0a2a118ad8f7f797a511eda12881c016f093b12", size = 22067, upload-time = "2025-09-24T05:25:30.106Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ee/1b/00a78aa2e8fbd63f9af08c9c19e6deb3d5d66b4dda677a0f61654680ee89/flatbuffers-25.9.23-py2.py3-none-any.whl", hash = "sha256:255538574d6cb6d0a79a17ec8bc0d30985913b87513a01cce8bcdb6b4c44d0e2", size = 30869, upload-time = "2025-09-24T05:25:28.912Z" }, +] + [[package]] name = "fonttools" version = "4.60.1" @@ -850,6 +907,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7f/91/ae2eb6b7979e2f9b035a9f612cf70f1bf54aad4e1d125129bef1eae96f19/greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d", size = 584358, upload-time = "2025-08-07T13:18:23.708Z" }, { url = "https://files.pythonhosted.org/packages/f7/85/433de0c9c0252b22b16d413c9407e6cb3b41df7389afc366ca204dbc1393/greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5", size = 1113550, upload-time = "2025-08-07T13:42:37.467Z" }, { url = "https://files.pythonhosted.org/packages/a1/8d/88f3ebd2bc96bf7747093696f4335a0a8a4c5acfcf1b757717c0d2474ba3/greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f", size = 1137126, upload-time = "2025-08-07T13:18:20.239Z" }, + { url = "https://files.pythonhosted.org/packages/f1/29/74242b7d72385e29bcc5563fba67dad94943d7cd03552bac320d597f29b2/greenlet-3.2.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f47617f698838ba98f4ff4189aef02e7343952df3a615f847bb575c3feb177a7", size = 1544904, upload-time = "2025-11-04T12:42:04.763Z" }, + { url = "https://files.pythonhosted.org/packages/c8/e2/1572b8eeab0f77df5f6729d6ab6b141e4a84ee8eb9bc8c1e7918f94eda6d/greenlet-3.2.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af41be48a4f60429d5cad9d22175217805098a9ef7c40bfef44f7669fb9d74d8", size = 1611228, upload-time = "2025-11-04T12:42:08.423Z" }, { url = "https://files.pythonhosted.org/packages/d6/6f/b60b0291d9623c496638c582297ead61f43c4b72eef5e9c926ef4565ec13/greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c", size = 298654, upload-time = "2025-08-07T13:50:00.469Z" }, { url = "https://files.pythonhosted.org/packages/a4/de/f28ced0a67749cac23fecb02b694f6473f47686dff6afaa211d186e2ef9c/greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2", size = 272305, upload-time = "2025-08-07T13:15:41.288Z" }, { url = 
"https://files.pythonhosted.org/packages/09/16/2c3792cba130000bf2a31c5272999113f4764fd9d874fb257ff588ac779a/greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246", size = 632472, upload-time = "2025-08-07T13:42:55.044Z" }, @@ -859,6 +918,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684, upload-time = "2025-08-07T13:18:25.164Z" }, { url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647, upload-time = "2025-08-07T13:42:38.655Z" }, { url = "https://files.pythonhosted.org/packages/3f/cc/b07000438a29ac5cfb2194bfc128151d52f333cee74dd7dfe3fb733fc16c/greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa", size = 1142073, upload-time = "2025-08-07T13:18:21.737Z" }, + { url = "https://files.pythonhosted.org/packages/67/24/28a5b2fa42d12b3d7e5614145f0bd89714c34c08be6aabe39c14dd52db34/greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c", size = 1548385, upload-time = "2025-11-04T12:42:11.067Z" }, + { url = "https://files.pythonhosted.org/packages/6a/05/03f2f0bdd0b0ff9a4f7b99333d57b53a7709c27723ec8123056b084e69cd/greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5", size = 1613329, upload-time = "2025-11-04T12:42:12.928Z" }, { url = "https://files.pythonhosted.org/packages/d8/0f/30aef242fcab550b0b3520b8e3561156857c94288f0332a79928c31a52cf/greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9", size = 299100, upload-time = "2025-08-07T13:44:12.287Z" }, { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, @@ -868,6 +929,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = 
"2025-08-07T13:42:39.858Z" }, { url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" }, + { url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" }, + { url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" }, { url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" }, { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, @@ -877,6 +940,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, { url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" }, + { url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" }, + { url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" }, { url = 
"https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" }, { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, @@ -884,6 +949,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, + { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, + { url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760, upload-time = "2025-11-04T12:42:25.341Z" }, { url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" }, ] @@ -1061,6 +1128,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[[package]] +name = "humanfriendly" +version = "10.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyreadline3", marker = "sys_platform == 'win32'" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/cc/3f/2c29224acb2e2df4d2046e4c73ee2662023c58ff5b113c4c1adac0886c43/humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc", size = 360702, upload-time = "2021-09-17T21:40:43.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", size = 86794, upload-time = "2021-09-17T21:40:39.897Z" }, +] + [[package]] name = "idna" version = "3.11" @@ -1386,6 +1465,38 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6c/77/d7f491cbc05303ac6801651aabeb262d43f319288c1ea96c66b1d2692ff3/lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e", size = 3518768, upload-time = "2025-09-22T04:04:57.097Z" }, ] +[[package]] +name = "magika" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "onnxruntime", version = "1.20.1", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "onnxruntime", version = "1.23.2", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, + { name = "python-dotenv" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a3/f3/3d1dcdd7b9c41d589f5cff252d32ed91cdf86ba84391cfc81d9d8773571d/magika-0.6.3.tar.gz", hash = "sha256:7cc52aa7359af861957043e2bf7265ed4741067251c104532765cd668c0c0cb1", size = 3042784, upload-time = "2025-10-30T15:22:34.499Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/e4/35c323beb3280482c94299d61626116856ac2d4ec16ecef50afc4fdd4291/magika-0.6.3-py3-none-any.whl", hash = "sha256:eda443d08006ee495e02083b32e51b98cb3696ab595a7d13900d8e2ef506ec9d", size = 2969474, upload-time = "2025-10-30T15:22:25.298Z" }, + { url = "https://files.pythonhosted.org/packages/25/8f/132b0d7cd51c02c39fd52658a5896276c30c8cc2fd453270b19db8c40f7e/magika-0.6.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:86901e64b05dde5faff408c9b8245495b2e1fd4c226e3393d3d2a3fee65c504b", size = 13358841, upload-time = "2025-10-30T15:22:27.413Z" }, + { url = "https://files.pythonhosted.org/packages/c4/03/5ed859be502903a68b7b393b17ae0283bf34195cfcca79ce2dc25b9290e7/magika-0.6.3-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:3d9661eedbdf445ac9567e97e7ceefb93545d77a6a32858139ea966b5806fb64", size = 15367335, upload-time = "2025-10-30T15:22:29.907Z" }, + { url = "https://files.pythonhosted.org/packages/7b/9e/f8ee7d644affa3b80efdd623a3d75865c8f058f3950cb87fb0c48e3559bc/magika-0.6.3-py3-none-win_amd64.whl", hash = "sha256:e57f75674447b20cab4db928ae58ab264d7d8582b55183a0b876711c2b2787f3", size = 12692831, upload-time = "2025-10-30T15:22:32.063Z" }, +] + +[[package]] +name = "mammoth" +version = "1.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cobble" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/ed/3c/a58418d2af00f2da60d4a51e18cd0311307b72d48d2fffec36a97b4a5e44/mammoth-1.11.0.tar.gz", hash = "sha256:a0f59e442f34d5b6447f4b0999306cbf3e67aaabfa8cb516f878fb1456744637", size = 53142, upload-time = "2025-09-19T10:35:20.373Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/54/2e39566a131b13f6d8d193f974cb6a34e81bb7cc2fa6f7e03de067b36588/mammoth-1.11.0-py2.py3-none-any.whl", hash = "sha256:c077ab0d450bd7c0c6ecd529a23bf7e0fa8190c929e28998308ff4eada3f063b", size = 54752, upload-time = "2025-09-19T10:35:18.699Z" }, +] + [[package]] name = "markdown" version = "3.10" @@ -1408,6 +1519,41 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/e2/7af643acb4cae0741dffffaa7f3f7c9e7ab4046724543ba1777c401d821c/markdownify-1.2.0-py3-none-any.whl", hash = "sha256:48e150a1c4993d4d50f282f725c0111bd9eb25645d41fa2f543708fd44161351", size = 15561, upload-time = "2025-08-09T17:44:14.074Z" }, ] +[[package]] +name = "markitdown" +version = "0.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "beautifulsoup4" }, + { name = "charset-normalizer" }, + { name = "defusedxml" }, + { name = "magika" }, + { name = "markdownify" }, + { name = "onnxruntime", version = "1.20.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'win32'" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/87/31/90cef2bc8ecd85c200ed3b3d1e20fc7a724213502685c4b05b5431e02668/markitdown-0.1.3.tar.gz", hash = "sha256:b0d9127c3373a68274dede6af6c9bb0684b78ce364c727c4c304da97a20d6fd9", size = 40039, upload-time = "2025-08-26T22:37:04.4Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/97/83/7b47d2ecbf58650a03aeeb21ba2d59175f202bf4fb81d44f40f1deb82bc0/markitdown-0.1.3-py3-none-any.whl", hash = "sha256:08d9a25770979d78f60dcc0afcb868de6799608e4db65342b2e03304fb091251", size = 58391, upload-time = "2025-08-26T22:37:02.924Z" }, +] + +[package.optional-dependencies] +docx = [ + { name = "lxml" }, + { name = "mammoth" }, +] +pdf = [ + { name = "pdfminer-six" }, +] +xls = [ + { name = "pandas" }, + { name = "xlrd" }, +] +xlsx = [ + { name = "openpyxl" }, + { name = "pandas" }, +] + [[package]] name = "minio" version = "7.2.18" @@ -1433,6 +1579,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3f/c2/bd0fc48dc323035cd65d21c45a3bb5c7b054001bf4ea75e461beb7656e07/mistletoe-1.5.0-py3-none-any.whl", hash = "sha256:d4e77b991b998c5efe3c4eab4e1b472263b5349688acd50d79bd9a6c317a9df1", size = 55262, upload-time = "2025-10-18T16:37:08.376Z" }, ] +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, +] + [[package]] name = "networkx" version = "3.4.2" @@ -1440,7 +1595,8 @@ source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version < '3.11' and sys_platform == 'darwin'", "python_full_version < '3.11' and 
platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.11' and sys_platform == 'win32'", + "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", ] sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" } wheels = [ @@ -1456,14 +1612,18 @@ resolution-markers = [ "python_full_version == '3.13.*' and sys_platform == 'darwin'", "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')", - "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", + "python_full_version == '3.13.*' and sys_platform == 'win32'", + "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", "python_full_version == '3.12.*' and sys_platform == 'darwin'", "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.12.*' and sys_platform == 'win32'", + "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", "python_full_version == '3.11.*' and sys_platform == 'darwin'", "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.11.*' and sys_platform == 'win32'", + "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", ] sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = 
"sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" } wheels = [ @@ -1492,7 +1652,8 @@ source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version < '3.11' and sys_platform == 'darwin'", "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.11' and sys_platform == 'win32'", + "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", ] sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" } wheels = [ @@ -1561,14 +1722,18 @@ resolution-markers = [ "python_full_version == '3.13.*' and sys_platform == 'darwin'", "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')", - "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", + "python_full_version == '3.13.*' and sys_platform == 'win32'", + "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", "python_full_version == '3.12.*' and sys_platform == 'darwin'", "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.12.*' and sys_platform == 'win32'", + "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", "python_full_version == '3.11.*' and sys_platform == 'darwin'", "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.11.*' and sys_platform == 'win32'", + "(python_full_version == '3.11.*' and platform_machine != 
'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", ] sdist = { url = "https://files.pythonhosted.org/packages/b5/f4/098d2270d52b41f1bd7db9fc288aaa0400cb48c2a3e2af6fa365d9720947/numpy-2.3.4.tar.gz", hash = "sha256:a7d018bfedb375a8d979ac758b120ba846a7fe764911a64465fd87b8729f4a6a", size = 20582187, upload-time = "2025-10-15T16:18:11.77Z" } wheels = [ @@ -1660,6 +1825,97 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b5/c1/edc9f41b425ca40b26b7c104c5f6841a4537bb2552bfa6ca66e81405bb95/ollama-0.6.0-py3-none-any.whl", hash = "sha256:534511b3ccea2dff419ae06c3b58d7f217c55be7897c8ce5868dfb6b219cf7a0", size = 14130, upload-time = "2025-09-24T22:46:01.19Z" }, ] +[[package]] +name = "onnxruntime" +version = "1.20.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.14' and sys_platform == 'win32'", + "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", + "python_full_version == '3.13.*' and sys_platform == 'win32'", + "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", + "python_full_version == '3.12.*' and sys_platform == 'win32'", + "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", + "python_full_version == '3.11.*' and sys_platform == 'win32'", + "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", + "python_full_version < '3.11' and sys_platform == 'win32'", + "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", +] +dependencies = [ + { name = "coloredlogs", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "flatbuffers", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "packaging", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "protobuf", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 
'linux')" }, + { name = "sympy", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/6d/c6/c4c0860bee2fde6037bdd9dcd12d323f6e38cf00fcc9a5065b394337fc55/onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7b2908b50101a19e99c4d4e97ebb9905561daf61829403061c1adc1b588bc0de", size = 11954010, upload-time = "2024-11-21T00:48:35.254Z" }, + { url = "https://files.pythonhosted.org/packages/63/47/3dc0b075ab539f16b3d8b09df6b504f51836086ee709690a6278d791737d/onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d82daaec24045a2e87598b8ac2b417b1cce623244e80e663882e9fe1aae86410", size = 13330452, upload-time = "2024-11-21T00:48:40.02Z" }, + { url = "https://files.pythonhosted.org/packages/27/ef/80fab86289ecc01a734b7ddf115dfb93d8b2e004bd1e1977e12881c72b12/onnxruntime-1.20.1-cp310-cp310-win32.whl", hash = "sha256:4c4b251a725a3b8cf2aab284f7d940c26094ecd9d442f07dd81ab5470e99b83f", size = 9813849, upload-time = "2024-11-21T00:48:43.569Z" }, + { url = "https://files.pythonhosted.org/packages/a9/e6/33ab10066c9875a29d55e66ae97c3bf91b9b9b987179455d67c32261a49c/onnxruntime-1.20.1-cp310-cp310-win_amd64.whl", hash = "sha256:d3b616bb53a77a9463707bb313637223380fc327f5064c9a782e8ec69c22e6a2", size = 11329702, upload-time = "2024-11-21T00:48:46.599Z" }, + { url = "https://files.pythonhosted.org/packages/a5/da/c44bf9bd66cd6d9018a921f053f28d819445c4d84b4dd4777271b0fe52a2/onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f6243e34d74423bdd1edf0ae9596dd61023b260f546ee17d701723915f06a9f7", size = 11955227, upload-time = "2024-11-21T00:48:54.556Z" }, + { url = "https://files.pythonhosted.org/packages/11/ac/4120dfb74c8e45cce1c664fc7f7ce010edd587ba67ac41489f7432eb9381/onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5eec64c0269dcdb8d9a9a53dc4d64f87b9e0c19801d9321246a53b7eb5a7d1bc", size = 13331703, upload-time = "2024-11-21T00:48:57.97Z" }, + { url = "https://files.pythonhosted.org/packages/12/f1/cefacac137f7bb7bfba57c50c478150fcd3c54aca72762ac2c05ce0532c1/onnxruntime-1.20.1-cp311-cp311-win32.whl", hash = "sha256:a19bc6e8c70e2485a1725b3d517a2319603acc14c1f1a017dda0afe6d4665b41", size = 9813977, upload-time = "2024-11-21T00:49:00.519Z" }, + { url = "https://files.pythonhosted.org/packages/2c/2d/2d4d202c0bcfb3a4cc2b171abb9328672d7f91d7af9ea52572722c6d8d96/onnxruntime-1.20.1-cp311-cp311-win_amd64.whl", hash = "sha256:8508887eb1c5f9537a4071768723ec7c30c28eb2518a00d0adcd32c89dea3221", size = 11329895, upload-time = "2024-11-21T00:49:03.845Z" }, + { url = "https://files.pythonhosted.org/packages/c5/9d/a42a84e10f1744dd27c6f2f9280cc3fb98f869dd19b7cd042e391ee2ab61/onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1f56e898815963d6dc4ee1c35fc6c36506466eff6d16f3cb9848cea4e8c8172", size = 11952833, upload-time = "2024-11-21T00:49:10.563Z" }, + { url = "https://files.pythonhosted.org/packages/47/42/2f71f5680834688a9c81becbe5c5bb996fd33eaed5c66ae0606c3b1d6a02/onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bb71a814f66517a65628c9e4a2bb530a6edd2cd5d87ffa0af0f6f773a027d99e", size = 13333903, upload-time = "2024-11-21T00:49:12.984Z" }, + { url = 
"https://files.pythonhosted.org/packages/c8/f1/aabfdf91d013320aa2fc46cf43c88ca0182860ff15df872b4552254a9680/onnxruntime-1.20.1-cp312-cp312-win32.whl", hash = "sha256:bd386cc9ee5f686ee8a75ba74037750aca55183085bf1941da8efcfe12d5b120", size = 9814562, upload-time = "2024-11-21T00:49:15.453Z" }, + { url = "https://files.pythonhosted.org/packages/dd/80/76979e0b744307d488c79e41051117634b956612cc731f1028eb17ee7294/onnxruntime-1.20.1-cp312-cp312-win_amd64.whl", hash = "sha256:19c2d843eb074f385e8bbb753a40df780511061a63f9def1b216bf53860223fb", size = 11331482, upload-time = "2024-11-21T00:49:19.412Z" }, + { url = "https://files.pythonhosted.org/packages/81/0d/13bbd9489be2a6944f4a940084bfe388f1100472f38c07080a46fbd4ab96/onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fb44b08e017a648924dbe91b82d89b0c105b1adcfe31e90d1dc06b8677ad37be", size = 11951459, upload-time = "2024-11-21T00:49:26.269Z" }, + { url = "https://files.pythonhosted.org/packages/c0/ea/4454ae122874fd52bbb8a961262de81c5f932edeb1b72217f594c700d6ef/onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bda6aebdf7917c1d811f21d41633df00c58aff2bef2f598f69289c1f1dabc4b3", size = 13331620, upload-time = "2024-11-21T00:49:28.875Z" }, + { url = "https://files.pythonhosted.org/packages/d8/e0/50db43188ca1c945decaa8fc2a024c33446d31afed40149897d4f9de505f/onnxruntime-1.20.1-cp313-cp313-win_amd64.whl", hash = "sha256:d30367df7e70f1d9fc5a6a68106f5961686d39b54d3221f760085524e8d38e16", size = 11331758, upload-time = "2024-11-21T00:49:31.417Z" }, + { url = "https://files.pythonhosted.org/packages/d8/55/3821c5fd60b52a6c82a00bba18531793c93c4addfe64fbf061e235c5617a/onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9158465745423b2b5d97ed25aa7740c7d38d2993ee2e5c3bfacb0c4145c49d8", size = 11950342, upload-time = "2024-11-21T00:49:34.164Z" }, + { url = "https://files.pythonhosted.org/packages/14/56/fd990ca222cef4f9f4a9400567b9a15b220dee2eafffb16b2adbc55c8281/onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0df6f2df83d61f46e842dbcde610ede27218947c33e994545a22333491e72a3b", size = 13337040, upload-time = "2024-11-21T00:49:37.271Z" }, +] + +[[package]] +name = "onnxruntime" +version = "1.23.2" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.14' and sys_platform == 'darwin'", + "python_full_version == '3.13.*' and sys_platform == 'darwin'", + "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version == '3.12.*' and sys_platform == 'darwin'", + "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version < '3.11' and sys_platform == 'darwin'", + "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "coloredlogs", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, + { name = "flatbuffers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, + { name = "numpy", version = "2.2.6", 
source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform == 'darwin')" }, + { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.11' and sys_platform == 'darwin')" }, + { name = "packaging", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, + { name = "protobuf", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, + { name = "sympy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/35/d6/311b1afea060015b56c742f3531168c1644650767f27ef40062569960587/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:a7730122afe186a784660f6ec5807138bf9d792fa1df76556b27307ea9ebcbe3", size = 17195934, upload-time = "2025-10-27T23:06:14.143Z" }, + { url = "https://files.pythonhosted.org/packages/db/db/81bf3d7cecfbfed9092b6b4052e857a769d62ed90561b410014e0aae18db/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:b28740f4ecef1738ea8f807461dd541b8287d5650b5be33bca7b474e3cbd1f36", size = 19153079, upload-time = "2025-10-27T23:05:57.686Z" }, + { url = "https://files.pythonhosted.org/packages/2e/4d/a382452b17cf70a2313153c520ea4c96ab670c996cb3a95cc5d5ac7bfdac/onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8f7d1fe034090a1e371b7f3ca9d3ccae2fabae8c1d8844fb7371d1ea38e8e8d2", size = 15219883, upload-time = "2025-10-22T03:46:21.66Z" }, + { url = "https://files.pythonhosted.org/packages/fb/56/179bf90679984c85b417664c26aae4f427cba7514bd2d65c43b181b7b08b/onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ca88747e708e5c67337b0f65eed4b7d0dd70d22ac332038c9fc4635760018f7", size = 17370357, upload-time = "2025-10-22T03:46:57.968Z" }, + { url = "https://files.pythonhosted.org/packages/44/be/467b00f09061572f022ffd17e49e49e5a7a789056bad95b54dfd3bee73ff/onnxruntime-1.23.2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:6f91d2c9b0965e86827a5ba01531d5b669770b01775b23199565d6c1f136616c", size = 17196113, upload-time = "2025-10-22T03:47:33.526Z" }, + { url = "https://files.pythonhosted.org/packages/9f/a8/3c23a8f75f93122d2b3410bfb74d06d0f8da4ac663185f91866b03f7da1b/onnxruntime-1.23.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:87d8b6eaf0fbeb6835a60a4265fde7a3b60157cf1b2764773ac47237b4d48612", size = 19153857, upload-time = "2025-10-22T03:46:37.578Z" }, + { url = "https://files.pythonhosted.org/packages/3f/d8/506eed9af03d86f8db4880a4c47cd0dffee973ef7e4f4cff9f1d4bcf7d22/onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bbfd2fca76c855317568c1b36a885ddea2272c13cb0e395002c402f2360429a6", size = 15220095, upload-time = "2025-10-22T03:46:24.769Z" }, + { url = "https://files.pythonhosted.org/packages/e9/80/113381ba832d5e777accedc6cb41d10f9eca82321ae31ebb6bcede530cea/onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da44b99206e77734c5819aa2142c69e64f3b46edc3bd314f6a45a932defc0b3e", size = 17372080, upload-time = "2025-10-22T03:47:00.265Z" }, + { url = 
"https://files.pythonhosted.org/packages/1b/9e/f748cd64161213adeef83d0cb16cb8ace1e62fa501033acdd9f9341fff57/onnxruntime-1.23.2-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:b8f029a6b98d3cf5be564d52802bb50a8489ab73409fa9db0bf583eabb7c2321", size = 17195929, upload-time = "2025-10-22T03:47:36.24Z" }, + { url = "https://files.pythonhosted.org/packages/91/9d/a81aafd899b900101988ead7fb14974c8a58695338ab6a0f3d6b0100f30b/onnxruntime-1.23.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:218295a8acae83905f6f1aed8cacb8e3eb3bd7513a13fe4ba3b2664a19fc4a6b", size = 19157705, upload-time = "2025-10-22T03:46:40.415Z" }, + { url = "https://files.pythonhosted.org/packages/3c/35/4e40f2fba272a6698d62be2cd21ddc3675edfc1a4b9ddefcc4648f115315/onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76ff670550dc23e58ea9bc53b5149b99a44e63b34b524f7b8547469aaa0dcb8c", size = 15226915, upload-time = "2025-10-22T03:46:27.773Z" }, + { url = "https://files.pythonhosted.org/packages/ef/88/9cc25d2bafe6bc0d4d3c1db3ade98196d5b355c0b273e6a5dc09c5d5d0d5/onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f9b4ae77f8e3c9bee50c27bc1beede83f786fe1d52e99ac85aa8d65a01e9b77", size = 17382649, upload-time = "2025-10-22T03:47:02.782Z" }, + { url = "https://files.pythonhosted.org/packages/3d/41/fba0cabccecefe4a1b5fc8020c44febb334637f133acefc7ec492029dd2c/onnxruntime-1.23.2-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:2ff531ad8496281b4297f32b83b01cdd719617e2351ffe0dba5684fb283afa1f", size = 17196337, upload-time = "2025-10-22T03:46:35.168Z" }, + { url = "https://files.pythonhosted.org/packages/fe/f9/2d49ca491c6a986acce9f1d1d5fc2099108958cc1710c28e89a032c9cfe9/onnxruntime-1.23.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:162f4ca894ec3de1a6fd53589e511e06ecdc3ff646849b62a9da7489dee9ce95", size = 19157691, upload-time = "2025-10-22T03:46:43.518Z" }, + { url = "https://files.pythonhosted.org/packages/1c/a1/428ee29c6eaf09a6f6be56f836213f104618fb35ac6cc586ff0f477263eb/onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45d127d6e1e9b99d1ebeae9bcd8f98617a812f53f46699eafeb976275744826b", size = 15226898, upload-time = "2025-10-22T03:46:30.039Z" }, + { url = "https://files.pythonhosted.org/packages/f2/2b/b57c8a2466a3126dbe0a792f56ad7290949b02f47b86216cd47d857e4b77/onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8bace4e0d46480fbeeb7bbe1ffe1f080e6663a42d1086ff95c1551f2d39e7872", size = 17382518, upload-time = "2025-10-22T03:47:05.407Z" }, + { url = "https://files.pythonhosted.org/packages/7c/3d/6830fa61c69ca8e905f237001dbfc01689a4e4ab06147020a4518318881f/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9d2385e774f46ac38f02b3a91a91e30263d41b2f1f4f26ae34805b2a9ddef466", size = 15229610, upload-time = "2025-10-22T03:46:32.239Z" }, + { url = "https://files.pythonhosted.org/packages/b6/ca/862b1e7a639460f0ca25fd5b6135fb42cf9deea86d398a92e44dfda2279d/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e2b9233c4947907fd1818d0e581c049c41ccc39b2856cc942ff6d26317cee145", size = 17394184, upload-time = "2025-10-22T03:47:08.127Z" }, +] + [[package]] name = "openai" version = "2.7.1" @@ -1733,6 +1989,18 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/86/8a/69176a64335aed183529207ba8bc3d329c2999d852b4f3818027203f50e6/opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca", size = 39402386, upload-time = "2025-01-16T13:52:56.418Z" }, ] +[[package]] +name = "openpyxl" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "et-xmlfile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" }, +] + [[package]] name = "opt-einsum" version = "3.3.0" @@ -1821,6 +2089,68 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a3/6f/042d73453ad01a2e3dd4adeae4870e25679d9eaa340d70bd54cd409cb982/paddlepaddle-3.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:d16fd322246c4a580ca512971840ac8c16126a77d59a7bceee165d8f2196a6b2", size = 101708504, upload-time = "2025-10-30T13:21:18.125Z" }, ] +[[package]] +name = "pandas" +version = "2.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "python-dateutil" }, + { name = "pytz" }, + { name = "tzdata" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3d/f7/f425a00df4fcc22b292c6895c6831c0c8ae1d9fac1e024d16f98a9ce8749/pandas-2.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c", size = 11555763, upload-time = "2025-09-29T23:16:53.287Z" }, + { url = "https://files.pythonhosted.org/packages/13/4f/66d99628ff8ce7857aca52fed8f0066ce209f96be2fede6cef9f84e8d04f/pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a", size = 10801217, upload-time = "2025-09-29T23:17:04.522Z" }, + { url = "https://files.pythonhosted.org/packages/1d/03/3fc4a529a7710f890a239cc496fc6d50ad4a0995657dccc1d64695adb9f4/pandas-2.3.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5caf26f64126b6c7aec964f74266f435afef1c1b13da3b0636c7518a1fa3e2b1", size = 12148791, upload-time = "2025-09-29T23:17:18.444Z" }, + { url = "https://files.pythonhosted.org/packages/40/a8/4dac1f8f8235e5d25b9955d02ff6f29396191d4e665d71122c3722ca83c5/pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd7478f1463441ae4ca7308a70e90b33470fa593429f9d4c578dd00d1fa78838", size = 12769373, upload-time = "2025-09-29T23:17:35.846Z" }, + { url = 
"https://files.pythonhosted.org/packages/df/91/82cc5169b6b25440a7fc0ef3a694582418d875c8e3ebf796a6d6470aa578/pandas-2.3.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4793891684806ae50d1288c9bae9330293ab4e083ccd1c5e383c34549c6e4250", size = 13200444, upload-time = "2025-09-29T23:17:49.341Z" }, + { url = "https://files.pythonhosted.org/packages/10/ae/89b3283800ab58f7af2952704078555fa60c807fff764395bb57ea0b0dbd/pandas-2.3.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:28083c648d9a99a5dd035ec125d42439c6c1c525098c58af0fc38dd1a7a1b3d4", size = 13858459, upload-time = "2025-09-29T23:18:03.722Z" }, + { url = "https://files.pythonhosted.org/packages/85/72/530900610650f54a35a19476eca5104f38555afccda1aa11a92ee14cb21d/pandas-2.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:503cf027cf9940d2ceaa1a93cfb5f8c8c7e6e90720a2850378f0b3f3b1e06826", size = 11346086, upload-time = "2025-09-29T23:18:18.505Z" }, + { url = "https://files.pythonhosted.org/packages/c1/fa/7ac648108144a095b4fb6aa3de1954689f7af60a14cf25583f4960ecb878/pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523", size = 11578790, upload-time = "2025-09-29T23:18:30.065Z" }, + { url = "https://files.pythonhosted.org/packages/9b/35/74442388c6cf008882d4d4bdfc4109be87e9b8b7ccd097ad1e7f006e2e95/pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45", size = 10833831, upload-time = "2025-09-29T23:38:56.071Z" }, + { url = "https://files.pythonhosted.org/packages/fe/e4/de154cbfeee13383ad58d23017da99390b91d73f8c11856f2095e813201b/pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66", size = 12199267, upload-time = "2025-09-29T23:18:41.627Z" }, + { url = "https://files.pythonhosted.org/packages/bf/c9/63f8d545568d9ab91476b1818b4741f521646cbdd151c6efebf40d6de6f7/pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b", size = 12789281, upload-time = "2025-09-29T23:18:56.834Z" }, + { url = "https://files.pythonhosted.org/packages/f2/00/a5ac8c7a0e67fd1a6059e40aa08fa1c52cc00709077d2300e210c3ce0322/pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791", size = 13240453, upload-time = "2025-09-29T23:19:09.247Z" }, + { url = "https://files.pythonhosted.org/packages/27/4d/5c23a5bc7bd209231618dd9e606ce076272c9bc4f12023a70e03a86b4067/pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151", size = 13890361, upload-time = "2025-09-29T23:19:25.342Z" }, + { url = "https://files.pythonhosted.org/packages/8e/59/712db1d7040520de7a4965df15b774348980e6df45c129b8c64d0dbe74ef/pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c", size = 11348702, upload-time = "2025-09-29T23:19:38.296Z" }, + { url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" }, + { url = 
"https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" }, + { url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" }, + { url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" }, + { url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" }, + { url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" }, + { url = "https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" }, + { url = "https://files.pythonhosted.org/packages/cd/4b/18b035ee18f97c1040d94debd8f2e737000ad70ccc8f5513f4eefad75f4b/pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713", size = 11544671, upload-time = "2025-09-29T23:21:05.024Z" }, + { url = "https://files.pythonhosted.org/packages/31/94/72fac03573102779920099bcac1c3b05975c2cb5f01eac609faf34bed1ca/pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8", size = 10680807, upload-time = "2025-09-29T23:21:15.979Z" }, + { url = "https://files.pythonhosted.org/packages/16/87/9472cf4a487d848476865321de18cc8c920b8cab98453ab79dbbc98db63a/pandas-2.3.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d", size = 11709872, upload-time = "2025-09-29T23:21:27.165Z" }, + { url = "https://files.pythonhosted.org/packages/15/07/284f757f63f8a8d69ed4472bfd85122bd086e637bf4ed09de572d575a693/pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac", size = 12306371, upload-time = "2025-09-29T23:21:40.532Z" }, + { url = "https://files.pythonhosted.org/packages/33/81/a3afc88fca4aa925804a27d2676d22dcd2031c2ebe08aabd0ae55b9ff282/pandas-2.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c", size = 12765333, upload-time = "2025-09-29T23:21:55.77Z" }, + { url = 
"https://files.pythonhosted.org/packages/8d/0f/b4d4ae743a83742f1153464cf1a8ecfafc3ac59722a0b5c8602310cb7158/pandas-2.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493", size = 13418120, upload-time = "2025-09-29T23:22:10.109Z" }, + { url = "https://files.pythonhosted.org/packages/4f/c7/e54682c96a895d0c808453269e0b5928a07a127a15704fedb643e9b0a4c8/pandas-2.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee", size = 10993991, upload-time = "2025-09-29T23:25:04.889Z" }, + { url = "https://files.pythonhosted.org/packages/f9/ca/3f8d4f49740799189e1395812f3bf23b5e8fc7c190827d55a610da72ce55/pandas-2.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5", size = 12048227, upload-time = "2025-09-29T23:22:24.343Z" }, + { url = "https://files.pythonhosted.org/packages/0e/5a/f43efec3e8c0cc92c4663ccad372dbdff72b60bdb56b2749f04aa1d07d7e/pandas-2.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21", size = 11411056, upload-time = "2025-09-29T23:22:37.762Z" }, + { url = "https://files.pythonhosted.org/packages/46/b1/85331edfc591208c9d1a63a06baa67b21d332e63b7a591a5ba42a10bb507/pandas-2.3.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78", size = 11645189, upload-time = "2025-09-29T23:22:51.688Z" }, + { url = "https://files.pythonhosted.org/packages/44/23/78d645adc35d94d1ac4f2a3c4112ab6f5b8999f4898b8cdf01252f8df4a9/pandas-2.3.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110", size = 12121912, upload-time = "2025-09-29T23:23:05.042Z" }, + { url = "https://files.pythonhosted.org/packages/53/da/d10013df5e6aaef6b425aa0c32e1fc1f3e431e4bcabd420517dceadce354/pandas-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86", size = 12712160, upload-time = "2025-09-29T23:23:28.57Z" }, + { url = "https://files.pythonhosted.org/packages/bd/17/e756653095a083d8a37cbd816cb87148debcfcd920129b25f99dd8d04271/pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc", size = 13199233, upload-time = "2025-09-29T23:24:24.876Z" }, + { url = "https://files.pythonhosted.org/packages/04/fd/74903979833db8390b73b3a8a7d30d146d710bd32703724dd9083950386f/pandas-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0", size = 11540635, upload-time = "2025-09-29T23:25:52.486Z" }, + { url = "https://files.pythonhosted.org/packages/21/00/266d6b357ad5e6d3ad55093a7e8efc7dd245f5a842b584db9f30b0f0a287/pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593", size = 10759079, upload-time = "2025-09-29T23:26:33.204Z" }, + { url = "https://files.pythonhosted.org/packages/ca/05/d01ef80a7a3a12b2f8bbf16daba1e17c98a2f039cbc8e2f77a2c5a63d382/pandas-2.3.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c", size = 11814049, upload-time = "2025-09-29T23:27:15.384Z" }, + { url = 
"https://files.pythonhosted.org/packages/15/b2/0e62f78c0c5ba7e3d2c5945a82456f4fac76c480940f805e0b97fcbc2f65/pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b", size = 12332638, upload-time = "2025-09-29T23:27:51.625Z" }, + { url = "https://files.pythonhosted.org/packages/c5/33/dd70400631b62b9b29c3c93d2feee1d0964dc2bae2e5ad7a6c73a7f25325/pandas-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6", size = 12886834, upload-time = "2025-09-29T23:28:21.289Z" }, + { url = "https://files.pythonhosted.org/packages/d3/18/b5d48f55821228d0d2692b34fd5034bb185e854bdb592e9c640f6290e012/pandas-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3", size = 13409925, upload-time = "2025-09-29T23:28:58.261Z" }, + { url = "https://files.pythonhosted.org/packages/a6/3d/124ac75fcd0ecc09b8fdccb0246ef65e35b012030defb0e0eba2cbbbe948/pandas-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5", size = 11109071, upload-time = "2025-09-29T23:32:27.484Z" }, + { url = "https://files.pythonhosted.org/packages/89/9c/0e21c895c38a157e0faa1fb64587a9226d6dd46452cac4532d80c3c4a244/pandas-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec", size = 12048504, upload-time = "2025-09-29T23:29:31.47Z" }, + { url = "https://files.pythonhosted.org/packages/d7/82/b69a1c95df796858777b68fbe6a81d37443a33319761d7c652ce77797475/pandas-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7", size = 11410702, upload-time = "2025-09-29T23:29:54.591Z" }, + { url = "https://files.pythonhosted.org/packages/f9/88/702bde3ba0a94b8c73a0181e05144b10f13f29ebfc2150c3a79062a8195d/pandas-2.3.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450", size = 11634535, upload-time = "2025-09-29T23:30:21.003Z" }, + { url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" }, + { url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" }, + { url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" }, +] + [[package]] name = "pdfminer-six" version = "20250506" @@ -2266,6 +2596,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ae/43/2b0607ef7f16d63fbe00de728151a090397ef5b3b9147b4aefe975d17106/pypdfium2-5.0.0-py3-none-win_arm64.whl", hash = "sha256:0a2a473fe95802e7a5f4140f25e5cd036cf17f060f27ee2d28c3977206add763", size = 2939015, upload-time = 
"2025-10-26T13:31:40.531Z" }, ] +[[package]] +name = "pyreadline3" +version = "3.5.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0f/49/4cea918a08f02817aabae639e3d0ac046fef9f9180518a3ad394e22da148/pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7", size = 99839, upload-time = "2024-09-19T02:40:10.062Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/dc/491b7661614ab97483abf2056be1deee4dc2490ecbf7bff9ab5cdbac86e1/pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6", size = 83178, upload-time = "2024-09-19T02:40:08.598Z" }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -2291,6 +2630,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7", size = 252987, upload-time = "2025-06-16T20:46:22.506Z" }, ] +[[package]] +name = "python-dotenv" +version = "1.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" }, +] + [[package]] name = "python-pptx" version = "1.0.2" @@ -2306,6 +2654,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" }, ] +[[package]] +name = "pytz" +version = "2025.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" }, +] + [[package]] name = "pyyaml" version = "6.0.3" @@ -2654,7 +3011,8 @@ source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version < '3.11' and sys_platform == 'darwin'", "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.11' and sys_platform == 'win32'", + "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform 
== 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", ] dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, @@ -2717,14 +3075,18 @@ resolution-markers = [ "python_full_version == '3.13.*' and sys_platform == 'darwin'", "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')", - "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", + "python_full_version == '3.13.*' and sys_platform == 'win32'", + "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", "python_full_version == '3.12.*' and sys_platform == 'darwin'", "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.12.*' and sys_platform == 'win32'", + "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", "python_full_version == '3.11.*' and sys_platform == 'darwin'", "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.11.*' and sys_platform == 'win32'", + "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", ] dependencies = [ { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -3083,6 +3445,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/97/45/1097a0eb3ad2a1cb5a29d493aefffe4170e454b2726cfe2a76f6652e91af/stringzilla-4.2.3-cp313-cp313-win_arm64.whl", hash = "sha256:1000a8df547fb3b194def48cc3ed6494715fe1c8ac25c6444ed3cfb0b52942c7", size = 99815, upload-time = "2025-10-27T18:35:56.555Z" }, ] +[[package]] +name = "sympy" +version = "1.14.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mpmath" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, +] + [[package]] name = "termcolor" version = "3.2.0" @@ -3116,7 +3490,8 @@ source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version < '3.11' and sys_platform == 'darwin'", "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.11' and sys_platform == 'win32'", + "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", ] dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, @@ -3135,14 +3510,18 @@ resolution-markers = [ "python_full_version == '3.13.*' and sys_platform == 'darwin'", "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')", - "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", + "python_full_version == '3.13.*' and sys_platform == 'win32'", + "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", "python_full_version == '3.12.*' and sys_platform == 'darwin'", "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.12.*' and sys_platform == 'win32'", + "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", "python_full_version == '3.11.*' and sys_platform == 'darwin'", "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 
'linux'", - "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.11.*' and sys_platform == 'win32'", + "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and sys_platform != 'win32')", ] dependencies = [ { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -3185,6 +3564,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, ] +[[package]] +name = "tzdata" +version = "2025.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" }, +] + [[package]] name = "unidic-lite" version = "1.0.8"