WeKnora/docreader/parser/mineru_parser.py

import logging
import os
import re
from typing import Dict

import markdownify
import requests

from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownImageUtil, MarkdownTableFormatter
from docreader.utils import endecode

logger = logging.getLogger(__name__)


class StdMinerUParser(BaseParser):
    def __init__(
        self,
        enable_markdownify: bool = True,
        mineru_endpoint: str = "",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.minerU = os.getenv("MINERU_ENDPOINT", mineru_endpoint)
        self.enable_markdownify = enable_markdownify
        self.image_helper = MarkdownImageUtil()
        self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
        self.enable = self.ping()

    def ping(self, timeout: int = 5) -> bool:
        try:
            response = requests.get(
                self.minerU + "/docs", timeout=timeout, allow_redirects=True
            )
            response.raise_for_status()
            return True
        except Exception:
            return False

    def parse_into_text(self, content: bytes) -> Document:
        if not self.enable:
            logger.debug("MinerU API is not enabled")
            return Document()

        logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)")
        md_content: str = ""
        images_b64: Dict[str, str] = {}
        try:
            response = requests.post(
                url=self.minerU + "/file_parse",
                data={
                    "return_md": True,
                    "return_images": True,
                    "lang_list": ["ch", "en"],
                    "table_enable": True,
                    "formula_enable": True,
                    "parse_method": "auto",
                    "start_page_id": 0,
                    "end_page_id": 99999,
                    "backend": "pipeline",
                    "response_format_zip": False,
                    "return_middle_json": False,
                    "return_model_output": False,
                    "return_content_list": False,
                },
                files={"files": content},
                timeout=1000,
            )
            response.raise_for_status()
            result = response.json()["results"]["files"]
            md_content = result["md_content"]
            images_b64 = result.get("images", {})
        except Exception as e:
            logger.error(f"MinerU parsing failed: {e}", exc_info=True)
            return Document()

        # convert table(HTML) in markdown to markdown table
        if self.enable_markdownify:
            logger.debug("Converting HTML to Markdown")
            md_content = markdownify.markdownify(md_content)

        images = {}
        image_replace = {}
        # image in images_bs64 may not be used in md_content
        # such as: table ...
        # so we need to filter them
        for ipath, b64_str in images_b64.items():
            if f"images/{ipath}" not in md_content:
                logger.debug(f"Image {ipath} not used in markdown")
                continue
            match = self.base64_pattern.match(b64_str)
            if match:
                file_ext = match.group(1)
                b64_str = match.group(2)

                image_bytes = endecode.encode_image(b64_str, errors="ignore")
                if not image_bytes:
                    logger.error("Failed to decode base64 image skip it")
                    continue

                image_url = self.storage.upload_bytes(
                    image_bytes, file_ext=f".{file_ext}"
                )

                images[image_url] = b64_str
                image_replace[f"images/{ipath}"] = image_url

        logger.info(f"Replaced {len(image_replace)} images in markdown")
        text = self.image_helper.replace_path(md_content, image_replace)

        logger.info(
            f"Successfully parsed PDF, text: {len(text)}, images: {len(images)}"
        )
        return Document(content=text, images=images)


class MinerUParser(PipelineParser):
    _parser_cls = (StdMinerUParser, MarkdownTableFormatter)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    your_file = "/path/to/your/file.pdf"
    your_mineru = "http://host.docker.internal:9987"
    parser = MinerUParser(mineru_endpoint=your_mineru)
    with open(your_file, "rb") as f:
        content = f.read()
        document = parser.parse_into_text(content)
        logger.error(document.content)