# Mirror of https://github.com/Tencent/WeKnora.git
# Synced 2025-11-25 03:15:00 +08:00
# 133 lines, 4.5 KiB, Python
import logging
|
|
import os
|
|
import re
|
|
from typing import Dict
|
|
|
|
import markdownify
|
|
import requests
|
|
|
|
from docreader.models.document import Document
|
|
from docreader.parser.base_parser import BaseParser
|
|
from docreader.parser.chain_parser import PipelineParser
|
|
from docreader.parser.markdown_parser import MarkdownImageUtil, MarkdownTableFormatter
|
|
from docreader.utils import endecode
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class StdMinerUParser(BaseParser):
    """Parse a document by delegating the work to a MinerU HTTP service.

    The raw bytes are posted to the service's ``/file_parse`` endpoint. The
    returned markdown is optionally run through ``markdownify``; every
    returned image that the markdown actually references is uploaded via
    ``self.storage`` and its ``images/<name>`` path in the markdown is
    replaced with the uploaded URL.
    """

    # Extension used when neither the data URI nor the image name reveals one.
    _DEFAULT_IMAGE_EXT = "png"

    def __init__(
        self,
        enable_markdownify: bool = True,
        mineru_endpoint: str = "",
        **kwargs,
    ):
        """Configure the parser and probe the MinerU service once.

        Args:
            enable_markdownify: When true, the markdown returned by MinerU is
                passed through ``markdownify.markdownify``.
            mineru_endpoint: Base URL of the MinerU service. Only used when
                the ``MINERU_ENDPOINT`` environment variable is not set.
            **kwargs: Forwarded unchanged to ``BaseParser.__init__``.
        """
        super().__init__(**kwargs)
        # The environment variable takes precedence over the argument.
        self.minerU = os.getenv("MINERU_ENDPOINT", mineru_endpoint)
        self.enable_markdownify = enable_markdownify
        self.image_helper = MarkdownImageUtil()
        # Splits a data URI into (image subtype, base64 payload).
        self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
        # Availability is decided once, at construction time.
        self.enable = self.ping()

    def ping(self, timeout: int = 5) -> bool:
        """Return True when ``<endpoint>/docs`` answers with a success status.

        Args:
            timeout: Timeout handed to ``requests.get``.

        Returns:
            False when no endpoint is configured, the request fails, or the
            response status is an error; True otherwise. Never raises.
        """
        if not self.minerU:
            # Without a base URL the request could only fail; skip it.
            return False
        try:
            response = requests.get(
                self.minerU + "/docs", timeout=timeout, allow_redirects=True
            )
            response.raise_for_status()
        except Exception:
            # Best-effort probe: any failure just marks the parser disabled.
            return False
        return True

    def parse_into_text(self, content: bytes) -> Document:
        """Send ``content`` to MinerU and build a ``Document`` from the reply.

        Args:
            content: Raw bytes of the file to parse.

        Returns:
            A ``Document`` whose ``content`` is the markdown (image paths
            rewritten to uploaded URLs) and whose ``images`` maps each
            uploaded URL to the image's base64 string. An empty ``Document``
            is returned when the service is disabled or the request fails.
        """
        if not self.enable:
            logger.debug("MinerU API is not enabled")
            return Document()

        logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)")
        md_content: str = ""
        images_b64: Dict[str, str] = {}
        try:
            response = requests.post(
                url=self.minerU + "/file_parse",
                data={
                    "return_md": True,
                    "return_images": True,
                    "lang_list": ["ch", "en"],
                    "table_enable": True,
                    "formula_enable": True,
                    "parse_method": "auto",
                    "start_page_id": 0,
                    "end_page_id": 99999,
                    "backend": "pipeline",
                    "response_format_zip": False,
                    "return_middle_json": False,
                    "return_model_output": False,
                    "return_content_list": False,
                },
                files={"files": content},
                timeout=1000,
            )
            response.raise_for_status()
            result = response.json()["results"]["files"]
            md_content = result["md_content"]
            # Tolerate both a missing key and an explicit null.
            images_b64 = result.get("images") or {}
        except Exception as e:
            logger.error(f"MinerU parsing failed: {e}", exc_info=True)
            return Document()

        # Convert HTML tables embedded in the markdown into markdown tables.
        if self.enable_markdownify:
            logger.debug("Converting HTML to Markdown")
            md_content = markdownify.markdownify(md_content)

        images: Dict[str, str] = {}
        image_replace: Dict[str, str] = {}
        # Some returned images are not referenced by the markdown (for
        # example table renderings), so only the referenced ones are kept.
        for ipath, b64_str in images_b64.items():
            if f"images/{ipath}" not in md_content:
                logger.debug(f"Image {ipath} not used in markdown")
                continue

            match = self.base64_pattern.match(b64_str)
            if match:
                file_ext = match.group(1)
                b64_str = match.group(2)
            else:
                # No data-URI prefix: take the extension from the image name
                # so it is never undefined or carried over from a previous
                # image.
                file_ext = (
                    os.path.splitext(ipath)[1].lstrip(".")
                    or self._DEFAULT_IMAGE_EXT
                )

            # NOTE(review): despite its name, encode_image is used here to
            # turn the base64 text into raw bytes - confirm in utils.endecode.
            image_bytes = endecode.encode_image(b64_str, errors="ignore")
            if not image_bytes:
                logger.error("Failed to decode base64 image skip it")
                continue

            image_url = self.storage.upload_bytes(
                image_bytes, file_ext=f".{file_ext}"
            )

            images[image_url] = b64_str
            image_replace[f"images/{ipath}"] = image_url

        logger.info(f"Replaced {len(image_replace)} images in markdown")
        text = self.image_helper.replace_path(md_content, image_replace)

        logger.info(
            f"Successfully parsed PDF, text: {len(text)}, images: {len(images)}"
        )
        return Document(content=text, images=images)
|
|
|
|
|
|
class MinerUParser(PipelineParser):
    """Pipeline that combines MinerU parsing with markdown table formatting.

    NOTE(review): presumably ``PipelineParser`` applies the classes listed in
    ``_parser_cls`` in order - confirm against ``chain_parser``.
    """

    _parser_cls = (
        StdMinerUParser,
        MarkdownTableFormatter,
    )
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: parse one local file through a MinerU service and
    # log the resulting text. Adjust the two placeholders before running.
    logging.basicConfig(level=logging.DEBUG)

    your_file = "/path/to/your/file.pdf"
    your_mineru = "http://host.docker.internal:9987"

    # The parser is built first, so the service is pinged before the file
    # is read.
    parser = MinerUParser(mineru_endpoint=your_mineru)
    with open(your_file, "rb") as pdf:
        content = pdf.read()

    document = parser.parse_into_text(content)
    logger.error(document.content)
|