refactor: 优化解析器日志与API检查逻辑,简化异常处理

This commit is contained in:
begoniezhao
2025-11-20 15:01:14 +08:00
parent 587d1b2bd3
commit 154025f723
2 changed files with 12 additions and 11 deletions

View File

@@ -6,6 +6,7 @@ from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
# Module-level logger named after this module; its level is set to INFO here.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class FirstParser(BaseParser):
@@ -16,16 +17,15 @@ class FirstParser(BaseParser):
self._parsers: List[BaseParser] = []
for parser_cls in self._parser_cls:
try:
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
except Exception as e:
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
def parse_into_text(self, content: bytes) -> Document:
    """Return the first valid document produced by the configured parsers.

    Every parser in ``self._parsers`` is tried in order against the same
    ``content``. The first result whose ``is_valid()`` check passes is
    returned immediately and the remaining parsers are not run.

    Args:
        content: Raw bytes handed unchanged to each parser.

    Returns:
        The first valid parsed document, or an empty ``Document()`` when no
        parser yields a valid result (including when there are no parsers).
    """
    for candidate in self._parsers:
        parser_name = candidate.__class__.__name__
        logger.info(f"FirstParser: using parser {parser_name}")
        parsed = candidate.parse_into_text(content)
        if not parsed.is_valid():
            # Invalid result: fall through to the next parser in the chain.
            continue
        logger.info(f"FirstParser: parser {parser_name} succeeded")
        return parsed
    return Document()
@@ -43,16 +43,14 @@ class PipelineParser(BaseParser):
self._parsers: List[BaseParser] = []
for parser_cls in self._parser_cls:
try:
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
except Exception as e:
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
def parse_into_text(self, content: bytes) -> Document:
images: Dict[str, str] = {}
document = Document()
for p in self._parsers:
logger.info(f"PipelineParser: using parser {p.__class__.__name__}")
document = p.parse_into_text(content)
content = endecode.encode_bytes(document.content)
images.update(document.images)

View File

@@ -28,7 +28,6 @@ class StdMinerUParser(BaseParser):
self.image_helper = MarkdownImageUtil()
self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
self.enable = self.ping()
assert self.ping(), "MinerU API is not reachable"
def ping(self, timeout: int = 5) -> bool:
try:
@@ -41,6 +40,10 @@ class StdMinerUParser(BaseParser):
return False
def parse_into_text(self, content: bytes) -> Document:
if not self.enable:
logger.debug("MinerU API is not enabled")
return Document()
logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)")
md_content: str = ""
images_b64: Dict[str, str] = {}