mirror of
https://github.com/Tencent/WeKnora.git
synced 2025-11-25 03:15:00 +08:00
refactor: 优化解析器日志与API检查逻辑,简化异常处理
This commit is contained in:
@@ -6,6 +6,7 @@ from docreader.parser.base_parser import BaseParser
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
class FirstParser(BaseParser):
|
||||
@@ -16,16 +17,15 @@ class FirstParser(BaseParser):
|
||||
|
||||
self._parsers: List[BaseParser] = []
|
||||
for parser_cls in self._parser_cls:
|
||||
try:
|
||||
parser = parser_cls(*args, **kwargs)
|
||||
self._parsers.append(parser)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
|
||||
parser = parser_cls(*args, **kwargs)
|
||||
self._parsers.append(parser)
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
for p in self._parsers:
|
||||
logger.info(f"FirstParser: using parser {p.__class__.__name__}")
|
||||
document = p.parse_into_text(content)
|
||||
if document.is_valid():
|
||||
logger.info(f"FirstParser: parser {p.__class__.__name__} succeeded")
|
||||
return document
|
||||
return Document()
|
||||
|
||||
@@ -43,16 +43,14 @@ class PipelineParser(BaseParser):
|
||||
|
||||
self._parsers: List[BaseParser] = []
|
||||
for parser_cls in self._parser_cls:
|
||||
try:
|
||||
parser = parser_cls(*args, **kwargs)
|
||||
self._parsers.append(parser)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
|
||||
parser = parser_cls(*args, **kwargs)
|
||||
self._parsers.append(parser)
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
images: Dict[str, str] = {}
|
||||
document = Document()
|
||||
for p in self._parsers:
|
||||
logger.info(f"PipelineParser: using parser {p.__class__.__name__}")
|
||||
document = p.parse_into_text(content)
|
||||
content = endecode.encode_bytes(document.content)
|
||||
images.update(document.images)
|
||||
|
||||
@@ -28,7 +28,6 @@ class StdMinerUParser(BaseParser):
|
||||
self.image_helper = MarkdownImageUtil()
|
||||
self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
|
||||
self.enable = self.ping()
|
||||
assert self.ping(), "MinerU API is not reachable"
|
||||
|
||||
def ping(self, timeout: int = 5) -> bool:
|
||||
try:
|
||||
@@ -41,6 +40,10 @@ class StdMinerUParser(BaseParser):
|
||||
return False
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
if not self.enable:
|
||||
logger.debug("MinerU API is not enabled")
|
||||
return Document()
|
||||
|
||||
logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)")
|
||||
md_content: str = ""
|
||||
images_b64: Dict[str, str] = {}
|
||||
|
||||
Reference in New Issue
Block a user