mirror of
https://github.com/Tencent/WeKnora.git
synced 2025-11-25 03:15:00 +08:00
refactor: 优化解析器日志与API检查逻辑,简化异常处理
This commit is contained in:
@@ -6,6 +6,7 @@ from docreader.parser.base_parser import BaseParser
|
|||||||
from docreader.utils import endecode
|
from docreader.utils import endecode
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
class FirstParser(BaseParser):
|
class FirstParser(BaseParser):
|
||||||
@@ -16,16 +17,15 @@ class FirstParser(BaseParser):
|
|||||||
|
|
||||||
self._parsers: List[BaseParser] = []
|
self._parsers: List[BaseParser] = []
|
||||||
for parser_cls in self._parser_cls:
|
for parser_cls in self._parser_cls:
|
||||||
try:
|
parser = parser_cls(*args, **kwargs)
|
||||||
parser = parser_cls(*args, **kwargs)
|
self._parsers.append(parser)
|
||||||
self._parsers.append(parser)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
|
|
||||||
|
|
||||||
def parse_into_text(self, content: bytes) -> Document:
|
def parse_into_text(self, content: bytes) -> Document:
|
||||||
for p in self._parsers:
|
for p in self._parsers:
|
||||||
|
logger.info(f"FirstParser: using parser {p.__class__.__name__}")
|
||||||
document = p.parse_into_text(content)
|
document = p.parse_into_text(content)
|
||||||
if document.is_valid():
|
if document.is_valid():
|
||||||
|
logger.info(f"FirstParser: parser {p.__class__.__name__} succeeded")
|
||||||
return document
|
return document
|
||||||
return Document()
|
return Document()
|
||||||
|
|
||||||
@@ -43,16 +43,14 @@ class PipelineParser(BaseParser):
|
|||||||
|
|
||||||
self._parsers: List[BaseParser] = []
|
self._parsers: List[BaseParser] = []
|
||||||
for parser_cls in self._parser_cls:
|
for parser_cls in self._parser_cls:
|
||||||
try:
|
parser = parser_cls(*args, **kwargs)
|
||||||
parser = parser_cls(*args, **kwargs)
|
self._parsers.append(parser)
|
||||||
self._parsers.append(parser)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
|
|
||||||
|
|
||||||
def parse_into_text(self, content: bytes) -> Document:
|
def parse_into_text(self, content: bytes) -> Document:
|
||||||
images: Dict[str, str] = {}
|
images: Dict[str, str] = {}
|
||||||
document = Document()
|
document = Document()
|
||||||
for p in self._parsers:
|
for p in self._parsers:
|
||||||
|
logger.info(f"PipelineParser: using parser {p.__class__.__name__}")
|
||||||
document = p.parse_into_text(content)
|
document = p.parse_into_text(content)
|
||||||
content = endecode.encode_bytes(document.content)
|
content = endecode.encode_bytes(document.content)
|
||||||
images.update(document.images)
|
images.update(document.images)
|
||||||
|
|||||||
@@ -28,7 +28,6 @@ class StdMinerUParser(BaseParser):
|
|||||||
self.image_helper = MarkdownImageUtil()
|
self.image_helper = MarkdownImageUtil()
|
||||||
self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
|
self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
|
||||||
self.enable = self.ping()
|
self.enable = self.ping()
|
||||||
assert self.ping(), "MinerU API is not reachable"
|
|
||||||
|
|
||||||
def ping(self, timeout: int = 5) -> bool:
|
def ping(self, timeout: int = 5) -> bool:
|
||||||
try:
|
try:
|
||||||
@@ -41,6 +40,10 @@ class StdMinerUParser(BaseParser):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def parse_into_text(self, content: bytes) -> Document:
|
def parse_into_text(self, content: bytes) -> Document:
|
||||||
|
if not self.enable:
|
||||||
|
logger.debug("MinerU API is not enabled")
|
||||||
|
return Document()
|
||||||
|
|
||||||
logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)")
|
logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)")
|
||||||
md_content: str = ""
|
md_content: str = ""
|
||||||
images_b64: Dict[str, str] = {}
|
images_b64: Dict[str, str] = {}
|
||||||
|
|||||||
Reference in New Issue
Block a user