diff --git a/docreader/parser/chain_parser.py b/docreader/parser/chain_parser.py index 1dcaeb0..7edac88 100644 --- a/docreader/parser/chain_parser.py +++ b/docreader/parser/chain_parser.py @@ -6,6 +6,7 @@ from docreader.parser.base_parser import BaseParser from docreader.utils import endecode logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) class FirstParser(BaseParser): @@ -16,16 +17,15 @@ class FirstParser(BaseParser): self._parsers: List[BaseParser] = [] for parser_cls in self._parser_cls: - try: - parser = parser_cls(*args, **kwargs) - self._parsers.append(parser) - except Exception as e: - logger.error(f"Failed to create parser {parser_cls.__name__}: {e}") + parser = parser_cls(*args, **kwargs) + self._parsers.append(parser) def parse_into_text(self, content: bytes) -> Document: for p in self._parsers: + logger.info(f"FirstParser: using parser {p.__class__.__name__}") document = p.parse_into_text(content) if document.is_valid(): + logger.info(f"FirstParser: parser {p.__class__.__name__} succeeded") return document return Document() @@ -43,16 +43,14 @@ class PipelineParser(BaseParser): self._parsers: List[BaseParser] = [] for parser_cls in self._parser_cls: - try: - parser = parser_cls(*args, **kwargs) - self._parsers.append(parser) - except Exception as e: - logger.error(f"Failed to create parser {parser_cls.__name__}: {e}") + parser = parser_cls(*args, **kwargs) + self._parsers.append(parser) def parse_into_text(self, content: bytes) -> Document: images: Dict[str, str] = {} document = Document() for p in self._parsers: + logger.info(f"PipelineParser: using parser {p.__class__.__name__}") document = p.parse_into_text(content) content = endecode.encode_bytes(document.content) images.update(document.images) diff --git a/docreader/parser/mineru_parser.py b/docreader/parser/mineru_parser.py index 16f2a46..ef1c5ca 100644 --- a/docreader/parser/mineru_parser.py +++ b/docreader/parser/mineru_parser.py @@ -28,7 +28,6 @@ class StdMinerUParser(BaseParser): self.image_helper = MarkdownImageUtil() self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)") self.enable = self.ping() - assert self.ping(), "MinerU API is not reachable" def ping(self, timeout: int = 5) -> bool: try: @@ -41,6 +40,10 @@ class StdMinerUParser(BaseParser): return False def parse_into_text(self, content: bytes) -> Document: + if not self.enable: + logger.debug("MinerU API is not enabled") + return Document() + logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)") md_content: str = "" images_b64: Dict[str, str] = {}