Files
WeKnora/docreader/parser/chain_parser.py

72 lines
2.5 KiB
Python

import logging
from typing import Dict, List, Tuple, Type
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class FirstParser(BaseParser):
_parser_cls: Tuple[Type["BaseParser"], ...] = ()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._parsers: List[BaseParser] = []
for parser_cls in self._parser_cls:
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
def parse_into_text(self, content: bytes) -> Document:
for p in self._parsers:
logger.info(f"FirstParser: using parser {p.__class__.__name__}")
document = p.parse_into_text(content)
if document.is_valid():
logger.info(f"FirstParser: parser {p.__class__.__name__} succeeded")
return document
return Document()
@classmethod
def create(cls, *parser_classes: Type["BaseParser"]) -> Type["FirstParser"]:
names = "_".join([p.__name__ for p in parser_classes])
return type(f"FirstParser_{names}", (cls,), {"_parser_cls": parser_classes})
class PipelineParser(BaseParser):
_parser_cls: Tuple[Type["BaseParser"], ...] = ()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._parsers: List[BaseParser] = []
for parser_cls in self._parser_cls:
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
def parse_into_text(self, content: bytes) -> Document:
images: Dict[str, str] = {}
document = Document()
for p in self._parsers:
logger.info(f"PipelineParser: using parser {p.__class__.__name__}")
document = p.parse_into_text(content)
content = endecode.encode_bytes(document.content)
images.update(document.images)
document.images.update(images)
return document
@classmethod
def create(cls, *parser_classes: Type["BaseParser"]) -> Type["PipelineParser"]:
names = "_".join([p.__name__ for p in parser_classes])
return type(f"PipelineParser_{names}", (cls,), {"_parser_cls": parser_classes})
if __name__ == "__main__":
from docreader.parser.markdown_parser import MarkdownParser
cls = FirstParser.create(MarkdownParser)
parser = cls()
print(parser.parse_into_text(b"aaa"))