Files
WeKnora/docreader/parser/markdown_parser.py
2025-11-05 12:07:39 +08:00

34 lines
1.0 KiB
Python

import asyncio
import re
import logging
import numpy as np
import os # Import os module to get environment variables
from typing import Dict, List, Optional, Tuple, Union, Any
from .base_parser import BaseParser
# Module-level logger, named after this module via __name__
logger = logging.getLogger(__name__)
class MarkdownParser(BaseParser):
    """Parser for Markdown documents that yields their decoded text."""

    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
        """Decode a Markdown document and return its text as-is.

        No Markdown-specific processing happens here: the decoded text is
        returned unchanged, so images are not handled.

        Args:
            content: Raw bytes of the Markdown document.

        Returns:
            The value produced by ``self.decode_bytes(content)``.
        """
        byte_count = len(content)
        logger.info(f"Parsing Markdown document, content size: {byte_count} bytes")

        # Decoding is delegated to decode_bytes on self (presumably inherited
        # from BaseParser; its definition is not visible in this file).
        decoded = self.decode_bytes(content)
        char_count = len(decoded)

        logger.info(f"Decoded Markdown content, text length: {char_count} characters")
        logger.info(f"Markdown parsing complete, extracted {char_count} characters of text")
        return decoded