mirror of
https://github.com/Tencent/WeKnora.git
synced 2025-11-25 19:37:45 +08:00
34 lines
1.0 KiB
Python
34 lines
1.0 KiB
Python
import asyncio
|
|
import re
|
|
import logging
|
|
import numpy as np
|
|
import os # Import os module to get environment variables
|
|
from typing import Dict, List, Optional, Tuple, Union, Any
|
|
from .base_parser import BaseParser
|
|
|
|
# Module-level logger, named after this module via __name__
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class MarkdownParser(BaseParser):
    """Parser for Markdown documents that yields their decoded text."""

    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
        """Decode a Markdown document and hand back its text as-is.

        No Markdown-specific processing happens here (images are not
        handled): the raw bytes are decoded with ``self.decode_bytes`` and
        the decoded value is returned directly.

        Args:
            content: Raw bytes of the Markdown document.

        Returns:
            The value produced by ``self.decode_bytes(content)``.
        """
        # Record the size of the raw input before any decoding takes place.
        logger.info(f"Parsing Markdown document, content size: {len(content)} bytes")

        # Decoding is delegated to the helper inherited from the parser base.
        text = self.decode_bytes(content)
        logger.info(f"Decoded Markdown content, text length: {len(text)} characters")

        # Nothing further is extracted; the decoded text is the final result.
        logger.info(f"Markdown parsing complete, extracted {len(text)} characters of text")
        return text
|
|
|