mirror of
https://github.com/Tencent/WeKnora.git
synced 2025-11-25 03:15:00 +08:00
移除日志设置与冗余代码,优化导入、类型提示及OCR后端管理 统一调整各文件模块导入路径为绝对导入 调整导入路径,移除部分导入,优化日志及注释 升级文档解析器为 Docx2Parser,优化超时与图片处理逻辑
36 lines
1.2 KiB
Python
36 lines
1.2 KiB
Python
"""
|
|
Parser module for WeKnora document processing system.
|
|
|
|
This module provides document parsers for various file formats including:
|
|
- Microsoft Word documents (.doc, .docx)
|
|
- PDF documents
|
|
- Markdown files
|
|
- Plain text files
|
|
- Images with text content
|
|
- Web pages
|
|
|
|
The parsers extract content from documents and can split them into
|
|
meaningful chunks for further processing and indexing.
|
|
"""
|
|
|
|
from .doc_parser import DocParser
|
|
from .docx2_parser import Docx2Parser
|
|
from .image_parser import ImageParser
|
|
from .markdown_parser import MarkdownParser
|
|
from .parser import Parser
|
|
from .pdf_parser import PDFParser
|
|
from .text_parser import TextParser
|
|
from .web_parser import WebParser
|
|
|
|
# Export public classes and modules
|
|
__all__ = [
|
|
"Docx2Parser", # Parser for .docx files (modern Word documents)
|
|
"DocParser", # Parser for .doc files (legacy Word documents)
|
|
"PDFParser", # Parser for PDF documents
|
|
"MarkdownParser", # Parser for Markdown text files
|
|
"TextParser", # Parser for plain text files
|
|
"ImageParser", # Parser for images with text content
|
|
"WebParser", # Parser for web pages
|
|
"Parser", # Main parser factory that selects the appropriate parser
|
|
]
|