chore(docreader): reorganize module files

begoniezhao
2025-11-05 11:33:50 +08:00
parent 0d790ffedc
commit c1f731e026
43 changed files with 3435 additions and 2398 deletions

docreader/Makefile Normal file

@@ -0,0 +1,23 @@
.PHONY: proto build run docker-build docker-run clean
# Generate protobuf code
proto:
@echo "Generating protobuf code..."
@sh ./scripts/generate_proto.sh
# Build the Go client
build:
@echo "Building Go client..."
@go build -o bin/client ./src/client
# Run the Python server
run:
@echo "Running Python server..."
@python src/server/server.py
# Clean up build artifacts and caches
clean:
@echo "Cleaning up..."
@rm -rf bin/
@find . -name "*.pyc" -delete
@find . -name "__pycache__" -delete

docreader/README.md Normal file

docreader/client/client.go Normal file

@@ -0,0 +1,115 @@
package client
import (
"fmt"
"log"
"os"
"time"
"github.com/Tencent/WeKnora/services/docreader/src/proto"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"google.golang.org/grpc/resolver"
)
const (
maxMessageSize = 50 * 1024 * 1024 // 50MB
)
var (
// Logger is the default logger used by the client
Logger = log.New(os.Stdout, "[DocReader] ", log.LstdFlags|log.Lmicroseconds)
)
// ImageInfo describes a single image extracted from a document
type ImageInfo struct {
	URL         string // image URL (COS)
	Caption     string // image caption
	OCRText     string // text extracted via OCR
	OriginalURL string // original image URL
	Start       int    // start offset of the image in the text
	End         int    // end offset of the image in the text
}
// Client represents a DocReader service client
type Client struct {
conn *grpc.ClientConn
proto.DocReaderClient
debug bool
}
// NewClient creates a new DocReader client with the specified address
func NewClient(addr string) (*Client, error) {
Logger.Printf("INFO: Creating new DocReader client connecting to %s", addr)
	// Set message size limits
opts := []grpc.DialOption{
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy":"round_robin"}`),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(maxMessageSize),
grpc.MaxCallSendMsgSize(maxMessageSize),
),
}
resolver.SetDefaultScheme("dns")
startTime := time.Now()
conn, err := grpc.Dial("dns:///"+addr, opts...)
if err != nil {
Logger.Printf("ERROR: Failed to connect to DocReader service: %v", err)
return nil, err
}
Logger.Printf("INFO: Successfully connected to DocReader service in %v", time.Since(startTime))
return &Client{
conn: conn,
DocReaderClient: proto.NewDocReaderClient(conn),
debug: false,
}, nil
}
// Close closes the client connection
func (c *Client) Close() error {
Logger.Printf("INFO: Closing DocReader client connection")
return c.conn.Close()
}
// SetDebug enables or disables debug logging
func (c *Client) SetDebug(debug bool) {
c.debug = debug
Logger.Printf("INFO: Debug logging set to %v", debug)
}
// Log logs a message with the appropriate level
func (c *Client) Log(level string, format string, args ...interface{}) {
if level == "DEBUG" && !c.debug {
return
}
Logger.Printf("%s: %s", level, fmt.Sprintf(format, args...))
}
// GetImagesFromChunk extracts all image info from a Chunk
func GetImagesFromChunk(chunk *proto.Chunk) []ImageInfo {
if chunk == nil || len(chunk.Images) == 0 {
return nil
}
images := make([]ImageInfo, 0, len(chunk.Images))
for _, img := range chunk.Images {
images = append(images, ImageInfo{
URL: img.Url,
Caption: img.Caption,
OCRText: img.OcrText,
OriginalURL: img.OriginalUrl,
Start: int(img.Start),
End: int(img.End),
})
}
return images
}
// HasImagesInChunk reports whether a Chunk contains any images
func HasImagesInChunk(chunk *proto.Chunk) bool {
return chunk != nil && len(chunk.Images) > 0
}


@@ -0,0 +1,154 @@
package client
import (
"context"
"log"
"os"
"testing"
"time"
"github.com/Tencent/WeKnora/services/docreader/src/proto"
)
func init() {
	// Configure test logging
log.SetOutput(os.Stdout)
log.SetFlags(log.LstdFlags | log.Lmicroseconds | log.Lshortfile)
log.Println("INFO: Initializing DocReader client tests")
}
func TestReadFromURL(t *testing.T) {
log.Println("INFO: Starting TestReadFromURL")
	// Create a test client
log.Println("INFO: Creating test client")
client, err := NewClient("localhost:50051")
if err != nil {
log.Printf("ERROR: Failed to create client: %v", err)
t.Fatalf("Failed to create client: %v", err)
}
defer client.Close()
	// Enable debug logging
client.SetDebug(true)
	// Exercise the ReadFromURL method
log.Println("INFO: Sending ReadFromURL request to server")
startTime := time.Now()
resp, err := client.ReadFromURL(
context.Background(),
&proto.ReadFromURLRequest{
Url: "https://example.com",
Title: "test",
ReadConfig: &proto.ReadConfig{
ChunkSize: 512,
ChunkOverlap: 50,
Separators: []string{"\n\n", "\n", "。"},
EnableMultimodal: true,
},
},
)
requestDuration := time.Since(startTime)
if err != nil {
log.Printf("ERROR: ReadFromURL failed: %v", err)
t.Fatalf("ReadFromURL failed: %v", err)
}
log.Printf("INFO: ReadFromURL completed in %v", requestDuration)
	// Verify the results
chunkCount := len(resp.Chunks)
log.Printf("INFO: Received %d chunks from URL parsing", chunkCount)
if chunkCount == 0 {
log.Println("WARN: Expected non-empty content but received none")
t.Error("Expected non-empty content")
}
	// Print the results
for i, chunk := range resp.Chunks {
		if i < 2 || i >= chunkCount-2 { // print only the first two and last two chunks
log.Printf("DEBUG: Chunk %d: %s", chunk.Seq, truncateString(chunk.Content, 50))
} else if i == 2 && chunkCount > 4 {
log.Printf("DEBUG: ... %d more chunks ...", chunkCount-4)
}
}
log.Println("INFO: TestReadFromURL completed successfully")
}
func TestReadFromFileWithChunking(t *testing.T) {
log.Println("INFO: Starting TestReadFromFileWithChunking")
	// Create a test client
log.Println("INFO: Creating test client")
client, err := NewClient("localhost:50051")
if err != nil {
log.Printf("ERROR: Failed to create client: %v", err)
t.Fatalf("Failed to create client: %v", err)
}
defer client.Close()
	// Enable debug logging
client.SetDebug(true)
	// Read the test file
log.Println("INFO: Reading test file")
fileContent, err := os.ReadFile("../testdata/test.md")
if err != nil {
log.Printf("ERROR: Failed to read test file: %v", err)
t.Fatalf("Failed to read test file: %v", err)
}
log.Printf("INFO: Read test file, size: %d bytes", len(fileContent))
	// Exercise the ReadFromFile method with chunking parameters
log.Println("INFO: Sending ReadFromFile request to server")
startTime := time.Now()
resp, err := client.ReadFromFile(
context.Background(),
&proto.ReadFromFileRequest{
FileContent: fileContent,
FileName: "test.md",
FileType: "md",
ReadConfig: &proto.ReadConfig{
ChunkSize: 200,
ChunkOverlap: 50,
Separators: []string{"\n\n", "\n", "。"},
EnableMultimodal: true,
},
},
)
requestDuration := time.Since(startTime)
if err != nil {
log.Printf("ERROR: ReadFromFile failed: %v", err)
t.Fatalf("ReadFromFile failed: %v", err)
}
log.Printf("INFO: ReadFromFile completed in %v", requestDuration)
	// Verify the results
chunkCount := len(resp.Chunks)
log.Printf("INFO: Received %d chunks from file parsing", chunkCount)
if chunkCount == 0 {
log.Println("WARN: Expected non-empty content but received none")
t.Error("Expected non-empty content")
}
	// Print the results
for i, chunk := range resp.Chunks {
		if i < 2 || i >= chunkCount-2 { // print only the first two and last two chunks
log.Printf("DEBUG: Chunk %d: %s", chunk.Seq, truncateString(chunk.Content, 50))
} else if i == 2 && chunkCount > 4 {
log.Printf("DEBUG: ... %d more chunks ...", chunkCount-4)
}
}
log.Println("INFO: TestReadFromFileWithChunking completed successfully")
}
// truncateString shortens a string for log output
func truncateString(s string, maxLen int) string {
if len(s) <= maxLen {
return s
}
return s[:maxLen] + "..."
}

docreader/main.py Normal file

@@ -0,0 +1,403 @@
import os
import sys
import logging
from concurrent import futures
import traceback
import grpc
import uuid
import atexit
from grpc_health.v1 import health_pb2_grpc
from grpc_health.v1.health import HealthServicer
# Add parent directory to Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
from proto.docreader_pb2 import ReadResponse, Chunk, Image
from proto import docreader_pb2_grpc
from parser import Parser, OCREngine
from parser.config import ChunkingConfig
from utils.request import request_id_context, init_logging_request_id
# --- Encoding utilities: sanitize strings to valid UTF-8 and (optionally) read files with multi-encoding fallback ---
import re
from typing import Optional
try:
# Optional dependency for charset detection; install via `pip install charset-normalizer`
from charset_normalizer import from_bytes as _cn_from_bytes # type: ignore
except Exception: # pragma: no cover
_cn_from_bytes = None # type: ignore
# Code points in the surrogate range U+D800..U+DFFF are not valid Unicode scalar values and cannot be encoded to UTF-8
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]")
def to_valid_utf8_text(s: Optional[str]) -> str:
"""Return a UTF-8 safe string for protobuf.
- Replace any surrogate code points with U+FFFD
- Re-encode with errors='replace' to ensure valid UTF-8
"""
if not s:
return ""
s = _SURROGATE_RE.sub("\ufffd", s)
return s.encode("utf-8", errors="replace").decode("utf-8")
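# A minimal illustration (hypothetical values) of what the sanitizer does:
#   broken = "hello " + "\ud800"      # lone high surrogate; broken.encode("utf-8") would raise
#   to_valid_utf8_text(broken)        # -> "hello \ufffd", safe to place in a protobuf field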
def read_text_with_fallback(file_path: str) -> str:
"""Read text from file supporting multiple encodings with graceful fallback.
This server currently receives bytes over gRPC and delegates decoding to the parser.
This helper is provided for future local-file reads if needed.
"""
with open(file_path, "rb") as f:
raw = f.read()
if _cn_from_bytes is not None:
try:
result = _cn_from_bytes(raw).best()
if result:
return str(result)
except Exception:
pass
for enc in ("utf-8", "gb18030", "latin-1"):
try:
return raw.decode(enc, errors="replace")
except UnicodeDecodeError:
continue
return raw.decode("utf-8", errors="replace")
# Ensure no existing handlers
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)
# Configure logging - use stdout
handler = logging.StreamHandler(sys.stdout)
logging.root.addHandler(handler)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.info("Initializing server logging")
# Initialize request ID logging
init_logging_request_id()
# Set max message size to 50MB
MAX_MESSAGE_LENGTH = 50 * 1024 * 1024
parser = Parser()
class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
def __init__(self):
super().__init__()
self.parser = Parser()
def ReadFromFile(self, request, context):
# Get or generate request ID
request_id = (
request.request_id
if hasattr(request, "request_id") and request.request_id
else str(uuid.uuid4())
)
# Use request ID context
with request_id_context(request_id):
try:
# Get file type
file_type = (
request.file_type or os.path.splitext(request.file_name)[1][1:]
)
logger.info(
f"Received ReadFromFile request for file: {request.file_name}, type: {file_type}"
)
logger.info(f"File content size: {len(request.file_content)} bytes")
# Create chunking config
chunk_size = request.read_config.chunk_size or 512
chunk_overlap = request.read_config.chunk_overlap or 50
separators = request.read_config.separators or ["\n\n", "\n", ""]
enable_multimodal = request.read_config.enable_multimodal or False
logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
f"multimodal={enable_multimodal}"
)
# Get Storage and VLM config from request
storage_config = None
vlm_config = None
sc = request.read_config.storage_config
# Keep parser-side key name as cos_config for backward compatibility
storage_config = {
"provider": "minio" if sc.provider == 2 else "cos",
"region": sc.region,
"bucket_name": sc.bucket_name,
"access_key_id": sc.access_key_id,
"secret_access_key": sc.secret_access_key,
"app_id": sc.app_id,
"path_prefix": sc.path_prefix,
}
logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
)
vlm_config = {
"model_name": request.read_config.vlm_config.model_name,
"base_url": request.read_config.vlm_config.base_url,
"api_key": request.read_config.vlm_config.api_key or "",
"interface_type": request.read_config.vlm_config.interface_type
or "openai",
}
logger.info(
f"Using VLM config: model={vlm_config['model_name']}, "
f"base_url={vlm_config['base_url']}, "
f"interface_type={vlm_config['interface_type']}"
)
chunking_config = ChunkingConfig(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=separators,
enable_multimodal=enable_multimodal,
storage_config=storage_config,
vlm_config=vlm_config,
)
# Parse file
logger.info(f"Starting file parsing process")
result = self.parser.parse_file(
request.file_name, file_type, request.file_content, chunking_config
)
if not result:
error_msg = "Failed to parse file"
logger.error(error_msg)
context.set_code(grpc.StatusCode.INTERNAL)
context.set_details(error_msg)
return ReadResponse()
# Convert to protobuf message
logger.info(
f"Successfully parsed file {request.file_name}, returning {len(result.chunks)} chunks"
)
# Build response, including image info
response = ReadResponse(
chunks=[
self._convert_chunk_to_proto(chunk) for chunk in result.chunks
]
)
logger.info(f"Response size: {response.ByteSize()} bytes")
return response
except Exception as e:
error_msg = f"Error reading file: {str(e)}"
logger.error(error_msg)
logger.info(f"Detailed traceback: {traceback.format_exc()}")
context.set_code(grpc.StatusCode.INTERNAL)
context.set_details(str(e))
return ReadResponse(error=str(e))
def ReadFromURL(self, request, context):
# Get or generate request ID
request_id = (
request.request_id
if hasattr(request, "request_id") and request.request_id
else str(uuid.uuid4())
)
# Use request ID context
with request_id_context(request_id):
try:
logger.info(f"Received ReadFromURL request for URL: {request.url}")
# Create chunking config
chunk_size = request.read_config.chunk_size or 512
chunk_overlap = request.read_config.chunk_overlap or 50
separators = request.read_config.separators or ["\n\n", "\n", ""]
enable_multimodal = request.read_config.enable_multimodal or False
logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
f"multimodal={enable_multimodal}"
)
# Get Storage and VLM config from request
storage_config = None
vlm_config = None
sc = request.read_config.storage_config
storage_config = {
"provider": "minio" if sc.provider == 2 else "cos",
"region": sc.region,
"bucket_name": sc.bucket_name,
"access_key_id": sc.access_key_id,
"secret_access_key": sc.secret_access_key,
"app_id": sc.app_id,
"path_prefix": sc.path_prefix,
}
logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
)
vlm_config = {
"model_name": request.read_config.vlm_config.model_name,
"base_url": request.read_config.vlm_config.base_url,
"api_key": request.read_config.vlm_config.api_key or "",
"interface_type": request.read_config.vlm_config.interface_type
or "openai",
}
logger.info(
f"Using VLM config: model={vlm_config['model_name']}, "
f"base_url={vlm_config['base_url']}, "
f"interface_type={vlm_config['interface_type']}"
)
chunking_config = ChunkingConfig(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=separators,
enable_multimodal=enable_multimodal,
storage_config=storage_config,
vlm_config=vlm_config,
)
# Parse URL
logger.info(f"Starting URL parsing process")
result = self.parser.parse_url(
request.url, request.title, chunking_config
)
if not result:
error_msg = "Failed to parse URL"
logger.error(error_msg)
context.set_code(grpc.StatusCode.INTERNAL)
context.set_details(error_msg)
return ReadResponse(error=error_msg)
# Convert to protobuf message, including image info
logger.info(
f"Successfully parsed URL {request.url}, returning {len(result.chunks)} chunks"
)
response = ReadResponse(
chunks=[
self._convert_chunk_to_proto(chunk) for chunk in result.chunks
]
)
logger.info(f"Response size: {response.ByteSize()} bytes")
return response
except Exception as e:
error_msg = f"Error reading URL: {str(e)}"
logger.error(error_msg)
logger.info(f"Detailed traceback: {traceback.format_exc()}")
context.set_code(grpc.StatusCode.INTERNAL)
context.set_details(str(e))
return ReadResponse(error=str(e))
def _convert_chunk_to_proto(self, chunk):
"""Convert internal Chunk object to protobuf Chunk message
Ensures all string fields are valid UTF-8 for protobuf (no lone surrogates).
"""
# Clean helper for strings
_c = to_valid_utf8_text
proto_chunk = Chunk(
content=_c(getattr(chunk, "content", None)),
seq=getattr(chunk, "seq", 0),
start=getattr(chunk, "start", 0),
end=getattr(chunk, "end", 0),
)
# If chunk has images attribute and is not empty, add image info
if hasattr(chunk, "images") and chunk.images:
logger.info(
f"Adding {len(chunk.images)} images to chunk {getattr(chunk, 'seq', 0)}"
)
for img_info in chunk.images:
# img_info expected as dict
proto_image = Image(
url=_c(img_info.get("cos_url", "")),
caption=_c(img_info.get("caption", "")),
ocr_text=_c(img_info.get("ocr_text", "")),
original_url=_c(img_info.get("original_url", "")),
start=int(img_info.get("start", 0) or 0),
end=int(img_info.get("end", 0) or 0),
)
proto_chunk.images.append(proto_image)
return proto_chunk
def init_ocr_engine(ocr_backend, ocr_config):
"""Initialize OCR engine"""
try:
logger.info(f"Initializing OCR engine with backend: {ocr_backend}")
ocr_engine = OCREngine.get_instance(backend_type=ocr_backend, **ocr_config)
if ocr_engine:
logger.info("OCR engine initialized successfully")
return True
else:
logger.error("OCR engine initialization failed")
return False
except Exception as e:
logger.error(f"Error initializing OCR engine: {str(e)}")
return False
def main():
init_ocr_engine(
os.getenv("OCR_BACKEND", "paddle"),
{
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
},
)
# Set max number of worker threads
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
logger.info(f"Starting DocReader service with {max_workers} worker threads")
# Get port number
port = os.environ.get("GRPC_PORT", "50051")
# Create server
server = grpc.server(
futures.ThreadPoolExecutor(max_workers=max_workers),
options=[
("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
],
)
# Register services
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
# Register health check service
health_servicer = HealthServicer()
health_pb2_grpc.add_HealthServicer_to_server(health_servicer, server)
# Set listen address
server.add_insecure_port(f"[::]:{port}")
# Start service
server.start()
logger.info(f"Server started on port {port}")
logger.info("Server is ready to accept connections")
try:
# Wait for service termination
server.wait_for_termination()
except KeyboardInterrupt:
logger.info("Received termination signal, shutting down server")
server.stop(0)
if __name__ == "__main__":
main()
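
For reference, a minimal client-side sketch against this server; the stub name DocReaderStub, the ReadConfig message name, and the address are assumptions based on the generated code imported above:

import grpc
from proto import docreader_pb2, docreader_pb2_grpc

def read_file(path: str):
    # Mirror the server's 50MB message limits
    channel = grpc.insecure_channel(
        "localhost:50051",
        options=[
            ("grpc.max_send_message_length", 50 * 1024 * 1024),
            ("grpc.max_receive_message_length", 50 * 1024 * 1024),
        ],
    )
    stub = docreader_pb2_grpc.DocReaderStub(channel)  # assumed generated stub name
    with open(path, "rb") as f:
        content = f.read()
    request = docreader_pb2.ReadFromFileRequest(
        file_name=path,
        file_type=path.rsplit(".", 1)[-1],
        file_content=content,
        read_config=docreader_pb2.ReadConfig(chunk_size=512, chunk_overlap=50),
    )
    return stub.ReadFromFile(request)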


@@ -0,0 +1,42 @@
"""
Parser module for WeKnora document processing system.
This module provides document parsers for various file formats including:
- Microsoft Word documents (.doc, .docx)
- PDF documents
- Markdown files
- Plain text files
- Images with text content
- Web pages
The parsers extract content from documents and can split them into
meaningful chunks for further processing and indexing.
"""
from .base_parser import BaseParser, ParseResult
from .docx_parser import DocxParser
from .doc_parser import DocParser
from .pdf_parser import PDFParser
from .markdown_parser import MarkdownParser
from .text_parser import TextParser
from .image_parser import ImageParser
from .web_parser import WebParser
from .parser import Parser
from .config import ChunkingConfig
from .ocr_engine import OCREngine
# Export public classes and modules
__all__ = [
"BaseParser", # Base parser class that all format parsers inherit from
"DocxParser", # Parser for .docx files (modern Word documents)
"DocParser", # Parser for .doc files (legacy Word documents)
"PDFParser", # Parser for PDF documents
"MarkdownParser", # Parser for Markdown text files
"TextParser", # Parser for plain text files
"ImageParser", # Parser for images with text content
"WebParser", # Parser for web pages
"Parser", # Main parser factory that selects the appropriate parser
"ChunkingConfig", # Configuration for text chunking behavior
"ParseResult", # Standard result format returned by all parsers
"OCREngine", # OCR engine for extracting text from images
]

File diff suppressed because it is too large

docreader/parser/caption.py Normal file

@@ -0,0 +1,360 @@
import json
import logging
import os
import time
from dataclasses import dataclass, field
from typing import List, Optional, Union
import requests
import ollama
logger = logging.getLogger(__name__)
@dataclass
class ImageUrl:
"""Image URL data structure for caption requests."""
url: Optional[str] = None
detail: Optional[str] = None
@dataclass
class Content:
"""Content data structure that can contain text or image URL."""
type: Optional[str] = None
text: Optional[str] = None
image_url: Optional[ImageUrl] = None
@dataclass
class SystemMessage:
"""System message for VLM model requests."""
role: Optional[str] = None
content: Optional[str] = None
@dataclass
class UserMessage:
"""User message for VLM model requests, can contain multiple content items."""
role: Optional[str] = None
content: List[Content] = field(default_factory=list)
@dataclass
class CompletionRequest:
"""Request structure for VLM model completion API."""
model: str
temperature: float
top_p: float
messages: List[Union[SystemMessage, UserMessage]]
user: str
@dataclass
class Model:
"""Model identifier structure."""
id: str
@dataclass
class ModelsResp:
"""Response structure for available models API."""
data: List[Model] = field(default_factory=list)
@dataclass
class Message:
"""Message structure in API response."""
role: Optional[str] = None
content: Optional[str] = None
tool_calls: Optional[str] = None
@dataclass
class Choice:
"""Choice structure in API response."""
message: Optional[Message] = None
@dataclass
class Usage:
"""Token usage information in API response."""
prompt_tokens: Optional[int] = 0
total_tokens: Optional[int] = 0
completion_tokens: Optional[int] = 0
@dataclass
class CaptionChatResp:
"""Response structure for caption chat API."""
id: Optional[str] = None
created: Optional[int] = None
    model: Optional[str] = None
object: Optional[str] = None
choices: List[Choice] = field(default_factory=list)
usage: Optional[Usage] = None
@staticmethod
def from_json(json_data: dict) -> "CaptionChatResp":
"""
Parse API response JSON into a CaptionChatResp object.
Args:
json_data: The JSON response from the API
Returns:
A parsed CaptionChatResp object
"""
logger.info("Parsing CaptionChatResp from JSON")
# Manually parse nested fields with safe field extraction
choices = []
for choice in json_data.get("choices", []):
message_data = choice.get("message", {})
message = Message(
role=message_data.get("role"),
content=message_data.get("content"),
tool_calls=message_data.get("tool_calls"),
)
choices.append(Choice(message=message))
# Handle usage with safe field extraction
usage_data = json_data.get("usage", {})
usage = None
if usage_data:
usage = Usage(
prompt_tokens=usage_data.get("prompt_tokens", 0),
total_tokens=usage_data.get("total_tokens", 0),
completion_tokens=usage_data.get("completion_tokens", 0),
)
logger.info(
f"Parsed {len(choices)} choices and usage data: {usage is not None}"
)
return CaptionChatResp(
id=json_data.get("id"),
created=json_data.get("created"),
model=json_data.get("model"),
object=json_data.get("object"),
choices=choices,
usage=usage,
)
def choice_data(self) -> str:
"""
Extract the content from the first choice in the response.
Returns:
The content string from the first choice, or empty string if no choices
"""
if self.choices:
logger.info("Retrieving content from first choice")
return self.choices[0].message.content
logger.warning("No choices available in response")
return ""
class Caption:
"""
Service for generating captions for images using a Vision Language Model.
Uses an external API to process images and return textual descriptions.
"""
def __init__(self, vlm_config=None):
"""Initialize the Caption service with configuration from parameters or environment variables."""
logger.info("Initializing Caption service")
self.prompt = """简单凝炼的描述图片的主要内容"""
# Use provided VLM config if available, otherwise fall back to environment variables
if vlm_config and vlm_config.get("base_url") and vlm_config.get("model_name"):
self.completion_url = vlm_config.get("base_url", "") + "/chat/completions"
self.model = vlm_config.get("model_name", "")
self.api_key = vlm_config.get("api_key", "")
self.interface_type = vlm_config.get("interface_type", "openai").lower()
else:
            if not os.getenv("VLM_MODEL_BASE_URL") or not os.getenv("VLM_MODEL_NAME"):
logger.error("VLM_MODEL_BASE_URL or VLM_MODEL_NAME is not set")
return
self.completion_url = os.getenv("VLM_MODEL_BASE_URL") + "/chat/completions"
self.model = os.getenv("VLM_MODEL_NAME")
self.api_key = os.getenv("VLM_MODEL_API_KEY")
self.interface_type = os.getenv("VLM_INTERFACE_TYPE", "openai").lower()
        # Validate the interface type
if self.interface_type not in ["ollama", "openai"]:
logger.warning(f"Unknown interface type: {self.interface_type}, defaulting to openai")
self.interface_type = "openai"
logger.info(
f"Service configured with model: {self.model}, endpoint: {self.completion_url}, interface: {self.interface_type}"
)
def _call_caption_api(self, image_data: str) -> Optional[CaptionChatResp]:
"""
Call the Caption API to generate a description for the given image.
Args:
image_data: URL of the image or base64 encoded image data
Returns:
CaptionChatResp object if successful, None otherwise
"""
logger.info(f"Calling Caption API for image captioning")
logger.info(f"Processing image data: {image_data[:50] if len(image_data) > 50 else image_data}")
        # Choose the call path based on the configured interface type
if self.interface_type == "ollama":
return self._call_ollama_api(image_data)
else:
return self._call_openai_api(image_data)
def _call_ollama_api(self, image_base64: str) -> Optional[CaptionChatResp]:
"""Call Ollama API for image captioning using base64 encoded image data."""
host = self.completion_url.replace("/v1/chat/completions", "")
client = ollama.Client(
host=host,
)
try:
logger.info(f"Calling Ollama API with model: {self.model}")
            # Call the Ollama API, passing the base64-encoded image via the images parameter
response = client.generate(
model=self.model,
prompt="简单凝炼的描述图片的主要内容",
                images=[image_base64],  # base64-encoded image data
options={"temperature": 0.1},
stream=False,
)
            # Build the response object
caption_resp = CaptionChatResp(
id="ollama_response",
created=int(time.time()),
model=self.model,
object="chat.completion",
choices=[
Choice(
message=Message(
role="assistant",
content=response.response
)
)
]
)
logger.info("Successfully received response from Ollama API")
return caption_resp
except Exception as e:
logger.error(f"Error calling Ollama API: {e}")
return None
def _call_openai_api(self, image_base64: str) -> Optional[CaptionChatResp]:
"""Call OpenAI-compatible API for image captioning."""
logger.info(f"Calling OpenAI-compatible API with model: {self.model}")
user_msg = UserMessage(
role="user",
content=[
Content(type="text", text=self.prompt),
Content(
type="image_url", image_url=ImageUrl(url="data:image/png;base64," + image_base64, detail="auto")
),
],
)
gpt_req = CompletionRequest(
model=self.model,
temperature=0.3,
top_p=0.8,
messages=[user_msg],
user="abc",
)
headers = {
"Content-Type": "application/json",
"Accept": "text/event-stream",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
}
if self.api_key:
headers["Authorization"] = f"Bearer {self.api_key}"
try:
logger.info(f"Sending request to OpenAI-compatible API with model: {self.model}")
response = requests.post(
self.completion_url,
data=json.dumps(gpt_req, default=lambda o: o.__dict__, indent=4),
headers=headers,
timeout=30,
)
if response.status_code != 200:
logger.error(
f"OpenAI-compatible API returned non-200 status code: {response.status_code}"
)
response.raise_for_status()
logger.info(
f"Successfully received response from OpenAI-compatible API with status: {response.status_code}"
)
logger.info(f"Converting response to CaptionChatResp object")
caption_resp = CaptionChatResp.from_json(response.json())
if caption_resp.usage:
logger.info(
f"API usage: prompt_tokens={caption_resp.usage.prompt_tokens}, "
f"completion_tokens={caption_resp.usage.completion_tokens}"
)
return caption_resp
except requests.exceptions.Timeout:
logger.error(f"Timeout while calling OpenAI-compatible API after 30 seconds")
return None
except requests.exceptions.RequestException as e:
logger.error(f"Request error calling OpenAI-compatible API: {e}")
return None
except Exception as e:
logger.error(f"Unexpected error calling OpenAI-compatible API: {e}")
return None
def get_caption(self, image_data: str) -> str:
"""
Get a caption for the provided image data.
Args:
image_data: URL of the image or base64 encoded image data
Returns:
Caption text as string, or empty string if captioning failed
"""
logger.info("Getting caption for image")
if not image_data or self.completion_url is None:
logger.error("Image data is not set")
return ""
caption_resp = self._call_caption_api(image_data)
if caption_resp:
caption = caption_resp.choice_data()
caption_length = len(caption)
logger.info(f"Successfully generated caption of length {caption_length}")
logger.info(
f"Caption: {caption[:50]}..."
if caption_length > 50
else f"Caption: {caption}"
)
return caption
logger.warning("Failed to get caption from Caption API")
return ""


@@ -0,0 +1,21 @@
from dataclasses import dataclass, field
@dataclass
class ChunkingConfig:
"""
Configuration for text chunking process.
Controls how documents are split into smaller pieces for processing.
"""
chunk_size: int = 512 # Maximum size of each chunk in tokens/chars
chunk_overlap: int = 50 # Number of tokens/chars to overlap between chunks
separators: list = field(
default_factory=lambda: ["\n\n", "\n", ""]
) # Text separators in order of priority
enable_multimodal: bool = (
False # Whether to enable multimodal processing (text + images)
)
storage_config: dict = None # Preferred field name going forward
vlm_config: dict = None # VLM configuration for image captioning
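
A construction sketch; every value below is an illustrative placeholder, and the storage keys mirror the dict the server builds in main.py:

config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=50,
    separators=["\n\n", "\n", ""],
    enable_multimodal=True,
    storage_config={
        "provider": "minio",
        "region": "",
        "bucket_name": "docreader",
        "access_key_id": "minioadmin",
        "secret_access_key": "minioadmin",
        "app_id": "",
        "path_prefix": "images",
    },
    vlm_config={
        "model_name": "qwen-vl",
        "base_url": "http://localhost:8000/v1",
        "api_key": "",
        "interface_type": "openai",
    },
)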


@@ -0,0 +1,315 @@
import asyncio
import logging
import re
import tempfile
import os
import subprocess
import shutil
from io import BytesIO
from typing import Optional, List, Tuple
import textract
from PIL import Image
import zipfile
import xml.etree.ElementTree as ET
from .base_parser import BaseParser
from .docx_parser import DocxParser, Docx
logger = logging.getLogger(__name__)
class DocParser(BaseParser):
"""DOC document parser"""
def parse_into_text(self, content: bytes) -> str:
"""Parse DOC document
Args:
content: DOC document content
Returns:
Parse result
"""
logger.info(f"Parsing DOC document, content size: {len(content)} bytes")
# Save byte content as a temporary file
with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as temp_file:
temp_file_path = temp_file.name
temp_file.write(content)
temp_file.flush()
logger.info(f"Saved DOC content to temporary file: {temp_file_path}")
try:
# First try to convert to docx format to extract images
if self.enable_multimodal:
logger.info("Multimodal enabled, attempting to extract images from DOC")
docx_content = self._convert_doc_to_docx(temp_file_path)
if docx_content:
logger.info("Successfully converted DOC to DOCX, using DocxParser")
# Use existing DocxParser to parse the converted docx
docx_parser = DocxParser(
file_name=self.file_name,
file_type="docx",
enable_multimodal=self.enable_multimodal,
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
chunking_config=self.chunking_config,
separators=self.separators,
)
text = docx_parser.parse_into_text(docx_content)
logger.info(f"Extracted {len(text)} characters using DocxParser")
# Clean up temporary file
os.unlink(temp_file_path)
logger.info(f"Deleted temporary file: {temp_file_path}")
return text
else:
logger.warning(
"Failed to convert DOC to DOCX, falling back to text-only extraction"
)
# If image extraction is not needed or conversion failed, try using antiword to extract text
try:
logger.info("Attempting to parse DOC file with antiword")
# Check if antiword is installed
antiword_path = self._find_antiword_path()
if antiword_path:
# Use antiword to extract text directly
logger.info(f"Using antiword at {antiword_path} to extract text")
process = subprocess.Popen(
[antiword_path, temp_file_path],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout, stderr = process.communicate()
if process.returncode == 0:
text = stdout.decode("utf-8", errors="ignore")
logger.info(
f"Successfully extracted {len(text)} characters using antiword"
)
# Clean up temporary file
os.unlink(temp_file_path)
logger.info(f"Deleted temporary file: {temp_file_path}")
return text
else:
logger.warning(
f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
)
else:
logger.warning("antiword not found, falling back to textract")
except Exception as e:
logger.warning(
f"Error using antiword: {str(e)}, falling back to textract"
)
# If antiword fails, try using textract
logger.info("Parsing DOC file with textract")
text = textract.process(temp_file_path, method="antiword").decode("utf-8")
logger.info(
f"Successfully extracted {len(text)} characters of text from DOC document using textract"
)
# Clean up temporary file
os.unlink(temp_file_path)
logger.info(f"Deleted temporary file: {temp_file_path}")
return text
except Exception as e:
logger.error(f"Error parsing DOC document: {str(e)}")
# Ensure temporary file is cleaned up
if os.path.exists(temp_file_path):
os.unlink(temp_file_path)
logger.info(f"Deleted temporary file after error: {temp_file_path}")
return ""
def _convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
"""Convert DOC file to DOCX format
Uses LibreOffice/OpenOffice for conversion
Args:
doc_path: DOC file path
Returns:
Byte stream of DOCX file content, or None if conversion fails
"""
logger.info(f"Converting DOC to DOCX: {doc_path}")
# Create a temporary directory to store the converted file
temp_dir = tempfile.mkdtemp()
docx_path = os.path.join(temp_dir, "converted.docx")
try:
# Check if LibreOffice or OpenOffice is installed
soffice_path = self._find_soffice_path()
if not soffice_path:
logger.error(
"LibreOffice/OpenOffice not found, cannot convert DOC to DOCX"
)
return None
# Execute conversion command
logger.info(f"Using {soffice_path} to convert DOC to DOCX")
cmd = [
soffice_path,
"--headless",
"--convert-to",
"docx",
"--outdir",
temp_dir,
doc_path,
]
logger.info(f"Running command: {' '.join(cmd)}")
process = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
stdout, stderr = process.communicate()
if process.returncode != 0:
logger.error(
f"Error converting DOC to DOCX: {stderr.decode('utf-8', errors='ignore')}"
)
return None
# Find the converted file
for file in os.listdir(temp_dir):
if file.endswith(".docx"):
converted_file = os.path.join(temp_dir, file)
logger.info(f"Found converted file: {converted_file}")
# Read the converted file content
with open(converted_file, "rb") as f:
docx_content = f.read()
logger.info(
f"Successfully read converted DOCX file, size: {len(docx_content)} bytes"
)
return docx_content
logger.error("No DOCX file found after conversion")
return None
except Exception as e:
logger.error(f"Error during DOC to DOCX conversion: {str(e)}")
return None
finally:
# Clean up temporary directory
try:
shutil.rmtree(temp_dir)
logger.info(f"Cleaned up temporary directory: {temp_dir}")
except Exception as e:
logger.warning(f"Failed to clean up temporary directory: {str(e)}")
def _find_soffice_path(self) -> Optional[str]:
"""Find LibreOffice/OpenOffice executable path
Returns:
Executable path, or None if not found
"""
# Common LibreOffice/OpenOffice executable paths
possible_paths = [
# Linux
"/usr/bin/soffice",
"/usr/lib/libreoffice/program/soffice",
"/opt/libreoffice25.2/program/soffice",
# macOS
"/Applications/LibreOffice.app/Contents/MacOS/soffice",
# Windows
"C:\\Program Files\\LibreOffice\\program\\soffice.exe",
"C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe",
]
# Check if path is set in environment variable
if os.environ.get("LIBREOFFICE_PATH"):
possible_paths.insert(0, os.environ.get("LIBREOFFICE_PATH"))
for path in possible_paths:
if os.path.exists(path):
logger.info(f"Found LibreOffice/OpenOffice at: {path}")
return path
# Try to find in PATH
try:
result = subprocess.run(
["which", "soffice"], capture_output=True, text=True
)
if result.returncode == 0 and result.stdout.strip():
path = result.stdout.strip()
logger.info(f"Found LibreOffice/OpenOffice in PATH: {path}")
return path
except Exception:
pass
logger.warning("LibreOffice/OpenOffice not found")
return None
def _find_antiword_path(self) -> Optional[str]:
"""Find antiword executable path
Returns:
Executable path, or None if not found
"""
# Common antiword executable paths
possible_paths = [
# Linux/macOS
"/usr/bin/antiword",
"/usr/local/bin/antiword",
# Windows
"C:\\Program Files\\Antiword\\antiword.exe",
"C:\\Program Files (x86)\\Antiword\\antiword.exe",
]
# Check if path is set in environment variable
if os.environ.get("ANTIWORD_PATH"):
possible_paths.insert(0, os.environ.get("ANTIWORD_PATH"))
for path in possible_paths:
if os.path.exists(path):
logger.info(f"Found antiword at: {path}")
return path
# Try to find in PATH
try:
result = subprocess.run(
["which", "antiword"], capture_output=True, text=True
)
if result.returncode == 0 and result.stdout.strip():
path = result.stdout.strip()
logger.info(f"Found antiword in PATH: {path}")
return path
except Exception:
pass
logger.warning("antiword not found")
return None
if __name__ == "__main__":
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger.info("Running DocParser in standalone mode")
file_name = "/path/to/your/test.doc"
logger.info(f"Processing file: {file_name}")
doc_parser = DocParser(
file_name, enable_multimodal=True, chunk_size=512, chunk_overlap=60
)
logger.info("Parser initialized, starting processing")
with open(file_name, "rb") as f:
content = f.read()
text = doc_parser.parse_into_text(content)
logger.info(f"Processing complete, extracted text length: {len(text)}")
logger.info(f"Sample text: {text[:200]}...")

File diff suppressed because it is too large


@@ -0,0 +1,68 @@
import logging
import os
import asyncio
from PIL import Image
import io
from typing import Dict, Any, Tuple, Union
from .base_parser import BaseParser, ParseResult
import numpy as np
# Set up logger for this module
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class ImageParser(BaseParser):
"""
Parser for image files with OCR capability.
Extracts text from images and generates captions.
This parser handles image processing by:
1. Uploading the image to storage
2. Generating a descriptive caption
3. Performing OCR to extract text content
4. Returning a combined result with both text and image reference
"""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""
Parse image content, upload the image and return Markdown reference along with image map.
Args:
content: Raw image data (bytes)
Returns:
Tuple of (markdown_text, image_map) where image_map maps image URLs to PIL Image objects
"""
logger.info(f"Parsing image content, size: {len(content)} bytes")
image_map = {}
try:
# Upload image to storage service
logger.info("Uploading image to storage")
_, ext = os.path.splitext(self.file_name)
image_url = self.upload_bytes(content, file_ext=ext)
if not image_url:
logger.error("Failed to upload image to storage")
return "", {}
logger.info(
f"Successfully uploaded image, URL: {image_url[:50]}..."
if len(image_url) > 50
else f"Successfully uploaded image, URL: {image_url}"
)
# Create image object and add to map
try:
image = Image.open(io.BytesIO(content))
image_map[image_url] = image
logger.info(f"Added image to image_map for URL: {image_url}")
except Exception as img_err:
logger.error(f"Error creating image object: {str(img_err)}")
markdown_text = f"![{self.file_name}]({image_url})"
return markdown_text, image_map
except Exception as e:
logger.error(f"Error parsing image: {str(e)}")
return "", {}


@@ -0,0 +1,43 @@
import base64
import io
import logging
from typing import Union
from PIL import Image
import numpy as np
logger = logging.getLogger(__name__)
def image_to_base64(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
"""Convert image to base64 encoded string
Args:
image: Image file path, bytes, PIL Image object, or numpy array
Returns:
Base64 encoded image string, or empty string if conversion fails
"""
try:
if isinstance(image, str):
# It's a file path
with open(image, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
elif isinstance(image, bytes):
# It's bytes data
return base64.b64encode(image).decode("utf-8")
elif isinstance(image, Image.Image):
# It's a PIL Image
buffer = io.BytesIO()
image.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("utf-8")
elif isinstance(image, np.ndarray):
# It's a numpy array
pil_image = Image.fromarray(image)
buffer = io.BytesIO()
pil_image.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("utf-8")
else:
logger.error(f"Unsupported image type: {type(image)}")
return ""
except Exception as e:
logger.error(f"Error converting image to base64: {str(e)}")
return ""


@@ -0,0 +1,33 @@
import asyncio
import re
import logging
import numpy as np
import os # Import os module to get environment variables
from typing import Dict, List, Optional, Tuple, Union, Any
from .base_parser import BaseParser
# Get logger object
logger = logging.getLogger(__name__)
class MarkdownParser(BaseParser):
"""Markdown document parser"""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""Parse Markdown document, only extract text content, do not process images
Args:
content: Markdown document content
Returns:
Parsed text result
"""
logger.info(f"Parsing Markdown document, content size: {len(content)} bytes")
# Convert byte content to string using universal decoding method
text = self.decode_bytes(content)
logger.info(f"Decoded Markdown content, text length: {len(text)} characters")
logger.info(f"Markdown parsing complete, extracted {len(text)} characters of text")
return text


@@ -0,0 +1,278 @@
import os
import logging
import base64
from typing import Optional, Union, Dict, Any
from abc import ABC, abstractmethod
from PIL import Image
import io
import numpy as np
from .image_utils import image_to_base64
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class OCRBackend(ABC):
"""Base class for OCR backends"""
@abstractmethod
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image
Args:
image: Image file path, bytes, or PIL Image object
Returns:
Extracted text
"""
pass
class PaddleOCRBackend(OCRBackend):
"""PaddleOCR backend implementation"""
def __init__(self, **kwargs):
"""Initialize PaddleOCR backend"""
self.ocr = None
try:
import os
import paddle
# Set PaddlePaddle to use CPU and disable GPU
os.environ['CUDA_VISIBLE_DEVICES'] = ''
paddle.set_device('cpu')
            # Try to detect whether the CPU supports the AVX instruction set
try:
import subprocess
import platform
                # Check whether the CPU supports AVX
if platform.system() == "Linux":
try:
result = subprocess.run(['grep', '-o', 'avx', '/proc/cpuinfo'],
capture_output=True, text=True, timeout=5)
has_avx = 'avx' in result.stdout.lower()
if not has_avx:
logger.warning("CPU does not support AVX instructions, using compatibility mode")
                            # Further restrict instruction set usage
os.environ['FLAGS_use_avx2'] = '0'
os.environ['FLAGS_use_avx'] = '1'
except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError):
logger.warning("Could not detect AVX support, using compatibility mode")
os.environ['FLAGS_use_avx2'] = '0'
os.environ['FLAGS_use_avx'] = '1'
except Exception as e:
logger.warning(f"Error detecting CPU capabilities: {e}, using compatibility mode")
os.environ['FLAGS_use_avx2'] = '0'
os.environ['FLAGS_use_avx'] = '1'
from paddleocr import PaddleOCR
# OCR configuration with text orientation classification enabled
ocr_config = {
"use_gpu": False,
"text_det_limit_type": "max",
"text_det_limit_side_len": 960,
"use_doc_orientation_classify": True, # 启用文档方向分类
"use_doc_unwarping": False,
"use_textline_orientation": True, # 启用文本行方向检测
"text_recognition_model_name": "PP-OCRv4_server_rec",
"text_detection_model_name": "PP-OCRv4_server_det",
"text_det_thresh": 0.3,
"text_det_box_thresh": 0.6,
"text_det_unclip_ratio": 1.5,
"text_rec_score_thresh": 0.0,
"ocr_version": "PP-OCRv4",
"lang": "ch",
"show_log": False,
"use_dilation": True, # improves accuracy
"det_db_score_mode": "slow", # improves accuracy
}
self.ocr = PaddleOCR(**ocr_config)
logger.info("PaddleOCR engine initialized successfully")
except ImportError as e:
logger.error(f"Failed to import paddleocr: {str(e)}. Please install it with 'pip install paddleocr'")
except OSError as e:
if "Illegal instruction" in str(e) or "core dumped" in str(e):
logger.error(f"PaddlePaddle crashed due to CPU instruction set incompatibility: {str(e)}")
logger.error("This usually happens when the CPU doesn't support AVX instructions.")
logger.error("Please try installing a CPU-only version of PaddlePaddle or use a different OCR backend.")
else:
logger.error(f"Failed to initialize PaddleOCR due to OS error: {str(e)}")
except Exception as e:
logger.error(f"Failed to initialize PaddleOCR: {str(e)}")
def predict(self, image):
"""Perform OCR recognition on the image
Args:
image: Image object (PIL.Image or numpy array)
Returns:
Extracted text string
"""
try:
# Ensure image is in RGB format
if hasattr(image, "convert") and image.mode != "RGB":
image = image.convert("RGB")
# Convert to numpy array if needed
if hasattr(image, "convert"):
image_array = np.array(image)
else:
image_array = image
# Perform OCR
ocr_result = self.ocr.ocr(image_array, cls=False)
# Extract text
ocr_text = ""
if ocr_result and ocr_result[0]:
for line in ocr_result[0]:
if line and len(line) >= 2:
text = line[1][0] if line[1] else ""
if text:
ocr_text += text + " "
text_length = len(ocr_text.strip())
if text_length > 0:
logger.info(f"OCR extracted {text_length} characters")
return ocr_text.strip()
else:
logger.warning("OCR returned empty result")
return ""
except Exception as e:
logger.error(f"OCR recognition error: {str(e)}")
return ""
class NanonetsOCRBackend(OCRBackend):
"""Nanonets OCR backend implementation using OpenAI API format"""
def __init__(self, **kwargs):
"""Initialize Nanonets OCR backend
Args:
api_key: API key for OpenAI API
base_url: Base URL for OpenAI API
model: Model name
"""
try:
from openai import OpenAI
self.api_key = kwargs.get("api_key", "123")
self.base_url = kwargs.get("base_url", "http://localhost:8000/v1")
self.model = kwargs.get("model", "nanonets/Nanonets-OCR-s")
self.temperature = kwargs.get("temperature", 0.0)
self.max_tokens = kwargs.get("max_tokens", 15000)
self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
self.prompt = """
## 任务说明
请从上传的文档中提取文字内容,严格按自然阅读顺序(从上到下,从左到右)输出,并遵循以下格式规范。
### 1. **文本处理**
* 按正常阅读顺序提取文字,语句流畅自然。
### 2. **表格**
* 所有表格统一转换为 **Markdown 表格格式**。
* 内容保持清晰、对齐整齐,便于阅读。
### 3. **公式**
* 所有公式转换为 **LaTeX 格式**,使用 `$$公式$$` 包裹。
### 4. **图片**
* 忽略图片信息
### 5. **链接**
* 不要猜测或补全不确定的链接地址。
"""
logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
except ImportError:
logger.error("Failed to import openai. Please install it with 'pip install openai'")
self.client = None
except Exception as e:
logger.error(f"Failed to initialize Nanonets OCR: {str(e)}")
self.client = None
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image using Nanonets OCR
Args:
image: Image file path, bytes, or PIL Image object
Returns:
Extracted text
"""
if self.client is None:
logger.error("Nanonets OCR client not initialized")
return ""
try:
# Encode image to base64
img_base64 = image_to_base64(image)
if not img_base64:
return ""
# Call Nanonets OCR API
logger.info(f"Calling Nanonets OCR API with model: {self.model}")
response = self.client.chat.completions.create(
model=self.model,
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{img_base64}"},
},
{
"type": "text",
"text": self.prompt,
},
],
}
],
temperature=self.temperature,
max_tokens=self.max_tokens
)
return response.choices[0].message.content
except Exception as e:
logger.error(f"Nanonets OCR prediction error: {str(e)}")
return ""
class OCREngine:
"""OCR Engine factory class"""
_instance = None
@classmethod
def get_instance(cls, backend_type="paddle", **kwargs) -> Optional[OCRBackend]:
"""Get OCR engine instance
Args:
backend_type: OCR backend type, one of: "paddle", "nanonets"
**kwargs: Additional arguments for the backend
Returns:
OCR engine instance or None if initialization fails
"""
if cls._instance is None:
logger.info(f"Initializing OCR engine with backend: {backend_type}")
if backend_type.lower() == "paddle":
cls._instance = PaddleOCRBackend(**kwargs)
elif backend_type.lower() == "nanonets":
cls._instance = NanonetsOCRBackend(**kwargs)
else:
logger.error(f"Unknown OCR backend type: {backend_type}")
return None
return cls._instance
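
Typical use of the factory, as a sketch; the file name is a placeholder, and note that get_instance caches the first backend it creates:

from PIL import Image

engine = OCREngine.get_instance(backend_type="paddle")
if engine is not None:
    text = engine.predict(Image.open("scan.png"))  # "scan.png" is hypothetical
    print(text)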

docreader/parser/parser.py Normal file

@@ -0,0 +1,206 @@
import logging
from dataclasses import dataclass, field
from typing import Dict, Any, Optional, Type
from .base_parser import BaseParser, ParseResult
from .docx_parser import DocxParser
from .doc_parser import DocParser
from .pdf_parser import PDFParser
from .markdown_parser import MarkdownParser
from .text_parser import TextParser
from .image_parser import ImageParser
from .web_parser import WebParser
from .config import ChunkingConfig
import traceback
logger = logging.getLogger(__name__)
@dataclass
class Chunk:
"""
Represents a single text chunk with associated metadata.
Basic unit for document processing and embedding.
"""
content: str # Text content of the chunk
metadata: Dict[str, Any] = None # Associated metadata (source, page number, etc.)
class Parser:
"""
Document parser facade that integrates all specialized parsers.
Provides a unified interface for parsing various document types.
"""
def __init__(self):
logger.info("Initializing document parser")
# Initialize all parser types
self.parsers: Dict[str, Type[BaseParser]] = {
"docx": DocxParser,
"doc": DocParser,
"pdf": PDFParser,
"md": MarkdownParser,
"txt": TextParser,
"jpg": ImageParser,
"jpeg": ImageParser,
"png": ImageParser,
"gif": ImageParser,
"bmp": ImageParser,
"tiff": ImageParser,
"webp": ImageParser,
"markdown": MarkdownParser,
}
logger.info(
"Parser initialized with %d parsers: %s",
len(self.parsers),
", ".join(self.parsers.keys()),
)
def get_parser(self, file_type: str) -> Optional[Type[BaseParser]]:
"""
Get parser class for the specified file type.
Args:
file_type: The file extension or type identifier
Returns:
Parser class for the file type, or None if unsupported
"""
file_type = file_type.lower()
parser = self.parsers.get(file_type)
if parser:
logger.info(f"Found parser for file type: {file_type}")
else:
logger.warning(f"No parser found for file type: {file_type}")
return parser
def parse_file(
self,
file_name: str,
file_type: str,
content: bytes,
config: ChunkingConfig,
) -> Optional[ParseResult]:
"""
Parse file content using appropriate parser based on file type.
Args:
file_name: Name of the file being parsed
file_type: Type/extension of the file
content: Raw file content as bytes
config: Configuration for chunking process
Returns:
ParseResult containing chunks and metadata, or None if parsing failed
"""
logger.info(f"Parsing file: {file_name} with type: {file_type}")
logger.info(
f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, "
f"multimodal={config.enable_multimodal}"
)
parser_instance = None
try:
# Get appropriate parser for file type
cls = self.get_parser(file_type)
if cls is None:
logger.error(f"Unsupported file type: {file_type}")
return None
# Parse file content
logger.info(f"Creating parser instance for {file_type} file")
parser_instance = cls(
file_name=file_name,
file_type=file_type,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
separators=config.separators,
enable_multimodal=config.enable_multimodal,
max_image_size=1920, # Limit image size to 1920px
max_concurrent_tasks=5, # Limit concurrent tasks to 5
chunking_config=config, # Pass the entire chunking config
)
logger.info(f"Starting to parse file content, size: {len(content)} bytes")
result = parser_instance.parse(content)
if result:
logger.info(
f"Successfully parsed file {file_name}, generated {len(result.chunks)} chunks"
)
if result.chunks and len(result.chunks) > 0:
logger.info(
f"First chunk content length: {len(result.chunks[0].content)}"
)
else:
logger.warning(f"Parser returned empty chunks for file: {file_name}")
else:
logger.warning(f"Parser returned None result for file: {file_name}")
# Return parse results
return result
except Exception as e:
logger.error(f"Error parsing file {file_name}: {str(e)}")
logger.info(f"Detailed traceback: {traceback.format_exc()}")
return None
def parse_url(
self, url: str, title: str, config: ChunkingConfig
) -> Optional[ParseResult]:
"""
Parse content from a URL using the WebParser.
Args:
url: URL to parse
title: Title of the webpage (for metadata)
config: Configuration for chunking process
Returns:
ParseResult containing chunks and metadata, or None if parsing failed
"""
logger.info(f"Parsing URL: {url}, title: {title}")
logger.info(
f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, "
f"multimodal={config.enable_multimodal}"
)
parser_instance = None
try:
# Create web parser instance
logger.info("Creating WebParser instance")
parser_instance = WebParser(
title=title,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
separators=config.separators,
enable_multimodal=config.enable_multimodal,
max_image_size=1920, # Limit image size
max_concurrent_tasks=5, # Limit concurrent tasks
chunking_config=config,
)
logger.info(f"Starting to parse URL content")
result = parser_instance.parse(url)
if result:
logger.info(
f"Successfully parsed URL, generated {len(result.chunks)} chunks"
)
logger.info(
f"First chunk content length: {len(result.chunks[0].content) if result.chunks else 0}"
)
else:
logger.warning(f"Parser returned empty result for URL: {url}")
# Return parse results
return result
except Exception as e:
logger.error(f"Error parsing URL {url}: {str(e)}")
logger.info(f"Detailed traceback: {traceback.format_exc()}")
return None
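
An end-to-end sketch of the facade in use; the file path is a placeholder:

parser = Parser()
config = ChunkingConfig(chunk_size=512, chunk_overlap=50)
with open("docs/guide.md", "rb") as f:  # hypothetical input file
    result = parser.parse_file("guide.md", "md", f.read(), config)
if result:
    for chunk in result.chunks:
        print(chunk.content[:80])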


@@ -0,0 +1,113 @@
import logging
import os
import io
from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union
import pdfplumber
import tempfile
from .base_parser import BaseParser
logger = logging.getLogger(__name__)
class PDFParser(BaseParser):
"""
PDF Document Parser
This parser handles PDF documents by extracting text content.
    It uses the pdfplumber library to extract text and tables.
"""
def _convert_table_to_markdown(self, table_data: list) -> str:
if not table_data or not table_data[0]: return ""
def clean_cell(cell):
if cell is None: return ""
return str(cell).replace("\n", " <br> ")
try:
markdown = ""
header = [clean_cell(cell) for cell in table_data[0]]
markdown += "| " + " | ".join(header) + " |\n"
markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
for row in table_data[1:]:
if not row: continue
body_row = [clean_cell(cell) for cell in row]
if len(body_row) != len(header):
logger.warning(f"Skipping malformed table row: {body_row}")
continue
markdown += "| " + " | ".join(body_row) + " |\n"
return markdown
except Exception as e:
logger.error(f"Error converting table to markdown: {e}")
return ""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
logger.info(f"Parsing PDF with pdfplumber, content size: {len(content)} bytes")
all_page_content = []
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
temp_pdf_path = temp_pdf.name
try:
temp_pdf.write(content)
temp_pdf.close()
logger.info(f"PDF content written to temporary file: {temp_pdf_path}")
with pdfplumber.open(temp_pdf_path) as pdf:
logger.info(f"PDF has {len(pdf.pages)} pages")
for page_num, page in enumerate(pdf.pages):
page_content_parts = []
# Try-fallback strategy for table detection
default_settings = { "vertical_strategy": "lines", "horizontal_strategy": "lines" }
found_tables = page.find_tables(default_settings)
if not found_tables:
logger.info(f"Page {page_num+1}: Default strategy found no tables. Trying fallback strategy.")
fallback_settings = { "vertical_strategy": "text", "horizontal_strategy": "lines" }
found_tables = page.find_tables(fallback_settings)
table_bboxes = [table.bbox for table in found_tables]
# Define a filter function that keeps objects NOT inside any table bbox.
def not_within_bboxes(obj):
"""Check if an object is outside all table bounding boxes."""
for bbox in table_bboxes:
# Check if the object's vertical center is within a bbox
if bbox[1] <= (obj["top"] + obj["bottom"]) / 2 <= bbox[3]:
return False # It's inside a table, so we DON'T keep it
return True # It's outside all tables, so we DO keep it
# Create a filtered view of the page that contains only the non-table text.
non_table_page = page.filter(not_within_bboxes)
# Now, extract text from this filtered page view.
text = non_table_page.extract_text(x_tolerance=2)
if text:
page_content_parts.append(text)
# Process and append the structured Markdown tables
if found_tables:
logger.info(f"Found {len(found_tables)} tables on page {page_num + 1}")
for table in found_tables:
markdown_table = self._convert_table_to_markdown(table.extract())
page_content_parts.append(f"\n\n{markdown_table}\n\n")
all_page_content.append("".join(page_content_parts))
final_text = "\n\n--- Page Break ---\n\n".join(all_page_content)
logger.info(f"PDF parsing complete. Extracted {len(final_text)} text chars.")
return final_text
except Exception as e:
logger.error(f"Failed to parse PDF document: {str(e)}")
return ""
finally:
# This block is GUARANTEED to execute, preventing resource leaks.
if os.path.exists(temp_pdf_path):
try:
os.remove(temp_pdf_path)
logger.info(f"Temporary file cleaned up: {temp_pdf_path}")
except OSError as e:
logger.error(f"Error removing temporary file {temp_pdf_path}: {e}")

360
docreader/parser/storage.py Normal file
View File

@@ -0,0 +1,360 @@
# -*- coding: utf-8 -*-
import os
import uuid
import logging
import io
import traceback
from abc import ABC, abstractmethod
from typing import Tuple, Optional
from qcloud_cos import CosConfig, CosS3Client
from minio import Minio
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class Storage(ABC):
"""Abstract base class for object storage operations"""
@abstractmethod
def upload_file(self, file_path: str) -> str:
"""Upload file to object storage
Args:
file_path: File path
Returns:
File URL
"""
pass
@abstractmethod
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to object storage
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
pass
class CosStorage(Storage):
"""Tencent Cloud COS storage implementation"""
def __init__(self, storage_config=None):
"""Initialize COS storage
Args:
storage_config: Storage configuration
"""
self.storage_config = storage_config
self.client, self.bucket_name, self.region, self.prefix = self._init_cos_client()
def _init_cos_client(self):
"""Initialize Tencent Cloud COS client"""
try:
# COS_ENABLE_OLD_DOMAIN applies to both configuration paths, so read it up front
enable_old_domain = (
os.getenv("COS_ENABLE_OLD_DOMAIN", "true").lower() == "true"
)
# Use provided COS config if available, otherwise fall back to environment variables
if self.storage_config and self.storage_config.get("access_key_id") != "":
cos_config = self.storage_config
secret_id = cos_config.get("access_key_id")
secret_key = cos_config.get("secret_access_key")
region = cos_config.get("region")
bucket_name = cos_config.get("bucket_name")
appid = cos_config.get("app_id")
prefix = cos_config.get("path_prefix", "")
else:
# Get COS configuration from environment variables
secret_id = os.getenv("COS_SECRET_ID")
secret_key = os.getenv("COS_SECRET_KEY")
region = os.getenv("COS_REGION")
bucket_name = os.getenv("COS_BUCKET_NAME")
appid = os.getenv("COS_APP_ID")
prefix = os.getenv("COS_PATH_PREFIX")
if not all([secret_id, secret_key, region, bucket_name, appid]):
logger.error(
"Incomplete COS configuration, missing required settings: "
f"secret_id set: {bool(secret_id)}, secret_key set: {bool(secret_key)}, "
f"region: {region}, bucket_name: {bucket_name}, appid: {appid}"
)
return None, None, None, None
# Initialize COS configuration
logger.info(
f"Initializing COS client with region: {region}, bucket: {bucket_name}"
)
config = CosConfig(
Appid=appid,
Region=region,
SecretId=secret_id,
SecretKey=secret_key,
EnableOldDomain=enable_old_domain,
)
# Create client
client = CosS3Client(config)
return client, bucket_name, region, prefix
except Exception as e:
logger.error(f"Failed to initialize COS client: {str(e)}")
return None, None, None, None
def _get_download_url(self, bucket_name, region, object_key):
"""Generate COS object URL
Args:
bucket_name: Bucket name
region: Region
object_key: Object key
Returns:
File URL
"""
return f"https://{bucket_name}.cos.{region}.myqcloud.com/{object_key}"
def upload_file(self, file_path: str) -> str:
"""Upload file to Tencent Cloud COS
Args:
file_path: File path
Returns:
File URL
"""
logger.info(f"Uploading file to COS: {file_path}")
try:
if not self.client:
return ""
# Generate object key, use UUID to avoid conflicts
file_name = os.path.basename(file_path)
object_key = (
f"{self.prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
if self.prefix
else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
)
logger.info(f"Generated object key: {object_key}")
# Upload file
logger.info("Attempting to upload file to COS")
response = self.client.upload_file(
Bucket=self.bucket_name, LocalFilePath=file_path, Key=object_key
)
# Get file URL
file_url = self._get_download_url(self.bucket_name, self.region, object_key)
logger.info(f"Successfully uploaded file to COS: {file_url}")
return file_url
except Exception as e:
logger.error(f"Failed to upload file to COS: {str(e)}")
return ""
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to Tencent Cloud COS
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
try:
logger.info(f"Uploading bytes content to COS, size: {len(content)} bytes")
if not self.client:
return ""
object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.prefix else f"images/{uuid.uuid4().hex}{file_ext}"
logger.info(f"Generated object key: {object_key}")
self.client.put_object(Bucket=self.bucket_name, Body=content, Key=object_key)
file_url = self._get_download_url(self.bucket_name, self.region, object_key)
logger.info(f"Successfully uploaded bytes to COS: {file_url}")
return file_url
except Exception as e:
logger.error(f"Failed to upload bytes to COS: {str(e)}")
traceback.print_exc()
return ""
class MinioStorage(Storage):
"""MinIO storage implementation"""
def __init__(self, storage_config=None):
"""Initialize MinIO storage
Args:
storage_config: Storage configuration
"""
self.storage_config = storage_config
self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = self._init_minio_client()
def _init_minio_client(self):
"""Initialize MinIO client from environment variables or injected config.
If storage_config.path_prefix contains JSON from server (for minio case),
prefer those values to override envs.
"""
try:
endpoint = os.getenv("MINIO_ENDPOINT")
use_ssl = os.getenv("MINIO_USE_SSL", "false").lower() == "true"
if self.storage_config and self.storage_config.get("bucket_name"):
storage_config = self.storage_config
bucket_name = storage_config.get("bucket_name")
path_prefix = (storage_config.get("path_prefix") or "").strip().strip("/")
access_key = storage_config.get("access_key_id")
secret_key = storage_config.get("secret_access_key")
else:
access_key = os.getenv("MINIO_ACCESS_KEY_ID")
secret_key = os.getenv("MINIO_SECRET_ACCESS_KEY")
bucket_name = os.getenv("MINIO_BUCKET_NAME")
path_prefix = os.getenv("MINIO_PATH_PREFIX", "").strip().strip("/")
if not all([endpoint, access_key, secret_key, bucket_name]):
logger.error("Incomplete MinIO configuration, missing required environment variables")
return None, None, None, None, None
# Initialize client
client = Minio(endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl)
# Ensure bucket exists
found = client.bucket_exists(bucket_name)
if not found:
client.make_bucket(bucket_name)
policy = '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}' % (bucket_name, bucket_name)
client.set_bucket_policy(bucket_name, policy)
return client, bucket_name, use_ssl, endpoint, path_prefix
except Exception as e:
logger.error(f"Failed to initialize MinIO client: {str(e)}")
return None, None, None, None, None
def _get_download_url(self, bucket_name: str, object_key: str, use_ssl: bool, endpoint: str, public_endpoint: str = None):
"""Construct a public URL for MinIO object.
If MINIO_PUBLIC_ENDPOINT is provided, use it; otherwise fallback to endpoint.
"""
if public_endpoint:
base = public_endpoint
else:
scheme = "https" if use_ssl else "http"
base = f"{scheme}://{endpoint}"
# Path-style URL for MinIO
return f"{base}/{bucket_name}/{object_key}"
def upload_file(self, file_path: str) -> str:
"""Upload file to MinIO
Args:
file_path: File path
Returns:
File URL
"""
logger.info(f"Uploading file to MinIO: {file_path}")
try:
if not self.client:
return ""
# Generate object key, use UUID to avoid conflicts
file_name = os.path.basename(file_path)
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" if self.path_prefix else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
logger.info(f"Generated MinIO object key: {object_key}")
# Upload file
logger.info("Attempting to upload file to MinIO")
with open(file_path, 'rb') as file_data:
file_size = os.path.getsize(file_path)
self.client.put_object(
bucket_name=self.bucket_name,
object_name=object_key,
data=file_data,
length=file_size,
content_type='application/octet-stream'
)
# Get file URL
file_url = self._get_download_url(
self.bucket_name,
object_key,
self.use_ssl,
self.endpoint,
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
)
logger.info(f"Successfully uploaded file to MinIO: {file_url}")
return file_url
except Exception as e:
logger.error(f"Failed to upload file to MinIO: {str(e)}")
return ""
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to MinIO
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
try:
logger.info(f"Uploading bytes content to MinIO, size: {len(content)} bytes")
if not self.client:
return ""
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.path_prefix else f"images/{uuid.uuid4().hex}{file_ext}"
logger.info(f"Generated MinIO object key: {object_key}")
self.client.put_object(
self.bucket_name,
object_key,
data=io.BytesIO(content),
length=len(content),
content_type="application/octet-stream"
)
file_url = self._get_download_url(
self.bucket_name,
object_key,
self.use_ssl,
self.endpoint,
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
)
logger.info(f"Successfully uploaded bytes to MinIO: {file_url}")
return file_url
except Exception as e:
logger.error(f"Failed to upload bytes to MinIO: {str(e)}")
traceback.print_exc()
return ""
def create_storage(storage_config=None) -> Storage:
"""Create a storage instance based on configuration or environment variables
Args:
storage_config: Storage configuration dictionary
Returns:
Storage instance
"""
storage_type = os.getenv("STORAGE_TYPE", "cos").lower()
if storage_config:
storage_type = str(storage_config.get("provider", storage_type)).lower()
logger.info(f"Creating {storage_type} storage instance")
if storage_type == "minio":
return MinioStorage(storage_config)
elif storage_type == "cos":
# Default to COS
return CosStorage(storage_config)
else:
return None
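A short sketch of the factory in use. The environment variable names mirror those read by MinioStorage above; the endpoint and credentials are placeholders:

```python
# Hedged sketch: assumes a local MinIO at localhost:9000 with placeholder keys.
import os

from parser.storage import create_storage

os.environ.setdefault("MINIO_ENDPOINT", "localhost:9000")
os.environ.setdefault("MINIO_ACCESS_KEY_ID", "minioadmin")
os.environ.setdefault("MINIO_SECRET_ACCESS_KEY", "minioadmin")
os.environ.setdefault("MINIO_BUCKET_NAME", "weknora-test")

storage = create_storage({"provider": "minio"})
if storage is not None:  # create_storage returns None for unknown providers
    url = storage.upload_bytes(b"hello", file_ext=".txt")
    print(f"uploaded: {url}")
```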

58
docreader/parser/text_parser.py Normal file
View File

@@ -0,0 +1,58 @@
import logging
from .base_parser import BaseParser
from typing import Dict, Any, Tuple, Union
logger = logging.getLogger(__name__)
class TextParser(BaseParser):
"""
Text document parser for processing plain text files.
This parser handles text extraction and chunking from plain text documents.
"""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""
Parse text document content by decoding bytes to string.
This is a straightforward parser that simply converts the binary content
to text using appropriate character encoding.
Args:
content: Raw document content as bytes
Returns:
Parsed text content as string
"""
logger.info(f"Parsing text document, content size: {len(content)} bytes")
text = self.decode_bytes(content)
logger.info(
f"Successfully parsed text document, extracted {len(text)} characters"
)
return text
if __name__ == "__main__":
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger.info("Running TextParser in standalone mode")
# Sample text for testing
text = """## 标题1
![alt text](image.png)
## 标题2
![alt text](image2.png)
## 标题3
![alt text](image3.png)"""
logger.info(f"Test text content: {text}")
# Define separators for text splitting
separators = ["\n\n", "\n", ""]
parser = TextParser(separators=separators)
logger.info("Splitting text into units")
units = parser._split_into_units(text)
logger.info(f"Split text into {len(units)} units")
logger.info(f"Units: {units}")

130
docreader/parser/web_parser.py Normal file
View File

@@ -0,0 +1,130 @@
from typing import Any, Optional, Tuple, Dict, Union
import os
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from .base_parser import BaseParser, ParseResult
import logging
import asyncio
logger = logging.getLogger(__name__)
class WebParser(BaseParser):
"""Web page parser"""
def __init__(self, title: str, **kwargs):
self.title = title
self.proxy = os.environ.get("WEB_PROXY", "")
super().__init__(file_name=title, **kwargs)
logger.info(f"Initialized WebParser with title: {title}")
async def scrape(self, url: str) -> Any:
logger.info(f"Starting web page scraping for URL: {url}")
try:
async with async_playwright() as p:
kwargs = {}
if self.proxy:
kwargs["proxy"] = {"server": self.proxy}
logger.info("Launching WebKit browser")
browser = await p.webkit.launch(**kwargs)
page = await browser.new_page()
logger.info(f"Navigating to URL: {url}")
try:
await page.goto(url, timeout=30000)
logger.info("Initial page load complete")
except Exception as e:
logger.error(f"Error navigating to URL: {str(e)}")
await browser.close()
return BeautifulSoup(
"", "html.parser"
) # Return empty soup on navigation error
logger.info("Retrieving page HTML content")
content = await page.content()
logger.info(f"Retrieved {len(content)} bytes of HTML content")
await browser.close()
logger.info("Browser closed")
# Parse HTML content with BeautifulSoup
logger.info("Parsing HTML with BeautifulSoup")
soup = BeautifulSoup(content, "html.parser")
logger.info("Successfully parsed HTML content")
return soup
except Exception as e:
logger.error(f"Failed to scrape web page: {str(e)}")
# Return empty BeautifulSoup object on error
return BeautifulSoup("", "html.parser")
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""Parse web page
Args:
content: Web page content
Returns:
Parse result
"""
logger.info("Starting web page parsing")
# Call async method synchronously
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
# Run async method
# Handle content possibly being a string
if isinstance(content, bytes):
url = self.decode_bytes(content)
logger.info(f"Decoded URL from bytes: {url}")
else:
url = content
logger.info(f"Using content as URL directly: {url}")
logger.info(f"Scraping web page: {url}")
soup = loop.run_until_complete(self.scrape(url))
# Extract page text
logger.info("Extracting text from web page")
text = soup.get_text("\n")
logger.info(f"Extracted {len(text)} characters of text from URL: {url}")
# Get title, usually in <title> or <h1> tag
if self.title != "":
title = self.title
logger.info(f"Using provided title: {title}")
else:
title = soup.title.string if soup.title else None
logger.info(f"Found title tag: {title}")
if not title: # If <title> tag does not exist or is empty, try <h1> tag
h1_tag = soup.find("h1")
if h1_tag:
title = h1_tag.get_text()
logger.info(f"Using h1 tag as title: {title}")
else:
title = "Untitled Web Page"
logger.info("No title found, using default")
logger.info(f"Web page title: {title}")
text = "\n".join(
(line.strip() for line in text.splitlines() if line.strip())
)
result = title + "\n\n" + text
logger.info(
f"Web page parsing complete, total content: {len(result)} characters"
)
return result
except Exception as e:
logger.error(f"Error parsing web page: {str(e)}")
return f"Error parsing web page: {str(e)}"
finally:
# Close event loop
logger.info("Closing event loop")
loop.close()
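A quick sketch of WebParser in isolation; the chunking values mirror the URL handler earlier in this commit, and the URL is a placeholder:

```python
# Hedged sketch: WebParser resolves the URL itself, so parse_into_text is
# given the URL bytes rather than page HTML.
from parser.web_parser import WebParser

parser = WebParser(title="Example", chunk_size=512, chunk_overlap=64)
text = parser.parse_into_text(b"https://example.com")
print(text[:300])
```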

819
docreader/proto/docreader.pb.go Normal file
View File

@@ -0,0 +1,819 @@
// Code generated by protoc-gen-go. DO NOT EDIT.
// versions:
// protoc-gen-go v1.36.6
// protoc v5.29.3
// source: docreader.proto
package proto
import (
protoreflect "google.golang.org/protobuf/reflect/protoreflect"
protoimpl "google.golang.org/protobuf/runtime/protoimpl"
reflect "reflect"
sync "sync"
unsafe "unsafe"
)
const (
// Verify that this generated code is sufficiently up-to-date.
_ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion)
// Verify that runtime/protoimpl is sufficiently up-to-date.
_ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20)
)
// Object storage provider
type StorageProvider int32
const (
StorageProvider_STORAGE_PROVIDER_UNSPECIFIED StorageProvider = 0
StorageProvider_COS StorageProvider = 1 // Tencent Cloud COS
StorageProvider_MINIO StorageProvider = 2 // MinIO/S3 compatible
)
// Enum value maps for StorageProvider.
var (
StorageProvider_name = map[int32]string{
0: "STORAGE_PROVIDER_UNSPECIFIED",
1: "COS",
2: "MINIO",
}
StorageProvider_value = map[string]int32{
"STORAGE_PROVIDER_UNSPECIFIED": 0,
"COS": 1,
"MINIO": 2,
}
)
func (x StorageProvider) Enum() *StorageProvider {
p := new(StorageProvider)
*p = x
return p
}
func (x StorageProvider) String() string {
return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x))
}
func (StorageProvider) Descriptor() protoreflect.EnumDescriptor {
return file_docreader_proto_enumTypes[0].Descriptor()
}
func (StorageProvider) Type() protoreflect.EnumType {
return &file_docreader_proto_enumTypes[0]
}
func (x StorageProvider) Number() protoreflect.EnumNumber {
return protoreflect.EnumNumber(x)
}
// Deprecated: Use StorageProvider.Descriptor instead.
func (StorageProvider) EnumDescriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{0}
}
// Generic object storage configuration, compatible with both COS and MinIO
type StorageConfig struct {
state protoimpl.MessageState `protogen:"open.v1"`
Provider StorageProvider `protobuf:"varint,1,opt,name=provider,proto3,enum=docreader.StorageProvider" json:"provider,omitempty"` // Storage provider
Region string `protobuf:"bytes,2,opt,name=region,proto3" json:"region,omitempty"` // Region (used by COS)
BucketName string `protobuf:"bytes,3,opt,name=bucket_name,json=bucketName,proto3" json:"bucket_name,omitempty"` // Bucket name
AccessKeyId string `protobuf:"bytes,4,opt,name=access_key_id,json=accessKeyId,proto3" json:"access_key_id,omitempty"` // Access key ID (used by MinIO/S3)
SecretAccessKey string `protobuf:"bytes,5,opt,name=secret_access_key,json=secretAccessKey,proto3" json:"secret_access_key,omitempty"` // Secret access key (used by MinIO/S3)
AppId string `protobuf:"bytes,6,opt,name=app_id,json=appId,proto3" json:"app_id,omitempty"` // App ID (used by COS)
PathPrefix string `protobuf:"bytes,7,opt,name=path_prefix,json=pathPrefix,proto3" json:"path_prefix,omitempty"` // Path prefix
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *StorageConfig) Reset() {
*x = StorageConfig{}
mi := &file_docreader_proto_msgTypes[0]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *StorageConfig) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*StorageConfig) ProtoMessage() {}
func (x *StorageConfig) ProtoReflect() protoreflect.Message {
mi := &file_docreader_proto_msgTypes[0]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use StorageConfig.ProtoReflect.Descriptor instead.
func (*StorageConfig) Descriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{0}
}
func (x *StorageConfig) GetProvider() StorageProvider {
if x != nil {
return x.Provider
}
return StorageProvider_STORAGE_PROVIDER_UNSPECIFIED
}
func (x *StorageConfig) GetRegion() string {
if x != nil {
return x.Region
}
return ""
}
func (x *StorageConfig) GetBucketName() string {
if x != nil {
return x.BucketName
}
return ""
}
func (x *StorageConfig) GetAccessKeyId() string {
if x != nil {
return x.AccessKeyId
}
return ""
}
func (x *StorageConfig) GetSecretAccessKey() string {
if x != nil {
return x.SecretAccessKey
}
return ""
}
func (x *StorageConfig) GetAppId() string {
if x != nil {
return x.AppId
}
return ""
}
func (x *StorageConfig) GetPathPrefix() string {
if x != nil {
return x.PathPrefix
}
return ""
}
// VLM configuration
type VLMConfig struct {
state protoimpl.MessageState `protogen:"open.v1"`
ModelName string `protobuf:"bytes,1,opt,name=model_name,json=modelName,proto3" json:"model_name,omitempty"` // VLM Model Name
BaseUrl string `protobuf:"bytes,2,opt,name=base_url,json=baseUrl,proto3" json:"base_url,omitempty"` // VLM Base URL
ApiKey string `protobuf:"bytes,3,opt,name=api_key,json=apiKey,proto3" json:"api_key,omitempty"` // VLM API Key
InterfaceType string `protobuf:"bytes,4,opt,name=interface_type,json=interfaceType,proto3" json:"interface_type,omitempty"` // VLM Interface Type: "ollama" or "openai"
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *VLMConfig) Reset() {
*x = VLMConfig{}
mi := &file_docreader_proto_msgTypes[1]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *VLMConfig) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*VLMConfig) ProtoMessage() {}
func (x *VLMConfig) ProtoReflect() protoreflect.Message {
mi := &file_docreader_proto_msgTypes[1]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use VLMConfig.ProtoReflect.Descriptor instead.
func (*VLMConfig) Descriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{1}
}
func (x *VLMConfig) GetModelName() string {
if x != nil {
return x.ModelName
}
return ""
}
func (x *VLMConfig) GetBaseUrl() string {
if x != nil {
return x.BaseUrl
}
return ""
}
func (x *VLMConfig) GetApiKey() string {
if x != nil {
return x.ApiKey
}
return ""
}
func (x *VLMConfig) GetInterfaceType() string {
if x != nil {
return x.InterfaceType
}
return ""
}
type ReadConfig struct {
state protoimpl.MessageState `protogen:"open.v1"`
ChunkSize int32 `protobuf:"varint,1,opt,name=chunk_size,json=chunkSize,proto3" json:"chunk_size,omitempty"` // Chunk size
ChunkOverlap int32 `protobuf:"varint,2,opt,name=chunk_overlap,json=chunkOverlap,proto3" json:"chunk_overlap,omitempty"` // Chunk overlap
Separators []string `protobuf:"bytes,3,rep,name=separators,proto3" json:"separators,omitempty"` // Separators
EnableMultimodal bool `protobuf:"varint,4,opt,name=enable_multimodal,json=enableMultimodal,proto3" json:"enable_multimodal,omitempty"` // Enable multimodal processing
StorageConfig *StorageConfig `protobuf:"bytes,5,opt,name=storage_config,json=storageConfig,proto3" json:"storage_config,omitempty"` // Object storage configuration (generic)
VlmConfig *VLMConfig `protobuf:"bytes,6,opt,name=vlm_config,json=vlmConfig,proto3" json:"vlm_config,omitempty"` // VLM configuration
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *ReadConfig) Reset() {
*x = ReadConfig{}
mi := &file_docreader_proto_msgTypes[2]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *ReadConfig) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*ReadConfig) ProtoMessage() {}
func (x *ReadConfig) ProtoReflect() protoreflect.Message {
mi := &file_docreader_proto_msgTypes[2]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use ReadConfig.ProtoReflect.Descriptor instead.
func (*ReadConfig) Descriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{2}
}
func (x *ReadConfig) GetChunkSize() int32 {
if x != nil {
return x.ChunkSize
}
return 0
}
func (x *ReadConfig) GetChunkOverlap() int32 {
if x != nil {
return x.ChunkOverlap
}
return 0
}
func (x *ReadConfig) GetSeparators() []string {
if x != nil {
return x.Separators
}
return nil
}
func (x *ReadConfig) GetEnableMultimodal() bool {
if x != nil {
return x.EnableMultimodal
}
return false
}
func (x *ReadConfig) GetStorageConfig() *StorageConfig {
if x != nil {
return x.StorageConfig
}
return nil
}
func (x *ReadConfig) GetVlmConfig() *VLMConfig {
if x != nil {
return x.VlmConfig
}
return nil
}
// Request to read a document from a file
type ReadFromFileRequest struct {
state protoimpl.MessageState `protogen:"open.v1"`
FileContent []byte `protobuf:"bytes,1,opt,name=file_content,json=fileContent,proto3" json:"file_content,omitempty"` // File content
FileName string `protobuf:"bytes,2,opt,name=file_name,json=fileName,proto3" json:"file_name,omitempty"` // File name
FileType string `protobuf:"bytes,3,opt,name=file_type,json=fileType,proto3" json:"file_type,omitempty"` // File type
ReadConfig *ReadConfig `protobuf:"bytes,4,opt,name=read_config,json=readConfig,proto3" json:"read_config,omitempty"`
RequestId string `protobuf:"bytes,5,opt,name=request_id,json=requestId,proto3" json:"request_id,omitempty"`
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *ReadFromFileRequest) Reset() {
*x = ReadFromFileRequest{}
mi := &file_docreader_proto_msgTypes[3]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *ReadFromFileRequest) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*ReadFromFileRequest) ProtoMessage() {}
func (x *ReadFromFileRequest) ProtoReflect() protoreflect.Message {
mi := &file_docreader_proto_msgTypes[3]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use ReadFromFileRequest.ProtoReflect.Descriptor instead.
func (*ReadFromFileRequest) Descriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{3}
}
func (x *ReadFromFileRequest) GetFileContent() []byte {
if x != nil {
return x.FileContent
}
return nil
}
func (x *ReadFromFileRequest) GetFileName() string {
if x != nil {
return x.FileName
}
return ""
}
func (x *ReadFromFileRequest) GetFileType() string {
if x != nil {
return x.FileType
}
return ""
}
func (x *ReadFromFileRequest) GetReadConfig() *ReadConfig {
if x != nil {
return x.ReadConfig
}
return nil
}
func (x *ReadFromFileRequest) GetRequestId() string {
if x != nil {
return x.RequestId
}
return ""
}
// Request to read a document from a URL
type ReadFromURLRequest struct {
state protoimpl.MessageState `protogen:"open.v1"`
Url string `protobuf:"bytes,1,opt,name=url,proto3" json:"url,omitempty"` // Document URL
Title string `protobuf:"bytes,2,opt,name=title,proto3" json:"title,omitempty"` // Title
ReadConfig *ReadConfig `protobuf:"bytes,3,opt,name=read_config,json=readConfig,proto3" json:"read_config,omitempty"`
RequestId string `protobuf:"bytes,4,opt,name=request_id,json=requestId,proto3" json:"request_id,omitempty"`
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *ReadFromURLRequest) Reset() {
*x = ReadFromURLRequest{}
mi := &file_docreader_proto_msgTypes[4]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *ReadFromURLRequest) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*ReadFromURLRequest) ProtoMessage() {}
func (x *ReadFromURLRequest) ProtoReflect() protoreflect.Message {
mi := &file_docreader_proto_msgTypes[4]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use ReadFromURLRequest.ProtoReflect.Descriptor instead.
func (*ReadFromURLRequest) Descriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{4}
}
func (x *ReadFromURLRequest) GetUrl() string {
if x != nil {
return x.Url
}
return ""
}
func (x *ReadFromURLRequest) GetTitle() string {
if x != nil {
return x.Title
}
return ""
}
func (x *ReadFromURLRequest) GetReadConfig() *ReadConfig {
if x != nil {
return x.ReadConfig
}
return nil
}
func (x *ReadFromURLRequest) GetRequestId() string {
if x != nil {
return x.RequestId
}
return ""
}
// Image information
type Image struct {
state protoimpl.MessageState `protogen:"open.v1"`
Url string `protobuf:"bytes,1,opt,name=url,proto3" json:"url,omitempty"` // Image URL
Caption string `protobuf:"bytes,2,opt,name=caption,proto3" json:"caption,omitempty"` // Image caption
OcrText string `protobuf:"bytes,3,opt,name=ocr_text,json=ocrText,proto3" json:"ocr_text,omitempty"` // Text extracted via OCR
OriginalUrl string `protobuf:"bytes,4,opt,name=original_url,json=originalUrl,proto3" json:"original_url,omitempty"` // Original image URL
Start int32 `protobuf:"varint,5,opt,name=start,proto3" json:"start,omitempty"` // Start position of the image in the text
End int32 `protobuf:"varint,6,opt,name=end,proto3" json:"end,omitempty"` // End position of the image in the text
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *Image) Reset() {
*x = Image{}
mi := &file_docreader_proto_msgTypes[5]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *Image) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*Image) ProtoMessage() {}
func (x *Image) ProtoReflect() protoreflect.Message {
mi := &file_docreader_proto_msgTypes[5]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use Image.ProtoReflect.Descriptor instead.
func (*Image) Descriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{5}
}
func (x *Image) GetUrl() string {
if x != nil {
return x.Url
}
return ""
}
func (x *Image) GetCaption() string {
if x != nil {
return x.Caption
}
return ""
}
func (x *Image) GetOcrText() string {
if x != nil {
return x.OcrText
}
return ""
}
func (x *Image) GetOriginalUrl() string {
if x != nil {
return x.OriginalUrl
}
return ""
}
func (x *Image) GetStart() int32 {
if x != nil {
return x.Start
}
return 0
}
func (x *Image) GetEnd() int32 {
if x != nil {
return x.End
}
return 0
}
type Chunk struct {
state protoimpl.MessageState `protogen:"open.v1"`
Content string `protobuf:"bytes,1,opt,name=content,proto3" json:"content,omitempty"` // Chunk content
Seq int32 `protobuf:"varint,2,opt,name=seq,proto3" json:"seq,omitempty"` // Order of the chunk within the document
Start int32 `protobuf:"varint,3,opt,name=start,proto3" json:"start,omitempty"` // Start position of the chunk in the document
End int32 `protobuf:"varint,4,opt,name=end,proto3" json:"end,omitempty"` // End position of the chunk in the document
Images []*Image `protobuf:"bytes,5,rep,name=images,proto3" json:"images,omitempty"` // Images contained in the chunk
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *Chunk) Reset() {
*x = Chunk{}
mi := &file_docreader_proto_msgTypes[6]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *Chunk) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*Chunk) ProtoMessage() {}
func (x *Chunk) ProtoReflect() protoreflect.Message {
mi := &file_docreader_proto_msgTypes[6]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use Chunk.ProtoReflect.Descriptor instead.
func (*Chunk) Descriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{6}
}
func (x *Chunk) GetContent() string {
if x != nil {
return x.Content
}
return ""
}
func (x *Chunk) GetSeq() int32 {
if x != nil {
return x.Seq
}
return 0
}
func (x *Chunk) GetStart() int32 {
if x != nil {
return x.Start
}
return 0
}
func (x *Chunk) GetEnd() int32 {
if x != nil {
return x.End
}
return 0
}
func (x *Chunk) GetImages() []*Image {
if x != nil {
return x.Images
}
return nil
}
// Response for reading a document from a URL
type ReadResponse struct {
state protoimpl.MessageState `protogen:"open.v1"`
Chunks []*Chunk `protobuf:"bytes,1,rep,name=chunks,proto3" json:"chunks,omitempty"` // Document chunks
Error string `protobuf:"bytes,2,opt,name=error,proto3" json:"error,omitempty"` // Error message
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *ReadResponse) Reset() {
*x = ReadResponse{}
mi := &file_docreader_proto_msgTypes[7]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *ReadResponse) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*ReadResponse) ProtoMessage() {}
func (x *ReadResponse) ProtoReflect() protoreflect.Message {
mi := &file_docreader_proto_msgTypes[7]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use ReadResponse.ProtoReflect.Descriptor instead.
func (*ReadResponse) Descriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{7}
}
func (x *ReadResponse) GetChunks() []*Chunk {
if x != nil {
return x.Chunks
}
return nil
}
func (x *ReadResponse) GetError() string {
if x != nil {
return x.Error
}
return ""
}
var File_docreader_proto protoreflect.FileDescriptor
const file_docreader_proto_rawDesc = "" +
"\n" +
"\x0fdocreader.proto\x12\tdocreader\"\x88\x02\n" +
"\rStorageConfig\x126\n" +
"\bprovider\x18\x01 \x01(\x0e2\x1a.docreader.StorageProviderR\bprovider\x12\x16\n" +
"\x06region\x18\x02 \x01(\tR\x06region\x12\x1f\n" +
"\vbucket_name\x18\x03 \x01(\tR\n" +
"bucketName\x12\"\n" +
"\raccess_key_id\x18\x04 \x01(\tR\vaccessKeyId\x12*\n" +
"\x11secret_access_key\x18\x05 \x01(\tR\x0fsecretAccessKey\x12\x15\n" +
"\x06app_id\x18\x06 \x01(\tR\x05appId\x12\x1f\n" +
"\vpath_prefix\x18\a \x01(\tR\n" +
"pathPrefix\"\x85\x01\n" +
"\tVLMConfig\x12\x1d\n" +
"\n" +
"model_name\x18\x01 \x01(\tR\tmodelName\x12\x19\n" +
"\bbase_url\x18\x02 \x01(\tR\abaseUrl\x12\x17\n" +
"\aapi_key\x18\x03 \x01(\tR\x06apiKey\x12%\n" +
"\x0einterface_type\x18\x04 \x01(\tR\rinterfaceType\"\x93\x02\n" +
"\n" +
"ReadConfig\x12\x1d\n" +
"\n" +
"chunk_size\x18\x01 \x01(\x05R\tchunkSize\x12#\n" +
"\rchunk_overlap\x18\x02 \x01(\x05R\fchunkOverlap\x12\x1e\n" +
"\n" +
"separators\x18\x03 \x03(\tR\n" +
"separators\x12+\n" +
"\x11enable_multimodal\x18\x04 \x01(\bR\x10enableMultimodal\x12?\n" +
"\x0estorage_config\x18\x05 \x01(\v2\x18.docreader.StorageConfigR\rstorageConfig\x123\n" +
"\n" +
"vlm_config\x18\x06 \x01(\v2\x14.docreader.VLMConfigR\tvlmConfig\"\xc9\x01\n" +
"\x13ReadFromFileRequest\x12!\n" +
"\ffile_content\x18\x01 \x01(\fR\vfileContent\x12\x1b\n" +
"\tfile_name\x18\x02 \x01(\tR\bfileName\x12\x1b\n" +
"\tfile_type\x18\x03 \x01(\tR\bfileType\x126\n" +
"\vread_config\x18\x04 \x01(\v2\x15.docreader.ReadConfigR\n" +
"readConfig\x12\x1d\n" +
"\n" +
"request_id\x18\x05 \x01(\tR\trequestId\"\x93\x01\n" +
"\x12ReadFromURLRequest\x12\x10\n" +
"\x03url\x18\x01 \x01(\tR\x03url\x12\x14\n" +
"\x05title\x18\x02 \x01(\tR\x05title\x126\n" +
"\vread_config\x18\x03 \x01(\v2\x15.docreader.ReadConfigR\n" +
"readConfig\x12\x1d\n" +
"\n" +
"request_id\x18\x04 \x01(\tR\trequestId\"\x99\x01\n" +
"\x05Image\x12\x10\n" +
"\x03url\x18\x01 \x01(\tR\x03url\x12\x18\n" +
"\acaption\x18\x02 \x01(\tR\acaption\x12\x19\n" +
"\bocr_text\x18\x03 \x01(\tR\aocrText\x12!\n" +
"\foriginal_url\x18\x04 \x01(\tR\voriginalUrl\x12\x14\n" +
"\x05start\x18\x05 \x01(\x05R\x05start\x12\x10\n" +
"\x03end\x18\x06 \x01(\x05R\x03end\"\x85\x01\n" +
"\x05Chunk\x12\x18\n" +
"\acontent\x18\x01 \x01(\tR\acontent\x12\x10\n" +
"\x03seq\x18\x02 \x01(\x05R\x03seq\x12\x14\n" +
"\x05start\x18\x03 \x01(\x05R\x05start\x12\x10\n" +
"\x03end\x18\x04 \x01(\x05R\x03end\x12(\n" +
"\x06images\x18\x05 \x03(\v2\x10.docreader.ImageR\x06images\"N\n" +
"\fReadResponse\x12(\n" +
"\x06chunks\x18\x01 \x03(\v2\x10.docreader.ChunkR\x06chunks\x12\x14\n" +
"\x05error\x18\x02 \x01(\tR\x05error*G\n" +
"\x0fStorageProvider\x12 \n" +
"\x1cSTORAGE_PROVIDER_UNSPECIFIED\x10\x00\x12\a\n" +
"\x03COS\x10\x01\x12\t\n" +
"\x05MINIO\x10\x022\x9f\x01\n" +
"\tDocReader\x12I\n" +
"\fReadFromFile\x12\x1e.docreader.ReadFromFileRequest\x1a\x17.docreader.ReadResponse\"\x00\x12G\n" +
"\vReadFromURL\x12\x1d.docreader.ReadFromURLRequest\x1a\x17.docreader.ReadResponse\"\x00B5Z3github.com/Tencent/WeKnora/internal/docreader/protob\x06proto3"
var (
file_docreader_proto_rawDescOnce sync.Once
file_docreader_proto_rawDescData []byte
)
func file_docreader_proto_rawDescGZIP() []byte {
file_docreader_proto_rawDescOnce.Do(func() {
file_docreader_proto_rawDescData = protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_docreader_proto_rawDesc), len(file_docreader_proto_rawDesc)))
})
return file_docreader_proto_rawDescData
}
var file_docreader_proto_enumTypes = make([]protoimpl.EnumInfo, 1)
var file_docreader_proto_msgTypes = make([]protoimpl.MessageInfo, 8)
var file_docreader_proto_goTypes = []any{
(StorageProvider)(0), // 0: docreader.StorageProvider
(*StorageConfig)(nil), // 1: docreader.StorageConfig
(*VLMConfig)(nil), // 2: docreader.VLMConfig
(*ReadConfig)(nil), // 3: docreader.ReadConfig
(*ReadFromFileRequest)(nil), // 4: docreader.ReadFromFileRequest
(*ReadFromURLRequest)(nil), // 5: docreader.ReadFromURLRequest
(*Image)(nil), // 6: docreader.Image
(*Chunk)(nil), // 7: docreader.Chunk
(*ReadResponse)(nil), // 8: docreader.ReadResponse
}
var file_docreader_proto_depIdxs = []int32{
0, // 0: docreader.StorageConfig.provider:type_name -> docreader.StorageProvider
1, // 1: docreader.ReadConfig.storage_config:type_name -> docreader.StorageConfig
2, // 2: docreader.ReadConfig.vlm_config:type_name -> docreader.VLMConfig
3, // 3: docreader.ReadFromFileRequest.read_config:type_name -> docreader.ReadConfig
3, // 4: docreader.ReadFromURLRequest.read_config:type_name -> docreader.ReadConfig
6, // 5: docreader.Chunk.images:type_name -> docreader.Image
7, // 6: docreader.ReadResponse.chunks:type_name -> docreader.Chunk
4, // 7: docreader.DocReader.ReadFromFile:input_type -> docreader.ReadFromFileRequest
5, // 8: docreader.DocReader.ReadFromURL:input_type -> docreader.ReadFromURLRequest
8, // 9: docreader.DocReader.ReadFromFile:output_type -> docreader.ReadResponse
8, // 10: docreader.DocReader.ReadFromURL:output_type -> docreader.ReadResponse
9, // [9:11] is the sub-list for method output_type
7, // [7:9] is the sub-list for method input_type
7, // [7:7] is the sub-list for extension type_name
7, // [7:7] is the sub-list for extension extendee
0, // [0:7] is the sub-list for field type_name
}
func init() { file_docreader_proto_init() }
func file_docreader_proto_init() {
if File_docreader_proto != nil {
return
}
type x struct{}
out := protoimpl.TypeBuilder{
File: protoimpl.DescBuilder{
GoPackagePath: reflect.TypeOf(x{}).PkgPath(),
RawDescriptor: unsafe.Slice(unsafe.StringData(file_docreader_proto_rawDesc), len(file_docreader_proto_rawDesc)),
NumEnums: 1,
NumMessages: 8,
NumExtensions: 0,
NumServices: 1,
},
GoTypes: file_docreader_proto_goTypes,
DependencyIndexes: file_docreader_proto_depIdxs,
EnumInfos: file_docreader_proto_enumTypes,
MessageInfos: file_docreader_proto_msgTypes,
}.Build()
File_docreader_proto = out.File
file_docreader_proto_goTypes = nil
file_docreader_proto_depIdxs = nil
}

89
docreader/proto/docreader.proto Normal file
View File

@@ -0,0 +1,89 @@
syntax = "proto3";
package docreader;
option go_package = "github.com/Tencent/WeKnora/internal/docreader/proto";
// Document reading service
service DocReader {
// Read a document from a file
rpc ReadFromFile(ReadFromFileRequest) returns (ReadResponse) {}
// Read a document from a URL
rpc ReadFromURL(ReadFromURLRequest) returns (ReadResponse) {}
}
// Object storage provider
enum StorageProvider {
STORAGE_PROVIDER_UNSPECIFIED = 0;
COS = 1; // Tencent Cloud COS
MINIO = 2; // MinIO/S3 compatible
}
// Generic object storage configuration, compatible with both COS and MinIO
message StorageConfig {
StorageProvider provider = 1; // Storage provider
string region = 2; // Region (used by COS)
string bucket_name = 3; // Bucket name
string access_key_id = 4; // Access key ID (used by MinIO/S3)
string secret_access_key = 5; // Secret access key (used by MinIO/S3)
string app_id = 6; // App ID (used by COS)
string path_prefix = 7; // Path prefix
}
// VLM configuration
message VLMConfig {
string model_name = 1; // VLM Model Name
string base_url = 2; // VLM Base URL
string api_key = 3; // VLM API Key
string interface_type = 4; // VLM Interface Type: "ollama" or "openai"
}
message ReadConfig {
int32 chunk_size = 1; // Chunk size
int32 chunk_overlap = 2; // Chunk overlap
repeated string separators = 3; // Separators
bool enable_multimodal = 4; // Enable multimodal processing
StorageConfig storage_config = 5; // Object storage configuration (generic)
VLMConfig vlm_config = 6; // VLM configuration
}
// Request to read a document from a file
message ReadFromFileRequest {
bytes file_content = 1; // File content
string file_name = 2; // File name
string file_type = 3; // File type
ReadConfig read_config = 4;
string request_id = 5;
}
// Request to read a document from a URL
message ReadFromURLRequest {
string url = 1; // Document URL
string title = 2; // Title
ReadConfig read_config = 3;
string request_id = 4;
}
// Image information
message Image {
string url = 1; // Image URL
string caption = 2; // Image caption
string ocr_text = 3; // Text extracted via OCR
string original_url = 4; // Original image URL
int32 start = 5; // Start position of the image in the text
int32 end = 6; // End position of the image in the text
}
message Chunk {
string content = 1; // Chunk content
int32 seq = 2; // Order of the chunk within the document
int32 start = 3; // Start position of the chunk in the document
int32 end = 4; // End position of the chunk in the document
repeated Image images = 5; // Images contained in the chunk
}
// Response for reading a document from a URL
message ReadResponse {
repeated Chunk chunks = 1; // Document chunks
string error = 2; // Error message
}
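To make the contract concrete, a hedged Python sketch of the file-upload RPC; the field names come from the messages above, while the module paths and server address are assumptions:

```python
# Hedged sketch of ReadFromFile against the service defined above.
import grpc

from proto import docreader_pb2, docreader_pb2_grpc

with open("report.pdf", "rb") as f:
    request = docreader_pb2.ReadFromFileRequest(
        file_content=f.read(),
        file_name="report.pdf",
        file_type="pdf",
        read_config=docreader_pb2.ReadConfig(chunk_size=512, chunk_overlap=64),
        request_id="demo-001",
    )

stub = docreader_pb2_grpc.DocReaderStub(grpc.insecure_channel("localhost:50051"))
response = stub.ReadFromFile(request)
for chunk in response.chunks:
    print(chunk.seq, chunk.start, chunk.end, len(chunk.content))
```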

167
docreader/proto/docreader_grpc.pb.go Normal file
View File

@@ -0,0 +1,167 @@
// Code generated by protoc-gen-go-grpc. DO NOT EDIT.
// versions:
// - protoc-gen-go-grpc v1.5.1
// - protoc v5.29.3
// source: docreader.proto
package proto
import (
context "context"
grpc "google.golang.org/grpc"
codes "google.golang.org/grpc/codes"
status "google.golang.org/grpc/status"
)
// This is a compile-time assertion to ensure that this generated file
// is compatible with the grpc package it is being compiled against.
// Requires gRPC-Go v1.64.0 or later.
const _ = grpc.SupportPackageIsVersion9
const (
DocReader_ReadFromFile_FullMethodName = "/docreader.DocReader/ReadFromFile"
DocReader_ReadFromURL_FullMethodName = "/docreader.DocReader/ReadFromURL"
)
// DocReaderClient is the client API for DocReader service.
//
// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
//
// Document reading service
type DocReaderClient interface {
// Read a document from a file
ReadFromFile(ctx context.Context, in *ReadFromFileRequest, opts ...grpc.CallOption) (*ReadResponse, error)
// Read a document from a URL
ReadFromURL(ctx context.Context, in *ReadFromURLRequest, opts ...grpc.CallOption) (*ReadResponse, error)
}
type docReaderClient struct {
cc grpc.ClientConnInterface
}
func NewDocReaderClient(cc grpc.ClientConnInterface) DocReaderClient {
return &docReaderClient{cc}
}
func (c *docReaderClient) ReadFromFile(ctx context.Context, in *ReadFromFileRequest, opts ...grpc.CallOption) (*ReadResponse, error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
out := new(ReadResponse)
err := c.cc.Invoke(ctx, DocReader_ReadFromFile_FullMethodName, in, out, cOpts...)
if err != nil {
return nil, err
}
return out, nil
}
func (c *docReaderClient) ReadFromURL(ctx context.Context, in *ReadFromURLRequest, opts ...grpc.CallOption) (*ReadResponse, error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
out := new(ReadResponse)
err := c.cc.Invoke(ctx, DocReader_ReadFromURL_FullMethodName, in, out, cOpts...)
if err != nil {
return nil, err
}
return out, nil
}
// DocReaderServer is the server API for DocReader service.
// All implementations must embed UnimplementedDocReaderServer
// for forward compatibility.
//
// Document reading service
type DocReaderServer interface {
// Read a document from a file
ReadFromFile(context.Context, *ReadFromFileRequest) (*ReadResponse, error)
// Read a document from a URL
ReadFromURL(context.Context, *ReadFromURLRequest) (*ReadResponse, error)
mustEmbedUnimplementedDocReaderServer()
}
// UnimplementedDocReaderServer must be embedded to have
// forward compatible implementations.
//
// NOTE: this should be embedded by value instead of pointer to avoid a nil
// pointer dereference when methods are called.
type UnimplementedDocReaderServer struct{}
func (UnimplementedDocReaderServer) ReadFromFile(context.Context, *ReadFromFileRequest) (*ReadResponse, error) {
return nil, status.Errorf(codes.Unimplemented, "method ReadFromFile not implemented")
}
func (UnimplementedDocReaderServer) ReadFromURL(context.Context, *ReadFromURLRequest) (*ReadResponse, error) {
return nil, status.Errorf(codes.Unimplemented, "method ReadFromURL not implemented")
}
func (UnimplementedDocReaderServer) mustEmbedUnimplementedDocReaderServer() {}
func (UnimplementedDocReaderServer) testEmbeddedByValue() {}
// UnsafeDocReaderServer may be embedded to opt out of forward compatibility for this service.
// Use of this interface is not recommended, as added methods to DocReaderServer will
// result in compilation errors.
type UnsafeDocReaderServer interface {
mustEmbedUnimplementedDocReaderServer()
}
func RegisterDocReaderServer(s grpc.ServiceRegistrar, srv DocReaderServer) {
// If the following call panics, it indicates UnimplementedDocReaderServer was
// embedded by pointer and is nil. This will cause panics if an
// unimplemented method is ever invoked, so we test this at initialization
// time to prevent it from happening at runtime later due to I/O.
if t, ok := srv.(interface{ testEmbeddedByValue() }); ok {
t.testEmbeddedByValue()
}
s.RegisterService(&DocReader_ServiceDesc, srv)
}
func _DocReader_ReadFromFile_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
in := new(ReadFromFileRequest)
if err := dec(in); err != nil {
return nil, err
}
if interceptor == nil {
return srv.(DocReaderServer).ReadFromFile(ctx, in)
}
info := &grpc.UnaryServerInfo{
Server: srv,
FullMethod: DocReader_ReadFromFile_FullMethodName,
}
handler := func(ctx context.Context, req interface{}) (interface{}, error) {
return srv.(DocReaderServer).ReadFromFile(ctx, req.(*ReadFromFileRequest))
}
return interceptor(ctx, in, info, handler)
}
func _DocReader_ReadFromURL_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
in := new(ReadFromURLRequest)
if err := dec(in); err != nil {
return nil, err
}
if interceptor == nil {
return srv.(DocReaderServer).ReadFromURL(ctx, in)
}
info := &grpc.UnaryServerInfo{
Server: srv,
FullMethod: DocReader_ReadFromURL_FullMethodName,
}
handler := func(ctx context.Context, req interface{}) (interface{}, error) {
return srv.(DocReaderServer).ReadFromURL(ctx, req.(*ReadFromURLRequest))
}
return interceptor(ctx, in, info, handler)
}
// DocReader_ServiceDesc is the grpc.ServiceDesc for DocReader service.
// It's only intended for direct use with grpc.RegisterService,
// and not to be introspected or modified (even as a copy)
var DocReader_ServiceDesc = grpc.ServiceDesc{
ServiceName: "docreader.DocReader",
HandlerType: (*DocReaderServer)(nil),
Methods: []grpc.MethodDesc{
{
MethodName: "ReadFromFile",
Handler: _DocReader_ReadFromFile_Handler,
},
{
MethodName: "ReadFromURL",
Handler: _DocReader_ReadFromURL_Handler,
},
},
Streams: []grpc.StreamDesc{},
Metadata: "docreader.proto",
}

55
docreader/proto/docreader_pb2.py Normal file
View File

@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# NO CHECKED-IN PROTOBUF GENCODE
# source: docreader.proto
# Protobuf Python Version: 6.31.1
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import runtime_version as _runtime_version
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
_runtime_version.ValidateProtobufRuntimeVersion(
_runtime_version.Domain.PUBLIC,
6,
31,
1,
'',
'docreader.proto'
)
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0f\x64ocreader.proto\x12\tdocreader\"\xb9\x01\n\rStorageConfig\x12,\n\x08provider\x18\x01 \x01(\x0e\x32\x1a.docreader.StorageProvider\x12\x0e\n\x06region\x18\x02 \x01(\t\x12\x13\n\x0b\x62ucket_name\x18\x03 \x01(\t\x12\x15\n\raccess_key_id\x18\x04 \x01(\t\x12\x19\n\x11secret_access_key\x18\x05 \x01(\t\x12\x0e\n\x06\x61pp_id\x18\x06 \x01(\t\x12\x13\n\x0bpath_prefix\x18\x07 \x01(\t\"Z\n\tVLMConfig\x12\x12\n\nmodel_name\x18\x01 \x01(\t\x12\x10\n\x08\x62\x61se_url\x18\x02 \x01(\t\x12\x0f\n\x07\x61pi_key\x18\x03 \x01(\t\x12\x16\n\x0einterface_type\x18\x04 \x01(\t\"\xc2\x01\n\nReadConfig\x12\x12\n\nchunk_size\x18\x01 \x01(\x05\x12\x15\n\rchunk_overlap\x18\x02 \x01(\x05\x12\x12\n\nseparators\x18\x03 \x03(\t\x12\x19\n\x11\x65nable_multimodal\x18\x04 \x01(\x08\x12\x30\n\x0estorage_config\x18\x05 \x01(\x0b\x32\x18.docreader.StorageConfig\x12(\n\nvlm_config\x18\x06 \x01(\x0b\x32\x14.docreader.VLMConfig\"\x91\x01\n\x13ReadFromFileRequest\x12\x14\n\x0c\x66ile_content\x18\x01 \x01(\x0c\x12\x11\n\tfile_name\x18\x02 \x01(\t\x12\x11\n\tfile_type\x18\x03 \x01(\t\x12*\n\x0bread_config\x18\x04 \x01(\x0b\x32\x15.docreader.ReadConfig\x12\x12\n\nrequest_id\x18\x05 \x01(\t\"p\n\x12ReadFromURLRequest\x12\x0b\n\x03url\x18\x01 \x01(\t\x12\r\n\x05title\x18\x02 \x01(\t\x12*\n\x0bread_config\x18\x03 \x01(\x0b\x32\x15.docreader.ReadConfig\x12\x12\n\nrequest_id\x18\x04 \x01(\t\"i\n\x05Image\x12\x0b\n\x03url\x18\x01 \x01(\t\x12\x0f\n\x07\x63\x61ption\x18\x02 \x01(\t\x12\x10\n\x08ocr_text\x18\x03 \x01(\t\x12\x14\n\x0coriginal_url\x18\x04 \x01(\t\x12\r\n\x05start\x18\x05 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x06 \x01(\x05\"c\n\x05\x43hunk\x12\x0f\n\x07\x63ontent\x18\x01 \x01(\t\x12\x0b\n\x03seq\x18\x02 \x01(\x05\x12\r\n\x05start\x18\x03 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x04 \x01(\x05\x12 \n\x06images\x18\x05 \x03(\x0b\x32\x10.docreader.Image\"?\n\x0cReadResponse\x12 \n\x06\x63hunks\x18\x01 \x03(\x0b\x32\x10.docreader.Chunk\x12\r\n\x05\x65rror\x18\x02 \x01(\t*G\n\x0fStorageProvider\x12 \n\x1cSTORAGE_PROVIDER_UNSPECIFIED\x10\x00\x12\x07\n\x03\x43OS\x10\x01\x12\t\n\x05MINIO\x10\x02\x32\x9f\x01\n\tDocReader\x12I\n\x0cReadFromFile\x12\x1e.docreader.ReadFromFileRequest\x1a\x17.docreader.ReadResponse\"\x00\x12G\n\x0bReadFromURL\x12\x1d.docreader.ReadFromURLRequest\x1a\x17.docreader.ReadResponse\"\x00\x42\x35Z3github.com/Tencent/WeKnora/internal/docreader/protob\x06proto3')
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'docreader_pb2', _globals)
if not _descriptor._USE_C_DESCRIPTORS:
_globals['DESCRIPTOR']._loaded_options = None
_globals['DESCRIPTOR']._serialized_options = b'Z3github.com/Tencent/WeKnora/internal/docreader/proto'
_globals['_STORAGEPROVIDER']._serialized_start=1042
_globals['_STORAGEPROVIDER']._serialized_end=1113
_globals['_STORAGECONFIG']._serialized_start=31
_globals['_STORAGECONFIG']._serialized_end=216
_globals['_VLMCONFIG']._serialized_start=218
_globals['_VLMCONFIG']._serialized_end=308
_globals['_READCONFIG']._serialized_start=311
_globals['_READCONFIG']._serialized_end=505
_globals['_READFROMFILEREQUEST']._serialized_start=508
_globals['_READFROMFILEREQUEST']._serialized_end=653
_globals['_READFROMURLREQUEST']._serialized_start=655
_globals['_READFROMURLREQUEST']._serialized_end=767
_globals['_IMAGE']._serialized_start=769
_globals['_IMAGE']._serialized_end=874
_globals['_CHUNK']._serialized_start=876
_globals['_CHUNK']._serialized_end=975
_globals['_READRESPONSE']._serialized_start=977
_globals['_READRESPONSE']._serialized_end=1040
_globals['_DOCREADER']._serialized_start=1116
_globals['_DOCREADER']._serialized_end=1275
# @@protoc_insertion_point(module_scope)

145
docreader/proto/docreader_pb2_grpc.py Normal file
View File

@@ -0,0 +1,145 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import warnings
from . import docreader_pb2 as docreader__pb2
GRPC_GENERATED_VERSION = '1.76.0'
GRPC_VERSION = grpc.__version__
_version_not_supported = False
try:
from grpc._utilities import first_version_is_lower
_version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
except ImportError:
_version_not_supported = True
if _version_not_supported:
raise RuntimeError(
f'The grpc package installed is at version {GRPC_VERSION},'
+ ' but the generated code in docreader_pb2_grpc.py depends on'
+ f' grpcio>={GRPC_GENERATED_VERSION}.'
+ f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
+ f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
)
class DocReaderStub(object):
"""文档读取服务
"""
def __init__(self, channel):
"""Constructor.
Args:
channel: A grpc.Channel.
"""
self.ReadFromFile = channel.unary_unary(
'/docreader.DocReader/ReadFromFile',
request_serializer=docreader__pb2.ReadFromFileRequest.SerializeToString,
response_deserializer=docreader__pb2.ReadResponse.FromString,
_registered_method=True)
self.ReadFromURL = channel.unary_unary(
'/docreader.DocReader/ReadFromURL',
request_serializer=docreader__pb2.ReadFromURLRequest.SerializeToString,
response_deserializer=docreader__pb2.ReadResponse.FromString,
_registered_method=True)
class DocReaderServicer(object):
"""文档读取服务
"""
def ReadFromFile(self, request, context):
"""从文件读取文档
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def ReadFromURL(self, request, context):
"""从URL读取文档
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def add_DocReaderServicer_to_server(servicer, server):
rpc_method_handlers = {
'ReadFromFile': grpc.unary_unary_rpc_method_handler(
servicer.ReadFromFile,
request_deserializer=docreader__pb2.ReadFromFileRequest.FromString,
response_serializer=docreader__pb2.ReadResponse.SerializeToString,
),
'ReadFromURL': grpc.unary_unary_rpc_method_handler(
servicer.ReadFromURL,
request_deserializer=docreader__pb2.ReadFromURLRequest.FromString,
response_serializer=docreader__pb2.ReadResponse.SerializeToString,
),
}
generic_handler = grpc.method_handlers_generic_handler(
'docreader.DocReader', rpc_method_handlers)
server.add_generic_rpc_handlers((generic_handler,))
server.add_registered_method_handlers('docreader.DocReader', rpc_method_handlers)
# This class is part of an EXPERIMENTAL API.
class DocReader(object):
"""文档读取服务
"""
@staticmethod
def ReadFromFile(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(
request,
target,
'/docreader.DocReader/ReadFromFile',
docreader__pb2.ReadFromFileRequest.SerializeToString,
docreader__pb2.ReadResponse.FromString,
options,
channel_credentials,
insecure,
call_credentials,
compression,
wait_for_ready,
timeout,
metadata,
_registered_method=True)
@staticmethod
def ReadFromURL(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(
request,
target,
'/docreader.DocReader/ReadFromURL',
docreader__pb2.ReadFromURLRequest.SerializeToString,
docreader__pb2.ReadResponse.FromString,
options,
channel_credentials,
insecure,
call_credentials,
compression,
wait_for_ready,
timeout,
metadata,
_registered_method=True)
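For orientation, how the generated registration hook above is typically wired into a server. The servicer below is a stand-in for illustration, not the real implementation from this commit:

```python
# Hedged sketch: a toy servicer that echoes the URL back as a single chunk.
from concurrent import futures

import grpc

from proto import docreader_pb2, docreader_pb2_grpc


class EchoDocReader(docreader_pb2_grpc.DocReaderServicer):
    def ReadFromURL(self, request, context):
        chunk = docreader_pb2.Chunk(content=request.url, seq=0)
        return docreader_pb2.ReadResponse(chunks=[chunk])


server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
docreader_pb2_grpc.add_DocReaderServicer_to_server(EchoDocReader(), server)
server.add_insecure_port("[::]:50051")
server.start()
server.wait_for_termination()
```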

35
docreader/pyproject.toml Normal file
View File

@@ -0,0 +1,35 @@
[project]
name = "docreader"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10.18"
dependencies = [
"antiword>=0.1.0",
"asyncio>=4.0.0",
"beautifulsoup4>=4.14.2",
"cos-python-sdk-v5>=1.9.38",
"goose3[all]>=3.1.20",
"grpcio>=1.76.0",
"grpcio-health-checking>=1.76.0",
"grpcio-tools>=1.76.0",
"lxml>=6.0.2",
"markdown>=3.10",
"markdownify>=1.2.0",
"minio>=7.2.18",
"mistletoe>=1.5.0",
"ollama>=0.6.0",
"openai>=2.7.1",
"paddleocr>=2.10.0,<3.0.0",
"paddlepaddle>=3.0.0,<4.0.0",
"pdfplumber>=0.11.7",
"pillow>=12.0.0",
"playwright>=1.55.0",
"protobuf>=6.33.0",
"pypdf>=6.1.3",
"pypdf2>=3.0.1",
"python-docx>=1.2.0",
"requests>=2.32.5",
"textract==1.5.0",
"urllib3>=2.5.0",
]

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import os
import logging
from paddleocr import PaddleOCR
# Add the current directory to the Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
if current_dir not in sys.path:
sys.path.append(current_dir)
# Import ImageParser
from parser.image_parser import ImageParser
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)
def init_ocr_model():
"""Initialize PaddleOCR model to pre-download and cache models"""
try:
logger.info("Initializing PaddleOCR model for pre-download...")
# Use the same configuration as the service code
ocr_config = {
"use_gpu": False,
"text_det_limit_type": "max",
"text_det_limit_side_len": 960,
"use_doc_orientation_classify": True, # 启用文档方向分类
"use_doc_unwarping": False,
"use_textline_orientation": True, # 启用文本行方向检测
"text_recognition_model_name": "PP-OCRv4_server_rec",
"text_detection_model_name": "PP-OCRv4_server_det",
"text_det_thresh": 0.3,
"text_det_box_thresh": 0.6,
"text_det_unclip_ratio": 1.5,
"text_rec_score_thresh": 0.0,
"ocr_version": "PP-OCRv4",
"lang": "ch",
"show_log": False,
"use_dilation": True,
"det_db_score_mode": "slow",
}
# Initialize PaddleOCR; this triggers model download and caching
ocr = PaddleOCR(**ocr_config)
logger.info("PaddleOCR model initialization completed successfully")
# Exercise OCR once to make sure the downloaded models work
import numpy as np
from PIL import Image
# Create a simple blank test image
test_image = np.ones((100, 300, 3), dtype=np.uint8) * 255
test_pil = Image.fromarray(test_image)
# Run a single OCR pass as a smoke test
result = ocr.ocr(np.array(test_pil), cls=False)
logger.info("PaddleOCR test completed successfully")
except Exception as e:
logger.error(f"Failed to initialize PaddleOCR model: {str(e)}")
raise
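For reference, a hedged usage sketch of the warm-up above: once the models are cached, the same PaddleOCR 2.x call style can run against a real image. The constructor arguments mirror those in the config above; the image path points at the test fixture added later in this commit, and the result layout assumes the 2.x API pinned in pyproject.toml (paddleocr>=2.10,<3.0).

```python
# Usage sketch, assuming the PaddleOCR 2.x API pinned in pyproject.toml.
from paddleocr import PaddleOCR

ocr = PaddleOCR(use_gpu=False, lang="ch", show_log=False)
pages = ocr.ocr("testdata/images/test_text.png", cls=False)
for line in pages[0] or []:  # pages[0] is None when nothing is detected
    box, (text, score) = line
    print(f"{score:.2f} {text}")
```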

View File

@@ -0,0 +1,31 @@
#!/bin/bash
set -x
# Set directories
PROTO_DIR="proto"
PYTHON_OUT="proto"
GO_OUT="proto"
# Generate Python code
python3 -m grpc_tools.protoc -I${PROTO_DIR} \
--python_out=${PYTHON_OUT} \
--grpc_python_out=${PYTHON_OUT} \
${PROTO_DIR}/docreader.proto
# Generate Go code
protoc -I${PROTO_DIR} --go_out=${GO_OUT} \
--go_opt=paths=source_relative \
--go-grpc_out=${GO_OUT} \
--go-grpc_opt=paths=source_relative \
${PROTO_DIR}/docreader.proto
# Fix the generated Python import (macOS needs a different sed -i syntax)
if [ "$(uname)" == "Darwin" ]; then
# macOS version
sed -i '' 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
else
# Linux version
sed -i 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
fi
echo "Proto files generated successfully!"

BIN
docreader/testdata/images/test_text.png vendored Normal file

Binary file not shown.

Width: | Height: | Size: 1.8 KiB

56
docreader/testdata/test.html vendored Normal file
View File

@@ -0,0 +1,56 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>测试 HTML 文档</title>
</head>
<body>
<h1>测试 HTML 文档</h1>
<p>这是一个测试 HTML 文档,用于测试 HTML 解析功能。</p>
<h2>包含图片</h2>
<img src="https://example.com/image.jpg" alt="测试图片">
<h2>包含链接</h2>
<p>这是一个<a href="https://example.com">测试链接</a></p>
<h2>包含代码块</h2>
<pre><code>
def hello_world():
print("Hello, World!")
</code></pre>
<h2>包含表格</h2>
<table>
<thead>
<tr>
<th>表头1</th>
<th>表头2</th>
</tr>
</thead>
<tbody>
<tr>
<td>内容1</td>
<td>内容2</td>
</tr>
<tr>
<td>内容3</td>
<td>内容4</td>
</tr>
</tbody>
</table>
<h2>测试分块功能</h2>
<p>这部分内容用于测试分块功能,确保 HTML 结构在分块时保持完整。</p>
<ul>
<li>第一块内容</li>
<li>第二块内容</li>
<li>第三块内容</li>
</ul>
<h2>测试重叠功能</h2>
<p>这部分内容可能会在分块时与前后块重叠,以确保上下文的连续性。</p>
</body>
</html>

37
docreader/testdata/test.md vendored Normal file
View File

@@ -0,0 +1,37 @@
# 测试 Markdown 文档
这是一个测试 Markdown 文档,用于测试 Markdown 解析功能。
## 包含图片
![测试图片](https://geektutu.com/post/quick-go-protobuf/go-protobuf.jpg)
## 包含链接
这是一个[测试链接](https://example.com)。
## 包含代码块
```python
def hello_world():
print("Hello, World!")
```
## 包含表格
| 表头1 | 表头2 |
|-------|-------|
| 内容1 | 内容2 |
| 内容3 | 内容4 |
## 测试分块功能
这部分内容用于测试分块功能,确保 Markdown 结构在分块时保持完整。
- 第一块内容
- 第二块内容
- 第三块内容
## 测试重叠功能
这部分内容可能会在分块时与前后块重叠,以确保上下文的连续性。

16
docreader/testdata/test.txt vendored Normal file
View File

@@ -0,0 +1,16 @@
这是一个测试文档
包含多行内容
用于测试文档解析功能
这个文档包含以下内容:
1. 基本文本内容
2. 多行段落
3. 列表项
测试分块功能:
- 第一块内容
- 第二块内容
- 第三块内容
测试重叠功能:
这部分内容可能会在分块时与前后块重叠,以确保上下文的连续性。

19
docreader/testdata/test_download.txt vendored Normal file
View File

@@ -0,0 +1,19 @@
这是一个测试文档
包含多行内容
用于测试文档解析功能
这个文档包含以下内容:
1. 基本文本内容
2. 多行段落
3. 列表项
测试分块功能:
- 第一块内容
- 第二块内容
- 第三块内容
测试重叠功能:
这部分内容可能会在分块时与前后块重叠,以确保上下文的连续性。
test

View File

@@ -0,0 +1,83 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import re
import logging
# Configure logging
logger = logging.getLogger(__name__)
def singleton(cls, *args, **kw):
instances = {}
def _singleton():
key = str(cls) + str(os.getpid())
if key not in instances:
logger.info(f"Creating new singleton instance with key: {key}")
instances[key] = cls(*args, **kw)
else:
logger.info(f"Returning existing singleton instance with key: {key}")
return instances[key]
return _singleton
def rmSpace(txt):
logger.info(f"Removing spaces from text of length: {len(txt)}")
txt = re.sub(r"([^a-z0-9.,\)>]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
return re.sub(r"([^ ]) +([^a-z0-9.,\(<])", r"\1\2", txt, flags=re.IGNORECASE)
def findMaxDt(fnm):
m = "1970-01-01 00:00:00"
logger.info(f"Finding maximum date in file: {fnm}")
try:
with open(fnm, "r") as f:
while True:
l = f.readline()
if not l:
break
l = l.strip("\n")
if l == "nan":
continue
if l > m:
m = l
logger.info(f"Maximum date found: {m}")
except Exception as e:
logger.error(f"Error reading file {fnm} for max date: {str(e)}")
return m
def findMaxTm(fnm):
m = 0
logger.info(f"Finding maximum time in file: {fnm}")
try:
with open(fnm, "r") as f:
while True:
l = f.readline()
if not l:
break
l = l.strip("\n")
if l == "nan":
continue
if int(l) > m:
m = int(l)
logger.info(f"Maximum time found: {m}")
except Exception as e:
logger.error(f"Error reading file {fnm} for max time: {str(e)}")
return m
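A short illustration of the helpers above (not part of the commit): `singleton` caches one instance per class per process, and `rmSpace` removes spaces adjacent to non-alphanumeric text while keeping normal word spacing. `OcrEngine` is a made-up class standing in for an expensive-to-construct object.

```python
# Illustrative only -- OcrEngine is a hypothetical placeholder class.
@singleton
class OcrEngine:
    def __init__(self):
        self.ready = True  # stand-in for heavyweight model loading

a = OcrEngine()
b = OcrEngine()
assert a is b  # one instance per class per process

# Spaces touching non-alphanumeric characters (e.g. CJK text) are removed:
print(rmSpace("中文 文本"))  # -> "中文文本"
```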

149
docreader/utils/request.py Normal file
View File

@@ -0,0 +1,149 @@
from contextvars import ContextVar
import logging
import uuid
import contextlib
import time
from typing import Optional
from logging import LogRecord
# Configure logging
logger = logging.getLogger(__name__)
# Context variables for the current request
request_id_var = ContextVar("request_id", default=None)
_request_start_time_ctx = ContextVar("request_start_time", default=None)
def set_request_id(request_id: str) -> None:
"""设置当前上下文的请求ID"""
request_id_var.set(request_id)
def get_request_id() -> Optional[str]:
"""获取当前上下文的请求ID"""
return request_id_var.get()
class MillisecondFormatter(logging.Formatter):
"""自定义日志格式化器,只显示毫秒级时间戳(3位数字)而不是微秒(6位)"""
def formatTime(self, record, datefmt=None):
"""重写formatTime方法将微秒格式化为毫秒"""
# 先获取标准的格式化时间
result = super().formatTime(record, datefmt)
# 如果使用了包含.%f的格式则将微秒(6位)截断为毫秒(3位)
if datefmt and ".%f" in datefmt:
# 格式化的时间字符串应该在最后有6位微秒数
parts = result.split('.')
if len(parts) > 1 and len(parts[1]) >= 6:
# 只保留前3位作为毫秒
millis = parts[1][:3]
result = f"{parts[0]}.{millis}"
return result
def init_logging_request_id():
"""
Initialize logging to include request ID in log messages.
Add the custom filter to all existing handlers
"""
logger.info("Initializing request ID logging")
root_logger = logging.getLogger()
# Add the custom filter to all existing handlers
for handler in root_logger.handlers:
# Attach the request ID filter
handler.addFilter(RequestIdFilter())
# Update the formatter to include the request ID (compact, aligned layout)
formatter = logging.Formatter(
fmt="%(asctime)s.%(msecs)03d [%(request_id)s] %(levelname)-5s %(name)-20s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
handler.setFormatter(formatter)
logger.info(
f"Updated {len(root_logger.handlers)} handlers with request ID formatting"
)
# If no handlers exist yet, add a stdout StreamHandler
if not root_logger.handlers:
handler = logging.StreamHandler()
formatter = logging.Formatter(
fmt="%(asctime)s.%(msecs)03d [%(request_id)s] %(levelname)-5s %(name)-20s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
handler.setFormatter(formatter)
handler.addFilter(RequestIdFilter())
root_logger.addHandler(handler)
logger.info("Added new StreamHandler with request ID formatting")
class RequestIdFilter(logging.Filter):
"""Filter that adds request ID to log messages"""
def filter(self, record: LogRecord) -> bool:
request_id = request_id_var.get()
if request_id is not None:
# Attach the request ID to the record, using a short form
if len(request_id) > 8:
# Keep the first 8 characters so the output stays aligned
short_id = request_id[:8]
if "-" in request_id:
# Try to preserve the structure, e.g. test-req-1-XXX
parts = request_id.split("-")
if len(parts) >= 3:
# For IDs shaped like xxx-xxx-n-randompart
short_id = f"{parts[0]}-{parts[1]}-{parts[2]}"
record.request_id = short_id
else:
record.request_id = request_id
# Attach the elapsed time for this request
start_time = _request_start_time_ctx.get()
if start_time is not None:
elapsed_ms = int((time.time() - start_time) * 1000)
record.elapsed_ms = elapsed_ms
# Append the elapsed time to the log message
if not hasattr(record, "message_with_elapsed"):
record.message_with_elapsed = True
record.msg = f"{record.msg} (elapsed: {elapsed_ms}ms)"
else:
# No request ID in context; fall back to a placeholder
record.request_id = "no-req-id"
return True
@contextlib.contextmanager
def request_id_context(request_id: Optional[str] = None):
"""Context manager that sets a request ID for the current context.
Args:
request_id: The request ID to use; if None, one is generated automatically.
Example:
with request_id_context("req-123"):
# Every log line in this block carries the request ID req-123
logging.info("Processing request")
"""
# Generate or use provided request ID
req_id = request_id or str(uuid.uuid4())
# Set start time and request ID
start_time = time.time()
req_token = request_id_var.set(req_id)
time_token = _request_start_time_ctx.set(start_time)
logger.info(f"Starting new request with ID: {req_id}")
try:
yield request_id_var.get()
finally:
# Log completion and reset context vars
elapsed_ms = int((time.time() - start_time) * 1000)
logger.info(f"Request {req_id} completed in {elapsed_ms}ms")
request_id_var.reset(req_token)
_request_start_time_ctx.reset(time_token)
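A hedged end-to-end sketch of the logging flow above. The import path is an assumption based on this file's location (docreader/utils/request.py), i.e. it presumes the process starts from the docreader/ source root.

```python
# Usage sketch: initialize request-ID logging, then tag blocks of work.
# The utils.request import path is an assumption based on the file layout.
import logging
from utils.request import init_logging_request_id, request_id_context

logging.basicConfig(level=logging.INFO)
init_logging_request_id()
log = logging.getLogger("demo")

with request_id_context():  # auto-generates a UUID and times the block
    log.info("parsing document")  # rendered with a shortened request ID

with request_id_context("req-42"):  # short IDs (<= 8 chars) are shown as-is
    log.info("second request")
```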

3228
docreader/uv.lock generated Normal file

File diff suppressed because it is too large Load Diff