chore(docreader): reorganize module files

begoniezhao
2025-11-05 11:33:50 +08:00
parent 0d790ffedc
commit c1f731e026
43 changed files with 3435 additions and 2398 deletions

docreader/Makefile Normal file

@@ -0,0 +1,23 @@
.PHONY: proto build run docker-build docker-run clean
# Generate protobuf code
proto:
@echo "Generating protobuf code..."
@sh ./scripts/generate_proto.sh
# Build the Go client
build:
@echo "Building Go client..."
@go build -o bin/client ./src/client
# Run the Python server
run:
@echo "Running Python server..."
@python src/server/server.py
# Clean up build artifacts and caches
clean:
@echo "Cleaning up..."
@rm -rf bin/
@find . -name "*.pyc" -delete
@find . -name "__pycache__" -delete

docreader/README.md Normal file

docreader/client/client.go Normal file

@@ -0,0 +1,115 @@
package client
import (
"fmt"
"log"
"os"
"time"
"github.com/Tencent/WeKnora/services/docreader/src/proto"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"google.golang.org/grpc/resolver"
)
const (
maxMessageSize = 50 * 1024 * 1024 // 50MB
)
var (
// Logger is the default logger used by the client
Logger = log.New(os.Stdout, "[DocReader] ", log.LstdFlags|log.Lmicroseconds)
)
// ImageInfo describes a single image extracted from a document
type ImageInfo struct {
	URL         string // image URL (COS)
	Caption     string // image caption
	OCRText     string // text extracted via OCR
	OriginalURL string // original image URL
	Start       int    // start offset of the image in the text
	End         int    // end offset of the image in the text
}
// Client represents a DocReader service client
type Client struct {
conn *grpc.ClientConn
proto.DocReaderClient
debug bool
}
// NewClient creates a new DocReader client with the specified address
func NewClient(addr string) (*Client, error) {
Logger.Printf("INFO: Creating new DocReader client connecting to %s", addr)
	// Set message size limits
opts := []grpc.DialOption{
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy":"round_robin"}`),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(maxMessageSize),
grpc.MaxCallSendMsgSize(maxMessageSize),
),
}
resolver.SetDefaultScheme("dns")
startTime := time.Now()
conn, err := grpc.Dial("dns:///"+addr, opts...)
if err != nil {
Logger.Printf("ERROR: Failed to connect to DocReader service: %v", err)
return nil, err
}
Logger.Printf("INFO: Successfully connected to DocReader service in %v", time.Since(startTime))
return &Client{
conn: conn,
DocReaderClient: proto.NewDocReaderClient(conn),
debug: false,
}, nil
}
// Close closes the client connection
func (c *Client) Close() error {
Logger.Printf("INFO: Closing DocReader client connection")
return c.conn.Close()
}
// SetDebug enables or disables debug logging
func (c *Client) SetDebug(debug bool) {
c.debug = debug
Logger.Printf("INFO: Debug logging set to %v", debug)
}
// Log logs a message with the appropriate level
func (c *Client) Log(level string, format string, args ...interface{}) {
if level == "DEBUG" && !c.debug {
return
}
Logger.Printf("%s: %s", level, fmt.Sprintf(format, args...))
}
// GetImagesFromChunk extracts all image info from a Chunk
func GetImagesFromChunk(chunk *proto.Chunk) []ImageInfo {
if chunk == nil || len(chunk.Images) == 0 {
return nil
}
images := make([]ImageInfo, 0, len(chunk.Images))
for _, img := range chunk.Images {
images = append(images, ImageInfo{
URL: img.Url,
Caption: img.Caption,
OCRText: img.OcrText,
OriginalURL: img.OriginalUrl,
Start: int(img.Start),
End: int(img.End),
})
}
return images
}
// HasImagesInChunk reports whether a Chunk contains any images
func HasImagesInChunk(chunk *proto.Chunk) bool {
return chunk != nil && len(chunk.Images) > 0
}


@@ -0,0 +1,154 @@
package client
import (
"context"
"log"
"os"
"testing"
"time"
"github.com/Tencent/WeKnora/services/docreader/src/proto"
)
func init() {
	// Configure test logging
log.SetOutput(os.Stdout)
log.SetFlags(log.LstdFlags | log.Lmicroseconds | log.Lshortfile)
log.Println("INFO: Initializing DocReader client tests")
}
func TestReadFromURL(t *testing.T) {
log.Println("INFO: Starting TestReadFromURL")
	// Create a test client
log.Println("INFO: Creating test client")
client, err := NewClient("localhost:50051")
if err != nil {
log.Printf("ERROR: Failed to create client: %v", err)
t.Fatalf("Failed to create client: %v", err)
}
defer client.Close()
	// Enable debug logging
client.SetDebug(true)
	// Exercise the ReadFromURL method
log.Println("INFO: Sending ReadFromURL request to server")
startTime := time.Now()
resp, err := client.ReadFromURL(
context.Background(),
&proto.ReadFromURLRequest{
Url: "https://example.com",
Title: "test",
ReadConfig: &proto.ReadConfig{
ChunkSize: 512,
ChunkOverlap: 50,
Separators: []string{"\n\n", "\n", "。"},
EnableMultimodal: true,
},
},
)
requestDuration := time.Since(startTime)
if err != nil {
log.Printf("ERROR: ReadFromURL failed: %v", err)
t.Fatalf("ReadFromURL failed: %v", err)
}
log.Printf("INFO: ReadFromURL completed in %v", requestDuration)
	// Verify the results
chunkCount := len(resp.Chunks)
log.Printf("INFO: Received %d chunks from URL parsing", chunkCount)
if chunkCount == 0 {
log.Println("WARN: Expected non-empty content but received none")
t.Error("Expected non-empty content")
}
	// Print the results
for i, chunk := range resp.Chunks {
		if i < 2 || i >= chunkCount-2 { // print only the first two and last two chunks
log.Printf("DEBUG: Chunk %d: %s", chunk.Seq, truncateString(chunk.Content, 50))
} else if i == 2 && chunkCount > 4 {
log.Printf("DEBUG: ... %d more chunks ...", chunkCount-4)
}
}
log.Println("INFO: TestReadFromURL completed successfully")
}
func TestReadFromFileWithChunking(t *testing.T) {
log.Println("INFO: Starting TestReadFromFileWithChunking")
	// Create a test client
log.Println("INFO: Creating test client")
client, err := NewClient("localhost:50051")
if err != nil {
log.Printf("ERROR: Failed to create client: %v", err)
t.Fatalf("Failed to create client: %v", err)
}
defer client.Close()
	// Enable debug logging
client.SetDebug(true)
	// Read the test file
log.Println("INFO: Reading test file")
fileContent, err := os.ReadFile("../testdata/test.md")
if err != nil {
log.Printf("ERROR: Failed to read test file: %v", err)
t.Fatalf("Failed to read test file: %v", err)
}
log.Printf("INFO: Read test file, size: %d bytes", len(fileContent))
	// Exercise the ReadFromFile method with chunking parameters
log.Println("INFO: Sending ReadFromFile request to server")
startTime := time.Now()
resp, err := client.ReadFromFile(
context.Background(),
&proto.ReadFromFileRequest{
FileContent: fileContent,
FileName: "test.md",
FileType: "md",
ReadConfig: &proto.ReadConfig{
ChunkSize: 200,
ChunkOverlap: 50,
Separators: []string{"\n\n", "\n", "。"},
EnableMultimodal: true,
},
},
)
requestDuration := time.Since(startTime)
if err != nil {
log.Printf("ERROR: ReadFromFile failed: %v", err)
t.Fatalf("ReadFromFile failed: %v", err)
}
log.Printf("INFO: ReadFromFile completed in %v", requestDuration)
	// Verify the results
chunkCount := len(resp.Chunks)
log.Printf("INFO: Received %d chunks from file parsing", chunkCount)
if chunkCount == 0 {
log.Println("WARN: Expected non-empty content but received none")
t.Error("Expected non-empty content")
}
	// Print the results
for i, chunk := range resp.Chunks {
		if i < 2 || i >= chunkCount-2 { // print only the first two and last two chunks
log.Printf("DEBUG: Chunk %d: %s", chunk.Seq, truncateString(chunk.Content, 50))
} else if i == 2 && chunkCount > 4 {
log.Printf("DEBUG: ... %d more chunks ...", chunkCount-4)
}
}
log.Println("INFO: TestReadFromFileWithChunking completed successfully")
}
// truncateString shortens a string for log output
func truncateString(s string, maxLen int) string {
if len(s) <= maxLen {
return s
}
return s[:maxLen] + "..."
}

docreader/main.py Normal file

@@ -0,0 +1,403 @@
import os
import sys
import logging
from concurrent import futures
import traceback
import grpc
import uuid
import atexit
from grpc_health.v1 import health_pb2_grpc
from grpc_health.v1.health import HealthServicer
# Add parent directory to Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
from proto.docreader_pb2 import ReadResponse, Chunk, Image
from proto import docreader_pb2_grpc
from parser import Parser, OCREngine
from parser.config import ChunkingConfig
from utils.request import request_id_context, init_logging_request_id
# --- Encoding utilities: sanitize strings to valid UTF-8 and (optionally) read files with multi-encoding fallback ---
import re
from typing import Optional
try:
# Optional dependency for charset detection; install via `pip install charset-normalizer`
from charset_normalizer import from_bytes as _cn_from_bytes # type: ignore
except Exception: # pragma: no cover
_cn_from_bytes = None # type: ignore
# Code points in the surrogate range U+D800..U+DFFF are not valid Unicode scalar values and cannot be encoded to UTF-8
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]")
def to_valid_utf8_text(s: Optional[str]) -> str:
"""Return a UTF-8 safe string for protobuf.
- Replace any surrogate code points with U+FFFD
- Re-encode with errors='replace' to ensure valid UTF-8
"""
if not s:
return ""
s = _SURROGATE_RE.sub("\ufffd", s)
return s.encode("utf-8", errors="replace").decode("utf-8")
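# A minimal illustration (hypothetical values) of what the sanitizer does:
#   broken = "hello " + "\ud800"      # lone high surrogate; broken.encode("utf-8") would raise
#   to_valid_utf8_text(broken)        # -> "hello \ufffd", safe to place in a protobuf field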
def read_text_with_fallback(file_path: str) -> str:
"""Read text from file supporting multiple encodings with graceful fallback.
This server currently receives bytes over gRPC and delegates decoding to the parser.
This helper is provided for future local-file reads if needed.
"""
with open(file_path, "rb") as f:
raw = f.read()
if _cn_from_bytes is not None:
try:
result = _cn_from_bytes(raw).best()
if result:
return str(result)
except Exception:
pass
for enc in ("utf-8", "gb18030", "latin-1"):
try:
return raw.decode(enc, errors="replace")
except UnicodeDecodeError:
continue
return raw.decode("utf-8", errors="replace")
# Ensure no existing handlers
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)
# Configure logging - use stdout
handler = logging.StreamHandler(sys.stdout)
logging.root.addHandler(handler)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.info("Initializing server logging")
# Initialize request ID logging
init_logging_request_id()
# Set max message size to 50MB
MAX_MESSAGE_LENGTH = 50 * 1024 * 1024
parser = Parser()
class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
def __init__(self):
super().__init__()
self.parser = Parser()
def ReadFromFile(self, request, context):
# Get or generate request ID
request_id = (
request.request_id
if hasattr(request, "request_id") and request.request_id
else str(uuid.uuid4())
)
# Use request ID context
with request_id_context(request_id):
try:
# Get file type
file_type = (
request.file_type or os.path.splitext(request.file_name)[1][1:]
)
logger.info(
f"Received ReadFromFile request for file: {request.file_name}, type: {file_type}"
)
logger.info(f"File content size: {len(request.file_content)} bytes")
# Create chunking config
chunk_size = request.read_config.chunk_size or 512
chunk_overlap = request.read_config.chunk_overlap or 50
separators = request.read_config.separators or ["\n\n", "\n", ""]
enable_multimodal = request.read_config.enable_multimodal or False
logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
f"multimodal={enable_multimodal}"
)
# Get Storage and VLM config from request
storage_config = None
vlm_config = None
sc = request.read_config.storage_config
# Keep parser-side key name as cos_config for backward compatibility
storage_config = {
"provider": "minio" if sc.provider == 2 else "cos",
"region": sc.region,
"bucket_name": sc.bucket_name,
"access_key_id": sc.access_key_id,
"secret_access_key": sc.secret_access_key,
"app_id": sc.app_id,
"path_prefix": sc.path_prefix,
}
logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
)
vlm_config = {
"model_name": request.read_config.vlm_config.model_name,
"base_url": request.read_config.vlm_config.base_url,
"api_key": request.read_config.vlm_config.api_key or "",
"interface_type": request.read_config.vlm_config.interface_type
or "openai",
}
logger.info(
f"Using VLM config: model={vlm_config['model_name']}, "
f"base_url={vlm_config['base_url']}, "
f"interface_type={vlm_config['interface_type']}"
)
chunking_config = ChunkingConfig(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=separators,
enable_multimodal=enable_multimodal,
storage_config=storage_config,
vlm_config=vlm_config,
)
# Parse file
logger.info(f"Starting file parsing process")
result = self.parser.parse_file(
request.file_name, file_type, request.file_content, chunking_config
)
if not result:
error_msg = "Failed to parse file"
logger.error(error_msg)
context.set_code(grpc.StatusCode.INTERNAL)
context.set_details(error_msg)
return ReadResponse()
# Convert to protobuf message
logger.info(
f"Successfully parsed file {request.file_name}, returning {len(result.chunks)} chunks"
)
# Build response, including image info
response = ReadResponse(
chunks=[
self._convert_chunk_to_proto(chunk) for chunk in result.chunks
]
)
logger.info(f"Response size: {response.ByteSize()} bytes")
return response
except Exception as e:
error_msg = f"Error reading file: {str(e)}"
logger.error(error_msg)
logger.info(f"Detailed traceback: {traceback.format_exc()}")
context.set_code(grpc.StatusCode.INTERNAL)
context.set_details(str(e))
return ReadResponse(error=str(e))
def ReadFromURL(self, request, context):
# Get or generate request ID
request_id = (
request.request_id
if hasattr(request, "request_id") and request.request_id
else str(uuid.uuid4())
)
# Use request ID context
with request_id_context(request_id):
try:
logger.info(f"Received ReadFromURL request for URL: {request.url}")
# Create chunking config
chunk_size = request.read_config.chunk_size or 512
chunk_overlap = request.read_config.chunk_overlap or 50
separators = request.read_config.separators or ["\n\n", "\n", ""]
enable_multimodal = request.read_config.enable_multimodal or False
logger.info(
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
f"multimodal={enable_multimodal}"
)
# Get Storage and VLM config from request
storage_config = None
vlm_config = None
sc = request.read_config.storage_config
storage_config = {
"provider": "minio" if sc.provider == 2 else "cos",
"region": sc.region,
"bucket_name": sc.bucket_name,
"access_key_id": sc.access_key_id,
"secret_access_key": sc.secret_access_key,
"app_id": sc.app_id,
"path_prefix": sc.path_prefix,
}
logger.info(
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
)
vlm_config = {
"model_name": request.read_config.vlm_config.model_name,
"base_url": request.read_config.vlm_config.base_url,
"api_key": request.read_config.vlm_config.api_key or "",
"interface_type": request.read_config.vlm_config.interface_type
or "openai",
}
logger.info(
f"Using VLM config: model={vlm_config['model_name']}, "
f"base_url={vlm_config['base_url']}, "
f"interface_type={vlm_config['interface_type']}"
)
chunking_config = ChunkingConfig(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=separators,
enable_multimodal=enable_multimodal,
storage_config=storage_config,
vlm_config=vlm_config,
)
# Parse URL
logger.info(f"Starting URL parsing process")
result = self.parser.parse_url(
request.url, request.title, chunking_config
)
if not result:
error_msg = "Failed to parse URL"
logger.error(error_msg)
context.set_code(grpc.StatusCode.INTERNAL)
context.set_details(error_msg)
return ReadResponse(error=error_msg)
# Convert to protobuf message, including image info
logger.info(
f"Successfully parsed URL {request.url}, returning {len(result.chunks)} chunks"
)
response = ReadResponse(
chunks=[
self._convert_chunk_to_proto(chunk) for chunk in result.chunks
]
)
logger.info(f"Response size: {response.ByteSize()} bytes")
return response
except Exception as e:
error_msg = f"Error reading URL: {str(e)}"
logger.error(error_msg)
logger.info(f"Detailed traceback: {traceback.format_exc()}")
context.set_code(grpc.StatusCode.INTERNAL)
context.set_details(str(e))
return ReadResponse(error=str(e))
def _convert_chunk_to_proto(self, chunk):
"""Convert internal Chunk object to protobuf Chunk message
Ensures all string fields are valid UTF-8 for protobuf (no lone surrogates).
"""
# Clean helper for strings
_c = to_valid_utf8_text
proto_chunk = Chunk(
content=_c(getattr(chunk, "content", None)),
seq=getattr(chunk, "seq", 0),
start=getattr(chunk, "start", 0),
end=getattr(chunk, "end", 0),
)
# If chunk has images attribute and is not empty, add image info
if hasattr(chunk, "images") and chunk.images:
logger.info(
f"Adding {len(chunk.images)} images to chunk {getattr(chunk, 'seq', 0)}"
)
for img_info in chunk.images:
# img_info expected as dict
proto_image = Image(
url=_c(img_info.get("cos_url", "")),
caption=_c(img_info.get("caption", "")),
ocr_text=_c(img_info.get("ocr_text", "")),
original_url=_c(img_info.get("original_url", "")),
start=int(img_info.get("start", 0) or 0),
end=int(img_info.get("end", 0) or 0),
)
proto_chunk.images.append(proto_image)
return proto_chunk
def init_ocr_engine(ocr_backend, ocr_config):
"""Initialize OCR engine"""
try:
logger.info(f"Initializing OCR engine with backend: {ocr_backend}")
ocr_engine = OCREngine.get_instance(backend_type=ocr_backend, **ocr_config)
if ocr_engine:
logger.info("OCR engine initialized successfully")
return True
else:
logger.error("OCR engine initialization failed")
return False
except Exception as e:
logger.error(f"Error initializing OCR engine: {str(e)}")
return False
def main():
init_ocr_engine(
os.getenv("OCR_BACKEND", "paddle"),
{
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
},
)
# Set max number of worker threads
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
logger.info(f"Starting DocReader service with {max_workers} worker threads")
# Get port number
port = os.environ.get("GRPC_PORT", "50051")
# Create server
server = grpc.server(
futures.ThreadPoolExecutor(max_workers=max_workers),
options=[
("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
],
)
# Register services
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
# Register health check service
health_servicer = HealthServicer()
health_pb2_grpc.add_HealthServicer_to_server(health_servicer, server)
# Set listen address
server.add_insecure_port(f"[::]:{port}")
# Start service
server.start()
logger.info(f"Server started on port {port}")
logger.info("Server is ready to accept connections")
try:
# Wait for service termination
server.wait_for_termination()
except KeyboardInterrupt:
logger.info("Received termination signal, shutting down server")
server.stop(0)
if __name__ == "__main__":
main()
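
For reference, a minimal client-side sketch against this server; the stub name DocReaderStub, the ReadConfig message name, and the address are assumptions based on the generated code imported above:

import grpc
from proto import docreader_pb2, docreader_pb2_grpc

def read_file(path: str):
    # Mirror the server's 50MB message limits
    channel = grpc.insecure_channel(
        "localhost:50051",
        options=[
            ("grpc.max_send_message_length", 50 * 1024 * 1024),
            ("grpc.max_receive_message_length", 50 * 1024 * 1024),
        ],
    )
    stub = docreader_pb2_grpc.DocReaderStub(channel)  # assumed generated stub name
    with open(path, "rb") as f:
        content = f.read()
    request = docreader_pb2.ReadFromFileRequest(
        file_name=path,
        file_type=path.rsplit(".", 1)[-1],
        file_content=content,
        read_config=docreader_pb2.ReadConfig(chunk_size=512, chunk_overlap=50),
    )
    return stub.ReadFromFile(request)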


@@ -0,0 +1,42 @@
"""
Parser module for WeKnora document processing system.
This module provides document parsers for various file formats including:
- Microsoft Word documents (.doc, .docx)
- PDF documents
- Markdown files
- Plain text files
- Images with text content
- Web pages
The parsers extract content from documents and can split them into
meaningful chunks for further processing and indexing.
"""
from .base_parser import BaseParser, ParseResult
from .docx_parser import DocxParser
from .doc_parser import DocParser
from .pdf_parser import PDFParser
from .markdown_parser import MarkdownParser
from .text_parser import TextParser
from .image_parser import ImageParser
from .web_parser import WebParser
from .parser import Parser
from .config import ChunkingConfig
from .ocr_engine import OCREngine
# Export public classes and modules
__all__ = [
"BaseParser", # Base parser class that all format parsers inherit from
"DocxParser", # Parser for .docx files (modern Word documents)
"DocParser", # Parser for .doc files (legacy Word documents)
"PDFParser", # Parser for PDF documents
"MarkdownParser", # Parser for Markdown text files
"TextParser", # Parser for plain text files
"ImageParser", # Parser for images with text content
"WebParser", # Parser for web pages
"Parser", # Main parser factory that selects the appropriate parser
"ChunkingConfig", # Configuration for text chunking behavior
"ParseResult", # Standard result format returned by all parsers
"OCREngine", # OCR engine for extracting text from images
]

File diff suppressed because it is too large

docreader/parser/caption.py Normal file

@@ -0,0 +1,360 @@
import json
import logging
import os
import time
from dataclasses import dataclass, field
from typing import List, Optional, Union
import requests
import ollama
logger = logging.getLogger(__name__)
@dataclass
class ImageUrl:
"""Image URL data structure for caption requests."""
url: Optional[str] = None
detail: Optional[str] = None
@dataclass
class Content:
"""Content data structure that can contain text or image URL."""
type: Optional[str] = None
text: Optional[str] = None
image_url: Optional[ImageUrl] = None
@dataclass
class SystemMessage:
"""System message for VLM model requests."""
role: Optional[str] = None
content: Optional[str] = None
@dataclass
class UserMessage:
"""User message for VLM model requests, can contain multiple content items."""
role: Optional[str] = None
content: List[Content] = field(default_factory=list)
@dataclass
class CompletionRequest:
"""Request structure for VLM model completion API."""
model: str
temperature: float
top_p: float
messages: List[Union[SystemMessage, UserMessage]]
user: str
@dataclass
class Model:
"""Model identifier structure."""
id: str
@dataclass
class ModelsResp:
"""Response structure for available models API."""
data: List[Model] = field(default_factory=list)
@dataclass
class Message:
"""Message structure in API response."""
role: Optional[str] = None
content: Optional[str] = None
tool_calls: Optional[str] = None
@dataclass
class Choice:
"""Choice structure in API response."""
message: Optional[Message] = None
@dataclass
class Usage:
"""Token usage information in API response."""
prompt_tokens: Optional[int] = 0
total_tokens: Optional[int] = 0
completion_tokens: Optional[int] = 0
@dataclass
class CaptionChatResp:
"""Response structure for caption chat API."""
id: Optional[str] = None
created: Optional[int] = None
    model: Optional[str] = None
object: Optional[str] = None
choices: List[Choice] = field(default_factory=list)
usage: Optional[Usage] = None
@staticmethod
def from_json(json_data: dict) -> "CaptionChatResp":
"""
Parse API response JSON into a CaptionChatResp object.
Args:
json_data: The JSON response from the API
Returns:
A parsed CaptionChatResp object
"""
logger.info("Parsing CaptionChatResp from JSON")
# Manually parse nested fields with safe field extraction
choices = []
for choice in json_data.get("choices", []):
message_data = choice.get("message", {})
message = Message(
role=message_data.get("role"),
content=message_data.get("content"),
tool_calls=message_data.get("tool_calls"),
)
choices.append(Choice(message=message))
# Handle usage with safe field extraction
usage_data = json_data.get("usage", {})
usage = None
if usage_data:
usage = Usage(
prompt_tokens=usage_data.get("prompt_tokens", 0),
total_tokens=usage_data.get("total_tokens", 0),
completion_tokens=usage_data.get("completion_tokens", 0),
)
logger.info(
f"Parsed {len(choices)} choices and usage data: {usage is not None}"
)
return CaptionChatResp(
id=json_data.get("id"),
created=json_data.get("created"),
model=json_data.get("model"),
object=json_data.get("object"),
choices=choices,
usage=usage,
)
def choice_data(self) -> str:
"""
Extract the content from the first choice in the response.
Returns:
The content string from the first choice, or empty string if no choices
"""
if self.choices:
logger.info("Retrieving content from first choice")
return self.choices[0].message.content
logger.warning("No choices available in response")
return ""
class Caption:
"""
Service for generating captions for images using a Vision Language Model.
Uses an external API to process images and return textual descriptions.
"""
def __init__(self, vlm_config=None):
"""Initialize the Caption service with configuration from parameters or environment variables."""
logger.info("Initializing Caption service")
self.prompt = """简单凝炼的描述图片的主要内容"""
# Use provided VLM config if available, otherwise fall back to environment variables
if vlm_config and vlm_config.get("base_url") and vlm_config.get("model_name"):
self.completion_url = vlm_config.get("base_url", "") + "/chat/completions"
self.model = vlm_config.get("model_name", "")
self.api_key = vlm_config.get("api_key", "")
self.interface_type = vlm_config.get("interface_type", "openai").lower()
else:
            if not os.getenv("VLM_MODEL_BASE_URL") or not os.getenv("VLM_MODEL_NAME"):
logger.error("VLM_MODEL_BASE_URL or VLM_MODEL_NAME is not set")
return
self.completion_url = os.getenv("VLM_MODEL_BASE_URL") + "/chat/completions"
self.model = os.getenv("VLM_MODEL_NAME")
self.api_key = os.getenv("VLM_MODEL_API_KEY")
self.interface_type = os.getenv("VLM_INTERFACE_TYPE", "openai").lower()
        # Validate the interface type
if self.interface_type not in ["ollama", "openai"]:
logger.warning(f"Unknown interface type: {self.interface_type}, defaulting to openai")
self.interface_type = "openai"
logger.info(
f"Service configured with model: {self.model}, endpoint: {self.completion_url}, interface: {self.interface_type}"
)
def _call_caption_api(self, image_data: str) -> Optional[CaptionChatResp]:
"""
Call the Caption API to generate a description for the given image.
Args:
image_data: URL of the image or base64 encoded image data
Returns:
CaptionChatResp object if successful, None otherwise
"""
logger.info(f"Calling Caption API for image captioning")
logger.info(f"Processing image data: {image_data[:50] if len(image_data) > 50 else image_data}")
        # Choose the call path based on the configured interface type
if self.interface_type == "ollama":
return self._call_ollama_api(image_data)
else:
return self._call_openai_api(image_data)
def _call_ollama_api(self, image_base64: str) -> Optional[CaptionChatResp]:
"""Call Ollama API for image captioning using base64 encoded image data."""
host = self.completion_url.replace("/v1/chat/completions", "")
client = ollama.Client(
host=host,
)
try:
logger.info(f"Calling Ollama API with model: {self.model}")
            # Call the Ollama API, passing the base64-encoded image via the images parameter
response = client.generate(
model=self.model,
prompt="简单凝炼的描述图片的主要内容",
                images=[image_base64],  # base64-encoded image data
options={"temperature": 0.1},
stream=False,
)
            # Build the response object
caption_resp = CaptionChatResp(
id="ollama_response",
created=int(time.time()),
model=self.model,
object="chat.completion",
choices=[
Choice(
message=Message(
role="assistant",
content=response.response
)
)
]
)
logger.info("Successfully received response from Ollama API")
return caption_resp
except Exception as e:
logger.error(f"Error calling Ollama API: {e}")
return None
def _call_openai_api(self, image_base64: str) -> Optional[CaptionChatResp]:
"""Call OpenAI-compatible API for image captioning."""
logger.info(f"Calling OpenAI-compatible API with model: {self.model}")
user_msg = UserMessage(
role="user",
content=[
Content(type="text", text=self.prompt),
Content(
type="image_url", image_url=ImageUrl(url="data:image/png;base64," + image_base64, detail="auto")
),
],
)
gpt_req = CompletionRequest(
model=self.model,
temperature=0.3,
top_p=0.8,
messages=[user_msg],
user="abc",
)
headers = {
"Content-Type": "application/json",
"Accept": "text/event-stream",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
}
if self.api_key:
headers["Authorization"] = f"Bearer {self.api_key}"
try:
logger.info(f"Sending request to OpenAI-compatible API with model: {self.model}")
response = requests.post(
self.completion_url,
data=json.dumps(gpt_req, default=lambda o: o.__dict__, indent=4),
headers=headers,
timeout=30,
)
if response.status_code != 200:
logger.error(
f"OpenAI-compatible API returned non-200 status code: {response.status_code}"
)
response.raise_for_status()
logger.info(
f"Successfully received response from OpenAI-compatible API with status: {response.status_code}"
)
logger.info(f"Converting response to CaptionChatResp object")
caption_resp = CaptionChatResp.from_json(response.json())
if caption_resp.usage:
logger.info(
f"API usage: prompt_tokens={caption_resp.usage.prompt_tokens}, "
f"completion_tokens={caption_resp.usage.completion_tokens}"
)
return caption_resp
except requests.exceptions.Timeout:
logger.error(f"Timeout while calling OpenAI-compatible API after 30 seconds")
return None
except requests.exceptions.RequestException as e:
logger.error(f"Request error calling OpenAI-compatible API: {e}")
return None
except Exception as e:
logger.error(f"Unexpected error calling OpenAI-compatible API: {e}")
return None
def get_caption(self, image_data: str) -> str:
"""
Get a caption for the provided image data.
Args:
image_data: URL of the image or base64 encoded image data
Returns:
Caption text as string, or empty string if captioning failed
"""
logger.info("Getting caption for image")
if not image_data or self.completion_url is None:
logger.error("Image data is not set")
return ""
caption_resp = self._call_caption_api(image_data)
if caption_resp:
caption = caption_resp.choice_data()
caption_length = len(caption)
logger.info(f"Successfully generated caption of length {caption_length}")
logger.info(
f"Caption: {caption[:50]}..."
if caption_length > 50
else f"Caption: {caption}"
)
return caption
logger.warning("Failed to get caption from Caption API")
return ""


@@ -0,0 +1,21 @@
from dataclasses import dataclass, field
@dataclass
class ChunkingConfig:
"""
Configuration for text chunking process.
Controls how documents are split into smaller pieces for processing.
"""
chunk_size: int = 512 # Maximum size of each chunk in tokens/chars
chunk_overlap: int = 50 # Number of tokens/chars to overlap between chunks
separators: list = field(
default_factory=lambda: ["\n\n", "\n", ""]
) # Text separators in order of priority
enable_multimodal: bool = (
False # Whether to enable multimodal processing (text + images)
)
storage_config: dict = None # Preferred field name going forward
vlm_config: dict = None # VLM configuration for image captioning
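
A construction sketch; every value below is an illustrative placeholder, and the storage keys mirror the dict the server builds in main.py:

config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=50,
    separators=["\n\n", "\n", ""],
    enable_multimodal=True,
    storage_config={
        "provider": "minio",
        "region": "",
        "bucket_name": "docreader",
        "access_key_id": "minioadmin",
        "secret_access_key": "minioadmin",
        "app_id": "",
        "path_prefix": "images",
    },
    vlm_config={
        "model_name": "qwen-vl",
        "base_url": "http://localhost:8000/v1",
        "api_key": "",
        "interface_type": "openai",
    },
)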


@@ -0,0 +1,315 @@
import asyncio
import logging
import re
import tempfile
import os
import subprocess
import shutil
from io import BytesIO
from typing import Optional, List, Tuple
import textract
from PIL import Image
import zipfile
import xml.etree.ElementTree as ET
from .base_parser import BaseParser
from .docx_parser import DocxParser, Docx
logger = logging.getLogger(__name__)
class DocParser(BaseParser):
"""DOC document parser"""
def parse_into_text(self, content: bytes) -> str:
"""Parse DOC document
Args:
content: DOC document content
Returns:
Parse result
"""
logger.info(f"Parsing DOC document, content size: {len(content)} bytes")
# Save byte content as a temporary file
with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as temp_file:
temp_file_path = temp_file.name
temp_file.write(content)
temp_file.flush()
logger.info(f"Saved DOC content to temporary file: {temp_file_path}")
try:
# First try to convert to docx format to extract images
if self.enable_multimodal:
logger.info("Multimodal enabled, attempting to extract images from DOC")
docx_content = self._convert_doc_to_docx(temp_file_path)
if docx_content:
logger.info("Successfully converted DOC to DOCX, using DocxParser")
# Use existing DocxParser to parse the converted docx
docx_parser = DocxParser(
file_name=self.file_name,
file_type="docx",
enable_multimodal=self.enable_multimodal,
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
chunking_config=self.chunking_config,
separators=self.separators,
)
text = docx_parser.parse_into_text(docx_content)
logger.info(f"Extracted {len(text)} characters using DocxParser")
# Clean up temporary file
os.unlink(temp_file_path)
logger.info(f"Deleted temporary file: {temp_file_path}")
return text
else:
logger.warning(
"Failed to convert DOC to DOCX, falling back to text-only extraction"
)
# If image extraction is not needed or conversion failed, try using antiword to extract text
try:
logger.info("Attempting to parse DOC file with antiword")
# Check if antiword is installed
antiword_path = self._find_antiword_path()
if antiword_path:
# Use antiword to extract text directly
logger.info(f"Using antiword at {antiword_path} to extract text")
process = subprocess.Popen(
[antiword_path, temp_file_path],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout, stderr = process.communicate()
if process.returncode == 0:
text = stdout.decode("utf-8", errors="ignore")
logger.info(
f"Successfully extracted {len(text)} characters using antiword"
)
# Clean up temporary file
os.unlink(temp_file_path)
logger.info(f"Deleted temporary file: {temp_file_path}")
return text
else:
logger.warning(
f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
)
else:
logger.warning("antiword not found, falling back to textract")
except Exception as e:
logger.warning(
f"Error using antiword: {str(e)}, falling back to textract"
)
# If antiword fails, try using textract
logger.info("Parsing DOC file with textract")
text = textract.process(temp_file_path, method="antiword").decode("utf-8")
logger.info(
f"Successfully extracted {len(text)} characters of text from DOC document using textract"
)
# Clean up temporary file
os.unlink(temp_file_path)
logger.info(f"Deleted temporary file: {temp_file_path}")
return text
except Exception as e:
logger.error(f"Error parsing DOC document: {str(e)}")
# Ensure temporary file is cleaned up
if os.path.exists(temp_file_path):
os.unlink(temp_file_path)
logger.info(f"Deleted temporary file after error: {temp_file_path}")
return ""
def _convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
"""Convert DOC file to DOCX format
Uses LibreOffice/OpenOffice for conversion
Args:
doc_path: DOC file path
Returns:
Byte stream of DOCX file content, or None if conversion fails
"""
logger.info(f"Converting DOC to DOCX: {doc_path}")
# Create a temporary directory to store the converted file
temp_dir = tempfile.mkdtemp()
docx_path = os.path.join(temp_dir, "converted.docx")
try:
# Check if LibreOffice or OpenOffice is installed
soffice_path = self._find_soffice_path()
if not soffice_path:
logger.error(
"LibreOffice/OpenOffice not found, cannot convert DOC to DOCX"
)
return None
# Execute conversion command
logger.info(f"Using {soffice_path} to convert DOC to DOCX")
cmd = [
soffice_path,
"--headless",
"--convert-to",
"docx",
"--outdir",
temp_dir,
doc_path,
]
logger.info(f"Running command: {' '.join(cmd)}")
process = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
stdout, stderr = process.communicate()
if process.returncode != 0:
logger.error(
f"Error converting DOC to DOCX: {stderr.decode('utf-8', errors='ignore')}"
)
return None
# Find the converted file
for file in os.listdir(temp_dir):
if file.endswith(".docx"):
converted_file = os.path.join(temp_dir, file)
logger.info(f"Found converted file: {converted_file}")
# Read the converted file content
with open(converted_file, "rb") as f:
docx_content = f.read()
logger.info(
f"Successfully read converted DOCX file, size: {len(docx_content)} bytes"
)
return docx_content
logger.error("No DOCX file found after conversion")
return None
except Exception as e:
logger.error(f"Error during DOC to DOCX conversion: {str(e)}")
return None
finally:
# Clean up temporary directory
try:
shutil.rmtree(temp_dir)
logger.info(f"Cleaned up temporary directory: {temp_dir}")
except Exception as e:
logger.warning(f"Failed to clean up temporary directory: {str(e)}")
def _find_soffice_path(self) -> Optional[str]:
"""Find LibreOffice/OpenOffice executable path
Returns:
Executable path, or None if not found
"""
# Common LibreOffice/OpenOffice executable paths
possible_paths = [
# Linux
"/usr/bin/soffice",
"/usr/lib/libreoffice/program/soffice",
"/opt/libreoffice25.2/program/soffice",
# macOS
"/Applications/LibreOffice.app/Contents/MacOS/soffice",
# Windows
"C:\\Program Files\\LibreOffice\\program\\soffice.exe",
"C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe",
]
# Check if path is set in environment variable
if os.environ.get("LIBREOFFICE_PATH"):
possible_paths.insert(0, os.environ.get("LIBREOFFICE_PATH"))
for path in possible_paths:
if os.path.exists(path):
logger.info(f"Found LibreOffice/OpenOffice at: {path}")
return path
# Try to find in PATH
try:
result = subprocess.run(
["which", "soffice"], capture_output=True, text=True
)
if result.returncode == 0 and result.stdout.strip():
path = result.stdout.strip()
logger.info(f"Found LibreOffice/OpenOffice in PATH: {path}")
return path
except Exception:
pass
logger.warning("LibreOffice/OpenOffice not found")
return None
def _find_antiword_path(self) -> Optional[str]:
"""Find antiword executable path
Returns:
Executable path, or None if not found
"""
# Common antiword executable paths
possible_paths = [
# Linux/macOS
"/usr/bin/antiword",
"/usr/local/bin/antiword",
# Windows
"C:\\Program Files\\Antiword\\antiword.exe",
"C:\\Program Files (x86)\\Antiword\\antiword.exe",
]
# Check if path is set in environment variable
if os.environ.get("ANTIWORD_PATH"):
possible_paths.insert(0, os.environ.get("ANTIWORD_PATH"))
for path in possible_paths:
if os.path.exists(path):
logger.info(f"Found antiword at: {path}")
return path
# Try to find in PATH
try:
result = subprocess.run(
["which", "antiword"], capture_output=True, text=True
)
if result.returncode == 0 and result.stdout.strip():
path = result.stdout.strip()
logger.info(f"Found antiword in PATH: {path}")
return path
except Exception:
pass
logger.warning("antiword not found")
return None
if __name__ == "__main__":
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger.info("Running DocParser in standalone mode")
file_name = "/path/to/your/test.doc"
logger.info(f"Processing file: {file_name}")
doc_parser = DocParser(
file_name, enable_multimodal=True, chunk_size=512, chunk_overlap=60
)
logger.info("Parser initialized, starting processing")
with open(file_name, "rb") as f:
content = f.read()
text = doc_parser.parse_into_text(content)
logger.info(f"Processing complete, extracted text length: {len(text)}")
logger.info(f"Sample text: {text[:200]}...")

File diff suppressed because it is too large


@@ -0,0 +1,68 @@
import logging
import os
import asyncio
from PIL import Image
import io
from typing import Dict, Any, Tuple, Union
from .base_parser import BaseParser, ParseResult
import numpy as np
# Set up logger for this module
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class ImageParser(BaseParser):
"""
Parser for image files with OCR capability.
Extracts text from images and generates captions.
This parser handles image processing by:
1. Uploading the image to storage
2. Generating a descriptive caption
3. Performing OCR to extract text content
4. Returning a combined result with both text and image reference
"""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""
Parse image content, upload the image and return Markdown reference along with image map.
Args:
content: Raw image data (bytes)
Returns:
Tuple of (markdown_text, image_map) where image_map maps image URLs to PIL Image objects
"""
logger.info(f"Parsing image content, size: {len(content)} bytes")
image_map = {}
try:
# Upload image to storage service
logger.info("Uploading image to storage")
_, ext = os.path.splitext(self.file_name)
image_url = self.upload_bytes(content, file_ext=ext)
if not image_url:
logger.error("Failed to upload image to storage")
return "", {}
logger.info(
f"Successfully uploaded image, URL: {image_url[:50]}..."
if len(image_url) > 50
else f"Successfully uploaded image, URL: {image_url}"
)
# Create image object and add to map
try:
image = Image.open(io.BytesIO(content))
image_map[image_url] = image
logger.info(f"Added image to image_map for URL: {image_url}")
except Exception as img_err:
logger.error(f"Error creating image object: {str(img_err)}")
markdown_text = f"![{self.file_name}]({image_url})"
return markdown_text, image_map
except Exception as e:
logger.error(f"Error parsing image: {str(e)}")
return "", {}


@@ -0,0 +1,43 @@
import base64
import io
import logging
from typing import Union
from PIL import Image
import numpy as np
logger = logging.getLogger(__name__)
def image_to_base64(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
"""Convert image to base64 encoded string
Args:
image: Image file path, bytes, PIL Image object, or numpy array
Returns:
Base64 encoded image string, or empty string if conversion fails
"""
try:
if isinstance(image, str):
# It's a file path
with open(image, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
elif isinstance(image, bytes):
# It's bytes data
return base64.b64encode(image).decode("utf-8")
elif isinstance(image, Image.Image):
# It's a PIL Image
buffer = io.BytesIO()
image.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("utf-8")
elif isinstance(image, np.ndarray):
# It's a numpy array
pil_image = Image.fromarray(image)
buffer = io.BytesIO()
pil_image.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("utf-8")
else:
logger.error(f"Unsupported image type: {type(image)}")
return ""
except Exception as e:
logger.error(f"Error converting image to base64: {str(e)}")
return ""


@@ -0,0 +1,33 @@
import asyncio
import re
import logging
import numpy as np
import os # Import os module to get environment variables
from typing import Dict, List, Optional, Tuple, Union, Any
from .base_parser import BaseParser
# Get logger object
logger = logging.getLogger(__name__)
class MarkdownParser(BaseParser):
"""Markdown document parser"""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""Parse Markdown document, only extract text content, do not process images
Args:
content: Markdown document content
Returns:
Parsed text result
"""
logger.info(f"Parsing Markdown document, content size: {len(content)} bytes")
# Convert byte content to string using universal decoding method
text = self.decode_bytes(content)
logger.info(f"Decoded Markdown content, text length: {len(text)} characters")
logger.info(f"Markdown parsing complete, extracted {len(text)} characters of text")
return text


@@ -0,0 +1,278 @@
import os
import logging
import base64
from typing import Optional, Union, Dict, Any
from abc import ABC, abstractmethod
from PIL import Image
import io
import numpy as np
from .image_utils import image_to_base64
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class OCRBackend(ABC):
"""Base class for OCR backends"""
@abstractmethod
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image
Args:
image: Image file path, bytes, or PIL Image object
Returns:
Extracted text
"""
pass
class PaddleOCRBackend(OCRBackend):
"""PaddleOCR backend implementation"""
def __init__(self, **kwargs):
"""Initialize PaddleOCR backend"""
self.ocr = None
try:
import os
import paddle
# Set PaddlePaddle to use CPU and disable GPU
os.environ['CUDA_VISIBLE_DEVICES'] = ''
paddle.set_device('cpu')
            # Try to detect whether the CPU supports the AVX instruction set
try:
import subprocess
import platform
                # Check whether the CPU supports AVX
if platform.system() == "Linux":
try:
result = subprocess.run(['grep', '-o', 'avx', '/proc/cpuinfo'],
capture_output=True, text=True, timeout=5)
has_avx = 'avx' in result.stdout.lower()
if not has_avx:
logger.warning("CPU does not support AVX instructions, using compatibility mode")
                            # Further restrict instruction set usage
os.environ['FLAGS_use_avx2'] = '0'
os.environ['FLAGS_use_avx'] = '1'
except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError):
logger.warning("Could not detect AVX support, using compatibility mode")
os.environ['FLAGS_use_avx2'] = '0'
os.environ['FLAGS_use_avx'] = '1'
except Exception as e:
logger.warning(f"Error detecting CPU capabilities: {e}, using compatibility mode")
os.environ['FLAGS_use_avx2'] = '0'
os.environ['FLAGS_use_avx'] = '1'
from paddleocr import PaddleOCR
# OCR configuration with text orientation classification enabled
ocr_config = {
"use_gpu": False,
"text_det_limit_type": "max",
"text_det_limit_side_len": 960,
"use_doc_orientation_classify": True, # 启用文档方向分类
"use_doc_unwarping": False,
"use_textline_orientation": True, # 启用文本行方向检测
"text_recognition_model_name": "PP-OCRv4_server_rec",
"text_detection_model_name": "PP-OCRv4_server_det",
"text_det_thresh": 0.3,
"text_det_box_thresh": 0.6,
"text_det_unclip_ratio": 1.5,
"text_rec_score_thresh": 0.0,
"ocr_version": "PP-OCRv4",
"lang": "ch",
"show_log": False,
"use_dilation": True, # improves accuracy
"det_db_score_mode": "slow", # improves accuracy
}
self.ocr = PaddleOCR(**ocr_config)
logger.info("PaddleOCR engine initialized successfully")
except ImportError as e:
logger.error(f"Failed to import paddleocr: {str(e)}. Please install it with 'pip install paddleocr'")
except OSError as e:
if "Illegal instruction" in str(e) or "core dumped" in str(e):
logger.error(f"PaddlePaddle crashed due to CPU instruction set incompatibility: {str(e)}")
logger.error("This usually happens when the CPU doesn't support AVX instructions.")
logger.error("Please try installing a CPU-only version of PaddlePaddle or use a different OCR backend.")
else:
logger.error(f"Failed to initialize PaddleOCR due to OS error: {str(e)}")
except Exception as e:
logger.error(f"Failed to initialize PaddleOCR: {str(e)}")
def predict(self, image):
"""Perform OCR recognition on the image
Args:
image: Image object (PIL.Image or numpy array)
Returns:
Extracted text string
"""
try:
# Ensure image is in RGB format
if hasattr(image, "convert") and image.mode != "RGB":
image = image.convert("RGB")
# Convert to numpy array if needed
if hasattr(image, "convert"):
image_array = np.array(image)
else:
image_array = image
# Perform OCR
ocr_result = self.ocr.ocr(image_array, cls=False)
# Extract text
ocr_text = ""
if ocr_result and ocr_result[0]:
for line in ocr_result[0]:
if line and len(line) >= 2:
text = line[1][0] if line[1] else ""
if text:
ocr_text += text + " "
text_length = len(ocr_text.strip())
if text_length > 0:
logger.info(f"OCR extracted {text_length} characters")
return ocr_text.strip()
else:
logger.warning("OCR returned empty result")
return ""
except Exception as e:
logger.error(f"OCR recognition error: {str(e)}")
return ""
class NanonetsOCRBackend(OCRBackend):
"""Nanonets OCR backend implementation using OpenAI API format"""
def __init__(self, **kwargs):
"""Initialize Nanonets OCR backend
Args:
api_key: API key for OpenAI API
base_url: Base URL for OpenAI API
model: Model name
"""
try:
from openai import OpenAI
self.api_key = kwargs.get("api_key", "123")
self.base_url = kwargs.get("base_url", "http://localhost:8000/v1")
self.model = kwargs.get("model", "nanonets/Nanonets-OCR-s")
self.temperature = kwargs.get("temperature", 0.0)
self.max_tokens = kwargs.get("max_tokens", 15000)
self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
self.prompt = """
## 任务说明
请从上传的文档中提取文字内容,严格按自然阅读顺序(从上到下,从左到右)输出,并遵循以下格式规范。
### 1. **文本处理**
* 按正常阅读顺序提取文字,语句流畅自然。
### 2. **表格**
* 所有表格统一转换为 **Markdown 表格格式**。
* 内容保持清晰、对齐整齐,便于阅读。
### 3. **公式**
* 所有公式转换为 **LaTeX 格式**,使用 `$$公式$$` 包裹。
### 4. **图片**
* 忽略图片信息
### 5. **链接**
* 不要猜测或补全不确定的链接地址。
"""
logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
except ImportError:
logger.error("Failed to import openai. Please install it with 'pip install openai'")
self.client = None
except Exception as e:
logger.error(f"Failed to initialize Nanonets OCR: {str(e)}")
self.client = None
def predict(self, image: Union[str, bytes, Image.Image]) -> str:
"""Extract text from an image using Nanonets OCR
Args:
image: Image file path, bytes, or PIL Image object
Returns:
Extracted text
"""
if self.client is None:
logger.error("Nanonets OCR client not initialized")
return ""
try:
# Encode image to base64
img_base64 = image_to_base64(image)
if not img_base64:
return ""
# Call Nanonets OCR API
logger.info(f"Calling Nanonets OCR API with model: {self.model}")
response = self.client.chat.completions.create(
model=self.model,
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{img_base64}"},
},
{
"type": "text",
"text": self.prompt,
},
],
}
],
temperature=self.temperature,
max_tokens=self.max_tokens
)
return response.choices[0].message.content
except Exception as e:
logger.error(f"Nanonets OCR prediction error: {str(e)}")
return ""
class OCREngine:
"""OCR Engine factory class"""
_instance = None
@classmethod
def get_instance(cls, backend_type="paddle", **kwargs) -> Optional[OCRBackend]:
"""Get OCR engine instance
Args:
backend_type: OCR backend type, one of: "paddle", "nanonets"
**kwargs: Additional arguments for the backend
Returns:
OCR engine instance or None if initialization fails
"""
if cls._instance is None:
logger.info(f"Initializing OCR engine with backend: {backend_type}")
if backend_type.lower() == "paddle":
cls._instance = PaddleOCRBackend(**kwargs)
elif backend_type.lower() == "nanonets":
cls._instance = NanonetsOCRBackend(**kwargs)
else:
logger.error(f"Unknown OCR backend type: {backend_type}")
return None
return cls._instance
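
Typical use of the factory, as a sketch; the file name is a placeholder, and note that get_instance caches the first backend it creates:

from PIL import Image

engine = OCREngine.get_instance(backend_type="paddle")
if engine is not None:
    text = engine.predict(Image.open("scan.png"))  # "scan.png" is hypothetical
    print(text)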

docreader/parser/parser.py Normal file

@@ -0,0 +1,206 @@
import logging
from dataclasses import dataclass, field
from typing import Dict, Any, Optional, Type
from .base_parser import BaseParser, ParseResult
from .docx_parser import DocxParser
from .doc_parser import DocParser
from .pdf_parser import PDFParser
from .markdown_parser import MarkdownParser
from .text_parser import TextParser
from .image_parser import ImageParser
from .web_parser import WebParser
from .config import ChunkingConfig
import traceback
logger = logging.getLogger(__name__)
@dataclass
class Chunk:
"""
Represents a single text chunk with associated metadata.
Basic unit for document processing and embedding.
"""
content: str # Text content of the chunk
metadata: Dict[str, Any] = None # Associated metadata (source, page number, etc.)
class Parser:
"""
Document parser facade that integrates all specialized parsers.
Provides a unified interface for parsing various document types.
"""
def __init__(self):
logger.info("Initializing document parser")
# Initialize all parser types
self.parsers: Dict[str, Type[BaseParser]] = {
"docx": DocxParser,
"doc": DocParser,
"pdf": PDFParser,
"md": MarkdownParser,
"txt": TextParser,
"jpg": ImageParser,
"jpeg": ImageParser,
"png": ImageParser,
"gif": ImageParser,
"bmp": ImageParser,
"tiff": ImageParser,
"webp": ImageParser,
"markdown": MarkdownParser,
}
logger.info(
"Parser initialized with %d parsers: %s",
len(self.parsers),
", ".join(self.parsers.keys()),
)
def get_parser(self, file_type: str) -> Optional[Type[BaseParser]]:
"""
Get parser class for the specified file type.
Args:
file_type: The file extension or type identifier
Returns:
Parser class for the file type, or None if unsupported
"""
file_type = file_type.lower()
parser = self.parsers.get(file_type)
if parser:
logger.info(f"Found parser for file type: {file_type}")
else:
logger.warning(f"No parser found for file type: {file_type}")
return parser
def parse_file(
self,
file_name: str,
file_type: str,
content: bytes,
config: ChunkingConfig,
) -> Optional[ParseResult]:
"""
Parse file content using appropriate parser based on file type.
Args:
file_name: Name of the file being parsed
file_type: Type/extension of the file
content: Raw file content as bytes
config: Configuration for chunking process
Returns:
ParseResult containing chunks and metadata, or None if parsing failed
"""
logger.info(f"Parsing file: {file_name} with type: {file_type}")
logger.info(
f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, "
f"multimodal={config.enable_multimodal}"
)
parser_instance = None
try:
# Get appropriate parser for file type
cls = self.get_parser(file_type)
if cls is None:
logger.error(f"Unsupported file type: {file_type}")
return None
# Parse file content
logger.info(f"Creating parser instance for {file_type} file")
parser_instance = cls(
file_name=file_name,
file_type=file_type,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
separators=config.separators,
enable_multimodal=config.enable_multimodal,
max_image_size=1920, # Limit image size to 1920px
max_concurrent_tasks=5, # Limit concurrent tasks to 5
chunking_config=config, # Pass the entire chunking config
)
logger.info(f"Starting to parse file content, size: {len(content)} bytes")
result = parser_instance.parse(content)
if result:
logger.info(
f"Successfully parsed file {file_name}, generated {len(result.chunks)} chunks"
)
if result.chunks and len(result.chunks) > 0:
logger.info(
f"First chunk content length: {len(result.chunks[0].content)}"
)
else:
logger.warning(f"Parser returned empty chunks for file: {file_name}")
else:
logger.warning(f"Parser returned None result for file: {file_name}")
# Return parse results
return result
except Exception as e:
logger.error(f"Error parsing file {file_name}: {str(e)}")
logger.info(f"Detailed traceback: {traceback.format_exc()}")
return None
def parse_url(
self, url: str, title: str, config: ChunkingConfig
) -> Optional[ParseResult]:
"""
Parse content from a URL using the WebParser.
Args:
url: URL to parse
title: Title of the webpage (for metadata)
config: Configuration for chunking process
Returns:
ParseResult containing chunks and metadata, or None if parsing failed
"""
logger.info(f"Parsing URL: {url}, title: {title}")
logger.info(
f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, "
f"multimodal={config.enable_multimodal}"
)
parser_instance = None
try:
# Create web parser instance
logger.info("Creating WebParser instance")
parser_instance = WebParser(
title=title,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
separators=config.separators,
enable_multimodal=config.enable_multimodal,
max_image_size=1920, # Limit image size
max_concurrent_tasks=5, # Limit concurrent tasks
chunking_config=config,
)
logger.info(f"Starting to parse URL content")
result = parser_instance.parse(url)
if result:
logger.info(
f"Successfully parsed URL, generated {len(result.chunks)} chunks"
)
logger.info(
f"First chunk content length: {len(result.chunks[0].content) if result.chunks else 0}"
)
else:
logger.warning(f"Parser returned empty result for URL: {url}")
# Return parse results
return result
except Exception as e:
logger.error(f"Error parsing URL {url}: {str(e)}")
logger.info(f"Detailed traceback: {traceback.format_exc()}")
return None
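
An end-to-end sketch of the facade in use; the file path is a placeholder:

parser = Parser()
config = ChunkingConfig(chunk_size=512, chunk_overlap=50)
with open("docs/guide.md", "rb") as f:  # hypothetical input file
    result = parser.parse_file("guide.md", "md", f.read(), config)
if result:
    for chunk in result.chunks:
        print(chunk.content[:80])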


@@ -0,0 +1,113 @@
import logging
import os
import io
from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union
import pdfplumber
import tempfile
from .base_parser import BaseParser
logger = logging.getLogger(__name__)
class PDFParser(BaseParser):
"""
PDF Document Parser
This parser handles PDF documents by extracting text content.
    It uses the pdfplumber library to extract text and tables.
"""
def _convert_table_to_markdown(self, table_data: list) -> str:
if not table_data or not table_data[0]: return ""
def clean_cell(cell):
if cell is None: return ""
return str(cell).replace("\n", " <br> ")
try:
markdown = ""
header = [clean_cell(cell) for cell in table_data[0]]
markdown += "| " + " | ".join(header) + " |\n"
markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
for row in table_data[1:]:
if not row: continue
body_row = [clean_cell(cell) for cell in row]
if len(body_row) != len(header):
logger.warning(f"Skipping malformed table row: {body_row}")
continue
markdown += "| " + " | ".join(body_row) + " |\n"
return markdown
except Exception as e:
logger.error(f"Error converting table to markdown: {e}")
return ""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
logger.info(f"Parsing PDF with pdfplumber, content size: {len(content)} bytes")
all_page_content = []
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
temp_pdf_path = temp_pdf.name
try:
temp_pdf.write(content)
temp_pdf.close()
logger.info(f"PDF content written to temporary file: {temp_pdf_path}")
with pdfplumber.open(temp_pdf_path) as pdf:
logger.info(f"PDF has {len(pdf.pages)} pages")
for page_num, page in enumerate(pdf.pages):
page_content_parts = []
# Try-fallback strategy for table detection
default_settings = { "vertical_strategy": "lines", "horizontal_strategy": "lines" }
found_tables = page.find_tables(default_settings)
if not found_tables:
logger.info(f"Page {page_num+1}: Default strategy found no tables. Trying fallback strategy.")
fallback_settings = { "vertical_strategy": "text", "horizontal_strategy": "lines" }
found_tables = page.find_tables(fallback_settings)
table_bboxes = [table.bbox for table in found_tables]
# Define a filter function that keeps objects NOT inside any table bbox.
def not_within_bboxes(obj):
"""Check if an object is outside all table bounding boxes."""
for bbox in table_bboxes:
# Check if the object's vertical center is within a bbox
if bbox[1] <= (obj["top"] + obj["bottom"]) / 2 <= bbox[3]:
return False # It's inside a table, so we DON'T keep it
return True # It's outside all tables, so we DO keep it
# Create a filtered view of the page that contains only the non-table text.
non_table_page = page.filter(not_within_bboxes)
# Now, extract text from this filtered page view.
text = non_table_page.extract_text(x_tolerance=2)
if text:
page_content_parts.append(text)
# Process and append the structured Markdown tables
if found_tables:
logger.info(f"Found {len(found_tables)} tables on page {page_num + 1}")
for table in found_tables:
markdown_table = self._convert_table_to_markdown(table.extract())
page_content_parts.append(f"\n\n{markdown_table}\n\n")
all_page_content.append("".join(page_content_parts))
final_text = "\n\n--- Page Break ---\n\n".join(all_page_content)
logger.info(f"PDF parsing complete. Extracted {len(final_text)} text chars.")
return final_text
except Exception as e:
logger.error(f"Failed to parse PDF document: {str(e)}")
return ""
finally:
# This block is GUARANTEED to execute, preventing resource leaks.
if os.path.exists(temp_pdf_path):
try:
os.remove(temp_pdf_path)
logger.info(f"Temporary file cleaned up: {temp_pdf_path}")
except OSError as e:
logger.error(f"Error removing temporary file {temp_pdf_path}: {e}")

360
docreader/parser/storage.py Normal file
View File

@@ -0,0 +1,360 @@
# -*- coding: utf-8 -*-
import os
import uuid
import logging
import io
import traceback
from abc import ABC, abstractmethod
from typing import Tuple, Optional
from qcloud_cos import CosConfig, CosS3Client
from minio import Minio
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class Storage(ABC):
"""Abstract base class for object storage operations"""
@abstractmethod
def upload_file(self, file_path: str) -> str:
"""Upload file to object storage
Args:
file_path: File path
Returns:
File URL
"""
pass
@abstractmethod
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to object storage
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
pass
class CosStorage(Storage):
"""Tencent Cloud COS storage implementation"""
def __init__(self, storage_config=None):
"""Initialize COS storage
Args:
storage_config: Storage configuration
"""
self.storage_config = storage_config
self.client, self.bucket_name, self.region, self.prefix = self._init_cos_client()
def _init_cos_client(self):
"""Initialize Tencent Cloud COS client"""
try:
# COS_ENABLE_OLD_DOMAIN applies to both configuration paths, so read it up front
enable_old_domain = (
os.getenv("COS_ENABLE_OLD_DOMAIN", "true").lower() == "true"
)
# Use provided COS config if available, otherwise fall back to environment variables
if self.storage_config and self.storage_config.get("access_key_id") != "":
cos_config = self.storage_config
secret_id = cos_config.get("access_key_id")
secret_key = cos_config.get("secret_access_key")
region = cos_config.get("region")
bucket_name = cos_config.get("bucket_name")
appid = cos_config.get("app_id")
prefix = cos_config.get("path_prefix", "")
else:
# Get COS configuration from environment variables
secret_id = os.getenv("COS_SECRET_ID")
secret_key = os.getenv("COS_SECRET_KEY")
region = os.getenv("COS_REGION")
bucket_name = os.getenv("COS_BUCKET_NAME")
appid = os.getenv("COS_APP_ID")
prefix = os.getenv("COS_PATH_PREFIX")
if not all([secret_id, secret_key, region, bucket_name, appid]):
logger.error(
"Incomplete COS configuration, missing required settings: "
f"secret_id set: {bool(secret_id)}, secret_key set: {bool(secret_key)}, "
f"region: {region}, bucket_name: {bucket_name}, appid: {appid}"
)
return None, None, None, None
# Initialize COS configuration
logger.info(
f"Initializing COS client with region: {region}, bucket: {bucket_name}"
)
config = CosConfig(
Appid=appid,
Region=region,
SecretId=secret_id,
SecretKey=secret_key,
EnableOldDomain=enable_old_domain,
)
# Create client
client = CosS3Client(config)
return client, bucket_name, region, prefix
except Exception as e:
logger.error(f"Failed to initialize COS client: {str(e)}")
return None, None, None, None
def _get_download_url(self, bucket_name, region, object_key):
"""Generate COS object URL
Args:
bucket_name: Bucket name
region: Region
object_key: Object key
Returns:
File URL
"""
return f"https://{bucket_name}.cos.{region}.myqcloud.com/{object_key}"
def upload_file(self, file_path: str) -> str:
"""Upload file to Tencent Cloud COS
Args:
file_path: File path
Returns:
File URL
"""
logger.info(f"Uploading file to COS: {file_path}")
try:
if not self.client:
return ""
# Generate object key, use UUID to avoid conflicts
file_name = os.path.basename(file_path)
object_key = (
f"{self.prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
if self.prefix
else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
)
logger.info(f"Generated object key: {object_key}")
# Upload file
logger.info("Attempting to upload file to COS")
response = self.client.upload_file(
Bucket=self.bucket_name, LocalFilePath=file_path, Key=object_key
)
# Get file URL
file_url = self._get_download_url(self.bucket_name, self.region, object_key)
logger.info(f"Successfully uploaded file to COS: {file_url}")
return file_url
except Exception as e:
logger.error(f"Failed to upload file to COS: {str(e)}")
return ""
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to Tencent Cloud COS
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
try:
logger.info(f"Uploading bytes content to COS, size: {len(content)} bytes")
if not self.client:
return ""
object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.prefix else f"images/{uuid.uuid4().hex}{file_ext}"
logger.info(f"Generated object key: {object_key}")
self.client.put_object(Bucket=self.bucket_name, Body=content, Key=object_key)
file_url = self._get_download_url(self.bucket_name, self.region, object_key)
logger.info(f"Successfully uploaded bytes to COS: {file_url}")
return file_url
except Exception as e:
logger.error(f"Failed to upload bytes to COS: {str(e)}")
traceback.print_exc()
return ""
class MinioStorage(Storage):
"""MinIO storage implementation"""
def __init__(self, storage_config=None):
"""Initialize MinIO storage
Args:
storage_config: Storage configuration
"""
self.storage_config = storage_config
self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = self._init_minio_client()
def _init_minio_client(self):
"""Initialize MinIO client from environment variables or injected config.
If storage_config.path_prefix contains JSON from server (for minio case),
prefer those values to override envs.
"""
try:
endpoint = os.getenv("MINIO_ENDPOINT")
use_ssl = os.getenv("MINIO_USE_SSL", "false").lower() == "true"
if self.storage_config and self.storage_config.get("bucket_name"):
storage_config = self.storage_config
bucket_name = storage_config.get("bucket_name")
path_prefix = (storage_config.get("path_prefix") or "").strip().strip("/")
access_key = storage_config.get("access_key_id")
secret_key = storage_config.get("secret_access_key")
else:
access_key = os.getenv("MINIO_ACCESS_KEY_ID")
secret_key = os.getenv("MINIO_SECRET_ACCESS_KEY")
bucket_name = os.getenv("MINIO_BUCKET_NAME")
path_prefix = os.getenv("MINIO_PATH_PREFIX", "").strip().strip("/")
if not all([endpoint, access_key, secret_key, bucket_name]):
logger.error("Incomplete MinIO configuration, missing required environment variables")
return None, None, None, None, None
# Initialize client
client = Minio(endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl)
# Ensure bucket exists
found = client.bucket_exists(bucket_name)
if not found:
client.make_bucket(bucket_name)
policy = '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}' % (bucket_name, bucket_name)
client.set_bucket_policy(bucket_name, policy)
return client, bucket_name, use_ssl, endpoint, path_prefix
except Exception as e:
logger.error(f"Failed to initialize MinIO client: {str(e)}")
return None, None, None, None, None
def _get_download_url(self, bucket_name: str, object_key: str, use_ssl: bool, endpoint: str, public_endpoint: str = None):
"""Construct a public URL for MinIO object.
If MINIO_PUBLIC_ENDPOINT is provided, use it; otherwise fallback to endpoint.
"""
if public_endpoint:
base = public_endpoint
else:
scheme = "https" if use_ssl else "http"
base = f"{scheme}://{endpoint}"
# Path-style URL for MinIO
return f"{base}/{bucket_name}/{object_key}"
def upload_file(self, file_path: str) -> str:
"""Upload file to MinIO
Args:
file_path: File path
Returns:
File URL
"""
logger.info(f"Uploading file to MinIO: {file_path}")
try:
if not self.client:
return ""
# Generate object key, use UUID to avoid conflicts
file_name = os.path.basename(file_path)
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" if self.path_prefix else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
logger.info(f"Generated MinIO object key: {object_key}")
# Upload file
logger.info("Attempting to upload file to MinIO")
with open(file_path, 'rb') as file_data:
file_size = os.path.getsize(file_path)
self.client.put_object(
bucket_name=self.bucket_name,
object_name=object_key,
data=file_data,
length=file_size,
content_type='application/octet-stream'
)
# Get file URL
file_url = self._get_download_url(
self.bucket_name,
object_key,
self.use_ssl,
self.endpoint,
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
)
logger.info(f"Successfully uploaded file to MinIO: {file_url}")
return file_url
except Exception as e:
logger.error(f"Failed to upload file to MinIO: {str(e)}")
return ""
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
"""Upload bytes to MinIO
Args:
content: Byte content to upload
file_ext: File extension
Returns:
File URL
"""
try:
logger.info(f"Uploading bytes content to MinIO, size: {len(content)} bytes")
if not self.client:
return ""
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.path_prefix else f"images/{uuid.uuid4().hex}{file_ext}"
logger.info(f"Generated MinIO object key: {object_key}")
self.client.put_object(
self.bucket_name,
object_key,
data=io.BytesIO(content),
length=len(content),
content_type="application/octet-stream"
)
file_url = self._get_download_url(
self.bucket_name,
object_key,
self.use_ssl,
self.endpoint,
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
)
logger.info(f"Successfully uploaded bytes to MinIO: {file_url}")
return file_url
except Exception as e:
logger.error(f"Failed to upload bytes to MinIO: {str(e)}")
traceback.print_exc()
return ""
def create_storage(storage_config=None) -> Storage:
"""Create a storage instance based on configuration or environment variables
Args:
storage_config: Storage configuration dictionary
Returns:
Storage instance
"""
storage_type = os.getenv("STORAGE_TYPE", "cos").lower()
if storage_config:
storage_type = str(storage_config.get("provider", storage_type)).lower()
logger.info(f"Creating {storage_type} storage instance")
if storage_type == "minio":
return MinioStorage(storage_config)
elif storage_type == "cos":
# Default to COS
return CosStorage(storage_config)
else:
return None
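A short sketch of the factory in use. The environment variable names mirror those read by MinioStorage above; the endpoint and credentials are placeholders:

```python
# Hedged sketch: assumes a local MinIO at localhost:9000 with placeholder keys.
import os

from parser.storage import create_storage

os.environ.setdefault("MINIO_ENDPOINT", "localhost:9000")
os.environ.setdefault("MINIO_ACCESS_KEY_ID", "minioadmin")
os.environ.setdefault("MINIO_SECRET_ACCESS_KEY", "minioadmin")
os.environ.setdefault("MINIO_BUCKET_NAME", "weknora-test")

storage = create_storage({"provider": "minio"})
if storage is not None:  # create_storage returns None for unknown providers
    url = storage.upload_bytes(b"hello", file_ext=".txt")
    print(f"uploaded: {url}")
```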

58
docreader/parser/text_parser.py Normal file
View File

@@ -0,0 +1,58 @@
import logging
from .base_parser import BaseParser
from typing import Dict, Any, Tuple, Union
logger = logging.getLogger(__name__)
class TextParser(BaseParser):
"""
Text document parser for processing plain text files.
This parser handles text extraction and chunking from plain text documents.
"""
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""
Parse text document content by decoding bytes to string.
This is a straightforward parser that simply converts the binary content
to text using appropriate character encoding.
Args:
content: Raw document content as bytes
Returns:
Parsed text content as string
"""
logger.info(f"Parsing text document, content size: {len(content)} bytes")
text = self.decode_bytes(content)
logger.info(
f"Successfully parsed text document, extracted {len(text)} characters"
)
return text
if __name__ == "__main__":
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger.info("Running TextParser in standalone mode")
# Sample text for testing
text = """## 标题1
![alt text](image.png)
## 标题2
![alt text](image2.png)
## 标题3
![alt text](image3.png)"""
logger.info(f"Test text content: {text}")
# Define separators for text splitting
separators = ["\n\n", "\n", ""]
parser = TextParser(separators=separators)
logger.info("Splitting text into units")
units = parser._split_into_units(text)
logger.info(f"Split text into {len(units)} units")
logger.info(f"Units: {units}")

130
docreader/parser/web_parser.py Normal file
View File

@@ -0,0 +1,130 @@
from typing import Any, Optional, Tuple, Dict, Union
import os
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from .base_parser import BaseParser, ParseResult
import logging
import asyncio
logger = logging.getLogger(__name__)
class WebParser(BaseParser):
"""Web page parser"""
def __init__(self, title: str, **kwargs):
self.title = title
self.proxy = os.environ.get("WEB_PROXY", "")
super().__init__(file_name=title, **kwargs)
logger.info(f"Initialized WebParser with title: {title}")
async def scrape(self, url: str) -> Any:
logger.info(f"Starting web page scraping for URL: {url}")
try:
async with async_playwright() as p:
kwargs = {}
if self.proxy:
kwargs["proxy"] = {"server": self.proxy}
logger.info("Launching WebKit browser")
browser = await p.webkit.launch(**kwargs)
page = await browser.new_page()
logger.info(f"Navigating to URL: {url}")
try:
await page.goto(url, timeout=30000)
logger.info("Initial page load complete")
except Exception as e:
logger.error(f"Error navigating to URL: {str(e)}")
await browser.close()
return BeautifulSoup(
"", "html.parser"
) # Return empty soup on navigation error
logger.info("Retrieving page HTML content")
content = await page.content()
logger.info(f"Retrieved {len(content)} bytes of HTML content")
await browser.close()
logger.info("Browser closed")
# Parse HTML content with BeautifulSoup
logger.info("Parsing HTML with BeautifulSoup")
soup = BeautifulSoup(content, "html.parser")
logger.info("Successfully parsed HTML content")
return soup
except Exception as e:
logger.error(f"Failed to scrape web page: {str(e)}")
# Return empty BeautifulSoup object on error
return BeautifulSoup("", "html.parser")
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""Parse web page
Args:
content: Web page content
Returns:
Parse result
"""
logger.info("Starting web page parsing")
# Call async method synchronously
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
# Run async method
# Handle content possibly being a string
if isinstance(content, bytes):
url = self.decode_bytes(content)
logger.info(f"Decoded URL from bytes: {url}")
else:
url = content
logger.info(f"Using content as URL directly: {url}")
logger.info(f"Scraping web page: {url}")
soup = loop.run_until_complete(self.scrape(url))
# Extract page text
logger.info("Extracting text from web page")
text = soup.get_text("\n")
logger.info(f"Extracted {len(text)} characters of text from URL: {url}")
# Get title, usually in <title> or <h1> tag
if self.title != "":
title = self.title
logger.info(f"Using provided title: {title}")
else:
title = soup.title.string if soup.title else None
logger.info(f"Found title tag: {title}")
if not title: # If <title> tag does not exist or is empty, try <h1> tag
h1_tag = soup.find("h1")
if h1_tag:
title = h1_tag.get_text()
logger.info(f"Using h1 tag as title: {title}")
else:
title = "Untitled Web Page"
logger.info("No title found, using default")
logger.info(f"Web page title: {title}")
text = "\n".join(
(line.strip() for line in text.splitlines() if line.strip())
)
result = title + "\n\n" + text
logger.info(
f"Web page parsing complete, total content: {len(result)} characters"
)
return result
except Exception as e:
logger.error(f"Error parsing web page: {str(e)}")
return f"Error parsing web page: {str(e)}"
finally:
# Close event loop
logger.info("Closing event loop")
loop.close()
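A quick sketch of WebParser in isolation; the chunking values mirror the URL handler earlier in this commit, and the URL is a placeholder:

```python
# Hedged sketch: WebParser resolves the URL itself, so parse_into_text is
# given the URL bytes rather than page HTML.
from parser.web_parser import WebParser

parser = WebParser(title="Example", chunk_size=512, chunk_overlap=64)
text = parser.parse_into_text(b"https://example.com")
print(text[:300])
```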

819
docreader/proto/docreader.pb.go Normal file
View File

@@ -0,0 +1,819 @@
// Code generated by protoc-gen-go. DO NOT EDIT.
// versions:
// protoc-gen-go v1.36.6
// protoc v5.29.3
// source: docreader.proto
package proto
import (
protoreflect "google.golang.org/protobuf/reflect/protoreflect"
protoimpl "google.golang.org/protobuf/runtime/protoimpl"
reflect "reflect"
sync "sync"
unsafe "unsafe"
)
const (
// Verify that this generated code is sufficiently up-to-date.
_ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion)
// Verify that runtime/protoimpl is sufficiently up-to-date.
_ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20)
)
// Object storage provider
type StorageProvider int32
const (
StorageProvider_STORAGE_PROVIDER_UNSPECIFIED StorageProvider = 0
StorageProvider_COS StorageProvider = 1 // Tencent Cloud COS
StorageProvider_MINIO StorageProvider = 2 // MinIO/S3 compatible
)
// Enum value maps for StorageProvider.
var (
StorageProvider_name = map[int32]string{
0: "STORAGE_PROVIDER_UNSPECIFIED",
1: "COS",
2: "MINIO",
}
StorageProvider_value = map[string]int32{
"STORAGE_PROVIDER_UNSPECIFIED": 0,
"COS": 1,
"MINIO": 2,
}
)
func (x StorageProvider) Enum() *StorageProvider {
p := new(StorageProvider)
*p = x
return p
}
func (x StorageProvider) String() string {
return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x))
}
func (StorageProvider) Descriptor() protoreflect.EnumDescriptor {
return file_docreader_proto_enumTypes[0].Descriptor()
}
func (StorageProvider) Type() protoreflect.EnumType {
return &file_docreader_proto_enumTypes[0]
}
func (x StorageProvider) Number() protoreflect.EnumNumber {
return protoreflect.EnumNumber(x)
}
// Deprecated: Use StorageProvider.Descriptor instead.
func (StorageProvider) EnumDescriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{0}
}
// Generic object storage configuration, compatible with both COS and MinIO
type StorageConfig struct {
state protoimpl.MessageState `protogen:"open.v1"`
Provider StorageProvider `protobuf:"varint,1,opt,name=provider,proto3,enum=docreader.StorageProvider" json:"provider,omitempty"` // Storage provider
Region string `protobuf:"bytes,2,opt,name=region,proto3" json:"region,omitempty"` // Region (used by COS)
BucketName string `protobuf:"bytes,3,opt,name=bucket_name,json=bucketName,proto3" json:"bucket_name,omitempty"` // Bucket name
AccessKeyId string `protobuf:"bytes,4,opt,name=access_key_id,json=accessKeyId,proto3" json:"access_key_id,omitempty"` // Access key ID (used by MinIO/S3)
SecretAccessKey string `protobuf:"bytes,5,opt,name=secret_access_key,json=secretAccessKey,proto3" json:"secret_access_key,omitempty"` // Secret access key (used by MinIO/S3)
AppId string `protobuf:"bytes,6,opt,name=app_id,json=appId,proto3" json:"app_id,omitempty"` // App ID (used by COS)
PathPrefix string `protobuf:"bytes,7,opt,name=path_prefix,json=pathPrefix,proto3" json:"path_prefix,omitempty"` // Path prefix
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *StorageConfig) Reset() {
*x = StorageConfig{}
mi := &file_docreader_proto_msgTypes[0]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *StorageConfig) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*StorageConfig) ProtoMessage() {}
func (x *StorageConfig) ProtoReflect() protoreflect.Message {
mi := &file_docreader_proto_msgTypes[0]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use StorageConfig.ProtoReflect.Descriptor instead.
func (*StorageConfig) Descriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{0}
}
func (x *StorageConfig) GetProvider() StorageProvider {
if x != nil {
return x.Provider
}
return StorageProvider_STORAGE_PROVIDER_UNSPECIFIED
}
func (x *StorageConfig) GetRegion() string {
if x != nil {
return x.Region
}
return ""
}
func (x *StorageConfig) GetBucketName() string {
if x != nil {
return x.BucketName
}
return ""
}
func (x *StorageConfig) GetAccessKeyId() string {
if x != nil {
return x.AccessKeyId
}
return ""
}
func (x *StorageConfig) GetSecretAccessKey() string {
if x != nil {
return x.SecretAccessKey
}
return ""
}
func (x *StorageConfig) GetAppId() string {
if x != nil {
return x.AppId
}
return ""
}
func (x *StorageConfig) GetPathPrefix() string {
if x != nil {
return x.PathPrefix
}
return ""
}
// VLM configuration
type VLMConfig struct {
state protoimpl.MessageState `protogen:"open.v1"`
ModelName string `protobuf:"bytes,1,opt,name=model_name,json=modelName,proto3" json:"model_name,omitempty"` // VLM Model Name
BaseUrl string `protobuf:"bytes,2,opt,name=base_url,json=baseUrl,proto3" json:"base_url,omitempty"` // VLM Base URL
ApiKey string `protobuf:"bytes,3,opt,name=api_key,json=apiKey,proto3" json:"api_key,omitempty"` // VLM API Key
InterfaceType string `protobuf:"bytes,4,opt,name=interface_type,json=interfaceType,proto3" json:"interface_type,omitempty"` // VLM Interface Type: "ollama" or "openai"
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *VLMConfig) Reset() {
*x = VLMConfig{}
mi := &file_docreader_proto_msgTypes[1]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *VLMConfig) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*VLMConfig) ProtoMessage() {}
func (x *VLMConfig) ProtoReflect() protoreflect.Message {
mi := &file_docreader_proto_msgTypes[1]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use VLMConfig.ProtoReflect.Descriptor instead.
func (*VLMConfig) Descriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{1}
}
func (x *VLMConfig) GetModelName() string {
if x != nil {
return x.ModelName
}
return ""
}
func (x *VLMConfig) GetBaseUrl() string {
if x != nil {
return x.BaseUrl
}
return ""
}
func (x *VLMConfig) GetApiKey() string {
if x != nil {
return x.ApiKey
}
return ""
}
func (x *VLMConfig) GetInterfaceType() string {
if x != nil {
return x.InterfaceType
}
return ""
}
type ReadConfig struct {
state protoimpl.MessageState `protogen:"open.v1"`
ChunkSize int32 `protobuf:"varint,1,opt,name=chunk_size,json=chunkSize,proto3" json:"chunk_size,omitempty"` // Chunk size
ChunkOverlap int32 `protobuf:"varint,2,opt,name=chunk_overlap,json=chunkOverlap,proto3" json:"chunk_overlap,omitempty"` // Chunk overlap
Separators []string `protobuf:"bytes,3,rep,name=separators,proto3" json:"separators,omitempty"` // Separators
EnableMultimodal bool `protobuf:"varint,4,opt,name=enable_multimodal,json=enableMultimodal,proto3" json:"enable_multimodal,omitempty"` // Enable multimodal processing
StorageConfig *StorageConfig `protobuf:"bytes,5,opt,name=storage_config,json=storageConfig,proto3" json:"storage_config,omitempty"` // Object storage configuration (generic)
VlmConfig *VLMConfig `protobuf:"bytes,6,opt,name=vlm_config,json=vlmConfig,proto3" json:"vlm_config,omitempty"` // VLM configuration
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *ReadConfig) Reset() {
*x = ReadConfig{}
mi := &file_docreader_proto_msgTypes[2]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *ReadConfig) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*ReadConfig) ProtoMessage() {}
func (x *ReadConfig) ProtoReflect() protoreflect.Message {
mi := &file_docreader_proto_msgTypes[2]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use ReadConfig.ProtoReflect.Descriptor instead.
func (*ReadConfig) Descriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{2}
}
func (x *ReadConfig) GetChunkSize() int32 {
if x != nil {
return x.ChunkSize
}
return 0
}
func (x *ReadConfig) GetChunkOverlap() int32 {
if x != nil {
return x.ChunkOverlap
}
return 0
}
func (x *ReadConfig) GetSeparators() []string {
if x != nil {
return x.Separators
}
return nil
}
func (x *ReadConfig) GetEnableMultimodal() bool {
if x != nil {
return x.EnableMultimodal
}
return false
}
func (x *ReadConfig) GetStorageConfig() *StorageConfig {
if x != nil {
return x.StorageConfig
}
return nil
}
func (x *ReadConfig) GetVlmConfig() *VLMConfig {
if x != nil {
return x.VlmConfig
}
return nil
}
// Request to read a document from a file
type ReadFromFileRequest struct {
state protoimpl.MessageState `protogen:"open.v1"`
FileContent []byte `protobuf:"bytes,1,opt,name=file_content,json=fileContent,proto3" json:"file_content,omitempty"` // File content
FileName string `protobuf:"bytes,2,opt,name=file_name,json=fileName,proto3" json:"file_name,omitempty"` // File name
FileType string `protobuf:"bytes,3,opt,name=file_type,json=fileType,proto3" json:"file_type,omitempty"` // File type
ReadConfig *ReadConfig `protobuf:"bytes,4,opt,name=read_config,json=readConfig,proto3" json:"read_config,omitempty"`
RequestId string `protobuf:"bytes,5,opt,name=request_id,json=requestId,proto3" json:"request_id,omitempty"`
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *ReadFromFileRequest) Reset() {
*x = ReadFromFileRequest{}
mi := &file_docreader_proto_msgTypes[3]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *ReadFromFileRequest) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*ReadFromFileRequest) ProtoMessage() {}
func (x *ReadFromFileRequest) ProtoReflect() protoreflect.Message {
mi := &file_docreader_proto_msgTypes[3]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use ReadFromFileRequest.ProtoReflect.Descriptor instead.
func (*ReadFromFileRequest) Descriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{3}
}
func (x *ReadFromFileRequest) GetFileContent() []byte {
if x != nil {
return x.FileContent
}
return nil
}
func (x *ReadFromFileRequest) GetFileName() string {
if x != nil {
return x.FileName
}
return ""
}
func (x *ReadFromFileRequest) GetFileType() string {
if x != nil {
return x.FileType
}
return ""
}
func (x *ReadFromFileRequest) GetReadConfig() *ReadConfig {
if x != nil {
return x.ReadConfig
}
return nil
}
func (x *ReadFromFileRequest) GetRequestId() string {
if x != nil {
return x.RequestId
}
return ""
}
// Request to read a document from a URL
type ReadFromURLRequest struct {
state protoimpl.MessageState `protogen:"open.v1"`
Url string `protobuf:"bytes,1,opt,name=url,proto3" json:"url,omitempty"` // Document URL
Title string `protobuf:"bytes,2,opt,name=title,proto3" json:"title,omitempty"` // Title
ReadConfig *ReadConfig `protobuf:"bytes,3,opt,name=read_config,json=readConfig,proto3" json:"read_config,omitempty"`
RequestId string `protobuf:"bytes,4,opt,name=request_id,json=requestId,proto3" json:"request_id,omitempty"`
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *ReadFromURLRequest) Reset() {
*x = ReadFromURLRequest{}
mi := &file_docreader_proto_msgTypes[4]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *ReadFromURLRequest) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*ReadFromURLRequest) ProtoMessage() {}
func (x *ReadFromURLRequest) ProtoReflect() protoreflect.Message {
mi := &file_docreader_proto_msgTypes[4]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use ReadFromURLRequest.ProtoReflect.Descriptor instead.
func (*ReadFromURLRequest) Descriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{4}
}
func (x *ReadFromURLRequest) GetUrl() string {
if x != nil {
return x.Url
}
return ""
}
func (x *ReadFromURLRequest) GetTitle() string {
if x != nil {
return x.Title
}
return ""
}
func (x *ReadFromURLRequest) GetReadConfig() *ReadConfig {
if x != nil {
return x.ReadConfig
}
return nil
}
func (x *ReadFromURLRequest) GetRequestId() string {
if x != nil {
return x.RequestId
}
return ""
}
// Image information
type Image struct {
state protoimpl.MessageState `protogen:"open.v1"`
Url string `protobuf:"bytes,1,opt,name=url,proto3" json:"url,omitempty"` // Image URL
Caption string `protobuf:"bytes,2,opt,name=caption,proto3" json:"caption,omitempty"` // Image caption
OcrText string `protobuf:"bytes,3,opt,name=ocr_text,json=ocrText,proto3" json:"ocr_text,omitempty"` // Text extracted via OCR
OriginalUrl string `protobuf:"bytes,4,opt,name=original_url,json=originalUrl,proto3" json:"original_url,omitempty"` // Original image URL
Start int32 `protobuf:"varint,5,opt,name=start,proto3" json:"start,omitempty"` // Start position of the image in the text
End int32 `protobuf:"varint,6,opt,name=end,proto3" json:"end,omitempty"` // End position of the image in the text
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *Image) Reset() {
*x = Image{}
mi := &file_docreader_proto_msgTypes[5]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *Image) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*Image) ProtoMessage() {}
func (x *Image) ProtoReflect() protoreflect.Message {
mi := &file_docreader_proto_msgTypes[5]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use Image.ProtoReflect.Descriptor instead.
func (*Image) Descriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{5}
}
func (x *Image) GetUrl() string {
if x != nil {
return x.Url
}
return ""
}
func (x *Image) GetCaption() string {
if x != nil {
return x.Caption
}
return ""
}
func (x *Image) GetOcrText() string {
if x != nil {
return x.OcrText
}
return ""
}
func (x *Image) GetOriginalUrl() string {
if x != nil {
return x.OriginalUrl
}
return ""
}
func (x *Image) GetStart() int32 {
if x != nil {
return x.Start
}
return 0
}
func (x *Image) GetEnd() int32 {
if x != nil {
return x.End
}
return 0
}
type Chunk struct {
state protoimpl.MessageState `protogen:"open.v1"`
Content string `protobuf:"bytes,1,opt,name=content,proto3" json:"content,omitempty"` // Chunk content
Seq int32 `protobuf:"varint,2,opt,name=seq,proto3" json:"seq,omitempty"` // Order of the chunk within the document
Start int32 `protobuf:"varint,3,opt,name=start,proto3" json:"start,omitempty"` // Start position of the chunk in the document
End int32 `protobuf:"varint,4,opt,name=end,proto3" json:"end,omitempty"` // End position of the chunk in the document
Images []*Image `protobuf:"bytes,5,rep,name=images,proto3" json:"images,omitempty"` // Images contained in the chunk
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *Chunk) Reset() {
*x = Chunk{}
mi := &file_docreader_proto_msgTypes[6]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *Chunk) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*Chunk) ProtoMessage() {}
func (x *Chunk) ProtoReflect() protoreflect.Message {
mi := &file_docreader_proto_msgTypes[6]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use Chunk.ProtoReflect.Descriptor instead.
func (*Chunk) Descriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{6}
}
func (x *Chunk) GetContent() string {
if x != nil {
return x.Content
}
return ""
}
func (x *Chunk) GetSeq() int32 {
if x != nil {
return x.Seq
}
return 0
}
func (x *Chunk) GetStart() int32 {
if x != nil {
return x.Start
}
return 0
}
func (x *Chunk) GetEnd() int32 {
if x != nil {
return x.End
}
return 0
}
func (x *Chunk) GetImages() []*Image {
if x != nil {
return x.Images
}
return nil
}
// Response for reading a document from a URL
type ReadResponse struct {
state protoimpl.MessageState `protogen:"open.v1"`
Chunks []*Chunk `protobuf:"bytes,1,rep,name=chunks,proto3" json:"chunks,omitempty"` // Document chunks
Error string `protobuf:"bytes,2,opt,name=error,proto3" json:"error,omitempty"` // Error message
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *ReadResponse) Reset() {
*x = ReadResponse{}
mi := &file_docreader_proto_msgTypes[7]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *ReadResponse) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*ReadResponse) ProtoMessage() {}
func (x *ReadResponse) ProtoReflect() protoreflect.Message {
mi := &file_docreader_proto_msgTypes[7]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use ReadResponse.ProtoReflect.Descriptor instead.
func (*ReadResponse) Descriptor() ([]byte, []int) {
return file_docreader_proto_rawDescGZIP(), []int{7}
}
func (x *ReadResponse) GetChunks() []*Chunk {
if x != nil {
return x.Chunks
}
return nil
}
func (x *ReadResponse) GetError() string {
if x != nil {
return x.Error
}
return ""
}
var File_docreader_proto protoreflect.FileDescriptor
const file_docreader_proto_rawDesc = "" +
"\n" +
"\x0fdocreader.proto\x12\tdocreader\"\x88\x02\n" +
"\rStorageConfig\x126\n" +
"\bprovider\x18\x01 \x01(\x0e2\x1a.docreader.StorageProviderR\bprovider\x12\x16\n" +
"\x06region\x18\x02 \x01(\tR\x06region\x12\x1f\n" +
"\vbucket_name\x18\x03 \x01(\tR\n" +
"bucketName\x12\"\n" +
"\raccess_key_id\x18\x04 \x01(\tR\vaccessKeyId\x12*\n" +
"\x11secret_access_key\x18\x05 \x01(\tR\x0fsecretAccessKey\x12\x15\n" +
"\x06app_id\x18\x06 \x01(\tR\x05appId\x12\x1f\n" +
"\vpath_prefix\x18\a \x01(\tR\n" +
"pathPrefix\"\x85\x01\n" +
"\tVLMConfig\x12\x1d\n" +
"\n" +
"model_name\x18\x01 \x01(\tR\tmodelName\x12\x19\n" +
"\bbase_url\x18\x02 \x01(\tR\abaseUrl\x12\x17\n" +
"\aapi_key\x18\x03 \x01(\tR\x06apiKey\x12%\n" +
"\x0einterface_type\x18\x04 \x01(\tR\rinterfaceType\"\x93\x02\n" +
"\n" +
"ReadConfig\x12\x1d\n" +
"\n" +
"chunk_size\x18\x01 \x01(\x05R\tchunkSize\x12#\n" +
"\rchunk_overlap\x18\x02 \x01(\x05R\fchunkOverlap\x12\x1e\n" +
"\n" +
"separators\x18\x03 \x03(\tR\n" +
"separators\x12+\n" +
"\x11enable_multimodal\x18\x04 \x01(\bR\x10enableMultimodal\x12?\n" +
"\x0estorage_config\x18\x05 \x01(\v2\x18.docreader.StorageConfigR\rstorageConfig\x123\n" +
"\n" +
"vlm_config\x18\x06 \x01(\v2\x14.docreader.VLMConfigR\tvlmConfig\"\xc9\x01\n" +
"\x13ReadFromFileRequest\x12!\n" +
"\ffile_content\x18\x01 \x01(\fR\vfileContent\x12\x1b\n" +
"\tfile_name\x18\x02 \x01(\tR\bfileName\x12\x1b\n" +
"\tfile_type\x18\x03 \x01(\tR\bfileType\x126\n" +
"\vread_config\x18\x04 \x01(\v2\x15.docreader.ReadConfigR\n" +
"readConfig\x12\x1d\n" +
"\n" +
"request_id\x18\x05 \x01(\tR\trequestId\"\x93\x01\n" +
"\x12ReadFromURLRequest\x12\x10\n" +
"\x03url\x18\x01 \x01(\tR\x03url\x12\x14\n" +
"\x05title\x18\x02 \x01(\tR\x05title\x126\n" +
"\vread_config\x18\x03 \x01(\v2\x15.docreader.ReadConfigR\n" +
"readConfig\x12\x1d\n" +
"\n" +
"request_id\x18\x04 \x01(\tR\trequestId\"\x99\x01\n" +
"\x05Image\x12\x10\n" +
"\x03url\x18\x01 \x01(\tR\x03url\x12\x18\n" +
"\acaption\x18\x02 \x01(\tR\acaption\x12\x19\n" +
"\bocr_text\x18\x03 \x01(\tR\aocrText\x12!\n" +
"\foriginal_url\x18\x04 \x01(\tR\voriginalUrl\x12\x14\n" +
"\x05start\x18\x05 \x01(\x05R\x05start\x12\x10\n" +
"\x03end\x18\x06 \x01(\x05R\x03end\"\x85\x01\n" +
"\x05Chunk\x12\x18\n" +
"\acontent\x18\x01 \x01(\tR\acontent\x12\x10\n" +
"\x03seq\x18\x02 \x01(\x05R\x03seq\x12\x14\n" +
"\x05start\x18\x03 \x01(\x05R\x05start\x12\x10\n" +
"\x03end\x18\x04 \x01(\x05R\x03end\x12(\n" +
"\x06images\x18\x05 \x03(\v2\x10.docreader.ImageR\x06images\"N\n" +
"\fReadResponse\x12(\n" +
"\x06chunks\x18\x01 \x03(\v2\x10.docreader.ChunkR\x06chunks\x12\x14\n" +
"\x05error\x18\x02 \x01(\tR\x05error*G\n" +
"\x0fStorageProvider\x12 \n" +
"\x1cSTORAGE_PROVIDER_UNSPECIFIED\x10\x00\x12\a\n" +
"\x03COS\x10\x01\x12\t\n" +
"\x05MINIO\x10\x022\x9f\x01\n" +
"\tDocReader\x12I\n" +
"\fReadFromFile\x12\x1e.docreader.ReadFromFileRequest\x1a\x17.docreader.ReadResponse\"\x00\x12G\n" +
"\vReadFromURL\x12\x1d.docreader.ReadFromURLRequest\x1a\x17.docreader.ReadResponse\"\x00B5Z3github.com/Tencent/WeKnora/internal/docreader/protob\x06proto3"
var (
file_docreader_proto_rawDescOnce sync.Once
file_docreader_proto_rawDescData []byte
)
func file_docreader_proto_rawDescGZIP() []byte {
file_docreader_proto_rawDescOnce.Do(func() {
file_docreader_proto_rawDescData = protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_docreader_proto_rawDesc), len(file_docreader_proto_rawDesc)))
})
return file_docreader_proto_rawDescData
}
var file_docreader_proto_enumTypes = make([]protoimpl.EnumInfo, 1)
var file_docreader_proto_msgTypes = make([]protoimpl.MessageInfo, 8)
var file_docreader_proto_goTypes = []any{
(StorageProvider)(0), // 0: docreader.StorageProvider
(*StorageConfig)(nil), // 1: docreader.StorageConfig
(*VLMConfig)(nil), // 2: docreader.VLMConfig
(*ReadConfig)(nil), // 3: docreader.ReadConfig
(*ReadFromFileRequest)(nil), // 4: docreader.ReadFromFileRequest
(*ReadFromURLRequest)(nil), // 5: docreader.ReadFromURLRequest
(*Image)(nil), // 6: docreader.Image
(*Chunk)(nil), // 7: docreader.Chunk
(*ReadResponse)(nil), // 8: docreader.ReadResponse
}
var file_docreader_proto_depIdxs = []int32{
0, // 0: docreader.StorageConfig.provider:type_name -> docreader.StorageProvider
1, // 1: docreader.ReadConfig.storage_config:type_name -> docreader.StorageConfig
2, // 2: docreader.ReadConfig.vlm_config:type_name -> docreader.VLMConfig
3, // 3: docreader.ReadFromFileRequest.read_config:type_name -> docreader.ReadConfig
3, // 4: docreader.ReadFromURLRequest.read_config:type_name -> docreader.ReadConfig
6, // 5: docreader.Chunk.images:type_name -> docreader.Image
7, // 6: docreader.ReadResponse.chunks:type_name -> docreader.Chunk
4, // 7: docreader.DocReader.ReadFromFile:input_type -> docreader.ReadFromFileRequest
5, // 8: docreader.DocReader.ReadFromURL:input_type -> docreader.ReadFromURLRequest
8, // 9: docreader.DocReader.ReadFromFile:output_type -> docreader.ReadResponse
8, // 10: docreader.DocReader.ReadFromURL:output_type -> docreader.ReadResponse
9, // [9:11] is the sub-list for method output_type
7, // [7:9] is the sub-list for method input_type
7, // [7:7] is the sub-list for extension type_name
7, // [7:7] is the sub-list for extension extendee
0, // [0:7] is the sub-list for field type_name
}
func init() { file_docreader_proto_init() }
func file_docreader_proto_init() {
if File_docreader_proto != nil {
return
}
type x struct{}
out := protoimpl.TypeBuilder{
File: protoimpl.DescBuilder{
GoPackagePath: reflect.TypeOf(x{}).PkgPath(),
RawDescriptor: unsafe.Slice(unsafe.StringData(file_docreader_proto_rawDesc), len(file_docreader_proto_rawDesc)),
NumEnums: 1,
NumMessages: 8,
NumExtensions: 0,
NumServices: 1,
},
GoTypes: file_docreader_proto_goTypes,
DependencyIndexes: file_docreader_proto_depIdxs,
EnumInfos: file_docreader_proto_enumTypes,
MessageInfos: file_docreader_proto_msgTypes,
}.Build()
File_docreader_proto = out.File
file_docreader_proto_goTypes = nil
file_docreader_proto_depIdxs = nil
}

89
docreader/proto/docreader.proto Normal file
View File

@@ -0,0 +1,89 @@
syntax = "proto3";
package docreader;
option go_package = "github.com/Tencent/WeKnora/internal/docreader/proto";
// Document reading service
service DocReader {
// Read a document from a file
rpc ReadFromFile(ReadFromFileRequest) returns (ReadResponse) {}
// Read a document from a URL
rpc ReadFromURL(ReadFromURLRequest) returns (ReadResponse) {}
}
// Object storage provider
enum StorageProvider {
STORAGE_PROVIDER_UNSPECIFIED = 0;
COS = 1; // Tencent Cloud COS
MINIO = 2; // MinIO/S3 compatible
}
// Generic object storage configuration, compatible with both COS and MinIO
message StorageConfig {
StorageProvider provider = 1; // Storage provider
string region = 2; // Region (used by COS)
string bucket_name = 3; // Bucket name
string access_key_id = 4; // Access key ID (used by MinIO/S3)
string secret_access_key = 5; // Secret access key (used by MinIO/S3)
string app_id = 6; // App ID (used by COS)
string path_prefix = 7; // Path prefix
}
// VLM configuration
message VLMConfig {
string model_name = 1; // VLM Model Name
string base_url = 2; // VLM Base URL
string api_key = 3; // VLM API Key
string interface_type = 4; // VLM Interface Type: "ollama" or "openai"
}
message ReadConfig {
int32 chunk_size = 1; // Chunk size
int32 chunk_overlap = 2; // Chunk overlap
repeated string separators = 3; // Separators
bool enable_multimodal = 4; // Enable multimodal processing
StorageConfig storage_config = 5; // Object storage configuration (generic)
VLMConfig vlm_config = 6; // VLM configuration
}
// Request to read a document from a file
message ReadFromFileRequest {
bytes file_content = 1; // File content
string file_name = 2; // File name
string file_type = 3; // File type
ReadConfig read_config = 4;
string request_id = 5;
}
// Request to read a document from a URL
message ReadFromURLRequest {
string url = 1; // Document URL
string title = 2; // Title
ReadConfig read_config = 3;
string request_id = 4;
}
// Image information
message Image {
string url = 1; // Image URL
string caption = 2; // Image caption
string ocr_text = 3; // Text extracted via OCR
string original_url = 4; // Original image URL
int32 start = 5; // Start position of the image in the text
int32 end = 6; // End position of the image in the text
}
message Chunk {
string content = 1; // Chunk content
int32 seq = 2; // Order of the chunk within the document
int32 start = 3; // Start position of the chunk in the document
int32 end = 4; // End position of the chunk in the document
repeated Image images = 5; // Images contained in the chunk
}
// Response for reading a document from a URL
message ReadResponse {
repeated Chunk chunks = 1; // Document chunks
string error = 2; // Error message
}
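To make the contract concrete, a hedged Python sketch of the file-upload RPC; the field names come from the messages above, while the module paths and server address are assumptions:

```python
# Hedged sketch of ReadFromFile against the service defined above.
import grpc

from proto import docreader_pb2, docreader_pb2_grpc

with open("report.pdf", "rb") as f:
    request = docreader_pb2.ReadFromFileRequest(
        file_content=f.read(),
        file_name="report.pdf",
        file_type="pdf",
        read_config=docreader_pb2.ReadConfig(chunk_size=512, chunk_overlap=64),
        request_id="demo-001",
    )

stub = docreader_pb2_grpc.DocReaderStub(grpc.insecure_channel("localhost:50051"))
response = stub.ReadFromFile(request)
for chunk in response.chunks:
    print(chunk.seq, chunk.start, chunk.end, len(chunk.content))
```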

167
docreader/proto/docreader_grpc.pb.go Normal file
View File

@@ -0,0 +1,167 @@
// Code generated by protoc-gen-go-grpc. DO NOT EDIT.
// versions:
// - protoc-gen-go-grpc v1.5.1
// - protoc v5.29.3
// source: docreader.proto
package proto
import (
context "context"
grpc "google.golang.org/grpc"
codes "google.golang.org/grpc/codes"
status "google.golang.org/grpc/status"
)
// This is a compile-time assertion to ensure that this generated file
// is compatible with the grpc package it is being compiled against.
// Requires gRPC-Go v1.64.0 or later.
const _ = grpc.SupportPackageIsVersion9
const (
DocReader_ReadFromFile_FullMethodName = "/docreader.DocReader/ReadFromFile"
DocReader_ReadFromURL_FullMethodName = "/docreader.DocReader/ReadFromURL"
)
// DocReaderClient is the client API for DocReader service.
//
// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
//
// Document reading service
type DocReaderClient interface {
// Read a document from a file
ReadFromFile(ctx context.Context, in *ReadFromFileRequest, opts ...grpc.CallOption) (*ReadResponse, error)
// Read a document from a URL
ReadFromURL(ctx context.Context, in *ReadFromURLRequest, opts ...grpc.CallOption) (*ReadResponse, error)
}
type docReaderClient struct {
cc grpc.ClientConnInterface
}
func NewDocReaderClient(cc grpc.ClientConnInterface) DocReaderClient {
return &docReaderClient{cc}
}
func (c *docReaderClient) ReadFromFile(ctx context.Context, in *ReadFromFileRequest, opts ...grpc.CallOption) (*ReadResponse, error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
out := new(ReadResponse)
err := c.cc.Invoke(ctx, DocReader_ReadFromFile_FullMethodName, in, out, cOpts...)
if err != nil {
return nil, err
}
return out, nil
}
func (c *docReaderClient) ReadFromURL(ctx context.Context, in *ReadFromURLRequest, opts ...grpc.CallOption) (*ReadResponse, error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
out := new(ReadResponse)
err := c.cc.Invoke(ctx, DocReader_ReadFromURL_FullMethodName, in, out, cOpts...)
if err != nil {
return nil, err
}
return out, nil
}
// DocReaderServer is the server API for DocReader service.
// All implementations must embed UnimplementedDocReaderServer
// for forward compatibility.
//
// Document reading service
type DocReaderServer interface {
// Read a document from a file
ReadFromFile(context.Context, *ReadFromFileRequest) (*ReadResponse, error)
// Read a document from a URL
ReadFromURL(context.Context, *ReadFromURLRequest) (*ReadResponse, error)
mustEmbedUnimplementedDocReaderServer()
}
// UnimplementedDocReaderServer must be embedded to have
// forward compatible implementations.
//
// NOTE: this should be embedded by value instead of pointer to avoid a nil
// pointer dereference when methods are called.
type UnimplementedDocReaderServer struct{}
func (UnimplementedDocReaderServer) ReadFromFile(context.Context, *ReadFromFileRequest) (*ReadResponse, error) {
return nil, status.Errorf(codes.Unimplemented, "method ReadFromFile not implemented")
}
func (UnimplementedDocReaderServer) ReadFromURL(context.Context, *ReadFromURLRequest) (*ReadResponse, error) {
return nil, status.Errorf(codes.Unimplemented, "method ReadFromURL not implemented")
}
func (UnimplementedDocReaderServer) mustEmbedUnimplementedDocReaderServer() {}
func (UnimplementedDocReaderServer) testEmbeddedByValue() {}
// UnsafeDocReaderServer may be embedded to opt out of forward compatibility for this service.
// Use of this interface is not recommended, as added methods to DocReaderServer will
// result in compilation errors.
type UnsafeDocReaderServer interface {
mustEmbedUnimplementedDocReaderServer()
}
func RegisterDocReaderServer(s grpc.ServiceRegistrar, srv DocReaderServer) {
// If the following call panics, it indicates UnimplementedDocReaderServer was
// embedded by pointer and is nil. This will cause panics if an
// unimplemented method is ever invoked, so we test this at initialization
// time to prevent it from happening at runtime later due to I/O.
if t, ok := srv.(interface{ testEmbeddedByValue() }); ok {
t.testEmbeddedByValue()
}
s.RegisterService(&DocReader_ServiceDesc, srv)
}
func _DocReader_ReadFromFile_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
in := new(ReadFromFileRequest)
if err := dec(in); err != nil {
return nil, err
}
if interceptor == nil {
return srv.(DocReaderServer).ReadFromFile(ctx, in)
}
info := &grpc.UnaryServerInfo{
Server: srv,
FullMethod: DocReader_ReadFromFile_FullMethodName,
}
handler := func(ctx context.Context, req interface{}) (interface{}, error) {
return srv.(DocReaderServer).ReadFromFile(ctx, req.(*ReadFromFileRequest))
}
return interceptor(ctx, in, info, handler)
}
func _DocReader_ReadFromURL_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
in := new(ReadFromURLRequest)
if err := dec(in); err != nil {
return nil, err
}
if interceptor == nil {
return srv.(DocReaderServer).ReadFromURL(ctx, in)
}
info := &grpc.UnaryServerInfo{
Server: srv,
FullMethod: DocReader_ReadFromURL_FullMethodName,
}
handler := func(ctx context.Context, req interface{}) (interface{}, error) {
return srv.(DocReaderServer).ReadFromURL(ctx, req.(*ReadFromURLRequest))
}
return interceptor(ctx, in, info, handler)
}
// DocReader_ServiceDesc is the grpc.ServiceDesc for DocReader service.
// It's only intended for direct use with grpc.RegisterService,
// and not to be introspected or modified (even as a copy)
var DocReader_ServiceDesc = grpc.ServiceDesc{
ServiceName: "docreader.DocReader",
HandlerType: (*DocReaderServer)(nil),
Methods: []grpc.MethodDesc{
{
MethodName: "ReadFromFile",
Handler: _DocReader_ReadFromFile_Handler,
},
{
MethodName: "ReadFromURL",
Handler: _DocReader_ReadFromURL_Handler,
},
},
Streams: []grpc.StreamDesc{},
Metadata: "docreader.proto",
}

55
docreader/proto/docreader_pb2.py Normal file
View File

@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# NO CHECKED-IN PROTOBUF GENCODE
# source: docreader.proto
# Protobuf Python Version: 6.31.1
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import runtime_version as _runtime_version
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
_runtime_version.ValidateProtobufRuntimeVersion(
_runtime_version.Domain.PUBLIC,
6,
31,
1,
'',
'docreader.proto'
)
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0f\x64ocreader.proto\x12\tdocreader\"\xb9\x01\n\rStorageConfig\x12,\n\x08provider\x18\x01 \x01(\x0e\x32\x1a.docreader.StorageProvider\x12\x0e\n\x06region\x18\x02 \x01(\t\x12\x13\n\x0b\x62ucket_name\x18\x03 \x01(\t\x12\x15\n\raccess_key_id\x18\x04 \x01(\t\x12\x19\n\x11secret_access_key\x18\x05 \x01(\t\x12\x0e\n\x06\x61pp_id\x18\x06 \x01(\t\x12\x13\n\x0bpath_prefix\x18\x07 \x01(\t\"Z\n\tVLMConfig\x12\x12\n\nmodel_name\x18\x01 \x01(\t\x12\x10\n\x08\x62\x61se_url\x18\x02 \x01(\t\x12\x0f\n\x07\x61pi_key\x18\x03 \x01(\t\x12\x16\n\x0einterface_type\x18\x04 \x01(\t\"\xc2\x01\n\nReadConfig\x12\x12\n\nchunk_size\x18\x01 \x01(\x05\x12\x15\n\rchunk_overlap\x18\x02 \x01(\x05\x12\x12\n\nseparators\x18\x03 \x03(\t\x12\x19\n\x11\x65nable_multimodal\x18\x04 \x01(\x08\x12\x30\n\x0estorage_config\x18\x05 \x01(\x0b\x32\x18.docreader.StorageConfig\x12(\n\nvlm_config\x18\x06 \x01(\x0b\x32\x14.docreader.VLMConfig\"\x91\x01\n\x13ReadFromFileRequest\x12\x14\n\x0c\x66ile_content\x18\x01 \x01(\x0c\x12\x11\n\tfile_name\x18\x02 \x01(\t\x12\x11\n\tfile_type\x18\x03 \x01(\t\x12*\n\x0bread_config\x18\x04 \x01(\x0b\x32\x15.docreader.ReadConfig\x12\x12\n\nrequest_id\x18\x05 \x01(\t\"p\n\x12ReadFromURLRequest\x12\x0b\n\x03url\x18\x01 \x01(\t\x12\r\n\x05title\x18\x02 \x01(\t\x12*\n\x0bread_config\x18\x03 \x01(\x0b\x32\x15.docreader.ReadConfig\x12\x12\n\nrequest_id\x18\x04 \x01(\t\"i\n\x05Image\x12\x0b\n\x03url\x18\x01 \x01(\t\x12\x0f\n\x07\x63\x61ption\x18\x02 \x01(\t\x12\x10\n\x08ocr_text\x18\x03 \x01(\t\x12\x14\n\x0coriginal_url\x18\x04 \x01(\t\x12\r\n\x05start\x18\x05 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x06 \x01(\x05\"c\n\x05\x43hunk\x12\x0f\n\x07\x63ontent\x18\x01 \x01(\t\x12\x0b\n\x03seq\x18\x02 \x01(\x05\x12\r\n\x05start\x18\x03 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x04 \x01(\x05\x12 \n\x06images\x18\x05 \x03(\x0b\x32\x10.docreader.Image\"?\n\x0cReadResponse\x12 \n\x06\x63hunks\x18\x01 \x03(\x0b\x32\x10.docreader.Chunk\x12\r\n\x05\x65rror\x18\x02 \x01(\t*G\n\x0fStorageProvider\x12 \n\x1cSTORAGE_PROVIDER_UNSPECIFIED\x10\x00\x12\x07\n\x03\x43OS\x10\x01\x12\t\n\x05MINIO\x10\x02\x32\x9f\x01\n\tDocReader\x12I\n\x0cReadFromFile\x12\x1e.docreader.ReadFromFileRequest\x1a\x17.docreader.ReadResponse\"\x00\x12G\n\x0bReadFromURL\x12\x1d.docreader.ReadFromURLRequest\x1a\x17.docreader.ReadResponse\"\x00\x42\x35Z3github.com/Tencent/WeKnora/internal/docreader/protob\x06proto3')
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'docreader_pb2', _globals)
if not _descriptor._USE_C_DESCRIPTORS:
_globals['DESCRIPTOR']._loaded_options = None
_globals['DESCRIPTOR']._serialized_options = b'Z3github.com/Tencent/WeKnora/internal/docreader/proto'
_globals['_STORAGEPROVIDER']._serialized_start=1042
_globals['_STORAGEPROVIDER']._serialized_end=1113
_globals['_STORAGECONFIG']._serialized_start=31
_globals['_STORAGECONFIG']._serialized_end=216
_globals['_VLMCONFIG']._serialized_start=218
_globals['_VLMCONFIG']._serialized_end=308
_globals['_READCONFIG']._serialized_start=311
_globals['_READCONFIG']._serialized_end=505
_globals['_READFROMFILEREQUEST']._serialized_start=508
_globals['_READFROMFILEREQUEST']._serialized_end=653
_globals['_READFROMURLREQUEST']._serialized_start=655
_globals['_READFROMURLREQUEST']._serialized_end=767
_globals['_IMAGE']._serialized_start=769
_globals['_IMAGE']._serialized_end=874
_globals['_CHUNK']._serialized_start=876
_globals['_CHUNK']._serialized_end=975
_globals['_READRESPONSE']._serialized_start=977
_globals['_READRESPONSE']._serialized_end=1040
_globals['_DOCREADER']._serialized_start=1116
_globals['_DOCREADER']._serialized_end=1275
# @@protoc_insertion_point(module_scope)

145
docreader/proto/docreader_pb2_grpc.py Normal file
View File

@@ -0,0 +1,145 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import warnings
from . import docreader_pb2 as docreader__pb2
GRPC_GENERATED_VERSION = '1.76.0'
GRPC_VERSION = grpc.__version__
_version_not_supported = False
try:
from grpc._utilities import first_version_is_lower
_version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
except ImportError:
_version_not_supported = True
if _version_not_supported:
raise RuntimeError(
f'The grpc package installed is at version {GRPC_VERSION},'
+ ' but the generated code in docreader_pb2_grpc.py depends on'
+ f' grpcio>={GRPC_GENERATED_VERSION}.'
+ f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
+ f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
)
class DocReaderStub(object):
"""文档读取服务
"""
def __init__(self, channel):
"""Constructor.
Args:
channel: A grpc.Channel.
"""
self.ReadFromFile = channel.unary_unary(
'/docreader.DocReader/ReadFromFile',
request_serializer=docreader__pb2.ReadFromFileRequest.SerializeToString,
response_deserializer=docreader__pb2.ReadResponse.FromString,
_registered_method=True)
self.ReadFromURL = channel.unary_unary(
'/docreader.DocReader/ReadFromURL',
request_serializer=docreader__pb2.ReadFromURLRequest.SerializeToString,
response_deserializer=docreader__pb2.ReadResponse.FromString,
_registered_method=True)
class DocReaderServicer(object):
"""文档读取服务
"""
def ReadFromFile(self, request, context):
"""从文件读取文档
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def ReadFromURL(self, request, context):
"""从URL读取文档
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def add_DocReaderServicer_to_server(servicer, server):
rpc_method_handlers = {
'ReadFromFile': grpc.unary_unary_rpc_method_handler(
servicer.ReadFromFile,
request_deserializer=docreader__pb2.ReadFromFileRequest.FromString,
response_serializer=docreader__pb2.ReadResponse.SerializeToString,
),
'ReadFromURL': grpc.unary_unary_rpc_method_handler(
servicer.ReadFromURL,
request_deserializer=docreader__pb2.ReadFromURLRequest.FromString,
response_serializer=docreader__pb2.ReadResponse.SerializeToString,
),
}
generic_handler = grpc.method_handlers_generic_handler(
'docreader.DocReader', rpc_method_handlers)
server.add_generic_rpc_handlers((generic_handler,))
server.add_registered_method_handlers('docreader.DocReader', rpc_method_handlers)
# This class is part of an EXPERIMENTAL API.
class DocReader(object):
"""文档读取服务
"""
@staticmethod
def ReadFromFile(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(
request,
target,
'/docreader.DocReader/ReadFromFile',
docreader__pb2.ReadFromFileRequest.SerializeToString,
docreader__pb2.ReadResponse.FromString,
options,
channel_credentials,
insecure,
call_credentials,
compression,
wait_for_ready,
timeout,
metadata,
_registered_method=True)
@staticmethod
def ReadFromURL(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(
request,
target,
'/docreader.DocReader/ReadFromURL',
docreader__pb2.ReadFromURLRequest.SerializeToString,
docreader__pb2.ReadResponse.FromString,
options,
channel_credentials,
insecure,
call_credentials,
compression,
wait_for_ready,
timeout,
metadata,
_registered_method=True)
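For orientation, how the generated registration hook above is typically wired into a server. The servicer below is a stand-in for illustration, not the real implementation from this commit:

```python
# Hedged sketch: a toy servicer that echoes the URL back as a single chunk.
from concurrent import futures

import grpc

from proto import docreader_pb2, docreader_pb2_grpc


class EchoDocReader(docreader_pb2_grpc.DocReaderServicer):
    def ReadFromURL(self, request, context):
        chunk = docreader_pb2.Chunk(content=request.url, seq=0)
        return docreader_pb2.ReadResponse(chunks=[chunk])


server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
docreader_pb2_grpc.add_DocReaderServicer_to_server(EchoDocReader(), server)
server.add_insecure_port("[::]:50051")
server.start()
server.wait_for_termination()
```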

35
docreader/pyproject.toml Normal file
View File

@@ -0,0 +1,35 @@
[project]
name = "docreader"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10.18"
dependencies = [
"antiword>=0.1.0",
"asyncio>=4.0.0",
"beautifulsoup4>=4.14.2",
"cos-python-sdk-v5>=1.9.38",
"goose3[all]>=3.1.20",
"grpcio>=1.76.0",
"grpcio-health-checking>=1.76.0",
"grpcio-tools>=1.76.0",
"lxml>=6.0.2",
"markdown>=3.10",
"markdownify>=1.2.0",
"minio>=7.2.18",
"mistletoe>=1.5.0",
"ollama>=0.6.0",
"openai>=2.7.1",
"paddleocr>=2.10.0,<3.0.0",
"paddlepaddle>=3.0.0,<4.0.0",
"pdfplumber>=0.11.7",
"pillow>=12.0.0",
"playwright>=1.55.0",
"protobuf>=6.33.0",
"pypdf>=6.1.3",
"pypdf2>=3.0.1",
"python-docx>=1.2.0",
"requests>=2.32.5",
"textract==1.5.0",
"urllib3>=2.5.0",
]

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import os
import logging
from paddleocr import PaddleOCR
# Add the current directory to the Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
if current_dir not in sys.path:
sys.path.append(current_dir)
# Import ImageParser
from parser.image_parser import ImageParser
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)
def init_ocr_model():
"""Initialize PaddleOCR model to pre-download and cache models"""
try:
logger.info("Initializing PaddleOCR model for pre-download...")
# Use the same configuration as the service code
ocr_config = {
"use_gpu": False,
"text_det_limit_type": "max",
"text_det_limit_side_len": 960,
"use_doc_orientation_classify": True, # 启用文档方向分类
"use_doc_unwarping": False,
"use_textline_orientation": True, # 启用文本行方向检测
"text_recognition_model_name": "PP-OCRv4_server_rec",
"text_detection_model_name": "PP-OCRv4_server_det",
"text_det_thresh": 0.3,
"text_det_box_thresh": 0.6,
"text_det_unclip_ratio": 1.5,
"text_rec_score_thresh": 0.0,
"ocr_version": "PP-OCRv4",
"lang": "ch",
"show_log": False,
"use_dilation": True,
"det_db_score_mode": "slow",
}
# Initialize PaddleOCR; this triggers model download and caching
ocr = PaddleOCR(**ocr_config)
logger.info("PaddleOCR model initialization completed successfully")
# Exercise OCR once to make sure the downloaded models work
import numpy as np
from PIL import Image
# Create a simple blank test image
test_image = np.ones((100, 300, 3), dtype=np.uint8) * 255
test_pil = Image.fromarray(test_image)
# Run a single OCR pass as a smoke test
result = ocr.ocr(np.array(test_pil), cls=False)
logger.info("PaddleOCR test completed successfully")
except Exception as e:
logger.error(f"Failed to initialize PaddleOCR model: {str(e)}")
raise
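For reference, a hedged usage sketch of the warm-up above: once the models are cached, the same PaddleOCR 2.x call style can run against a real image. The constructor arguments mirror those in the config above; the image path points at the test fixture added later in this commit, and the result layout assumes the 2.x API pinned in pyproject.toml (paddleocr>=2.10,<3.0).

```python
# Usage sketch, assuming the PaddleOCR 2.x API pinned in pyproject.toml.
from paddleocr import PaddleOCR

ocr = PaddleOCR(use_gpu=False, lang="ch", show_log=False)
pages = ocr.ocr("testdata/images/test_text.png", cls=False)
for line in pages[0] or []:  # pages[0] is None when nothing is detected
    box, (text, score) = line
    print(f"{score:.2f} {text}")
```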

View File

@@ -0,0 +1,31 @@
#!/bin/bash
set -x
# Set directories
PROTO_DIR="proto"
PYTHON_OUT="proto"
GO_OUT="proto"
# Generate Python code
python3 -m grpc_tools.protoc -I${PROTO_DIR} \
--python_out=${PYTHON_OUT} \
--grpc_python_out=${PYTHON_OUT} \
${PROTO_DIR}/docreader.proto
# Generate Go code
protoc -I${PROTO_DIR} --go_out=${GO_OUT} \
--go_opt=paths=source_relative \
--go-grpc_out=${GO_OUT} \
--go-grpc_opt=paths=source_relative \
${PROTO_DIR}/docreader.proto
# Fix the generated Python import (macOS needs a different sed -i syntax)
if [ "$(uname)" == "Darwin" ]; then
# macOS version
sed -i '' 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
else
# Linux version
sed -i 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
fi
echo "Proto files generated successfully!"

BIN
docreader/testdata/images/test_text.png vendored Normal file

Binary file not shown.

Width: | Height: | Size: 1.8 KiB

56
docreader/testdata/test.html vendored Normal file
View File

@@ -0,0 +1,56 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>测试 HTML 文档</title>
</head>
<body>
<h1>测试 HTML 文档</h1>
<p>这是一个测试 HTML 文档,用于测试 HTML 解析功能。</p>
<h2>包含图片</h2>
<img src="https://example.com/image.jpg" alt="测试图片">
<h2>包含链接</h2>
<p>这是一个<a href="https://example.com">测试链接</a></p>
<h2>包含代码块</h2>
<pre><code>
def hello_world():
print("Hello, World!")
</code></pre>
<h2>包含表格</h2>
<table>
<thead>
<tr>
<th>表头1</th>
<th>表头2</th>
</tr>
</thead>
<tbody>
<tr>
<td>内容1</td>
<td>内容2</td>
</tr>
<tr>
<td>内容3</td>
<td>内容4</td>
</tr>
</tbody>
</table>
<h2>测试分块功能</h2>
<p>这部分内容用于测试分块功能,确保 HTML 结构在分块时保持完整。</p>
<ul>
<li>第一块内容</li>
<li>第二块内容</li>
<li>第三块内容</li>
</ul>
<h2>测试重叠功能</h2>
<p>这部分内容可能会在分块时与前后块重叠,以确保上下文的连续性。</p>
</body>
</html>

37
docreader/testdata/test.md vendored Normal file
View File

@@ -0,0 +1,37 @@
# 测试 Markdown 文档
这是一个测试 Markdown 文档,用于测试 Markdown 解析功能。
## 包含图片
![测试图片](https://geektutu.com/post/quick-go-protobuf/go-protobuf.jpg)
## 包含链接
这是一个[测试链接](https://example.com)。
## 包含代码块
```python
def hello_world():
print("Hello, World!")
```
## 包含表格
| 表头1 | 表头2 |
|-------|-------|
| 内容1 | 内容2 |
| 内容3 | 内容4 |
## 测试分块功能
这部分内容用于测试分块功能,确保 Markdown 结构在分块时保持完整。
- 第一块内容
- 第二块内容
- 第三块内容
## 测试重叠功能
这部分内容可能会在分块时与前后块重叠,以确保上下文的连续性。

16
docreader/testdata/test.txt vendored Normal file
View File

@@ -0,0 +1,16 @@
这是一个测试文档
包含多行内容
用于测试文档解析功能
这个文档包含以下内容:
1. 基本文本内容
2. 多行段落
3. 列表项
测试分块功能:
- 第一块内容
- 第二块内容
- 第三块内容
测试重叠功能:
这部分内容可能会在分块时与前后块重叠,以确保上下文的连续性。

19
docreader/testdata/test_download.txt vendored Normal file
View File

@@ -0,0 +1,19 @@
这是一个测试文档
包含多行内容
用于测试文档解析功能
这个文档包含以下内容:
1. 基本文本内容
2. 多行段落
3. 列表项
测试分块功能:
- 第一块内容
- 第二块内容
- 第三块内容
测试重叠功能:
这部分内容可能会在分块时与前后块重叠,以确保上下文的连续性。
test

View File

@@ -0,0 +1,83 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import re
import logging
# Configure logging
logger = logging.getLogger(__name__)
def singleton(cls, *args, **kw):
instances = {}
def _singleton():
key = str(cls) + str(os.getpid())
if key not in instances:
logger.info(f"Creating new singleton instance with key: {key}")
instances[key] = cls(*args, **kw)
else:
logger.info(f"Returning existing singleton instance with key: {key}")
return instances[key]
return _singleton
def rmSpace(txt):
logger.info(f"Removing spaces from text of length: {len(txt)}")
txt = re.sub(r"([^a-z0-9.,\)>]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
return re.sub(r"([^ ]) +([^a-z0-9.,\(<])", r"\1\2", txt, flags=re.IGNORECASE)
def findMaxDt(fnm):
m = "1970-01-01 00:00:00"
logger.info(f"Finding maximum date in file: {fnm}")
try:
with open(fnm, "r") as f:
while True:
l = f.readline()
if not l:
break
l = l.strip("\n")
if l == "nan":
continue
if l > m:
m = l
logger.info(f"Maximum date found: {m}")
except Exception as e:
logger.error(f"Error reading file {fnm} for max date: {str(e)}")
return m
def findMaxTm(fnm):
m = 0
logger.info(f"Finding maximum time in file: {fnm}")
try:
with open(fnm, "r") as f:
while True:
l = f.readline()
if not l:
break
l = l.strip("\n")
if l == "nan":
continue
if int(l) > m:
m = int(l)
logger.info(f"Maximum time found: {m}")
except Exception as e:
logger.error(f"Error reading file {fnm} for max time: {str(e)}")
return m
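A short illustration of the helpers above (not part of the commit): `singleton` caches one instance per class per process, and `rmSpace` removes spaces adjacent to non-alphanumeric text while keeping normal word spacing. `OcrEngine` is a made-up class standing in for an expensive-to-construct object.

```python
# Illustrative only -- OcrEngine is a hypothetical placeholder class.
@singleton
class OcrEngine:
    def __init__(self):
        self.ready = True  # stand-in for heavyweight model loading

a = OcrEngine()
b = OcrEngine()
assert a is b  # one instance per class per process

# Spaces touching non-alphanumeric characters (e.g. CJK text) are removed:
print(rmSpace("中文 文本"))  # -> "中文文本"
```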

149
docreader/utils/request.py Normal file
View File

@@ -0,0 +1,149 @@
from contextvars import ContextVar
import logging
import uuid
import contextlib
import time
from typing import Optional
from logging import LogRecord
# Configure logging
logger = logging.getLogger(__name__)
# Context variables for the current request
request_id_var = ContextVar("request_id", default=None)
_request_start_time_ctx = ContextVar("request_start_time", default=None)
def set_request_id(request_id: str) -> None:
"""设置当前上下文的请求ID"""
request_id_var.set(request_id)
def get_request_id() -> Optional[str]:
"""获取当前上下文的请求ID"""
return request_id_var.get()
class MillisecondFormatter(logging.Formatter):
"""自定义日志格式化器,只显示毫秒级时间戳(3位数字)而不是微秒(6位)"""
def formatTime(self, record, datefmt=None):
"""重写formatTime方法将微秒格式化为毫秒"""
# 先获取标准的格式化时间
result = super().formatTime(record, datefmt)
# 如果使用了包含.%f的格式则将微秒(6位)截断为毫秒(3位)
if datefmt and ".%f" in datefmt:
# 格式化的时间字符串应该在最后有6位微秒数
parts = result.split('.')
if len(parts) > 1 and len(parts[1]) >= 6:
# 只保留前3位作为毫秒
millis = parts[1][:3]
result = f"{parts[0]}.{millis}"
return result
def init_logging_request_id():
"""
Initialize logging to include request ID in log messages.
Add the custom filter to all existing handlers
"""
logger.info("Initializing request ID logging")
root_logger = logging.getLogger()
# Add the custom filter to all existing handlers
for handler in root_logger.handlers:
# Attach the request ID filter
handler.addFilter(RequestIdFilter())
# Update the formatter to include the request ID (compact, aligned layout)
formatter = logging.Formatter(
fmt="%(asctime)s.%(msecs)03d [%(request_id)s] %(levelname)-5s %(name)-20s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
handler.setFormatter(formatter)
logger.info(
f"Updated {len(root_logger.handlers)} handlers with request ID formatting"
)
# If no handlers exist yet, add a stdout StreamHandler
if not root_logger.handlers:
handler = logging.StreamHandler()
formatter = logging.Formatter(
fmt="%(asctime)s.%(msecs)03d [%(request_id)s] %(levelname)-5s %(name)-20s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
handler.setFormatter(formatter)
handler.addFilter(RequestIdFilter())
root_logger.addHandler(handler)
logger.info("Added new StreamHandler with request ID formatting")
class RequestIdFilter(logging.Filter):
"""Filter that adds request ID to log messages"""
def filter(self, record: LogRecord) -> bool:
request_id = request_id_var.get()
if request_id is not None:
# Attach the request ID to the record, using a short form
if len(request_id) > 8:
# Keep the first 8 characters so the output stays aligned
short_id = request_id[:8]
if "-" in request_id:
# Try to preserve the structure, e.g. test-req-1-XXX
parts = request_id.split("-")
if len(parts) >= 3:
# For IDs shaped like xxx-xxx-n-randompart
short_id = f"{parts[0]}-{parts[1]}-{parts[2]}"
record.request_id = short_id
else:
record.request_id = request_id
# Attach the elapsed time for this request
start_time = _request_start_time_ctx.get()
if start_time is not None:
elapsed_ms = int((time.time() - start_time) * 1000)
record.elapsed_ms = elapsed_ms
# Append the elapsed time to the log message
if not hasattr(record, "message_with_elapsed"):
record.message_with_elapsed = True
record.msg = f"{record.msg} (elapsed: {elapsed_ms}ms)"
else:
# No request ID in context; fall back to a placeholder
record.request_id = "no-req-id"
return True
@contextlib.contextmanager
def request_id_context(request_id: Optional[str] = None):
"""Context manager that sets a request ID for the current context.
Args:
request_id: The request ID to use; if None, one is generated automatically.
Example:
with request_id_context("req-123"):
# Every log line in this block carries the request ID req-123
logging.info("Processing request")
"""
# Generate or use provided request ID
req_id = request_id or str(uuid.uuid4())
# Set start time and request ID
start_time = time.time()
req_token = request_id_var.set(req_id)
time_token = _request_start_time_ctx.set(start_time)
logger.info(f"Starting new request with ID: {req_id}")
try:
yield request_id_var.get()
finally:
# Log completion and reset context vars
elapsed_ms = int((time.time() - start_time) * 1000)
logger.info(f"Request {req_id} completed in {elapsed_ms}ms")
request_id_var.reset(req_token)
_request_start_time_ctx.reset(time_token)
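A hedged end-to-end sketch of the logging flow above. The import path is an assumption based on this file's location (docreader/utils/request.py), i.e. it presumes the process starts from the docreader/ source root.

```python
# Usage sketch: initialize request-ID logging, then tag blocks of work.
# The utils.request import path is an assumption based on the file layout.
import logging
from utils.request import init_logging_request_id, request_id_context

logging.basicConfig(level=logging.INFO)
init_logging_request_id()
log = logging.getLogger("demo")

with request_id_context():  # auto-generates a UUID and times the block
    log.info("parsing document")  # rendered with a shortened request ID

with request_id_context("req-42"):  # short IDs (<= 8 chars) are shown as-is
    log.info("second request")
```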

3228
docreader/uv.lock generated Normal file

File diff suppressed because it is too large Load Diff