mirror of https://github.com/Tencent/WeKnora.git
synced 2025-11-25 03:15:00 +08:00

chore(docreader): reorganize module files
docreader/Makefile (new file, 23 lines)
@@ -0,0 +1,23 @@
.PHONY: proto build run docker-build docker-run clean

# Generate protobuf code
proto:
	@echo "Generating protobuf code..."
	@sh ./scripts/generate_proto.sh

# Build the Go client
build:
	@echo "Building Go client..."
	@go build -o bin/client ./src/client

# Run the Python server
run:
	@echo "Running Python server..."
	@python src/server/server.py

# Clean up
clean:
	@echo "Cleaning up..."
	@rm -rf bin/
	@find . -name "*.pyc" -delete
	@find . -name "__pycache__" -delete
docreader/README.md (new file, 0 lines)
docreader/client/client.go (new file, 115 lines)
@@ -0,0 +1,115 @@
package client

import (
	"fmt"
	"log"
	"os"
	"time"

	"github.com/Tencent/WeKnora/services/docreader/src/proto"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/resolver"
)

const (
	maxMessageSize = 50 * 1024 * 1024 // 50MB
)

var (
	// Logger is the default logger used by the client
	Logger = log.New(os.Stdout, "[DocReader] ", log.LstdFlags|log.Lmicroseconds)
)

// ImageInfo describes a single image extracted from a chunk
type ImageInfo struct {
	URL         string // Image URL (COS)
	Caption     string // Image caption
	OCRText     string // Text extracted by OCR
	OriginalURL string // Original image URL
	Start       int    // Start position of the image in the text
	End         int    // End position of the image in the text
}

// Client represents a DocReader service client
type Client struct {
	conn *grpc.ClientConn
	proto.DocReaderClient
	debug bool
}

// NewClient creates a new DocReader client with the specified address
func NewClient(addr string) (*Client, error) {
	Logger.Printf("INFO: Creating new DocReader client connecting to %s", addr)

	// Set message size limits
	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy":"round_robin"}`),
		grpc.WithDefaultCallOptions(
			grpc.MaxCallRecvMsgSize(maxMessageSize),
			grpc.MaxCallSendMsgSize(maxMessageSize),
		),
	}
	resolver.SetDefaultScheme("dns")

	startTime := time.Now()
	conn, err := grpc.Dial("dns:///"+addr, opts...)
	if err != nil {
		Logger.Printf("ERROR: Failed to connect to DocReader service: %v", err)
		return nil, err
	}
	Logger.Printf("INFO: Successfully connected to DocReader service in %v", time.Since(startTime))

	return &Client{
		conn:            conn,
		DocReaderClient: proto.NewDocReaderClient(conn),
		debug:           false,
	}, nil
}

// Close closes the client connection
func (c *Client) Close() error {
	Logger.Printf("INFO: Closing DocReader client connection")
	return c.conn.Close()
}

// SetDebug enables or disables debug logging
func (c *Client) SetDebug(debug bool) {
	c.debug = debug
	Logger.Printf("INFO: Debug logging set to %v", debug)
}

// Log logs a message with the appropriate level
func (c *Client) Log(level string, format string, args ...interface{}) {
	if level == "DEBUG" && !c.debug {
		return
	}
	Logger.Printf("%s: %s", level, fmt.Sprintf(format, args...))
}

// GetImagesFromChunk extracts all image info from a Chunk
func GetImagesFromChunk(chunk *proto.Chunk) []ImageInfo {
	if chunk == nil || len(chunk.Images) == 0 {
		return nil
	}

	images := make([]ImageInfo, 0, len(chunk.Images))
	for _, img := range chunk.Images {
		images = append(images, ImageInfo{
			URL:         img.Url,
			Caption:     img.Caption,
			OCRText:     img.OcrText,
			OriginalURL: img.OriginalUrl,
			Start:       int(img.Start),
			End:         int(img.End),
		})
	}

	return images
}

// HasImagesInChunk reports whether a Chunk contains any images
func HasImagesInChunk(chunk *proto.Chunk) bool {
	return chunk != nil && len(chunk.Images) > 0
}
docreader/client/client_test.go (new file, 154 lines)
@@ -0,0 +1,154 @@
package client

import (
	"context"
	"log"
	"os"
	"testing"
	"time"

	"github.com/Tencent/WeKnora/services/docreader/src/proto"
)

func init() {
	// Configure test logging
	log.SetOutput(os.Stdout)
	log.SetFlags(log.LstdFlags | log.Lmicroseconds | log.Lshortfile)
	log.Println("INFO: Initializing DocReader client tests")
}

func TestReadFromURL(t *testing.T) {
	log.Println("INFO: Starting TestReadFromURL")

	// Create a test client
	log.Println("INFO: Creating test client")
	client, err := NewClient("localhost:50051")
	if err != nil {
		log.Printf("ERROR: Failed to create client: %v", err)
		t.Fatalf("Failed to create client: %v", err)
	}
	defer client.Close()

	// Enable debug logging
	client.SetDebug(true)

	// Exercise the ReadFromURL method
	log.Println("INFO: Sending ReadFromURL request to server")
	startTime := time.Now()
	resp, err := client.ReadFromURL(
		context.Background(),
		&proto.ReadFromURLRequest{
			Url:   "https://example.com",
			Title: "test",
			ReadConfig: &proto.ReadConfig{
				ChunkSize:        512,
				ChunkOverlap:     50,
				Separators:       []string{"\n\n", "\n", "。"},
				EnableMultimodal: true,
			},
		},
	)

	requestDuration := time.Since(startTime)
	if err != nil {
		log.Printf("ERROR: ReadFromURL failed: %v", err)
		t.Fatalf("ReadFromURL failed: %v", err)
	}
	log.Printf("INFO: ReadFromURL completed in %v", requestDuration)

	// Verify the result
	chunkCount := len(resp.Chunks)
	log.Printf("INFO: Received %d chunks from URL parsing", chunkCount)
	if chunkCount == 0 {
		log.Println("WARN: Expected non-empty content but received none")
		t.Error("Expected non-empty content")
	}

	// Print the results
	for i, chunk := range resp.Chunks {
		if i < 2 || i >= chunkCount-2 { // only print the first two and last two chunks
			log.Printf("DEBUG: Chunk %d: %s", chunk.Seq, truncateString(chunk.Content, 50))
		} else if i == 2 && chunkCount > 4 {
			log.Printf("DEBUG: ... %d more chunks ...", chunkCount-4)
		}
	}

	log.Println("INFO: TestReadFromURL completed successfully")
}

func TestReadFromFileWithChunking(t *testing.T) {
	log.Println("INFO: Starting TestReadFromFileWithChunking")

	// Create a test client
	log.Println("INFO: Creating test client")
	client, err := NewClient("localhost:50051")
	if err != nil {
		log.Printf("ERROR: Failed to create client: %v", err)
		t.Fatalf("Failed to create client: %v", err)
	}
	defer client.Close()

	// Enable debug logging
	client.SetDebug(true)

	// Read the test file
	log.Println("INFO: Reading test file")
	fileContent, err := os.ReadFile("../testdata/test.md")
	if err != nil {
		log.Printf("ERROR: Failed to read test file: %v", err)
		t.Fatalf("Failed to read test file: %v", err)
	}
	log.Printf("INFO: Read test file, size: %d bytes", len(fileContent))

	// Exercise the ReadFromFile method with chunking parameters
	log.Println("INFO: Sending ReadFromFile request to server")
	startTime := time.Now()
	resp, err := client.ReadFromFile(
		context.Background(),
		&proto.ReadFromFileRequest{
			FileContent: fileContent,
			FileName:    "test.md",
			FileType:    "md",
			ReadConfig: &proto.ReadConfig{
				ChunkSize:        200,
				ChunkOverlap:     50,
				Separators:       []string{"\n\n", "\n", "。"},
				EnableMultimodal: true,
			},
		},
	)

	requestDuration := time.Since(startTime)
	if err != nil {
		log.Printf("ERROR: ReadFromFile failed: %v", err)
		t.Fatalf("ReadFromFile failed: %v", err)
	}
	log.Printf("INFO: ReadFromFile completed in %v", requestDuration)

	// Verify the result
	chunkCount := len(resp.Chunks)
	log.Printf("INFO: Received %d chunks from file parsing", chunkCount)
	if chunkCount == 0 {
		log.Println("WARN: Expected non-empty content but received none")
		t.Error("Expected non-empty content")
	}

	// Print the results
	for i, chunk := range resp.Chunks {
		if i < 2 || i >= chunkCount-2 { // only print the first two and last two chunks
			log.Printf("DEBUG: Chunk %d: %s", chunk.Seq, truncateString(chunk.Content, 50))
		} else if i == 2 && chunkCount > 4 {
			log.Printf("DEBUG: ... %d more chunks ...", chunkCount-4)
		}
	}

	log.Println("INFO: TestReadFromFileWithChunking completed successfully")
}

// truncateString shortens a string for log output
func truncateString(s string, maxLen int) string {
	if len(s) <= maxLen {
		return s
	}
	return s[:maxLen] + "..."
}
docreader/main.py (new file, 403 lines)
@@ -0,0 +1,403 @@
import os
import sys
import logging
from concurrent import futures
import traceback
import grpc
import uuid
import atexit
from grpc_health.v1 import health_pb2_grpc
from grpc_health.v1.health import HealthServicer

# Add parent directory to Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

from proto.docreader_pb2 import ReadResponse, Chunk, Image
from proto import docreader_pb2_grpc
from parser import Parser, OCREngine
from parser.config import ChunkingConfig
from utils.request import request_id_context, init_logging_request_id

# --- Encoding utilities: sanitize strings to valid UTF-8 and (optionally) multi-encoding read ---
import re
from typing import Optional

try:
    # Optional dependency for charset detection; install via `pip install charset-normalizer`
    from charset_normalizer import from_bytes as _cn_from_bytes  # type: ignore
except Exception:  # pragma: no cover
    _cn_from_bytes = None  # type: ignore

# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values and cannot be encoded to UTF-8
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]")


def to_valid_utf8_text(s: Optional[str]) -> str:
    """Return a UTF-8 safe string for protobuf.

    - Replace any surrogate code points with U+FFFD
    - Re-encode with errors='replace' to ensure valid UTF-8
    """
    if not s:
        return ""
    s = _SURROGATE_RE.sub("\ufffd", s)
    return s.encode("utf-8", errors="replace").decode("utf-8")


def read_text_with_fallback(file_path: str) -> str:
    """Read text from file supporting multiple encodings with graceful fallback.

    This server currently receives bytes over gRPC and delegates decoding to the parser.
    This helper is provided for future local-file reads if needed.
    """
    with open(file_path, "rb") as f:
        raw = f.read()
    if _cn_from_bytes is not None:
        try:
            result = _cn_from_bytes(raw).best()
            if result:
                return str(result)
        except Exception:
            pass
    for enc in ("utf-8", "gb18030", "latin-1"):
        try:
            return raw.decode(enc, errors="replace")
        except UnicodeDecodeError:
            continue
    return raw.decode("utf-8", errors="replace")


# Ensure no existing handlers
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

# Configure logging - use stdout
handler = logging.StreamHandler(sys.stdout)
logging.root.addHandler(handler)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.info("Initializing server logging")

# Initialize request ID logging
init_logging_request_id()

# Set max message size to 50MB
MAX_MESSAGE_LENGTH = 50 * 1024 * 1024


parser = Parser()


class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
    def __init__(self):
        super().__init__()
        self.parser = Parser()

    def ReadFromFile(self, request, context):
        # Get or generate request ID
        request_id = (
            request.request_id
            if hasattr(request, "request_id") and request.request_id
            else str(uuid.uuid4())
        )

        # Use request ID context
        with request_id_context(request_id):
            try:
                # Get file type
                file_type = (
                    request.file_type or os.path.splitext(request.file_name)[1][1:]
                )
                logger.info(
                    f"Received ReadFromFile request for file: {request.file_name}, type: {file_type}"
                )
                logger.info(f"File content size: {len(request.file_content)} bytes")

                # Create chunking config
                chunk_size = request.read_config.chunk_size or 512
                chunk_overlap = request.read_config.chunk_overlap or 50
                separators = request.read_config.separators or ["\n\n", "\n", "。"]
                enable_multimodal = request.read_config.enable_multimodal or False

                logger.info(
                    f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
                    f"multimodal={enable_multimodal}"
                )

                # Get Storage and VLM config from request
                storage_config = None
                vlm_config = None

                sc = request.read_config.storage_config
                # Keep parser-side key name as cos_config for backward compatibility
                storage_config = {
                    "provider": "minio" if sc.provider == 2 else "cos",
                    "region": sc.region,
                    "bucket_name": sc.bucket_name,
                    "access_key_id": sc.access_key_id,
                    "secret_access_key": sc.secret_access_key,
                    "app_id": sc.app_id,
                    "path_prefix": sc.path_prefix,
                }
                logger.info(
                    f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
                )

                vlm_config = {
                    "model_name": request.read_config.vlm_config.model_name,
                    "base_url": request.read_config.vlm_config.base_url,
                    "api_key": request.read_config.vlm_config.api_key or "",
                    "interface_type": request.read_config.vlm_config.interface_type
                    or "openai",
                }
                logger.info(
                    f"Using VLM config: model={vlm_config['model_name']}, "
                    f"base_url={vlm_config['base_url']}, "
                    f"interface_type={vlm_config['interface_type']}"
                )

                chunking_config = ChunkingConfig(
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap,
                    separators=separators,
                    enable_multimodal=enable_multimodal,
                    storage_config=storage_config,
                    vlm_config=vlm_config,
                )

                # Parse file
                logger.info("Starting file parsing process")
                result = self.parser.parse_file(
                    request.file_name, file_type, request.file_content, chunking_config
                )

                if not result:
                    error_msg = "Failed to parse file"
                    logger.error(error_msg)
                    context.set_code(grpc.StatusCode.INTERNAL)
                    context.set_details(error_msg)
                    return ReadResponse()

                # Convert to protobuf message
                logger.info(
                    f"Successfully parsed file {request.file_name}, returning {len(result.chunks)} chunks"
                )

                # Build response, including image info
                response = ReadResponse(
                    chunks=[
                        self._convert_chunk_to_proto(chunk) for chunk in result.chunks
                    ]
                )
                logger.info(f"Response size: {response.ByteSize()} bytes")
                return response

            except Exception as e:
                error_msg = f"Error reading file: {str(e)}"
                logger.error(error_msg)
                logger.info(f"Detailed traceback: {traceback.format_exc()}")
                context.set_code(grpc.StatusCode.INTERNAL)
                context.set_details(str(e))
                return ReadResponse(error=str(e))

    def ReadFromURL(self, request, context):
        # Get or generate request ID
        request_id = (
            request.request_id
            if hasattr(request, "request_id") and request.request_id
            else str(uuid.uuid4())
        )

        # Use request ID context
        with request_id_context(request_id):
            try:
                logger.info(f"Received ReadFromURL request for URL: {request.url}")

                # Create chunking config
                chunk_size = request.read_config.chunk_size or 512
                chunk_overlap = request.read_config.chunk_overlap or 50
                separators = request.read_config.separators or ["\n\n", "\n", "。"]
                enable_multimodal = request.read_config.enable_multimodal or False

                logger.info(
                    f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
                    f"multimodal={enable_multimodal}"
                )

                # Get Storage and VLM config from request
                storage_config = None
                vlm_config = None

                sc = request.read_config.storage_config
                storage_config = {
                    "provider": "minio" if sc.provider == 2 else "cos",
                    "region": sc.region,
                    "bucket_name": sc.bucket_name,
                    "access_key_id": sc.access_key_id,
                    "secret_access_key": sc.secret_access_key,
                    "app_id": sc.app_id,
                    "path_prefix": sc.path_prefix,
                }
                logger.info(
                    f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
                )

                vlm_config = {
                    "model_name": request.read_config.vlm_config.model_name,
                    "base_url": request.read_config.vlm_config.base_url,
                    "api_key": request.read_config.vlm_config.api_key or "",
                    "interface_type": request.read_config.vlm_config.interface_type
                    or "openai",
                }
                logger.info(
                    f"Using VLM config: model={vlm_config['model_name']}, "
                    f"base_url={vlm_config['base_url']}, "
                    f"interface_type={vlm_config['interface_type']}"
                )

                chunking_config = ChunkingConfig(
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap,
                    separators=separators,
                    enable_multimodal=enable_multimodal,
                    storage_config=storage_config,
                    vlm_config=vlm_config,
                )

                # Parse URL
                logger.info("Starting URL parsing process")
                result = self.parser.parse_url(
                    request.url, request.title, chunking_config
                )
                if not result:
                    error_msg = "Failed to parse URL"
                    logger.error(error_msg)
                    context.set_code(grpc.StatusCode.INTERNAL)
                    context.set_details(error_msg)
                    return ReadResponse(error=error_msg)

                # Convert to protobuf message, including image info
                logger.info(
                    f"Successfully parsed URL {request.url}, returning {len(result.chunks)} chunks"
                )

                response = ReadResponse(
                    chunks=[
                        self._convert_chunk_to_proto(chunk) for chunk in result.chunks
                    ]
                )
                logger.info(f"Response size: {response.ByteSize()} bytes")
                return response

            except Exception as e:
                error_msg = f"Error reading URL: {str(e)}"
                logger.error(error_msg)
                logger.info(f"Detailed traceback: {traceback.format_exc()}")
                context.set_code(grpc.StatusCode.INTERNAL)
                context.set_details(str(e))
                return ReadResponse(error=str(e))

    def _convert_chunk_to_proto(self, chunk):
        """Convert internal Chunk object to protobuf Chunk message

        Ensures all string fields are valid UTF-8 for protobuf (no lone surrogates).
        """
        # Clean helper for strings
        _c = to_valid_utf8_text

        proto_chunk = Chunk(
            content=_c(getattr(chunk, "content", None)),
            seq=getattr(chunk, "seq", 0),
            start=getattr(chunk, "start", 0),
            end=getattr(chunk, "end", 0),
        )

        # If chunk has images attribute and is not empty, add image info
        if hasattr(chunk, "images") and chunk.images:
            logger.info(
                f"Adding {len(chunk.images)} images to chunk {getattr(chunk, 'seq', 0)}"
            )
            for img_info in chunk.images:
                # img_info expected as dict
                proto_image = Image(
                    url=_c(img_info.get("cos_url", "")),
                    caption=_c(img_info.get("caption", "")),
                    ocr_text=_c(img_info.get("ocr_text", "")),
                    original_url=_c(img_info.get("original_url", "")),
                    start=int(img_info.get("start", 0) or 0),
                    end=int(img_info.get("end", 0) or 0),
                )
                proto_chunk.images.append(proto_image)

        return proto_chunk


def init_ocr_engine(ocr_backend, ocr_config):
    """Initialize OCR engine"""
    try:
        logger.info(f"Initializing OCR engine with backend: {ocr_backend}")
        ocr_engine = OCREngine.get_instance(backend_type=ocr_backend, **ocr_config)
        if ocr_engine:
            logger.info("OCR engine initialized successfully")
            return True
        else:
            logger.error("OCR engine initialization failed")
            return False
    except Exception as e:
        logger.error(f"Error initializing OCR engine: {str(e)}")
        return False


def main():
    init_ocr_engine(
        os.getenv("OCR_BACKEND", "paddle"),
        {
            "OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
        },
    )

    # Set max number of worker threads
    max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
    logger.info(f"Starting DocReader service with {max_workers} worker threads")

    # Get port number
    port = os.environ.get("GRPC_PORT", "50051")

    # Create server
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=max_workers),
        options=[
            ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
            ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
        ],
    )

    # Register services
    docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)

    # Register health check service
    health_servicer = HealthServicer()
    health_pb2_grpc.add_HealthServicer_to_server(health_servicer, server)

    # Set listen address
    server.add_insecure_port(f"[::]:{port}")

    # Start service
    server.start()

    logger.info(f"Server started on port {port}")
    logger.info("Server is ready to accept connections")

    try:
        # Wait for service termination
        server.wait_for_termination()
    except KeyboardInterrupt:
        logger.info("Received termination signal, shutting down server")
        server.stop(0)


if __name__ == "__main__":
    main()
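For reference, a minimal Python client sketch against this server. It assumes the generated stubs import as in main.py (proto.docreader_pb2 / proto.docreader_pb2_grpc), that the stub class is DocReaderStub, and that the request field names follow protoc's usual snake_case mapping of the Go fields seen in client_test.go; the address and file path are placeholders.

import grpc

from proto import docreader_pb2_grpc
from proto.docreader_pb2 import ReadFromFileRequest, ReadConfig

MAX_MESSAGE_LENGTH = 50 * 1024 * 1024  # mirror the server's 50MB limit


def read_file(path: str, addr: str = "localhost:50051"):
    # Channel options mirror the options passed to grpc.server() above
    channel = grpc.insecure_channel(
        addr,
        options=[
            ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
            ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
        ],
    )
    stub = docreader_pb2_grpc.DocReaderStub(channel)
    with open(path, "rb") as f:
        content = f.read()
    request = ReadFromFileRequest(
        file_content=content,
        file_name=path,
        file_type=path.rsplit(".", 1)[-1],  # e.g. "md"
        read_config=ReadConfig(
            chunk_size=512,
            chunk_overlap=50,
            separators=["\n\n", "\n", "。"],
            enable_multimodal=False,
        ),
    )
    return stub.ReadFromFile(request)


for chunk in read_file("test.md").chunks:
    print(chunk.seq, chunk.content[:50])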
docreader/parser/__init__.py (new file, 42 lines)
@@ -0,0 +1,42 @@
"""
Parser module for WeKnora document processing system.

This module provides document parsers for various file formats including:
- Microsoft Word documents (.doc, .docx)
- PDF documents
- Markdown files
- Plain text files
- Images with text content
- Web pages

The parsers extract content from documents and can split them into
meaningful chunks for further processing and indexing.
"""

from .base_parser import BaseParser, ParseResult
from .docx_parser import DocxParser
from .doc_parser import DocParser
from .pdf_parser import PDFParser
from .markdown_parser import MarkdownParser
from .text_parser import TextParser
from .image_parser import ImageParser
from .web_parser import WebParser
from .parser import Parser
from .config import ChunkingConfig
from .ocr_engine import OCREngine

# Export public classes and modules
__all__ = [
    "BaseParser",      # Base parser class that all format parsers inherit from
    "DocxParser",      # Parser for .docx files (modern Word documents)
    "DocParser",       # Parser for .doc files (legacy Word documents)
    "PDFParser",       # Parser for PDF documents
    "MarkdownParser",  # Parser for Markdown text files
    "TextParser",      # Parser for plain text files
    "ImageParser",     # Parser for images with text content
    "WebParser",       # Parser for web pages
    "Parser",          # Main parser factory that selects the appropriate parser
    "ChunkingConfig",  # Configuration for text chunking behavior
    "ParseResult",     # Standard result format returned by all parsers
    "OCREngine",       # OCR engine for extracting text from images
]
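To make the exports concrete, a minimal local-parsing sketch (assumptions: it runs from the docreader directory so the parser package imports resolve, and an example.md exists on disk; the call shape mirrors DocReaderServicer.ReadFromFile in main.py).

from parser import Parser
from parser.config import ChunkingConfig

config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=50,
    separators=["\n\n", "\n", "。"],
    enable_multimodal=False,  # plain text only, so no storage/VLM config needed
)

with open("example.md", "rb") as f:
    content = f.read()

result = Parser().parse_file("example.md", "md", content, config)
if result:
    for chunk in result.chunks:
        print(chunk.seq, chunk.content[:50])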
docreader/parser/base_parser.py (new file, 1244 lines)
File diff suppressed because it is too large
docreader/parser/caption.py (new file, 360 lines)
@@ -0,0 +1,360 @@
import json
import logging
import os
import time
from dataclasses import dataclass, field
from typing import List, Optional, Union

import requests
import ollama


logger = logging.getLogger(__name__)


@dataclass
class ImageUrl:
    """Image URL data structure for caption requests."""

    url: Optional[str] = None
    detail: Optional[str] = None


@dataclass
class Content:
    """Content data structure that can contain text or image URL."""

    type: Optional[str] = None
    text: Optional[str] = None
    image_url: Optional[ImageUrl] = None


@dataclass
class SystemMessage:
    """System message for VLM model requests."""

    role: Optional[str] = None
    content: Optional[str] = None


@dataclass
class UserMessage:
    """User message for VLM model requests, can contain multiple content items."""

    role: Optional[str] = None
    content: List[Content] = field(default_factory=list)


@dataclass
class CompletionRequest:
    """Request structure for VLM model completion API."""

    model: str
    temperature: float
    top_p: float
    messages: List[Union[SystemMessage, UserMessage]]
    user: str


@dataclass
class Model:
    """Model identifier structure."""

    id: str


@dataclass
class ModelsResp:
    """Response structure for available models API."""

    data: List[Model] = field(default_factory=list)


@dataclass
class Message:
    """Message structure in API response."""

    role: Optional[str] = None
    content: Optional[str] = None
    tool_calls: Optional[str] = None


@dataclass
class Choice:
    """Choice structure in API response."""

    message: Optional[Message] = None


@dataclass
class Usage:
    """Token usage information in API response."""

    prompt_tokens: Optional[int] = 0
    total_tokens: Optional[int] = 0
    completion_tokens: Optional[int] = 0


@dataclass
class CaptionChatResp:
    """Response structure for caption chat API."""

    id: Optional[str] = None
    created: Optional[int] = None
    model: Optional[Model] = None
    object: Optional[str] = None
    choices: List[Choice] = field(default_factory=list)
    usage: Optional[Usage] = None

    @staticmethod
    def from_json(json_data: dict) -> "CaptionChatResp":
        """
        Parse API response JSON into a CaptionChatResp object.

        Args:
            json_data: The JSON response from the API

        Returns:
            A parsed CaptionChatResp object
        """
        logger.info("Parsing CaptionChatResp from JSON")
        # Manually parse nested fields with safe field extraction
        choices = []
        for choice in json_data.get("choices", []):
            message_data = choice.get("message", {})
            message = Message(
                role=message_data.get("role"),
                content=message_data.get("content"),
                tool_calls=message_data.get("tool_calls"),
            )
            choices.append(Choice(message=message))

        # Handle usage with safe field extraction
        usage_data = json_data.get("usage", {})
        usage = None
        if usage_data:
            usage = Usage(
                prompt_tokens=usage_data.get("prompt_tokens", 0),
                total_tokens=usage_data.get("total_tokens", 0),
                completion_tokens=usage_data.get("completion_tokens", 0),
            )

        logger.info(
            f"Parsed {len(choices)} choices and usage data: {usage is not None}"
        )
        return CaptionChatResp(
            id=json_data.get("id"),
            created=json_data.get("created"),
            model=json_data.get("model"),
            object=json_data.get("object"),
            choices=choices,
            usage=usage,
        )

    def choice_data(self) -> str:
        """
        Extract the content from the first choice in the response.

        Returns:
            The content string from the first choice, or empty string if no choices
        """
        if self.choices:
            logger.info("Retrieving content from first choice")
            return self.choices[0].message.content
        logger.warning("No choices available in response")
        return ""


class Caption:
    """
    Service for generating captions for images using a Vision Language Model.
    Uses an external API to process images and return textual descriptions.
    """

    def __init__(self, vlm_config=None):
        """Initialize the Caption service with configuration from parameters or environment variables."""
        logger.info("Initializing Caption service")
        # Prompt (in Chinese): briefly and concisely describe the main content of the image
        self.prompt = """简单凝炼的描述图片的主要内容"""

        # Use provided VLM config if available, otherwise fall back to environment variables
        if vlm_config and vlm_config.get("base_url") and vlm_config.get("model_name"):
            self.completion_url = vlm_config.get("base_url", "") + "/chat/completions"
            self.model = vlm_config.get("model_name", "")
            self.api_key = vlm_config.get("api_key", "")
            self.interface_type = vlm_config.get("interface_type", "openai").lower()
        else:
            # os.getenv returns None when unset, so check for both unset and empty
            if not os.getenv("VLM_MODEL_BASE_URL") or not os.getenv("VLM_MODEL_NAME"):
                logger.error("VLM_MODEL_BASE_URL or VLM_MODEL_NAME is not set")
                self.completion_url = None  # get_caption checks this before any API call
                return
            self.completion_url = os.getenv("VLM_MODEL_BASE_URL") + "/chat/completions"
            self.model = os.getenv("VLM_MODEL_NAME")
            self.api_key = os.getenv("VLM_MODEL_API_KEY")
            self.interface_type = os.getenv("VLM_INTERFACE_TYPE", "openai").lower()

        # Validate the interface type
        if self.interface_type not in ["ollama", "openai"]:
            logger.warning(f"Unknown interface type: {self.interface_type}, defaulting to openai")
            self.interface_type = "openai"

        logger.info(
            f"Service configured with model: {self.model}, endpoint: {self.completion_url}, interface: {self.interface_type}"
        )

    def _call_caption_api(self, image_data: str) -> Optional[CaptionChatResp]:
        """
        Call the Caption API to generate a description for the given image.

        Args:
            image_data: URL of the image or base64 encoded image data

        Returns:
            CaptionChatResp object if successful, None otherwise
        """
        logger.info("Calling Caption API for image captioning")
        logger.info(f"Processing image data: {image_data[:50] if len(image_data) > 50 else image_data}")

        # Choose the call path based on the interface type
        if self.interface_type == "ollama":
            return self._call_ollama_api(image_data)
        else:
            return self._call_openai_api(image_data)

    def _call_ollama_api(self, image_base64: str) -> Optional[CaptionChatResp]:
        """Call Ollama API for image captioning using base64 encoded image data."""

        host = self.completion_url.replace("/v1/chat/completions", "")

        client = ollama.Client(
            host=host,
        )

        try:
            logger.info(f"Calling Ollama API with model: {self.model}")

            # Call the Ollama API, passing the base64-encoded image via the images parameter
            response = client.generate(
                model=self.model,
                prompt=self.prompt,  # same brief-description prompt as the OpenAI path
                images=[image_base64],  # base64-encoded image data
                options={"temperature": 0.1},
                stream=False,
            )

            # Build the response object
            caption_resp = CaptionChatResp(
                id="ollama_response",
                created=int(time.time()),
                model=self.model,
                object="chat.completion",
                choices=[
                    Choice(
                        message=Message(
                            role="assistant",
                            content=response.response
                        )
                    )
                ]
            )

            logger.info("Successfully received response from Ollama API")
            return caption_resp

        except Exception as e:
            logger.error(f"Error calling Ollama API: {e}")
            return None

    def _call_openai_api(self, image_base64: str) -> Optional[CaptionChatResp]:
        """Call OpenAI-compatible API for image captioning."""
        logger.info(f"Calling OpenAI-compatible API with model: {self.model}")

        user_msg = UserMessage(
            role="user",
            content=[
                Content(type="text", text=self.prompt),
                Content(
                    type="image_url", image_url=ImageUrl(url="data:image/png;base64," + image_base64, detail="auto")
                ),
            ],
        )

        gpt_req = CompletionRequest(
            model=self.model,
            temperature=0.3,
            top_p=0.8,
            messages=[user_msg],
            user="abc",
        )

        headers = {
            "Content-Type": "application/json",
            "Accept": "text/event-stream",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        }
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        try:
            logger.info(f"Sending request to OpenAI-compatible API with model: {self.model}")
            response = requests.post(
                self.completion_url,
                data=json.dumps(gpt_req, default=lambda o: o.__dict__, indent=4),
                headers=headers,
                timeout=30,
            )
            if response.status_code != 200:
                logger.error(
                    f"OpenAI-compatible API returned non-200 status code: {response.status_code}"
                )
                response.raise_for_status()

            logger.info(
                f"Successfully received response from OpenAI-compatible API with status: {response.status_code}"
            )
            logger.info("Converting response to CaptionChatResp object")
            caption_resp = CaptionChatResp.from_json(response.json())

            if caption_resp.usage:
                logger.info(
                    f"API usage: prompt_tokens={caption_resp.usage.prompt_tokens}, "
                    f"completion_tokens={caption_resp.usage.completion_tokens}"
                )

            return caption_resp
        except requests.exceptions.Timeout:
            logger.error("Timeout while calling OpenAI-compatible API after 30 seconds")
            return None
        except requests.exceptions.RequestException as e:
            logger.error(f"Request error calling OpenAI-compatible API: {e}")
            return None
        except Exception as e:
            logger.error(f"Unexpected error calling OpenAI-compatible API: {e}")
            return None

    def get_caption(self, image_data: str) -> str:
        """
        Get a caption for the provided image data.

        Args:
            image_data: URL of the image or base64 encoded image data

        Returns:
            Caption text as string, or empty string if captioning failed
        """
        logger.info("Getting caption for image")
        if not image_data or self.completion_url is None:
            logger.error("Image data is not set")
            return ""
        caption_resp = self._call_caption_api(image_data)
        if caption_resp:
            caption = caption_resp.choice_data()
            caption_length = len(caption)
            logger.info(f"Successfully generated caption of length {caption_length}")
            logger.info(
                f"Caption: {caption[:50]}..."
                if caption_length > 50
                else f"Caption: {caption}"
            )
            return caption
        logger.warning("Failed to get caption from Caption API")
        return ""
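A minimal usage sketch for the Caption service, assuming an OpenAI-compatible VLM endpoint; every value below is a placeholder, the dict keys are exactly the ones Caption.__init__ reads, and the import paths assume the docreader/parser package layout in this commit.

from parser.caption import Caption
from parser.image_utils import image_to_base64

caption_service = Caption(vlm_config={
    "base_url": "http://localhost:8000/v1",  # placeholder endpoint
    "model_name": "my-vlm",                  # placeholder model name
    "api_key": "",
    "interface_type": "openai",              # or "ollama"
})

# get_caption accepts base64-encoded image data (or an image URL)
image_b64 = image_to_base64("photo.png")     # placeholder path
print(caption_service.get_caption(image_b64))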
docreader/parser/config.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from dataclasses import dataclass, field


@dataclass
class ChunkingConfig:
    """
    Configuration for text chunking process.
    Controls how documents are split into smaller pieces for processing.
    """

    chunk_size: int = 512  # Maximum size of each chunk in tokens/chars
    chunk_overlap: int = 50  # Number of tokens/chars to overlap between chunks
    separators: list = field(
        default_factory=lambda: ["\n\n", "\n", "。"]
    )  # Text separators in order of priority
    enable_multimodal: bool = (
        False  # Whether to enable multimodal processing (text + images)
    )
    storage_config: dict = None  # Preferred field name going forward
    vlm_config: dict = None  # VLM configuration for image captioning
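The two dict fields are passed through to the parsers untouched; their expected shapes can be read off the request handling in main.py. A sketch with placeholder values:

from parser.config import ChunkingConfig

# Key names follow the dicts main.py builds from the gRPC request; values are placeholders.
storage_config = {
    "provider": "cos",  # or "minio"
    "region": "ap-guangzhou",
    "bucket_name": "example-bucket",
    "access_key_id": "...",
    "secret_access_key": "...",
    "app_id": "...",
    "path_prefix": "images/",
}
vlm_config = {
    "model_name": "my-vlm",
    "base_url": "http://localhost:8000/v1",
    "api_key": "",
    "interface_type": "openai",  # or "ollama"
}

config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=50,
    separators=["\n\n", "\n", "。"],
    enable_multimodal=True,  # image handling needs storage_config (and vlm_config for captions)
    storage_config=storage_config,
    vlm_config=vlm_config,
)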
docreader/parser/doc_parser.py (new file, 315 lines)
@@ -0,0 +1,315 @@
import asyncio
import logging
import re
import tempfile
import os
import subprocess
import shutil
from io import BytesIO
from typing import Optional, List, Tuple
import textract
from PIL import Image
import zipfile
import xml.etree.ElementTree as ET

from .base_parser import BaseParser
from .docx_parser import DocxParser, Docx

logger = logging.getLogger(__name__)


class DocParser(BaseParser):
    """DOC document parser"""

    def parse_into_text(self, content: bytes) -> str:
        """Parse DOC document

        Args:
            content: DOC document content

        Returns:
            Parse result
        """
        logger.info(f"Parsing DOC document, content size: {len(content)} bytes")

        # Save byte content as a temporary file
        with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as temp_file:
            temp_file_path = temp_file.name
            temp_file.write(content)
            temp_file.flush()
            logger.info(f"Saved DOC content to temporary file: {temp_file_path}")

        try:
            # First try to convert to docx format to extract images
            if self.enable_multimodal:
                logger.info("Multimodal enabled, attempting to extract images from DOC")
                docx_content = self._convert_doc_to_docx(temp_file_path)

                if docx_content:
                    logger.info("Successfully converted DOC to DOCX, using DocxParser")
                    # Use existing DocxParser to parse the converted docx
                    docx_parser = DocxParser(
                        file_name=self.file_name,
                        file_type="docx",
                        enable_multimodal=self.enable_multimodal,
                        chunk_size=self.chunk_size,
                        chunk_overlap=self.chunk_overlap,
                        chunking_config=self.chunking_config,
                        separators=self.separators,
                    )
                    text = docx_parser.parse_into_text(docx_content)
                    logger.info(f"Extracted {len(text)} characters using DocxParser")

                    # Clean up temporary file
                    os.unlink(temp_file_path)
                    logger.info(f"Deleted temporary file: {temp_file_path}")

                    return text
                else:
                    logger.warning(
                        "Failed to convert DOC to DOCX, falling back to text-only extraction"
                    )

            # If image extraction is not needed or conversion failed, try using antiword to extract text
            try:
                logger.info("Attempting to parse DOC file with antiword")
                # Check if antiword is installed
                antiword_path = self._find_antiword_path()

                if antiword_path:
                    # Use antiword to extract text directly
                    logger.info(f"Using antiword at {antiword_path} to extract text")
                    process = subprocess.Popen(
                        [antiword_path, temp_file_path],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                    )
                    stdout, stderr = process.communicate()

                    if process.returncode == 0:
                        text = stdout.decode("utf-8", errors="ignore")
                        logger.info(
                            f"Successfully extracted {len(text)} characters using antiword"
                        )

                        # Clean up temporary file
                        os.unlink(temp_file_path)
                        logger.info(f"Deleted temporary file: {temp_file_path}")

                        return text
                    else:
                        logger.warning(
                            f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
                        )
                else:
                    logger.warning("antiword not found, falling back to textract")
            except Exception as e:
                logger.warning(
                    f"Error using antiword: {str(e)}, falling back to textract"
                )

            # If antiword fails, try using textract
            logger.info("Parsing DOC file with textract")
            text = textract.process(temp_file_path, method="antiword").decode("utf-8")
            logger.info(
                f"Successfully extracted {len(text)} characters of text from DOC document using textract"
            )

            # Clean up temporary file
            os.unlink(temp_file_path)
            logger.info(f"Deleted temporary file: {temp_file_path}")

            return text
        except Exception as e:
            logger.error(f"Error parsing DOC document: {str(e)}")
            # Ensure temporary file is cleaned up
            if os.path.exists(temp_file_path):
                os.unlink(temp_file_path)
                logger.info(f"Deleted temporary file after error: {temp_file_path}")
            return ""

    def _convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
        """Convert DOC file to DOCX format

        Uses LibreOffice/OpenOffice for conversion

        Args:
            doc_path: DOC file path

        Returns:
            Byte stream of DOCX file content, or None if conversion fails
        """
        logger.info(f"Converting DOC to DOCX: {doc_path}")

        # Create a temporary directory to store the converted file
        temp_dir = tempfile.mkdtemp()
        docx_path = os.path.join(temp_dir, "converted.docx")

        try:
            # Check if LibreOffice or OpenOffice is installed
            soffice_path = self._find_soffice_path()
            if not soffice_path:
                logger.error(
                    "LibreOffice/OpenOffice not found, cannot convert DOC to DOCX"
                )
                return None

            # Execute conversion command
            logger.info(f"Using {soffice_path} to convert DOC to DOCX")
            cmd = [
                soffice_path,
                "--headless",
                "--convert-to",
                "docx",
                "--outdir",
                temp_dir,
                doc_path,
            ]

            logger.info(f"Running command: {' '.join(cmd)}")
            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            stdout, stderr = process.communicate()

            if process.returncode != 0:
                logger.error(
                    f"Error converting DOC to DOCX: {stderr.decode('utf-8', errors='ignore')}"
                )
                return None

            # Find the converted file
            for file in os.listdir(temp_dir):
                if file.endswith(".docx"):
                    converted_file = os.path.join(temp_dir, file)
                    logger.info(f"Found converted file: {converted_file}")

                    # Read the converted file content
                    with open(converted_file, "rb") as f:
                        docx_content = f.read()

                    logger.info(
                        f"Successfully read converted DOCX file, size: {len(docx_content)} bytes"
                    )
                    return docx_content

            logger.error("No DOCX file found after conversion")
            return None

        except Exception as e:
            logger.error(f"Error during DOC to DOCX conversion: {str(e)}")
            return None
        finally:
            # Clean up temporary directory
            try:
                shutil.rmtree(temp_dir)
                logger.info(f"Cleaned up temporary directory: {temp_dir}")
            except Exception as e:
                logger.warning(f"Failed to clean up temporary directory: {str(e)}")

    def _find_soffice_path(self) -> Optional[str]:
        """Find LibreOffice/OpenOffice executable path

        Returns:
            Executable path, or None if not found
        """
        # Common LibreOffice/OpenOffice executable paths
        possible_paths = [
            # Linux
            "/usr/bin/soffice",
            "/usr/lib/libreoffice/program/soffice",
            "/opt/libreoffice25.2/program/soffice",
            # macOS
            "/Applications/LibreOffice.app/Contents/MacOS/soffice",
            # Windows
            "C:\\Program Files\\LibreOffice\\program\\soffice.exe",
            "C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe",
        ]

        # Check if path is set in environment variable
        if os.environ.get("LIBREOFFICE_PATH"):
            possible_paths.insert(0, os.environ.get("LIBREOFFICE_PATH"))

        for path in possible_paths:
            if os.path.exists(path):
                logger.info(f"Found LibreOffice/OpenOffice at: {path}")
                return path

        # Try to find in PATH
        try:
            result = subprocess.run(
                ["which", "soffice"], capture_output=True, text=True
            )
            if result.returncode == 0 and result.stdout.strip():
                path = result.stdout.strip()
                logger.info(f"Found LibreOffice/OpenOffice in PATH: {path}")
                return path
        except Exception:
            pass

        logger.warning("LibreOffice/OpenOffice not found")
        return None

    def _find_antiword_path(self) -> Optional[str]:
        """Find antiword executable path

        Returns:
            Executable path, or None if not found
        """
        # Common antiword executable paths
        possible_paths = [
            # Linux/macOS
            "/usr/bin/antiword",
            "/usr/local/bin/antiword",
            # Windows
            "C:\\Program Files\\Antiword\\antiword.exe",
            "C:\\Program Files (x86)\\Antiword\\antiword.exe",
        ]

        # Check if path is set in environment variable
        if os.environ.get("ANTIWORD_PATH"):
            possible_paths.insert(0, os.environ.get("ANTIWORD_PATH"))

        for path in possible_paths:
            if os.path.exists(path):
                logger.info(f"Found antiword at: {path}")
                return path

        # Try to find in PATH
        try:
            result = subprocess.run(
                ["which", "antiword"], capture_output=True, text=True
            )
            if result.returncode == 0 and result.stdout.strip():
                path = result.stdout.strip()
                logger.info(f"Found antiword in PATH: {path}")
                return path
        except Exception:
            pass

        logger.warning("antiword not found")
        return None


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    logger.info("Running DocParser in standalone mode")

    file_name = "/path/to/your/test.doc"
    logger.info(f"Processing file: {file_name}")

    doc_parser = DocParser(
        file_name, enable_multimodal=True, chunk_size=512, chunk_overlap=60
    )
    logger.info("Parser initialized, starting processing")

    with open(file_name, "rb") as f:
        content = f.read()

    text = doc_parser.parse_into_text(content)
    logger.info(f"Processing complete, extracted text length: {len(text)}")
    logger.info(f"Sample text: {text[:200]}...")
docreader/parser/docx_parser.py (new file, 1489 lines)
File diff suppressed because it is too large
docreader/parser/image_parser.py (new file, 68 lines)
@@ -0,0 +1,68 @@
import logging
import os
import asyncio
from PIL import Image
import io
from typing import Dict, Any, Tuple, Union
from .base_parser import BaseParser, ParseResult
import numpy as np

# Set up logger for this module
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class ImageParser(BaseParser):
    """
    Parser for image files with OCR capability.
    Extracts text from images and generates captions.

    This parser handles image processing by:
    1. Uploading the image to storage
    2. Generating a descriptive caption
    3. Performing OCR to extract text content
    4. Returning a combined result with both text and image reference
    """

    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
        """
        Parse image content, upload the image and return Markdown reference along with image map.

        Args:
            content: Raw image data (bytes)

        Returns:
            Tuple of (markdown_text, image_map) where image_map maps image URLs to PIL Image objects
        """
        logger.info(f"Parsing image content, size: {len(content)} bytes")
        image_map = {}

        try:
            # Upload image to storage service
            logger.info("Uploading image to storage")
            _, ext = os.path.splitext(self.file_name)
            image_url = self.upload_bytes(content, file_ext=ext)
            if not image_url:
                logger.error("Failed to upload image to storage")
                return "", {}
            logger.info(
                f"Successfully uploaded image, URL: {image_url[:50]}..."
                if len(image_url) > 50
                else f"Successfully uploaded image, URL: {image_url}"
            )

            # Create image object and add to map
            try:
                image = Image.open(io.BytesIO(content))
                image_map[image_url] = image
                logger.info(f"Added image to image_map for URL: {image_url}")
            except Exception as img_err:
                logger.error(f"Error creating image object: {str(img_err)}")

            # Return a Markdown image reference pointing at the uploaded URL
            markdown_text = f"![image]({image_url})"
            return markdown_text, image_map

        except Exception as e:
            logger.error(f"Error parsing image: {str(e)}")
            return "", {}
docreader/parser/image_utils.py (new file, 43 lines)
@@ -0,0 +1,43 @@
import base64
import io
import logging
from typing import Union
from PIL import Image
import numpy as np

logger = logging.getLogger(__name__)


def image_to_base64(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
    """Convert image to base64 encoded string

    Args:
        image: Image file path, bytes, PIL Image object, or numpy array

    Returns:
        Base64 encoded image string, or empty string if conversion fails
    """
    try:
        if isinstance(image, str):
            # It's a file path
            with open(image, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode("utf-8")
        elif isinstance(image, bytes):
            # It's bytes data
            return base64.b64encode(image).decode("utf-8")
        elif isinstance(image, Image.Image):
            # It's a PIL Image
            buffer = io.BytesIO()
            image.save(buffer, format="PNG")
            return base64.b64encode(buffer.getvalue()).decode("utf-8")
        elif isinstance(image, np.ndarray):
            # It's a numpy array
            pil_image = Image.fromarray(image)
            buffer = io.BytesIO()
            pil_image.save(buffer, format="PNG")
            return base64.b64encode(buffer.getvalue()).decode("utf-8")
        else:
            logger.error(f"Unsupported image type: {type(image)}")
            return ""
    except Exception as e:
        logger.error(f"Error converting image to base64: {str(e)}")
        return ""
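A quick self-contained check of image_to_base64 (a file path or raw bytes work the same way; the import path assumes the docreader/parser package layout above):

import numpy as np
from PIL import Image

from parser.image_utils import image_to_base64

pil_img = Image.new("RGB", (8, 8), "white")  # tiny in-memory test image
arr = np.zeros((8, 8, 3), dtype=np.uint8)    # black 8x8 image as a numpy array

print(len(image_to_base64(pil_img)) > 0)  # True: PIL Image input
print(len(image_to_base64(arr)) > 0)      # True: numpy array input
print(image_to_base64(42) == "")          # True: unsupported type logs an error and returns ""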
docreader/parser/markdown_parser.py (new file, 33 lines)
@@ -0,0 +1,33 @@
import asyncio
import re
import logging
import numpy as np
import os  # Import os module to get environment variables
from typing import Dict, List, Optional, Tuple, Union, Any
from .base_parser import BaseParser

# Get logger object
logger = logging.getLogger(__name__)


class MarkdownParser(BaseParser):
    """Markdown document parser"""

    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
        """Parse Markdown document, only extract text content, do not process images

        Args:
            content: Markdown document content

        Returns:
            Parsed text result
        """
        logger.info(f"Parsing Markdown document, content size: {len(content)} bytes")

        # Convert byte content to string using universal decoding method
        text = self.decode_bytes(content)
        logger.info(f"Decoded Markdown content, text length: {len(text)} characters")

        logger.info(f"Markdown parsing complete, extracted {len(text)} characters of text")
        return text
docreader/parser/ocr_engine.py (new file, 278 lines)
@@ -0,0 +1,278 @@
import os
import logging
from typing import Optional, Union
from abc import ABC, abstractmethod

import numpy as np
from PIL import Image

from .image_utils import image_to_base64

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class OCRBackend(ABC):
    """Base class for OCR backends"""

    @abstractmethod
    def predict(self, image: Union[str, bytes, Image.Image]) -> str:
        """Extract text from an image

        Args:
            image: Image file path, bytes, or PIL Image object

        Returns:
            Extracted text
        """
        pass


class PaddleOCRBackend(OCRBackend):
    """PaddleOCR backend implementation"""

    def __init__(self, **kwargs):
        """Initialize PaddleOCR backend"""
        self.ocr = None
        try:
            import paddle

            # Set PaddlePaddle to use CPU and disable GPU
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            paddle.set_device("cpu")

            # Try to detect whether the CPU supports the AVX instruction set
            try:
                import subprocess
                import platform

                if platform.system() == "Linux":
                    try:
                        result = subprocess.run(
                            ["grep", "-o", "avx", "/proc/cpuinfo"],
                            capture_output=True, text=True, timeout=5,
                        )
                        has_avx = "avx" in result.stdout.lower()
                        if not has_avx:
                            logger.warning("CPU does not support AVX instructions, using compatibility mode")
                            # Restrict the instruction set further
                            os.environ["FLAGS_use_avx2"] = "0"
                            os.environ["FLAGS_use_avx"] = "1"
                    except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError):
                        logger.warning("Could not detect AVX support, using compatibility mode")
                        os.environ["FLAGS_use_avx2"] = "0"
                        os.environ["FLAGS_use_avx"] = "1"
            except Exception as e:
                logger.warning(f"Error detecting CPU capabilities: {e}, using compatibility mode")
                os.environ["FLAGS_use_avx2"] = "0"
                os.environ["FLAGS_use_avx"] = "1"

            from paddleocr import PaddleOCR

            # OCR configuration with text orientation classification enabled
            ocr_config = {
                "use_gpu": False,
                "text_det_limit_type": "max",
                "text_det_limit_side_len": 960,
                "use_doc_orientation_classify": True,  # Enable document orientation classification
                "use_doc_unwarping": False,
                "use_textline_orientation": True,  # Enable text line orientation detection
                "text_recognition_model_name": "PP-OCRv4_server_rec",
                "text_detection_model_name": "PP-OCRv4_server_det",
                "text_det_thresh": 0.3,
                "text_det_box_thresh": 0.6,
                "text_det_unclip_ratio": 1.5,
                "text_rec_score_thresh": 0.0,
                "ocr_version": "PP-OCRv4",
                "lang": "ch",
                "show_log": False,
                "use_dilation": True,  # improves accuracy
                "det_db_score_mode": "slow",  # improves accuracy
            }

            self.ocr = PaddleOCR(**ocr_config)
            logger.info("PaddleOCR engine initialized successfully")

        except ImportError as e:
            logger.error(f"Failed to import paddleocr: {str(e)}. Please install it with 'pip install paddleocr'")
        except OSError as e:
            if "Illegal instruction" in str(e) or "core dumped" in str(e):
                logger.error(f"PaddlePaddle crashed due to CPU instruction set incompatibility: {str(e)}")
                logger.error("This usually happens when the CPU doesn't support AVX instructions.")
                logger.error("Please try installing a CPU-only version of PaddlePaddle or use a different OCR backend.")
            else:
                logger.error(f"Failed to initialize PaddleOCR due to OS error: {str(e)}")
        except Exception as e:
            logger.error(f"Failed to initialize PaddleOCR: {str(e)}")

    def predict(self, image):
        """Perform OCR recognition on the image

        Args:
            image: Image object (PIL.Image or numpy array)

        Returns:
            Extracted text string
        """
        try:
            # Ensure image is in RGB format
            if hasattr(image, "convert") and image.mode != "RGB":
                image = image.convert("RGB")

            # Convert to numpy array if needed
            if hasattr(image, "convert"):
                image_array = np.array(image)
            else:
                image_array = image

            # Perform OCR
            ocr_result = self.ocr.ocr(image_array, cls=False)

            # Extract text
            ocr_text = ""
            if ocr_result and ocr_result[0]:
                for line in ocr_result[0]:
                    if line and len(line) >= 2:
                        text = line[1][0] if line[1] else ""
                        if text:
                            ocr_text += text + " "

            text_length = len(ocr_text.strip())
            if text_length > 0:
                logger.info(f"OCR extracted {text_length} characters")
                return ocr_text.strip()
            else:
                logger.warning("OCR returned empty result")
                return ""

        except Exception as e:
            logger.error(f"OCR recognition error: {str(e)}")
            return ""


class NanonetsOCRBackend(OCRBackend):
    """Nanonets OCR backend implementation using OpenAI API format"""

    def __init__(self, **kwargs):
        """Initialize Nanonets OCR backend

        Args:
            api_key: API key for OpenAI API
            base_url: Base URL for OpenAI API
            model: Model name
        """
        try:
            from openai import OpenAI

            self.api_key = kwargs.get("api_key", "123")
            self.base_url = kwargs.get("base_url", "http://localhost:8000/v1")
            self.model = kwargs.get("model", "nanonets/Nanonets-OCR-s")
            self.temperature = kwargs.get("temperature", 0.0)
            self.max_tokens = kwargs.get("max_tokens", 15000)

            self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
            self.prompt = """
## Task

Extract the text content from the uploaded document in strict natural reading
order (top to bottom, left to right), following the formatting rules below.

### 1. **Text**

* Extract text in normal reading order, keeping sentences fluent and natural.

### 2. **Tables**

* Convert all tables to **Markdown table format**.
* Keep the content clear and well aligned for readability.

### 3. **Formulas**

* Convert all formulas to **LaTeX**, wrapped in `$$...$$`.

### 4. **Images**

* Ignore image information.

### 5. **Links**

* Do not guess or complete uncertain link addresses.
"""
            logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
        except ImportError:
            logger.error("Failed to import openai. Please install it with 'pip install openai'")
            self.client = None
        except Exception as e:
            logger.error(f"Failed to initialize Nanonets OCR: {str(e)}")
            self.client = None

    def predict(self, image: Union[str, bytes, Image.Image]) -> str:
        """Extract text from an image using Nanonets OCR

        Args:
            image: Image file path, bytes, or PIL Image object

        Returns:
            Extracted text
        """
        if self.client is None:
            logger.error("Nanonets OCR client not initialized")
            return ""

        try:
            # Encode image to base64
            img_base64 = image_to_base64(image)
            if not img_base64:
                return ""

            # Call Nanonets OCR API
            logger.info(f"Calling Nanonets OCR API with model: {self.model}")
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                            },
                            {
                                "type": "text",
                                "text": self.prompt,
                            },
                        ],
                    }
                ],
                temperature=self.temperature,
                max_tokens=self.max_tokens,
            )

            return response.choices[0].message.content
        except Exception as e:
            logger.error(f"Nanonets OCR prediction error: {str(e)}")
            return ""


class OCREngine:
    """OCR Engine factory class"""

    _instance = None

    @classmethod
    def get_instance(cls, backend_type="paddle", **kwargs) -> Optional[OCRBackend]:
        """Get OCR engine instance

        Args:
            backend_type: OCR backend type, one of: "paddle", "nanonets"
            **kwargs: Additional arguments for the backend

        Returns:
            OCR engine instance or None if initialization fails
        """
        if cls._instance is None:
            logger.info(f"Initializing OCR engine with backend: {backend_type}")

            if backend_type.lower() == "paddle":
                cls._instance = PaddleOCRBackend(**kwargs)
            elif backend_type.lower() == "nanonets":
                cls._instance = NanonetsOCRBackend(**kwargs)
            else:
                logger.error(f"Unknown OCR backend type: {backend_type}")
                return None

        return cls._instance
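
The factory above caches a single backend instance: after the first successful call, get_instance returns the same object even if a different backend_type is requested, and returns None only for an unknown backend type. A minimal usage sketch, assuming the package is importable as docreader.parser (the import path and the test image file are assumptions, not part of this commit):

from PIL import Image

from docreader.parser.ocr_engine import OCREngine

# Obtain the singleton backend. Backend-level initialization failures are
# logged inside the constructors and surface as empty prediction strings.
ocr = OCREngine.get_instance(backend_type="paddle")
if ocr is not None:
    img = Image.open("sample_scan.png")  # hypothetical test image
    print(ocr.predict(img))
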
206
docreader/parser/parser.py
Normal file
@@ -0,0 +1,206 @@
import logging
import traceback
from dataclasses import dataclass, field
from typing import Dict, Any, Optional, Type

from .base_parser import BaseParser, ParseResult
from .docx_parser import DocxParser
from .doc_parser import DocParser
from .pdf_parser import PDFParser
from .markdown_parser import MarkdownParser
from .text_parser import TextParser
from .image_parser import ImageParser
from .web_parser import WebParser
from .config import ChunkingConfig

logger = logging.getLogger(__name__)


@dataclass
class Chunk:
    """
    Represents a single text chunk with associated metadata.
    Basic unit for document processing and embedding.
    """

    content: str  # Text content of the chunk
    metadata: Dict[str, Any] = field(default_factory=dict)  # Associated metadata (source, page number, etc.)


class Parser:
    """
    Document parser facade that integrates all specialized parsers.
    Provides a unified interface for parsing various document types.
    """

    def __init__(self):
        logger.info("Initializing document parser")
        # Initialize all parser types
        self.parsers: Dict[str, Type[BaseParser]] = {
            "docx": DocxParser,
            "doc": DocParser,
            "pdf": PDFParser,
            "md": MarkdownParser,
            "markdown": MarkdownParser,
            "txt": TextParser,
            "jpg": ImageParser,
            "jpeg": ImageParser,
            "png": ImageParser,
            "gif": ImageParser,
            "bmp": ImageParser,
            "tiff": ImageParser,
            "webp": ImageParser,
        }
        logger.info(
            "Parser initialized with %d parsers: %s",
            len(self.parsers),
            ", ".join(self.parsers.keys()),
        )

    def get_parser(self, file_type: str) -> Optional[Type[BaseParser]]:
        """
        Get parser class for the specified file type.

        Args:
            file_type: The file extension or type identifier

        Returns:
            Parser class for the file type, or None if unsupported
        """
        file_type = file_type.lower()
        parser = self.parsers.get(file_type)
        if parser:
            logger.info(f"Found parser for file type: {file_type}")
        else:
            logger.warning(f"No parser found for file type: {file_type}")
        return parser

    def parse_file(
        self,
        file_name: str,
        file_type: str,
        content: bytes,
        config: ChunkingConfig,
    ) -> Optional[ParseResult]:
        """
        Parse file content using the appropriate parser based on file type.

        Args:
            file_name: Name of the file being parsed
            file_type: Type/extension of the file
            content: Raw file content as bytes
            config: Configuration for the chunking process

        Returns:
            ParseResult containing chunks and metadata, or None if parsing failed
        """
        logger.info(f"Parsing file: {file_name} with type: {file_type}")
        logger.info(
            f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, "
            f"multimodal={config.enable_multimodal}"
        )

        parser_instance = None

        try:
            # Get appropriate parser for file type
            cls = self.get_parser(file_type)
            if cls is None:
                logger.error(f"Unsupported file type: {file_type}")
                return None

            # Parse file content
            logger.info(f"Creating parser instance for {file_type} file")
            parser_instance = cls(
                file_name=file_name,
                file_type=file_type,
                chunk_size=config.chunk_size,
                chunk_overlap=config.chunk_overlap,
                separators=config.separators,
                enable_multimodal=config.enable_multimodal,
                max_image_size=1920,  # Limit image size to 1920px
                max_concurrent_tasks=5,  # Limit concurrent tasks to 5
                chunking_config=config,  # Pass the entire chunking config
            )

            logger.info(f"Starting to parse file content, size: {len(content)} bytes")
            result = parser_instance.parse(content)

            if result:
                logger.info(
                    f"Successfully parsed file {file_name}, generated {len(result.chunks)} chunks"
                )
                if result.chunks:
                    logger.info(
                        f"First chunk content length: {len(result.chunks[0].content)}"
                    )
                else:
                    logger.warning(f"Parser returned empty chunks for file: {file_name}")
            else:
                logger.warning(f"Parser returned None result for file: {file_name}")

            # Return parse results
            return result

        except Exception as e:
            logger.error(f"Error parsing file {file_name}: {str(e)}")
            logger.info(f"Detailed traceback: {traceback.format_exc()}")
            return None

    def parse_url(
        self, url: str, title: str, config: ChunkingConfig
    ) -> Optional[ParseResult]:
        """
        Parse content from a URL using the WebParser.

        Args:
            url: URL to parse
            title: Title of the webpage (for metadata)
            config: Configuration for the chunking process

        Returns:
            ParseResult containing chunks and metadata, or None if parsing failed
        """
        logger.info(f"Parsing URL: {url}, title: {title}")
        logger.info(
            f"Chunking config: size={config.chunk_size}, overlap={config.chunk_overlap}, "
            f"multimodal={config.enable_multimodal}"
        )

        parser_instance = None

        try:
            # Create web parser instance
            logger.info("Creating WebParser instance")
            parser_instance = WebParser(
                title=title,
                chunk_size=config.chunk_size,
                chunk_overlap=config.chunk_overlap,
                separators=config.separators,
                enable_multimodal=config.enable_multimodal,
                max_image_size=1920,  # Limit image size
                max_concurrent_tasks=5,  # Limit concurrent tasks
                chunking_config=config,
            )

            logger.info("Starting to parse URL content")
            result = parser_instance.parse(url)

            if result:
                logger.info(
                    f"Successfully parsed URL, generated {len(result.chunks)} chunks"
                )
                logger.info(
                    f"First chunk content length: {len(result.chunks[0].content) if result.chunks else 0}"
                )
            else:
                logger.warning(f"Parser returned empty result for URL: {url}")

            # Return parse results
            return result

        except Exception as e:
            logger.error(f"Error parsing URL {url}: {str(e)}")
            logger.info(f"Detailed traceback: {traceback.format_exc()}")
            return None
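
A sketch of driving the facade above end to end. ChunkingConfig lives in .config, which is not shown in this commit, so the constructor arguments below are assumptions inferred from the fields parse_file reads (chunk_size, chunk_overlap, separators, enable_multimodal); the input file is hypothetical:

from docreader.parser.config import ChunkingConfig
from docreader.parser.parser import Parser

config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=64,
    separators=["\n\n", "\n", "。"],
    enable_multimodal=False,
)

parser = Parser()
with open("report.pdf", "rb") as f:  # hypothetical input file
    result = parser.parse_file("report.pdf", "pdf", f.read(), config)

# Each chunk carries the text slice plus positional metadata.
if result is not None:
    for chunk in result.chunks:
        print(len(chunk.content))
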
113
docreader/parser/pdf_parser.py
Normal file
@@ -0,0 +1,113 @@
import logging
import os
import tempfile
from typing import Any, Tuple, Dict, Union

import pdfplumber

from .base_parser import BaseParser

logger = logging.getLogger(__name__)


class PDFParser(BaseParser):
    """
    PDF Document Parser

    This parser handles PDF documents by extracting text content.
    It uses the pdfplumber library for text and table extraction.
    """

    def _convert_table_to_markdown(self, table_data: list) -> str:
        """Convert a pdfplumber table (list of rows) into a Markdown table."""
        if not table_data or not table_data[0]:
            return ""

        def clean_cell(cell):
            if cell is None:
                return ""
            return str(cell).replace("\n", " <br> ")

        try:
            markdown = ""
            header = [clean_cell(cell) for cell in table_data[0]]
            markdown += "| " + " | ".join(header) + " |\n"
            markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
            for row in table_data[1:]:
                if not row:
                    continue
                body_row = [clean_cell(cell) for cell in row]
                if len(body_row) != len(header):
                    logger.warning(f"Skipping malformed table row: {body_row}")
                    continue
                markdown += "| " + " | ".join(body_row) + " |\n"
            return markdown
        except Exception as e:
            logger.error(f"Error converting table to markdown: {e}")
            return ""

    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
        """Extract text and tables from a PDF, rendering tables as Markdown."""
        logger.info(f"Parsing PDF with pdfplumber, content size: {len(content)} bytes")

        all_page_content = []

        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        temp_pdf_path = temp_pdf.name

        try:
            temp_pdf.write(content)
            temp_pdf.close()
            logger.info(f"PDF content written to temporary file: {temp_pdf_path}")

            with pdfplumber.open(temp_pdf_path) as pdf:
                logger.info(f"PDF has {len(pdf.pages)} pages")

                for page_num, page in enumerate(pdf.pages):
                    page_content_parts = []

                    # Try-fallback strategy for table detection
                    default_settings = {"vertical_strategy": "lines", "horizontal_strategy": "lines"}
                    found_tables = page.find_tables(default_settings)
                    if not found_tables:
                        logger.info(f"Page {page_num + 1}: Default strategy found no tables. Trying fallback strategy.")
                        fallback_settings = {"vertical_strategy": "text", "horizontal_strategy": "lines"}
                        found_tables = page.find_tables(fallback_settings)

                    table_bboxes = [table.bbox for table in found_tables]

                    # Define a filter function that keeps objects NOT inside any table bbox.
                    def not_within_bboxes(obj):
                        """Check if an object is outside all table bounding boxes."""
                        for bbox in table_bboxes:
                            # Check if the object's vertical center is within a bbox
                            if bbox[1] <= (obj["top"] + obj["bottom"]) / 2 <= bbox[3]:
                                return False  # It's inside a table, so we DON'T keep it
                        return True  # It's outside all tables, so we DO keep it

                    # Create a filtered view of the page that contains only the non-table text.
                    non_table_page = page.filter(not_within_bboxes)

                    # Now, extract text from this filtered page view.
                    text = non_table_page.extract_text(x_tolerance=2)
                    if text:
                        page_content_parts.append(text)

                    # Process and append the structured Markdown tables
                    if found_tables:
                        logger.info(f"Found {len(found_tables)} tables on page {page_num + 1}")
                        for table in found_tables:
                            markdown_table = self._convert_table_to_markdown(table.extract())
                            page_content_parts.append(f"\n\n{markdown_table}\n\n")

                    all_page_content.append("".join(page_content_parts))

            final_text = "\n\n--- Page Break ---\n\n".join(all_page_content)
            logger.info(f"PDF parsing complete. Extracted {len(final_text)} text chars.")

            return final_text

        except Exception as e:
            logger.error(f"Failed to parse PDF document: {str(e)}")
            return ""
        finally:
            # This block always executes, preventing temp-file leaks.
            if os.path.exists(temp_pdf_path):
                try:
                    os.remove(temp_pdf_path)
                    logger.info(f"Temporary file cleaned up: {temp_pdf_path}")
                except OSError as e:
                    logger.error(f"Error removing temporary file {temp_pdf_path}: {e}")
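
For reference, _convert_table_to_markdown turns pdfplumber's row-major cell lists into a GitHub-style Markdown table, skipping rows whose column count does not match the header. A small data-level illustration (the input is a made-up example of what Table.extract() returns, not output from a real PDF):

# table_data in the shape returned by pdfplumber's Table.extract():
table_data = [["Name", "Qty"], ["apple", "3"], ["pear"], ["plum", "5"]]
# Expected Markdown (the one-cell "pear" row is skipped with a warning):
# | Name | Qty |
# | --- | --- |
# | apple | 3 |
# | plum | 5 |
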
360
docreader/parser/storage.py
Normal file
@@ -0,0 +1,360 @@
# -*- coding: utf-8 -*-
import os
import uuid
import logging
import io
import traceback
from abc import ABC, abstractmethod

from qcloud_cos import CosConfig, CosS3Client
from minio import Minio

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class Storage(ABC):
    """Abstract base class for object storage operations"""

    @abstractmethod
    def upload_file(self, file_path: str) -> str:
        """Upload file to object storage

        Args:
            file_path: File path

        Returns:
            File URL
        """
        pass

    @abstractmethod
    def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
        """Upload bytes to object storage

        Args:
            content: Byte content to upload
            file_ext: File extension

        Returns:
            File URL
        """
        pass


class CosStorage(Storage):
    """Tencent Cloud COS storage implementation"""

    def __init__(self, storage_config=None):
        """Initialize COS storage

        Args:
            storage_config: Storage configuration
        """
        self.storage_config = storage_config
        self.client, self.bucket_name, self.region, self.prefix = self._init_cos_client()

    def _init_cos_client(self):
        """Initialize Tencent Cloud COS client"""
        try:
            # Use provided COS config if available, otherwise fall back to environment variables
            if self.storage_config and self.storage_config.get("access_key_id") != "":
                cos_config = self.storage_config
                secret_id = cos_config.get("access_key_id")
                secret_key = cos_config.get("secret_access_key")
                region = cos_config.get("region")
                bucket_name = cos_config.get("bucket_name")
                appid = cos_config.get("app_id")
                prefix = cos_config.get("path_prefix", "")
            else:
                # Get COS configuration from environment variables
                secret_id = os.getenv("COS_SECRET_ID")
                secret_key = os.getenv("COS_SECRET_KEY")
                region = os.getenv("COS_REGION")
                bucket_name = os.getenv("COS_BUCKET_NAME")
                appid = os.getenv("COS_APP_ID")
                prefix = os.getenv("COS_PATH_PREFIX")

            enable_old_domain = (
                os.getenv("COS_ENABLE_OLD_DOMAIN", "true").lower() == "true"
            )

            if not all([secret_id, secret_key, region, bucket_name, appid]):
                logger.error(
                    "Incomplete COS configuration, missing required environment variables: "
                    f"secret_id: {secret_id}, secret_key: {secret_key}, region: {region}, "
                    f"bucket_name: {bucket_name}, appid: {appid}"
                )
                return None, None, None, None

            # Initialize COS configuration
            logger.info(
                f"Initializing COS client with region: {region}, bucket: {bucket_name}"
            )
            config = CosConfig(
                Appid=appid,
                Region=region,
                SecretId=secret_id,
                SecretKey=secret_key,
                EnableOldDomain=enable_old_domain,
            )

            # Create client
            client = CosS3Client(config)
            return client, bucket_name, region, prefix
        except Exception as e:
            logger.error(f"Failed to initialize COS client: {str(e)}")
            return None, None, None, None

    def _get_download_url(self, bucket_name, region, object_key):
        """Generate COS object URL

        Args:
            bucket_name: Bucket name
            region: Region
            object_key: Object key

        Returns:
            File URL
        """
        return f"https://{bucket_name}.cos.{region}.myqcloud.com/{object_key}"

    def upload_file(self, file_path: str) -> str:
        """Upload file to Tencent Cloud COS

        Args:
            file_path: File path

        Returns:
            File URL
        """
        logger.info(f"Uploading file to COS: {file_path}")
        try:
            if not self.client:
                return ""

            # Generate object key, using a UUID to avoid conflicts
            file_name = os.path.basename(file_path)
            file_ext = os.path.splitext(file_name)[1]
            object_key = (
                f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
                if self.prefix
                else f"images/{uuid.uuid4().hex}{file_ext}"
            )
            logger.info(f"Generated object key: {object_key}")

            # Upload file
            logger.info("Attempting to upload file to COS")
            self.client.upload_file(
                Bucket=self.bucket_name, LocalFilePath=file_path, Key=object_key
            )

            # Get file URL
            file_url = self._get_download_url(self.bucket_name, self.region, object_key)

            logger.info(f"Successfully uploaded file to COS: {file_url}")
            return file_url

        except Exception as e:
            logger.error(f"Failed to upload file to COS: {str(e)}")
            return ""

    def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
        """Upload bytes to Tencent Cloud COS

        Args:
            content: Byte content to upload
            file_ext: File extension

        Returns:
            File URL
        """
        try:
            logger.info(f"Uploading bytes content to COS, size: {len(content)} bytes")
            if not self.client:
                return ""

            object_key = (
                f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
                if self.prefix
                else f"images/{uuid.uuid4().hex}{file_ext}"
            )
            logger.info(f"Generated object key: {object_key}")
            self.client.put_object(Bucket=self.bucket_name, Body=content, Key=object_key)
            file_url = self._get_download_url(self.bucket_name, self.region, object_key)
            logger.info(f"Successfully uploaded bytes to COS: {file_url}")
            return file_url
        except Exception as e:
            logger.error(f"Failed to upload bytes to COS: {str(e)}")
            traceback.print_exc()
            return ""


class MinioStorage(Storage):
    """MinIO storage implementation"""

    def __init__(self, storage_config=None):
        """Initialize MinIO storage

        Args:
            storage_config: Storage configuration
        """
        self.storage_config = storage_config
        (
            self.client,
            self.bucket_name,
            self.use_ssl,
            self.endpoint,
            self.path_prefix,
        ) = self._init_minio_client()

    def _init_minio_client(self):
        """Initialize MinIO client from environment variables or injected config.

        If a storage_config is injected by the server (for the MinIO case),
        its values take precedence over the environment.
        """
        try:
            endpoint = os.getenv("MINIO_ENDPOINT")
            use_ssl = os.getenv("MINIO_USE_SSL", "false").lower() == "true"
            if self.storage_config and self.storage_config.get("bucket_name"):
                storage_config = self.storage_config
                bucket_name = storage_config.get("bucket_name")
                path_prefix = (storage_config.get("path_prefix") or "").strip().strip("/")
                access_key = storage_config.get("access_key_id")
                secret_key = storage_config.get("secret_access_key")
            else:
                access_key = os.getenv("MINIO_ACCESS_KEY_ID")
                secret_key = os.getenv("MINIO_SECRET_ACCESS_KEY")
                bucket_name = os.getenv("MINIO_BUCKET_NAME")
                path_prefix = os.getenv("MINIO_PATH_PREFIX", "").strip().strip("/")

            if not all([endpoint, access_key, secret_key, bucket_name]):
                logger.error("Incomplete MinIO configuration, missing required environment variables")
                return None, None, None, None, None

            # Initialize client
            client = Minio(endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl)

            # Ensure the bucket exists, creating it with a public-read policy if not
            found = client.bucket_exists(bucket_name)
            if not found:
                client.make_bucket(bucket_name)
                policy = (
                    '{"Version":"2012-10-17","Statement":['
                    '{"Effect":"Allow","Principal":{"AWS":["*"]},'
                    '"Action":["s3:GetBucketLocation","s3:ListBucket"],'
                    '"Resource":["arn:aws:s3:::%s"]},'
                    '{"Effect":"Allow","Principal":{"AWS":["*"]},'
                    '"Action":["s3:GetObject"],'
                    '"Resource":["arn:aws:s3:::%s/*"]}]}'
                ) % (bucket_name, bucket_name)
                client.set_bucket_policy(bucket_name, policy)

            return client, bucket_name, use_ssl, endpoint, path_prefix
        except Exception as e:
            logger.error(f"Failed to initialize MinIO client: {str(e)}")
            return None, None, None, None, None

    def _get_download_url(self, bucket_name: str, object_key: str, use_ssl: bool, endpoint: str, public_endpoint: str = None):
        """Construct a public URL for a MinIO object.

        If MINIO_PUBLIC_ENDPOINT is provided, use it; otherwise fall back to the endpoint.
        """
        if public_endpoint:
            base = public_endpoint
        else:
            scheme = "https" if use_ssl else "http"
            base = f"{scheme}://{endpoint}"
        # Path-style URL for MinIO
        return f"{base}/{bucket_name}/{object_key}"

    def upload_file(self, file_path: str) -> str:
        """Upload file to MinIO

        Args:
            file_path: File path

        Returns:
            File URL
        """
        logger.info(f"Uploading file to MinIO: {file_path}")
        try:
            if not self.client:
                return ""

            # Generate object key, using a UUID to avoid conflicts
            file_name = os.path.basename(file_path)
            file_ext = os.path.splitext(file_name)[1]
            object_key = (
                f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}"
                if self.path_prefix
                else f"images/{uuid.uuid4().hex}{file_ext}"
            )
            logger.info(f"Generated MinIO object key: {object_key}")

            # Upload file
            logger.info("Attempting to upload file to MinIO")
            with open(file_path, "rb") as file_data:
                file_size = os.path.getsize(file_path)
                self.client.put_object(
                    bucket_name=self.bucket_name,
                    object_name=object_key,
                    data=file_data,
                    length=file_size,
                    content_type="application/octet-stream",
                )

            # Get file URL
            file_url = self._get_download_url(
                self.bucket_name,
                object_key,
                self.use_ssl,
                self.endpoint,
                os.getenv("MINIO_PUBLIC_ENDPOINT", None),
            )

            logger.info(f"Successfully uploaded file to MinIO: {file_url}")
            return file_url

        except Exception as e:
            logger.error(f"Failed to upload file to MinIO: {str(e)}")
            return ""

    def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
        """Upload bytes to MinIO

        Args:
            content: Byte content to upload
            file_ext: File extension

        Returns:
            File URL
        """
        try:
            logger.info(f"Uploading bytes content to MinIO, size: {len(content)} bytes")
            if not self.client:
                return ""

            object_key = (
                f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}"
                if self.path_prefix
                else f"images/{uuid.uuid4().hex}{file_ext}"
            )
            logger.info(f"Generated MinIO object key: {object_key}")
            self.client.put_object(
                self.bucket_name,
                object_key,
                data=io.BytesIO(content),
                length=len(content),
                content_type="application/octet-stream",
            )
            file_url = self._get_download_url(
                self.bucket_name,
                object_key,
                self.use_ssl,
                self.endpoint,
                os.getenv("MINIO_PUBLIC_ENDPOINT", None),
            )
            logger.info(f"Successfully uploaded bytes to MinIO: {file_url}")
            return file_url
        except Exception as e:
            logger.error(f"Failed to upload bytes to MinIO: {str(e)}")
            traceback.print_exc()
            return ""


def create_storage(storage_config=None) -> Storage:
    """Create a storage instance based on configuration or environment variables

    Args:
        storage_config: Storage configuration dictionary

    Returns:
        Storage instance, or None for an unknown provider
    """
    storage_type = os.getenv("STORAGE_TYPE", "cos").lower()

    if storage_config:
        storage_type = str(storage_config.get("provider", storage_type)).lower()

    logger.info(f"Creating {storage_type} storage instance")

    if storage_type == "minio":
        return MinioStorage(storage_config)
    elif storage_type == "cos":
        # Default to COS
        return CosStorage(storage_config)
    else:
        return None
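
A short sketch of the factory above with MinIO selected through the environment. Every value below is a placeholder for a local test instance, not a real deployment setting:

import os

from docreader.parser.storage import create_storage

# Placeholder credentials and endpoint for a local MinIO instance.
os.environ.setdefault("STORAGE_TYPE", "minio")
os.environ.setdefault("MINIO_ENDPOINT", "localhost:9000")
os.environ.setdefault("MINIO_ACCESS_KEY_ID", "minioadmin")
os.environ.setdefault("MINIO_SECRET_ACCESS_KEY", "minioadmin")
os.environ.setdefault("MINIO_BUCKET_NAME", "docreader")

storage = create_storage()
if storage is not None and storage.client is not None:
    # Upload a tiny byte payload and print the resulting public URL.
    print(storage.upload_bytes(b"hello", file_ext=".txt"))
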
58
docreader/parser/text_parser.py
Normal file
@@ -0,0 +1,58 @@
import logging
from typing import Dict, Any, Tuple, Union

from .base_parser import BaseParser

logger = logging.getLogger(__name__)


class TextParser(BaseParser):
    """
    Text document parser for processing plain text files.
    This parser handles text extraction and chunking from plain text documents.
    """

    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
        """
        Parse text document content by decoding bytes to string.

        This is a straightforward parser that simply converts the binary content
        to text using the appropriate character encoding.

        Args:
            content: Raw document content as bytes

        Returns:
            Parsed text content as string
        """
        logger.info(f"Parsing text document, content size: {len(content)} bytes")
        text = self.decode_bytes(content)
        logger.info(
            f"Successfully parsed text document, extracted {len(text)} characters"
        )
        return text


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    logger.info("Running TextParser in standalone mode")

    # Sample text for testing
    text = """## 标题1
![image1](https://example.com/image1.png)
## 标题2
![image2](https://example.com/image2.png)
## 标题3
"""
    logger.info(f"Test text content: {text}")

    # Define separators for text splitting
    separators = ["\n\n", "\n", "。"]
    parser = TextParser(separators=separators)
    logger.info("Splitting text into units")
    units = parser._split_into_units(text)
    logger.info(f"Split text into {len(units)} units")
    logger.info(f"Units: {units}")
130
docreader/parser/web_parser.py
Normal file
@@ -0,0 +1,130 @@
import asyncio
import logging
import os
from typing import Any, Tuple, Dict, Union

from playwright.async_api import async_playwright
from bs4 import BeautifulSoup

from .base_parser import BaseParser

logger = logging.getLogger(__name__)


class WebParser(BaseParser):
    """Web page parser"""

    def __init__(self, title: str, **kwargs):
        self.title = title
        self.proxy = os.environ.get("WEB_PROXY", "")
        super().__init__(file_name=title, **kwargs)
        logger.info(f"Initialized WebParser with title: {title}")

    async def scrape(self, url: str) -> Any:
        logger.info(f"Starting web page scraping for URL: {url}")
        try:
            async with async_playwright() as p:
                kwargs = {}
                if self.proxy:
                    kwargs["proxy"] = {"server": self.proxy}
                logger.info("Launching WebKit browser")
                browser = await p.webkit.launch(**kwargs)
                page = await browser.new_page()

                logger.info(f"Navigating to URL: {url}")
                try:
                    await page.goto(url, timeout=30000)
                    logger.info("Initial page load complete")
                except Exception as e:
                    logger.error(f"Error navigating to URL: {str(e)}")
                    await browser.close()
                    # Return empty soup on navigation error
                    return BeautifulSoup("", "html.parser")

                logger.info("Retrieving page HTML content")
                content = await page.content()
                logger.info(f"Retrieved {len(content)} bytes of HTML content")

                await browser.close()
                logger.info("Browser closed")

                # Parse HTML content with BeautifulSoup
                logger.info("Parsing HTML with BeautifulSoup")
                soup = BeautifulSoup(content, "html.parser")
                logger.info("Successfully parsed HTML content")
                return soup

        except Exception as e:
            logger.error(f"Failed to scrape web page: {str(e)}")
            # Return empty BeautifulSoup object on error
            return BeautifulSoup("", "html.parser")

    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
        """Parse a web page

        Args:
            content: Web page URL, as bytes or string

        Returns:
            Parse result
        """
        logger.info("Starting web page parsing")

        # Call the async method synchronously
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        try:
            # Run the async method; handle content possibly being a string
            if isinstance(content, bytes):
                url = self.decode_bytes(content)
                logger.info(f"Decoded URL from bytes: {url}")
            else:
                url = content
                logger.info(f"Using content as URL directly: {url}")

            logger.info(f"Scraping web page: {url}")
            soup = loop.run_until_complete(self.scrape(url))

            # Extract page text
            logger.info("Extracting text from web page")
            text = soup.get_text("\n")
            logger.info(f"Extracted {len(text)} characters of text from URL: {url}")

            # Get the title, usually in the <title> or <h1> tag
            if self.title != "":
                title = self.title
                logger.info(f"Using provided title: {title}")
            else:
                title = soup.title.string if soup.title else None
                logger.info(f"Found title tag: {title}")

            if not title:  # If the <title> tag does not exist or is empty, try the <h1> tag
                h1_tag = soup.find("h1")
                if h1_tag:
                    title = h1_tag.get_text()
                    logger.info(f"Using h1 tag as title: {title}")
                else:
                    title = "Untitled Web Page"
                    logger.info("No title found, using default")

            logger.info(f"Web page title: {title}")
            text = "\n".join(
                line.strip() for line in text.splitlines() if line.strip()
            )

            result = title + "\n\n" + text
            logger.info(
                f"Web page parsing complete, total content: {len(result)} characters"
            )
            return result

        except Exception as e:
            logger.error(f"Error parsing web page: {str(e)}")
            return f"Error parsing web page: {str(e)}"

        finally:
            # Close the event loop
            logger.info("Closing event loop")
            loop.close()
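
A minimal sketch of the web parser in isolation. It assumes BaseParser's keyword arguments all have defaults (as the Parser facade's usage suggests) and that Playwright's WebKit browser has been installed (playwright install webkit); the target URL is a placeholder:

from docreader.parser.web_parser import WebParser

# The URL is passed through parse_into_text as bytes and decoded internally.
parser = WebParser(title="Example Domain")
text = parser.parse_into_text(b"https://example.com")
print(text[:200])
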
819
docreader/proto/docreader.pb.go
Normal file
@@ -0,0 +1,819 @@
// Code generated by protoc-gen-go. DO NOT EDIT.
// versions:
// 	protoc-gen-go v1.36.6
// 	protoc        v5.29.3
// source: docreader.proto

package proto

import (
	protoreflect "google.golang.org/protobuf/reflect/protoreflect"
	protoimpl "google.golang.org/protobuf/runtime/protoimpl"
	reflect "reflect"
	sync "sync"
	unsafe "unsafe"
)

const (
	// Verify that this generated code is sufficiently up-to-date.
	_ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion)
	// Verify that runtime/protoimpl is sufficiently up-to-date.
	_ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20)
)

// Object storage provider
type StorageProvider int32

const (
	StorageProvider_STORAGE_PROVIDER_UNSPECIFIED StorageProvider = 0
	StorageProvider_COS                          StorageProvider = 1 // Tencent Cloud COS
	StorageProvider_MINIO                        StorageProvider = 2 // MinIO/S3 compatible
)

// Enum value maps for StorageProvider.
var (
	StorageProvider_name = map[int32]string{
		0: "STORAGE_PROVIDER_UNSPECIFIED",
		1: "COS",
		2: "MINIO",
	}
	StorageProvider_value = map[string]int32{
		"STORAGE_PROVIDER_UNSPECIFIED": 0,
		"COS":                          1,
		"MINIO":                       2,
	}
)

func (x StorageProvider) Enum() *StorageProvider {
	p := new(StorageProvider)
	*p = x
	return p
}

func (x StorageProvider) String() string {
	return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x))
}

func (StorageProvider) Descriptor() protoreflect.EnumDescriptor {
	return file_docreader_proto_enumTypes[0].Descriptor()
}

func (StorageProvider) Type() protoreflect.EnumType {
	return &file_docreader_proto_enumTypes[0]
}

func (x StorageProvider) Number() protoreflect.EnumNumber {
	return protoreflect.EnumNumber(x)
}

// Deprecated: Use StorageProvider.Descriptor instead.
func (StorageProvider) EnumDescriptor() ([]byte, []int) {
	return file_docreader_proto_rawDescGZIP(), []int{0}
}

// Generic object storage configuration, compatible with both COS and MinIO
type StorageConfig struct {
	state           protoimpl.MessageState `protogen:"open.v1"`
	Provider        StorageProvider        `protobuf:"varint,1,opt,name=provider,proto3,enum=docreader.StorageProvider" json:"provider,omitempty"` // Storage provider
	Region          string                 `protobuf:"bytes,2,opt,name=region,proto3" json:"region,omitempty"` // Region (used by COS)
	BucketName      string                 `protobuf:"bytes,3,opt,name=bucket_name,json=bucketName,proto3" json:"bucket_name,omitempty"` // Bucket name
	AccessKeyId     string                 `protobuf:"bytes,4,opt,name=access_key_id,json=accessKeyId,proto3" json:"access_key_id,omitempty"` // Access key ID (used by MinIO/S3)
	SecretAccessKey string                 `protobuf:"bytes,5,opt,name=secret_access_key,json=secretAccessKey,proto3" json:"secret_access_key,omitempty"` // Access key secret (used by MinIO/S3)
	AppId           string                 `protobuf:"bytes,6,opt,name=app_id,json=appId,proto3" json:"app_id,omitempty"` // Application ID (used by COS)
	PathPrefix      string                 `protobuf:"bytes,7,opt,name=path_prefix,json=pathPrefix,proto3" json:"path_prefix,omitempty"` // Path prefix
	unknownFields   protoimpl.UnknownFields
	sizeCache       protoimpl.SizeCache
}

func (x *StorageConfig) Reset() {
|
||||
*x = StorageConfig{}
|
||||
mi := &file_docreader_proto_msgTypes[0]
|
||||
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
|
||||
ms.StoreMessageInfo(mi)
|
||||
}
|
||||
|
||||
func (x *StorageConfig) String() string {
|
||||
return protoimpl.X.MessageStringOf(x)
|
||||
}
|
||||
|
||||
func (*StorageConfig) ProtoMessage() {}
|
||||
|
||||
func (x *StorageConfig) ProtoReflect() protoreflect.Message {
|
||||
mi := &file_docreader_proto_msgTypes[0]
|
||||
if x != nil {
|
||||
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
|
||||
if ms.LoadMessageInfo() == nil {
|
||||
ms.StoreMessageInfo(mi)
|
||||
}
|
||||
return ms
|
||||
}
|
||||
return mi.MessageOf(x)
|
||||
}
|
||||
|
||||
// Deprecated: Use StorageConfig.ProtoReflect.Descriptor instead.
|
||||
func (*StorageConfig) Descriptor() ([]byte, []int) {
|
||||
return file_docreader_proto_rawDescGZIP(), []int{0}
|
||||
}
|
||||
|
||||
func (x *StorageConfig) GetProvider() StorageProvider {
|
||||
if x != nil {
|
||||
return x.Provider
|
||||
}
|
||||
return StorageProvider_STORAGE_PROVIDER_UNSPECIFIED
|
||||
}
|
||||
|
||||
func (x *StorageConfig) GetRegion() string {
|
||||
if x != nil {
|
||||
return x.Region
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (x *StorageConfig) GetBucketName() string {
|
||||
if x != nil {
|
||||
return x.BucketName
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (x *StorageConfig) GetAccessKeyId() string {
|
||||
if x != nil {
|
||||
return x.AccessKeyId
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (x *StorageConfig) GetSecretAccessKey() string {
|
||||
if x != nil {
|
||||
return x.SecretAccessKey
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (x *StorageConfig) GetAppId() string {
|
||||
if x != nil {
|
||||
return x.AppId
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (x *StorageConfig) GetPathPrefix() string {
|
||||
if x != nil {
|
||||
return x.PathPrefix
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// VLM 配置
|
||||
type VLMConfig struct {
|
||||
state protoimpl.MessageState `protogen:"open.v1"`
|
||||
ModelName string `protobuf:"bytes,1,opt,name=model_name,json=modelName,proto3" json:"model_name,omitempty"` // VLM Model Name
|
||||
BaseUrl string `protobuf:"bytes,2,opt,name=base_url,json=baseUrl,proto3" json:"base_url,omitempty"` // VLM Base URL
|
||||
ApiKey string `protobuf:"bytes,3,opt,name=api_key,json=apiKey,proto3" json:"api_key,omitempty"` // VLM API Key
|
||||
InterfaceType string `protobuf:"bytes,4,opt,name=interface_type,json=interfaceType,proto3" json:"interface_type,omitempty"` // VLM Interface Type: "ollama" or "openai"
|
||||
unknownFields protoimpl.UnknownFields
|
||||
sizeCache protoimpl.SizeCache
|
||||
}
|
||||
|
||||
func (x *VLMConfig) Reset() {
|
||||
*x = VLMConfig{}
|
||||
mi := &file_docreader_proto_msgTypes[1]
|
||||
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
|
||||
ms.StoreMessageInfo(mi)
|
||||
}
|
||||
|
||||
func (x *VLMConfig) String() string {
|
||||
return protoimpl.X.MessageStringOf(x)
|
||||
}
|
||||
|
||||
func (*VLMConfig) ProtoMessage() {}
|
||||
|
||||
func (x *VLMConfig) ProtoReflect() protoreflect.Message {
|
||||
mi := &file_docreader_proto_msgTypes[1]
|
||||
if x != nil {
|
||||
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
|
||||
if ms.LoadMessageInfo() == nil {
|
||||
ms.StoreMessageInfo(mi)
|
||||
}
|
||||
return ms
|
||||
}
|
||||
return mi.MessageOf(x)
|
||||
}
|
||||
|
||||
// Deprecated: Use VLMConfig.ProtoReflect.Descriptor instead.
|
||||
func (*VLMConfig) Descriptor() ([]byte, []int) {
|
||||
return file_docreader_proto_rawDescGZIP(), []int{1}
|
||||
}
|
||||
|
||||
func (x *VLMConfig) GetModelName() string {
|
||||
if x != nil {
|
||||
return x.ModelName
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (x *VLMConfig) GetBaseUrl() string {
|
||||
if x != nil {
|
||||
return x.BaseUrl
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (x *VLMConfig) GetApiKey() string {
|
||||
if x != nil {
|
||||
return x.ApiKey
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (x *VLMConfig) GetInterfaceType() string {
|
||||
if x != nil {
|
||||
return x.InterfaceType
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
type ReadConfig struct {
|
||||
state protoimpl.MessageState `protogen:"open.v1"`
|
||||
ChunkSize int32 `protobuf:"varint,1,opt,name=chunk_size,json=chunkSize,proto3" json:"chunk_size,omitempty"` // 分块大小
|
||||
ChunkOverlap int32 `protobuf:"varint,2,opt,name=chunk_overlap,json=chunkOverlap,proto3" json:"chunk_overlap,omitempty"` // 分块重叠
|
||||
Separators []string `protobuf:"bytes,3,rep,name=separators,proto3" json:"separators,omitempty"` // 分隔符
|
||||
EnableMultimodal bool `protobuf:"varint,4,opt,name=enable_multimodal,json=enableMultimodal,proto3" json:"enable_multimodal,omitempty"` // 多模态处理
|
||||
StorageConfig *StorageConfig `protobuf:"bytes,5,opt,name=storage_config,json=storageConfig,proto3" json:"storage_config,omitempty"` // 对象存储配置(通用)
|
||||
VlmConfig *VLMConfig `protobuf:"bytes,6,opt,name=vlm_config,json=vlmConfig,proto3" json:"vlm_config,omitempty"` // VLM 配置
|
||||
unknownFields protoimpl.UnknownFields
|
||||
sizeCache protoimpl.SizeCache
|
||||
}
|
||||
|
||||
func (x *ReadConfig) Reset() {
|
||||
*x = ReadConfig{}
|
||||
mi := &file_docreader_proto_msgTypes[2]
|
||||
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
|
||||
ms.StoreMessageInfo(mi)
|
||||
}
|
||||
|
||||
func (x *ReadConfig) String() string {
|
||||
return protoimpl.X.MessageStringOf(x)
|
||||
}
|
||||
|
||||
func (*ReadConfig) ProtoMessage() {}
|
||||
|
||||
func (x *ReadConfig) ProtoReflect() protoreflect.Message {
|
||||
mi := &file_docreader_proto_msgTypes[2]
|
||||
if x != nil {
|
||||
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
|
||||
if ms.LoadMessageInfo() == nil {
|
||||
ms.StoreMessageInfo(mi)
|
||||
}
|
||||
return ms
|
||||
}
|
||||
return mi.MessageOf(x)
|
||||
}
|
||||
|
||||
// Deprecated: Use ReadConfig.ProtoReflect.Descriptor instead.
|
||||
func (*ReadConfig) Descriptor() ([]byte, []int) {
|
||||
return file_docreader_proto_rawDescGZIP(), []int{2}
|
||||
}
|
||||
|
||||
func (x *ReadConfig) GetChunkSize() int32 {
|
||||
if x != nil {
|
||||
return x.ChunkSize
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func (x *ReadConfig) GetChunkOverlap() int32 {
|
||||
if x != nil {
|
||||
return x.ChunkOverlap
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func (x *ReadConfig) GetSeparators() []string {
|
||||
if x != nil {
|
||||
return x.Separators
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (x *ReadConfig) GetEnableMultimodal() bool {
|
||||
if x != nil {
|
||||
return x.EnableMultimodal
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (x *ReadConfig) GetStorageConfig() *StorageConfig {
|
||||
if x != nil {
|
||||
return x.StorageConfig
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (x *ReadConfig) GetVlmConfig() *VLMConfig {
|
||||
if x != nil {
|
||||
return x.VlmConfig
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// 从文件读取文档请求
|
||||
type ReadFromFileRequest struct {
|
||||
state protoimpl.MessageState `protogen:"open.v1"`
|
||||
FileContent []byte `protobuf:"bytes,1,opt,name=file_content,json=fileContent,proto3" json:"file_content,omitempty"` // 文件内容
|
||||
FileName string `protobuf:"bytes,2,opt,name=file_name,json=fileName,proto3" json:"file_name,omitempty"` // 文件名
|
||||
FileType string `protobuf:"bytes,3,opt,name=file_type,json=fileType,proto3" json:"file_type,omitempty"` // 文件类型
|
||||
ReadConfig *ReadConfig `protobuf:"bytes,4,opt,name=read_config,json=readConfig,proto3" json:"read_config,omitempty"`
|
||||
RequestId string `protobuf:"bytes,5,opt,name=request_id,json=requestId,proto3" json:"request_id,omitempty"`
|
||||
unknownFields protoimpl.UnknownFields
|
||||
sizeCache protoimpl.SizeCache
|
||||
}
|
||||
|
||||
func (x *ReadFromFileRequest) Reset() {
|
||||
*x = ReadFromFileRequest{}
|
||||
mi := &file_docreader_proto_msgTypes[3]
|
||||
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
|
||||
ms.StoreMessageInfo(mi)
|
||||
}
|
||||
|
||||
func (x *ReadFromFileRequest) String() string {
|
||||
return protoimpl.X.MessageStringOf(x)
|
||||
}
|
||||
|
||||
func (*ReadFromFileRequest) ProtoMessage() {}
|
||||
|
||||
func (x *ReadFromFileRequest) ProtoReflect() protoreflect.Message {
|
||||
mi := &file_docreader_proto_msgTypes[3]
|
||||
if x != nil {
|
||||
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
|
||||
if ms.LoadMessageInfo() == nil {
|
||||
ms.StoreMessageInfo(mi)
|
||||
}
|
||||
return ms
|
||||
}
|
||||
return mi.MessageOf(x)
|
||||
}
|
||||
|
||||
// Deprecated: Use ReadFromFileRequest.ProtoReflect.Descriptor instead.
|
||||
func (*ReadFromFileRequest) Descriptor() ([]byte, []int) {
|
||||
return file_docreader_proto_rawDescGZIP(), []int{3}
|
||||
}
|
||||
|
||||
func (x *ReadFromFileRequest) GetFileContent() []byte {
|
||||
if x != nil {
|
||||
return x.FileContent
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (x *ReadFromFileRequest) GetFileName() string {
|
||||
if x != nil {
|
||||
return x.FileName
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (x *ReadFromFileRequest) GetFileType() string {
|
||||
if x != nil {
|
||||
return x.FileType
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (x *ReadFromFileRequest) GetReadConfig() *ReadConfig {
|
||||
if x != nil {
|
||||
return x.ReadConfig
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (x *ReadFromFileRequest) GetRequestId() string {
|
||||
if x != nil {
|
||||
return x.RequestId
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// 从URL读取文档请求
|
||||
type ReadFromURLRequest struct {
|
||||
state protoimpl.MessageState `protogen:"open.v1"`
|
||||
Url string `protobuf:"bytes,1,opt,name=url,proto3" json:"url,omitempty"` // 文档URL
|
||||
Title string `protobuf:"bytes,2,opt,name=title,proto3" json:"title,omitempty"` // 标题
|
||||
ReadConfig *ReadConfig `protobuf:"bytes,3,opt,name=read_config,json=readConfig,proto3" json:"read_config,omitempty"`
|
||||
RequestId string `protobuf:"bytes,4,opt,name=request_id,json=requestId,proto3" json:"request_id,omitempty"`
|
||||
unknownFields protoimpl.UnknownFields
|
||||
sizeCache protoimpl.SizeCache
|
||||
}
|
||||
|
||||
func (x *ReadFromURLRequest) Reset() {
|
||||
*x = ReadFromURLRequest{}
|
||||
mi := &file_docreader_proto_msgTypes[4]
|
||||
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
|
||||
ms.StoreMessageInfo(mi)
|
||||
}
|
||||
|
||||
func (x *ReadFromURLRequest) String() string {
|
||||
return protoimpl.X.MessageStringOf(x)
|
||||
}
|
||||
|
||||
func (*ReadFromURLRequest) ProtoMessage() {}
|
||||
|
||||
func (x *ReadFromURLRequest) ProtoReflect() protoreflect.Message {
|
||||
mi := &file_docreader_proto_msgTypes[4]
|
||||
if x != nil {
|
||||
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
|
||||
if ms.LoadMessageInfo() == nil {
|
||||
ms.StoreMessageInfo(mi)
|
||||
}
|
||||
return ms
|
||||
}
|
||||
return mi.MessageOf(x)
|
||||
}
|
||||
|
||||
// Deprecated: Use ReadFromURLRequest.ProtoReflect.Descriptor instead.
|
||||
func (*ReadFromURLRequest) Descriptor() ([]byte, []int) {
|
||||
return file_docreader_proto_rawDescGZIP(), []int{4}
|
||||
}
|
||||
|
||||
func (x *ReadFromURLRequest) GetUrl() string {
|
||||
if x != nil {
|
||||
return x.Url
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (x *ReadFromURLRequest) GetTitle() string {
|
||||
if x != nil {
|
||||
return x.Title
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (x *ReadFromURLRequest) GetReadConfig() *ReadConfig {
|
||||
if x != nil {
|
||||
return x.ReadConfig
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (x *ReadFromURLRequest) GetRequestId() string {
|
||||
if x != nil {
|
||||
return x.RequestId
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// 图片信息
|
||||
type Image struct {
|
||||
state protoimpl.MessageState `protogen:"open.v1"`
|
||||
Url string `protobuf:"bytes,1,opt,name=url,proto3" json:"url,omitempty"` // 图片URL
|
||||
Caption string `protobuf:"bytes,2,opt,name=caption,proto3" json:"caption,omitempty"` // 图片描述
|
||||
OcrText string `protobuf:"bytes,3,opt,name=ocr_text,json=ocrText,proto3" json:"ocr_text,omitempty"` // OCR提取的文本内容
|
||||
OriginalUrl string `protobuf:"bytes,4,opt,name=original_url,json=originalUrl,proto3" json:"original_url,omitempty"` // 原始图片URL
|
||||
Start int32 `protobuf:"varint,5,opt,name=start,proto3" json:"start,omitempty"` // 图片在文本中的开始位置
|
||||
End int32 `protobuf:"varint,6,opt,name=end,proto3" json:"end,omitempty"` // 图片在文本中的结束位置
|
||||
unknownFields protoimpl.UnknownFields
|
||||
sizeCache protoimpl.SizeCache
|
||||
}
|
||||
|
||||
func (x *Image) Reset() {
|
||||
*x = Image{}
|
||||
mi := &file_docreader_proto_msgTypes[5]
|
||||
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
|
||||
ms.StoreMessageInfo(mi)
|
||||
}
|
||||
|
||||
func (x *Image) String() string {
|
||||
return protoimpl.X.MessageStringOf(x)
|
||||
}
|
||||
|
||||
func (*Image) ProtoMessage() {}
|
||||
|
||||
func (x *Image) ProtoReflect() protoreflect.Message {
|
||||
mi := &file_docreader_proto_msgTypes[5]
|
||||
if x != nil {
|
||||
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
|
||||
if ms.LoadMessageInfo() == nil {
|
||||
			ms.StoreMessageInfo(mi)
		}
		return ms
	}
	return mi.MessageOf(x)
}

// Deprecated: Use Image.ProtoReflect.Descriptor instead.
func (*Image) Descriptor() ([]byte, []int) {
	return file_docreader_proto_rawDescGZIP(), []int{5}
}

func (x *Image) GetUrl() string {
	if x != nil {
		return x.Url
	}
	return ""
}

func (x *Image) GetCaption() string {
	if x != nil {
		return x.Caption
	}
	return ""
}

func (x *Image) GetOcrText() string {
	if x != nil {
		return x.OcrText
	}
	return ""
}

func (x *Image) GetOriginalUrl() string {
	if x != nil {
		return x.OriginalUrl
	}
	return ""
}

func (x *Image) GetStart() int32 {
	if x != nil {
		return x.Start
	}
	return 0
}

func (x *Image) GetEnd() int32 {
	if x != nil {
		return x.End
	}
	return 0
}

type Chunk struct {
	state         protoimpl.MessageState `protogen:"open.v1"`
	Content       string                 `protobuf:"bytes,1,opt,name=content,proto3" json:"content,omitempty"` // Chunk content
	Seq           int32                  `protobuf:"varint,2,opt,name=seq,proto3" json:"seq,omitempty"`        // Order of the chunk within the document
	Start         int32                  `protobuf:"varint,3,opt,name=start,proto3" json:"start,omitempty"`    // Start position of the chunk in the document
	End           int32                  `protobuf:"varint,4,opt,name=end,proto3" json:"end,omitempty"`        // End position of the chunk in the document
	Images        []*Image               `protobuf:"bytes,5,rep,name=images,proto3" json:"images,omitempty"`   // Images contained in the chunk
	unknownFields protoimpl.UnknownFields
	sizeCache     protoimpl.SizeCache
}

func (x *Chunk) Reset() {
	*x = Chunk{}
	mi := &file_docreader_proto_msgTypes[6]
	ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
	ms.StoreMessageInfo(mi)
}

func (x *Chunk) String() string {
	return protoimpl.X.MessageStringOf(x)
}

func (*Chunk) ProtoMessage() {}

func (x *Chunk) ProtoReflect() protoreflect.Message {
	mi := &file_docreader_proto_msgTypes[6]
	if x != nil {
		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
		if ms.LoadMessageInfo() == nil {
			ms.StoreMessageInfo(mi)
		}
		return ms
	}
	return mi.MessageOf(x)
}

// Deprecated: Use Chunk.ProtoReflect.Descriptor instead.
func (*Chunk) Descriptor() ([]byte, []int) {
	return file_docreader_proto_rawDescGZIP(), []int{6}
}

func (x *Chunk) GetContent() string {
	if x != nil {
		return x.Content
	}
	return ""
}

func (x *Chunk) GetSeq() int32 {
	if x != nil {
		return x.Seq
	}
	return 0
}

func (x *Chunk) GetStart() int32 {
	if x != nil {
		return x.Start
	}
	return 0
}

func (x *Chunk) GetEnd() int32 {
	if x != nil {
		return x.End
	}
	return 0
}

func (x *Chunk) GetImages() []*Image {
	if x != nil {
		return x.Images
	}
	return nil
}

// Document read response
type ReadResponse struct {
	state         protoimpl.MessageState `protogen:"open.v1"`
	Chunks        []*Chunk               `protobuf:"bytes,1,rep,name=chunks,proto3" json:"chunks,omitempty"` // Document chunks
	Error         string                 `protobuf:"bytes,2,opt,name=error,proto3" json:"error,omitempty"`   // Error message
	unknownFields protoimpl.UnknownFields
	sizeCache     protoimpl.SizeCache
}

func (x *ReadResponse) Reset() {
	*x = ReadResponse{}
	mi := &file_docreader_proto_msgTypes[7]
	ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
	ms.StoreMessageInfo(mi)
}

func (x *ReadResponse) String() string {
	return protoimpl.X.MessageStringOf(x)
}

func (*ReadResponse) ProtoMessage() {}

func (x *ReadResponse) ProtoReflect() protoreflect.Message {
	mi := &file_docreader_proto_msgTypes[7]
	if x != nil {
		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
		if ms.LoadMessageInfo() == nil {
			ms.StoreMessageInfo(mi)
		}
		return ms
	}
	return mi.MessageOf(x)
}

// Deprecated: Use ReadResponse.ProtoReflect.Descriptor instead.
func (*ReadResponse) Descriptor() ([]byte, []int) {
	return file_docreader_proto_rawDescGZIP(), []int{7}
}

func (x *ReadResponse) GetChunks() []*Chunk {
	if x != nil {
		return x.Chunks
	}
	return nil
}

func (x *ReadResponse) GetError() string {
	if x != nil {
		return x.Error
	}
	return ""
}

var File_docreader_proto protoreflect.FileDescriptor

const file_docreader_proto_rawDesc = "" +
	"\n" +
	"\x0fdocreader.proto\x12\tdocreader\"\x88\x02\n" +
	"\rStorageConfig\x126\n" +
	"\bprovider\x18\x01 \x01(\x0e2\x1a.docreader.StorageProviderR\bprovider\x12\x16\n" +
	"\x06region\x18\x02 \x01(\tR\x06region\x12\x1f\n" +
	"\vbucket_name\x18\x03 \x01(\tR\n" +
	"bucketName\x12\"\n" +
	"\raccess_key_id\x18\x04 \x01(\tR\vaccessKeyId\x12*\n" +
	"\x11secret_access_key\x18\x05 \x01(\tR\x0fsecretAccessKey\x12\x15\n" +
	"\x06app_id\x18\x06 \x01(\tR\x05appId\x12\x1f\n" +
	"\vpath_prefix\x18\a \x01(\tR\n" +
	"pathPrefix\"\x85\x01\n" +
	"\tVLMConfig\x12\x1d\n" +
	"\n" +
	"model_name\x18\x01 \x01(\tR\tmodelName\x12\x19\n" +
	"\bbase_url\x18\x02 \x01(\tR\abaseUrl\x12\x17\n" +
	"\aapi_key\x18\x03 \x01(\tR\x06apiKey\x12%\n" +
	"\x0einterface_type\x18\x04 \x01(\tR\rinterfaceType\"\x93\x02\n" +
	"\n" +
	"ReadConfig\x12\x1d\n" +
	"\n" +
	"chunk_size\x18\x01 \x01(\x05R\tchunkSize\x12#\n" +
	"\rchunk_overlap\x18\x02 \x01(\x05R\fchunkOverlap\x12\x1e\n" +
	"\n" +
	"separators\x18\x03 \x03(\tR\n" +
	"separators\x12+\n" +
	"\x11enable_multimodal\x18\x04 \x01(\bR\x10enableMultimodal\x12?\n" +
	"\x0estorage_config\x18\x05 \x01(\v2\x18.docreader.StorageConfigR\rstorageConfig\x123\n" +
	"\n" +
	"vlm_config\x18\x06 \x01(\v2\x14.docreader.VLMConfigR\tvlmConfig\"\xc9\x01\n" +
	"\x13ReadFromFileRequest\x12!\n" +
	"\ffile_content\x18\x01 \x01(\fR\vfileContent\x12\x1b\n" +
	"\tfile_name\x18\x02 \x01(\tR\bfileName\x12\x1b\n" +
	"\tfile_type\x18\x03 \x01(\tR\bfileType\x126\n" +
	"\vread_config\x18\x04 \x01(\v2\x15.docreader.ReadConfigR\n" +
	"readConfig\x12\x1d\n" +
	"\n" +
	"request_id\x18\x05 \x01(\tR\trequestId\"\x93\x01\n" +
	"\x12ReadFromURLRequest\x12\x10\n" +
	"\x03url\x18\x01 \x01(\tR\x03url\x12\x14\n" +
	"\x05title\x18\x02 \x01(\tR\x05title\x126\n" +
	"\vread_config\x18\x03 \x01(\v2\x15.docreader.ReadConfigR\n" +
	"readConfig\x12\x1d\n" +
	"\n" +
	"request_id\x18\x04 \x01(\tR\trequestId\"\x99\x01\n" +
	"\x05Image\x12\x10\n" +
	"\x03url\x18\x01 \x01(\tR\x03url\x12\x18\n" +
	"\acaption\x18\x02 \x01(\tR\acaption\x12\x19\n" +
	"\bocr_text\x18\x03 \x01(\tR\aocrText\x12!\n" +
	"\foriginal_url\x18\x04 \x01(\tR\voriginalUrl\x12\x14\n" +
	"\x05start\x18\x05 \x01(\x05R\x05start\x12\x10\n" +
	"\x03end\x18\x06 \x01(\x05R\x03end\"\x85\x01\n" +
	"\x05Chunk\x12\x18\n" +
	"\acontent\x18\x01 \x01(\tR\acontent\x12\x10\n" +
	"\x03seq\x18\x02 \x01(\x05R\x03seq\x12\x14\n" +
	"\x05start\x18\x03 \x01(\x05R\x05start\x12\x10\n" +
	"\x03end\x18\x04 \x01(\x05R\x03end\x12(\n" +
	"\x06images\x18\x05 \x03(\v2\x10.docreader.ImageR\x06images\"N\n" +
	"\fReadResponse\x12(\n" +
	"\x06chunks\x18\x01 \x03(\v2\x10.docreader.ChunkR\x06chunks\x12\x14\n" +
	"\x05error\x18\x02 \x01(\tR\x05error*G\n" +
	"\x0fStorageProvider\x12 \n" +
	"\x1cSTORAGE_PROVIDER_UNSPECIFIED\x10\x00\x12\a\n" +
	"\x03COS\x10\x01\x12\t\n" +
	"\x05MINIO\x10\x022\x9f\x01\n" +
	"\tDocReader\x12I\n" +
	"\fReadFromFile\x12\x1e.docreader.ReadFromFileRequest\x1a\x17.docreader.ReadResponse\"\x00\x12G\n" +
	"\vReadFromURL\x12\x1d.docreader.ReadFromURLRequest\x1a\x17.docreader.ReadResponse\"\x00B5Z3github.com/Tencent/WeKnora/internal/docreader/protob\x06proto3"

var (
	file_docreader_proto_rawDescOnce sync.Once
	file_docreader_proto_rawDescData []byte
)

func file_docreader_proto_rawDescGZIP() []byte {
	file_docreader_proto_rawDescOnce.Do(func() {
		file_docreader_proto_rawDescData = protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_docreader_proto_rawDesc), len(file_docreader_proto_rawDesc)))
	})
	return file_docreader_proto_rawDescData
}

var file_docreader_proto_enumTypes = make([]protoimpl.EnumInfo, 1)
var file_docreader_proto_msgTypes = make([]protoimpl.MessageInfo, 8)
var file_docreader_proto_goTypes = []any{
	(StorageProvider)(0),        // 0: docreader.StorageProvider
	(*StorageConfig)(nil),       // 1: docreader.StorageConfig
	(*VLMConfig)(nil),           // 2: docreader.VLMConfig
	(*ReadConfig)(nil),          // 3: docreader.ReadConfig
	(*ReadFromFileRequest)(nil), // 4: docreader.ReadFromFileRequest
	(*ReadFromURLRequest)(nil),  // 5: docreader.ReadFromURLRequest
	(*Image)(nil),               // 6: docreader.Image
	(*Chunk)(nil),               // 7: docreader.Chunk
	(*ReadResponse)(nil),        // 8: docreader.ReadResponse
}
var file_docreader_proto_depIdxs = []int32{
	0,  // 0: docreader.StorageConfig.provider:type_name -> docreader.StorageProvider
	1,  // 1: docreader.ReadConfig.storage_config:type_name -> docreader.StorageConfig
	2,  // 2: docreader.ReadConfig.vlm_config:type_name -> docreader.VLMConfig
	3,  // 3: docreader.ReadFromFileRequest.read_config:type_name -> docreader.ReadConfig
	3,  // 4: docreader.ReadFromURLRequest.read_config:type_name -> docreader.ReadConfig
	6,  // 5: docreader.Chunk.images:type_name -> docreader.Image
	7,  // 6: docreader.ReadResponse.chunks:type_name -> docreader.Chunk
	4,  // 7: docreader.DocReader.ReadFromFile:input_type -> docreader.ReadFromFileRequest
	5,  // 8: docreader.DocReader.ReadFromURL:input_type -> docreader.ReadFromURLRequest
	8,  // 9: docreader.DocReader.ReadFromFile:output_type -> docreader.ReadResponse
	8,  // 10: docreader.DocReader.ReadFromURL:output_type -> docreader.ReadResponse
	9,  // [9:11] is the sub-list for method output_type
	7,  // [7:9] is the sub-list for method input_type
	7,  // [7:7] is the sub-list for extension type_name
	7,  // [7:7] is the sub-list for extension extendee
	0,  // [0:7] is the sub-list for field type_name
}

func init() { file_docreader_proto_init() }
func file_docreader_proto_init() {
	if File_docreader_proto != nil {
		return
	}
	type x struct{}
	out := protoimpl.TypeBuilder{
		File: protoimpl.DescBuilder{
			GoPackagePath: reflect.TypeOf(x{}).PkgPath(),
			RawDescriptor: unsafe.Slice(unsafe.StringData(file_docreader_proto_rawDesc), len(file_docreader_proto_rawDesc)),
			NumEnums:      1,
			NumMessages:   8,
			NumExtensions: 0,
			NumServices:   1,
		},
		GoTypes:           file_docreader_proto_goTypes,
		DependencyIndexes: file_docreader_proto_depIdxs,
		EnumInfos:         file_docreader_proto_enumTypes,
		MessageInfos:      file_docreader_proto_msgTypes,
	}.Build()
	File_docreader_proto = out.File
	file_docreader_proto_goTypes = nil
	file_docreader_proto_depIdxs = nil
}
89 docreader/proto/docreader.proto Normal file
@@ -0,0 +1,89 @@
syntax = "proto3";

package docreader;

option go_package = "github.com/Tencent/WeKnora/internal/docreader/proto";

// Document reading service
service DocReader {
  // Read a document from a file
  rpc ReadFromFile(ReadFromFileRequest) returns (ReadResponse) {}
  // Read a document from a URL
  rpc ReadFromURL(ReadFromURLRequest) returns (ReadResponse) {}
}

// Object storage provider
enum StorageProvider {
  STORAGE_PROVIDER_UNSPECIFIED = 0;
  COS = 1;   // Tencent Cloud COS
  MINIO = 2; // MinIO / S3-compatible
}

// Generic object storage configuration, compatible with both COS and MinIO
message StorageConfig {
  StorageProvider provider = 1; // Storage provider
  string region = 2;            // Region (used by COS)
  string bucket_name = 3;       // Bucket name
  string access_key_id = 4;     // Access key ID (used by MinIO/S3)
  string secret_access_key = 5; // Access key secret (used by MinIO/S3)
  string app_id = 6;            // Application ID (used by COS)
  string path_prefix = 7;       // Path prefix
}

// VLM configuration
message VLMConfig {
  string model_name = 1;     // VLM model name
  string base_url = 2;       // VLM base URL
  string api_key = 3;        // VLM API key
  string interface_type = 4; // VLM interface type: "ollama" or "openai"
}

message ReadConfig {
  int32 chunk_size = 1;             // Chunk size
  int32 chunk_overlap = 2;          // Chunk overlap
  repeated string separators = 3;   // Separators
  bool enable_multimodal = 4;       // Enable multimodal processing
  StorageConfig storage_config = 5; // Object storage configuration (generic)
  VLMConfig vlm_config = 6;         // VLM configuration
}

// Request to read a document from a file
message ReadFromFileRequest {
  bytes file_content = 1; // File content
  string file_name = 2;   // File name
  string file_type = 3;   // File type
  ReadConfig read_config = 4;
  string request_id = 5;
}

// Request to read a document from a URL
message ReadFromURLRequest {
  string url = 1;   // Document URL
  string title = 2; // Title
  ReadConfig read_config = 3;
  string request_id = 4;
}

// Image information
message Image {
  string url = 1;          // Image URL
  string caption = 2;      // Image caption
  string ocr_text = 3;     // Text extracted via OCR
  string original_url = 4; // Original image URL
  int32 start = 5;         // Start position of the image in the text
  int32 end = 6;           // End position of the image in the text
}

message Chunk {
  string content = 1;        // Chunk content
  int32 seq = 2;             // Order of the chunk within the document
  int32 start = 3;           // Start position of the chunk in the document
  int32 end = 4;             // End position of the chunk in the document
  repeated Image images = 5; // Images contained in the chunk
}

// Document read response
message ReadResponse {
  repeated Chunk chunks = 1; // Document chunks
  string error = 2;          // Error message
}
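For orientation, here is a minimal Python sketch of calling the service defined above through the generated stubs that follow in this commit. The server address, sample file, and chunking parameters are illustrative assumptions, not values taken from this change.

```python
import grpc

# Generated modules from this commit (docreader_pb2.py / docreader_pb2_grpc.py below).
from proto import docreader_pb2, docreader_pb2_grpc

# Assumed endpoint; point this at wherever the DocReader server actually listens.
channel = grpc.insecure_channel("localhost:50051")
stub = docreader_pb2_grpc.DocReaderStub(channel)

with open("testdata/test.md", "rb") as f:
    request = docreader_pb2.ReadFromFileRequest(
        file_content=f.read(),
        file_name="test.md",
        file_type="md",  # assumed extension-style type tag
        read_config=docreader_pb2.ReadConfig(chunk_size=512, chunk_overlap=64),
        request_id="req-example-1",
    )

response = stub.ReadFromFile(request)
if response.error:
    raise RuntimeError(response.error)
for chunk in response.chunks:
    print(chunk.seq, chunk.start, chunk.end, len(chunk.content))
```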
167 docreader/proto/docreader_grpc.pb.go Normal file
@@ -0,0 +1,167 @@
// Code generated by protoc-gen-go-grpc. DO NOT EDIT.
// versions:
// - protoc-gen-go-grpc v1.5.1
// - protoc             v5.29.3
// source: docreader.proto

package proto

import (
	context "context"
	grpc "google.golang.org/grpc"
	codes "google.golang.org/grpc/codes"
	status "google.golang.org/grpc/status"
)

// This is a compile-time assertion to ensure that this generated file
// is compatible with the grpc package it is being compiled against.
// Requires gRPC-Go v1.64.0 or later.
const _ = grpc.SupportPackageIsVersion9

const (
	DocReader_ReadFromFile_FullMethodName = "/docreader.DocReader/ReadFromFile"
	DocReader_ReadFromURL_FullMethodName  = "/docreader.DocReader/ReadFromURL"
)

// DocReaderClient is the client API for DocReader service.
//
// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
//
// Document reading service
type DocReaderClient interface {
	// Read a document from a file
	ReadFromFile(ctx context.Context, in *ReadFromFileRequest, opts ...grpc.CallOption) (*ReadResponse, error)
	// Read a document from a URL
	ReadFromURL(ctx context.Context, in *ReadFromURLRequest, opts ...grpc.CallOption) (*ReadResponse, error)
}

type docReaderClient struct {
	cc grpc.ClientConnInterface
}

func NewDocReaderClient(cc grpc.ClientConnInterface) DocReaderClient {
	return &docReaderClient{cc}
}

func (c *docReaderClient) ReadFromFile(ctx context.Context, in *ReadFromFileRequest, opts ...grpc.CallOption) (*ReadResponse, error) {
	cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
	out := new(ReadResponse)
	err := c.cc.Invoke(ctx, DocReader_ReadFromFile_FullMethodName, in, out, cOpts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *docReaderClient) ReadFromURL(ctx context.Context, in *ReadFromURLRequest, opts ...grpc.CallOption) (*ReadResponse, error) {
	cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
	out := new(ReadResponse)
	err := c.cc.Invoke(ctx, DocReader_ReadFromURL_FullMethodName, in, out, cOpts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

// DocReaderServer is the server API for DocReader service.
// All implementations must embed UnimplementedDocReaderServer
// for forward compatibility.
//
// Document reading service
type DocReaderServer interface {
	// Read a document from a file
	ReadFromFile(context.Context, *ReadFromFileRequest) (*ReadResponse, error)
	// Read a document from a URL
	ReadFromURL(context.Context, *ReadFromURLRequest) (*ReadResponse, error)
	mustEmbedUnimplementedDocReaderServer()
}

// UnimplementedDocReaderServer must be embedded to have
// forward compatible implementations.
//
// NOTE: this should be embedded by value instead of pointer to avoid a nil
// pointer dereference when methods are called.
type UnimplementedDocReaderServer struct{}

func (UnimplementedDocReaderServer) ReadFromFile(context.Context, *ReadFromFileRequest) (*ReadResponse, error) {
	return nil, status.Errorf(codes.Unimplemented, "method ReadFromFile not implemented")
}
func (UnimplementedDocReaderServer) ReadFromURL(context.Context, *ReadFromURLRequest) (*ReadResponse, error) {
	return nil, status.Errorf(codes.Unimplemented, "method ReadFromURL not implemented")
}
func (UnimplementedDocReaderServer) mustEmbedUnimplementedDocReaderServer() {}
func (UnimplementedDocReaderServer) testEmbeddedByValue()                   {}

// UnsafeDocReaderServer may be embedded to opt out of forward compatibility for this service.
// Use of this interface is not recommended, as added methods to DocReaderServer will
// result in compilation errors.
type UnsafeDocReaderServer interface {
	mustEmbedUnimplementedDocReaderServer()
}

func RegisterDocReaderServer(s grpc.ServiceRegistrar, srv DocReaderServer) {
	// If the following call panics, it indicates UnimplementedDocReaderServer was
	// embedded by pointer and is nil. This will cause panics if an
	// unimplemented method is ever invoked, so we test this at initialization
	// time to prevent it from happening at runtime later due to I/O.
	if t, ok := srv.(interface{ testEmbeddedByValue() }); ok {
		t.testEmbeddedByValue()
	}
	s.RegisterService(&DocReader_ServiceDesc, srv)
}

func _DocReader_ReadFromFile_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(ReadFromFileRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(DocReaderServer).ReadFromFile(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: DocReader_ReadFromFile_FullMethodName,
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(DocReaderServer).ReadFromFile(ctx, req.(*ReadFromFileRequest))
	}
	return interceptor(ctx, in, info, handler)
}

func _DocReader_ReadFromURL_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(ReadFromURLRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(DocReaderServer).ReadFromURL(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: DocReader_ReadFromURL_FullMethodName,
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(DocReaderServer).ReadFromURL(ctx, req.(*ReadFromURLRequest))
	}
	return interceptor(ctx, in, info, handler)
}

// DocReader_ServiceDesc is the grpc.ServiceDesc for DocReader service.
// It's only intended for direct use with grpc.RegisterService,
// and not to be introspected or modified (even as a copy)
var DocReader_ServiceDesc = grpc.ServiceDesc{
	ServiceName: "docreader.DocReader",
	HandlerType: (*DocReaderServer)(nil),
	Methods: []grpc.MethodDesc{
		{
			MethodName: "ReadFromFile",
			Handler:    _DocReader_ReadFromFile_Handler,
		},
		{
			MethodName: "ReadFromURL",
			Handler:    _DocReader_ReadFromURL_Handler,
		},
	},
	Streams:  []grpc.StreamDesc{},
	Metadata: "docreader.proto",
}
55 docreader/proto/docreader_pb2.py Normal file
@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# NO CHECKED-IN PROTOBUF GENCODE
# source: docreader.proto
# Protobuf Python Version: 6.31.1
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import runtime_version as _runtime_version
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
_runtime_version.ValidateProtobufRuntimeVersion(
    _runtime_version.Domain.PUBLIC,
    6,
    31,
    1,
    '',
    'docreader.proto'
)
# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()


DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0f\x64ocreader.proto\x12\tdocreader\"\xb9\x01\n\rStorageConfig\x12,\n\x08provider\x18\x01 \x01(\x0e\x32\x1a.docreader.StorageProvider\x12\x0e\n\x06region\x18\x02 \x01(\t\x12\x13\n\x0b\x62ucket_name\x18\x03 \x01(\t\x12\x15\n\raccess_key_id\x18\x04 \x01(\t\x12\x19\n\x11secret_access_key\x18\x05 \x01(\t\x12\x0e\n\x06\x61pp_id\x18\x06 \x01(\t\x12\x13\n\x0bpath_prefix\x18\x07 \x01(\t\"Z\n\tVLMConfig\x12\x12\n\nmodel_name\x18\x01 \x01(\t\x12\x10\n\x08\x62\x61se_url\x18\x02 \x01(\t\x12\x0f\n\x07\x61pi_key\x18\x03 \x01(\t\x12\x16\n\x0einterface_type\x18\x04 \x01(\t\"\xc2\x01\n\nReadConfig\x12\x12\n\nchunk_size\x18\x01 \x01(\x05\x12\x15\n\rchunk_overlap\x18\x02 \x01(\x05\x12\x12\n\nseparators\x18\x03 \x03(\t\x12\x19\n\x11\x65nable_multimodal\x18\x04 \x01(\x08\x12\x30\n\x0estorage_config\x18\x05 \x01(\x0b\x32\x18.docreader.StorageConfig\x12(\n\nvlm_config\x18\x06 \x01(\x0b\x32\x14.docreader.VLMConfig\"\x91\x01\n\x13ReadFromFileRequest\x12\x14\n\x0c\x66ile_content\x18\x01 \x01(\x0c\x12\x11\n\tfile_name\x18\x02 \x01(\t\x12\x11\n\tfile_type\x18\x03 \x01(\t\x12*\n\x0bread_config\x18\x04 \x01(\x0b\x32\x15.docreader.ReadConfig\x12\x12\n\nrequest_id\x18\x05 \x01(\t\"p\n\x12ReadFromURLRequest\x12\x0b\n\x03url\x18\x01 \x01(\t\x12\r\n\x05title\x18\x02 \x01(\t\x12*\n\x0bread_config\x18\x03 \x01(\x0b\x32\x15.docreader.ReadConfig\x12\x12\n\nrequest_id\x18\x04 \x01(\t\"i\n\x05Image\x12\x0b\n\x03url\x18\x01 \x01(\t\x12\x0f\n\x07\x63\x61ption\x18\x02 \x01(\t\x12\x10\n\x08ocr_text\x18\x03 \x01(\t\x12\x14\n\x0coriginal_url\x18\x04 \x01(\t\x12\r\n\x05start\x18\x05 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x06 \x01(\x05\"c\n\x05\x43hunk\x12\x0f\n\x07\x63ontent\x18\x01 \x01(\t\x12\x0b\n\x03seq\x18\x02 \x01(\x05\x12\r\n\x05start\x18\x03 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x04 \x01(\x05\x12 \n\x06images\x18\x05 \x03(\x0b\x32\x10.docreader.Image\"?\n\x0cReadResponse\x12 \n\x06\x63hunks\x18\x01 \x03(\x0b\x32\x10.docreader.Chunk\x12\r\n\x05\x65rror\x18\x02 \x01(\t*G\n\x0fStorageProvider\x12 \n\x1cSTORAGE_PROVIDER_UNSPECIFIED\x10\x00\x12\x07\n\x03\x43OS\x10\x01\x12\t\n\x05MINIO\x10\x02\x32\x9f\x01\n\tDocReader\x12I\n\x0cReadFromFile\x12\x1e.docreader.ReadFromFileRequest\x1a\x17.docreader.ReadResponse\"\x00\x12G\n\x0bReadFromURL\x12\x1d.docreader.ReadFromURLRequest\x1a\x17.docreader.ReadResponse\"\x00\x42\x35Z3github.com/Tencent/WeKnora/internal/docreader/protob\x06proto3')

_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'docreader_pb2', _globals)
if not _descriptor._USE_C_DESCRIPTORS:
  _globals['DESCRIPTOR']._loaded_options = None
  _globals['DESCRIPTOR']._serialized_options = b'Z3github.com/Tencent/WeKnora/internal/docreader/proto'
  _globals['_STORAGEPROVIDER']._serialized_start=1042
  _globals['_STORAGEPROVIDER']._serialized_end=1113
  _globals['_STORAGECONFIG']._serialized_start=31
  _globals['_STORAGECONFIG']._serialized_end=216
  _globals['_VLMCONFIG']._serialized_start=218
  _globals['_VLMCONFIG']._serialized_end=308
  _globals['_READCONFIG']._serialized_start=311
  _globals['_READCONFIG']._serialized_end=505
  _globals['_READFROMFILEREQUEST']._serialized_start=508
  _globals['_READFROMFILEREQUEST']._serialized_end=653
  _globals['_READFROMURLREQUEST']._serialized_start=655
  _globals['_READFROMURLREQUEST']._serialized_end=767
  _globals['_IMAGE']._serialized_start=769
  _globals['_IMAGE']._serialized_end=874
  _globals['_CHUNK']._serialized_start=876
  _globals['_CHUNK']._serialized_end=975
  _globals['_READRESPONSE']._serialized_start=977
  _globals['_READRESPONSE']._serialized_end=1040
  _globals['_DOCREADER']._serialized_start=1116
  _globals['_DOCREADER']._serialized_end=1275
# @@protoc_insertion_point(module_scope)
145 docreader/proto/docreader_pb2_grpc.py Normal file
@@ -0,0 +1,145 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import warnings

from . import docreader_pb2 as docreader__pb2

GRPC_GENERATED_VERSION = '1.76.0'
GRPC_VERSION = grpc.__version__
_version_not_supported = False

try:
    from grpc._utilities import first_version_is_lower
    _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
except ImportError:
    _version_not_supported = True

if _version_not_supported:
    raise RuntimeError(
        f'The grpc package installed is at version {GRPC_VERSION},'
        + ' but the generated code in docreader_pb2_grpc.py depends on'
        + f' grpcio>={GRPC_GENERATED_VERSION}.'
        + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
        + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
    )


class DocReaderStub(object):
    """Document reading service
    """

    def __init__(self, channel):
        """Constructor.

        Args:
            channel: A grpc.Channel.
        """
        self.ReadFromFile = channel.unary_unary(
                '/docreader.DocReader/ReadFromFile',
                request_serializer=docreader__pb2.ReadFromFileRequest.SerializeToString,
                response_deserializer=docreader__pb2.ReadResponse.FromString,
                _registered_method=True)
        self.ReadFromURL = channel.unary_unary(
                '/docreader.DocReader/ReadFromURL',
                request_serializer=docreader__pb2.ReadFromURLRequest.SerializeToString,
                response_deserializer=docreader__pb2.ReadResponse.FromString,
                _registered_method=True)


class DocReaderServicer(object):
    """Document reading service
    """

    def ReadFromFile(self, request, context):
        """Read a document from a file
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def ReadFromURL(self, request, context):
        """Read a document from a URL
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')


def add_DocReaderServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'ReadFromFile': grpc.unary_unary_rpc_method_handler(
                    servicer.ReadFromFile,
                    request_deserializer=docreader__pb2.ReadFromFileRequest.FromString,
                    response_serializer=docreader__pb2.ReadResponse.SerializeToString,
            ),
            'ReadFromURL': grpc.unary_unary_rpc_method_handler(
                    servicer.ReadFromURL,
                    request_deserializer=docreader__pb2.ReadFromURLRequest.FromString,
                    response_serializer=docreader__pb2.ReadResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'docreader.DocReader', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
    server.add_registered_method_handlers('docreader.DocReader', rpc_method_handlers)


# This class is part of an EXPERIMENTAL API.
class DocReader(object):
    """Document reading service
    """

    @staticmethod
    def ReadFromFile(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(
            request,
            target,
            '/docreader.DocReader/ReadFromFile',
            docreader__pb2.ReadFromFileRequest.SerializeToString,
            docreader__pb2.ReadResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)

    @staticmethod
    def ReadFromURL(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(
            request,
            target,
            '/docreader.DocReader/ReadFromURL',
            docreader__pb2.ReadFromURLRequest.SerializeToString,
            docreader__pb2.ReadResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)
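As a server-side counterpart to the client sketch earlier, here is a minimal, hedged example of wiring a servicer into a gRPC server with the helpers above. The echo logic, thread count, and port are purely illustrative and not part of this commit.

```python
from concurrent import futures

import grpc
from proto import docreader_pb2, docreader_pb2_grpc


class EchoServicer(docreader_pb2_grpc.DocReaderServicer):
    # Hypothetical implementation: return the whole file as a single chunk.
    # ReadFromURL is left unimplemented and will answer UNIMPLEMENTED.
    def ReadFromFile(self, request, context):
        text = request.file_content.decode("utf-8", errors="replace")
        chunk = docreader_pb2.Chunk(content=text, seq=0, start=0, end=len(text))
        return docreader_pb2.ReadResponse(chunks=[chunk])


server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
docreader_pb2_grpc.add_DocReaderServicer_to_server(EchoServicer(), server)
server.add_insecure_port("[::]:50051")  # assumed port
server.start()
server.wait_for_termination()
```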
35 docreader/pyproject.toml Normal file
@@ -0,0 +1,35 @@
[project]
name = "docreader"
version = "0.1.0"
description = "Document parsing and chunking gRPC service for WeKnora"
readme = "README.md"
requires-python = ">=3.10.18"
dependencies = [
    "antiword>=0.1.0",
    "asyncio>=4.0.0",
    "beautifulsoup4>=4.14.2",
    "cos-python-sdk-v5>=1.9.38",
    "goose3[all]>=3.1.20",
    "grpcio>=1.76.0",
    "grpcio-health-checking>=1.76.0",
    "grpcio-tools>=1.76.0",
    "lxml>=6.0.2",
    "markdown>=3.10",
    "markdownify>=1.2.0",
    "minio>=7.2.18",
    "mistletoe>=1.5.0",
    "ollama>=0.6.0",
    "openai>=2.7.1",
    "paddleocr>=2.10.0,<3.0.0",
    "paddlepaddle>=3.0.0,<4.0.0",
    "pdfplumber>=0.11.7",
    "pillow>=12.0.0",
    "playwright>=1.55.0",
    "protobuf>=6.33.0",
    "pypdf>=6.1.3",
    "pypdf2>=3.0.1",
    "python-docx>=1.2.0",
    "requests>=2.32.5",
    "textract==1.5.0",
    "urllib3>=2.5.0",
]
70 docreader/scripts/download_deps.py Normal file
@@ -0,0 +1,70 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import os
import logging
from paddleocr import PaddleOCR

# Add the current directory to the Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
if current_dir not in sys.path:
    sys.path.append(current_dir)

# Import ImageParser
from parser.image_parser import ImageParser

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)


def init_ocr_model():
    """Initialize PaddleOCR model to pre-download and cache models"""
    try:
        logger.info("Initializing PaddleOCR model for pre-download...")

        # Use the same configuration as the production code
        ocr_config = {
            "use_gpu": False,
            "text_det_limit_type": "max",
            "text_det_limit_side_len": 960,
            "use_doc_orientation_classify": True,  # Enable document orientation classification
            "use_doc_unwarping": False,
            "use_textline_orientation": True,  # Enable text-line orientation detection
            "text_recognition_model_name": "PP-OCRv4_server_rec",
            "text_detection_model_name": "PP-OCRv4_server_det",
            "text_det_thresh": 0.3,
            "text_det_box_thresh": 0.6,
            "text_det_unclip_ratio": 1.5,
            "text_rec_score_thresh": 0.0,
            "ocr_version": "PP-OCRv4",
            "lang": "ch",
            "show_log": False,
            "use_dilation": True,
            "det_db_score_mode": "slow",
        }

        # Initializing PaddleOCR triggers the model download and caching
        ocr = PaddleOCR(**ocr_config)
        logger.info("PaddleOCR model initialization completed successfully")

        # Run a quick OCR pass to confirm the models work
        import numpy as np
        from PIL import Image

        # Create a simple blank test image
        test_image = np.ones((100, 300, 3), dtype=np.uint8) * 255
        test_pil = Image.fromarray(test_image)

        # Execute one OCR test call
        result = ocr.ocr(np.array(test_pil), cls=False)
        logger.info("PaddleOCR test completed successfully")

    except Exception as e:
        logger.error(f"Failed to initialize PaddleOCR model: {str(e)}")
        raise
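The script above only defines `init_ocr_model`; a typical invocation, assuming it runs from the `scripts/` directory (for example during a Docker image build) so the module is importable under this name, might look like:

```python
# Hypothetical usage; the import path depends on where the script is run from.
from download_deps import init_ocr_model

init_ocr_model()  # downloads and caches the PaddleOCR models once, ahead of serving
```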
31 docreader/scripts/generate_proto.sh Executable file
@@ -0,0 +1,31 @@
#!/bin/bash
set -x

# Set directories
PROTO_DIR="proto"
PYTHON_OUT="proto"
GO_OUT="proto"

# Generate Python code
python3 -m grpc_tools.protoc -I${PROTO_DIR} \
    --python_out=${PYTHON_OUT} \
    --grpc_python_out=${PYTHON_OUT} \
    ${PROTO_DIR}/docreader.proto

# Generate Go code
protoc -I${PROTO_DIR} --go_out=${GO_OUT} \
    --go_opt=paths=source_relative \
    --go-grpc_out=${GO_OUT} \
    --go-grpc_opt=paths=source_relative \
    ${PROTO_DIR}/docreader.proto

# Fix the Python import path (macOS-compatible variant)
if [ "$(uname)" == "Darwin" ]; then
    # macOS version
    sed -i '' 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
else
    # Linux version
    sed -i 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
fi

echo "Proto files generated successfully!"
BIN docreader/testdata/images/test_text.png vendored Normal file
Binary file not shown. (1.8 KiB)
56 docreader/testdata/test.html vendored Normal file
@@ -0,0 +1,56 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>测试 HTML 文档</title>
</head>
<body>
    <h1>测试 HTML 文档</h1>

    <p>这是一个测试 HTML 文档,用于测试 HTML 解析功能。</p>

    <h2>包含图片</h2>
    <img src="https://example.com/image.jpg" alt="测试图片">

    <h2>包含链接</h2>
    <p>这是一个<a href="https://example.com">测试链接</a>。</p>

    <h2>包含代码块</h2>
    <pre><code>
def hello_world():
    print("Hello, World!")
    </code></pre>

    <h2>包含表格</h2>
    <table>
        <thead>
            <tr>
                <th>表头1</th>
                <th>表头2</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>内容1</td>
                <td>内容2</td>
            </tr>
            <tr>
                <td>内容3</td>
                <td>内容4</td>
            </tr>
        </tbody>
    </table>

    <h2>测试分块功能</h2>
    <p>这部分内容用于测试分块功能,确保 HTML 结构在分块时保持完整。</p>
    <ul>
        <li>第一块内容</li>
        <li>第二块内容</li>
        <li>第三块内容</li>
    </ul>

    <h2>测试重叠功能</h2>
    <p>这部分内容可能会在分块时与前后块重叠,以确保上下文的连续性。</p>
</body>
</html>
37 docreader/testdata/test.md vendored Normal file
@@ -0,0 +1,37 @@
# 测试 Markdown 文档

这是一个测试 Markdown 文档,用于测试 Markdown 解析功能。

## 包含图片

![测试图片](https://example.com/image.jpg)

## 包含链接

这是一个[测试链接](https://example.com)。

## 包含代码块

```python
def hello_world():
    print("Hello, World!")
```

## 包含表格

| 表头1 | 表头2 |
|-------|-------|
| 内容1 | 内容2 |
| 内容3 | 内容4 |

## 测试分块功能

这部分内容用于测试分块功能,确保 Markdown 结构在分块时保持完整。

- 第一块内容
- 第二块内容
- 第三块内容

## 测试重叠功能

这部分内容可能会在分块时与前后块重叠,以确保上下文的连续性。
16 docreader/testdata/test.txt vendored Normal file
@@ -0,0 +1,16 @@
这是一个测试文档
包含多行内容
用于测试文档解析功能

这个文档包含以下内容:
1. 基本文本内容
2. 多行段落
3. 列表项

测试分块功能:
- 第一块内容
- 第二块内容
- 第三块内容

测试重叠功能:
这部分内容可能会在分块时与前后块重叠,以确保上下文的连续性。
19 docreader/testdata/test_download.txt vendored Normal file
@@ -0,0 +1,19 @@
这是一个测试文档
包含多行内容
用于测试文档解析功能

这个文档包含以下内容:
1. 基本文本内容
2. 多行段落
3. 列表项

测试分块功能:
- 第一块内容
- 第二块内容
- 第三块内容

测试重叠功能:
这部分内容可能会在分块时与前后块重叠,以确保上下文的连续性。


test
83 docreader/utils/__init__.py Normal file
@@ -0,0 +1,83 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import re
import logging

# Configure logging
logger = logging.getLogger(__name__)


def singleton(cls, *args, **kw):
    instances = {}

    def _singleton():
        # One instance per (class, process) combination
        key = str(cls) + str(os.getpid())
        if key not in instances:
            logger.info(f"Creating new singleton instance with key: {key}")
            instances[key] = cls(*args, **kw)
        else:
            logger.info(f"Returning existing singleton instance with key: {key}")
        return instances[key]

    return _singleton


def rmSpace(txt):
    logger.info(f"Removing spaces from text of length: {len(txt)}")
    txt = re.sub(r"([^a-z0-9.,\)>]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
    return re.sub(r"([^ ]) +([^a-z0-9.,\(<])", r"\1\2", txt, flags=re.IGNORECASE)


def findMaxDt(fnm):
    m = "1970-01-01 00:00:00"
    logger.info(f"Finding maximum date in file: {fnm}")
    try:
        with open(fnm, "r") as f:
            while True:
                l = f.readline()
                if not l:
                    break
                l = l.strip("\n")
                if l == "nan":
                    continue
                if l > m:
                    m = l
        logger.info(f"Maximum date found: {m}")
    except Exception as e:
        logger.error(f"Error reading file {fnm} for max date: {str(e)}")
    return m


def findMaxTm(fnm):
    m = 0
    logger.info(f"Finding maximum time in file: {fnm}")
    try:
        with open(fnm, "r") as f:
            while True:
                l = f.readline()
                if not l:
                    break
                l = l.strip("\n")
                if l == "nan":
                    continue
                if int(l) > m:
                    m = int(l)
        logger.info(f"Maximum time found: {m}")
    except Exception as e:
        logger.error(f"Error reading file {fnm} for max time: {str(e)}")
    return m
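A small sketch of how the `singleton` decorator above behaves; `DummyParser` is a hypothetical class used only for illustration, and the import path assumes the package layout in this commit.

```python
from utils import singleton


@singleton
class DummyParser:
    def __init__(self):
        print("constructed once per process")


a = DummyParser()  # constructs and caches the instance
b = DummyParser()  # reuses the cached instance
assert a is b      # one instance per (class, pid) key
```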
149 docreader/utils/request.py Normal file
@@ -0,0 +1,149 @@
from contextvars import ContextVar
import logging
import uuid
import contextlib
import time
from typing import Optional
from logging import LogRecord

# Configure logging
logger = logging.getLogger(__name__)

# Context variables
request_id_var = ContextVar("request_id", default=None)
_request_start_time_ctx = ContextVar("request_start_time", default=None)


def set_request_id(request_id: str) -> None:
    """Set the request ID for the current context"""
    request_id_var.set(request_id)


def get_request_id() -> Optional[str]:
    """Get the request ID of the current context"""
    return request_id_var.get()


class MillisecondFormatter(logging.Formatter):
    """Custom formatter that shows millisecond timestamps (3 digits) instead of microseconds (6 digits)"""

    def formatTime(self, record, datefmt=None):
        """Override formatTime to truncate microseconds to milliseconds"""
        # Get the standard formatted time first
        result = super().formatTime(record, datefmt)

        # If the format contains .%f, truncate the 6-digit microseconds to 3-digit milliseconds
        if datefmt and ".%f" in datefmt:
            # The formatted string should end with a 6-digit microsecond part
            parts = result.split('.')
            if len(parts) > 1 and len(parts[1]) >= 6:
                # Keep only the first 3 digits as milliseconds
                millis = parts[1][:3]
                result = f"{parts[0]}.{millis}"

        return result


def init_logging_request_id():
    """
    Initialize logging to include request ID in log messages.
    Add the custom filter to all existing handlers
    """
    logger.info("Initializing request ID logging")
    root_logger = logging.getLogger()

    # Add the custom filter to every handler
    for handler in root_logger.handlers:
        # Add the request ID filter
        handler.addFilter(RequestIdFilter())

        # Update the formatter to include the request ID in a compact, aligned layout
        formatter = logging.Formatter(
            fmt="%(asctime)s.%(msecs)03d [%(request_id)s] %(levelname)-5s %(name)-20s | %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
        handler.setFormatter(formatter)

    logger.info(
        f"Updated {len(root_logger.handlers)} handlers with request ID formatting"
    )

    # If there are no handlers yet, add a stdout handler
    if not root_logger.handlers:
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            fmt="%(asctime)s.%(msecs)03d [%(request_id)s] %(levelname)-5s %(name)-20s | %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
        handler.setFormatter(formatter)
        handler.addFilter(RequestIdFilter())
        root_logger.addHandler(handler)
        logger.info("Added new StreamHandler with request ID formatting")


class RequestIdFilter(logging.Filter):
    """Filter that adds request ID to log messages"""

    def filter(self, record: LogRecord) -> bool:
        request_id = request_id_var.get()
        if request_id is not None:
            # Attach a short-form request ID to the log record
            if len(request_id) > 8:
                # Take the first 8 characters so the output stays aligned
                short_id = request_id[:8]
                if "-" in request_id:
                    # Try to keep the structure, e.g. test-req-1-XXX
                    parts = request_id.split("-")
                    if len(parts) >= 3:
                        # For IDs shaped like xxx-xxx-n-randompart
                        short_id = f"{parts[0]}-{parts[1]}-{parts[2]}"
                record.request_id = short_id
            else:
                record.request_id = request_id

            # Attach the elapsed-time attribute
            start_time = _request_start_time_ctx.get()
            if start_time is not None:
                elapsed_ms = int((time.time() - start_time) * 1000)
                record.elapsed_ms = elapsed_ms
                # Append the elapsed time to the message
                if not hasattr(record, "message_with_elapsed"):
                    record.message_with_elapsed = True
                    record.msg = f"{record.msg} (elapsed: {elapsed_ms}ms)"
        else:
            # Use a placeholder when there is no request ID
            record.request_id = "no-req-id"

        return True


@contextlib.contextmanager
def request_id_context(request_id: str = None):
    """Context manager that sets a request ID for the current context

    Args:
        request_id: The request ID to use; a new one is generated if None

    Example:
        with request_id_context("req-123"):
            # All logs inside this block carry the request ID req-123
            logging.info("Processing request")
    """
    # Generate or use provided request ID
    req_id = request_id or str(uuid.uuid4())

    # Set start time and request ID
    start_time = time.time()
    req_token = request_id_var.set(req_id)
    time_token = _request_start_time_ctx.set(start_time)

    logger.info(f"Starting new request with ID: {req_id}")

    try:
        yield request_id_var.get()
    finally:
        # Log completion and reset context vars
        elapsed_ms = int((time.time() - start_time) * 1000)
        logger.info(f"Request {req_id} completed in {elapsed_ms}ms")
        request_id_var.reset(req_token)
        _request_start_time_ctx.reset(time_token)
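Putting the pieces above together, a minimal usage sketch; the logger name and request ID are illustrative, and the import path assumes this commit's package layout:

```python
import logging

from utils.request import init_logging_request_id, request_id_context

logging.basicConfig(level=logging.INFO)
init_logging_request_id()

with request_id_context("test-req-1-abc123"):
    # RequestIdFilter shortens the ID to "test-req-1" and appends the elapsed
    # time, e.g. "... [test-req-1] INFO  demo | parsing document (elapsed: 0ms)".
    logging.getLogger("demo").info("parsing document")
```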
3228 docreader/uv.lock generated Normal file
File diff suppressed because it is too large.