Mirror of https://github.com/Tencent/WeKnora.git (synced 2025-11-25 03:15:00 +08:00)
feat: use paddle ocr v4 instead
This commit is contained in:
@@ -129,9 +129,6 @@ services:
|
||||
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME}
|
||||
- MINIO_USE_SSL=${MINIO_USE_SSL}
|
||||
- WEB_PROXY=${WEB_PROXY}
|
||||
- GRPC_ENABLE_FORK_SUPPORT=1
|
||||
- GRPC_WORKER_PROCESSES=1
|
||||
- GRPC_MAX_WORKERS=4
|
||||
networks:
|
||||
- WeKnora-network
|
||||
restart: unless-stopped
|
||||
|
||||
@@ -13,7 +13,7 @@ urllib3
|
||||
markdownify
|
||||
mistletoe
|
||||
goose3[all]
|
||||
paddleocr==3.2.0
|
||||
paddleocr>=2.10.0,<3.0.0
|
||||
markdown
|
||||
pypdf
|
||||
cos-python-sdk-v5
|
||||
@@ -25,7 +25,7 @@ ollama
|
||||
pdfplumber
|
||||
|
||||
--extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
||||
paddlepaddle==3.2.0
|
||||
paddlepaddle>=3.0.0,<4.0.0
|
||||
|
||||
# --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
|
||||
# paddlepaddle-gpu==3.0.0
|
||||
@@ -33,30 +33,40 @@ class PaddleOCRBackend(OCRBackend):
|
||||
"""Initialize PaddleOCR backend"""
|
||||
self.ocr = None
|
||||
try:
|
||||
import os
|
||||
import paddle
|
||||
|
||||
# Set PaddlePaddle to use CPU and disable GPU
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = ''
|
||||
paddle.set_device('cpu')
|
||||
|
||||
from paddleocr import PaddleOCR
|
||||
# Default OCR configuration
|
||||
# Simplified OCR configuration
|
||||
ocr_config = {
|
||||
"text_det_limit_type": "max", # Change from 'min' to 'max'
|
||||
"text_det_limit_side_len": 960, # A standard and safe limit for the longest side
|
||||
"use_doc_orientation_classify": False, # Do not use document image orientation classification
|
||||
"use_doc_unwarping": False, # Do not use document unwarping
|
||||
"use_textline_orientation": False, # Do not use textline orientation classification
|
||||
"text_recognition_model_name": "PP-OCRv5_server_rec",
|
||||
"text_detection_model_name": "PP-OCRv5_server_det",
|
||||
"text_recognition_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_rec_infer",
|
||||
"text_detection_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_det_infer",
|
||||
"text_det_thresh": 0.3, # Text detection pixel threshold
|
||||
"text_det_box_thresh": 0.6, # Text detection box threshold
|
||||
"text_det_unclip_ratio": 1.5, # Text detection expansion ratio
|
||||
"text_rec_score_thresh": 0.0, # Text recognition confidence threshold
|
||||
"ocr_version": "PP-OCRv5", # Switch to PP-OCRv4 here to compare
|
||||
"use_gpu": False,
|
||||
"text_det_limit_type": "max",
|
||||
"text_det_limit_side_len": 960,
|
||||
"use_doc_orientation_classify": False,
|
||||
"use_doc_unwarping": False,
|
||||
"use_textline_orientation": False,
|
||||
"text_recognition_model_name": "PP-OCRv4_server_rec",
|
||||
"text_detection_model_name": "PP-OCRv4_server_det",
|
||||
"text_det_thresh": 0.3,
|
||||
"text_det_box_thresh": 0.6,
|
||||
"text_det_unclip_ratio": 1.5,
|
||||
"text_rec_score_thresh": 0.0,
|
||||
"ocr_version": "PP-OCRv4",
|
||||
"lang": "ch",
|
||||
"show_log": False,
|
||||
"use_dilation": True, # improves accuracy
|
||||
"det_db_score_mode": "slow", # improves accuracy
|
||||
}
|
||||
|
||||
self.ocr = PaddleOCR(**ocr_config)
|
||||
logger.info("PaddleOCR engine initialized successfully")
|
||||
except ImportError:
|
||||
logger.error("Failed to import paddleocr. Please install it with 'pip install paddleocr'")
|
||||
|
||||
except ImportError as e:
|
||||
logger.error(f"Failed to import paddleocr: {str(e)}. Please install it with 'pip install paddleocr'")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize PaddleOCR: {str(e)}")
|
||||
|
||||
@@ -71,50 +81,39 @@ class PaddleOCRBackend(OCRBackend):
|
||||
"""
|
||||
try:
|
||||
# Ensure image is in RGB format
|
||||
if hasattr(image, "convert") and image.mode != "RGB":
|
||||
image = image.convert("RGB")
|
||||
|
||||
# Convert to numpy array if needed
|
||||
if hasattr(image, "convert"):
|
||||
if image.mode == "RGBA":
|
||||
img_for_ocr = image.convert("RGB") # 尝试转换为 RGB
|
||||
logger.info(f"Converted image from RGBA to RGB format for OCR.")
|
||||
elif image.mode != "RGB": # 如果不是 RGBA 也不是 RGB,也尝试转 RGB
|
||||
img_for_ocr = image.convert("RGB")
|
||||
logger.info(f"Converted image from {image.mode} to RGB format for OCR.")
|
||||
else:
|
||||
img_for_ocr = image
|
||||
logger.info(f"Image already in RGB format.")
|
||||
image_array = np.array(image)
|
||||
else:
|
||||
img_for_ocr = image
|
||||
logger.info(f"Image is not a PIL.Image object, assuming it's already suitable for OCR.")
|
||||
image_array = image
|
||||
|
||||
# Convert to numpy array if not already
|
||||
if hasattr(img_for_ocr, "convert"):
|
||||
image_array = np.array(img_for_ocr)
|
||||
else:
|
||||
image_array = img_for_ocr
|
||||
|
||||
ocr_result = self.ocr.predict(image_array)
|
||||
# Perform OCR
|
||||
ocr_result = self.ocr.ocr(image_array, cls=False)
|
||||
|
||||
# Extract text
|
||||
if ocr_result and any(ocr_result):
|
||||
ocr_text = ""
|
||||
for image_result in ocr_result:
|
||||
ocr_text = ocr_text + " ".join(image_result["rec_texts"])
|
||||
text_length = len(ocr_text)
|
||||
if text_length > 0:
|
||||
logger.info(f"OCR extracted {text_length} characters")
|
||||
logger.info(
|
||||
f"OCR text sample: {ocr_text[:100]}..."
|
||||
if text_length > 100
|
||||
else f"OCR text: {ocr_text}"
|
||||
)
|
||||
return ocr_text
|
||||
else:
|
||||
logger.warning("OCR returned empty result")
|
||||
ocr_text = ""
|
||||
if ocr_result and ocr_result[0]:
|
||||
for line in ocr_result[0]:
|
||||
if line and len(line) >= 2:
|
||||
text = line[1][0] if line[1] else ""
|
||||
if text:
|
||||
ocr_text += text + " "
|
||||
|
||||
text_length = len(ocr_text.strip())
|
||||
if text_length > 0:
|
||||
logger.info(f"OCR extracted {text_length} characters")
|
||||
return ocr_text.strip()
|
||||
else:
|
||||
logger.warning("OCR did not return any result")
|
||||
return ""
|
||||
logger.warning("OCR returned empty result")
|
||||
return ""
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"OCR recognition error: {str(e)}")
|
||||
return ""
|
||||
|
||||
class NanonetsOCRBackend(OCRBackend):
|
||||
"""Nanonets OCR backend implementation using OpenAI API format"""
|
||||
|
||||
|
||||
@@ -5,9 +5,7 @@ from concurrent import futures
|
||||
import traceback
|
||||
import grpc
|
||||
import uuid
|
||||
|
||||
# Enable gRPC fork support to avoid multiprocessing issues
|
||||
os.environ.setdefault('GRPC_ENABLE_FORK_SUPPORT', '1')
|
||||
import atexit
|
||||
|
||||
# Add parent directory to Python path
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
@@ -327,104 +325,47 @@ def init_ocr_engine(ocr_backend, ocr_config):
|
||||
logger.error(f"Error initializing OCR engine: {str(e)}")
|
||||
return False
|
||||
|
||||
|
||||
def serve():
|
||||
|
||||
init_ocr_engine(os.getenv("OCR_BACKEND", "paddle"), {
|
||||
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
|
||||
})
|
||||
# Set max number of worker threads and processes
|
||||
|
||||
# Set max number of worker threads
|
||||
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
|
||||
# Force single process mode to avoid gRPC multiprocessing issues
|
||||
worker_processes = 1
|
||||
logger.info(f"Starting DocReader service, max worker threads per process: {max_workers}, "
|
||||
f"processes: {worker_processes} (forced single process mode)")
|
||||
logger.info(f"Starting DocReader service with {max_workers} worker threads")
|
||||
|
||||
# Get port number
|
||||
port = os.environ.get("GRPC_PORT", "50051")
|
||||
|
||||
# Multi-process mode (disabled due to gRPC fork issues)
|
||||
if False and worker_processes > 1:
|
||||
import multiprocessing
|
||||
processes = []
|
||||
|
||||
def run_server():
|
||||
# Create server
|
||||
server = grpc.server(
|
||||
futures.ThreadPoolExecutor(max_workers=max_workers),
|
||||
options=[
|
||||
('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
|
||||
('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
|
||||
('grpc.enable_fork_support', 1),
|
||||
('grpc.so_reuseport', 1),
|
||||
],
|
||||
)
|
||||
|
||||
# Register service
|
||||
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
|
||||
|
||||
# Set listen address
|
||||
server.add_insecure_port(f"[::]:{port}")
|
||||
|
||||
# Start service
|
||||
server.start()
|
||||
|
||||
logger.info(f"Worker process {os.getpid()} started on port {port}")
|
||||
|
||||
try:
|
||||
# Wait for service termination
|
||||
server.wait_for_termination()
|
||||
except KeyboardInterrupt:
|
||||
logger.info(f"Worker process {os.getpid()} received termination signal")
|
||||
server.stop(0)
|
||||
|
||||
# Start specified number of worker processes
|
||||
for i in range(worker_processes):
|
||||
process = multiprocessing.Process(target=run_server)
|
||||
processes.append(process)
|
||||
process.start()
|
||||
logger.info(f"Started worker process {process.pid} ({i+1}/{worker_processes})")
|
||||
|
||||
# Wait for all processes to complete
|
||||
try:
|
||||
for process in processes:
|
||||
process.join()
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Master process received termination signal")
|
||||
for process in processes:
|
||||
if process.is_alive():
|
||||
logger.info(f"Terminating worker process {process.pid}")
|
||||
process.terminate()
|
||||
# Create server
|
||||
server = grpc.server(
|
||||
futures.ThreadPoolExecutor(max_workers=max_workers),
|
||||
options=[
|
||||
('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
|
||||
('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
|
||||
],
|
||||
)
|
||||
|
||||
# Single-process mode
|
||||
else:
|
||||
# Create server
|
||||
server = grpc.server(
|
||||
futures.ThreadPoolExecutor(max_workers=max_workers),
|
||||
options=[
|
||||
('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
|
||||
('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
|
||||
('grpc.enable_fork_support', 1),
|
||||
('grpc.so_reuseport', 1),
|
||||
],
|
||||
)
|
||||
|
||||
# Register service
|
||||
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
|
||||
|
||||
# Set listen address
|
||||
server.add_insecure_port(f"[::]:{port}")
|
||||
|
||||
# Start service
|
||||
server.start()
|
||||
|
||||
logger.info(f"Server started on port {port} (single process mode)")
|
||||
logger.info("Server is ready to accept connections")
|
||||
|
||||
try:
|
||||
# Wait for service termination
|
||||
server.wait_for_termination()
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Received termination signal, shutting down server")
|
||||
server.stop(0)
|
||||
# Register service
|
||||
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
|
||||
|
||||
# Set listen address
|
||||
server.add_insecure_port(f"[::]:{port}")
|
||||
|
||||
# Start service
|
||||
server.start()
|
||||
|
||||
logger.info(f"Server started on port {port}")
|
||||
logger.info("Server is ready to accept connections")
|
||||
|
||||
try:
|
||||
# Wait for service termination
|
||||
server.wait_for_termination()
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Received termination signal, shutting down server")
|
||||
server.stop(0)
|
||||
|
||||
if __name__ == "__main__":
|
||||
serve()
|
||||
|
||||
Reference in New Issue
Block a user