From 7775559a9bca75d45fade585ac7837501eef5d88 Mon Sep 17 00:00:00 2001 From: wizardchen Date: Wed, 10 Sep 2025 00:43:46 +0800 Subject: [PATCH] feat: use paddle ocr v4 instead --- docker-compose.yml | 3 - services/docreader/requirements.txt | 4 +- services/docreader/src/parser/ocr_engine.py | 103 ++++++++-------- services/docreader/src/server/server.py | 123 +++++--------------- 4 files changed, 85 insertions(+), 148 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 7ebfe9b..be99ac4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -129,9 +129,6 @@ services: - MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME} - MINIO_USE_SSL=${MINIO_USE_SSL} - WEB_PROXY=${WEB_PROXY} - - GRPC_ENABLE_FORK_SUPPORT=1 - - GRPC_WORKER_PROCESSES=1 - - GRPC_MAX_WORKERS=4 networks: - WeKnora-network restart: unless-stopped diff --git a/services/docreader/requirements.txt b/services/docreader/requirements.txt index ef16eac..d6e6afa 100644 --- a/services/docreader/requirements.txt +++ b/services/docreader/requirements.txt @@ -13,7 +13,7 @@ urllib3 markdownify mistletoe goose3[all] -paddleocr==3.2.0 +paddleocr>=2.10.0,<3.0.0 markdown pypdf cos-python-sdk-v5 @@ -25,7 +25,7 @@ ollama pdfplumber --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cpu/ -paddlepaddle==3.2.0 +paddlepaddle>=3.0.0,<4.0.0 # --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ # paddlepaddle-gpu==3.0.0 \ No newline at end of file diff --git a/services/docreader/src/parser/ocr_engine.py b/services/docreader/src/parser/ocr_engine.py index df4dd81..f46ce90 100644 --- a/services/docreader/src/parser/ocr_engine.py +++ b/services/docreader/src/parser/ocr_engine.py @@ -33,30 +33,40 @@ class PaddleOCRBackend(OCRBackend): """Initialize PaddleOCR backend""" self.ocr = None try: + import os + import paddle + + # Set PaddlePaddle to use CPU and disable GPU + os.environ['CUDA_VISIBLE_DEVICES'] = '' + paddle.set_device('cpu') + from paddleocr import PaddleOCR - # Default OCR configuration + # Simplified OCR configuration ocr_config = { - "text_det_limit_type": "max", # Change from 'min' to 'max' - "text_det_limit_side_len": 960, # A standard and safe limit for the longest side - "use_doc_orientation_classify": False, # Do not use document image orientation classification - "use_doc_unwarping": False, # Do not use document unwarping - "use_textline_orientation": False, # Do not use textline orientation classification - "text_recognition_model_name": "PP-OCRv5_server_rec", - "text_detection_model_name": "PP-OCRv5_server_det", - "text_recognition_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_rec_infer", - "text_detection_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_det_infer", - "text_det_thresh": 0.3, # Text detection pixel threshold - "text_det_box_thresh": 0.6, # Text detection box threshold - "text_det_unclip_ratio": 1.5, # Text detection expansion ratio - "text_rec_score_thresh": 0.0, # Text recognition confidence threshold - "ocr_version": "PP-OCRv5", # Switch to PP-OCRv4 here to compare + "use_gpu": False, + "text_det_limit_type": "max", + "text_det_limit_side_len": 960, + "use_doc_orientation_classify": False, + "use_doc_unwarping": False, + "use_textline_orientation": False, + "text_recognition_model_name": "PP-OCRv4_server_rec", + "text_detection_model_name": "PP-OCRv4_server_det", + "text_det_thresh": 0.3, + "text_det_box_thresh": 0.6, + "text_det_unclip_ratio": 1.5, + "text_rec_score_thresh": 0.0, + "ocr_version": "PP-OCRv4", "lang": "ch", + "show_log": False, + "use_dilation": True, # improves accuracy + "det_db_score_mode": "slow", # improves accuracy } self.ocr = PaddleOCR(**ocr_config) logger.info("PaddleOCR engine initialized successfully") - except ImportError: - logger.error("Failed to import paddleocr. Please install it with 'pip install paddleocr'") + + except ImportError as e: + logger.error(f"Failed to import paddleocr: {str(e)}. Please install it with 'pip install paddleocr'") except Exception as e: logger.error(f"Failed to initialize PaddleOCR: {str(e)}") @@ -71,50 +81,39 @@ class PaddleOCRBackend(OCRBackend): """ try: # Ensure image is in RGB format + if hasattr(image, "convert") and image.mode != "RGB": + image = image.convert("RGB") + + # Convert to numpy array if needed if hasattr(image, "convert"): - if image.mode == "RGBA": - img_for_ocr = image.convert("RGB") # 尝试转换为 RGB - logger.info(f"Converted image from RGBA to RGB format for OCR.") - elif image.mode != "RGB": # 如果不是 RGBA 也不是 RGB,也尝试转 RGB - img_for_ocr = image.convert("RGB") - logger.info(f"Converted image from {image.mode} to RGB format for OCR.") - else: - img_for_ocr = image - logger.info(f"Image already in RGB format.") + image_array = np.array(image) else: - img_for_ocr = image - logger.info(f"Image is not a PIL.Image object, assuming it's already suitable for OCR.") + image_array = image - # Convert to numpy array if not already - if hasattr(img_for_ocr, "convert"): - image_array = np.array(img_for_ocr) - else: - image_array = img_for_ocr - - ocr_result = self.ocr.predict(image_array) + # Perform OCR + ocr_result = self.ocr.ocr(image_array, cls=False) # Extract text - if ocr_result and any(ocr_result): - ocr_text = "" - for image_result in ocr_result: - ocr_text = ocr_text + " ".join(image_result["rec_texts"]) - text_length = len(ocr_text) - if text_length > 0: - logger.info(f"OCR extracted {text_length} characters") - logger.info( - f"OCR text sample: {ocr_text[:100]}..." - if text_length > 100 - else f"OCR text: {ocr_text}" - ) - return ocr_text - else: - logger.warning("OCR returned empty result") + ocr_text = "" + if ocr_result and ocr_result[0]: + for line in ocr_result[0]: + if line and len(line) >= 2: + text = line[1][0] if line[1] else "" + if text: + ocr_text += text + " " + + text_length = len(ocr_text.strip()) + if text_length > 0: + logger.info(f"OCR extracted {text_length} characters") + return ocr_text.strip() else: - logger.warning("OCR did not return any result") - return "" + logger.warning("OCR returned empty result") + return "" + except Exception as e: logger.error(f"OCR recognition error: {str(e)}") return "" + class NanonetsOCRBackend(OCRBackend): """Nanonets OCR backend implementation using OpenAI API format""" diff --git a/services/docreader/src/server/server.py b/services/docreader/src/server/server.py index 828772f..988ba36 100644 --- a/services/docreader/src/server/server.py +++ b/services/docreader/src/server/server.py @@ -5,9 +5,7 @@ from concurrent import futures import traceback import grpc import uuid - -# Enable gRPC fork support to avoid multiprocessing issues -os.environ.setdefault('GRPC_ENABLE_FORK_SUPPORT', '1') +import atexit # Add parent directory to Python path current_dir = os.path.dirname(os.path.abspath(__file__)) @@ -327,104 +325,47 @@ def init_ocr_engine(ocr_backend, ocr_config): logger.error(f"Error initializing OCR engine: {str(e)}") return False + def serve(): + init_ocr_engine(os.getenv("OCR_BACKEND", "paddle"), { "OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""), }) - # Set max number of worker threads and processes + + # Set max number of worker threads max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4")) - # Force single process mode to avoid gRPC multiprocessing issues - worker_processes = 1 - logger.info(f"Starting DocReader service, max worker threads per process: {max_workers}, " - f"processes: {worker_processes} (forced single process mode)") + logger.info(f"Starting DocReader service with {max_workers} worker threads") # Get port number port = os.environ.get("GRPC_PORT", "50051") - # Multi-process mode (disabled due to gRPC fork issues) - if False and worker_processes > 1: - import multiprocessing - processes = [] - - def run_server(): - # Create server - server = grpc.server( - futures.ThreadPoolExecutor(max_workers=max_workers), - options=[ - ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH), - ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH), - ('grpc.enable_fork_support', 1), - ('grpc.so_reuseport', 1), - ], - ) - - # Register service - docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server) - - # Set listen address - server.add_insecure_port(f"[::]:{port}") - - # Start service - server.start() - - logger.info(f"Worker process {os.getpid()} started on port {port}") - - try: - # Wait for service termination - server.wait_for_termination() - except KeyboardInterrupt: - logger.info(f"Worker process {os.getpid()} received termination signal") - server.stop(0) - - # Start specified number of worker processes - for i in range(worker_processes): - process = multiprocessing.Process(target=run_server) - processes.append(process) - process.start() - logger.info(f"Started worker process {process.pid} ({i+1}/{worker_processes})") - - # Wait for all processes to complete - try: - for process in processes: - process.join() - except KeyboardInterrupt: - logger.info("Master process received termination signal") - for process in processes: - if process.is_alive(): - logger.info(f"Terminating worker process {process.pid}") - process.terminate() + # Create server + server = grpc.server( + futures.ThreadPoolExecutor(max_workers=max_workers), + options=[ + ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH), + ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH), + ], + ) - # Single-process mode - else: - # Create server - server = grpc.server( - futures.ThreadPoolExecutor(max_workers=max_workers), - options=[ - ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH), - ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH), - ('grpc.enable_fork_support', 1), - ('grpc.so_reuseport', 1), - ], - ) - - # Register service - docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server) - - # Set listen address - server.add_insecure_port(f"[::]:{port}") - - # Start service - server.start() - - logger.info(f"Server started on port {port} (single process mode)") - logger.info("Server is ready to accept connections") - - try: - # Wait for service termination - server.wait_for_termination() - except KeyboardInterrupt: - logger.info("Received termination signal, shutting down server") - server.stop(0) + # Register service + docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server) + + # Set listen address + server.add_insecure_port(f"[::]:{port}") + + # Start service + server.start() + + logger.info(f"Server started on port {port}") + logger.info("Server is ready to accept connections") + + try: + # Wait for service termination + server.wait_for_termination() + except KeyboardInterrupt: + logger.info("Received termination signal, shutting down server") + server.stop(0) if __name__ == "__main__": serve()