feat: use PaddleOCR PP-OCRv4 instead of PP-OCRv5

wizardchen
2025-09-10 00:43:46 +08:00
committed by lyingbug
parent 2b6cbee1b6
commit 7775559a9b
4 changed files with 85 additions and 148 deletions

View File

@@ -129,9 +129,6 @@ services:
       - MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME}
       - MINIO_USE_SSL=${MINIO_USE_SSL}
       - WEB_PROXY=${WEB_PROXY}
-      - GRPC_ENABLE_FORK_SUPPORT=1
-      - GRPC_WORKER_PROCESSES=1
-      - GRPC_MAX_WORKERS=4
     networks:
       - WeKnora-network
     restart: unless-stopped
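These variables are dropped because the server no longer uses the multi-process/fork code path (see the last file in this diff); GRPC_MAX_WORKERS now simply falls back to the default read in `serve()`. A one-line illustration, assuming the variable is absent from the container environment:

```python
import os

# GRPC_MAX_WORKERS is no longer set by docker-compose, so the fallback applies
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))  # -> 4
```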

View File

@@ -13,7 +13,7 @@ urllib3
 markdownify
 mistletoe
 goose3[all]
-paddleocr==3.2.0
+paddleocr>=2.10.0,<3.0.0
 markdown
 pypdf
 cos-python-sdk-v5
@@ -25,7 +25,7 @@ ollama
 pdfplumber
 --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cpu/
-paddlepaddle==3.2.0
+paddlepaddle>=3.0.0,<4.0.0
 # --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
 # paddlepaddle-gpu==3.0.0
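For context, the paddleocr 2.x line pinned above is the one that still ships the classic `PaddleOCR(...).ocr(...)` entry point used by the backend below; 3.x moved to the `predict()` pipeline API. A minimal post-install sanity check (a sketch, not part of this commit) could look like:

```python
# Hedged sketch: confirm the resolved versions satisfy the new constraints
# (paddleocr >=2.10.0,<3.0.0 and paddlepaddle >=3.0.0,<4.0.0).
from importlib.metadata import version

print("paddleocr:", version("paddleocr"))
print("paddlepaddle:", version("paddlepaddle"))
```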

View File

@@ -33,30 +33,40 @@ class PaddleOCRBackend(OCRBackend):
"""Initialize PaddleOCR backend"""
self.ocr = None
try:
import os
import paddle
# Set PaddlePaddle to use CPU and disable GPU
os.environ['CUDA_VISIBLE_DEVICES'] = ''
paddle.set_device('cpu')
from paddleocr import PaddleOCR
# Default OCR configuration
# Simplified OCR configuration
ocr_config = {
"text_det_limit_type": "max", # Change from 'min' to 'max'
"text_det_limit_side_len": 960, # A standard and safe limit for the longest side
"use_doc_orientation_classify": False, # Do not use document image orientation classification
"use_doc_unwarping": False, # Do not use document unwarping
"use_textline_orientation": False, # Do not use textline orientation classification
"text_recognition_model_name": "PP-OCRv5_server_rec",
"text_detection_model_name": "PP-OCRv5_server_det",
"text_recognition_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_rec_infer",
"text_detection_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_det_infer",
"text_det_thresh": 0.3, # Text detection pixel threshold
"text_det_box_thresh": 0.6, # Text detection box threshold
"text_det_unclip_ratio": 1.5, # Text detection expansion ratio
"text_rec_score_thresh": 0.0, # Text recognition confidence threshold
"ocr_version": "PP-OCRv5", # Switch to PP-OCRv4 here to compare
"use_gpu": False,
"text_det_limit_type": "max",
"text_det_limit_side_len": 960,
"use_doc_orientation_classify": False,
"use_doc_unwarping": False,
"use_textline_orientation": False,
"text_recognition_model_name": "PP-OCRv4_server_rec",
"text_detection_model_name": "PP-OCRv4_server_det",
"text_det_thresh": 0.3,
"text_det_box_thresh": 0.6,
"text_det_unclip_ratio": 1.5,
"text_rec_score_thresh": 0.0,
"ocr_version": "PP-OCRv4",
"lang": "ch",
"show_log": False,
"use_dilation": True, # improves accuracy
"det_db_score_mode": "slow", # improves accuracy
}
self.ocr = PaddleOCR(**ocr_config)
logger.info("PaddleOCR engine initialized successfully")
except ImportError:
logger.error("Failed to import paddleocr. Please install it with 'pip install paddleocr'")
except ImportError as e:
logger.error(f"Failed to import paddleocr: {str(e)}. Please install it with 'pip install paddleocr'")
except Exception as e:
logger.error(f"Failed to initialize PaddleOCR: {str(e)}")
@@ -71,50 +81,39 @@ class PaddleOCRBackend(OCRBackend):
"""
try:
# Ensure image is in RGB format
if hasattr(image, "convert") and image.mode != "RGB":
image = image.convert("RGB")
# Convert to numpy array if needed
if hasattr(image, "convert"):
if image.mode == "RGBA":
img_for_ocr = image.convert("RGB") # 尝试转换为 RGB
logger.info(f"Converted image from RGBA to RGB format for OCR.")
elif image.mode != "RGB": # 如果不是 RGBA 也不是 RGB也尝试转 RGB
img_for_ocr = image.convert("RGB")
logger.info(f"Converted image from {image.mode} to RGB format for OCR.")
else:
img_for_ocr = image
logger.info(f"Image already in RGB format.")
image_array = np.array(image)
else:
img_for_ocr = image
logger.info(f"Image is not a PIL.Image object, assuming it's already suitable for OCR.")
image_array = image
# Convert to numpy array if not already
if hasattr(img_for_ocr, "convert"):
image_array = np.array(img_for_ocr)
else:
image_array = img_for_ocr
ocr_result = self.ocr.predict(image_array)
# Perform OCR
ocr_result = self.ocr.ocr(image_array, cls=False)
# Extract text
if ocr_result and any(ocr_result):
ocr_text = ""
for image_result in ocr_result:
ocr_text = ocr_text + " ".join(image_result["rec_texts"])
text_length = len(ocr_text)
if text_length > 0:
logger.info(f"OCR extracted {text_length} characters")
logger.info(
f"OCR text sample: {ocr_text[:100]}..."
if text_length > 100
else f"OCR text: {ocr_text}"
)
return ocr_text
else:
logger.warning("OCR returned empty result")
ocr_text = ""
if ocr_result and ocr_result[0]:
for line in ocr_result[0]:
if line and len(line) >= 2:
text = line[1][0] if line[1] else ""
if text:
ocr_text += text + " "
text_length = len(ocr_text.strip())
if text_length > 0:
logger.info(f"OCR extracted {text_length} characters")
return ocr_text.strip()
else:
logger.warning("OCR did not return any result")
return ""
logger.warning("OCR returned empty result")
return ""
except Exception as e:
logger.error(f"OCR recognition error: {str(e)}")
return ""
class NanonetsOCRBackend(OCRBackend):
"""Nanonets OCR backend implementation using OpenAI API format"""

View File

@@ -5,9 +5,7 @@ from concurrent import futures
 import traceback
 import grpc
 import uuid
-# Enable gRPC fork support to avoid multiprocessing issues
-os.environ.setdefault('GRPC_ENABLE_FORK_SUPPORT', '1')
 import atexit
 # Add parent directory to Python path
 current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -327,104 +325,47 @@ def init_ocr_engine(ocr_backend, ocr_config):
logger.error(f"Error initializing OCR engine: {str(e)}")
return False
def serve():
init_ocr_engine(os.getenv("OCR_BACKEND", "paddle"), {
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
})
# Set max number of worker threads and processes
# Set max number of worker threads
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
# Force single process mode to avoid gRPC multiprocessing issues
worker_processes = 1
logger.info(f"Starting DocReader service, max worker threads per process: {max_workers}, "
f"processes: {worker_processes} (forced single process mode)")
logger.info(f"Starting DocReader service with {max_workers} worker threads")
# Get port number
port = os.environ.get("GRPC_PORT", "50051")
# Multi-process mode (disabled due to gRPC fork issues)
if False and worker_processes > 1:
import multiprocessing
processes = []
def run_server():
# Create server
server = grpc.server(
futures.ThreadPoolExecutor(max_workers=max_workers),
options=[
('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
('grpc.enable_fork_support', 1),
('grpc.so_reuseport', 1),
],
)
# Register service
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
# Set listen address
server.add_insecure_port(f"[::]:{port}")
# Start service
server.start()
logger.info(f"Worker process {os.getpid()} started on port {port}")
try:
# Wait for service termination
server.wait_for_termination()
except KeyboardInterrupt:
logger.info(f"Worker process {os.getpid()} received termination signal")
server.stop(0)
# Start specified number of worker processes
for i in range(worker_processes):
process = multiprocessing.Process(target=run_server)
processes.append(process)
process.start()
logger.info(f"Started worker process {process.pid} ({i+1}/{worker_processes})")
# Wait for all processes to complete
try:
for process in processes:
process.join()
except KeyboardInterrupt:
logger.info("Master process received termination signal")
for process in processes:
if process.is_alive():
logger.info(f"Terminating worker process {process.pid}")
process.terminate()
# Create server
server = grpc.server(
futures.ThreadPoolExecutor(max_workers=max_workers),
options=[
('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
],
)
# Single-process mode
else:
# Create server
server = grpc.server(
futures.ThreadPoolExecutor(max_workers=max_workers),
options=[
('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
('grpc.enable_fork_support', 1),
('grpc.so_reuseport', 1),
],
)
# Register service
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
# Set listen address
server.add_insecure_port(f"[::]:{port}")
# Start service
server.start()
logger.info(f"Server started on port {port} (single process mode)")
logger.info("Server is ready to accept connections")
try:
# Wait for service termination
server.wait_for_termination()
except KeyboardInterrupt:
logger.info("Received termination signal, shutting down server")
server.stop(0)
# Register service
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
# Set listen address
server.add_insecure_port(f"[::]:{port}")
# Start service
server.start()
logger.info(f"Server started on port {port}")
logger.info("Server is ready to accept connections")
try:
# Wait for service termination
server.wait_for_termination()
except KeyboardInterrupt:
logger.info("Received termination signal, shutting down server")
server.stop(0)
if __name__ == "__main__":
serve()
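Because the simplified server keeps the enlarged send/receive message limits, clients need matching channel options or large document payloads are rejected before they reach the service. A hypothetical client-side sketch (the 100 MB value mirrors a `MAX_MESSAGE_LENGTH` constant defined elsewhere in the service and is an assumption here, as is the address):

```python
import grpc

MAX_MESSAGE_LENGTH = 100 * 1024 * 1024  # assumption: mirror the server's constant

# Channel options must match the server's limits for large document payloads
channel = grpc.insecure_channel(
    "localhost:50051",  # assumption: default GRPC_PORT used by serve()
    options=[
        ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
        ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
    ],
)
```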