fix: build docreader timeout; update OCR config; support PDF table parsing

2025-11-25 03:15:00 +08:00 · 2025-09-03 11:30:47 +08:00
parent 08c52fbd7b
commit 3aad892a62
3 changed files with 77 additions and 42 deletions
--- a/docker/Dockerfile.docreader
+++ b/docker/Dockerfile.docreader
@@ -25,11 +25,24 @@ RUN apt-get update && apt-get install -y \
    unzip \
    && rm -rf /var/lib/apt/lists/*

-# 安装 protoc
-RUN curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/protoc-3.19.4-linux-x86_64.zip && \
-    unzip protoc-3.19.4-linux-x86_64.zip -d /usr/local && \
-    chmod +x /usr/local/bin/protoc && \
-    rm protoc-3.19.4-linux-x86_64.zip
+# Check whether a local protoc archive exists: install offline if it does, otherwise download it. Add other packages as needed.
+COPY packages/ /app/packages/
+RUN echo "检查本地protoc安装包..." && \
+    if [ -f "/app/packages/protoc-3.19.4-linux-x86_64.zip" ]; then \
+        echo "发现本地protoc安装包，将进行离线安装"; \
+        # Offline install: use the exact archive tested above (a glob could match several files and break unzip)
+        cp /app/packages/protoc-3.19.4-linux-x86_64.zip /app/ && \
+        unzip -o /app/protoc-3.19.4-linux-x86_64.zip -d /usr/local && \
+        chmod +x /usr/local/bin/protoc && \
+        rm -f /app/protoc-3.19.4-linux-x86_64.zip; \
+    else \
+        echo "未发现本地protoc安装包，将进行在线安装"; \
+        # Online install: download the release archive; -f makes curl fail on HTTP errors instead of saving an error page
+        curl -fLO https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/protoc-3.19.4-linux-x86_64.zip && \
+        unzip -o protoc-3.19.4-linux-x86_64.zip -d /usr/local && \
+        chmod +x /usr/local/bin/protoc && \
+        rm -f protoc-3.19.4-linux-x86_64.zip; \
+    fi

 # 复制依赖文件
 COPY services/docreader/requirements.txt .
--- a/services/docreader/src/parser/ocr_engine.py
+++ b/services/docreader/src/parser/ocr_engine.py
@@ -36,6 +36,8 @@ class PaddleOCRBackend(OCRBackend):
            from paddleocr import PaddleOCR
            # Default OCR configuration
            ocr_config = {
+                "text_det_limit_type": "max",  # Change from 'min' to 'max'
+                "text_det_limit_side_len": 960,  # A standard and safe limit for the longest side
                "use_doc_orientation_classify": False,  # Do not use document image orientation classification
                "use_doc_unwarping": False,  # Do not use document unwarping
                "use_textline_orientation": False,  # Do not use textline orientation classification
@@ -43,8 +45,6 @@ class PaddleOCRBackend(OCRBackend):
                "text_detection_model_name": "PP-OCRv5_server_det",
                "text_recognition_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_rec_infer",
                "text_detection_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_det_infer",
-                "text_det_limit_type": "min",  # Limit by short side
-                "text_det_limit_side_len": 736,  # Limit side length to 736
                "text_det_thresh": 0.3,  # Text detection pixel threshold
                "text_det_box_thresh": 0.6,  # Text detection box threshold
                "text_det_unclip_ratio": 1.5,  # Text detection expansion ratio
--- a/services/docreader/src/parser/pdf_parser.py
+++ b/services/docreader/src/parser/pdf_parser.py
@@ -3,7 +3,8 @@ import os
 import io
 from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union

-from pypdf import PdfReader
+import pdfplumber
+import tempfile
 from .base_parser import BaseParser

 logger = logging.getLogger(__name__)
@@ -17,45 +18,66 @@ class PDFParser(BaseParser):
    """

    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
-        """
-        Parse PDF document content into text
+       
+        logger.info(f"Parsing PDF with pdfplumber, content size: {len(content)} bytes")

-        This method processes a PDF document by extracting text content.
-
-        Args:
-            content: PDF document content as bytes
-
-        Returns:
-            Extracted text content
-        """
-        logger.info(f"Parsing PDF document, content size: {len(content)} bytes")
+        all_page_content = []
+     

+        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+        temp_pdf_path = temp_pdf.name
+        
        try:
-            # Use io.BytesIO to read content from bytes
-            pdf_file = io.BytesIO(content)
+            temp_pdf.write(content)
+            temp_pdf.close()
+            logger.info(f"PDF content written to temporary file: {temp_pdf_path}")
            
-            # Create PdfReader object
-            pdf_reader = PdfReader(pdf_file)
-            num_pages = len(pdf_reader.pages)
-            logger.info(f"PDF has {num_pages} pages")
+            with pdfplumber.open(temp_pdf_path) as pdf:
+                logger.info(f"PDF has {len(pdf.pages)} pages")
+                
+                for page_num, page in enumerate(pdf.pages):
+                    page_content_parts = []
+                    
+                    # Try-fallback strategy for table detection
+                    default_settings = { "vertical_strategy": "lines", "horizontal_strategy": "lines" }
+                    found_tables = page.find_tables(default_settings)
+                    if not found_tables:
+                        logger.info(f"Page {page_num+1}: Default strategy found no tables. Trying fallback strategy.")
+                        fallback_settings = { "vertical_strategy": "text", "horizontal_strategy": "lines" }
+                        found_tables = page.find_tables(fallback_settings)
+
+                    table_bboxes = [table.bbox for table in found_tables]
+                    # Define a filter function that keeps objects NOT inside any table bbox.
+                    def not_within_bboxes(obj):
+                        """Check if an object is outside all table bounding boxes."""
+                        for bbox in table_bboxes:
+                            # Check if the object's vertical center is within a bbox
+                            if bbox[1] <= (obj["top"] + obj["bottom"]) / 2 <= bbox[3]:
+                                return False # It's inside a table, so we DON'T keep it
+                        return True # It's outside all tables, so we DO keep it
+
+                    # Build a filtered view of the page that contains only the non-table text.
+                    non_table_page = page.filter(not_within_bboxes)
+
+                    # Now, extract text from this filtered page view.
+                    text = non_table_page.extract_text(x_tolerance=2)
+                    if text:
+                        page_content_parts.append(text)
+              
+                    # Process and append the structured Markdown tables
+                    if found_tables:
+                        logger.info(f"Found {len(found_tables)} tables on page {page_num + 1}")
+                        for table in found_tables:
+                            markdown_table = self._convert_table_to_markdown(table.extract())
+                            page_content_parts.append(f"\n\n{markdown_table}\n\n")
+                    
+                    
+                    all_page_content.append("".join(page_content_parts))
+
+            final_text = "\n\n--- Page Break ---\n\n".join(all_page_content)
+            logger.info(f"PDF parsing complete. Extracted {len(final_text)} text chars.")
            
-            # Extract text from all pages
-            all_text = []
-            for page_num, page in enumerate(pdf_reader.pages):
-                try:
-                    page_text = page.extract_text()
-                    if page_text:
-                        all_text.append(page_text)
-                        logger.info(f"Successfully extracted text from page {page_num+1}/{num_pages}")
-                    else:
-                        logger.warning(f"No text extracted from page {page_num+1}/{num_pages}")
-                except Exception as e:
-                    logger.error(f"Error extracting text from page {page_num+1}: {str(e)}")
-            
-            # Combine all extracted text
-            result = "\n\n".join(all_text)
-            logger.info(f"PDF parsing complete, extracted {len(result)} characters of text")
-            return result
+            return final_text
            
        except Exception as e:
            logger.error(f"Failed to parse PDF document: {str(e)}")