diff --git a/docker/Dockerfile.docreader b/docker/Dockerfile.docreader
index 6cc9433..d7bae0c 100644
--- a/docker/Dockerfile.docreader
+++ b/docker/Dockerfile.docreader
@@ -25,11 +25,24 @@ RUN apt-get update && apt-get install -y \
     unzip \
     && rm -rf /var/lib/apt/lists/*
 
-# Install protoc
-RUN curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/protoc-3.19.4-linux-x86_64.zip && \
-    unzip protoc-3.19.4-linux-x86_64.zip -d /usr/local && \
-    chmod +x /usr/local/bin/protoc && \
-    rm protoc-3.19.4-linux-x86_64.zip
+# Check for a local protoc package: install offline if present, otherwise install online; add other packages as needed
+COPY packages/ /app/packages/
+RUN echo "Checking for local protoc package..." && \
+    if [ -f "/app/packages/protoc-3.19.4-linux-x86_64.zip" ]; then \
+        echo "Local protoc package found, installing offline"; \
+        # Offline install: use the local package (explicit path avoids ambiguity)
+        cp /app/packages/protoc-*.zip /app/ && \
+        unzip -o /app/protoc-*.zip -d /usr/local && \
+        chmod +x /usr/local/bin/protoc && \
+        rm -f /app/protoc-*.zip; \
+    else \
+        echo "No local protoc package found, installing online"; \
+        # Online install: download from the network
+        curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/protoc-3.19.4-linux-x86_64.zip && \
+        unzip -o protoc-3.19.4-linux-x86_64.zip -d /usr/local && \
+        chmod +x /usr/local/bin/protoc && \
+        rm -f protoc-3.19.4-linux-x86_64.zip; \
+    fi
 
 # Copy dependency files
 COPY services/docreader/requirements.txt .
diff --git a/services/docreader/src/parser/ocr_engine.py b/services/docreader/src/parser/ocr_engine.py
index e387e07..df4dd81 100644
--- a/services/docreader/src/parser/ocr_engine.py
+++ b/services/docreader/src/parser/ocr_engine.py
@@ -36,6 +36,8 @@ class PaddleOCRBackend(OCRBackend):
         from paddleocr import PaddleOCR
         # Default OCR configuration
         ocr_config = {
+            "text_det_limit_type": "max",  # Limit by the longest side (was "min")
+            "text_det_limit_side_len": 960,  # A standard, safe limit for the longest side
             "use_doc_orientation_classify": False,  # Do not use document image orientation classification
             "use_doc_unwarping": False,  # Do not use document unwarping
             "use_textline_orientation": False,  # Do not use textline orientation classification
@@ -43,8 +45,6 @@ class PaddleOCRBackend(OCRBackend):
             "text_detection_model_name": "PP-OCRv5_server_det",
             "text_recognition_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_rec_infer",
             "text_detection_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_det_infer",
-            "text_det_limit_type": "min",  # Limit by short side
-            "text_det_limit_side_len": 736,  # Limit side length to 736
             "text_det_thresh": 0.3,  # Text detection pixel threshold
             "text_det_box_thresh": 0.6,  # Text detection box threshold
             "text_det_unclip_ratio": 1.5,  # Text detection expansion ratio
diff --git a/services/docreader/src/parser/pdf_parser.py b/services/docreader/src/parser/pdf_parser.py
index c124012..26145a7 100644
--- a/services/docreader/src/parser/pdf_parser.py
+++ b/services/docreader/src/parser/pdf_parser.py
@@ -3,7 +3,8 @@ import os
 import io
 from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union
 
-from pypdf import PdfReader
+import pdfplumber
+import tempfile
 from .base_parser import BaseParser
 
 logger = logging.getLogger(__name__)
@@ -17,45 +18,66 @@ class PDFParser(BaseParser):
     """
 
    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
-        """
-        Parse PDF document content into text
+
+        logger.info(f"Parsing PDF with pdfplumber, content size: {len(content)} bytes")
 
-        This method processes a PDF document by extracting text content.
-
-        Args:
-            content: PDF document content as bytes
-
-        Returns:
-            Extracted text content
-        """
-        logger.info(f"Parsing PDF document, content size: {len(content)} bytes")
+        all_page_content = []
+
+        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+        temp_pdf_path = temp_pdf.name
+
         try:
-            # Use io.BytesIO to read content from bytes
-            pdf_file = io.BytesIO(content)
+            temp_pdf.write(content)
+            temp_pdf.close()
+            logger.info(f"PDF content written to temporary file: {temp_pdf_path}")
 
-            # Create PdfReader object
-            pdf_reader = PdfReader(pdf_file)
-            num_pages = len(pdf_reader.pages)
-            logger.info(f"PDF has {num_pages} pages")
+            with pdfplumber.open(temp_pdf_path) as pdf:
+                logger.info(f"PDF has {len(pdf.pages)} pages")
+
+                for page_num, page in enumerate(pdf.pages):
+                    page_content_parts = []
+
+                    # Try-fallback strategy for table detection
+                    default_settings = {"vertical_strategy": "lines", "horizontal_strategy": "lines"}
+                    found_tables = page.find_tables(default_settings)
+                    if not found_tables:
+                        logger.info(f"Page {page_num+1}: Default strategy found no tables. Trying fallback strategy.")
+                        fallback_settings = {"vertical_strategy": "text", "horizontal_strategy": "lines"}
+                        found_tables = page.find_tables(fallback_settings)
+
+                    table_bboxes = [table.bbox for table in found_tables]
+
+                    # Define a filter function that keeps objects NOT inside any table bbox.
+                    def not_within_bboxes(obj):
+                        """Check if an object is outside all table bounding boxes."""
+                        for bbox in table_bboxes:
+                            # Check if the object's vertical center is within a bbox
+                            if bbox[1] <= (obj["top"] + obj["bottom"]) / 2 <= bbox[3]:
+                                return False  # It's inside a table, so we DON'T keep it
+                        return True  # It's outside all tables, so we DO keep it
+
+                    # Create a filtered view of the page that contains only the non-table text.
+                    non_table_page = page.filter(not_within_bboxes)
+
+                    # Now, extract text from this filtered page view.
+                    text = non_table_page.extract_text(x_tolerance=2)
+                    if text:
+                        page_content_parts.append(text)
+
+                    # Process and append the structured Markdown tables
+                    if found_tables:
+                        logger.info(f"Found {len(found_tables)} tables on page {page_num + 1}")
+                        for table in found_tables:
+                            markdown_table = self._convert_table_to_markdown(table.extract())
+                            page_content_parts.append(f"\n\n{markdown_table}\n\n")
+
+                    all_page_content.append("".join(page_content_parts))
+
+            final_text = "\n\n--- Page Break ---\n\n".join(all_page_content)
+            logger.info(f"PDF parsing complete. Extracted {len(final_text)} text chars.")
 
-            # Extract text from all pages
-            all_text = []
-            for page_num, page in enumerate(pdf_reader.pages):
-                try:
-                    page_text = page.extract_text()
-                    if page_text:
-                        all_text.append(page_text)
-                        logger.info(f"Successfully extracted text from page {page_num+1}/{num_pages}")
-                    else:
-                        logger.warning(f"No text extracted from page {page_num+1}/{num_pages}")
-                except Exception as e:
-                    logger.error(f"Error extracting text from page {page_num+1}: {str(e)}")
-
-            # Combine all extracted text
-            result = "\n\n".join(all_text)
-            logger.info(f"PDF parsing complete, extracted {len(result)} characters of text")
-            return result
+            return final_text
         except Exception as e:
             logger.error(f"Failed to parse PDF document: {str(e)}")