fix: build docreader timeout; update OCR config; support PDF table parsing

This commit is contained in:
Liwx1014
2025-09-03 11:30:47 +08:00
committed by lyingbug
parent 08c52fbd7b
commit 3aad892a62
3 changed files with 77 additions and 42 deletions

View File

@@ -25,11 +25,24 @@ RUN apt-get update && apt-get install -y \
unzip \ unzip \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# 安装 protoc # 检查是否存在本地protoc安装包如果存在则离线安装否则在线安装,其他安装包按需求添加
RUN curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/protoc-3.19.4-linux-x86_64.zip && \ COPY packages/ /app/packages/
unzip protoc-3.19.4-linux-x86_64.zip -d /usr/local && \ RUN echo "检查本地protoc安装包..." && \
chmod +x /usr/local/bin/protoc && \ if [ -f "/app/packages/protoc-3.19.4-linux-x86_64.zip" ]; then \
rm protoc-3.19.4-linux-x86_64.zip echo "发现本地protoc安装包将进行离线安装"; \
# 离线安装:使用本地包(精确路径避免歧义)
cp /app/packages/protoc-*.zip /app/ && \
unzip -o /app/protoc-*.zip -d /usr/local && \
chmod +x /usr/local/bin/protoc && \
rm -f /app/protoc-*.zip; \
else \
echo "未发现本地protoc安装包将进行在线安装"; \
# 在线安装:从网络下载
curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/protoc-3.19.4-linux-x86_64.zip && \
unzip -o protoc-3.19.4-linux-x86_64.zip -d /usr/local && \
chmod +x /usr/local/bin/protoc && \
rm -f protoc-3.19.4-linux-x86_64.zip; \
fi
# 复制依赖文件 # 复制依赖文件
COPY services/docreader/requirements.txt . COPY services/docreader/requirements.txt .

View File

@@ -36,6 +36,8 @@ class PaddleOCRBackend(OCRBackend):
from paddleocr import PaddleOCR from paddleocr import PaddleOCR
# Default OCR configuration # Default OCR configuration
ocr_config = { ocr_config = {
"text_det_limit_type": "max", # Change from 'min' to 'max'
"text_det_limit_side_len": 960, # A standard and safe limit for the longest side
"use_doc_orientation_classify": False, # Do not use document image orientation classification "use_doc_orientation_classify": False, # Do not use document image orientation classification
"use_doc_unwarping": False, # Do not use document unwarping "use_doc_unwarping": False, # Do not use document unwarping
"use_textline_orientation": False, # Do not use textline orientation classification "use_textline_orientation": False, # Do not use textline orientation classification
@@ -43,8 +45,6 @@ class PaddleOCRBackend(OCRBackend):
"text_detection_model_name": "PP-OCRv5_server_det", "text_detection_model_name": "PP-OCRv5_server_det",
"text_recognition_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_rec_infer", "text_recognition_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_rec_infer",
"text_detection_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_det_infer", "text_detection_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_det_infer",
"text_det_limit_type": "min", # Limit by short side
"text_det_limit_side_len": 736, # Limit side length to 736
"text_det_thresh": 0.3, # Text detection pixel threshold "text_det_thresh": 0.3, # Text detection pixel threshold
"text_det_box_thresh": 0.6, # Text detection box threshold "text_det_box_thresh": 0.6, # Text detection box threshold
"text_det_unclip_ratio": 1.5, # Text detection expansion ratio "text_det_unclip_ratio": 1.5, # Text detection expansion ratio

View File

@@ -3,7 +3,8 @@ import os
import io import io
from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union
from pypdf import PdfReader import pdfplumber
import tempfile
from .base_parser import BaseParser from .base_parser import BaseParser
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -17,45 +18,66 @@ class PDFParser(BaseParser):
""" """
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]: def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
"""
Parse PDF document content into text
This method processes a PDF document by extracting text content. logger.info(f"Parsing PDF with pdfplumber, content size: {len(content)} bytes")
Args: all_page_content = []
content: PDF document content as bytes
Returns:
Extracted text content temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
""" temp_pdf_path = temp_pdf.name
logger.info(f"Parsing PDF document, content size: {len(content)} bytes")
try: try:
# Use io.BytesIO to read content from bytes temp_pdf.write(content)
pdf_file = io.BytesIO(content) temp_pdf.close()
logger.info(f"PDF content written to temporary file: {temp_pdf_path}")
# Create PdfReader object with pdfplumber.open(temp_pdf_path) as pdf:
pdf_reader = PdfReader(pdf_file) logger.info(f"PDF has {len(pdf.pages)} pages")
num_pages = len(pdf_reader.pages)
logger.info(f"PDF has {num_pages} pages")
# Extract text from all pages for page_num, page in enumerate(pdf.pages):
all_text = [] page_content_parts = []
for page_num, page in enumerate(pdf_reader.pages):
try:
page_text = page.extract_text()
if page_text:
all_text.append(page_text)
logger.info(f"Successfully extracted text from page {page_num+1}/{num_pages}")
else:
logger.warning(f"No text extracted from page {page_num+1}/{num_pages}")
except Exception as e:
logger.error(f"Error extracting text from page {page_num+1}: {str(e)}")
# Combine all extracted text # Try-fallback strategy for table detection
result = "\n\n".join(all_text) default_settings = { "vertical_strategy": "lines", "horizontal_strategy": "lines" }
logger.info(f"PDF parsing complete, extracted {len(result)} characters of text") found_tables = page.find_tables(default_settings)
return result if not found_tables:
logger.info(f"Page {page_num+1}: Default strategy found no tables. Trying fallback strategy.")
fallback_settings = { "vertical_strategy": "text", "horizontal_strategy": "lines" }
found_tables = page.find_tables(fallback_settings)
table_bboxes = [table.bbox for table in found_tables]
# Define a filter function that keeps objects NOT inside any table bbox.
def not_within_bboxes(obj):
    """Return True when a page object lies outside every table bounding box.

    Used as a predicate for ``page.filter``: objects for which this returns
    True are kept, so only non-table content survives the filter.

    An object counts as "inside" a table when its centre point falls within
    that table's bounding box on BOTH axes. Testing only the vertical centre
    would also discard text that merely sits beside a table (for example the
    other column of a two-column layout), because it shares the table's
    vertical band.

    Args:
        obj: A page object mapping providing "x0", "x1", "top" and "bottom"
            coordinates.

    Returns:
        False if the object's centre is inside any bbox in ``table_bboxes``
        (read from the enclosing scope), True otherwise.
    """
    h_center = (obj["x0"] + obj["x1"]) / 2
    v_center = (obj["top"] + obj["bottom"]) / 2
    # Each bbox is indexed as (x0, top, x1, bottom), matching the original
    # use of bbox[1] / bbox[3] for the vertical extent.
    for x0, top, x1, bottom in table_bboxes:
        if x0 <= h_center <= x1 and top <= v_center <= bottom:
            return False  # Centre is inside this table, so do not keep it
    return True  # Outside all tables, so keep it
# Create a filtered view of the page that contains only the non-table text.
non_table_page = page.filter(not_within_bboxes)
# Now, extract text from this filtered page view.
text = non_table_page.extract_text(x_tolerance=2)
if text:
page_content_parts.append(text)
# Process and append the structured Markdown tables
if found_tables:
logger.info(f"Found {len(found_tables)} tables on page {page_num + 1}")
for table in found_tables:
markdown_table = self._convert_table_to_markdown(table.extract())
page_content_parts.append(f"\n\n{markdown_table}\n\n")
all_page_content.append("".join(page_content_parts))
final_text = "\n\n--- Page Break ---\n\n".join(all_page_content)
logger.info(f"PDF parsing complete. Extracted {len(final_text)} text chars.")
return final_text
except Exception as e: except Exception as e:
logger.error(f"Failed to parse PDF document: {str(e)}") logger.error(f"Failed to parse PDF document: {str(e)}")