mirror of
https://github.com/Tencent/WeKnora.git
synced 2025-11-25 11:29:31 +08:00
fix: build docreader timeout; update OCR config; support PDF table parsing
This commit is contained in:
@@ -25,11 +25,24 @@ RUN apt-get update && apt-get install -y \
|
|||||||
unzip \
|
unzip \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# 安装 protoc
|
# 检查是否存在本地protoc安装包,如果存在则离线安装,否则在线安装,其他安装包按需求添加
|
||||||
RUN curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/protoc-3.19.4-linux-x86_64.zip && \
|
COPY packages/ /app/packages/
|
||||||
unzip protoc-3.19.4-linux-x86_64.zip -d /usr/local && \
|
RUN echo "检查本地protoc安装包..." && \
|
||||||
chmod +x /usr/local/bin/protoc && \
|
if [ -f "/app/packages/protoc-3.19.4-linux-x86_64.zip" ]; then \
|
||||||
rm protoc-3.19.4-linux-x86_64.zip
|
echo "发现本地protoc安装包,将进行离线安装"; \
|
||||||
|
# 离线安装:使用本地包(精确路径避免歧义)
|
||||||
|
cp /app/packages/protoc-*.zip /app/ && \
|
||||||
|
unzip -o /app/protoc-*.zip -d /usr/local && \
|
||||||
|
chmod +x /usr/local/bin/protoc && \
|
||||||
|
rm -f /app/protoc-*.zip; \
|
||||||
|
else \
|
||||||
|
echo "未发现本地protoc安装包,将进行在线安装"; \
|
||||||
|
# 在线安装:从网络下载
|
||||||
|
curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/protoc-3.19.4-linux-x86_64.zip && \
|
||||||
|
unzip -o protoc-3.19.4-linux-x86_64.zip -d /usr/local && \
|
||||||
|
chmod +x /usr/local/bin/protoc && \
|
||||||
|
rm -f protoc-3.19.4-linux-x86_64.zip; \
|
||||||
|
fi
|
||||||
|
|
||||||
# 复制依赖文件
|
# 复制依赖文件
|
||||||
COPY services/docreader/requirements.txt .
|
COPY services/docreader/requirements.txt .
|
||||||
|
|||||||
@@ -36,6 +36,8 @@ class PaddleOCRBackend(OCRBackend):
|
|||||||
from paddleocr import PaddleOCR
|
from paddleocr import PaddleOCR
|
||||||
# Default OCR configuration
|
# Default OCR configuration
|
||||||
ocr_config = {
|
ocr_config = {
|
||||||
|
"text_det_limit_type": "max", # Change from 'min' to 'max'
|
||||||
|
"text_det_limit_side_len": 960, # A standard and safe limit for the longest side
|
||||||
"use_doc_orientation_classify": False, # Do not use document image orientation classification
|
"use_doc_orientation_classify": False, # Do not use document image orientation classification
|
||||||
"use_doc_unwarping": False, # Do not use document unwarping
|
"use_doc_unwarping": False, # Do not use document unwarping
|
||||||
"use_textline_orientation": False, # Do not use textline orientation classification
|
"use_textline_orientation": False, # Do not use textline orientation classification
|
||||||
@@ -43,8 +45,6 @@ class PaddleOCRBackend(OCRBackend):
|
|||||||
"text_detection_model_name": "PP-OCRv5_server_det",
|
"text_detection_model_name": "PP-OCRv5_server_det",
|
||||||
"text_recognition_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_rec_infer",
|
"text_recognition_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_rec_infer",
|
||||||
"text_detection_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_det_infer",
|
"text_detection_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_det_infer",
|
||||||
"text_det_limit_type": "min", # Limit by short side
|
|
||||||
"text_det_limit_side_len": 736, # Limit side length to 736
|
|
||||||
"text_det_thresh": 0.3, # Text detection pixel threshold
|
"text_det_thresh": 0.3, # Text detection pixel threshold
|
||||||
"text_det_box_thresh": 0.6, # Text detection box threshold
|
"text_det_box_thresh": 0.6, # Text detection box threshold
|
||||||
"text_det_unclip_ratio": 1.5, # Text detection expansion ratio
|
"text_det_unclip_ratio": 1.5, # Text detection expansion ratio
|
||||||
|
|||||||
@@ -3,7 +3,8 @@ import os
|
|||||||
import io
|
import io
|
||||||
from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union
|
from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union
|
||||||
|
|
||||||
from pypdf import PdfReader
|
import pdfplumber
|
||||||
|
import tempfile
|
||||||
from .base_parser import BaseParser
|
from .base_parser import BaseParser
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -17,45 +18,66 @@ class PDFParser(BaseParser):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
|
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
|
||||||
"""
|
|
||||||
Parse PDF document content into text
|
logger.info(f"Parsing PDF with pdfplumber, content size: {len(content)} bytes")
|
||||||
|
|
||||||
This method processes a PDF document by extracting text content.
|
all_page_content = []
|
||||||
|
|
||||||
Args:
|
|
||||||
content: PDF document content as bytes
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Extracted text content
|
|
||||||
"""
|
|
||||||
logger.info(f"Parsing PDF document, content size: {len(content)} bytes")
|
|
||||||
|
|
||||||
|
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
|
||||||
|
temp_pdf_path = temp_pdf.name
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Use io.BytesIO to read content from bytes
|
temp_pdf.write(content)
|
||||||
pdf_file = io.BytesIO(content)
|
temp_pdf.close()
|
||||||
|
logger.info(f"PDF content written to temporary file: {temp_pdf_path}")
|
||||||
|
|
||||||
# Create PdfReader object
|
with pdfplumber.open(temp_pdf_path) as pdf:
|
||||||
pdf_reader = PdfReader(pdf_file)
|
logger.info(f"PDF has {len(pdf.pages)} pages")
|
||||||
num_pages = len(pdf_reader.pages)
|
|
||||||
logger.info(f"PDF has {num_pages} pages")
|
for page_num, page in enumerate(pdf.pages):
|
||||||
|
page_content_parts = []
|
||||||
|
|
||||||
|
# Try-fallback strategy for table detection
|
||||||
|
default_settings = { "vertical_strategy": "lines", "horizontal_strategy": "lines" }
|
||||||
|
found_tables = page.find_tables(default_settings)
|
||||||
|
if not found_tables:
|
||||||
|
logger.info(f"Page {page_num+1}: Default strategy found no tables. Trying fallback strategy.")
|
||||||
|
fallback_settings = { "vertical_strategy": "text", "horizontal_strategy": "lines" }
|
||||||
|
found_tables = page.find_tables(fallback_settings)
|
||||||
|
|
||||||
|
table_bboxes = [table.bbox for table in found_tables]
|
||||||
|
# Define a filter function that keeps objects whose vertical center is NOT within any table bbox's vertical span (horizontal position is not checked).
|
||||||
|
def not_within_bboxes(obj):
|
||||||
|
"""Check if an object is outside all table bounding boxes."""
|
||||||
|
for bbox in table_bboxes:
|
||||||
|
# Check if the object's vertical center is within a bbox
|
||||||
|
if bbox[1] <= (obj["top"] + obj["bottom"]) / 2 <= bbox[3]:
|
||||||
|
return False # It's inside a table, so we DON'T keep it
|
||||||
|
return True # It's outside all tables, so we DO keep it
|
||||||
|
|
||||||
|
# Build a filtered view of the page that contains only the non-table text.
|
||||||
|
non_table_page = page.filter(not_within_bboxes)
|
||||||
|
|
||||||
|
# Now, extract text from this filtered page view.
|
||||||
|
text = non_table_page.extract_text(x_tolerance=2)
|
||||||
|
if text:
|
||||||
|
page_content_parts.append(text)
|
||||||
|
|
||||||
|
# Process and append the structured Markdown tables
|
||||||
|
if found_tables:
|
||||||
|
logger.info(f"Found {len(found_tables)} tables on page {page_num + 1}")
|
||||||
|
for table in found_tables:
|
||||||
|
markdown_table = self._convert_table_to_markdown(table.extract())
|
||||||
|
page_content_parts.append(f"\n\n{markdown_table}\n\n")
|
||||||
|
|
||||||
|
|
||||||
|
all_page_content.append("".join(page_content_parts))
|
||||||
|
|
||||||
|
final_text = "\n\n--- Page Break ---\n\n".join(all_page_content)
|
||||||
|
logger.info(f"PDF parsing complete. Extracted {len(final_text)} text chars.")
|
||||||
|
|
||||||
# Extract text from all pages
|
return final_text
|
||||||
all_text = []
|
|
||||||
for page_num, page in enumerate(pdf_reader.pages):
|
|
||||||
try:
|
|
||||||
page_text = page.extract_text()
|
|
||||||
if page_text:
|
|
||||||
all_text.append(page_text)
|
|
||||||
logger.info(f"Successfully extracted text from page {page_num+1}/{num_pages}")
|
|
||||||
else:
|
|
||||||
logger.warning(f"No text extracted from page {page_num+1}/{num_pages}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error extracting text from page {page_num+1}: {str(e)}")
|
|
||||||
|
|
||||||
# Combine all extracted text
|
|
||||||
result = "\n\n".join(all_text)
|
|
||||||
logger.info(f"PDF parsing complete, extracted {len(result)} characters of text")
|
|
||||||
return result
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to parse PDF document: {str(e)}")
|
logger.error(f"Failed to parse PDF document: {str(e)}")
|
||||||
|
|||||||
Reference in New Issue
Block a user