2025-08-14 00:45:19 +08:00
|
|
|
|
# =========================
# Build stage
# =========================
FROM python:3.10.18-bookworm AS builder

# Point apt at the Tsinghua (TUNA) mirror: rewrite both the main and the
# security archive hosts in debian.sources with a single sed invocation.
RUN sed -i \
        -e 's@http://deb.debian.org@https://mirrors.tuna.tsinghua.edu.cn@g' \
        -e 's@http://security.debian.org@https://mirrors.tuna.tsinghua.edu.cn@g' \
        /etc/apt/sources.list.d/debian.sources

# Relative paths in the rest of this stage resolve against /app.
WORKDIR /app
|
|
|
|
|
|
|
|
|
|
|
|
# Install build-time dependencies: the C compiler and development headers,
# plus the wget/curl/unzip tools that the download steps in this stage call.
# --no-install-recommends stops apt from pulling in optional packages that
# nothing here asks for; ca-certificates is listed explicitly so the HTTPS
# downloads do not depend on it arriving as a recommended package.
# Packages are kept in alphabetical order to make diffs easy to read.
RUN apt-get update && apt-get install -y --no-install-recommends \
        antiword \
        ca-certificates \
        curl \
        gcc \
        libffi-dev \
        libgl1 \
        libglib2.0-0 \
        libjpeg-dev \
        libpq-dev \
        python3-dev \
        unzip \
        wget \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*
|
|
|
|
|
|
|
2025-09-03 11:30:47 +08:00
|
|
|
|
# Install protoc 3.19.4 into /usr/local.
# If the build context ships a matching archive under packages/ it is unpacked
# directly (offline install); otherwise the archive is downloaded from the
# protobuf GitHub release (online install). Other offline packages can be added
# to packages/ as needed.
# TARGETARCH is the automatic platform build argument; it selects the archive name.
# curl runs with --fail so an HTTP error aborts the build at the download step
# instead of saving an error page that only breaks later in unzip.
# NOTE(review): the "arm" mapping yields protoc-3.19.4-linux-arm.zip; confirm
# that such an archive exists upstream, otherwise 32-bit arm builds only work
# with a locally supplied package.
ARG TARGETARCH
COPY packages/ /app/packages/
RUN set -e; \
    echo "检查本地protoc安装包..."; \
    case "${TARGETARCH}" in \
        "amd64") PROTOC_ARCH="x86_64" ;; \
        "arm64") PROTOC_ARCH="aarch_64" ;; \
        "arm")   PROTOC_ARCH="arm" ;; \
        *) echo "Unsupported architecture for protoc: ${TARGETARCH}"; exit 1 ;; \
    esac; \
    PROTOC_PACKAGE="protoc-3.19.4-linux-${PROTOC_ARCH}.zip"; \
    if [ -f "/app/packages/${PROTOC_PACKAGE}" ]; then \
        echo "发现本地protoc安装包,将进行离线安装"; \
        unzip -o "/app/packages/${PROTOC_PACKAGE}" -d /usr/local; \
    else \
        echo "未发现本地protoc安装包,将进行在线安装"; \
        curl -fSL -o "/tmp/${PROTOC_PACKAGE}" \
            "https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/${PROTOC_PACKAGE}"; \
        unzip -o "/tmp/${PROTOC_PACKAGE}" -d /usr/local; \
        rm -f "/tmp/${PROTOC_PACKAGE}"; \
    fi; \
    chmod +x /usr/local/bin/protoc
|
2025-08-05 15:08:07 +08:00
|
|
|
|
|
2025-09-10 16:05:37 +08:00
|
|
|
|
# Pre-download the PP-OCRv4 models into /root/.paddleocr/whl:
#   det/ch  text detection model
#   rec/ch  text recognition model
#   cls     text direction classifier (decides whether text needs rotating)
# Each archive is fetched, unpacked in the directory it was saved to, and the
# archives are deleted at the end of the same layer.
RUN mkdir -p \
        /root/.paddleocr/whl/det/ch \
        /root/.paddleocr/whl/rec/ch \
        /root/.paddleocr/whl/cls/ch \
    && wget -O /root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer.tar \
        https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_infer.tar \
    && tar -xf /root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer.tar \
        -C /root/.paddleocr/whl/det/ch/ \
    && wget -O /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar \
        https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar \
    && tar -xf /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar \
        -C /root/.paddleocr/whl/rec/ch/ \
    && wget -O /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar \
        https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar \
    && tar -xf /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar \
        -C /root/.paddleocr/whl/cls/ \
    && rm -f \
        /root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer.tar \
        /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar \
        /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar
|
2025-08-05 15:08:07 +08:00
|
|
|
|
|
2025-11-05 11:33:50 +08:00
|
|
|
|
# Copy only the dependency manifests first so the dependency layer below stays
# cached until pyproject.toml or uv.lock changes.
COPY docreader/pyproject.toml docreader/uv.lock ./
# Install uv, then let it build the project environment from the lockfile:
#   --locked  fail rather than update uv.lock when it is out of date
#   --no-dev  leave out the development dependency group
# --no-cache-dir keeps pip from writing a download cache into this layer.
RUN pip install --no-cache-dir --break-system-packages uv && \
    python -m uv sync --locked --no-dev
|
2025-08-05 15:08:07 +08:00
|
|
|
|
|
2025-11-05 11:33:50 +08:00
|
|
|
|
# Copy the application sources, which include the protobuf generation script.
COPY docreader docreader

# Generate the protobuf code: mark the script executable, then run it with bash.
RUN chmod +x docreader/scripts/generate_proto.sh \
    && bash docreader/scripts/generate_proto.sh
|
2025-08-05 15:08:07 +08:00
|
|
|
|
|
2025-11-05 11:33:50 +08:00
|
|
|
|
# Sanity check: list the model directory so the build stops here if it is missing.
RUN ls -al /root/.paddleocr/whl/
|
2025-08-14 00:45:19 +08:00
|
|
|
|
|
|
|
|
|
|
# =========================
# Runtime stage
# =========================
FROM python:3.10.18-bookworm AS runner

# Point apt at the Tsinghua (TUNA) mirror: rewrite both the main and the
# security archive hosts in debian.sources with a single sed invocation.
RUN sed -i \
        -e 's@http://deb.debian.org@https://mirrors.tuna.tsinghua.edu.cn@g' \
        -e 's@http://security.debian.org@https://mirrors.tuna.tsinghua.edu.cn@g' \
        /etc/apt/sources.list.d/debian.sources

# Relative paths in the rest of this stage resolve against /app.
WORKDIR /app
|
|
|
|
|
|
|
|
|
|
|
|
# Install runtime dependencies (same package set as before, alphabetised).
# The apt package lists are removed in the same layer to keep it small.
# NOTE(review): vim is a debugging convenience, and tar/dpkg are presumably
# already present in the base image - confirm whether they can be dropped.
# NOTE(review): --no-install-recommends is deliberately not added here, because
# it may remove packages libreoffice pulls in as recommendations - verify first.
RUN apt-get update \
    && apt-get install -y \
        antiword \
        curl \
        dpkg \
        gnupg \
        libcairo2 \
        libcups2 \
        libdbus-glib-1-2 \
        libfontconfig1 \
        libgl1 \
        libglib2.0-0 \
        libglu1-mesa \
        libjpeg62-turbo \
        libpq5 \
        libreoffice \
        libsm6 \
        libxinerama1 \
        tar \
        vim \
        wget \
    && rm -rf /var/lib/apt/lists/*
|
2025-09-10 20:12:37 +08:00
|
|
|
|
|
|
|
|
|
|
# Install grpc_health_probe into /bin.
# GRPC_HEALTH_PROBE_VERSION can be overridden with --build-arg; its default is
# the version that used to be hard-coded here.
# TARGETARCH is the automatic platform build argument. The release binaries are
# named after the same architecture strings, so the value is validated and then
# used directly in the download URL.
ARG TARGETARCH
ARG GRPC_HEALTH_PROBE_VERSION=v0.4.24
RUN set -e; \
    case "${TARGETARCH}" in \
        amd64|arm64|arm) ;; \
        *) echo "Unsupported architecture: ${TARGETARCH}"; exit 1 ;; \
    esac; \
    wget -qO /bin/grpc_health_probe \
        "https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-${TARGETARCH}"; \
    chmod +x /bin/grpc_health_probe
|
2025-08-14 00:45:19 +08:00
|
|
|
|
|
2025-08-05 15:08:07 +08:00
|
|
|
|
# Bring over the installed dependencies and generated artifacts from the build stage.
# The virtual environment lives at /app/.venv in both stages, and its bin/
# directory is placed first on PATH.
ENV VIRTUAL_ENV=/app/.venv
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
COPY --from=builder $VIRTUAL_ENV $VIRTUAL_ENV

# Executables from the build stage's /usr/local/bin.
COPY --from=builder /usr/local/bin /usr/local/bin
# Pre-downloaded PaddleOCR models.
COPY --from=builder /root/.paddleocr /root/.paddleocr
|
2025-08-05 15:08:07 +08:00
|
|
|
|
|
|
|
|
|
|
# Install the Playwright WebKit browser together with the OS packages it needs.
# --with-deps runs the install-deps step as part of the same command, so the
# apt package lists that step downloads can be deleted within this one layer
# rather than being left behind in the image.
RUN python -m playwright install --with-deps webkit \
    && rm -rf /var/lib/apt/lists/*
|
2025-08-05 15:08:07 +08:00
|
|
|
|
|
2025-11-05 11:33:50 +08:00
|
|
|
|
# COPY docreader/scripts/download_deps.py download_deps.py
|
|
|
|
|
|
# RUN python -m download_deps
|
2025-09-11 11:45:34 +08:00
|
|
|
|
|
2025-11-07 10:30:02 +08:00
|
|
|
|
# Project manifests come from the build context; the docreader tree is taken
# from the build stage.
COPY docreader/pyproject.toml docreader/uv.lock /app/
COPY --from=builder /app/docreader /app/docreader
|
2025-08-05 15:08:07 +08:00
|
|
|
|
|
|
|
|
|
|
# Expose the gRPC port.
EXPOSE 50051

# Run the Python service directly in the foreground (logs go to stdout/stderr).
# --no-sync makes uv use the environment copied from the build stage as it is.
# Without it, `uv run` re-locks and re-syncs the project environment on every
# container start, which can modify the environment and reach for the network.
CMD ["uv", "run", "--no-sync", "-m", "docreader.main"]
|