Files
WeKnora/docker/Dockerfile.docreader
2025-09-10 20:22:14 +08:00

147 lines
5.2 KiB
Docker
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# =========================
# 构建阶段
# =========================
FROM python:3.10.18-bookworm AS builder
# 切换 apt 源到清华
RUN sed -i 's@http://deb.debian.org@https://mirrors.tuna.tsinghua.edu.cn@g' /etc/apt/sources.list.d/debian.sources && \
sed -i 's@http://security.debian.org@https://mirrors.tuna.tsinghua.edu.cn@g' /etc/apt/sources.list.d/debian.sources
WORKDIR /app
# 安装构建依赖
RUN apt-get update && apt-get install -y \
gcc \
python3-dev \
libjpeg-dev \
zlib1g-dev \
libpq-dev \
libffi-dev \
libgl1 \
libglib2.0-0 \
wget \
antiword \
curl \
unzip \
&& rm -rf /var/lib/apt/lists/*
# 检查是否存在本地protoc安装包如果存在则离线安装否则在线安装,其他安装包按需求添加
COPY packages/ /app/packages/
RUN echo "检查本地protoc安装包..." && \
if [ -f "/app/packages/protoc-3.19.4-linux-x86_64.zip" ]; then \
echo "发现本地protoc安装包将进行离线安装"; \
# 离线安装:使用本地包(精确路径避免歧义)
cp /app/packages/protoc-*.zip /app/ && \
unzip -o /app/protoc-*.zip -d /usr/local && \
chmod +x /usr/local/bin/protoc && \
rm -f /app/protoc-*.zip; \
else \
echo "未发现本地protoc安装包将进行在线安装"; \
# 在线安装:从网络下载
curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/protoc-3.19.4-linux-x86_64.zip && \
unzip -o protoc-3.19.4-linux-x86_64.zip -d /usr/local && \
chmod +x /usr/local/bin/protoc && \
rm -f protoc-3.19.4-linux-x86_64.zip; \
fi
# 复制依赖文件
COPY services/docreader/requirements.txt .
# 安装依赖
RUN pip cache purge && pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
# 预下载 PP-OCRv4 模型
RUN mkdir -p /root/.paddleocr/whl/det/ch && \
mkdir -p /root/.paddleocr/whl/rec/ch && \
mkdir -p /root/.paddleocr/whl/cls/ch && \
# 下载检测模型
wget https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_infer.tar \
-O /root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer.tar && \
tar -xf /root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer.tar -C /root/.paddleocr/whl/det/ch/ && \
# 下载识别模型
wget https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar \
-O /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar && \
tar -xf /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar -C /root/.paddleocr/whl/rec/ch/ && \
# 下载文本方向分类模型(用于判断文本是否需要旋转)
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar \
-O /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar && \
tar -xf /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar -C /root/.paddleocr/whl/cls/ && \
# 清理压缩包
rm -f /root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer.tar && \
rm -f /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar && \
rm -f /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar
# 复制源代码和生成脚本
COPY services/docreader/src/ /app/src/
COPY services/docreader/scripts/ /app/scripts/
# 确保模型目录存在
RUN ls -la /root/.paddleocr/whl/
# 生成 protobuf 代码
RUN chmod +x /app/scripts/generate_proto.sh && bash /app/scripts/generate_proto.sh
# =========================
# 运行阶段
# =========================
FROM python:3.10.18-bookworm AS runner
# 切换 apt 源到清华
RUN sed -i 's@http://deb.debian.org@https://mirrors.tuna.tsinghua.edu.cn@g' /etc/apt/sources.list.d/debian.sources && \
sed -i 's@http://security.debian.org@https://mirrors.tuna.tsinghua.edu.cn@g' /etc/apt/sources.list.d/debian.sources
WORKDIR /app
# 安装运行时依赖
RUN apt-get update && apt-get install -y \
libjpeg62-turbo \
libpq5 \
wget \
gnupg \
libgl1 \
libglib2.0-0 \
antiword \
supervisor \
vim \
tar \
dpkg \
libxinerama1 \
libfontconfig1 \
libdbus-glib-1-2 \
libcairo2 \
libcups2 \
libglu1-mesa \
libsm6 \
libreoffice \
curl \
&& rm -rf /var/lib/apt/lists/*
# 安装 grpc_health_probe
RUN GRPC_HEALTH_PROBE_VERSION=v0.4.24 && \
wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-amd64 && \
chmod +x /bin/grpc_health_probe
# 从构建阶段复制已安装的依赖和生成的代码
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin
COPY --from=builder /root/.paddleocr /root/.paddleocr
COPY --from=builder /app/src /app/src
# 安装 Playwright 浏览器
RUN python -m playwright install webkit
RUN python -m playwright install-deps webkit
# 设置 Python 路径
ENV PYTHONPATH=/app/src
RUN cd /app/src && python -m download_deps
# 创建supervisor配置
RUN mkdir -p /etc/supervisor/conf.d
COPY services/docreader/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
# 暴露 gRPC 端口
EXPOSE 50051
# 使用supervisor启动服务
CMD ["supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]