# Source: WeKnora/docker/Dockerfile.docreader

# =========================
# Build stage
# =========================
FROM python:3.10.18-bookworm AS builder

# Point apt at the Tsinghua (TUNA) mirror by rewriting both the main and the
# security archive URLs in the deb822 sources file.
RUN sed -i \
        -e 's@http://deb.debian.org@https://mirrors.tuna.tsinghua.edu.cn@g' \
        -e 's@http://security.debian.org@https://mirrors.tuna.tsinghua.edu.cn@g' \
        /etc/apt/sources.list.d/debian.sources

WORKDIR /app
# Install the build-time system packages, then remove the apt package lists in
# the same layer so they are not stored in the image.
RUN apt-get update \
    && apt-get install -y \
        antiword \
        curl \
        gcc \
        libffi-dev \
        libgl1 \
        libglib2.0-0 \
        libjpeg-dev \
        libpq-dev \
        python3-dev \
        unzip \
        wget \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*
# Install protoc 3.19.4 into /usr/local.
# If the release archive has been placed under packages/ in the build context
# it is unpacked directly (offline install); otherwise it is downloaded from
# the GitHub release page. Other vendored packages can be added to packages/
# as needed.
COPY packages/ /app/packages/
RUN PROTOC_ZIP=protoc-3.19.4-linux-x86_64.zip && \
    echo "检查本地protoc安装包..." && \
    if [ -f "/app/packages/${PROTOC_ZIP}" ]; then \
        echo "发现本地protoc安装包将进行离线安装"; \
        # Offline: unpack exactly the archive that was tested for above. A glob
        # such as protoc-*.zip could match several files, in which case unzip
        # would treat the extra matches as member names and fail.
        unzip -o "/app/packages/${PROTOC_ZIP}" -d /usr/local; \
    else \
        echo "未发现本地protoc安装包将进行在线安装"; \
        # Online: -f makes curl exit non-zero on an HTTP error instead of
        # saving the error page under the archive name.
        curl -fLO "https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/${PROTOC_ZIP}" && \
        unzip -o "${PROTOC_ZIP}" -d /usr/local && \
        rm -f "${PROTOC_ZIP}"; \
    fi && \
    chmod +x /usr/local/bin/protoc
# Copy the dependency manifest
COPY services/docreader/requirements.txt .

# Install the Python dependencies from the Tsinghua PyPI mirror, without
# keeping a pip cache.
RUN pip cache purge \
    && pip install \
        --no-cache-dir \
        --requirement requirements.txt \
        --index-url https://pypi.tuna.tsinghua.edu.cn/simple
# Pre-download the PP-OCRv4 models into /root/.paddleocr
RUN mkdir -p \
        /root/.paddleocr/whl/det/ch \
        /root/.paddleocr/whl/rec/ch \
        /root/.paddleocr/whl/cls/ch && \
    # Detection model
    wget -O /root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer.tar \
        https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_infer.tar && \
    tar -xf /root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer.tar \
        -C /root/.paddleocr/whl/det/ch/ && \
    # Recognition model
    wget -O /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar \
        https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar && \
    tar -xf /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar \
        -C /root/.paddleocr/whl/rec/ch/ && \
    # Text-direction classifier (decides whether text needs to be rotated);
    # note that it is unpacked directly under cls/, not cls/ch/
    wget -O /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar \
        https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar && \
    tar -xf /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar \
        -C /root/.paddleocr/whl/cls/ && \
    # Remove the downloaded archives once they are unpacked
    rm -f \
        /root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer.tar \
        /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar \
        /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar
# Copy the helper scripts and the service sources
COPY services/docreader/scripts/ /app/scripts/
COPY services/docreader/src/ /app/src/

# List the model directory; the build stops here if it does not exist
RUN ls -al /root/.paddleocr/whl/

# Generate the protobuf code
RUN chmod +x /app/scripts/generate_proto.sh \
    && bash /app/scripts/generate_proto.sh
# =========================
# Runtime stage
# =========================
FROM python:3.10.18-bookworm AS runner

# Point apt at the Tsinghua (TUNA) mirror by rewriting both the main and the
# security archive URLs in the deb822 sources file.
RUN sed -i \
        -e 's@http://deb.debian.org@https://mirrors.tuna.tsinghua.edu.cn@g' \
        -e 's@http://security.debian.org@https://mirrors.tuna.tsinghua.edu.cn@g' \
        /etc/apt/sources.list.d/debian.sources

WORKDIR /app
# Install the runtime system packages, then remove the apt package lists in
# the same layer so they are not stored in the image.
RUN apt-get update \
    && apt-get install -y \
        antiword \
        curl \
        dpkg \
        gnupg \
        libcairo2 \
        libcups2 \
        libdbus-glib-1-2 \
        libfontconfig1 \
        libgl1 \
        libglib2.0-0 \
        libglu1-mesa \
        libjpeg62-turbo \
        libpq5 \
        libreoffice \
        libsm6 \
        libxinerama1 \
        supervisor \
        tar \
        vim \
        wget \
    && rm -rf /var/lib/apt/lists/*
# Install grpc_health_probe (the linux-amd64 release binary) into /bin
RUN GRPC_HEALTH_PROBE_VERSION=v0.4.24 \
    && wget --quiet \
        --output-document=/bin/grpc_health_probe \
        https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-amd64 \
    && chmod +x /bin/grpc_health_probe
# Bring over what the build stage produced: the pre-downloaded OCR models, the
# installed Python packages, everything under /usr/local/bin, and the service
# sources under /app/src.
COPY --from=builder /root/.paddleocr /root/.paddleocr
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin
COPY --from=builder /app/src /app/src
# Install the Playwright WebKit browser and the system libraries it needs.
# Both steps share one layer so that the apt package lists refreshed by
# `install-deps` can be removed before the layer is committed; a cleanup in a
# later RUN would not shrink the image.
RUN python -m playwright install webkit \
    && python -m playwright install-deps webkit \
    && rm -rf /var/lib/apt/lists/*
# Put the service sources on the Python import path
ENV PYTHONPATH=/app/src

# Run the download_deps module from the source directory, then restore /app as
# the working directory for the remaining instructions and the container.
WORKDIR /app/src
RUN python -m download_deps
WORKDIR /app
# Provide the supervisor configuration
RUN mkdir --parents /etc/supervisor/conf.d
COPY services/docreader/supervisord.conf /etc/supervisor/conf.d/supervisord.conf

# gRPC port
EXPOSE 50051/tcp

# Start the service under supervisor
CMD ["supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]