mirror of
https://github.com/Tencent/WeKnora.git
synced 2025-11-24 19:12:51 +08:00
chore(docreader): 重新组织模块文件
This commit is contained in:
@@ -0,0 +1,2 @@
|
||||
**/.venv/
|
||||
**/.python-version
|
||||
@@ -122,6 +122,9 @@ services:
|
||||
|
||||
docreader:
|
||||
image: wechatopenai/weknora-docreader:latest
|
||||
build:
|
||||
context: .
|
||||
dockerfile: docker/Dockerfile.docreader
|
||||
container_name: WeKnora-docreader
|
||||
ports:
|
||||
- "${DOCREADER_PORT:-50051}:50051"
|
||||
|
||||
@@ -53,12 +53,6 @@ RUN echo "检查本地protoc安装包..." && \
|
||||
rm -f ${PROTOC_PACKAGE}; \
|
||||
fi
|
||||
|
||||
# 复制依赖文件
|
||||
COPY services/docreader/requirements.txt .
|
||||
|
||||
# 安装依赖
|
||||
RUN pip cache purge && pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
# 预下载 PP-OCRv4 模型
|
||||
RUN mkdir -p /root/.paddleocr/whl/det/ch && \
|
||||
mkdir -p /root/.paddleocr/whl/rec/ch && \
|
||||
@@ -80,17 +74,20 @@ RUN mkdir -p /root/.paddleocr/whl/det/ch && \
|
||||
rm -f /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar && \
|
||||
rm -f /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar
|
||||
|
||||
# 复制依赖文件
|
||||
COPY docreader/pyproject.toml docreader/uv.lock ./
|
||||
RUN pip install uv --break-system-packages && \
|
||||
python -m uv sync --locked --no-dev
|
||||
|
||||
# 复制源代码和生成脚本
|
||||
COPY services/docreader/src/ /app/src/
|
||||
COPY services/docreader/scripts/ /app/scripts/
|
||||
COPY docreader .
|
||||
|
||||
# 生成 protobuf 代码
|
||||
RUN chmod +x scripts/generate_proto.sh && bash scripts/generate_proto.sh
|
||||
|
||||
# 确保模型目录存在
|
||||
RUN ls -la /root/.paddleocr/whl/
|
||||
|
||||
# 生成 protobuf 代码
|
||||
RUN chmod +x /app/scripts/generate_proto.sh && bash /app/scripts/generate_proto.sh
|
||||
|
||||
|
||||
# =========================
|
||||
# 运行阶段
|
||||
# =========================
|
||||
@@ -139,7 +136,10 @@ RUN GRPC_HEALTH_PROBE_VERSION=v0.4.24 && \
|
||||
chmod +x /bin/grpc_health_probe
|
||||
|
||||
# 从构建阶段复制已安装的依赖和生成的代码
|
||||
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
|
||||
ENV VIRTUAL_ENV=/app/.venv
|
||||
COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
|
||||
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
|
||||
|
||||
COPY --from=builder /usr/local/bin /usr/local/bin
|
||||
COPY --from=builder /root/.paddleocr /root/.paddleocr
|
||||
|
||||
@@ -147,14 +147,13 @@ COPY --from=builder /root/.paddleocr /root/.paddleocr
|
||||
RUN python -m playwright install webkit
|
||||
RUN python -m playwright install-deps webkit
|
||||
|
||||
COPY --from=builder /app/src /app/src
|
||||
# COPY docreader/scripts/download_deps.py download_deps.py
|
||||
# RUN python -m download_deps
|
||||
|
||||
# 设置 Python 路径
|
||||
ENV PYTHONPATH=/app/src
|
||||
RUN cd /app/src && python -m download_deps
|
||||
COPY --from=builder /app/ ./
|
||||
|
||||
# 暴露 gRPC 端口
|
||||
EXPOSE 50051
|
||||
|
||||
# 直接运行 Python 服务(日志输出到 stdout/stderr)
|
||||
CMD ["python", "/app/src/server/server.py"]
|
||||
CMD ["uv", "run", "main.py"]
|
||||
@@ -34,6 +34,7 @@ except Exception: # pragma: no cover
|
||||
# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values and cannot be encoded to UTF-8
|
||||
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]")
|
||||
|
||||
|
||||
def to_valid_utf8_text(s: Optional[str]) -> str:
|
||||
"""Return a UTF-8 safe string for protobuf.
|
||||
|
||||
@@ -42,9 +43,10 @@ def to_valid_utf8_text(s: Optional[str]) -> str:
|
||||
"""
|
||||
if not s:
|
||||
return ""
|
||||
s = _SURROGATE_RE.sub("\uFFFD", s)
|
||||
s = _SURROGATE_RE.sub("\ufffd", s)
|
||||
return s.encode("utf-8", errors="replace").decode("utf-8")
|
||||
|
||||
|
||||
def read_text_with_fallback(file_path: str) -> str:
|
||||
"""Read text from file supporting multiple encodings with graceful fallback.
|
||||
|
||||
@@ -67,6 +69,7 @@ def read_text_with_fallback(file_path: str) -> str:
|
||||
continue
|
||||
return raw.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
# Ensure no existing handlers
|
||||
for handler in logging.root.handlers[:]:
|
||||
logging.root.removeHandler(handler)
|
||||
@@ -88,6 +91,7 @@ MAX_MESSAGE_LENGTH = 50 * 1024 * 1024
|
||||
|
||||
parser = Parser()
|
||||
|
||||
|
||||
class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
@@ -127,29 +131,34 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
# Get Storage and VLM config from request
|
||||
storage_config = None
|
||||
vlm_config = None
|
||||
|
||||
|
||||
sc = request.read_config.storage_config
|
||||
# Keep parser-side key name as cos_config for backward compatibility
|
||||
storage_config = {
|
||||
'provider': 'minio' if sc.provider == 2 else 'cos',
|
||||
'region': sc.region,
|
||||
'bucket_name': sc.bucket_name,
|
||||
'access_key_id': sc.access_key_id,
|
||||
'secret_access_key': sc.secret_access_key,
|
||||
'app_id': sc.app_id,
|
||||
'path_prefix': sc.path_prefix,
|
||||
"provider": "minio" if sc.provider == 2 else "cos",
|
||||
"region": sc.region,
|
||||
"bucket_name": sc.bucket_name,
|
||||
"access_key_id": sc.access_key_id,
|
||||
"secret_access_key": sc.secret_access_key,
|
||||
"app_id": sc.app_id,
|
||||
"path_prefix": sc.path_prefix,
|
||||
}
|
||||
logger.info(f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}")
|
||||
|
||||
logger.info(
|
||||
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
|
||||
)
|
||||
|
||||
vlm_config = {
|
||||
'model_name': request.read_config.vlm_config.model_name,
|
||||
'base_url': request.read_config.vlm_config.base_url,
|
||||
'api_key': request.read_config.vlm_config.api_key or '',
|
||||
'interface_type': request.read_config.vlm_config.interface_type or 'openai',
|
||||
"model_name": request.read_config.vlm_config.model_name,
|
||||
"base_url": request.read_config.vlm_config.base_url,
|
||||
"api_key": request.read_config.vlm_config.api_key or "",
|
||||
"interface_type": request.read_config.vlm_config.interface_type
|
||||
or "openai",
|
||||
}
|
||||
logger.info(f"Using VLM config: model={vlm_config['model_name']}, "
|
||||
f"base_url={vlm_config['base_url']}, "
|
||||
f"interface_type={vlm_config['interface_type']}")
|
||||
logger.info(
|
||||
f"Using VLM config: model={vlm_config['model_name']}, "
|
||||
f"base_url={vlm_config['base_url']}, "
|
||||
f"interface_type={vlm_config['interface_type']}"
|
||||
)
|
||||
|
||||
chunking_config = ChunkingConfig(
|
||||
chunk_size=chunk_size,
|
||||
@@ -177,10 +186,12 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
logger.info(
|
||||
f"Successfully parsed file {request.file_name}, returning {len(result.chunks)} chunks"
|
||||
)
|
||||
|
||||
|
||||
# Build response, including image info
|
||||
response = ReadResponse(
|
||||
chunks=[self._convert_chunk_to_proto(chunk) for chunk in result.chunks]
|
||||
chunks=[
|
||||
self._convert_chunk_to_proto(chunk) for chunk in result.chunks
|
||||
]
|
||||
)
|
||||
logger.info(f"Response size: {response.ByteSize()} bytes")
|
||||
return response
|
||||
@@ -220,29 +231,34 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
# Get Storage and VLM config from request
|
||||
storage_config = None
|
||||
vlm_config = None
|
||||
|
||||
|
||||
sc = request.read_config.storage_config
|
||||
storage_config = {
|
||||
'provider': 'minio' if sc.provider == 2 else 'cos',
|
||||
'region': sc.region,
|
||||
'bucket_name': sc.bucket_name,
|
||||
'access_key_id': sc.access_key_id,
|
||||
'secret_access_key': sc.secret_access_key,
|
||||
'app_id': sc.app_id,
|
||||
'path_prefix': sc.path_prefix,
|
||||
"provider": "minio" if sc.provider == 2 else "cos",
|
||||
"region": sc.region,
|
||||
"bucket_name": sc.bucket_name,
|
||||
"access_key_id": sc.access_key_id,
|
||||
"secret_access_key": sc.secret_access_key,
|
||||
"app_id": sc.app_id,
|
||||
"path_prefix": sc.path_prefix,
|
||||
}
|
||||
logger.info(f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}")
|
||||
logger.info(
|
||||
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
|
||||
)
|
||||
|
||||
vlm_config = {
|
||||
'model_name': request.read_config.vlm_config.model_name,
|
||||
'base_url': request.read_config.vlm_config.base_url,
|
||||
'api_key': request.read_config.vlm_config.api_key or '',
|
||||
'interface_type': request.read_config.vlm_config.interface_type or 'openai',
|
||||
"model_name": request.read_config.vlm_config.model_name,
|
||||
"base_url": request.read_config.vlm_config.base_url,
|
||||
"api_key": request.read_config.vlm_config.api_key or "",
|
||||
"interface_type": request.read_config.vlm_config.interface_type
|
||||
or "openai",
|
||||
}
|
||||
logger.info(f"Using VLM config: model={vlm_config['model_name']}, "
|
||||
f"base_url={vlm_config['base_url']}, "
|
||||
f"interface_type={vlm_config['interface_type']}")
|
||||
|
||||
logger.info(
|
||||
f"Using VLM config: model={vlm_config['model_name']}, "
|
||||
f"base_url={vlm_config['base_url']}, "
|
||||
f"interface_type={vlm_config['interface_type']}"
|
||||
)
|
||||
|
||||
chunking_config = ChunkingConfig(
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
@@ -254,7 +270,9 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
|
||||
# Parse URL
|
||||
logger.info(f"Starting URL parsing process")
|
||||
result = self.parser.parse_url(request.url, request.title, chunking_config)
|
||||
result = self.parser.parse_url(
|
||||
request.url, request.title, chunking_config
|
||||
)
|
||||
if not result:
|
||||
error_msg = "Failed to parse URL"
|
||||
logger.error(error_msg)
|
||||
@@ -266,9 +284,11 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
logger.info(
|
||||
f"Successfully parsed URL {request.url}, returning {len(result.chunks)} chunks"
|
||||
)
|
||||
|
||||
|
||||
response = ReadResponse(
|
||||
chunks=[self._convert_chunk_to_proto(chunk) for chunk in result.chunks]
|
||||
chunks=[
|
||||
self._convert_chunk_to_proto(chunk) for chunk in result.chunks
|
||||
]
|
||||
)
|
||||
logger.info(f"Response size: {response.ByteSize()} bytes")
|
||||
return response
|
||||
@@ -280,7 +300,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
context.set_code(grpc.StatusCode.INTERNAL)
|
||||
context.set_details(str(e))
|
||||
return ReadResponse(error=str(e))
|
||||
|
||||
|
||||
def _convert_chunk_to_proto(self, chunk):
|
||||
"""Convert internal Chunk object to protobuf Chunk message
|
||||
Ensures all string fields are valid UTF-8 for protobuf (no lone surrogates).
|
||||
@@ -294,10 +314,12 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
start=getattr(chunk, "start", 0),
|
||||
end=getattr(chunk, "end", 0),
|
||||
)
|
||||
|
||||
|
||||
# If chunk has images attribute and is not empty, add image info
|
||||
if hasattr(chunk, "images") and chunk.images:
|
||||
logger.info(f"Adding {len(chunk.images)} images to chunk {getattr(chunk, 'seq', 0)}")
|
||||
logger.info(
|
||||
f"Adding {len(chunk.images)} images to chunk {getattr(chunk, 'seq', 0)}"
|
||||
)
|
||||
for img_info in chunk.images:
|
||||
# img_info expected as dict
|
||||
proto_image = Image(
|
||||
@@ -309,9 +331,10 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
end=int(img_info.get("end", 0) or 0),
|
||||
)
|
||||
proto_chunk.images.append(proto_image)
|
||||
|
||||
|
||||
return proto_chunk
|
||||
|
||||
|
||||
def init_ocr_engine(ocr_backend, ocr_config):
|
||||
"""Initialize OCR engine"""
|
||||
try:
|
||||
@@ -328,44 +351,46 @@ def init_ocr_engine(ocr_backend, ocr_config):
|
||||
return False
|
||||
|
||||
|
||||
def serve():
|
||||
|
||||
init_ocr_engine(os.getenv("OCR_BACKEND", "paddle"), {
|
||||
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
|
||||
})
|
||||
|
||||
def main():
|
||||
init_ocr_engine(
|
||||
os.getenv("OCR_BACKEND", "paddle"),
|
||||
{
|
||||
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
|
||||
},
|
||||
)
|
||||
|
||||
# Set max number of worker threads
|
||||
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
|
||||
logger.info(f"Starting DocReader service with {max_workers} worker threads")
|
||||
|
||||
|
||||
# Get port number
|
||||
port = os.environ.get("GRPC_PORT", "50051")
|
||||
|
||||
|
||||
# Create server
|
||||
server = grpc.server(
|
||||
futures.ThreadPoolExecutor(max_workers=max_workers),
|
||||
options=[
|
||||
('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
|
||||
('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
|
||||
("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
|
||||
("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
# Register services
|
||||
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
|
||||
|
||||
|
||||
# Register health check service
|
||||
health_servicer = HealthServicer()
|
||||
health_pb2_grpc.add_HealthServicer_to_server(health_servicer, server)
|
||||
|
||||
|
||||
# Set listen address
|
||||
server.add_insecure_port(f"[::]:{port}")
|
||||
|
||||
|
||||
# Start service
|
||||
server.start()
|
||||
|
||||
|
||||
logger.info(f"Server started on port {port}")
|
||||
logger.info("Server is ready to accept connections")
|
||||
|
||||
|
||||
try:
|
||||
# Wait for service termination
|
||||
server.wait_for_termination()
|
||||
@@ -373,5 +398,6 @@ def serve():
|
||||
logger.info("Received termination signal, shutting down server")
|
||||
server.stop(0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
serve()
|
||||
main()
|
||||
55
docreader/proto/docreader_pb2.py
Normal file
55
docreader/proto/docreader_pb2.py
Normal file
@@ -0,0 +1,55 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# NO CHECKED-IN PROTOBUF GENCODE
|
||||
# source: docreader.proto
|
||||
# Protobuf Python Version: 6.31.1
|
||||
"""Generated protocol buffer code."""
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import descriptor_pool as _descriptor_pool
|
||||
from google.protobuf import runtime_version as _runtime_version
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
from google.protobuf.internal import builder as _builder
|
||||
_runtime_version.ValidateProtobufRuntimeVersion(
|
||||
_runtime_version.Domain.PUBLIC,
|
||||
6,
|
||||
31,
|
||||
1,
|
||||
'',
|
||||
'docreader.proto'
|
||||
)
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
|
||||
|
||||
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0f\x64ocreader.proto\x12\tdocreader\"\xb9\x01\n\rStorageConfig\x12,\n\x08provider\x18\x01 \x01(\x0e\x32\x1a.docreader.StorageProvider\x12\x0e\n\x06region\x18\x02 \x01(\t\x12\x13\n\x0b\x62ucket_name\x18\x03 \x01(\t\x12\x15\n\raccess_key_id\x18\x04 \x01(\t\x12\x19\n\x11secret_access_key\x18\x05 \x01(\t\x12\x0e\n\x06\x61pp_id\x18\x06 \x01(\t\x12\x13\n\x0bpath_prefix\x18\x07 \x01(\t\"Z\n\tVLMConfig\x12\x12\n\nmodel_name\x18\x01 \x01(\t\x12\x10\n\x08\x62\x61se_url\x18\x02 \x01(\t\x12\x0f\n\x07\x61pi_key\x18\x03 \x01(\t\x12\x16\n\x0einterface_type\x18\x04 \x01(\t\"\xc2\x01\n\nReadConfig\x12\x12\n\nchunk_size\x18\x01 \x01(\x05\x12\x15\n\rchunk_overlap\x18\x02 \x01(\x05\x12\x12\n\nseparators\x18\x03 \x03(\t\x12\x19\n\x11\x65nable_multimodal\x18\x04 \x01(\x08\x12\x30\n\x0estorage_config\x18\x05 \x01(\x0b\x32\x18.docreader.StorageConfig\x12(\n\nvlm_config\x18\x06 \x01(\x0b\x32\x14.docreader.VLMConfig\"\x91\x01\n\x13ReadFromFileRequest\x12\x14\n\x0c\x66ile_content\x18\x01 \x01(\x0c\x12\x11\n\tfile_name\x18\x02 \x01(\t\x12\x11\n\tfile_type\x18\x03 \x01(\t\x12*\n\x0bread_config\x18\x04 \x01(\x0b\x32\x15.docreader.ReadConfig\x12\x12\n\nrequest_id\x18\x05 \x01(\t\"p\n\x12ReadFromURLRequest\x12\x0b\n\x03url\x18\x01 \x01(\t\x12\r\n\x05title\x18\x02 \x01(\t\x12*\n\x0bread_config\x18\x03 \x01(\x0b\x32\x15.docreader.ReadConfig\x12\x12\n\nrequest_id\x18\x04 \x01(\t\"i\n\x05Image\x12\x0b\n\x03url\x18\x01 \x01(\t\x12\x0f\n\x07\x63\x61ption\x18\x02 \x01(\t\x12\x10\n\x08ocr_text\x18\x03 \x01(\t\x12\x14\n\x0coriginal_url\x18\x04 \x01(\t\x12\r\n\x05start\x18\x05 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x06 \x01(\x05\"c\n\x05\x43hunk\x12\x0f\n\x07\x63ontent\x18\x01 \x01(\t\x12\x0b\n\x03seq\x18\x02 \x01(\x05\x12\r\n\x05start\x18\x03 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x04 \x01(\x05\x12 \n\x06images\x18\x05 \x03(\x0b\x32\x10.docreader.Image\"?\n\x0cReadResponse\x12 \n\x06\x63hunks\x18\x01 \x03(\x0b\x32\x10.docreader.Chunk\x12\r\n\x05\x65rror\x18\x02 \x01(\t*G\n\x0fStorageProvider\x12 \n\x1cSTORAGE_PROVIDER_UNSPECIFIED\x10\x00\x12\x07\n\x03\x43OS\x10\x01\x12\t\n\x05MINIO\x10\x02\x32\x9f\x01\n\tDocReader\x12I\n\x0cReadFromFile\x12\x1e.docreader.ReadFromFileRequest\x1a\x17.docreader.ReadResponse\"\x00\x12G\n\x0bReadFromURL\x12\x1d.docreader.ReadFromURLRequest\x1a\x17.docreader.ReadResponse\"\x00\x42\x35Z3github.com/Tencent/WeKnora/internal/docreader/protob\x06proto3')
|
||||
|
||||
_globals = globals()
|
||||
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
|
||||
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'docreader_pb2', _globals)
|
||||
if not _descriptor._USE_C_DESCRIPTORS:
|
||||
_globals['DESCRIPTOR']._loaded_options = None
|
||||
_globals['DESCRIPTOR']._serialized_options = b'Z3github.com/Tencent/WeKnora/internal/docreader/proto'
|
||||
_globals['_STORAGEPROVIDER']._serialized_start=1042
|
||||
_globals['_STORAGEPROVIDER']._serialized_end=1113
|
||||
_globals['_STORAGECONFIG']._serialized_start=31
|
||||
_globals['_STORAGECONFIG']._serialized_end=216
|
||||
_globals['_VLMCONFIG']._serialized_start=218
|
||||
_globals['_VLMCONFIG']._serialized_end=308
|
||||
_globals['_READCONFIG']._serialized_start=311
|
||||
_globals['_READCONFIG']._serialized_end=505
|
||||
_globals['_READFROMFILEREQUEST']._serialized_start=508
|
||||
_globals['_READFROMFILEREQUEST']._serialized_end=653
|
||||
_globals['_READFROMURLREQUEST']._serialized_start=655
|
||||
_globals['_READFROMURLREQUEST']._serialized_end=767
|
||||
_globals['_IMAGE']._serialized_start=769
|
||||
_globals['_IMAGE']._serialized_end=874
|
||||
_globals['_CHUNK']._serialized_start=876
|
||||
_globals['_CHUNK']._serialized_end=975
|
||||
_globals['_READRESPONSE']._serialized_start=977
|
||||
_globals['_READRESPONSE']._serialized_end=1040
|
||||
_globals['_DOCREADER']._serialized_start=1116
|
||||
_globals['_DOCREADER']._serialized_end=1275
|
||||
# @@protoc_insertion_point(module_scope)
|
||||
@@ -3,9 +3,9 @@
|
||||
import grpc
|
||||
import warnings
|
||||
|
||||
import docreader_pb2 as docreader__pb2
|
||||
from . import docreader_pb2 as docreader__pb2
|
||||
|
||||
GRPC_GENERATED_VERSION = '1.74.0'
|
||||
GRPC_GENERATED_VERSION = '1.76.0'
|
||||
GRPC_VERSION = grpc.__version__
|
||||
_version_not_supported = False
|
||||
|
||||
@@ -18,7 +18,7 @@ except ImportError:
|
||||
if _version_not_supported:
|
||||
raise RuntimeError(
|
||||
f'The grpc package installed is at version {GRPC_VERSION},'
|
||||
+ f' but the generated code in docreader_pb2_grpc.py depends on'
|
||||
+ ' but the generated code in docreader_pb2_grpc.py depends on'
|
||||
+ f' grpcio>={GRPC_GENERATED_VERSION}.'
|
||||
+ f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
|
||||
+ f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
|
||||
35
docreader/pyproject.toml
Normal file
35
docreader/pyproject.toml
Normal file
@@ -0,0 +1,35 @@
|
||||
[project]
|
||||
name = "docreader"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10.18"
|
||||
dependencies = [
|
||||
"antiword>=0.1.0",
|
||||
"asyncio>=4.0.0",
|
||||
"beautifulsoup4>=4.14.2",
|
||||
"cos-python-sdk-v5>=1.9.38",
|
||||
"goose3[all]>=3.1.20",
|
||||
"grpcio>=1.76.0",
|
||||
"grpcio-health-checking>=1.76.0",
|
||||
"grpcio-tools>=1.76.0",
|
||||
"lxml>=6.0.2",
|
||||
"markdown>=3.10",
|
||||
"markdownify>=1.2.0",
|
||||
"minio>=7.2.18",
|
||||
"mistletoe>=1.5.0",
|
||||
"ollama>=0.6.0",
|
||||
"openai>=2.7.1",
|
||||
"paddleocr>=2.10.0,<3.0.0",
|
||||
"paddlepaddle>=3.0.0,<4.0.0",
|
||||
"pdfplumber>=0.11.7",
|
||||
"pillow>=12.0.0",
|
||||
"playwright>=1.55.0",
|
||||
"protobuf>=6.33.0",
|
||||
"pypdf>=6.1.3",
|
||||
"pypdf2>=3.0.1",
|
||||
"python-docx>=1.2.0",
|
||||
"requests>=2.32.5",
|
||||
"textract==1.5.0",
|
||||
"urllib3>=2.5.0",
|
||||
]
|
||||
@@ -2,9 +2,9 @@
|
||||
set -x
|
||||
|
||||
# 设置目录
|
||||
PROTO_DIR="src/proto"
|
||||
PYTHON_OUT="src/proto"
|
||||
GO_OUT="src/proto"
|
||||
PROTO_DIR="proto"
|
||||
PYTHON_OUT="proto"
|
||||
GO_OUT="proto"
|
||||
|
||||
# 生成Python代码
|
||||
python3 -m grpc_tools.protoc -I${PROTO_DIR} \
|
||||
@@ -22,10 +22,10 @@ protoc -I${PROTO_DIR} --go_out=${GO_OUT} \
|
||||
# 修复Python导入问题(MacOS兼容版本)
|
||||
if [ "$(uname)" == "Darwin" ]; then
|
||||
# MacOS版本
|
||||
sed -i '' 's/import docreader_pb2/from . import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
|
||||
sed -i '' 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
|
||||
else
|
||||
# Linux版本
|
||||
sed -i 's/import docreader_pb2/from . import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
|
||||
sed -i 's/from . import docreader_pb2/from proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
|
||||
fi
|
||||
|
||||
echo "Proto files generated successfully!"
|
||||
|
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.8 KiB |
3228
docreader/uv.lock
generated
Normal file
3228
docreader/uv.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
2228
services/docreader/poetry.lock
generated
2228
services/docreader/poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -1,19 +0,0 @@
|
||||
[project]
|
||||
name = "docreader"
|
||||
version = "0.1.0"
|
||||
description = ""
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.12,<4.0"
|
||||
dependencies = [
|
||||
"paddlepaddle (>=3.0.0,<4.0.0)",
|
||||
"paddleocr (>=2.10.0,<3.0.0)",
|
||||
"playwright (>=1.51.0,<2.0.0)",
|
||||
"setuptools (>=79.0.0,<80.0.0)",
|
||||
"textract (>=1.6.5,<2.0.0)",
|
||||
"antiword (>=0.3.2,<0.4.0)"
|
||||
]
|
||||
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
@@ -1,32 +0,0 @@
|
||||
grpcio
|
||||
grpcio-tools
|
||||
grpcio-health-checking
|
||||
protobuf
|
||||
python-docx
|
||||
PyPDF2
|
||||
requests
|
||||
Pillow
|
||||
beautifulsoup4
|
||||
lxml
|
||||
playwright
|
||||
asyncio
|
||||
urllib3
|
||||
markdownify
|
||||
mistletoe
|
||||
goose3[all]
|
||||
paddleocr>=2.10.0,<3.0.0
|
||||
markdown
|
||||
pypdf
|
||||
cos-python-sdk-v5
|
||||
minio
|
||||
textract
|
||||
antiword
|
||||
openai
|
||||
ollama
|
||||
pdfplumber
|
||||
|
||||
--extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
||||
paddlepaddle>=3.0.0,<4.0.0
|
||||
|
||||
# --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
|
||||
# paddlepaddle-gpu==3.0.0
|
||||
@@ -1,32 +0,0 @@
|
||||
[supervisord]
|
||||
nodaemon=true
|
||||
logfile=/var/log/supervisord.log
|
||||
logfile_maxbytes=50MB
|
||||
logfile_backups=10
|
||||
loglevel=info
|
||||
pidfile=/var/run/supervisord.pid
|
||||
user=root
|
||||
|
||||
[program:docreader]
|
||||
command=/bin/sh -c "python /app/src/server/server.py 2>&1"
|
||||
directory=/app
|
||||
autostart=true
|
||||
autorestart=true
|
||||
startretries=5
|
||||
startsecs=5
|
||||
priority=10
|
||||
redirect_stderr=true
|
||||
stdout_logfile=/var/log/docreader.log
|
||||
stdout_logfile_maxbytes=50MB
|
||||
stdout_logfile_backups=10
|
||||
environment=PYTHONPATH="/app/src"
|
||||
|
||||
[unix_http_server]
|
||||
file=/var/run/supervisor.sock
|
||||
chmod=0700
|
||||
|
||||
[rpcinterface:supervisor]
|
||||
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
|
||||
|
||||
[supervisorctl]
|
||||
serverurl=unix:///var/run/supervisor.sock
|
||||
Reference in New Issue
Block a user