Compare commits

82 commits:

3e08d3d9d8, a0685a22db, b9f264b883, 154025f723, 587d1b2bd3, 3a2c86df5b, a910bae6cd, ddbdae686f, 4fdbec17a7, 2d66abedf0,
af620806e0, a625eff525, ef69e2aed5, 07c3453e1a, 0032a9185c, da640d1d33, 1fd2de5a64, c1f731e026, 0d790ffedc, 6d547131b6,
8a40377a4a, a1d0ccaa09, ca704fa054, 02b78a5908, de96a52d54, f24cd817cb, 4824e41361, bfd4fffbe3, a2902de6ce, a5c3623a02,
8f723b38fb, 7973128f4c, 8ed050b8ec, 4ccbd2a127, 512910584b, cd7e02e54a, c9b1f43ed7, 76fc64a807, 947899ff10, 5e0a99b127,
b04566be32, 0157eb25bd, 91e65d6445, c589a911dc, 66aec78960, 76fbfdf8ac, 4137a63852, d28f805707, 2e395864b9, 4005aa3ded,
5e22f96d37, 2237e1ee55, b11df52cfb, c3744866fd, c2d52a9374, 81bd2e6c2c, 0908f9c487, 1aac37d3fd, cd249df8c8, 092b30af3e,
74c121f7fb, 78088057fb, bff0e742fa, 6598baab2e, 6f6ca84dae, 8a2b38da6f, 79b9315758, b3c43b2180, bfea6775ee, 2241127a41,
7cfae7e0d3, 19d2493afc, 0e1d7edca3, fd6c50059e, 7775559a9b, 2b6cbee1b6, 4214e6782b, 3f8a1d20c1, 7efa173812, 44e0e9ecb8,
820aeacbba, daa5e8853a
@@ -0,0 +1,2 @@
**/.venv/
**/.python-version
.env.example (100 lines changed)

@@ -23,9 +23,14 @@ STORAGE_TYPE=local
# Stream processing backend (memory/redis)
STREAM_MANAGER_TYPE=redis

# Main database configuration
# Database port, default 5432
DB_PORT=5432
# Application service port, default 8080
APP_PORT=8080

# Frontend service port, default 80
FRONTEND_PORT=80

# Document parsing module port, default 50051
DOCREADER_PORT=50051

# Database username
DB_USER=postgres

@@ -37,9 +42,6 @@ DB_PASSWORD=postgres123!@#
DB_NAME=WeKnora

# If redis is used as the stream processing backend, configure the following parameters
# Redis port, default 6379
REDIS_PORT=6379

# Redis password; leave empty if no password is set
REDIS_PASSWORD=redis123!@#

@@ -57,9 +59,11 @@ TENANT_AES_KEY=weknorarag-api-key-secret-secret
# Whether to enable knowledge graph construction and retrieval (construction calls the LLM and is time-consuming)
ENABLE_GRAPH_RAG=false

MINIO_PORT=9000
# MinIO port
# MINIO_PORT=9000

MINIO_CONSOLE_PORT=9001
# MinIO console port
# MINIO_CONSOLE_PORT=9001

# Embedding concurrency; reduce this value if 429 errors occur
CONCURRENCY_POOL_SIZE=5

@@ -112,78 +116,14 @@ COS_ENABLE_OLD_DOMAIN=true
# If a web proxy is needed for parsing network connections, configure the following parameter
# WEB_PROXY=your_web_proxy

##############################################################
# Neo4j switch
# NEO4J_ENABLE=false

###### Note: the settings below no longer take effect; they are now completed in the Web "configuration initialization" stage #########
# Neo4j access address
# NEO4J_URI=neo4j://neo4j:7687

# Neo4j username and password
# NEO4J_USERNAME=neo4j

# # Initialize the default tenant and knowledge base
# # Tenant ID, usually a string
# INIT_TEST_TENANT_ID=1

# # Knowledge base ID, usually a string
# INIT_TEST_KNOWLEDGE_BASE_ID=kb-00000001

# # LLM Model
# # Name of the LLM model to use
# # Defaults to Ollama's Qwen3 8B model; ollama handles model download and loading automatically
# # To use another model, replace this with the actual model name
# INIT_LLM_MODEL_NAME=qwen3:8b

# # Access URL of the LLM model
# # URLs of third-party model services are supported
# # Can be left empty when using the local Ollama service; ollama handles it automatically
# # INIT_LLM_MODEL_BASE_URL=your_llm_model_base_url

# # API key of the LLM model; set it if authentication is required
# # API keys of third-party model services are supported
# # Can be left empty when using the local Ollama service; ollama handles it automatically
# # INIT_LLM_MODEL_API_KEY=your_llm_model_api_key

# # Embedding Model
# # Name of the embedding model to use
# # Defaults to the nomic-embed-text model, which supports text embedding
# # To use another model, replace this with the actual model name
# INIT_EMBEDDING_MODEL_NAME=nomic-embed-text

# # Vector dimension of the embedding model
# INIT_EMBEDDING_MODEL_DIMENSION=768

# # ID of the embedding model, usually a string
# INIT_EMBEDDING_MODEL_ID=builtin:nomic-embed-text:768

# # Access URL of the embedding model
# # URLs of third-party model services are supported
# # Can be left empty when using the local Ollama service; ollama handles it automatically
# # INIT_EMBEDDING_MODEL_BASE_URL=your_embedding_model_base_url

# # API key of the embedding model; set it if authentication is required
# # API keys of third-party model services are supported
# # Can be left empty when using the local Ollama service; ollama handles it automatically
# # INIT_EMBEDDING_MODEL_API_KEY=your_embedding_model_api_key

# # Rerank Model (optional)
# # For RAG, a rerank model plays an important role in improving document search accuracy
# # ollama currently does not support running rerank models
# # Name of the rerank model to use
# # INIT_RERANK_MODEL_NAME=your_rerank_model_name

# # Access URL of the rerank model
# # URLs of third-party model services are supported
# # INIT_RERANK_MODEL_BASE_URL=your_rerank_model_base_url

# # API key of the rerank model; set it if authentication is required
# # API keys of third-party model services are supported
# # INIT_RERANK_MODEL_API_KEY=your_rerank_model_api_key

# # VLM_MODEL_NAME: name of the multimodal model to use
# # Used for parsing image data
# # VLM_MODEL_NAME=your_vlm_model_name

# # VLM_MODEL_BASE_URL: access URL of the multimodal model
# # URLs of third-party model services are supported
# # VLM_MODEL_BASE_URL=your_vlm_model_base_url

# # VLM_MODEL_API_KEY: API key of the multimodal model
# # API keys of third-party model services are supported
# # VLM_MODEL_API_KEY=your_vlm_model_api_key
# Neo4j password
# NEO4J_PASSWORD=password
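The MinIO and Neo4j settings above only take effect when the corresponding services are actually started. As the README changes later in this comparison show, the optional backends are gated behind Docker Compose profiles, for example:

```bash
# Optional services are enabled per profile (see the README diff below).
docker-compose --profile minio up -d                  # MinIO object storage
docker-compose --profile neo4j up -d                  # Neo4j knowledge graph store
docker-compose --profile neo4j --profile minio up -d  # combine several profiles
```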
.github/ISSUE_TEMPLATE/bug_report.yml (13 lines changed, vendored)

@@ -49,15 +49,7 @@ body:

Please follow the steps below to collect the relevant logs:

**1. Application module logs:**
```bash
docker exec -it WeKnora-app tail -f /var/log/WeKnora.log
```

**2. Document parsing module logs:**
```bash
docker exec -it WeKnora-docreader tail -f /var/log/docreader.log
```
docker compose logs -f --tail=1000 app docreader postgres

Please reproduce the issue, collect the relevant logs, and then paste them into the log field below.

@@ -68,8 +60,7 @@ body:
description: Please collect and paste the relevant logs following the guide above
placeholder: |
Please paste the logs collected from the following commands:
- docker exec -it WeKnora-app tail -f /var/log/WeKnora.log
- docker exec -it WeKnora-docreader tail -f /var/log/docreader.log
docker compose logs -f --tail=1000 app docreader postgres
render: shell

- type: input
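When attaching logs to an issue, it can be easier to capture a bounded snapshot into a file instead of following the live stream. One way to do that with the same services (a convenience suggestion, not part of the template itself):

```bash
# Capture the last 1000 lines from the core services into a file to attach to the issue.
docker compose logs --no-color --tail=1000 app docreader postgres > weknora-logs.txt
```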
.github/ISSUE_TEMPLATE/question.yml (8 lines changed, vendored)

@@ -68,14 +68,8 @@ body:

If the question involves an error or requires debugging, please collect the relevant logs:

**Application module logs:**
```bash
docker exec -it WeKnora-app tail -f /var/log/WeKnora.log
```

**Document parsing module logs:**
```bash
docker exec -it WeKnora-docreader tail -f /var/log/docreader.log
docker compose logs -f --tail=1000 app docreader postgres
```

- type: textarea
.github/workflows/docker-image.yml (223 lines changed, vendored)

@@ -1,6 +1,8 @@
name: Build and Push Docker Image
on:
  push:
    tags:
      - "v*"
    branches:
      - main

@@ -9,44 +11,217 @@ concurrency:
  cancel-in-progress: false

jobs:
  build-app:
  build-ui:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        include:
          - service_name: ui
            file: frontend/Dockerfile
            context: ./frontend
            platform: linux/amd64,linux/arm64
          - service_name: app
            file: docker/Dockerfile.app
            context: .
            platform: linux/amd64,linux/arm64
          - service_name: docreader
            file: docker/Dockerfile.docreader
            context: .
            platform: linux/amd64,linux/arm64
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Docker Hub
        uses: docker/login-action@v2
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      - name: Build ${{ matrix.service_name }} Image
      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ secrets.DOCKERHUB_USERNAME }}/weknora-ui

      - name: Build ui Image
        uses: docker/build-push-action@v3
        with:
          push: true
          platforms: linux/amd64,linux/arm64
          file: frontend/Dockerfile
          context: ./frontend
          labels: ${{ steps.meta.outputs.labels }}
          tags: ${{ steps.meta.outputs.tags }}
          cache-from: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/weknora-ui:cache
          cache-to: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/weknora-ui:cache,mode=max

  build-docreader:
    runs-on: ubuntu-latest
    steps:
      - name: Free Disk Space
        uses: jlumbroso/free-disk-space@main
        with:
          # this might remove tools that are actually needed,
          # if set to "true" but frees about 6 GB
          tool-cache: false

          # all of these default to true, but feel free to set to
          # "false" if necessary for your workflow
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          docker-images: true
          swap-storage: true

      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ secrets.DOCKERHUB_USERNAME }}/weknora-docreader

      - name: Build docreader Image
        uses: docker/build-push-action@v3
        with:
          push: true
          platforms: linux/amd64,linux/arm64
          file: docker/Dockerfile.docreader
          context: .
          labels: ${{ steps.meta.outputs.labels }}
          tags: ${{ steps.meta.outputs.tags }}
          cache-from: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/weknora-docreader:cache
          cache-to: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/weknora-docreader:cache,mode=max

  build-app:
    strategy:
      matrix:
        include:
          - arch: amd64
            platform: linux/amd64
            runs: ubuntu-latest
          - arch: arm64
            platform: linux/arm64
            runs: ubuntu-24.04-arm
    runs-on: ${{ matrix.runs }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
        id: setup-buildx

      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ secrets.DOCKERHUB_USERNAME }}/weknora-app

      - name: Prepare version info
        id: version
        run: |
          # Use the unified version management script
          eval "$(./scripts/get_version.sh env)"
          echo "version=$VERSION" >> $GITHUB_OUTPUT
          echo "commit_id=$COMMIT_ID" >> $GITHUB_OUTPUT
          echo "build_time=$BUILD_TIME" >> $GITHUB_OUTPUT
          echo "go_version=$GO_VERSION" >> $GITHUB_OUTPUT

          platform=${{ matrix.platform }}
          echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
          # Show version info
          ./scripts/get_version.sh info

      - name: Build Cache for Docker
        uses: actions/cache@v4
        id: cache
        with:
          path: go-pkg-mod
          key: ${{ env.PLATFORM_PAIR }}-go-build-cache-${{ hashFiles('**/go.sum') }}

      - name: Inject go-build-cache
        uses: reproducible-containers/buildkit-cache-dance@v3
        with:
          builder: ${{ steps.setup-buildx.outputs.name }}
          cache-map: |
            {
              "go-pkg-mod": "/go/pkg/mod"
            }
          skip-extraction: ${{ steps.cache.outputs.cache-hit }}

      - name: Build app Image
        id: build
        uses: docker/build-push-action@v3
        with:
          push: true
          platforms: ${{ matrix.platform }}
          file: ${{ matrix.file }}
          context: ${{ matrix.context }}
          tags: ${{ secrets.DOCKERHUB_USERNAME }}/weknora-${{ matrix.service_name }}:latest
          file: docker/Dockerfile.app
          context: .
          build-args: |
            ${{ format('VERSION_ARG={0}', steps.version.outputs.version) }}
            ${{ format('COMMIT_ID_ARG={0}', steps.version.outputs.commit_id) }}
            ${{ format('BUILD_TIME_ARG={0}', steps.version.outputs.build_time) }}
            ${{ format('GO_VERSION_ARG={0}', steps.version.outputs.go_version) }}
          labels: ${{ steps.meta.outputs.labels }}
          tags: ${{ secrets.DOCKERHUB_USERNAME }}/weknora-app
          cache-from: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/weknora-app:cache-${{ env.PLATFORM_PAIR }}
          cache-to: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/weknora-app:cache-${{ env.PLATFORM_PAIR }},mode=max
          outputs: type=image,push-by-digest=true,name-canonical=true,push=true

      - name: Export digest
        run: |
          mkdir -p ${{ runner.temp }}/digests
          digest="${{ steps.build.outputs.digest }}"
          touch "${{ runner.temp }}/digests/${digest#sha256:}"

      - name: Upload digest
        uses: actions/upload-artifact@v4
        with:
          name: digests-${{ env.PLATFORM_PAIR }}
          path: ${{ runner.temp }}/digests/*
          if-no-files-found: error
          retention-days: 1

  merge:
    runs-on: ubuntu-latest
    needs:
      - build-app
    steps:
      - name: Download digests
        uses: actions/download-artifact@v4
        with:
          path: ${{ runner.temp }}/digests
          pattern: digests-*
          merge-multiple: true

      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ secrets.DOCKERHUB_USERNAME }}/weknora-app

      - name: Create manifest list and push
        working-directory: ${{ runner.temp }}/digests
        run: |
          docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
            $(printf '${{ secrets.DOCKERHUB_USERNAME }}/weknora-app@sha256:%s ' *)

      - name: Inspect image
        run: |
          docker buildx imagetools inspect ${{ secrets.DOCKERHUB_USERNAME }}/weknora-app:${{ steps.meta.outputs.version }}
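Both the workflow's `Prepare version info` step above and the Makefile targets further below consume `scripts/get_version.sh` by eval-ing its `env` output. The script itself is not part of this diff; the following is a minimal hypothetical sketch of the interface those callers appear to expect (the variable names come from the callers, everything else is assumed):

```bash
#!/usr/bin/env bash
# Hypothetical sketch of scripts/get_version.sh (the real script is not shown in this diff).
# "env" must print eval-able assignments; "info" prints the same values for humans.
set -euo pipefail

case "${1:-env}" in
  env)
    echo "VERSION=$(git describe --tags --always 2>/dev/null || echo unknown)"
    echo "COMMIT_ID=$(git rev-parse --short HEAD 2>/dev/null || echo unknown)"
    echo "BUILD_TIME=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
    echo "GO_VERSION=$(go version 2>/dev/null | awk '{print $3}' || echo unknown)"
    ;;
  info)
    eval "$("$0" env)"
    echo "VERSION=$VERSION COMMIT_ID=$COMMIT_ID BUILD_TIME=$BUILD_TIME GO_VERSION=$GO_VERSION"
    ;;
esac
```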
.gitignore (5 lines changed, vendored)

@@ -26,13 +26,14 @@ temp/

WeKnora
/models/
services/docreader/src/proto/__pycache__
test/data/mswag.txt
data/files/

.python-version
.venv/
**/__pycache__
.python-version

### macOS
# General
.DS_Store
PROGRESS_RU.md
CHANGELOG.md (103 lines changed, new file)

@@ -0,0 +1,103 @@
# Changelog

All notable changes to this project will be documented in this file.

## [0.1.4] - 2025-09-17

### 🚀 Major Features
- **NEW**: Multi-knowledgebases operation support
  - Added comprehensive multi-knowledgebase management functionality
  - Implemented multi-data source search engine configuration and optimization logic
  - Enhanced knowledge base switching and management in UI
- **NEW**: Enhanced tenant information management
  - Added dedicated tenant information page
  - Improved user and tenant management capabilities

### 🎨 UI/UX Improvements
- **REDESIGNED**: Settings page with improved layout and functionality
- **ENHANCED**: Menu component with multi-knowledgebase support
- **IMPROVED**: Initialization configuration page structure
- **OPTIMIZED**: Login page and authentication flow

### 🔒 Security Fixes
- **FIXED**: XSS attack vulnerabilities in thinking component
- **FIXED**: Content Security Policy (CSP) errors
- **ENHANCED**: Frontend security measures and input sanitization

### 🐛 Bug Fixes
- **FIXED**: Login direct page navigation issues
- **FIXED**: App LLM model check logic
- **FIXED**: Version script functionality
- **FIXED**: File download content errors
- **IMPROVED**: Document content component display

### 🧹 Code Cleanup
- **REMOVED**: Test data functionality and related APIs
- **SIMPLIFIED**: Initialization configuration components
- **CLEANED**: Redundant UI components and unused code

## [0.1.3] - 2025-09-16

### 🔒 Security Features
- **NEW**: Added login authentication functionality to enhance system security
  - Implemented user authentication and authorization mechanisms
  - Added session management and access control
  - Fixed XSS attack vulnerabilities in frontend components

### 📚 Documentation Updates
- Added security notices in all README files (English, Chinese, Japanese)
- Updated deployment recommendations emphasizing internal/private network deployment
- Enhanced security guidelines to prevent information leakage risks
- Fixed documentation spelling issues

### 🛡️ Security Improvements
- Hide API keys in UI for security purposes
- Enhanced input sanitization and XSS protection
- Added comprehensive security utilities

### 🐛 Bug Fixes
- Fixed OCR AVX support issues
- Improved frontend health check dependencies
- Enhanced Docker binary downloads for target architecture
- Fixed COS file service initialization parameters and URL processing logic

### 🚀 Features & Enhancements
- Improved application and docreader log output
- Enhanced frontend routing and authentication flow
- Added comprehensive user management system
- Improved initialization configuration handling

### 🛡️ Security Recommendations
- Deploy WeKnora services in internal/private network environments
- Avoid direct exposure to public internet
- Configure proper firewall rules and access controls
- Regular updates for security patches and improvements

## [0.1.2] - 2025-09-10

- Fixed health check implementation for docreader service
- Improved query handling for empty queries
- Enhanced knowledge base column value update methods
- Optimized logging throughout the application
- Added process parsing documentation for markdown files
- Fixed OCR model pre-fetching in Docker containers
- Resolved image parser concurrency errors
- Added support for modifying listening port configuration

## [0.1.0] - 2025-09-08

- Initial public release of WeKnora.
- Web UI for knowledge upload, chat, configuration, and settings.
- RAG pipeline with chunking, embedding, retrieval, reranking, and generation.
- Initialization wizard for configuring models (LLM, embedding, rerank, retriever).
- Support for local Ollama and remote API models.
- Vector backends: PostgreSQL (pgvector), Elasticsearch; GraphRAG support.
- End-to-end evaluation utilities and metrics.
- Docker Compose for quick startup and service orchestration.
- MCP server support for integrating with MCP-compatible clients.

[0.1.4]: https://github.com/Tencent/WeKnora/tree/v0.1.4
[0.1.3]: https://github.com/Tencent/WeKnora/tree/v0.1.3
[0.1.2]: https://github.com/Tencent/WeKnora/tree/v0.1.2
[0.1.0]: https://github.com/Tencent/WeKnora/tree/v0.1.0
Makefile (83 lines changed)

@@ -1,4 +1,4 @@
.PHONY: help build run test clean docker-build docker-run migrate-up migrate-down docker-restart docker-stop start-all stop-all start-ollama stop-ollama build-images build-images-app build-images-docreader build-images-frontend clean-images
.PHONY: help build run test clean docker-build-app docker-build-docreader docker-build-frontend docker-build-all docker-run migrate-up migrate-down docker-restart docker-stop start-all stop-all start-ollama stop-ollama build-images build-images-app build-images-docreader build-images-frontend clean-images check-env list-containers pull-images show-platform

# Show help
help:
@@ -11,10 +11,13 @@ help:
	@echo "  clean                   Clean build artifacts"
	@echo ""
	@echo "Docker commands:"
	@echo "  docker-build            Build Docker image"
	@echo "  docker-run              Run Docker container"
	@echo "  docker-stop             Stop Docker container"
	@echo "  docker-restart          Restart Docker container"
	@echo "  docker-build-app        Build application Docker image (wechatopenai/weknora-app)"
	@echo "  docker-build-docreader  Build docreader image (wechatopenai/weknora-docreader)"
	@echo "  docker-build-frontend   Build frontend image (wechatopenai/weknora-ui)"
	@echo "  docker-build-all        Build all Docker images"
	@echo "  docker-run              Run Docker container"
	@echo "  docker-stop             Stop Docker container"
	@echo "  docker-restart          Restart Docker container"
	@echo ""
	@echo "Service management:"
	@echo "  start-all               Start all services"
@@ -37,15 +40,32 @@ help:
	@echo "  lint                    Run code checks"
	@echo "  deps                    Install dependencies"
	@echo "  docs                    Generate API documentation"
	@echo ""
	@echo "Environment checks:"
	@echo "  check-env               Check environment configuration"
	@echo "  list-containers         List running containers"
	@echo "  pull-images             Pull latest images"
	@echo "  show-platform           Show current build platform"

# Go related variables
BINARY_NAME=WeKnora
MAIN_PATH=./cmd/server

# Docker related variables
DOCKER_IMAGE=WeKnora
DOCKER_IMAGE=wechatopenai/weknora-app
DOCKER_TAG=latest

# Platform detection
ifeq ($(shell uname -m),x86_64)
	PLATFORM=linux/amd64
else ifeq ($(shell uname -m),aarch64)
	PLATFORM=linux/arm64
else ifeq ($(shell uname -m),arm64)
	PLATFORM=linux/arm64
else
	PLATFORM=linux/amd64
endif

# Build the application
build:
	go build -o $(BINARY_NAME) $(MAIN_PATH)
@@ -64,8 +84,27 @@ clean:
	rm -f $(BINARY_NAME)

# Build Docker image
docker-build:
	docker build -t $(DOCKER_IMAGE):$(DOCKER_TAG) .
docker-build-app:
	@echo "Fetching version info..."
	@eval $$(./scripts/get_version.sh env); \
	./scripts/get_version.sh info; \
	docker build --platform $(PLATFORM) \
		--build-arg VERSION_ARG="$$VERSION" \
		--build-arg COMMIT_ID_ARG="$$COMMIT_ID" \
		--build-arg BUILD_TIME_ARG="$$BUILD_TIME" \
		--build-arg GO_VERSION_ARG="$$GO_VERSION" \
		-f docker/Dockerfile.app -t $(DOCKER_IMAGE):$(DOCKER_TAG) .

# Build docreader Docker image
docker-build-docreader:
	docker build --platform $(PLATFORM) -f docker/Dockerfile.docreader -t wechatopenai/weknora-docreader:latest .

# Build frontend Docker image
docker-build-frontend:
	docker build --platform $(PLATFORM) -f frontend/Dockerfile -t wechatopenai/weknora-ui:latest frontend/

# Build all Docker images
docker-build-all: docker-build-app docker-build-docreader docker-build-frontend

# Run Docker container (legacy approach)
docker-run:
@@ -107,10 +146,10 @@ build-images-frontend:
clean-images:
	./scripts/build_images.sh --clean

# Restart Docker container (stop, rebuild, start)
# Restart Docker container (stop, start)
docker-restart:
	docker-compose stop -t 60
	docker-compose up --build
	docker-compose up

# Database migrations
migrate-up:
@@ -137,7 +176,12 @@ deps:

# Build for production
build-prod:
	GOOS=linux go build -a -installsuffix cgo -ldflags="-w -s" -o $(BINARY_NAME) $(MAIN_PATH)
	VERSION=$${VERSION:-unknown}; \
	COMMIT_ID=$${COMMIT_ID:-unknown}; \
	BUILD_TIME=$${BUILD_TIME:-unknown}; \
	GO_VERSION=$${GO_VERSION:-unknown}; \
	LDFLAGS="-X 'github.com/Tencent/WeKnora/internal/handler.Version=$$VERSION' -X 'github.com/Tencent/WeKnora/internal/handler.CommitID=$$COMMIT_ID' -X 'github.com/Tencent/WeKnora/internal/handler.BuildTime=$$BUILD_TIME' -X 'github.com/Tencent/WeKnora/internal/handler.GoVersion=$$GO_VERSION'"; \
	go build -ldflags="-w -s $$LDFLAGS" -o $(BINARY_NAME) $(MAIN_PATH)

clean-db:
	@echo "Cleaning database..."
@@ -151,4 +195,21 @@ clean-db:
	docker volume rm weknora_redis_data; \
	fi

# Environment check
check-env:
	./scripts/start_all.sh --check

# List containers
list-containers:
	./scripts/start_all.sh --list

# Pull latest images
pull-images:
	./scripts/start_all.sh --pull

# Show current platform
show-platform:
	@echo "Current system architecture: $(shell uname -m)"
	@echo "Docker build platform: $(PLATFORM)"
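The `build-prod` target injects version metadata through Go's `-X` linker flag, which overwrites package-level string variables at link time; the Docker build args feed the same values into the image build. A minimal, self-contained sketch of the mechanism (the module path and variable name below are illustrative, not the project's actual `internal/handler` symbols):

```bash
# Demonstrate -X injection with a throwaway module (illustrative names only).
mkdir -p /tmp/ldflags-demo && cd /tmp/ldflags-demo
go mod init example.com/ldflags-demo
cat > main.go <<'EOF'
package main

import "fmt"

// Version is replaced at link time via -ldflags "-X 'main.Version=...'".
var Version = "dev"

func main() { fmt.Println("version:", Version) }
EOF
go build -ldflags "-w -s -X 'main.Version=v0.1.4'" -o demo .
./demo   # prints: version: v0.1.4
```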
README.md (350 lines changed)
@@ -14,148 +14,198 @@
|
||||
<a href="https://github.com/Tencent/WeKnora/blob/main/LICENSE">
|
||||
<img src="https://img.shields.io/badge/License-MIT-ffffff?labelColor=d4eaf7&color=2e6cc4" alt="License">
|
||||
</a>
|
||||
<a href="./CHANGELOG.md">
|
||||
<img alt="Version" src="https://img.shields.io/badge/version-0.1.3-2e6cc4?labelColor=d4eaf7">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
| <a href="./README_EN.md"><b>English</b></a> | <b>简体中文</b> | <a href="./README_JA.md"><b>日本語</b></a> |
|
||||
| <b>English</b> | <a href="./README_CN.md"><b>简体中文</b></a> | <a href="./README_JA.md"><b>日本語</b></a> |
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<h4 align="center">
|
||||
|
||||
[项目介绍](#-项目介绍) • [架构设计](#-架构设计) • [核心特性](#-核心特性) • [快速开始](#-快速开始) • [文档](#-文档) • [开发指南](#-开发指南)
|
||||
|
||||
[Overview](#-overview) • [Architecture](#-architecture) • [Key Features](#-key-features) • [Getting Started](#-getting-started) • [API Reference](#-api-reference) • [Developer Guide](#-developer-guide)
|
||||
|
||||
</h4>
|
||||
</p>
|
||||
|
||||
# 💡 WeKnora - 基于大模型的文档理解检索框架
|
||||
# 💡 WeKnora - LLM-Powered Document Understanding & Retrieval Framework
|
||||
|
||||
## 📌 项目介绍
|
||||
## 📌 Overview
|
||||
|
||||
[**WeKnora(维娜拉)**](https://weknora.weixin.qq.com) 是一款基于大语言模型(LLM)的文档理解与语义检索框架,专为结构复杂、内容异构的文档场景而打造。
|
||||
[**WeKnora**](https://weknora.weixin.qq.com) is an LLM-powered framework designed for deep document understanding and semantic retrieval, especially for handling complex, heterogeneous documents.
|
||||
|
||||
框架采用模块化架构,融合多模态预处理、语义向量索引、智能召回与大模型生成推理,构建起高效、可控的文档问答流程。核心检索流程基于 **RAG(Retrieval-Augmented Generation)** 机制,将上下文相关片段与语言模型结合,实现更高质量的语义回答。
|
||||
It adopts a modular architecture that combines multimodal preprocessing, semantic vector indexing, intelligent retrieval, and large language model inference. At its core, WeKnora follows the **RAG (Retrieval-Augmented Generation)** paradigm, enabling high-quality, context-aware answers by combining relevant document chunks with model reasoning.
|
||||
|
||||
**官网:** https://weknora.weixin.qq.com
|
||||
**Website:** https://weknora.weixin.qq.com
|
||||
|
||||
## 🏗️ 架构设计
|
||||
## 🔒 Security Notice
|
||||
|
||||

|
||||
**Important:** Starting from v0.1.3, WeKnora includes login authentication functionality to enhance system security. For production deployments, we strongly recommend:
|
||||
|
||||
WeKnora 采用现代化模块化设计,构建了一条完整的文档理解与检索流水线。系统主要包括文档解析、向量化处理、检索引擎和大模型推理等核心模块,每个组件均可灵活配置与扩展。
|
||||
- Deploy WeKnora services in internal/private network environments rather than public internet
|
||||
- Avoid exposing the service directly to public networks to prevent potential information leakage
|
||||
- Configure proper firewall rules and access controls for your deployment environment
|
||||
- Regularly update to the latest version for security patches and improvements
|
||||
|
||||
## 🎯 核心特性
|
||||
## 🏗️ Architecture
|
||||
|
||||
- **🔍 精准理解**:支持 PDF、Word、图片等文档的结构化内容提取,统一构建语义视图
|
||||
- **🧠 智能推理**:借助大语言模型理解文档上下文与用户意图,支持精准问答与多轮对话
|
||||
- **🔧 灵活扩展**:从解析、嵌入、召回到生成全流程解耦,便于灵活集成与定制扩展
|
||||
- **⚡ 高效检索**:混合多种检索策略:关键词、向量、知识图谱
|
||||
- **🎯 简单易用**:直观的Web界面与标准API,零技术门槛快速上手
|
||||
- **🔒 安全可控**:支持本地化与私有云部署,数据完全自主可控
|
||||

|
||||
|
||||
## 📊 适用场景
|
||||
WeKnora employs a modern modular design to build a complete document understanding and retrieval pipeline. The system primarily includes document parsing, vector processing, retrieval engine, and large model inference as core modules, with each component being flexibly configurable and extendable.
|
||||
|
||||
| 应用场景 | 具体应用 | 核心价值 |
|
||||
## 🎯 Key Features
|
||||
|
||||
- **🔍 Precise Understanding**: Structured content extraction from PDFs, Word documents, images and more into unified semantic views
|
||||
- **🧠 Intelligent Reasoning**: Leverages LLMs to understand document context and user intent for accurate Q&A and multi-turn conversations
|
||||
- **🔧 Flexible Extension**: All components from parsing and embedding to retrieval and generation are decoupled for easy customization
|
||||
- **⚡ Efficient Retrieval**: Hybrid retrieval strategies combining keywords, vectors, and knowledge graphs
|
||||
- **🎯 User-Friendly**: Intuitive web interface and standardized APIs for zero technical barriers
|
||||
- **🔒 Secure & Controlled**: Support for local deployment and private cloud, ensuring complete data sovereignty
|
||||
|
||||
## 📊 Application Scenarios
|
||||
|
||||
| Scenario | Applications | Core Value |
|
||||
|---------|----------|----------|
|
||||
| **企业知识管理** | 内部文档检索、规章制度问答、操作手册查询 | 提升知识查找效率,降低培训成本 |
|
||||
| **科研文献分析** | 论文检索、研究报告分析、学术资料整理 | 加速文献调研,辅助研究决策 |
|
||||
| **产品技术支持** | 产品手册问答、技术文档检索、故障排查 | 提升客户服务质量,减少技术支持负担 |
|
||||
| **法律合规审查** | 合同条款检索、法规政策查询、案例分析 | 提高合规效率,降低法律风险 |
|
||||
| **医疗知识辅助** | 医学文献检索、诊疗指南查询、病例分析 | 辅助临床决策,提升诊疗质量 |
|
||||
| **Enterprise Knowledge Management** | Internal document retrieval, policy Q&A, operation manual search | Improve knowledge discovery efficiency, reduce training costs |
|
||||
| **Academic Research Analysis** | Paper retrieval, research report analysis, scholarly material organization | Accelerate literature review, assist research decisions |
|
||||
| **Product Technical Support** | Product manual Q&A, technical documentation search, troubleshooting | Enhance customer service quality, reduce support burden |
|
||||
| **Legal & Compliance Review** | Contract clause retrieval, regulatory policy search, case analysis | Improve compliance efficiency, reduce legal risks |
|
||||
| **Medical Knowledge Assistance** | Medical literature retrieval, treatment guideline search, case analysis | Support clinical decisions, improve diagnosis quality |
|
||||
|
||||
## 🧩 功能模块能力
|
||||
## 🧩 Feature Matrix
|
||||
|
||||
| 功能模块 | 支持情况 | 说明 |
|
||||
| Module | Support | Description |
|
||||
|---------|---------|------|
|
||||
| 文档格式支持 | ✅ PDF / Word / Txt / Markdown / 图片(含 OCR / Caption) | 支持多种结构化与非结构化文档内容解析,支持图文混排与图像文字提取 |
|
||||
| 嵌入模型支持 | ✅ 本地模型、BGE / GTE API 等 | 支持自定义 embedding 模型,兼容本地部署与云端向量生成接口 |
|
||||
| 向量数据库接入 | ✅ PostgreSQL(pgvector)、Elasticsearch | 支持主流向量索引后端,可灵活切换与扩展,适配不同检索场景 |
|
||||
| 检索机制 | ✅ BM25 / Dense Retrieve / GraphRAG | 支持稠密/稀疏召回、知识图谱增强检索等多种策略,可自由组合召回-重排-生成流程 |
|
||||
| 大模型集成 | ✅ 支持 Qwen、DeepSeek 等,思考/非思考模式切换 | 可接入本地大模型(如 Ollama 启动)或调用外部 API 服务,支持推理模式灵活配置 |
|
||||
| 问答能力 | ✅ 上下文感知、多轮对话、提示词模板 | 支持复杂语义建模、指令控制与链式问答,可配置提示词与上下文窗口 |
|
||||
| 端到端测试支持 | ✅ 检索+生成过程可视化与指标评估 | 提供一体化链路测试工具,支持评估召回命中率、回答覆盖度、BLEU / ROUGE 等主流指标 |
|
||||
| 部署模式 | ✅ 支持本地部署 / Docker 镜像 | 满足私有化、离线部署与灵活运维的需求 |
|
||||
| 用户界面 | ✅ Web UI + RESTful API | 提供交互式界面与标准 API 接口,适配开发者与业务用户使用习惯 |
|
||||
| Document Formats | ✅ PDF / Word / Txt / Markdown / Images (with OCR / Caption) | Support for structured and unstructured documents with text extraction from images |
|
||||
| Embedding Models | ✅ Local models, BGE / GTE APIs, etc. | Customizable embedding models, compatible with local deployment and cloud vector generation APIs |
|
||||
| Vector DB Integration | ✅ PostgreSQL (pgvector), Elasticsearch | Support for mainstream vector index backends, flexible switching for different retrieval scenarios |
|
||||
| Retrieval Strategies | ✅ BM25 / Dense Retrieval / GraphRAG | Support for sparse/dense recall and knowledge graph-enhanced retrieval with customizable retrieve-rerank-generate pipelines |
|
||||
| LLM Integration | ✅ Support for Qwen, DeepSeek, etc., with thinking/non-thinking mode switching | Compatible with local models (e.g., via Ollama) or external API services with flexible inference configuration |
|
||||
| QA Capabilities | ✅ Context-aware, multi-turn dialogue, prompt templates | Support for complex semantic modeling, instruction control and chain-of-thought Q&A with configurable prompts and context windows |
|
||||
| E2E Testing | ✅ Retrieval+generation process visualization and metric evaluation | End-to-end testing tools for evaluating recall hit rates, answer coverage, BLEU/ROUGE and other metrics |
|
||||
| Deployment Modes | ✅ Support for local deployment / Docker images | Meets private, offline deployment and flexible operation requirements |
|
||||
| User Interfaces | ✅ Web UI + RESTful API | Interactive interface and standard API endpoints, suitable for both developers and business users |
|
||||
|
||||
## 🚀 快速开始
|
||||
## 🚀 Getting Started
|
||||
|
||||
### 🛠 环境要求
|
||||
### 🛠 Prerequisites
|
||||
|
||||
确保本地已安装以下工具:
|
||||
Make sure the following tools are installed on your system:
|
||||
|
||||
* [Docker](https://www.docker.com/)
|
||||
* [Docker Compose](https://docs.docker.com/compose/)
|
||||
* [Git](https://git-scm.com/)
|
||||
|
||||
### 📦 安装步骤
|
||||
### 📦 Installation
|
||||
|
||||
#### ① 克隆代码仓库
|
||||
#### ① Clone the repository
|
||||
|
||||
```bash
|
||||
# 克隆主仓库
|
||||
# Clone the main repository
|
||||
git clone https://github.com/Tencent/WeKnora.git
|
||||
cd WeKnora
|
||||
```
|
||||
|
||||
#### ② 配置环境变量
|
||||
#### ② Configure environment variables
|
||||
|
||||
```bash
|
||||
# 复制示例配置文件
|
||||
# Copy example env file
|
||||
cp .env.example .env
|
||||
|
||||
# 编辑 .env,填入对应配置信息
|
||||
# 所有变量说明详见 .env.example 注释
|
||||
# Edit .env and set required values
|
||||
# All variables are documented in the .env.example comments
|
||||
```
|
||||
|
||||
#### ③ 启动服务
|
||||
#### ③ Start the services (include Ollama)
|
||||
|
||||
Check the images that need to be started in the .env file.
|
||||
|
||||
```bash
|
||||
# 启动全部服务(含 Ollama 与后端容器)
|
||||
./scripts/start_all.sh
|
||||
# 或
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```bash
|
||||
make start-all
|
||||
```
|
||||
|
||||
#### ③ 启动服务备选
|
||||
#### ③.0 Start ollama services (Optional)
|
||||
|
||||
```bash
|
||||
# 启动 ollama 服务 (可选)
|
||||
ollama serve > /dev/null 2>&1 &
|
||||
```
|
||||
|
||||
# 启动服务
|
||||
#### ③.1 Activate different combinations of features
|
||||
|
||||
- Minimum core services
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
#### ④ 停止服务
|
||||
- All features enabled
|
||||
```bash
|
||||
docker-compose --profile full up -d
|
||||
```
|
||||
|
||||
- Tracing logs required
|
||||
```bash
|
||||
docker-compose --profile jaeger up -d
|
||||
```
|
||||
|
||||
- Neo4j knowledge graph required
|
||||
```bash
|
||||
docker-compose --profile neo4j up -d
|
||||
```
|
||||
|
||||
- Minio file storage service required
|
||||
```bash
|
||||
docker-compose --profile minio up -d
|
||||
```
|
||||
|
||||
- Multiple options combination
|
||||
```bash
|
||||
docker-compose --profile neo4j --profile minio up -d
|
||||
```
|
||||
|
||||
#### ④ Stop the services
|
||||
|
||||
```bash
|
||||
./scripts/start_all.sh --stop
|
||||
# 或
|
||||
# Or
|
||||
make stop-all
|
||||
```
|
||||
|
||||
### 🌐 服务访问地址
|
||||
### 🌐 Access Services
|
||||
|
||||
启动成功后,可访问以下地址:
|
||||
Once started, services will be available at:
|
||||
|
||||
* Web UI:`http://localhost`
|
||||
* 后端 API:`http://localhost:8080`
|
||||
* 链路追踪(Jaeger):`http://localhost:16686`
|
||||
* Web UI: `http://localhost`
|
||||
* Backend API: `http://localhost:8080`
|
||||
* Jaeger Tracing: `http://localhost:16686`
|
||||
|
||||
### 🔌 使用微信对话开放平台
|
||||
### 🔌 Using WeChat Dialog Open Platform
|
||||
|
||||
WeKnora 作为[微信对话开放平台](https://chatbot.weixin.qq.com)的核心技术框架,提供更简便的使用方式:
|
||||
WeKnora serves as the core technology framework for the [WeChat Dialog Open Platform](https://chatbot.weixin.qq.com), providing a more convenient usage approach:
|
||||
|
||||
- **零代码部署**:只需上传知识,即可在微信生态中快速部署智能问答服务,实现"即问即答"的体验
|
||||
- **高效问题管理**:支持高频问题的独立分类管理,提供丰富的数据工具,确保回答精准可靠且易于维护
|
||||
- **微信生态覆盖**:通过微信对话开放平台,WeKnora 的智能问答能力可无缝集成到公众号、小程序等微信场景中,提升用户交互体验
|
||||
### 🔗MCP服务器访问已经部署好的WEKnora
|
||||
#### 1️⃣克隆储存库
|
||||
- **Zero-code Deployment**: Simply upload knowledge to quickly deploy intelligent Q&A services within the WeChat ecosystem, achieving an "ask and answer" experience
|
||||
- **Efficient Question Management**: Support for categorized management of high-frequency questions, with rich data tools to ensure accurate, reliable, and easily maintainable answers
|
||||
- **WeChat Ecosystem Integration**: Through the WeChat Dialog Open Platform, WeKnora's intelligent Q&A capabilities can be seamlessly integrated into WeChat Official Accounts, Mini Programs, and other WeChat scenarios, enhancing user interaction experiences
|
||||
|
||||
### 🔗 Access WeKnora via MCP Server
|
||||
|
||||
#### 1️⃣ Clone the repository
|
||||
```
|
||||
git clone https://github.com/Tencent/WeKnora
|
||||
```
|
||||
#### 2️⃣配置MCP服务器
|
||||
mcp客户端配置服务器
|
||||
|
||||
#### 2️⃣ Configure MCP Server
|
||||
> It is recommended to directly refer to the [MCP Configuration Guide](./mcp-server/MCP_CONFIG.md) for configuration.
|
||||
|
||||
Configure the MCP client to connect to the server:
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
@@ -165,150 +215,156 @@ mcp客户端配置服务器
|
||||
],
|
||||
"command": "python",
|
||||
"env":{
|
||||
"WEKNORA_API_KEY":"进入你的weknora实例,打开开发者工具,查看请求头x-api-key,以sk开头",
|
||||
"WEKNORA_BASE_URL":"http(s)://你的weknora地址/api/v1"
|
||||
"WEKNORA_API_KEY":"Enter your WeKnora instance, open developer tools, check the request header x-api-key starting with sk",
|
||||
"WEKNORA_BASE_URL":"http(s)://your-weknora-address/api/v1"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
使用stdio命令直接运行
|
||||
|
||||
Run directly using stdio command:
|
||||
```
|
||||
pip install weknora-mcp-server
|
||||
python -m weknora-mcp-server
|
||||
```
|
||||
|
||||
## 🔧 初始化配置引导
|
||||
## 🔧 Initialization Configuration Guide
|
||||
|
||||
为了方便用户快速配置各类模型,降低试错成本,我们改进了原来的配置文件初始化方式,增加了Web UI界面进行各种模型的配置。在使用之前,请确保代码更新到最新版本。具体使用步骤如下:
|
||||
如果是第一次使用本项目,可跳过①②步骤,直接进入③④步骤。
|
||||
To help users quickly configure various models and reduce trial-and-error costs, we've improved the original configuration file initialization method by adding a Web UI interface for model configuration. Before using, please ensure the code is updated to the latest version. The specific steps are as follows:
|
||||
If this is your first time using this project, you can skip steps ①② and go directly to steps ③④.
|
||||
|
||||
### ① 关闭服务
|
||||
### ① Stop the services
|
||||
|
||||
```bash
|
||||
./scripts/start_all.sh --stop
|
||||
```
|
||||
|
||||
### ② 清空原有数据表(建议在没有重要数据的情况下使用)
|
||||
### ② Clear existing data tables (recommended when no important data exists)
|
||||
|
||||
```bash
|
||||
make clean-db
|
||||
```
|
||||
|
||||
### ③ 编译并启动服务
|
||||
### ③ Compile and start services
|
||||
|
||||
```bash
|
||||
./scripts/start_all.sh
|
||||
```
|
||||
|
||||
### ④ 访问Web UI
|
||||
### ④ Access Web UI
|
||||
|
||||
http://localhost
|
||||
|
||||
首次访问会自动跳转到初始化配置页面,配置完成后会自动跳转到知识库页面。请按照页面提示信息完成模型的配置。
|
||||
On your first visit, you will be automatically redirected to the registration/login page. After completing registration, please create a new knowledge base and finish the relevant settings on its configuration page.
|
||||
|
||||

|
||||
## 📱 Interface Showcase
|
||||
|
||||
|
||||
## 📱 功能展示
|
||||
|
||||
### Web UI 界面
|
||||
### Web UI Interface
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td><b>知识上传</b><br/><img src="./docs/images/knowledges.png" alt="知识上传界面"></td>
|
||||
<td><b>知识问答入口</b><br/><img src="./docs/images/qa.png" alt="知识问答入口"></td>
|
||||
<td><b>Knowledge Upload</b><br/><img src="./docs/images/knowledges.png" alt="Knowledge Upload Interface"></td>
|
||||
<td><b>Q&A Entry</b><br/><img src="./docs/images/qa.png" alt="Q&A Entry Interface"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="2"><b>图文结果回答</b><br/><img src="./docs/images/answer.png" alt="图文结果回答"></td>
|
||||
<td colspan="2"><b>Rich Text & Image Responses</b><br/><img src="./docs/images/answer.png" alt="Rich Answer Interface"></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
**知识库管理:** 支持拖拽上传各类文档,自动识别文档结构并提取核心知识,建立索引。系统清晰展示处理进度和文档状态,实现高效的知识库管理。
|
||||
**Knowledge Base Management:** Support for dragging and dropping various documents, automatically identifying document structures and extracting core knowledge to establish indexes. The system clearly displays processing progress and document status, achieving efficient knowledge base management.
|
||||
|
||||
### 文档知识图谱
|
||||
### Document Knowledge Graph
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src="./docs/images/graph2.png" alt="知识图谱展示1"></td>
|
||||
<td><img src="./docs/images/graph1.png" alt="知识图谱展示2"></td>
|
||||
</tr>
|
||||
</table>
|
||||
WeKnora supports transforming documents into knowledge graphs, displaying the relationships between different sections of the documents. Once the knowledge graph feature is enabled, the system analyzes and constructs an internal semantic association network that not only helps users understand document content but also provides structured support for indexing and retrieval, enhancing the relevance and breadth of search results.
|
||||
|
||||
WeKnora 支持将文档转化为知识图谱,展示文档中不同段落之间的关联关系。开启知识图谱功能后,系统会分析并构建文档内部的语义关联网络,不仅帮助用户理解文档内容,还为索引和检索提供结构化支撑,提升检索结果的相关性和广度。
|
||||
### 配套MCP服务器调用效果
|
||||
<img width="950" height="2063" alt="118d078426f42f3d4983c13386085d7f" src="https://github.com/user-attachments/assets/09111ec8-0489-415c-969d-aa3835778e14" />
|
||||
For detailed configuration, please refer to the [Knowledge Graph Configuration Guide](./docs/KnowledgeGraph.md).
|
||||
|
||||
### MCP Server
|
||||
|
||||
## 📘 文档
|
||||
Please refer to the [MCP Configuration Guide](./mcp-server/MCP_CONFIG.md) for the necessary setup.
|
||||
|
||||
常见问题排查:[常见问题排查](./docs/QA.md)
|
||||
## 📘 API Reference
|
||||
|
||||
详细接口说明请参考:[API 文档](./docs/API.md)
|
||||
Troubleshooting FAQ: [Troubleshooting FAQ](./docs/QA.md)
|
||||
|
||||
## 🧭 开发指南
|
||||
Detailed API documentation is available at: [API Docs](./docs/API.md)
|
||||
|
||||
### 📁 项目目录结构
|
||||
## 🧭 Developer Guide
|
||||
|
||||
### 📁 Directory Structure
|
||||
|
||||
```
|
||||
WeKnora/
|
||||
├── cmd/ # 应用入口
|
||||
├── internal/ # 核心业务逻辑
|
||||
├── config/ # 配置文件
|
||||
├── migrations/ # 数据库迁移脚本
|
||||
├── scripts/ # 启动与工具脚本
|
||||
├── services/ # 各子服务实现
|
||||
├── frontend/ # 前端项目
|
||||
└── docs/ # 项目文档
|
||||
├── client/ # go client
|
||||
├── cmd/ # Main entry point
|
||||
├── config/ # Configuration files
|
||||
├── docker/ # docker images files
|
||||
├── docreader/ # Document parsing app
|
||||
├── docs/ # Project documentation
|
||||
├── frontend/ # Frontend app
|
||||
├── internal/ # Core business logic
|
||||
├── mcp-server/ # MCP server
|
||||
├── migrations/ # DB migration scripts
|
||||
└── scripts/ # Shell scripts
|
||||
```
|
||||
|
||||
### 🔧 常用命令
|
||||
## 🤝 Contributing
|
||||
|
||||
```bash
|
||||
# 清空数据库(慎用!)
|
||||
make clean-db
|
||||
```
|
||||
We welcome community contributions! For suggestions, bugs, or feature requests, please submit an [Issue](https://github.com/Tencent/WeKnora/issues) or directly create a Pull Request.
|
||||
|
||||
## 🤝 贡献指南
|
||||
### 🎯 How to Contribute
|
||||
|
||||
我们欢迎社区用户参与贡献!如有建议、Bug 或新功能需求,请通过 [Issue](https://github.com/Tencent/WeKnora/issues) 提出,或直接提交 Pull Request。
|
||||
- 🐛 **Bug Fixes**: Discover and fix system defects
|
||||
- ✨ **New Features**: Propose and implement new capabilities
|
||||
- 📚 **Documentation**: Improve project documentation
|
||||
- 🧪 **Test Cases**: Write unit and integration tests
|
||||
- 🎨 **UI/UX Enhancements**: Improve user interface and experience
|
||||
|
||||
### 🎯 贡献方式
|
||||
### 📋 Contribution Process
|
||||
|
||||
- 🐛 **Bug修复**: 发现并修复系统缺陷
|
||||
- ✨ **新功能**: 提出并实现新特性
|
||||
- 📚 **文档改进**: 完善项目文档
|
||||
- 🧪 **测试用例**: 编写单元测试和集成测试
|
||||
- 🎨 **UI/UX优化**: 改进用户界面和体验
|
||||
1. **Fork the project** to your GitHub account
|
||||
2. **Create a feature branch** `git checkout -b feature/amazing-feature`
|
||||
3. **Commit changes** `git commit -m 'Add amazing feature'`
|
||||
4. **Push branch** `git push origin feature/amazing-feature`
|
||||
5. **Create a Pull Request** with detailed description of changes
|
||||
|
||||
### 📋 贡献流程
|
||||
### 🎨 Code Standards
|
||||
|
||||
1. **Fork项目** 到你的GitHub账户
|
||||
2. **创建特性分支** `git checkout -b feature/amazing-feature`
|
||||
3. **提交更改** `git commit -m 'Add amazing feature'`
|
||||
4. **推送分支** `git push origin feature/amazing-feature`
|
||||
5. **创建Pull Request** 并详细描述变更内容
|
||||
- Follow [Go Code Review Comments](https://github.com/golang/go/wiki/CodeReviewComments)
|
||||
- Format code using `gofmt`
|
||||
- Add necessary unit tests
|
||||
- Update relevant documentation
|
||||
|
||||
### 🎨 代码规范
|
||||
### 📝 Commit Guidelines
|
||||
|
||||
- 遵循 [Go Code Review Comments](https://github.com/golang/go/wiki/CodeReviewComments)
|
||||
- 使用 `gofmt` 格式化代码
|
||||
- 添加必要的单元测试
|
||||
- 更新相关文档
|
||||
|
||||
### 📝 提交规范
|
||||
|
||||
使用 [Conventional Commits](https://www.conventionalcommits.org/) 规范:
|
||||
Use [Conventional Commits](https://www.conventionalcommits.org/) standard:
|
||||
|
||||
```
|
||||
feat: 添加文档批量上传功能
|
||||
fix: 修复向量检索精度问题
|
||||
docs: 更新API文档
|
||||
test: 添加检索引擎测试用例
|
||||
refactor: 重构文档解析模块
|
||||
feat: Add document batch upload functionality
|
||||
fix: Resolve vector retrieval precision issue
|
||||
docs: Update API documentation
|
||||
test: Add retrieval engine test cases
|
||||
refactor: Restructure document parsing module
|
||||
```
|
||||
|
||||
## 📄 许可证
|
||||
## 👥 Contributors
|
||||
|
||||
本项目基于 [MIT](./LICENSE) 协议发布。
|
||||
你可以自由使用、修改和分发本项目代码,但需保留原始版权声明。
|
||||
Thanks to these excellent contributors:
|
||||
|
||||
[](https://github.com/Tencent/WeKnora/graphs/contributors)
|
||||
|
||||
## 📄 License
|
||||
|
||||
This project is licensed under the [MIT License](./LICENSE).
|
||||
You are free to use, modify, and distribute the code with proper attribution.
|
||||
|
||||
## 📈 Project Statistics
|
||||
|
||||
<a href="https://www.star-history.com/#Tencent/WeKnora&type=date&legend=top-left">
|
||||
<picture>
|
||||
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&theme=dark&legend=top-left" />
|
||||
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&legend=top-left" />
|
||||
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&legend=top-left" />
|
||||
</picture>
|
||||
</a>
|
||||
|
||||
README_CN.md (372 lines changed, new file)
@@ -0,0 +1,372 @@
|
||||
<p align="center">
|
||||
<picture>
|
||||
<img src="./docs/images/logo.png" alt="WeKnora Logo" height="120"/>
|
||||
</picture>
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://weknora.weixin.qq.com" target="_blank">
|
||||
<img alt="官方网站" src="https://img.shields.io/badge/官方网站-WeKnora-4e6b99">
|
||||
</a>
|
||||
<a href="https://chatbot.weixin.qq.com" target="_blank">
|
||||
<img alt="微信对话开放平台" src="https://img.shields.io/badge/微信对话开放平台-5ac725">
|
||||
</a>
|
||||
<a href="https://github.com/Tencent/WeKnora/blob/main/LICENSE">
|
||||
<img src="https://img.shields.io/badge/License-MIT-ffffff?labelColor=d4eaf7&color=2e6cc4" alt="License">
|
||||
</a>
|
||||
<a href="./CHANGELOG.md">
|
||||
<img alt="版本" src="https://img.shields.io/badge/version-0.1.3-2e6cc4?labelColor=d4eaf7">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
| <a href="./README.md"><b>English</b></a> | <b>简体中文</b> | <a href="./README_JA.md"><b>日本語</b></a> |
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<h4 align="center">
|
||||
|
||||
[项目介绍](#-项目介绍) • [架构设计](#-架构设计) • [核心特性](#-核心特性) • [快速开始](#-快速开始) • [文档](#-文档) • [开发指南](#-开发指南)
|
||||
|
||||
</h4>
|
||||
</p>
|
||||
|
||||
# 💡 WeKnora - 基于大模型的文档理解检索框架
|
||||
|
||||
## 📌 项目介绍
|
||||
|
||||
[**WeKnora(维娜拉)**](https://weknora.weixin.qq.com) 是一款基于大语言模型(LLM)的文档理解与语义检索框架,专为结构复杂、内容异构的文档场景而打造。
|
||||
|
||||
框架采用模块化架构,融合多模态预处理、语义向量索引、智能召回与大模型生成推理,构建起高效、可控的文档问答流程。核心检索流程基于 **RAG(Retrieval-Augmented Generation)** 机制,将上下文相关片段与语言模型结合,实现更高质量的语义回答。
|
||||
|
||||
**官网:** https://weknora.weixin.qq.com
|
||||
|
||||
## 🔒 安全声明
|
||||
|
||||
**重要提示:** 从 v0.1.3 版本开始,WeKnora 提供了登录鉴权功能,以增强系统安全性。在生产环境部署时,我们强烈建议:
|
||||
|
||||
- 将 WeKnora 服务部署在内网/私有网络环境中,而非公网环境
|
||||
- 避免将服务直接暴露在公网上,以防止重要信息泄露风险
|
||||
- 为部署环境配置适当的防火墙规则和访问控制
|
||||
- 定期更新到最新版本以获取安全补丁和改进
|
||||
|
||||
## 🏗️ 架构设计
|
||||
|
||||

|
||||
|
||||
WeKnora 采用现代化模块化设计,构建了一条完整的文档理解与检索流水线。系统主要包括文档解析、向量化处理、检索引擎和大模型推理等核心模块,每个组件均可灵活配置与扩展。
|
||||
|
||||
## 🎯 核心特性
|
||||
|
||||
- **🔍 精准理解**:支持 PDF、Word、图片等文档的结构化内容提取,统一构建语义视图
|
||||
- **🧠 智能推理**:借助大语言模型理解文档上下文与用户意图,支持精准问答与多轮对话
|
||||
- **🔧 灵活扩展**:从解析、嵌入、召回到生成全流程解耦,便于灵活集成与定制扩展
|
||||
- **⚡ 高效检索**:混合多种检索策略:关键词、向量、知识图谱
|
||||
- **🎯 简单易用**:直观的Web界面与标准API,零技术门槛快速上手
|
||||
- **🔒 安全可控**:支持本地化与私有云部署,数据完全自主可控
|
||||
|
||||
## 📊 适用场景
|
||||
|
||||
| 应用场景 | 具体应用 | 核心价值 |
|
||||
|---------|----------|----------|
|
||||
| **企业知识管理** | 内部文档检索、规章制度问答、操作手册查询 | 提升知识查找效率,降低培训成本 |
|
||||
| **科研文献分析** | 论文检索、研究报告分析、学术资料整理 | 加速文献调研,辅助研究决策 |
|
||||
| **产品技术支持** | 产品手册问答、技术文档检索、故障排查 | 提升客户服务质量,减少技术支持负担 |
|
||||
| **法律合规审查** | 合同条款检索、法规政策查询、案例分析 | 提高合规效率,降低法律风险 |
|
||||
| **医疗知识辅助** | 医学文献检索、诊疗指南查询、病例分析 | 辅助临床决策,提升诊疗质量 |
|
||||
|
||||
## 🧩 功能模块能力
|
||||
|
||||
| 功能模块 | 支持情况 | 说明 |
|
||||
|---------|---------|------|
|
||||
| 文档格式支持 | ✅ PDF / Word / Txt / Markdown / 图片(含 OCR / Caption) | 支持多种结构化与非结构化文档内容解析,支持图文混排与图像文字提取 |
|
||||
| 嵌入模型支持 | ✅ 本地模型、BGE / GTE API 等 | 支持自定义 embedding 模型,兼容本地部署与云端向量生成接口 |
|
||||
| 向量数据库接入 | ✅ PostgreSQL(pgvector)、Elasticsearch | 支持主流向量索引后端,可灵活切换与扩展,适配不同检索场景 |
|
||||
| 检索机制 | ✅ BM25 / Dense Retrieve / GraphRAG | 支持稠密/稀疏召回、知识图谱增强检索等多种策略,可自由组合召回-重排-生成流程 |
|
||||
| 大模型集成 | ✅ 支持 Qwen、DeepSeek 等,思考/非思考模式切换 | 可接入本地大模型(如 Ollama 启动)或调用外部 API 服务,支持推理模式灵活配置 |
|
||||
| 问答能力 | ✅ 上下文感知、多轮对话、提示词模板 | 支持复杂语义建模、指令控制与链式问答,可配置提示词与上下文窗口 |
|
||||
| 端到端测试支持 | ✅ 检索+生成过程可视化与指标评估 | 提供一体化链路测试工具,支持评估召回命中率、回答覆盖度、BLEU / ROUGE 等主流指标 |
|
||||
| 部署模式 | ✅ 支持本地部署 / Docker 镜像 | 满足私有化、离线部署与灵活运维的需求 |
|
||||
| 用户界面 | ✅ Web UI + RESTful API | 提供交互式界面与标准 API 接口,适配开发者与业务用户使用习惯 |
|
||||
|
||||
## 🚀 快速开始
|
||||
|
||||
### 🛠 环境要求
|
||||
|
||||
确保本地已安装以下工具:
|
||||
|
||||
* [Docker](https://www.docker.com/)
|
||||
* [Docker Compose](https://docs.docker.com/compose/)
|
||||
* [Git](https://git-scm.com/)
|
||||
|
||||
### 📦 安装步骤
|
||||
|
||||
#### ① 克隆代码仓库
|
||||
|
||||
```bash
|
||||
# 克隆主仓库
|
||||
git clone https://github.com/Tencent/WeKnora.git
|
||||
cd WeKnora
|
||||
```
|
||||
|
||||
#### ② 配置环境变量
|
||||
|
||||
```bash
|
||||
# 复制示例配置文件
|
||||
cp .env.example .env
|
||||
|
||||
# 编辑 .env,填入对应配置信息
|
||||
# 所有变量说明详见 .env.example 注释
|
||||
```
|
||||
|
||||
#### ③ 启动服务 (含 Ollama)
|
||||
|
||||
检查 .env 文件中需要启动的镜像。
|
||||
|
||||
```bash
|
||||
./scripts/start_all.sh
|
||||
```
|
||||
|
||||
或者
|
||||
|
||||
```bash
|
||||
make start-all
|
||||
```
|
||||
|
||||
#### ③.0 启动Ollama (可选)
|
||||
|
||||
```bash
|
||||
ollama serve > /dev/null 2>&1 &
|
||||
```
|
||||
|
||||
#### ③.1 激活不同组合的功能
|
||||
|
||||
- 启动最小功能
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
- 启动全部功能
|
||||
```bash
|
||||
docker-compose --profile full up -d
|
||||
```
|
||||
|
||||
- 需要 tracing 日志
|
||||
```bash
|
||||
docker-compose --profile jaeger up -d
|
||||
```
|
||||
|
||||
- 需要 neo4j 知识图谱
|
||||
```bash
|
||||
docker-compose --profile neo4j up -d
|
||||
```
|
||||
|
||||
- 需要 minio 文件存储服务
|
||||
```bash
|
||||
docker-compose --profile minio up -d
|
||||
```
|
||||
|
||||
- 多选项组合
|
||||
```bash
|
||||
docker-compose --profile neo4j --profile minio up -d
|
||||
```
|
||||
|
||||
#### ④ 停止服务
|
||||
|
||||
```bash
|
||||
./scripts/start_all.sh --stop
|
||||
# 或
|
||||
make stop-all
|
||||
```
|
||||
|
||||
### 🌐 服务访问地址
|
||||
|
||||
启动成功后,可访问以下地址:
|
||||
|
||||
* Web UI:`http://localhost`
|
||||
* 后端 API:`http://localhost:8080`
|
||||
* 链路追踪(Jaeger):`http://localhost:16686`
|
||||
|
||||
### 🔌 使用微信对话开放平台
|
||||
|
||||
WeKnora 作为[微信对话开放平台](https://chatbot.weixin.qq.com)的核心技术框架,提供更简便的使用方式:
|
||||
|
||||
- **零代码部署**:只需上传知识,即可在微信生态中快速部署智能问答服务,实现"即问即答"的体验
|
||||
- **高效问题管理**:支持高频问题的独立分类管理,提供丰富的数据工具,确保回答精准可靠且易于维护
|
||||
- **微信生态覆盖**:通过微信对话开放平台,WeKnora 的智能问答能力可无缝集成到公众号、小程序等微信场景中,提升用户交互体验
|
||||
|
||||
### 🔗 MCP 服务器访问已经部署好的 WeKnora
|
||||
|
||||
#### 1️⃣克隆储存库
|
||||
|
||||
```
|
||||
git clone https://github.com/Tencent/WeKnora
|
||||
```
|
||||
|
||||
#### 2️⃣配置MCP服务器
|
||||
|
||||
> 推荐直接参考 [MCP配置说明](./mcp-server/MCP_CONFIG.md) 进行配置。
|
||||
|
||||
mcp客户端配置服务器
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"weknora": {
|
||||
"args": [
|
||||
"path/to/WeKnora/mcp-server/run_server.py"
|
||||
],
|
||||
"command": "python",
|
||||
"env":{
|
||||
"WEKNORA_API_KEY":"进入你的weknora实例,打开开发者工具,查看请求头x-api-key,以sk开头",
|
||||
"WEKNORA_BASE_URL":"http(s)://你的weknora地址/api/v1"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
使用stdio命令直接运行
|
||||
```
|
||||
pip install weknora-mcp-server
|
||||
python -m weknora-mcp-server
|
||||
```
|
||||
|
||||
## 🔧 初始化配置引导
|
||||
|
||||
为了方便用户快速配置各类模型,降低试错成本,我们改进了原来的配置文件初始化方式,增加了Web UI界面进行各种模型的配置。在使用之前,请确保代码更新到最新版本。具体使用步骤如下:
|
||||
如果是第一次使用本项目,可跳过①②步骤,直接进入③④步骤。
|
||||
|
||||
### ① 关闭服务
|
||||
|
||||
```bash
|
||||
./scripts/start_all.sh --stop
|
||||
```
|
||||
|
||||
### ② 清空原有数据表(建议在没有重要数据的情况下使用)
|
||||
|
||||
```bash
|
||||
make clean-db
|
||||
```
|
||||
|
||||
### ③ 编译并启动服务
|
||||
|
||||
```bash
|
||||
./scripts/start_all.sh
|
||||
```
|
||||
|
||||
### ④ 访问Web UI
|
||||
|
||||
http://localhost
|
||||
|
||||
首次访问会自动跳转到注册登录页面,完成注册后,请创建一个新的知识库,并在该知识库的设置页面完成相关设置。
|
||||
|
||||
## 📱 功能展示
|
||||
|
||||
### Web UI 界面
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td><b>知识上传</b><br/><img src="./docs/images/knowledges.png" alt="知识上传界面"></td>
|
||||
<td><b>知识问答入口</b><br/><img src="./docs/images/qa.png" alt="知识问答入口"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="2"><b>图文结果回答</b><br/><img src="./docs/images/answer.png" alt="图文结果回答"></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
**知识库管理:** 支持拖拽上传各类文档,自动识别文档结构并提取核心知识,建立索引。系统清晰展示处理进度和文档状态,实现高效的知识库管理。
|
||||
|
||||
### 文档知识图谱
|
||||
|
||||
WeKnora 支持将文档转化为知识图谱,展示文档中不同段落之间的关联关系。开启知识图谱功能后,系统会分析并构建文档内部的语义关联网络,不仅帮助用户理解文档内容,还为索引和检索提供结构化支撑,提升检索结果的相关性和广度。
|
||||
|
||||
具体配置请参考 [知识图谱配置说明](./docs/KnowledgeGraph.md) 进行相关配置。
|
||||
|
||||
### 配套MCP服务器
|
||||
|
||||
请参考 [MCP配置说明](./mcp-server/MCP_CONFIG.md) 进行相关配置。
|
||||
|
||||
## 📘 文档
|
||||
|
||||
常见问题排查:[常见问题排查](./docs/QA.md)
|
||||
|
||||
详细接口说明请参考:[API 文档](./docs/API.md)
|
||||
|
||||
## 🧭 开发指南
|
||||
|
||||
### 📁 项目目录结构
|
||||
|
||||
```
|
||||
WeKnora/
|
||||
├── client/ # go客户端
|
||||
├── cmd/ # 应用入口
|
||||
├── config/ # 配置文件
|
||||
├── docker/ # docker 镜像文件
|
||||
├── docreader/ # 文档解析项目
|
||||
├── docs/ # 项目文档
|
||||
├── frontend/ # 前端项目
|
||||
├── internal/ # 核心业务逻辑
|
||||
├── mcp-server/ # MCP服务器
|
||||
├── migrations/ # 数据库迁移脚本
|
||||
└── scripts/ # 启动与工具脚本
|
||||
```
|
||||
|
||||
## 🤝 贡献指南

我们欢迎社区用户参与贡献!如有建议、Bug 或新功能需求,请通过 [Issue](https://github.com/Tencent/WeKnora/issues) 提出,或直接提交 Pull Request。

### 🎯 贡献方式

- 🐛 **Bug修复**: 发现并修复系统缺陷
- ✨ **新功能**: 提出并实现新特性
- 📚 **文档改进**: 完善项目文档
- 🧪 **测试用例**: 编写单元测试和集成测试
- 🎨 **UI/UX优化**: 改进用户界面和体验

### 📋 贡献流程

1. **Fork项目** 到你的GitHub账户
2. **创建特性分支** `git checkout -b feature/amazing-feature`
3. **提交更改** `git commit -m 'Add amazing feature'`
4. **推送分支** `git push origin feature/amazing-feature`
5. **创建Pull Request** 并详细描述变更内容

### 🎨 代码规范

- 遵循 [Go Code Review Comments](https://github.com/golang/go/wiki/CodeReviewComments)
- 使用 `gofmt` 格式化代码
- 添加必要的单元测试
- 更新相关文档

### 📝 提交规范

使用 [Conventional Commits](https://www.conventionalcommits.org/) 规范:

```
feat: 添加文档批量上传功能
fix: 修复向量检索精度问题
docs: 更新API文档
test: 添加检索引擎测试用例
refactor: 重构文档解析模块
```

## 👥 贡献者

感谢以下优秀的贡献者们:

[](https://github.com/Tencent/WeKnora/graphs/contributors)

## 📄 许可证

本项目基于 [MIT](./LICENSE) 协议发布。
你可以自由使用、修改和分发本项目代码,但需保留原始版权声明。

## 📈 项目统计

<a href="https://www.star-history.com/#Tencent/WeKnora&type=date&legend=top-left">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&theme=dark&legend=top-left" />
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&legend=top-left" />
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&legend=top-left" />
</picture>
</a>

249
README_EN.md
@@ -1,249 +0,0 @@
|
||||
<p align="center">
|
||||
<picture>
|
||||
<img src="./docs/images/logo.png" alt="WeKnora Logo" height="120"/>
|
||||
</picture>
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://weknora.weixin.qq.com" target="_blank">
|
||||
<img alt="官方网站" src="https://img.shields.io/badge/官方网站-WeKnora-4e6b99">
|
||||
</a>
|
||||
<a href="https://chatbot.weixin.qq.com" target="_blank">
|
||||
<img alt="微信对话开放平台" src="https://img.shields.io/badge/微信对话开放平台-5ac725">
|
||||
</a>
|
||||
<a href="https://github.com/Tencent/WeKnora/blob/main/LICENSE">
|
||||
<img src="https://img.shields.io/badge/License-MIT-ffffff?labelColor=d4eaf7&color=2e6cc4" alt="License">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
| <b>English</b> | <a href="./README.md"><b>简体中文</b></a> | <a href="./README_JA.md"><b>日本語</b></a> |
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<h4 align="center">
|
||||
|
||||
[Overview](#-overview) • [Architecture](#-architecture) • [Key Features](#-key-features) • [Getting Started](#-getting-started) • [API Reference](#-api-reference) • [Developer Guide](#-developer-guide)
|
||||
|
||||
</h4>
|
||||
</p>
|
||||
|
||||
# 💡 WeKnora - LLM-Powered Document Understanding & Retrieval Framework
|
||||
|
||||
## 📌 Overview
|
||||
|
||||
[**WeKnora**](https://weknora.weixin.qq.com) is an LLM-powered framework designed for deep document understanding and semantic retrieval, especially for handling complex, heterogeneous documents.
|
||||
|
||||
It adopts a modular architecture that combines multimodal preprocessing, semantic vector indexing, intelligent retrieval, and large language model inference. At its core, WeKnora follows the **RAG (Retrieval-Augmented Generation)** paradigm, enabling high-quality, context-aware answers by combining relevant document chunks with model reasoning.
|
||||
|
||||
**Website:** https://weknora.weixin.qq.com
|
||||
|
||||
## 🏗️ Architecture
|
||||
|
||||

|
||||
|
||||
WeKnora employs a modern modular design to build a complete document understanding and retrieval pipeline. The system primarily includes document parsing, vector processing, retrieval engine, and large model inference as core modules, with each component being flexibly configurable and extendable.
|
||||
|
||||
## 🎯 Key Features
|
||||
|
||||
- **🔍 Precise Understanding**: Structured content extraction from PDFs, Word documents, images and more into unified semantic views
|
||||
- **🧠 Intelligent Reasoning**: Leverages LLMs to understand document context and user intent for accurate Q&A and multi-turn conversations
|
||||
- **🔧 Flexible Extension**: All components from parsing and embedding to retrieval and generation are decoupled for easy customization
|
||||
- **⚡ Efficient Retrieval**: Hybrid retrieval strategies combining keywords, vectors, and knowledge graphs
|
||||
- **🎯 User-Friendly**: Intuitive web interface and standardized APIs for zero technical barriers
|
||||
- **🔒 Secure & Controlled**: Support for local deployment and private cloud, ensuring complete data sovereignty
|
||||
|
||||
## 📊 Application Scenarios
|
||||
|
||||
| Scenario | Applications | Core Value |
|
||||
|---------|----------|----------|
|
||||
| **Enterprise Knowledge Management** | Internal document retrieval, policy Q&A, operation manual search | Improve knowledge discovery efficiency, reduce training costs |
|
||||
| **Academic Research Analysis** | Paper retrieval, research report analysis, scholarly material organization | Accelerate literature review, assist research decisions |
|
||||
| **Product Technical Support** | Product manual Q&A, technical documentation search, troubleshooting | Enhance customer service quality, reduce support burden |
|
||||
| **Legal & Compliance Review** | Contract clause retrieval, regulatory policy search, case analysis | Improve compliance efficiency, reduce legal risks |
|
||||
| **Medical Knowledge Assistance** | Medical literature retrieval, treatment guideline search, case analysis | Support clinical decisions, improve diagnosis quality |
|
||||
|
||||
## 🧩 Feature Matrix
|
||||
|
||||
| Module | Support | Description |
|
||||
|---------|---------|------|
|
||||
| Document Formats | ✅ PDF / Word / Txt / Markdown / Images (with OCR / Caption) | Support for structured and unstructured documents with text extraction from images |
|
||||
| Embedding Models | ✅ Local models, BGE / GTE APIs, etc. | Customizable embedding models, compatible with local deployment and cloud vector generation APIs |
|
||||
| Vector DB Integration | ✅ PostgreSQL (pgvector), Elasticsearch | Support for mainstream vector index backends, flexible switching for different retrieval scenarios |
|
||||
| Retrieval Strategies | ✅ BM25 / Dense Retrieval / GraphRAG | Support for sparse/dense recall and knowledge graph-enhanced retrieval with customizable retrieve-rerank-generate pipelines |
|
||||
| LLM Integration | ✅ Support for Qwen, DeepSeek, etc., with thinking/non-thinking mode switching | Compatible with local models (e.g., via Ollama) or external API services with flexible inference configuration |
|
||||
| QA Capabilities | ✅ Context-aware, multi-turn dialogue, prompt templates | Support for complex semantic modeling, instruction control and chain-of-thought Q&A with configurable prompts and context windows |
|
||||
| E2E Testing | ✅ Retrieval+generation process visualization and metric evaluation | End-to-end testing tools for evaluating recall hit rates, answer coverage, BLEU/ROUGE and other metrics |
|
||||
| Deployment Modes | ✅ Support for local deployment / Docker images | Meets private, offline deployment and flexible operation requirements |
|
||||
| User Interfaces | ✅ Web UI + RESTful API | Interactive interface and standard API endpoints, suitable for both developers and business users |
|
||||
|
||||
## 🚀 Getting Started
|
||||
|
||||
### 🛠 Prerequisites
|
||||
|
||||
Make sure the following tools are installed on your system:
|
||||
|
||||
* [Docker](https://www.docker.com/)
|
||||
* [Docker Compose](https://docs.docker.com/compose/)
|
||||
* [Git](https://git-scm.com/)
|
||||
|
||||
### 📦 Installation
|
||||
|
||||
#### ① Clone the repository
|
||||
|
||||
```bash
|
||||
# Clone the main repository
|
||||
git clone https://github.com/Tencent/WeKnora.git
|
||||
cd WeKnora
|
||||
```
|
||||
|
||||
#### ② Configure environment variables
|
||||
|
||||
```bash
|
||||
# Copy example env file
|
||||
cp .env.example .env
|
||||
|
||||
# Edit .env and set required values
|
||||
# All variables are documented in the .env.example comments
|
||||
```
|
||||
|
||||
#### ③ Start the services
|
||||
|
||||
```bash
|
||||
# Start all services (Ollama + backend containers)
|
||||
./scripts/start_all.sh
|
||||
# Or
|
||||
make start-all
|
||||
```
|
||||
|
||||
#### ③ Start the services (backup)
|
||||
|
||||
```bash
|
||||
# Start ollama services (Optional)
|
||||
ollama serve > /dev/null 2>&1 &
|
||||
|
||||
# Start the service
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
#### ④ Stop the services
|
||||
|
||||
```bash
|
||||
./scripts/start_all.sh --stop
|
||||
# Or
|
||||
make stop-all
|
||||
```
|
||||
|
||||
### 🌐 Access Services
|
||||
|
||||
Once started, services will be available at:
|
||||
|
||||
* Web UI: `http://localhost`
|
||||
* Backend API: `http://localhost:8080`
|
||||
* Jaeger Tracing: `http://localhost:16686`
|
||||
|
||||
### 🔌 Using WeChat Dialog Open Platform
|
||||
|
||||
WeKnora serves as the core technology framework for the [WeChat Dialog Open Platform](https://chatbot.weixin.qq.com), providing a more convenient usage approach:
|
||||
|
||||
- **Zero-code Deployment**: Simply upload knowledge to quickly deploy intelligent Q&A services within the WeChat ecosystem, achieving an "ask and answer" experience
|
||||
- **Efficient Question Management**: Support for categorized management of high-frequency questions, with rich data tools to ensure accurate, reliable, and easily maintainable answers
|
||||
- **WeChat Ecosystem Integration**: Through the WeChat Dialog Open Platform, WeKnora's intelligent Q&A capabilities can be seamlessly integrated into WeChat Official Accounts, Mini Programs, and other WeChat scenarios, enhancing user interaction experiences
|
||||
|
||||
## 📱 Interface Showcase
|
||||
|
||||
### Web UI Interface
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td><b>Knowledge Upload</b><br/><img src="./docs/images/knowledges.png" alt="Knowledge Upload Interface"></td>
|
||||
<td><b>Q&A Entry</b><br/><img src="./docs/images/qa.png" alt="Q&A Entry Interface"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="2"><b>Rich Text & Image Responses</b><br/><img src="./docs/images/answer.png" alt="Rich Answer Interface"></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
**Knowledge Base Management:** Support for dragging and dropping various documents, automatically identifying document structures and extracting core knowledge to establish indexes. The system clearly displays processing progress and document status, achieving efficient knowledge base management.
|
||||
|
||||
### Document Knowledge Graph
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src="./docs/images/graph2.png" alt="Knowledge Graph View 1"></td>
|
||||
<td><img src="./docs/images/graph1.png" alt="Knowledge Graph View 2"></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
WeKnora supports transforming documents into knowledge graphs, displaying the relationships between different sections of the documents. Once the knowledge graph feature is enabled, the system analyzes and constructs an internal semantic association network that not only helps users understand document content but also provides structured support for indexing and retrieval, enhancing the relevance and breadth of search results.
|
||||
|
||||
## 📘 API Reference
|
||||
|
||||
Detailed API documentation is available at: [API Docs](./docs/API.md)
|
||||
|
||||
## 🧭 Developer Guide
|
||||
|
||||
### 📁 Directory Structure
|
||||
|
||||
```
|
||||
WeKnora/
|
||||
├── cmd/ # Main entry point
|
||||
├── internal/ # Core business logic
|
||||
├── config/ # Configuration files
|
||||
├── migrations/ # DB migration scripts
|
||||
├── scripts/ # Shell scripts
|
||||
├── services/ # Microservice logic
|
||||
├── frontend/ # Frontend app
|
||||
└── docs/ # Project documentation
|
||||
```
|
||||
|
||||
### 🔧 Common Commands
|
||||
|
||||
```bash
|
||||
# Wipe all data from DB (use with caution)
|
||||
make clean-db
|
||||
```
|
||||
|
||||
## 🤝 Contributing
|
||||
|
||||
We welcome community contributions! For suggestions, bugs, or feature requests, please submit an [Issue](https://github.com/Tencent/WeKnora/issues) or directly create a Pull Request.
|
||||
|
||||
### 🎯 How to Contribute
|
||||
|
||||
- 🐛 **Bug Fixes**: Discover and fix system defects
|
||||
- ✨ **New Features**: Propose and implement new capabilities
|
||||
- 📚 **Documentation**: Improve project documentation
|
||||
- 🧪 **Test Cases**: Write unit and integration tests
|
||||
- 🎨 **UI/UX Enhancements**: Improve user interface and experience
|
||||
|
||||
### 📋 Contribution Process
|
||||
|
||||
1. **Fork the project** to your GitHub account
|
||||
2. **Create a feature branch** `git checkout -b feature/amazing-feature`
|
||||
3. **Commit changes** `git commit -m 'Add amazing feature'`
|
||||
4. **Push branch** `git push origin feature/amazing-feature`
|
||||
5. **Create a Pull Request** with detailed description of changes
|
||||
|
||||
### 🎨 Code Standards
|
||||
|
||||
- Follow [Go Code Review Comments](https://github.com/golang/go/wiki/CodeReviewComments)
|
||||
- Format code using `gofmt`
|
||||
- Add necessary unit tests
|
||||
- Update relevant documentation
|
||||
|
||||
### 📝 Commit Guidelines
|
||||
|
||||
Use [Conventional Commits](https://www.conventionalcommits.org/) standard:
|
||||
|
||||
```
|
||||
feat: Add document batch upload functionality
|
||||
fix: Resolve vector retrieval precision issue
|
||||
docs: Update API documentation
|
||||
test: Add retrieval engine test cases
|
||||
refactor: Restructure document parsing module
|
||||
```
|
||||
|
||||
## 📄 License
|
||||
|
||||
This project is licensed under the [MIT License](./LICENSE).
|
||||
You are free to use, modify, and distribute the code with proper attribution.
|
||||
132
README_JA.md
@@ -14,10 +14,13 @@
|
||||
<a href="https://github.com/Tencent/WeKnora/blob/main/LICENSE">
|
||||
<img src="https://img.shields.io/badge/License-MIT-ffffff?labelColor=d4eaf7&color=2e6cc4" alt="License">
|
||||
</a>
|
||||
<a href="./CHANGELOG.md">
|
||||
<img alt="バージョン" src="https://img.shields.io/badge/version-0.1.3-2e6cc4?labelColor=d4eaf7">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
| <a href="./README_EN.md"><b>English</b></a> | <a href="./README.md"><b>简体中文</b></a> | <b>日本語</b> |
|
||||
| <a href="./README.md"><b>English</b></a> | <a href="./README_CN.md"><b>简体中文</b></a> | <b>日本語</b> |
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
@@ -38,6 +41,15 @@
|
||||
|
||||
**公式サイト:** https://weknora.weixin.qq.com
|
||||
|
||||
## 🔒 セキュリティ通知
|
||||
|
||||
**重要:** v0.1.3バージョンより、WeKnoraにはシステムセキュリティを強化するためのログイン認証機能が含まれています。本番環境でのデプロイメントにおいて、以下を強く推奨します:
|
||||
|
||||
- WeKnoraサービスはパブリックインターネットではなく、内部/プライベートネットワーク環境にデプロイしてください
|
||||
- 重要な情報漏洩を防ぐため、サービスを直接パブリックネットワークに公開することは避けてください
|
||||
- デプロイメント環境に適切なファイアウォールルールとアクセス制御を設定してください
|
||||
- セキュリティパッチと改善のため、定期的に最新バージョンに更新してください
|
||||
|
||||
## 🏗️ アーキテクチャ設計
|
||||
|
||||

|
||||
@@ -107,25 +119,58 @@ cp .env.example .env
|
||||
# すべての変数の説明は.env.exampleのコメントを参照
|
||||
```
|
||||
|
||||
#### ③ サービスの起動
|
||||
#### ③ サービスを起動します(Ollama を含む)
|
||||
|
||||
.env ファイルで、起動する必要があるイメージを確認します。
|
||||
|
||||
```bash
|
||||
# すべてのサービスを起動(Ollamaとバックエンドコンテナを含む)
|
||||
./scripts/start_all.sh
|
||||
# または
|
||||
```
|
||||
|
||||
または
|
||||
|
||||
```bash
|
||||
make start-all
|
||||
```
|
||||
|
||||
#### ③ サービス起動の代替方法
|
||||
#### ③.0 ollama サービスを起動する (オプション)
|
||||
|
||||
```bash
|
||||
# ollamaサービスを起動(オプション)
|
||||
ollama serve > /dev/null 2>&1 &
|
||||
```
|
||||
|
||||
# サービスを起動
|
||||
#### ③.1 さまざまな機能の組み合わせを有効にする
|
||||
|
||||
- 最小限のコアサービス
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
- すべての機能を有効にする
|
||||
```bash
|
||||
docker-compose --profile full up -d
|
||||
```
|
||||
|
||||
- トレースログが必要
|
||||
```bash
|
||||
docker-compose --profile jaeger up -d
|
||||
```
|
||||
|
||||
- Neo4j ナレッジグラフが必要
|
||||
```bash
|
||||
docker-compose --profile neo4j up -d
|
||||
```
|
||||
|
||||
- Minio ファイルストレージサービスが必要
|
||||
```bash
|
||||
docker-compose --profile minio up -d
|
||||
```
|
||||
|
||||
- 複数のオプションの組み合わせ
|
||||
```bash
|
||||
docker-compose --profile neo4j --profile minio up -d
|
||||
```
|
||||
|
||||
#### ④ サービスの停止
|
||||
|
||||
```bash
|
||||
@@ -150,12 +195,17 @@ WeKnoraは[WeChat対話オープンプラットフォーム](https://chatbot.wei
|
||||
- **効率的な問題管理**:高頻度の問題の独立した分類管理をサポートし、豊富なデータツールを提供して、正確で信頼性が高く、メンテナンスが容易な回答を保証
|
||||
- **WeChatエコシステムカバレッジ**:WeChat対話オープンプラットフォームを通じて、WeKnoraのインテリジェントQ&A能力を公式アカウント、ミニプログラムなどのWeChatシナリオにシームレスに統合し、ユーザーインタラクション体験を向上
|
||||
|
||||
### 🔗MCPサーバーを使用してデプロイ済みのWeKnoraにアクセス
|
||||
### 🔗 MCP サーバーを使用してデプロイ済みの WeKnora にアクセス
|
||||
|
||||
#### 1️⃣リポジトリのクローン
|
||||
```
|
||||
git clone https://github.com/Tencent/WeKnora
|
||||
```
|
||||
#### 2️⃣MCPサーバーの設定
|
||||
|
||||
#### 2️⃣ MCPサーバーの設定
|
||||
|
||||
> 設定には直接 [MCP設定説明](./mcp-server/MCP_CONFIG.md) を参照することをお勧めします。
|
||||
|
||||
MCPクライアントでサーバーを設定
|
||||
```json
|
||||
{
|
||||
@@ -173,6 +223,7 @@ MCPクライアントでサーバーを設定
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
stdioコマンドで直接実行
|
||||
```
|
||||
pip install weknora-mcp-server
|
||||
@@ -206,10 +257,7 @@ make clean-db
|
||||
|
||||
http://localhost
|
||||
|
||||
初回アクセス時は自動的に初期設定ページにリダイレクトされ、設定完了後は自動的にナレッジベースページにリダイレクトされます。ページの指示に従ってモデルの設定を完了してください。
|
||||
|
||||

|
||||
|
||||
初回アクセス時は自動的に登録・ログインページに遷移します。登録完了後、新規にナレッジベースを作成し、その設定画面で必要な項目を構成してください。
|
||||
|
||||
## 📱 機能デモ
|
||||
|
||||
@@ -229,17 +277,13 @@ http://localhost
|
||||
|
||||
### 文書ナレッジグラフ
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src="./docs/images/graph2.png" alt="ナレッジグラフ表示1"></td>
|
||||
<td><img src="./docs/images/graph1.png" alt="ナレッジグラフ表示2"></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
WeKnoraは文書をナレッジグラフに変換し、文書内の異なる段落間の関連関係を表示することをサポートします。ナレッジグラフ機能を有効にすると、システムは文書内部の意味関連ネットワークを分析・構築し、ユーザーが文書内容を理解するのを助けるだけでなく、インデックスと検索に構造化サポートを提供し、検索結果の関連性と幅を向上させます。
|
||||
|
||||
### 対応MCPサーバー呼び出し効果
|
||||
<img width="950" height="2063" alt="118d078426f42f3d4983c13386085d7f" src="https://github.com/user-attachments/assets/09111ec8-0489-415c-969d-aa3835778e14" />
|
||||
詳細な設定については、[ナレッジグラフ設定ガイド](./docs/KnowledgeGraph.md)をご参照ください。
|
||||
|
||||
### 対応するMCPサーバー
|
||||
|
||||
[MCP設定ガイド](./mcp-server/MCP_CONFIG.md) をご参照のうえ、必要な設定を行ってください。
|
||||
|
||||
|
||||
## 📘 ドキュメント
|
||||
@@ -253,22 +297,18 @@ WeKnoraは文書をナレッジグラフに変換し、文書内の異なる段
|
||||
### 📁 プロジェクトディレクトリ構造
|
||||
|
||||
```
|
||||
WeKnora/
|
||||
├── cmd/ # アプリケーションエントリー
|
||||
├── internal/ # コアビジネスロジック
|
||||
├── config/ # 設定ファイル
|
||||
├── migrations/ # データベースマイグレーションスクリプト
|
||||
├── scripts/ # 起動とツールスクリプト
|
||||
├── services/ # 各サブサービスの実装
|
||||
├── frontend/ # フロントエンドプロジェクト
|
||||
└── docs/ # プロジェクトドキュメント
|
||||
```
|
||||
|
||||
### 🔧 よく使うコマンド
|
||||
|
||||
```bash
|
||||
# データベースをクリア(注意して使用!)
|
||||
make clean-db
|
||||
WeKnora/
|
||||
├── client/ # Goクライアント
|
||||
├── cmd/ # アプリケーションエントリ
|
||||
├── config/ # 設定ファイル
|
||||
├── docker/ # Dockerイメージファイル
|
||||
├── docreader/ # 文書解析プロジェクト
|
||||
├── docs/ # プロジェクトドキュメント
|
||||
├── frontend/ # フロントエンドプロジェクト
|
||||
├── internal/ # コアビジネスロジック
|
||||
├── mcp-server/ # MCPサーバー
|
||||
├── migrations/ # データベースマイグレーションスクリプト
|
||||
└── scripts/ # 起動およびツールスクリプト
|
||||
```
|
||||
|
||||
## 🤝 貢献ガイド
|
||||
@@ -310,7 +350,23 @@ test: 検索エンジンテストケースを追加
|
||||
refactor: 文書解析モジュールをリファクタリング
|
||||
```
|
||||
|
||||
## 👥 コントリビューター
|
||||
|
||||
素晴らしいコントリビューターに感謝します:
|
||||
|
||||
[](https://github.com/Tencent/WeKnora/graphs/contributors )
|
||||
|
||||
## 📄 ライセンス
|
||||
|
||||
このプロジェクトは[MIT](./LICENSE)ライセンスの下で公開されています。
|
||||
このプロジェクトのコードを自由に使用、変更、配布できますが、元の著作権表示を保持する必要があります。
|
||||
|
||||
## 📈 プロジェクト統計
|
||||
|
||||
<a href="https://www.star-history.com/#Tencent/WeKnora&type=date&legend=top-left">
|
||||
<picture>
|
||||
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&theme=dark&legend=top-left" />
|
||||
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&legend=top-left" />
|
||||
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&legend=top-left" />
|
||||
</picture>
|
||||
</a>
|
||||
|
||||
@@ -74,6 +74,9 @@ type UpdateImageInfoRequest struct {
|
||||
// ErrDuplicateFile is returned when attempting to create a knowledge entry with a file that already exists
|
||||
var ErrDuplicateFile = errors.New("file already exists")
|
||||
|
||||
// ErrDuplicateURL is returned when attempting to create a knowledge entry with a URL that already exists
|
||||
var ErrDuplicateURL = errors.New("URL already exists")
|
||||
|
||||
// CreateKnowledgeFromFile creates a knowledge entry from a local file path
|
||||
func (c *Client) CreateKnowledgeFromFile(ctx context.Context,
|
||||
knowledgeBaseID string, filePath string, metadata map[string]string, enableMultimodel *bool,
|
||||
@@ -186,7 +189,12 @@ func (c *Client) CreateKnowledgeFromURL(ctx context.Context, knowledgeBaseID str
|
||||
}
|
||||
|
||||
var response KnowledgeResponse
|
||||
if err := parseResponse(resp, &response); err != nil {
|
||||
if resp.StatusCode == http.StatusConflict {
|
||||
if err := json.NewDecoder(resp.Body).Decode(&response); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse response: %w", err)
|
||||
}
|
||||
return &response.Data, ErrDuplicateURL
|
||||
} else if err := parseResponse(resp, &response); err != nil {
|
||||
return nil, err
|
||||
}
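
下面是一段示意性的调用代码(假设以 client 包的形式引入,参数仅作占位,实际签名以 client 包中的定义为准),展示调用方如何通过 errors.Is 区分"URL 已存在"的情况:

```go
// 示意:创建来自 URL 的知识条目,并识别重复 URL(HTTP 409)的情况
knowledge, err := cli.CreateKnowledgeFromURL(ctx, knowledgeBaseID, url) // 参数形式仅作示意
if errors.Is(err, client.ErrDuplicateURL) {
    // 服务端返回 409 时,响应体仍包含已有条目,knowledge 可直接复用
} else if err != nil {
    return err
}
```
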
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ conversation:
|
||||
keyword_threshold: 0.3
|
||||
embedding_top_k: 10
|
||||
vector_threshold: 0.5
|
||||
rerank_threshold: 0.7
|
||||
rerank_threshold: 0.5
|
||||
rerank_top_k: 5
|
||||
fallback_strategy: "fixed"
|
||||
fallback_response: "抱歉,我无法回答这个问题。"
|
||||
@@ -534,3 +534,69 @@ knowledge_base:
|
||||
split_markers: ["\n\n", "\n", "。"]
|
||||
image_processing:
|
||||
enable_multimodal: true
|
||||
|
||||
extract:
|
||||
extract_graph:
|
||||
description: |
|
||||
请基于给定文本,按以下步骤完成信息提取任务,确保逻辑清晰、信息完整准确:
|
||||
|
||||
## 一、实体提取与属性补充
|
||||
1. **提取核心实体**:通读文本,按逻辑顺序(如文本叙述顺序、实体关联紧密程度)提取所有与任务相关的核心实体。
|
||||
2. **补充实体详细属性**:针对每个提取的实体,全面补充其在文本中明确提及的详细属性,确保无关键属性遗漏。
|
||||
|
||||
## 二、关系提取与验证
|
||||
1. **明确关系类型**:仅从指定关系列表中选择对应类型,限定关系类型为: %s。
|
||||
2. **提取有效关系**:基于已提取的实体及属性,识别文本中真实存在的关系,确保关系符合文本事实、无虚假关联。
|
||||
3. **明确关系主体**:对每一组提取的关系,清晰标注两个关联主体,避免主体混淆。
|
||||
4. **补充关联属性**:若文本中存在与该关系直接相关的补充信息,需将该信息作为关系的关联属性补充,进一步完善关系信息。
|
||||
tags:
|
||||
- "作者"
|
||||
- "别名"
|
||||
examples:
|
||||
- text: |
|
||||
《红楼梦》,又名《石头记》,是清代作家曹雪芹创作的中国古典四大名著之一,被誉为中国封建社会的百科全书。该书前80回由曹雪芹所著,后40回一般认为是高鹗所续。
|
||||
小说以贾、史、王、薛四大家族的兴衰为背景,以贾宝玉、林黛玉和薛宝钗的爱情悲剧为主线,刻画了以贾宝玉和金陵十二钗为中心的正邪两赋、贤愚并出的高度复杂的人物群像。
|
||||
成书于乾隆年间(1743年前后),是中国文学史上现实主义的高峰,对后世影响深远。
|
||||
node:
|
||||
- name: "红楼梦"
|
||||
attributes:
|
||||
- "中国古典四大名著之一"
|
||||
- "又名《石头记》"
|
||||
- "被誉为中国封建社会的百科全书"
|
||||
- name: "石头记"
|
||||
attributes:
|
||||
- "《红楼梦》的别名"
|
||||
- name: "曹雪芹"
|
||||
attributes:
|
||||
- "清代作家"
|
||||
- "《红楼梦》前 80 回的作者"
|
||||
- name: "高鹗"
|
||||
attributes:
|
||||
- "一般认为是《红楼梦》后 40 回的续写者"
|
||||
relation:
|
||||
- node1: "红楼梦"
|
||||
node2: "曹雪芹"
|
||||
type: "作者"
|
||||
- node1: "红楼梦"
|
||||
node2: "高鹗"
|
||||
type: "作者"
|
||||
- node1: "红楼梦"
|
||||
node2: "石头记"
|
||||
type: "别名"
|
||||
extract_entity:
|
||||
description: |
|
||||
请基于用户给的问题,按以下步骤处理关键信息提取任务:
|
||||
1. 梳理逻辑关联:首先完整分析文本内容,明确其核心逻辑关系,并简要标注该核心逻辑类型;
|
||||
2. 提取关键实体:围绕梳理出的逻辑关系,精准提取文本中的关键信息并归类为明确实体,确保不遗漏核心信息、不添加冗余内容;
|
||||
3. 排序实体优先级:按实体与文本核心主题的关联紧密程度排序,优先呈现对理解文本主旨最重要的实体;
|
||||
examples:
|
||||
- text: "《红楼梦》,又名《石头记》,是清代作家曹雪芹创作的中国古典四大名著之一,被誉为中国封建社会的百科全书。"
|
||||
node:
|
||||
- name: "红楼梦"
|
||||
- name: "曹雪芹"
|
||||
- name: "中国古典四大名著"
|
||||
fabri_text:
|
||||
with_tag: |
|
||||
请随机生成一段文本,要求内容与 %s 等相关,字数在 [50-200] 之间,并且尽量包含一些与这些标签相关的专业术语或典型元素,使文本更具针对性和相关性。
|
||||
with_no_tag: |
|
||||
请随机生成一段文本,内容请自由发挥,字数在 [50-200] 之间。
|
||||
@@ -1,20 +1,49 @@
|
||||
services:
|
||||
frontend:
|
||||
image: wechatopenai/weknora-ui:latest
|
||||
build: ./frontend
|
||||
container_name: WeKnora-frontend
|
||||
ports:
|
||||
- "${FRONTEND_PORT:-80}:80"
|
||||
depends_on:
|
||||
app:
|
||||
condition: service_healthy
|
||||
networks:
|
||||
- WeKnora-network
|
||||
restart: unless-stopped
|
||||
|
||||
app:
|
||||
image: wechatopenai/weknora-app:latest
|
||||
build:
|
||||
context: .
|
||||
dockerfile: docker/Dockerfile.app
|
||||
container_name: WeKnora-app
|
||||
ports:
|
||||
- "8080:8080"
|
||||
- "${APP_PORT:-8080}:8080"
|
||||
volumes:
|
||||
- data-files:/data/files
|
||||
- ./config/config.yaml:/app/config/config.yaml
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 60s
|
||||
environment:
|
||||
- GIN_MODE=${GIN_MODE}
|
||||
- COS_SECRET_ID=${COS_SECRET_ID:-}
|
||||
- COS_SECRET_KEY=${COS_SECRET_KEY:-}
|
||||
- COS_REGION=${COS_REGION:-}
|
||||
- COS_BUCKET_NAME=${COS_BUCKET_NAME:-}
|
||||
- COS_APP_ID=${COS_APP_ID:-}
|
||||
- COS_PATH_PREFIX=${COS_PATH_PREFIX:-}
|
||||
- COS_ENABLE_OLD_DOMAIN=${COS_ENABLE_OLD_DOMAIN:-}
|
||||
- GIN_MODE=${GIN_MODE:-}
|
||||
- DB_DRIVER=postgres
|
||||
- DB_HOST=postgres
|
||||
- DB_PORT=5432
|
||||
- DB_USER=${DB_USER}
|
||||
- DB_PASSWORD=${DB_PASSWORD}
|
||||
- DB_NAME=${DB_NAME}
|
||||
- DB_USER=${DB_USER:-}
|
||||
- DB_PASSWORD=${DB_PASSWORD:-}
|
||||
- DB_NAME=${DB_NAME:-}
|
||||
- TZ=Asia/Shanghai
|
||||
- OTEL_EXPORTER_OTLP_ENDPOINT=jaeger:4317
|
||||
- OTEL_SERVICE_NAME=WeKnora
|
||||
@@ -22,141 +51,101 @@ services:
|
||||
- OTEL_METRICS_EXPORTER=none
|
||||
- OTEL_LOGS_EXPORTER=none
|
||||
- OTEL_PROPAGATORS=tracecontext,baggage
|
||||
- RETRIEVE_DRIVER=${RETRIEVE_DRIVER}
|
||||
- ELASTICSEARCH_ADDR=${ELASTICSEARCH_ADDR}
|
||||
- ELASTICSEARCH_USERNAME=${ELASTICSEARCH_USERNAME}
|
||||
- ELASTICSEARCH_PASSWORD=${ELASTICSEARCH_PASSWORD}
|
||||
- ELASTICSEARCH_INDEX=${ELASTICSEARCH_INDEX}
|
||||
- RETRIEVE_DRIVER=${RETRIEVE_DRIVER:-}
|
||||
- ELASTICSEARCH_ADDR=${ELASTICSEARCH_ADDR:-}
|
||||
- ELASTICSEARCH_USERNAME=${ELASTICSEARCH_USERNAME:-}
|
||||
- ELASTICSEARCH_PASSWORD=${ELASTICSEARCH_PASSWORD:-}
|
||||
- ELASTICSEARCH_INDEX=${ELASTICSEARCH_INDEX:-}
|
||||
- DOCREADER_ADDR=docreader:50051
|
||||
- STORAGE_TYPE=${STORAGE_TYPE}
|
||||
- LOCAL_STORAGE_BASE_DIR=${LOCAL_STORAGE_BASE_DIR}
|
||||
- STORAGE_TYPE=${STORAGE_TYPE:-}
|
||||
- LOCAL_STORAGE_BASE_DIR=${LOCAL_STORAGE_BASE_DIR:-}
|
||||
- MINIO_ENDPOINT=minio:9000
|
||||
- MINIO_ACCESS_KEY_ID=${MINIO_ACCESS_KEY_ID:-minioadmin}
|
||||
- MINIO_SECRET_ACCESS_KEY=${MINIO_SECRET_ACCESS_KEY:-minioadmin}
|
||||
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME}
|
||||
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME:-}
|
||||
- OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-http://host.docker.internal:11434}
|
||||
- STREAM_MANAGER_TYPE=${STREAM_MANAGER_TYPE}
|
||||
- STREAM_MANAGER_TYPE=${STREAM_MANAGER_TYPE:-}
|
||||
- REDIS_ADDR=redis:6379
|
||||
- REDIS_PASSWORD=${REDIS_PASSWORD}
|
||||
- REDIS_DB=${REDIS_DB}
|
||||
- REDIS_PREFIX=${REDIS_PREFIX}
|
||||
- ENABLE_GRAPH_RAG=${ENABLE_GRAPH_RAG}
|
||||
- TENANT_AES_KEY=${TENANT_AES_KEY}
|
||||
- REDIS_PASSWORD=${REDIS_PASSWORD:-}
|
||||
- REDIS_DB=${REDIS_DB:-}
|
||||
- REDIS_PREFIX=${REDIS_PREFIX:-}
|
||||
- ENABLE_GRAPH_RAG=${ENABLE_GRAPH_RAG:-}
|
||||
- NEO4J_ENABLE=${NEO4J_ENABLE:-}
|
||||
- NEO4J_URI=bolt://neo4j:7687
|
||||
- NEO4J_USERNAME=${NEO4J_USERNAME:-neo4j}
|
||||
- NEO4J_PASSWORD=${NEO4J_PASSWORD:-password}
|
||||
- TENANT_AES_KEY=${TENANT_AES_KEY:-}
|
||||
- CONCURRENCY_POOL_SIZE=${CONCURRENCY_POOL_SIZE:-5}
|
||||
- INIT_LLM_MODEL_NAME=${INIT_LLM_MODEL_NAME}
|
||||
- INIT_LLM_MODEL_BASE_URL=${INIT_LLM_MODEL_BASE_URL}
|
||||
- INIT_LLM_MODEL_API_KEY=${INIT_LLM_MODEL_API_KEY}
|
||||
- INIT_EMBEDDING_MODEL_NAME=${INIT_EMBEDDING_MODEL_NAME}
|
||||
- INIT_EMBEDDING_MODEL_BASE_URL=${INIT_EMBEDDING_MODEL_BASE_URL}
|
||||
- INIT_EMBEDDING_MODEL_API_KEY=${INIT_EMBEDDING_MODEL_API_KEY}
|
||||
- INIT_EMBEDDING_MODEL_DIMENSION=${INIT_EMBEDDING_MODEL_DIMENSION}
|
||||
- INIT_EMBEDDING_MODEL_ID=${INIT_EMBEDDING_MODEL_ID}
|
||||
- INIT_RERANK_MODEL_NAME=${INIT_RERANK_MODEL_NAME}
|
||||
- INIT_RERANK_MODEL_BASE_URL=${INIT_RERANK_MODEL_BASE_URL}
|
||||
- INIT_RERANK_MODEL_API_KEY=${INIT_RERANK_MODEL_API_KEY}
|
||||
- INIT_LLM_MODEL_NAME=${INIT_LLM_MODEL_NAME:-}
|
||||
- INIT_LLM_MODEL_BASE_URL=${INIT_LLM_MODEL_BASE_URL:-}
|
||||
- INIT_LLM_MODEL_API_KEY=${INIT_LLM_MODEL_API_KEY:-}
|
||||
- INIT_EMBEDDING_MODEL_NAME=${INIT_EMBEDDING_MODEL_NAME:-}
|
||||
- INIT_EMBEDDING_MODEL_BASE_URL=${INIT_EMBEDDING_MODEL_BASE_URL:-}
|
||||
- INIT_EMBEDDING_MODEL_API_KEY=${INIT_EMBEDDING_MODEL_API_KEY:-}
|
||||
- INIT_EMBEDDING_MODEL_DIMENSION=${INIT_EMBEDDING_MODEL_DIMENSION:-}
|
||||
- INIT_EMBEDDING_MODEL_ID=${INIT_EMBEDDING_MODEL_ID:-}
|
||||
- INIT_RERANK_MODEL_NAME=${INIT_RERANK_MODEL_NAME:-}
|
||||
- INIT_RERANK_MODEL_BASE_URL=${INIT_RERANK_MODEL_BASE_URL:-}
|
||||
- INIT_RERANK_MODEL_API_KEY=${INIT_RERANK_MODEL_API_KEY:-}
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_started
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
minio:
|
||||
condition: service_started
|
||||
docreader:
|
||||
condition: service_healthy
|
||||
networks:
|
||||
- WeKnora-network
|
||||
restart: unless-stopped
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
|
||||
minio:
|
||||
image: minio/minio:latest
|
||||
container_name: WeKnora-minio
|
||||
ports:
|
||||
- "${MINIO_PORT:-9000}:9000"
|
||||
- "${MINIO_CONSOLE_PORT:-9001}:9001"
|
||||
environment:
|
||||
- MINIO_ROOT_USER=${MINIO_ACCESS_KEY_ID:-minioadmin}
|
||||
- MINIO_ROOT_PASSWORD=${MINIO_SECRET_ACCESS_KEY:-minioadmin}
|
||||
command: server --console-address ":9001" /data
|
||||
volumes:
|
||||
- minio_data:/data
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||
interval: 30s
|
||||
timeout: 20s
|
||||
retries: 3
|
||||
networks:
|
||||
- WeKnora-network
|
||||
|
||||
frontend:
|
||||
image: wechatopenai/weknora-ui:latest
|
||||
container_name: WeKnora-frontend
|
||||
ports:
|
||||
- "80:80"
|
||||
depends_on:
|
||||
- app
|
||||
networks:
|
||||
- WeKnora-network
|
||||
restart: unless-stopped
|
||||
|
||||
docreader:
|
||||
image: wechatopenai/weknora-docreader:latest
|
||||
build:
|
||||
context: .
|
||||
dockerfile: docker/Dockerfile.docreader
|
||||
container_name: WeKnora-docreader
|
||||
ports:
|
||||
- "50051:50051"
|
||||
- "${DOCREADER_PORT:-50051}:50051"
|
||||
environment:
|
||||
- COS_SECRET_ID=${COS_SECRET_ID}
|
||||
- COS_SECRET_KEY=${COS_SECRET_KEY}
|
||||
- COS_REGION=${COS_REGION}
|
||||
- COS_BUCKET_NAME=${COS_BUCKET_NAME}
|
||||
- COS_APP_ID=${COS_APP_ID}
|
||||
- COS_PATH_PREFIX=${COS_PATH_PREFIX}
|
||||
- COS_ENABLE_OLD_DOMAIN=${COS_ENABLE_OLD_DOMAIN}
|
||||
- VLM_MODEL_BASE_URL=${VLM_MODEL_BASE_URL}
|
||||
- VLM_MODEL_NAME=${VLM_MODEL_NAME}
|
||||
- VLM_MODEL_API_KEY=${VLM_MODEL_API_KEY}
|
||||
- STORAGE_TYPE=${STORAGE_TYPE}
|
||||
- COS_SECRET_ID=${COS_SECRET_ID:-}
|
||||
- COS_SECRET_KEY=${COS_SECRET_KEY:-}
|
||||
- COS_REGION=${COS_REGION:-}
|
||||
- COS_BUCKET_NAME=${COS_BUCKET_NAME:-}
|
||||
- COS_APP_ID=${COS_APP_ID:-}
|
||||
- COS_PATH_PREFIX=${COS_PATH_PREFIX:-}
|
||||
- COS_ENABLE_OLD_DOMAIN=${COS_ENABLE_OLD_DOMAIN:-}
|
||||
- VLM_MODEL_BASE_URL=${VLM_MODEL_BASE_URL:-}
|
||||
- VLM_MODEL_NAME=${VLM_MODEL_NAME:-}
|
||||
- VLM_MODEL_API_KEY=${VLM_MODEL_API_KEY:-}
|
||||
- STORAGE_TYPE=${STORAGE_TYPE:-}
|
||||
- MINIO_PUBLIC_ENDPOINT=http://localhost:${MINIO_PORT:-9000}
|
||||
- MINIO_ENDPOINT=minio:9000
|
||||
- MINIO_ACCESS_KEY_ID=${MINIO_ACCESS_KEY_ID:-minioadmin}
|
||||
- MINIO_SECRET_ACCESS_KEY=${MINIO_SECRET_ACCESS_KEY:-minioadmin}
|
||||
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME}
|
||||
- MINIO_USE_SSL=${MINIO_USE_SSL}
|
||||
- WEB_PROXY=${WEB_PROXY}
|
||||
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME:-}
|
||||
- MINIO_USE_SSL=${MINIO_USE_SSL:-}
|
||||
- WEB_PROXY=${WEB_PROXY:-}
|
||||
- MINERU_ENDPOINT=${MINERU_ENDPOINT:-}
|
||||
healthcheck:
|
||||
test: ["CMD", "grpc_health_probe", "-addr=:50051"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 60s
|
||||
networks:
|
||||
- WeKnora-network
|
||||
restart: unless-stopped
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
|
||||
jaeger:
|
||||
image: jaegertracing/all-in-one:latest
|
||||
ports:
|
||||
- "6831:6831/udp" # Jaeger Thrift接收器
|
||||
- "6832:6832/udp" # Jaeger Thrift接收器(Compact)
|
||||
- "5778:5778" # 配置端口
|
||||
- "16686:16686" # Web UI
|
||||
- "4317:4317" # OTLP gRPC接收器
|
||||
- "4318:4318" # OTLP HTTP接收器
|
||||
- "14250:14250" # 接收模型端口
|
||||
- "14268:14268" # Jaeger HTTP接收器
|
||||
- "9411:9411" # Zipkin兼容性端口
|
||||
environment:
|
||||
- COLLECTOR_OTLP_ENABLED=true
|
||||
- COLLECTOR_ZIPKIN_HOST_PORT=:9411
|
||||
volumes:
|
||||
- jaeger_data:/var/lib/jaeger # 持久化 Jaeger 数据
|
||||
networks:
|
||||
- WeKnora-network
|
||||
restart: unless-stopped
|
||||
# 修改的PostgreSQL配置
|
||||
postgres:
|
||||
image: paradedb/paradedb:latest
|
||||
image: paradedb/paradedb:v0.18.9-pg17
|
||||
container_name: WeKnora-postgres
|
||||
ports:
|
||||
- "${DB_PORT}:5432"
|
||||
environment:
|
||||
- POSTGRES_USER=${DB_USER}
|
||||
# NOCC:hardcode-password(工具误报)
|
||||
- POSTGRES_PASSWORD=${DB_PASSWORD}
|
||||
- POSTGRES_DB=${DB_NAME}
|
||||
volumes:
|
||||
@@ -178,15 +167,79 @@ services:
|
||||
redis:
|
||||
image: redis:7.0-alpine
|
||||
container_name: WeKnora-redis
|
||||
ports:
|
||||
- "${REDIS_PORT}:6379"
|
||||
volumes:
|
||||
- redis_data:/data
|
||||
command: redis-server --appendonly yes --requirepass ${REDIS_PASSWORD}
|
||||
restart: always
|
||||
networks:
|
||||
- WeKnora-network
|
||||
|
||||
minio:
|
||||
image: minio/minio:latest
|
||||
container_name: WeKnora-minio
|
||||
ports:
|
||||
- "${MINIO_PORT:-9000}:9000"
|
||||
- "${MINIO_CONSOLE_PORT:-9001}:9001"
|
||||
environment:
|
||||
- MINIO_ROOT_USER=${MINIO_ACCESS_KEY_ID:-minioadmin}
|
||||
- MINIO_ROOT_PASSWORD=${MINIO_SECRET_ACCESS_KEY:-minioadmin}
|
||||
command: server --console-address ":9001" /data
|
||||
volumes:
|
||||
- minio_data:/data
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||
interval: 30s
|
||||
timeout: 20s
|
||||
retries: 3
|
||||
networks:
|
||||
- WeKnora-network
|
||||
profiles:
|
||||
- minio
|
||||
- full
|
||||
|
||||
jaeger:
|
||||
image: jaegertracing/all-in-one:latest
|
||||
ports:
|
||||
- "6831:6831/udp" # Jaeger Thrift接收器
|
||||
- "6832:6832/udp" # Jaeger Thrift接收器(Compact)
|
||||
- "5778:5778" # 配置端口
|
||||
- "16686:16686" # Web UI
|
||||
- "4317:4317" # OTLP gRPC接收器
|
||||
- "4318:4318" # OTLP HTTP接收器
|
||||
- "14250:14250" # 接收模型端口
|
||||
- "14268:14268" # Jaeger HTTP接收器
|
||||
- "9411:9411" # Zipkin兼容性端口
|
||||
environment:
|
||||
- COLLECTOR_OTLP_ENABLED=true
|
||||
- COLLECTOR_ZIPKIN_HOST_PORT=:9411
|
||||
volumes:
|
||||
- jaeger_data:/var/lib/jaeger # 持久化 Jaeger 数据
|
||||
networks:
|
||||
- WeKnora-network
|
||||
restart: unless-stopped
|
||||
profiles:
|
||||
- jaeger
|
||||
- full
|
||||
|
||||
neo4j:
|
||||
image: neo4j:latest
|
||||
container_name: WeKnora-neo4j
|
||||
volumes:
|
||||
- neo4j-data:/data
|
||||
environment:
|
||||
- NEO4J_AUTH=${NEO4J_USERNAME:-neo4j}/${NEO4J_PASSWORD:-password}
|
||||
- NEO4J_apoc_export_file_enabled=true
|
||||
- NEO4J_apoc_import_file_enabled=true
|
||||
- NEO4J_apoc_import_file_use__neo4j__config=true
|
||||
- NEO4JLABS_PLUGINS=["apoc"]
|
||||
ports:
|
||||
- "7474:7474"
|
||||
- "7687:7687"
|
||||
restart: always
|
||||
networks:
|
||||
- WeKnora-network
|
||||
profiles:
|
||||
- neo4j
|
||||
- full
|
||||
|
||||
networks:
|
||||
WeKnora-network:
|
||||
driver: bridge
|
||||
@@ -195,5 +248,5 @@ volumes:
|
||||
postgres-data:
|
||||
data-files:
|
||||
jaeger_data:
|
||||
redis_data:
|
||||
minio_data:
|
||||
neo4j-data:
|
||||
|
||||
@@ -3,10 +3,6 @@ FROM golang:1.24-alpine AS builder
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install dependencies
|
||||
RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.tuna.tsinghua.edu.cn/g' /etc/apk/repositories && \
|
||||
apk add --no-cache git build-base
|
||||
|
||||
# 通过构建参数接收敏感信息
|
||||
ARG GOPRIVATE_ARG
|
||||
ARG GOPROXY_ARG
|
||||
@@ -17,19 +13,33 @@ ENV GOPRIVATE=${GOPRIVATE_ARG}
|
||||
ENV GOPROXY=${GOPROXY_ARG}
|
||||
ENV GOSUMDB=${GOSUMDB_ARG}
|
||||
|
||||
# Copy go mod and sum files
|
||||
COPY go.mod go.sum ./
|
||||
RUN go mod download
|
||||
# Install dependencies
|
||||
RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.tuna.tsinghua.edu.cn/g' /etc/apk/repositories && \
|
||||
apk add --no-cache git build-base
|
||||
|
||||
ENV CGO_ENABLED=1
|
||||
# Install migrate tool
|
||||
RUN go install -tags 'postgres' github.com/golang-migrate/migrate/v4/cmd/migrate@latest
|
||||
|
||||
# Copy source code
|
||||
# Copy go mod and sum files
|
||||
COPY go.mod go.sum ./
|
||||
RUN --mount=type=cache,target=/go/pkg/mod go mod download
|
||||
COPY . .
|
||||
|
||||
# Build the application
|
||||
RUN make build-prod
|
||||
# Get version and commit info for build injection
|
||||
ARG VERSION_ARG
|
||||
ARG COMMIT_ID_ARG
|
||||
ARG BUILD_TIME_ARG
|
||||
ARG GO_VERSION_ARG
|
||||
|
||||
# Set build-time variables
|
||||
ENV VERSION=${VERSION_ARG}
|
||||
ENV COMMIT_ID=${COMMIT_ID_ARG}
|
||||
ENV BUILD_TIME=${BUILD_TIME_ARG}
|
||||
ENV GO_VERSION=${GO_VERSION_ARG}
|
||||
|
||||
# Build the application with version info
|
||||
RUN --mount=type=cache,target=/go/pkg/mod make build-prod
|
||||
RUN --mount=type=cache,target=/go/pkg/mod cp -r /go/pkg/mod/github.com/yanyiwu/ /app/yanyiwu/
|
||||
|
||||
# Final stage
|
||||
FROM alpine:3.17
|
||||
@@ -39,36 +49,31 @@ WORKDIR /app
|
||||
# Install runtime dependencies
|
||||
RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.tuna.tsinghua.edu.cn/g' /etc/apk/repositories && \
|
||||
apk update && apk upgrade && \
|
||||
apk add --no-cache build-base postgresql-client mysql-client ca-certificates tzdata sed curl bash supervisor vim wget
|
||||
|
||||
# Copy the binary from the builder stage
|
||||
COPY --from=builder /app/WeKnora .
|
||||
COPY --from=builder /app/config ./config
|
||||
COPY --from=builder /app/scripts ./scripts
|
||||
COPY --from=builder /app/migrations ./migrations
|
||||
COPY --from=builder /app/dataset/samples ./dataset/samples
|
||||
|
||||
# Copy migrate tool from builder stage
|
||||
COPY --from=builder /go/bin/migrate /usr/local/bin/
|
||||
COPY --from=builder /go/pkg/mod/github.com/yanyiwu /go/pkg/mod/github.com/yanyiwu/
|
||||
|
||||
# Make scripts executable
|
||||
RUN chmod +x ./scripts/*.sh
|
||||
|
||||
# Setup supervisor configuration
|
||||
RUN mkdir -p /etc/supervisor.d/
|
||||
COPY docker/config/supervisord.conf /etc/supervisor.d/supervisord.conf
|
||||
|
||||
# Expose ports
|
||||
EXPOSE 8080
|
||||
|
||||
# Set environment variables
|
||||
ENV CGO_ENABLED=1
|
||||
apk add --no-cache build-base postgresql-client mysql-client ca-certificates tzdata sed curl bash vim wget
|
||||
|
||||
# Create a non-root user and switch to it
|
||||
RUN mkdir -p /data/files && \
|
||||
adduser -D -g '' appuser && \
|
||||
chown -R appuser:appuser /app /data/files
|
||||
|
||||
# Run supervisor instead of direct application start
|
||||
CMD ["supervisord", "-c", "/etc/supervisor.d/supervisord.conf"]
|
||||
# Copy migrate tool from builder stage
|
||||
COPY --from=builder /go/bin/migrate /usr/local/bin/
|
||||
COPY --from=builder /app/yanyiwu/ /go/pkg/mod/github.com/yanyiwu/
|
||||
|
||||
# Copy the binary from the builder stage
|
||||
COPY --from=builder /app/config ./config
|
||||
COPY --from=builder /app/scripts ./scripts
|
||||
COPY --from=builder /app/migrations ./migrations
|
||||
COPY --from=builder /app/dataset/samples ./dataset/samples
|
||||
COPY --from=builder /app/WeKnora .
|
||||
|
||||
# Make scripts executable
|
||||
RUN chmod +x ./scripts/*.sh
|
||||
|
||||
# Expose ports
|
||||
EXPOSE 8080
|
||||
|
||||
# Switch to non-root user and run the application directly
|
||||
USER appuser
|
||||
|
||||
CMD ["./WeKnora"]
|
||||
@@ -26,50 +26,68 @@ RUN apt-get update && apt-get install -y \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# 检查是否存在本地protoc安装包,如果存在则离线安装,否则在线安装,其他安装包按需求添加
|
||||
ARG TARGETARCH
|
||||
COPY packages/ /app/packages/
|
||||
RUN echo "检查本地protoc安装包..." && \
|
||||
if [ -f "/app/packages/protoc-3.19.4-linux-x86_64.zip" ]; then \
|
||||
# 根据目标架构选择正确的protoc包名
|
||||
case ${TARGETARCH} in \
|
||||
"amd64") PROTOC_ARCH="x86_64" ;; \
|
||||
"arm64") PROTOC_ARCH="aarch_64" ;; \
|
||||
"arm") PROTOC_ARCH="arm" ;; \
|
||||
*) echo "Unsupported architecture for protoc: ${TARGETARCH}" && exit 1 ;; \
|
||||
esac && \
|
||||
PROTOC_PACKAGE="protoc-3.19.4-linux-${PROTOC_ARCH}.zip" && \
|
||||
if [ -f "/app/packages/${PROTOC_PACKAGE}" ]; then \
|
||||
echo "发现本地protoc安装包,将进行离线安装"; \
|
||||
# 离线安装:使用本地包(精确路径避免歧义)
|
||||
cp /app/packages/protoc-*.zip /app/ && \
|
||||
unzip -o /app/protoc-*.zip -d /usr/local && \
|
||||
cp /app/packages/${PROTOC_PACKAGE} /app/ && \
|
||||
unzip -o /app/${PROTOC_PACKAGE} -d /usr/local && \
|
||||
chmod +x /usr/local/bin/protoc && \
|
||||
rm -f /app/protoc-*.zip; \
|
||||
rm -f /app/${PROTOC_PACKAGE}; \
|
||||
else \
|
||||
echo "未发现本地protoc安装包,将进行在线安装"; \
|
||||
# 在线安装:从网络下载
|
||||
curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/protoc-3.19.4-linux-x86_64.zip && \
|
||||
unzip -o protoc-3.19.4-linux-x86_64.zip -d /usr/local && \
|
||||
curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/${PROTOC_PACKAGE} && \
|
||||
unzip -o ${PROTOC_PACKAGE} -d /usr/local && \
|
||||
chmod +x /usr/local/bin/protoc && \
|
||||
rm -f protoc-3.19.4-linux-x86_64.zip; \
|
||||
rm -f ${PROTOC_PACKAGE}; \
|
||||
fi
|
||||
|
||||
# 预下载 PP-OCRv4 模型
|
||||
RUN mkdir -p /root/.paddleocr/whl/det/ch && \
|
||||
mkdir -p /root/.paddleocr/whl/rec/ch && \
|
||||
mkdir -p /root/.paddleocr/whl/cls/ch && \
|
||||
# 下载检测模型
|
||||
wget https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_infer.tar \
|
||||
-O /root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer.tar && \
|
||||
tar -xf /root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer.tar -C /root/.paddleocr/whl/det/ch/ && \
|
||||
# 下载识别模型
|
||||
wget https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar \
|
||||
-O /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar && \
|
||||
tar -xf /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar -C /root/.paddleocr/whl/rec/ch/ && \
|
||||
# 下载文本方向分类模型(用于判断文本是否需要旋转)
|
||||
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar \
|
||||
-O /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar && \
|
||||
tar -xf /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar -C /root/.paddleocr/whl/cls/ && \
|
||||
# 清理压缩包
|
||||
rm -f /root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer.tar && \
|
||||
rm -f /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar && \
|
||||
rm -f /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar
|
||||
|
||||
# 复制依赖文件
|
||||
COPY services/docreader/requirements.txt .
|
||||
|
||||
# 安装依赖
|
||||
RUN pip cache purge && pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
# 预下载 PP-OCRv5 模型
|
||||
RUN mkdir -p /root/.paddlex/official_models && \
|
||||
wget https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0.0/PP-OCRv5_server_det_infer.tar \
|
||||
-O /root/.paddlex/official_models/PP-OCRv5_server_det_infer.tar && \
|
||||
wget https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0.0/PP-OCRv5_server_rec_infer.tar \
|
||||
-O /root/.paddlex/official_models/PP-OCRv5_server_rec_infer.tar && \
|
||||
tar -xf /root/.paddlex/official_models/PP-OCRv5_server_det_infer.tar -C /root/.paddlex/official_models/ && \
|
||||
tar -xf /root/.paddlex/official_models/PP-OCRv5_server_rec_infer.tar -C /root/.paddlex/official_models/ && \
|
||||
rm -rf /root/.paddlex/official_models/PP-OCRv5_server_det_infer.tar /root/.paddlex/official_models/PP-OCRv5_server_rec_infer.tar
|
||||
COPY docreader/pyproject.toml docreader/uv.lock ./
|
||||
RUN pip install uv --break-system-packages && \
|
||||
python -m uv sync --locked --no-dev
|
||||
|
||||
# 复制源代码和生成脚本
|
||||
COPY services/docreader/src/ /app/src/
|
||||
COPY services/docreader/scripts/ /app/scripts/
|
||||
|
||||
# 确保模型目录存在
|
||||
RUN ls -la /root/.paddlex/official_models
|
||||
COPY docreader docreader
|
||||
|
||||
# 生成 protobuf 代码
|
||||
RUN chmod +x /app/scripts/generate_proto.sh && bash /app/scripts/generate_proto.sh
|
||||
RUN chmod +x docreader/scripts/generate_proto.sh && \
|
||||
bash docreader/scripts/generate_proto.sh
|
||||
|
||||
# 确保模型目录存在
|
||||
RUN ls -la /root/.paddleocr/whl/
|
||||
|
||||
# =========================
|
||||
# 运行阶段
|
||||
@@ -91,7 +109,6 @@ RUN apt-get update && apt-get install -y \
|
||||
libgl1 \
|
||||
libglib2.0-0 \
|
||||
antiword \
|
||||
supervisor \
|
||||
vim \
|
||||
tar \
|
||||
dpkg \
|
||||
@@ -103,46 +120,42 @@ RUN apt-get update && apt-get install -y \
|
||||
libglu1-mesa \
|
||||
libsm6 \
|
||||
libreoffice \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# # 下载并安装 LibreOffice(区分架构)
|
||||
# RUN mkdir -p /tmp/libreoffice && cd /tmp/libreoffice && \
|
||||
# if [ "$(uname -m)" = "x86_64" ]; then \
|
||||
# wget https://mirrors.tuna.tsinghua.edu.cn/libreoffice/libreoffice/stable/25.2.5/deb/x86_64/LibreOffice_25.2.5_Linux_x86-64_deb.tar.gz && \
|
||||
# tar -xzf LibreOffice_25.2.5_Linux_x86-64_deb.tar.gz && \
|
||||
# cd LibreOffice_25.2.5.2_Linux_x86-64_deb/DEBS && dpkg -i *.deb; \
|
||||
# elif [ "$(uname -m)" = "aarch64" ] || [ "$(uname -m)" = "arm64" ]; then \
|
||||
# wget https://mirrors.aliyun.com/libreoffice/testing/25.8.0/deb/aarch64/LibreOffice_25.8.0.3_Linux_aarch64_deb.tar.gz && \
|
||||
# tar -xzf LibreOffice_25.8.0.3_Linux_aarch64_deb.tar.gz && \
|
||||
# cd LibreOffice_25.8.0.3_Linux_aarch64_deb/DEBS && dpkg -i *.deb; \
|
||||
# else \
|
||||
# echo "Unsupported architecture: $(uname -m)" && exit 1; \
|
||||
# fi && \
|
||||
# cd / && rm -rf /tmp/libreoffice
|
||||
|
||||
# 设置 LibreOffice 环境变量
|
||||
# RUN echo 'export LIBREOFFICE_PATH=/opt/libreoffice25.2/program/soffice' >> /etc/environment;
|
||||
# 安装 grpc_health_probe
|
||||
ARG TARGETARCH
|
||||
RUN GRPC_HEALTH_PROBE_VERSION=v0.4.24 && \
|
||||
# 根据目标架构选择正确的二进制文件
|
||||
case ${TARGETARCH} in \
|
||||
"amd64") ARCH="amd64" ;; \
|
||||
"arm64") ARCH="arm64" ;; \
|
||||
"arm") ARCH="arm" ;; \
|
||||
*) echo "Unsupported architecture: ${TARGETARCH}" && exit 1 ;; \
|
||||
esac && \
|
||||
wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-${ARCH} && \
|
||||
chmod +x /bin/grpc_health_probe
|
||||
|
||||
# 从构建阶段复制已安装的依赖和生成的代码
|
||||
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
|
||||
ENV VIRTUAL_ENV=/app/.venv
|
||||
COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
|
||||
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
|
||||
|
||||
COPY --from=builder /usr/local/bin /usr/local/bin
|
||||
COPY --from=builder /root/.paddlex/official_models /root/.paddlex/official_models
|
||||
COPY --from=builder /app/src /app/src
|
||||
COPY --from=builder /root/.paddleocr /root/.paddleocr
|
||||
|
||||
# 安装 Playwright 浏览器
|
||||
RUN python -m playwright install webkit
|
||||
RUN python -m playwright install-deps webkit
|
||||
|
||||
# 设置 Python 路径
|
||||
ENV PYTHONPATH=/app/src
|
||||
RUN cd /app/src && python -m download_deps
|
||||
# COPY docreader/scripts/download_deps.py download_deps.py
|
||||
# RUN python -m download_deps
|
||||
|
||||
# 创建supervisor配置
|
||||
RUN mkdir -p /etc/supervisor/conf.d
|
||||
COPY services/docreader/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
|
||||
COPY docreader/pyproject.toml docreader/uv.lock ./
|
||||
COPY --from=builder /app/docreader docreader
|
||||
|
||||
# 暴露 gRPC 端口
|
||||
EXPOSE 50051
|
||||
|
||||
# 使用supervisor启动服务
|
||||
CMD ["supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
|
||||
# 直接运行 Python 服务(日志输出到 stdout/stderr)
|
||||
CMD ["uv", "run", "-m", "docreader.main"]
|
||||
@@ -3,6 +3,9 @@ version: "3.8"
|
||||
services:
|
||||
minio:
|
||||
image: minio/minio:latest
|
||||
read_only: true
|
||||
tmpfs:
|
||||
- /tmp
|
||||
container_name: WeKnora-minio
|
||||
ports:
|
||||
- "9000:9000"
|
||||
@@ -26,4 +29,4 @@ volumes:
|
||||
|
||||
networks:
|
||||
WeKnora-network:
|
||||
external: true
|
||||
external: true
|
||||
|
||||
5
docreader/.pylintrc
Normal file
@@ -0,0 +1,5 @@
|
||||
[LOGGING]
|
||||
logging-format-style=fstr
|
||||
|
||||
[MESSAGES CONTROL]
|
||||
; disable=W1203
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/Tencent/WeKnora/services/docreader/src/proto"
|
||||
"github.com/Tencent/WeKnora/docreader/proto"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/credentials/insecure"
|
||||
"google.golang.org/grpc/resolver"
|
||||
@@ -16,10 +16,8 @@ const (
|
||||
maxMessageSize = 50 * 1024 * 1024 // 50MB
|
||||
)
|
||||
|
||||
var (
|
||||
// Logger is the default logger used by the client
|
||||
Logger = log.New(os.Stdout, "[DocReader] ", log.LstdFlags|log.Lmicroseconds)
|
||||
)
|
||||
// Logger is the default logger used by the client
|
||||
var Logger = log.New(os.Stdout, "[DocReader] ", log.LstdFlags|log.Lmicroseconds)
|
||||
|
||||
// ImageInfo 表示一个图片的信息
|
||||
type ImageInfo struct {
|
||||
@@ -7,7 +7,7 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/Tencent/WeKnora/services/docreader/src/proto"
|
||||
"github.com/Tencent/WeKnora/docreader/proto"
|
||||
)
|
||||
|
||||
func init() {
|
||||
356
docreader/main.py
Normal file
@@ -0,0 +1,356 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import traceback
|
||||
import uuid
|
||||
from concurrent import futures
|
||||
from typing import Optional
|
||||
|
||||
import grpc
|
||||
from grpc_health.v1 import health_pb2_grpc
|
||||
from grpc_health.v1.health import HealthServicer
|
||||
|
||||
from docreader.models.read_config import ChunkingConfig
|
||||
from docreader.parser import Parser
|
||||
from docreader.parser.ocr_engine import OCREngine
|
||||
from docreader.proto import docreader_pb2_grpc
|
||||
from docreader.proto.docreader_pb2 import Chunk, Image, ReadResponse
|
||||
from docreader.utils.request import init_logging_request_id, request_id_context
|
||||
|
||||
# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values
|
||||
# cannot be encoded to UTF-8
|
||||
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]")
|
||||
|
||||
|
||||
def to_valid_utf8_text(s: Optional[str]) -> str:
|
||||
"""Return a UTF-8 safe string for protobuf.
|
||||
|
||||
- Replace any surrogate code points with U+FFFD
|
||||
- Re-encode with errors='replace' to ensure valid UTF-8
|
||||
"""
|
||||
if not s:
|
||||
return ""
|
||||
s = _SURROGATE_RE.sub("\ufffd", s)
|
||||
return s.encode("utf-8", errors="replace").decode("utf-8")
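# 行为示意(假设性示例,仅作说明):
#   to_valid_utf8_text("ok" + "\ud800") -> "ok\ufffd"
#   孤立代理项被替换为 U+FFFD,结果可安全写入 protobuf 的字符串字段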
|
||||
|
||||
|
||||
# Ensure no existing handlers
|
||||
for handler in logging.root.handlers[:]:
|
||||
logging.root.removeHandler(handler)
|
||||
|
||||
# Configure logging - use stdout
|
||||
handler = logging.StreamHandler(sys.stdout)
|
||||
logging.root.addHandler(handler)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.DEBUG)
|
||||
logger.info("Initializing server logging")
|
||||
|
||||
# Initialize request ID logging
|
||||
init_logging_request_id()
|
||||
|
||||
# Set max message size to 50MB
|
||||
MAX_MESSAGE_LENGTH = 50 * 1024 * 1024
|
||||
|
||||
|
||||
parser = Parser()
|
||||
|
||||
|
||||
class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.parser = Parser()
|
||||
|
||||
def ReadFromFile(self, request, context):
|
||||
# Get or generate request ID
|
||||
request_id = (
|
||||
request.request_id
|
||||
if hasattr(request, "request_id") and request.request_id
|
||||
else str(uuid.uuid4())
|
||||
)
|
||||
|
||||
# Use request ID context
|
||||
with request_id_context(request_id):
|
||||
try:
|
||||
# Get file type
|
||||
file_type = (
|
||||
request.file_type or os.path.splitext(request.file_name)[1][1:]
|
||||
)
|
||||
logger.info(
|
||||
f"ReadFromFile for file: {request.file_name}, type: {file_type}"
|
||||
)
|
||||
logger.info(f"File content size: {len(request.file_content)} bytes")
|
||||
|
||||
# Create chunking config
|
||||
chunk_size = request.read_config.chunk_size or 512
|
||||
chunk_overlap = request.read_config.chunk_overlap or 50
|
||||
separators = request.read_config.separators or ["\n\n", "\n", "。"]
|
||||
enable_multimodal = request.read_config.enable_multimodal or False
|
||||
|
||||
logger.info(
|
||||
f"Using chunking config: size={chunk_size}, "
|
||||
f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
|
||||
)
|
||||
|
||||
# Get Storage and VLM config from request
|
||||
storage_config = None
|
||||
vlm_config = None
|
||||
|
||||
sc = request.read_config.storage_config
|
||||
# Keep parser-side key name as cos_config for backward compatibility
|
||||
storage_config = {
|
||||
"provider": "minio" if sc.provider == 2 else "cos",
|
||||
"region": sc.region,
|
||||
"bucket_name": sc.bucket_name,
|
||||
"access_key_id": sc.access_key_id,
|
||||
"secret_access_key": sc.secret_access_key,
|
||||
"app_id": sc.app_id,
|
||||
"path_prefix": sc.path_prefix,
|
||||
}
|
||||
logger.info(
|
||||
f"Using Storage config: provider={storage_config.get('provider')}, "
|
||||
f"bucket={storage_config['bucket_name']}"
|
||||
)
|
||||
|
||||
vlm_config = {
|
||||
"model_name": request.read_config.vlm_config.model_name,
|
||||
"base_url": request.read_config.vlm_config.base_url,
|
||||
"api_key": request.read_config.vlm_config.api_key or "",
|
||||
"interface_type": request.read_config.vlm_config.interface_type
|
||||
or "openai",
|
||||
}
|
||||
logger.info(
|
||||
f"Using VLM config: model={vlm_config['model_name']}, "
|
||||
f"base_url={vlm_config['base_url']}, "
|
||||
f"interface_type={vlm_config['interface_type']}"
|
||||
)
|
||||
|
||||
chunking_config = ChunkingConfig(
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
separators=separators,
|
||||
enable_multimodal=enable_multimodal,
|
||||
storage_config=storage_config,
|
||||
vlm_config=vlm_config,
|
||||
)
|
||||
|
||||
# Parse file
|
||||
logger.info("Starting file parsing process")
|
||||
result = self.parser.parse_file(
|
||||
request.file_name, file_type, request.file_content, chunking_config
|
||||
)
|
||||
|
||||
if not result:
|
||||
error_msg = "Failed to parse file"
|
||||
logger.error(error_msg)
|
||||
context.set_code(grpc.StatusCode.INTERNAL)
|
||||
context.set_details(error_msg)
|
||||
return ReadResponse()
|
||||
|
||||
# Convert to protobuf message
|
||||
logger.info(
|
||||
f"Parsed file {request.file_name}, with {len(result.chunks)} chunks"
|
||||
)
|
||||
|
||||
# Build response, including image info
|
||||
response = ReadResponse(
|
||||
chunks=[
|
||||
self._convert_chunk_to_proto(chunk) for chunk in result.chunks
|
||||
]
|
||||
)
|
||||
logger.info(f"Response size: {response.ByteSize()} bytes")
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Error reading file: {str(e)}"
|
||||
logger.error(error_msg)
|
||||
logger.info(f"Detailed traceback: {traceback.format_exc()}")
|
||||
context.set_code(grpc.StatusCode.INTERNAL)
|
||||
context.set_details(str(e))
|
||||
return ReadResponse(error=str(e))
|
||||
|
||||
def ReadFromURL(self, request, context):
|
||||
# Get or generate request ID
|
||||
request_id = (
|
||||
request.request_id
|
||||
if hasattr(request, "request_id") and request.request_id
|
||||
else str(uuid.uuid4())
|
||||
)
|
||||
|
||||
# Use request ID context
|
||||
with request_id_context(request_id):
|
||||
try:
|
||||
logger.info(f"Received ReadFromURL request for URL: {request.url}")
|
||||
|
||||
# Create chunking config
|
||||
chunk_size = request.read_config.chunk_size or 512
|
||||
chunk_overlap = request.read_config.chunk_overlap or 50
|
||||
separators = request.read_config.separators or ["\n\n", "\n", "。"]
|
||||
enable_multimodal = request.read_config.enable_multimodal or False
|
||||
|
||||
logger.info(
|
||||
f"Using chunking config: size={chunk_size}, "
|
||||
f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
|
||||
)
|
||||
|
||||
# Get Storage and VLM config from request
|
||||
storage_config = None
|
||||
vlm_config = None
|
||||
|
||||
sc = request.read_config.storage_config
|
||||
storage_config = {
|
||||
"provider": "minio" if sc.provider == 2 else "cos",
|
||||
"region": sc.region,
|
||||
"bucket_name": sc.bucket_name,
|
||||
"access_key_id": sc.access_key_id,
|
||||
"secret_access_key": sc.secret_access_key,
|
||||
"app_id": sc.app_id,
|
||||
"path_prefix": sc.path_prefix,
|
||||
}
|
||||
logger.info(
|
||||
f"Using Storage config: provider={storage_config.get('provider')}, "
|
||||
f"bucket={storage_config['bucket_name']}"
|
||||
)
|
||||
|
||||
vlm_config = {
|
||||
"model_name": request.read_config.vlm_config.model_name,
|
||||
"base_url": request.read_config.vlm_config.base_url,
|
||||
"api_key": request.read_config.vlm_config.api_key or "",
|
||||
"interface_type": request.read_config.vlm_config.interface_type
|
||||
or "openai",
|
||||
}
|
||||
logger.info(
|
||||
f"Using VLM config: model={vlm_config['model_name']}, "
|
||||
f"base_url={vlm_config['base_url']}, "
|
||||
f"interface_type={vlm_config['interface_type']}"
|
||||
)
|
||||
|
||||
chunking_config = ChunkingConfig(
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
separators=separators,
|
||||
enable_multimodal=enable_multimodal,
|
||||
storage_config=storage_config,
|
||||
vlm_config=vlm_config,
|
||||
)
|
||||
|
||||
# Parse URL
|
||||
logger.info("Starting URL parsing process")
|
||||
result = self.parser.parse_url(
|
||||
request.url, request.title, chunking_config
|
||||
)
|
||||
if not result:
|
||||
error_msg = "Failed to parse URL"
|
||||
logger.error(error_msg)
|
||||
context.set_code(grpc.StatusCode.INTERNAL)
|
||||
context.set_details(error_msg)
|
||||
return ReadResponse(error=error_msg)
|
||||
|
||||
# Convert to protobuf message, including image info
|
||||
logger.info(
|
||||
f"Parsed URL {request.url}, returning {len(result.chunks)} chunks"
|
||||
)
|
||||
|
||||
response = ReadResponse(
|
||||
chunks=[
|
||||
self._convert_chunk_to_proto(chunk) for chunk in result.chunks
|
||||
]
|
||||
)
|
||||
logger.info(f"Response size: {response.ByteSize()} bytes")
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Error reading URL: {str(e)}"
|
||||
logger.error(error_msg)
|
||||
logger.info(f"Detailed traceback: {traceback.format_exc()}")
|
||||
context.set_code(grpc.StatusCode.INTERNAL)
|
||||
context.set_details(str(e))
|
||||
return ReadResponse(error=str(e))
|
||||
|
||||
def _convert_chunk_to_proto(self, chunk):
|
||||
"""Convert internal Chunk object to protobuf Chunk message
|
||||
Ensures all string fields are valid UTF-8 for protobuf (no lone surrogates).
|
||||
"""
|
||||
# Clean helper for strings
|
||||
_c = to_valid_utf8_text
|
||||
|
||||
proto_chunk = Chunk(
|
||||
content=_c(getattr(chunk, "content", None)),
|
||||
seq=getattr(chunk, "seq", 0),
|
||||
start=getattr(chunk, "start", 0),
|
||||
end=getattr(chunk, "end", 0),
|
||||
)
|
||||
|
||||
# If chunk has images attribute and is not empty, add image info
|
||||
if hasattr(chunk, "images") and chunk.images:
|
||||
logger.info(
|
||||
f"Adding {len(chunk.images)} images to chunk {getattr(chunk, 'seq', 0)}"
|
||||
)
|
||||
for img_info in chunk.images:
|
||||
# img_info expected as dict
|
||||
proto_image = Image(
|
||||
url=_c(img_info.get("cos_url", "")),
|
||||
caption=_c(img_info.get("caption", "")),
|
||||
ocr_text=_c(img_info.get("ocr_text", "")),
|
||||
original_url=_c(img_info.get("original_url", "")),
|
||||
start=int(img_info.get("start", 0) or 0),
|
||||
end=int(img_info.get("end", 0) or 0),
|
||||
)
|
||||
proto_chunk.images.append(proto_image)
|
||||
|
||||
return proto_chunk
|
||||
|
||||
|
||||
def init_ocr_engine(ocr_backend: Optional[str] = None, **kwargs):
|
||||
"""Initialize OCR engine"""
|
||||
backend_type = ocr_backend or os.getenv("OCR_BACKEND", "paddle")
|
||||
logger.info(f"Initializing OCR engine with backend: {backend_type}")
|
||||
OCREngine.get_instance(backend_type=backend_type, **kwargs)
|
||||
|
||||
|
||||
def main():
|
||||
init_ocr_engine()
|
||||
|
||||
# Set max number of worker threads
|
||||
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
|
||||
logger.info(f"Starting DocReader service with {max_workers} worker threads")
|
||||
|
||||
# Get port number
|
||||
port = os.environ.get("GRPC_PORT", "50051")
|
||||
|
||||
# Create server
|
||||
server = grpc.server(
|
||||
futures.ThreadPoolExecutor(max_workers=max_workers),
|
||||
options=[
|
||||
("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
|
||||
("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
|
||||
],
|
||||
)
|
||||
|
||||
# Register services
|
||||
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
|
||||
|
||||
# Register health check service
|
||||
health_servicer = HealthServicer()
|
||||
health_pb2_grpc.add_HealthServicer_to_server(health_servicer, server)
|
||||
|
||||
# Set listen address
|
||||
server.add_insecure_port(f"[::]:{port}")
|
||||
|
||||
# Start service
|
||||
server.start()
|
||||
|
||||
logger.info(f"Server started on port {port}")
|
||||
logger.info("Server is ready to accept connections")
|
||||
|
||||
try:
|
||||
# Wait for service termination
|
||||
server.wait_for_termination()
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Received termination signal, shutting down server")
|
||||
server.stop(0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
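Since the server registers the standard gRPC health service alongside `DocReaderServicer`, a quick way to smoke-test a running instance is the stock health-check client. The sketch below is not part of the change set; it assumes the service listens on the default `GRPC_PORT` (50051), and depending on how `HealthServicer` is configured the overall ("") status may need to be set explicitly on the server side.

```python
# Sketch only: health-check client for the DocReader gRPC service above.
import grpc
from grpc_health.v1 import health_pb2, health_pb2_grpc


def check_health(target: str = "localhost:50051") -> str:
    with grpc.insecure_channel(target) as channel:
        stub = health_pb2_grpc.HealthStub(channel)
        # An empty service name asks for the overall serving status.
        resp = stub.Check(health_pb2.HealthCheckRequest(service=""))
        return health_pb2.HealthCheckResponse.ServingStatus.Name(resp.status)


if __name__ == "__main__":
    print(check_health())
```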
0 docreader/models/__init__.py Normal file
87 docreader/models/document.py Normal file
@@ -0,0 +1,87 @@
"""Chunk document schema."""

import json
from typing import Any, Dict, List

from pydantic import BaseModel, Field


class Chunk(BaseModel):
    """Document Chunk including chunk content, chunk metadata."""

    content: str = Field(default="", description="chunk text content")
    seq: int = Field(default=0, description="Chunk sequence number")
    start: int = Field(default=0, description="Chunk start position")
    end: int = Field(description="Chunk end position")
    images: List[Dict[str, Any]] = Field(
        default_factory=list, description="Images in the chunk"
    )

    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="metadata fields",
    )

    def to_dict(self, **kwargs: Any) -> Dict[str, Any]:
        """Convert Chunk to dict."""

        data = self.model_dump()
        data.update(kwargs)
        data["class_name"] = self.__class__.__name__
        return data

    def to_json(self, **kwargs: Any) -> str:
        """Convert Chunk to json."""
        data = self.to_dict(**kwargs)
        return json.dumps(data)

    def __hash__(self):
        """Hash function."""
        return hash((self.content,))

    def __eq__(self, other):
        """Equal function."""
        return self.content == other.content

    @classmethod
    def from_dict(cls, data: Dict[str, Any], **kwargs: Any):  # type: ignore
        """Create Chunk from dict."""
        if isinstance(kwargs, dict):
            data.update(kwargs)

        data.pop("class_name", None)
        return cls(**data)

    @classmethod
    def from_json(cls, data_str: str, **kwargs: Any):  # type: ignore
        """Create Chunk from json."""
        data = json.loads(data_str)
        return cls.from_dict(data, **kwargs)


class Document(BaseModel):
    """Document including document content, document metadata."""

    model_config = {"arbitrary_types_allowed": True}

    content: str = Field(default="", description="document text content")
    images: Dict[str, str] = Field(
        default_factory=dict, description="Images in the document"
    )

    chunks: List[Chunk] = Field(default_factory=list, description="document chunks")
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="metadata fields",
    )

    def set_content(self, content: str) -> None:
        """Set document content."""
        self.content = content

    def get_content(self) -> str:
        """Get document content."""
        return self.content

    def is_valid(self) -> bool:
        return self.content != ""
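To illustrate how the schema above behaves, here is a small sketch (not part of the change set) that round-trips a Chunk through to_json/from_json and builds a Document from it; all field and method names come from the Pydantic models defined above.

```python
# Sketch only: exercises the Chunk/Document models defined above.
from docreader.models.document import Chunk, Document

chunk = Chunk(content="hello world", seq=0, start=0, end=11)
payload = chunk.to_json()            # adds "class_name": "Chunk"
restored = Chunk.from_json(payload)  # "class_name" is stripped on load
assert restored == chunk             # __eq__ compares content only

doc = Document(content="hello world", chunks=[restored])
print(doc.is_valid(), len(doc.chunks))  # True 1
```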
27 docreader/models/read_config.py Normal file
@@ -0,0 +1,27 @@
from dataclasses import dataclass, field


@dataclass
class ChunkingConfig:
    """
    Configuration for text chunking process.
    Controls how documents are split into smaller pieces for processing.
    """

    # Maximum size of each chunk in tokens/chars
    chunk_size: int = 512

    # Number of tokens/chars to overlap between chunks
    chunk_overlap: int = 50

    # Text separators in order of priority
    separators: list = field(default_factory=lambda: ["\n\n", "\n", "。"])

    # Whether to enable multimodal processing (text + images)
    enable_multimodal: bool = False

    # Object storage configuration (preferred field name going forward)
    storage_config: dict[str, str] = field(default_factory=dict)

    # VLM configuration for image captioning
    vlm_config: dict[str, str] = field(default_factory=dict)
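As a quick illustration of how the gRPC handlers above wire request fields into this dataclass, here is a hedged sketch; the dictionary keys mirror the ones built in ReadFromFile/ReadFromURL, while the endpoint, bucket, and model values are placeholders.

```python
# Sketch only: building a ChunkingConfig the way the gRPC handlers above do.
from docreader.models.read_config import ChunkingConfig

config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=50,
    separators=["\n\n", "\n", "。"],
    enable_multimodal=True,
    storage_config={
        "provider": "minio",
        "bucket_name": "weknora-docs",   # hypothetical bucket name
        "path_prefix": "images",
    },
    vlm_config={
        "model_name": "qwen-vl",         # hypothetical model name
        "base_url": "http://localhost:11434/v1",  # placeholder endpoint
        "interface_type": "openai",
    },
)
print(config.chunk_size, config.storage_config["provider"])
```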
@@ -13,22 +13,20 @@ The parsers extract content from documents and can split them into
meaningful chunks for further processing and indexing.
"""

from .base_parser import BaseParser, ParseResult
from .docx_parser import DocxParser
from .csv_parser import CSVParser
from .doc_parser import DocParser
from .pdf_parser import PDFParser
from .markdown_parser import MarkdownParser
from .text_parser import TextParser
from .docx2_parser import Docx2Parser
from .excel_parser import ExcelParser
from .image_parser import ImageParser
from .web_parser import WebParser
from .markdown_parser import MarkdownParser
from .parser import Parser
from .config import ChunkingConfig
from .ocr_engine import OCREngine
from .pdf_parser import PDFParser
from .text_parser import TextParser
from .web_parser import WebParser

# Export public classes and modules
__all__ = [
    "BaseParser",  # Base parser class that all format parsers inherit from
    "DocxParser",  # Parser for .docx files (modern Word documents)
    "Docx2Parser",  # Parser for .docx files (modern Word documents)
    "DocParser",  # Parser for .doc files (legacy Word documents)
    "PDFParser",  # Parser for PDF documents
    "MarkdownParser",  # Parser for Markdown text files
@@ -36,7 +34,6 @@ __all__ = [
    "ImageParser",  # Parser for images with text content
    "WebParser",  # Parser for web pages
    "Parser",  # Main parser factory that selects the appropriate parser
    "ChunkingConfig",  # Configuration for text chunking behavior
    "ParseResult",  # Standard result format returned by all parsers
    "OCREngine",  # OCR engine for extracting text from images
    "CSVParser",  # Parser for CSV files
    "ExcelParser",  # Parser for Excel files
]
@@ -3,11 +3,10 @@ import logging
import os
import time
from dataclasses import dataclass, field
from typing import List, Optional, Union
from typing import Dict, List, Optional, Union

import requests
import ollama

import requests

logger = logging.getLogger(__name__)

@@ -158,11 +157,16 @@ class CaptionChatResp:
Returns:
    The content string from the first choice, or empty string if no choices
"""
if self.choices:
    logger.info("Retrieving content from first choice")
    return self.choices[0].message.content
logger.warning("No choices available in response")
return ""
if (
    not self.choices
    or not self.choices[0]
    or not self.choices[0].message
    or not self.choices[0].message.content
):
    logger.warning("No choices available in response")
    return ""
logger.info("Retrieving content from first choice")
return self.choices[0].message.content


class Caption:
@@ -171,33 +175,43 @@ class Caption:
Uses an external API to process images and return textual descriptions.
"""

def __init__(self, vlm_config=None):
    """Initialize the Caption service with configuration from parameters or environment variables."""
def __init__(self, vlm_config: Optional[Dict[str, str]] = None):
    """
    Initialize the Caption service with configuration
    from parameters or environment variables.
    """
    logger.info("Initializing Caption service")
    self.prompt = """简单凝炼的描述图片的主要内容"""

    # Use provided VLM config if available, otherwise fall back to environment variables
    self.timeout = 30

    # Use provided VLM config if available,
    # otherwise fall back to environment variables
    if vlm_config and vlm_config.get("base_url") and vlm_config.get("model_name"):
        self.completion_url = vlm_config.get("base_url", "") + "/chat/completions"
        self.model = vlm_config.get("model_name", "")
        self.api_key = vlm_config.get("api_key", "")
        self.interface_type = vlm_config.get("interface_type", "openai").lower()
    else:
        if os.getenv("VLM_MODEL_BASE_URL") == "" or os.getenv("VLM_MODEL_NAME") == "":
        base_url = os.getenv("VLM_MODEL_BASE_URL")
        model_name = os.getenv("VLM_MODEL_NAME")
        if not base_url or not model_name:
            logger.error("VLM_MODEL_BASE_URL or VLM_MODEL_NAME is not set")
            return
        self.completion_url = os.getenv("VLM_MODEL_BASE_URL") + "/chat/completions"
        self.model = os.getenv("VLM_MODEL_NAME")
        self.api_key = os.getenv("VLM_MODEL_API_KEY")
        self.completion_url = base_url + "/chat/completions"
        self.model = model_name
        self.api_key = os.getenv("VLM_MODEL_API_KEY", "")
        self.interface_type = os.getenv("VLM_INTERFACE_TYPE", "openai").lower()

    # Validate the interface type
    if self.interface_type not in ["ollama", "openai"]:
        logger.warning(f"Unknown interface type: {self.interface_type}, defaulting to openai")
        logger.warning(
            f"Unknown interface type: {self.interface_type}, defaulting to openai"
        )
        self.interface_type = "openai"

    logger.info(
        f"Service configured with model: {self.model}, endpoint: {self.completion_url}, interface: {self.interface_type}"
        f"Configured with model: {self.model}, "
        f"endpoint: {self.completion_url}, interface: {self.interface_type}"
    )

def _call_caption_api(self, image_data: str) -> Optional[CaptionChatResp]:
@@ -210,8 +224,8 @@ class Caption:
Returns:
    CaptionChatResp object if successful, None otherwise
"""
logger.info(f"Calling Caption API for image captioning")
logger.info(f"Processing image data: {image_data[:50] if len(image_data) > 50 else image_data}")
logger.info("Calling Caption API for image captioning")
logger.info(f"Processing image data: {image_data[:50]}...")

# Choose the call path based on the interface type
if self.interface_type == "ollama":
@@ -226,39 +240,35 @@ class Caption:

client = ollama.Client(
    host=host,
    timeout=self.timeout,
)

try:
    logger.info(f"Calling Ollama API with model: {self.model}")

    # Call the Ollama API, passing the base64-encoded image via the images parameter
    response = client.generate(
        model=self.model,
        prompt="简单凝炼的描述图片的主要内容",
        images=[image_base64],  # image_base64 is the base64-encoded image data
        images=[image_base64],  # image_base64 is the base64-encoded image data
        options={"temperature": 0.1},
        stream=False,
    )

    # Build the response object
    caption_resp = CaptionChatResp(
        id="ollama_response",
        created=int(time.time()),
        model=self.model,
        model=Model(id=self.model),
        object="chat.completion",
        choices=[
            Choice(
                message=Message(
                    role="assistant",
                    content=response.response
                )
            )
        ]
            Choice(message=Message(role="assistant", content=response.response))
        ],
    )

    logger.info("Successfully received response from Ollama API")
    return caption_resp

except Exception as e:
    logger.error(f"Error calling Ollama API: {e}")
    return None
@@ -266,13 +276,16 @@ class Caption:
def _call_openai_api(self, image_base64: str) -> Optional[CaptionChatResp]:
    """Call OpenAI-compatible API for image captioning."""
    logger.info(f"Calling OpenAI-compatible API with model: {self.model}")

    user_msg = UserMessage(
        role="user",
        content=[
            Content(type="text", text=self.prompt),
            Content(
                type="image_url", image_url=ImageUrl(url="data:image/png;base64," + image_base64, detail="auto")
                type="image_url",
                image_url=ImageUrl(
                    url="data:image/png;base64," + image_base64, detail="auto"
                ),
            ),
        ],
    )
@@ -295,23 +308,23 @@ class Caption:
headers["Authorization"] = f"Bearer {self.api_key}"

try:
    logger.info(f"Sending request to OpenAI-compatible API with model: {self.model}")
    logger.info(
        f"Sending request to OpenAI-compatible API with model: {self.model}"
    )
    response = requests.post(
        self.completion_url,
        data=json.dumps(gpt_req, default=lambda o: o.__dict__, indent=4),
        headers=headers,
        timeout=30,
        timeout=self.timeout,
    )
    if response.status_code != 200:
        logger.error(
            f"OpenAI-compatible API returned non-200 status code: {response.status_code}"
            f"OpenAI API returned non-200 status code: {response.status_code}"
        )
        response.raise_for_status()

    logger.info(
        f"Successfully received response from OpenAI-compatible API with status: {response.status_code}"
    )
    logger.info(f"Converting response to CaptionChatResp object")
    logger.info(f"Received from OpenAI with status: {response.status_code}")
    logger.info("Converting response to CaptionChatResp object")
    caption_resp = CaptionChatResp.from_json(response.json())

    if caption_resp.usage:
@@ -322,7 +335,7 @@ class Caption:

    return caption_resp
except requests.exceptions.Timeout:
    logger.error(f"Timeout while calling OpenAI-compatible API after 30 seconds")
    logger.error("Timeout while calling OpenAI-compatible API after 30 seconds")
    return None
except requests.exceptions.RequestException as e:
    logger.error(f"Request error calling OpenAI-compatible API: {e}")
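The constructor above switches between an Ollama-style and an OpenAI-compatible backend based on `interface_type`. A hedged sketch of the two configurations follows; the import path, endpoints, and model names are assumptions, not part of the change set, while the dictionary keys match those read by `Caption.__init__`.

```python
# Sketch only: two ways to configure the Caption service shown above.
from docreader.parser.caption import Caption  # module path is an assumption

# OpenAI-compatible endpoint (requests-based path)
openai_caption = Caption(vlm_config={
    "base_url": "http://localhost:8000/v1",  # placeholder endpoint
    "model_name": "qwen-vl-plus",            # placeholder model
    "api_key": "sk-...",                     # optional
    "interface_type": "openai",
})

# Ollama endpoint (ollama.Client path)
ollama_caption = Caption(vlm_config={
    "base_url": "http://localhost:11434",    # placeholder endpoint
    "model_name": "qwen2.5vl:7b",            # placeholder model
    "interface_type": "ollama",
})
```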
71 docreader/parser/chain_parser.py Normal file
@@ -0,0 +1,71 @@
import logging
from typing import Dict, List, Tuple, Type

from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class FirstParser(BaseParser):
    _parser_cls: Tuple[Type["BaseParser"], ...] = ()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self._parsers: List[BaseParser] = []
        for parser_cls in self._parser_cls:
            parser = parser_cls(*args, **kwargs)
            self._parsers.append(parser)

    def parse_into_text(self, content: bytes) -> Document:
        for p in self._parsers:
            logger.info(f"FirstParser: using parser {p.__class__.__name__}")
            document = p.parse_into_text(content)
            if document.is_valid():
                logger.info(f"FirstParser: parser {p.__class__.__name__} succeeded")
                return document
        return Document()

    @classmethod
    def create(cls, *parser_classes: Type["BaseParser"]) -> Type["FirstParser"]:
        names = "_".join([p.__name__ for p in parser_classes])
        return type(f"FirstParser_{names}", (cls,), {"_parser_cls": parser_classes})


class PipelineParser(BaseParser):
    _parser_cls: Tuple[Type["BaseParser"], ...] = ()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self._parsers: List[BaseParser] = []
        for parser_cls in self._parser_cls:
            parser = parser_cls(*args, **kwargs)
            self._parsers.append(parser)

    def parse_into_text(self, content: bytes) -> Document:
        images: Dict[str, str] = {}
        document = Document()
        for p in self._parsers:
            logger.info(f"PipelineParser: using parser {p.__class__.__name__}")
            document = p.parse_into_text(content)
            content = endecode.encode_bytes(document.content)
            images.update(document.images)
        document.images.update(images)
        return document

    @classmethod
    def create(cls, *parser_classes: Type["BaseParser"]) -> Type["PipelineParser"]:
        names = "_".join([p.__name__ for p in parser_classes])
        return type(f"PipelineParser_{names}", (cls,), {"_parser_cls": parser_classes})


if __name__ == "__main__":
    from docreader.parser.markdown_parser import MarkdownParser

    cls = FirstParser.create(MarkdownParser)
    parser = cls()
    print(parser.parse_into_text(b"aaa"))
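FirstParser returns the first valid result, while PipelineParser feeds each parser's output into the next and merges their images. The __main__ block above only demonstrates the former, so here is a hedged sketch of the latter; it reuses MarkdownTableFormatter and MarkdownImageBase64, which are defined in markdown_parser.py later in this change and are exactly the members of the built-in MarkdownParser pipeline.

```python
# Sketch only: composing a pipeline with PipelineParser.create.
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownImageBase64, MarkdownTableFormatter

# Each stage re-parses the previous stage's text; images are merged across stages.
MyMarkdownPipeline = PipelineParser.create(MarkdownTableFormatter, MarkdownImageBase64)
parser = MyMarkdownPipeline()
document = parser.parse_into_text(b"| a | b |\n| - | - |\n| 1 | 2 |\n")
print(document.content)
```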
50 docreader/parser/csv_parser.py Normal file
@@ -0,0 +1,50 @@
import logging
from io import BytesIO
from typing import List

import pandas as pd

from docreader.models.document import Chunk, Document
from docreader.parser.base_parser import BaseParser

logger = logging.getLogger(__name__)


class CSVParser(BaseParser):
    def parse_into_text(self, content: bytes) -> Document:
        chunks: List[Chunk] = []
        text: List[str] = []
        start, end = 0, 0

        df = pd.read_csv(BytesIO(content), on_bad_lines="skip")

        for i, (idx, row) in enumerate(df.iterrows()):
            content_row = (
                ",".join(
                    f"{col.strip()}: {str(row[col]).strip()}" for col in df.columns
                )
                + "\n"
            )
            end += len(content_row)
            text.append(content_row)
            chunks.append(Chunk(content=content_row, seq=i, start=start, end=end))
            start = end

        return Document(
            content="".join(text),
            chunks=chunks,
        )


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    your_file = "/path/to/your/file.csv"
    parser = CSVParser()
    with open(your_file, "rb") as f:
        content = f.read()
    document = parser.parse_into_text(content)
    logger.error(document.content)

    for chunk in document.chunks:
        logger.error(chunk.content)
247
docreader/parser/doc_parser.py
Normal file
@@ -0,0 +1,247 @@
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
from typing import List, Optional
|
||||
|
||||
import textract
|
||||
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.docx2_parser import Docx2Parser
|
||||
from docreader.utils.tempfile import TempDirContext, TempFileContext
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocParser(Docx2Parser):
|
||||
"""DOC document parser"""
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
logger.info(f"Parsing DOC document, content size: {len(content)} bytes")
|
||||
|
||||
handle_chain = [
|
||||
# 1. Try to convert to docx format to extract images
|
||||
self._parse_with_docx,
|
||||
# 2. If image extraction is not needed or conversion failed,
|
||||
# try using antiword to extract text
|
||||
self._parse_with_antiword,
|
||||
# 3. If antiword extraction fails, use textract
|
||||
self._parse_with_textract,
|
||||
]
|
||||
|
||||
# Save byte content as a temporary file
|
||||
with TempFileContext(content, ".doc") as temp_file_path:
|
||||
for handle in handle_chain:
|
||||
try:
|
||||
document = handle(temp_file_path)
|
||||
if document:
|
||||
return document
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse DOC with {handle.__name__} {e}")
|
||||
|
||||
return Document(content="")
|
||||
|
||||
def _parse_with_docx(self, temp_file_path: str) -> Document:
|
||||
logger.info("Multimodal enabled, attempting to extract images from DOC")
|
||||
|
||||
docx_content = self._try_convert_doc_to_docx(temp_file_path)
|
||||
if not docx_content:
|
||||
raise RuntimeError("Failed to convert DOC to DOCX")
|
||||
|
||||
logger.info("Successfully converted DOC to DOCX, using DocxParser")
|
||||
# Use existing DocxParser to parse the converted docx
|
||||
document = super(Docx2Parser, self).parse_into_text(docx_content)
|
||||
logger.info(f"Extracted {len(document.content)} characters using DocxParser")
|
||||
return document
|
||||
|
||||
def _parse_with_antiword(self, temp_file_path: str) -> Document:
|
||||
logger.info("Attempting to parse DOC file with antiword")
|
||||
|
||||
# Check if antiword is installed
|
||||
antiword_path = self._try_find_antiword()
|
||||
if not antiword_path:
|
||||
raise RuntimeError("antiword not found in PATH")
|
||||
|
||||
# Use antiword to extract text directly
|
||||
process = subprocess.Popen(
|
||||
[antiword_path, temp_file_path],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
stdout, stderr = process.communicate()
|
||||
if process.returncode != 0:
|
||||
raise RuntimeError(
|
||||
f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
|
||||
)
|
||||
text = stdout.decode("utf-8", errors="ignore")
|
||||
logger.info(f"Successfully extracted {len(text)} characters using antiword")
|
||||
return Document(content=text)
|
||||
|
||||
def _parse_with_textract(self, temp_file_path: str) -> Document:
|
||||
logger.info(f"Parsing DOC file with textract: {temp_file_path}")
|
||||
text = textract.process(temp_file_path, method="antiword").decode("utf-8")
|
||||
logger.info(f"Successfully extracted {len(text)} bytes of DOC using textract")
|
||||
return Document(content=str(text))
|
||||
|
||||
def _try_convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
|
||||
"""Convert DOC file to DOCX format
|
||||
|
||||
Uses LibreOffice/OpenOffice for conversion
|
||||
|
||||
Args:
|
||||
doc_path: DOC file path
|
||||
|
||||
Returns:
|
||||
Byte stream of DOCX file content, or None if conversion fails
|
||||
"""
|
||||
logger.info(f"Converting DOC to DOCX: {doc_path}")
|
||||
|
||||
# Check if LibreOffice or OpenOffice is installed
|
||||
soffice_path = self._try_find_soffice()
|
||||
if not soffice_path:
|
||||
return None
|
||||
|
||||
# Execute conversion command
|
||||
logger.info(f"Using {soffice_path} to convert DOC to DOCX")
|
||||
|
||||
# Create a temporary directory to store the converted file
|
||||
with TempDirContext() as temp_dir:
|
||||
cmd = [
|
||||
soffice_path,
|
||||
"--headless",
|
||||
"--convert-to",
|
||||
"docx",
|
||||
"--outdir",
|
||||
temp_dir,
|
||||
doc_path,
|
||||
]
|
||||
logger.info(f"Running command: {' '.join(cmd)}")
|
||||
process = subprocess.Popen(
|
||||
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
)
|
||||
stdout, stderr = process.communicate()
|
||||
|
||||
if process.returncode != 0:
|
||||
logger.warning(
|
||||
f"Error converting DOC to DOCX: {stderr.decode('utf-8')}"
|
||||
)
|
||||
return None
|
||||
|
||||
# Find the converted file
|
||||
docx_file = [
|
||||
file for file in os.listdir(temp_dir) if file.endswith(".docx")
|
||||
]
|
||||
logger.info(f"Found {len(docx_file)} DOCX file(s) in temporary directory")
|
||||
for file in docx_file:
|
||||
converted_file = os.path.join(temp_dir, file)
|
||||
logger.info(f"Found converted file: {converted_file}")
|
||||
|
||||
# Read the converted file content
|
||||
with open(converted_file, "rb") as f:
|
||||
docx_content = f.read()
|
||||
logger.info(
|
||||
f"Successfully read DOCX file, size: {len(docx_content)}"
|
||||
)
|
||||
return docx_content
|
||||
return None
|
||||
|
||||
def _try_find_executable_path(
|
||||
self,
|
||||
executable_name: str,
|
||||
possible_path: List[str] = [],
|
||||
environment_variable: List[str] = [],
|
||||
) -> Optional[str]:
|
||||
"""Find executable path
|
||||
Args:
|
||||
executable_name: Executable name
|
||||
possible_path: List of possible paths
|
||||
environment_variable: List of environment variables to check
|
||||
Returns:
|
||||
Executable path, or None if not found
|
||||
"""
|
||||
# Common executable paths
|
||||
paths: List[str] = []
|
||||
paths.extend(possible_path)
|
||||
paths.extend(os.environ.get(env_var, "") for env_var in environment_variable)
|
||||
paths = list(set(paths))
|
||||
|
||||
# Check if path is set in environment variable
|
||||
for path in paths:
|
||||
if os.path.exists(path):
|
||||
logger.info(f"Found {executable_name} at {path}")
|
||||
return path
|
||||
|
||||
# Try to find in PATH
|
||||
result = subprocess.run(
|
||||
["which", executable_name], capture_output=True, text=True
|
||||
)
|
||||
if result.returncode == 0 and result.stdout.strip():
|
||||
path = result.stdout.strip()
|
||||
logger.info(f"Found {executable_name} at {path}")
|
||||
return path
|
||||
|
||||
logger.warning(f"Failed to find {executable_name}")
|
||||
return None
|
||||
|
||||
def _try_find_soffice(self) -> Optional[str]:
|
||||
"""Find LibreOffice/OpenOffice executable path
|
||||
|
||||
Returns:
|
||||
Executable path, or None if not found
|
||||
"""
|
||||
# Common LibreOffice/OpenOffice executable paths
|
||||
possible_paths = [
|
||||
# Linux
|
||||
"/usr/bin/soffice",
|
||||
"/usr/lib/libreoffice/program/soffice",
|
||||
"/opt/libreoffice25.2/program/soffice",
|
||||
# macOS
|
||||
"/Applications/LibreOffice.app/Contents/MacOS/soffice",
|
||||
# Windows
|
||||
"C:\\Program Files\\LibreOffice\\program\\soffice.exe",
|
||||
"C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe",
|
||||
]
|
||||
return self._try_find_executable_path(
|
||||
executable_name="soffice",
|
||||
possible_path=possible_paths,
|
||||
environment_variable=["LIBREOFFICE_PATH"],
|
||||
)
|
||||
|
||||
def _try_find_antiword(self) -> Optional[str]:
|
||||
"""Find antiword executable path
|
||||
|
||||
Returns:
|
||||
Executable path, or None if not found
|
||||
"""
|
||||
# Common antiword executable paths
|
||||
possible_paths = [
|
||||
# Linux/macOS
|
||||
"/usr/bin/antiword",
|
||||
"/usr/local/bin/antiword",
|
||||
# Windows
|
||||
"C:\\Program Files\\Antiword\\antiword.exe",
|
||||
"C:\\Program Files (x86)\\Antiword\\antiword.exe",
|
||||
]
|
||||
return self._try_find_executable_path(
|
||||
executable_name="antiword",
|
||||
possible_path=possible_paths,
|
||||
environment_variable=["ANTIWORD_PATH"],
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
file_name = "/path/to/your/test.doc"
|
||||
logger.info(f"Processing file: {file_name}")
|
||||
doc_parser = DocParser(
|
||||
file_name=file_name,
|
||||
enable_multimodal=True,
|
||||
chunk_size=512,
|
||||
chunk_overlap=60,
|
||||
)
|
||||
with open(file_name, "rb") as f:
|
||||
content = f.read()
|
||||
|
||||
document = doc_parser.parse_into_text(content)
|
||||
logger.info(f"Processing complete, extracted text length: {len(document.content)}")
|
||||
logger.info(f"Sample text: {document.content[:200]}...")
|
||||
28 docreader/parser/docx2_parser.py Normal file
@@ -0,0 +1,28 @@
import logging

from docreader.parser.chain_parser import FirstParser
from docreader.parser.docx_parser import DocxParser
from docreader.parser.markitdown_parser import MarkitdownParser

logger = logging.getLogger(__name__)


class Docx2Parser(FirstParser):
    _parser_cls = (MarkitdownParser, DocxParser)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    your_file = "/path/to/your/file.docx"
    parser = Docx2Parser(separators=[".", "?", "!", "。", "?", "!"])
    with open(your_file, "rb") as f:
        content = f.read()

    document = parser.parse(content)
    for cc in document.chunks:
        logger.info(f"chunk: {cc}")

    # document = parser.parse_into_text(content)
    # logger.info(f"docx content: {document.content}")
    # logger.info(f"find images {document.images.keys()}")
@@ -1,37 +1,36 @@
|
||||
import logging
|
||||
import tempfile
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from io import BytesIO
|
||||
from typing import Optional, Dict, Any, Tuple, List, Union
|
||||
from dataclasses import dataclass, field
|
||||
from PIL import Image
|
||||
from docx import Document
|
||||
from docx.image.exceptions import (
|
||||
UnrecognizedImageError,
|
||||
UnexpectedEndOfFileError,
|
||||
InvalidImageStreamError,
|
||||
)
|
||||
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
|
||||
import re
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
from dataclasses import dataclass, field
|
||||
from io import BytesIO
|
||||
from multiprocessing import Manager
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from .base_parser import BaseParser
|
||||
from docx import Document
|
||||
from docx.image.exceptions import (
|
||||
InvalidImageStreamError,
|
||||
UnexpectedEndOfFileError,
|
||||
UnrecognizedImageError,
|
||||
)
|
||||
from PIL import Image
|
||||
|
||||
from docreader.models.document import Document as DocumentModel
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
# Add thread local storage to track the processing status of each thread
|
||||
thread_local = threading.local()
|
||||
|
||||
|
||||
class ImageData:
|
||||
"""Represents a processed image of document content"""
|
||||
|
||||
local_path: str = ""
|
||||
object: Image.Image = None
|
||||
object: Optional[Image.Image] = None
|
||||
url: str = ""
|
||||
|
||||
|
||||
@@ -40,7 +39,9 @@ class LineData:
|
||||
"""Represents a processed line of document content with associated images"""
|
||||
|
||||
text: str = "" # Extracted text content
|
||||
images: List[ImageData] = field(default_factory=list) # List of images or image paths
|
||||
images: List[ImageData] = field(
|
||||
default_factory=list
|
||||
) # List of images or image paths
|
||||
extra_info: str = "" # Placeholder for additional info (currently unused)
|
||||
page_num: int = 0 # Page number
|
||||
content_sequence: List[Tuple[str, Any]] = field(
|
||||
@@ -53,18 +54,8 @@ class DocxParser(BaseParser):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_name: str = "",
|
||||
file_type: str = None,
|
||||
enable_multimodal: bool = True,
|
||||
chunk_size: int = 1000,
|
||||
chunk_overlap: int = 200,
|
||||
separators: list = ["\n\n", "\n", "。"],
|
||||
ocr_backend: str = "paddle",
|
||||
ocr_config: dict = None,
|
||||
max_image_size: int = 1920,
|
||||
max_concurrent_tasks: int = 5,
|
||||
max_pages: int = 100, # Maximum number of pages to process, default to 50 pages
|
||||
chunking_config=None,
|
||||
max_pages: int = 100, # Maximum number of pages to process
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize DOCX document parser
|
||||
|
||||
@@ -79,37 +70,16 @@ class DocxParser(BaseParser):
|
||||
ocr_config: OCR engine configuration
|
||||
max_image_size: Maximum image size limit
|
||||
max_concurrent_tasks: Maximum number of concurrent tasks
|
||||
max_pages: Maximum number of pages to process, if more than this, only process the first max_pages pages
|
||||
max_pages: Maximum number of pages to process
|
||||
"""
|
||||
super().__init__(
|
||||
file_name=file_name,
|
||||
file_type=file_type,
|
||||
enable_multimodal=enable_multimodal,
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
separators=separators,
|
||||
ocr_backend=ocr_backend,
|
||||
ocr_config=ocr_config,
|
||||
max_image_size=max_image_size,
|
||||
max_concurrent_tasks=max_concurrent_tasks,
|
||||
chunking_config=chunking_config,
|
||||
)
|
||||
super().__init__(**kwargs)
|
||||
self.max_pages = max_pages
|
||||
logger.info(f"DocxParser initialized with max_pages={max_pages}")
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
|
||||
"""Parse DOCX document, extract text content and image Markdown links
|
||||
|
||||
Args:
|
||||
content: DOCX document content
|
||||
|
||||
Returns:
|
||||
Tuple of (parsed_text, image_map) where image_map maps image URLs to Image objects
|
||||
All LineData objects are used internally but not returned directly through this interface
|
||||
"""
|
||||
def parse_into_text(self, content: bytes) -> DocumentModel:
|
||||
"""Parse DOCX document, extract text content and image Markdown links"""
|
||||
logger.info(f"Parsing DOCX document, content size: {len(content)} bytes")
|
||||
logger.info(f"Max pages limit set to: {self.max_pages}")
|
||||
logger.info("Converting DOCX content to sections and tables")
|
||||
|
||||
start_time = time.time()
|
||||
# Use concurrent processing to handle the document
|
||||
@@ -123,7 +93,7 @@ class DocxParser(BaseParser):
|
||||
docx_processor = Docx(
|
||||
max_image_size=self.max_image_size,
|
||||
enable_multimodal=self.enable_multimodal,
|
||||
upload_file=self.upload_file,
|
||||
upload_file=self.storage.upload_file,
|
||||
)
|
||||
all_lines, tables = docx_processor(
|
||||
binary=content,
|
||||
@@ -140,7 +110,7 @@ class DocxParser(BaseParser):
|
||||
section_start_time = time.time()
|
||||
|
||||
text_parts = []
|
||||
image_parts = {}
|
||||
image_parts: Dict[str, str] = {}
|
||||
|
||||
for sec_idx, line in enumerate(all_lines):
|
||||
try:
|
||||
@@ -148,16 +118,19 @@ class DocxParser(BaseParser):
|
||||
text_parts.append(line.text)
|
||||
if sec_idx < 3 or sec_idx % 50 == 0:
|
||||
logger.info(
|
||||
f"Added section {sec_idx+1} text: {line.text[:50]}..."
|
||||
f"Added section {sec_idx + 1} text: {line.text[:50]}..."
|
||||
if len(line.text) > 50
|
||||
else f"Added section {sec_idx+1} text: {line.text}"
|
||||
else f"Added section {sec_idx + 1} text: {line.text}"
|
||||
)
|
||||
if line.images:
|
||||
for image_data in line.images:
|
||||
if image_data.url:
|
||||
image_parts[image_data.url] = image_data.object
|
||||
if image_data.url and image_data.object:
|
||||
image_parts[image_data.url] = endecode.decode_image(
|
||||
image_data.object
|
||||
)
|
||||
image_data.object.close()
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing section {sec_idx+1}: {str(e)}")
|
||||
logger.error(f"Error processing section {sec_idx + 1}: {str(e)}")
|
||||
logger.error(f"Detailed stack trace: {traceback.format_exc()}")
|
||||
continue
|
||||
|
||||
@@ -176,17 +149,17 @@ class DocxParser(BaseParser):
|
||||
|
||||
total_processing_time = time.time() - start_time
|
||||
logger.info(
|
||||
f"Parsing complete in {total_processing_time:.2f}s, generated {len(text)} characters of text "
|
||||
f"Parsing complete in {total_processing_time:.2f}s, "
|
||||
f"generated {len(text)} characters of text"
|
||||
)
|
||||
|
||||
return text, image_parts
|
||||
return DocumentModel(content=text, images=image_parts)
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing DOCX document: {str(e)}")
|
||||
logger.error(f"Detailed stack trace: {traceback.format_exc()}")
|
||||
fallback_text = self._parse_using_simple_method(content)
|
||||
return fallback_text, {}
|
||||
return self._parse_using_simple_method(content)
|
||||
|
||||
def _parse_using_simple_method(self, content: bytes) -> str:
|
||||
def _parse_using_simple_method(self, content: bytes) -> DocumentModel:
|
||||
"""Parse document using a simplified method, as a fallback
|
||||
|
||||
Args:
|
||||
@@ -201,7 +174,8 @@ class DocxParser(BaseParser):
|
||||
doc = Document(BytesIO(content))
|
||||
logger.info(
|
||||
f"Successfully loaded document in simplified method, "
|
||||
f"contains {len(doc.paragraphs)} paragraphs and {len(doc.tables)} tables"
|
||||
f"contains {len(doc.paragraphs)} paragraphs "
|
||||
f"and {len(doc.tables)} tables"
|
||||
)
|
||||
text_parts = []
|
||||
|
||||
@@ -211,7 +185,7 @@ class DocxParser(BaseParser):
|
||||
para_with_text = 0
|
||||
for i, para in enumerate(doc.paragraphs):
|
||||
if i % 100 == 0:
|
||||
logger.info(f"Processing paragraph {i+1}/{para_count}")
|
||||
logger.info(f"Processing paragraph {i + 1}/{para_count}")
|
||||
if para.text.strip():
|
||||
text_parts.append(para.text.strip())
|
||||
para_with_text += 1
|
||||
@@ -225,7 +199,7 @@ class DocxParser(BaseParser):
|
||||
rows_processed = 0
|
||||
for i, table in enumerate(doc.tables):
|
||||
if i % 10 == 0:
|
||||
logger.info(f"Processing table {i+1}/{table_count}")
|
||||
logger.info(f"Processing table {i + 1}/{table_count}")
|
||||
|
||||
table_has_content = False
|
||||
for row in table.rows:
|
||||
@@ -256,25 +230,24 @@ class DocxParser(BaseParser):
|
||||
# If the result is still empty, return an error message
|
||||
if not result_text:
|
||||
logger.warning("No text extracted using simplified method")
|
||||
return "", {}
|
||||
return DocumentModel()
|
||||
|
||||
return result_text, {}
|
||||
return DocumentModel(content=result_text)
|
||||
except Exception as backup_error:
|
||||
processing_time = time.time() - start_time
|
||||
logger.error(
|
||||
f"Simplified parsing failed after {processing_time:.2f}s: {str(backup_error)}"
|
||||
f"Simplified parsing failed {processing_time:.2f}s: {backup_error}"
|
||||
)
|
||||
logger.error(f"Detailed traceback: {traceback.format_exc()}")
|
||||
return "", {}
|
||||
return DocumentModel()
|
||||
|
||||
|
||||
class Docx:
|
||||
def __init__(self, max_image_size=1920, enable_multimodal=False, upload_file=None):
|
||||
logger.info("Initializing DOCX processor")
|
||||
self.max_image_size = max_image_size # Maximum image size limit
|
||||
self.picture_cache = (
|
||||
{}
|
||||
) # Image cache to avoid processing the same image repeatedly
|
||||
# Image cache to avoid processing the same image repeatedly
|
||||
self.picture_cache = {}
|
||||
self.enable_multimodal = enable_multimodal
|
||||
self.upload_file = upload_file
|
||||
|
||||
@@ -454,7 +427,6 @@ class Docx:
|
||||
|
||||
return page_to_paragraphs
|
||||
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
binary: Optional[bytes] = None,
|
||||
@@ -611,7 +583,6 @@ class Docx:
|
||||
|
||||
return pages_to_process
|
||||
|
||||
|
||||
def _process_document(
|
||||
self,
|
||||
binary,
|
||||
@@ -806,7 +777,9 @@ class Docx:
|
||||
# Collect temporary image paths for later cleanup
|
||||
for line in page_lines:
|
||||
for image_data in line.images:
|
||||
if image_data.local_path and image_data.local_path.startswith("/tmp/docx_img_"):
|
||||
if image_data.local_path and image_data.local_path.startswith(
|
||||
"/tmp/docx_img_"
|
||||
):
|
||||
temp_img_paths.add(image_data.local_path)
|
||||
|
||||
results.extend(page_lines)
|
||||
@@ -876,7 +849,11 @@ class Docx:
|
||||
|
||||
# Process all image data objects
|
||||
for image_data in image_paths:
|
||||
if image_data.local_path and os.path.exists(image_data.local_path) and image_data.local_path not in image_url_map:
|
||||
if (
|
||||
image_data.local_path
|
||||
and os.path.exists(image_data.local_path)
|
||||
and image_data.local_path not in image_url_map
|
||||
):
|
||||
try:
|
||||
# Upload the image if it doesn't have a URL yet
|
||||
if not image_data.url:
|
||||
@@ -886,12 +863,16 @@ class Docx:
|
||||
image_data.url = image_url
|
||||
# Add image URL as Markdown format
|
||||
markdown_image = f""
|
||||
image_url_map[image_data.local_path] = markdown_image
|
||||
image_url_map[image_data.local_path] = (
|
||||
markdown_image
|
||||
)
|
||||
logger.info(
|
||||
f"Added image URL for {image_data.local_path}: {image_url}"
|
||||
)
|
||||
else:
|
||||
logger.warning(f"Failed to upload image: {image_data.local_path}")
|
||||
logger.warning(
|
||||
f"Failed to upload image: {image_data.local_path}"
|
||||
)
|
||||
else:
|
||||
# Already has a URL, use it
|
||||
markdown_image = f""
|
||||
@@ -925,12 +906,19 @@ class Docx:
|
||||
# For ImageData objects, use the URL
|
||||
if isinstance(content, str) and content in image_url_map:
|
||||
combined_parts.append(image_url_map[content])
|
||||
elif hasattr(content, 'local_path') and content.local_path in image_url_map:
|
||||
elif (
|
||||
hasattr(content, "local_path")
|
||||
and content.local_path in image_url_map
|
||||
):
|
||||
combined_parts.append(image_url_map[content.local_path])
|
||||
|
||||
# Create the final text with proper ordering
|
||||
final_text = "\n\n".join(part for part in combined_parts if part)
|
||||
processed_lines.append(LineData(text=final_text, page_num=page_num, images=line_data.images))
|
||||
processed_lines.append(
|
||||
LineData(
|
||||
text=final_text, page_num=page_num, images=line_data.images
|
||||
)
|
||||
)
|
||||
else:
|
||||
processed_lines = lines
|
||||
|
||||
@@ -1003,11 +991,11 @@ class Docx:
|
||||
logger.info(f"Processing {table_count} tables")
|
||||
for tb_idx, tb in enumerate(self.doc.tables):
|
||||
if tb_idx % 10 == 0: # Log only every 10 tables to reduce log volume
|
||||
logger.info(f"Processing table {tb_idx+1}/{table_count}")
|
||||
logger.info(f"Processing table {tb_idx + 1}/{table_count}")
|
||||
|
||||
# Optimize: Check if table is empty
|
||||
if len(tb.rows) == 0 or all(len(r.cells) == 0 for r in tb.rows):
|
||||
logger.info(f"Skipping empty table {tb_idx+1}")
|
||||
logger.info(f"Skipping empty table {tb_idx + 1}")
|
||||
continue
|
||||
|
||||
table_html = self._convert_table_to_html(tb)
|
||||
@@ -1111,8 +1099,8 @@ def _save_image_to_temp(logger, image, page_num, img_idx):
|
||||
if not image:
|
||||
return None
|
||||
|
||||
import tempfile
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
try:
|
||||
# Create a temporary file
|
||||
@@ -1187,8 +1175,15 @@ def process_page_multiprocess(
|
||||
return []
|
||||
|
||||
# Extract page content
|
||||
combined_text, image_objects, content_sequence = _extract_page_content_in_process(
|
||||
process_logger, doc, page_num, paragraphs, enable_multimodal, max_image_size
|
||||
combined_text, image_objects, content_sequence = (
|
||||
_extract_page_content_in_process(
|
||||
process_logger,
|
||||
doc,
|
||||
page_num,
|
||||
paragraphs,
|
||||
enable_multimodal,
|
||||
max_image_size,
|
||||
)
|
||||
)
|
||||
|
||||
# Process content sequence to maintain order between processes
|
||||
@@ -1199,7 +1194,9 @@ def process_page_multiprocess(
|
||||
if enable_multimodal:
|
||||
# First pass: save all images to temporary files
|
||||
for i, image_object in enumerate(image_objects):
|
||||
img_path = _save_image_to_temp(process_logger, image_object, page_num, i)
|
||||
img_path = _save_image_to_temp(
|
||||
process_logger, image_object, page_num, i
|
||||
)
|
||||
if img_path:
|
||||
# Create ImageData object
|
||||
image_data = ImageData()
|
||||
54 docreader/parser/excel_parser.py Normal file
@@ -0,0 +1,54 @@
import logging
from io import BytesIO
from typing import List

import pandas as pd

from docreader.models.document import Chunk, Document
from docreader.parser.base_parser import BaseParser

logger = logging.getLogger(__name__)


class ExcelParser(BaseParser):
    def parse_into_text(self, content: bytes) -> Document:
        chunks: List[Chunk] = []
        text: List[str] = []
        start, end = 0, 0

        excel_file = pd.ExcelFile(BytesIO(content))
        for excel_sheet_name in excel_file.sheet_names:
            df = excel_file.parse(sheet_name=excel_sheet_name)
            df.dropna(how="all", inplace=True)

            for _, row in df.iterrows():
                page_content = []
                for k, v in row.items():
                    if pd.notna(v):
                        page_content.append(f"{k}: {v}")
                if not page_content:
                    continue
                content_row = ",".join(page_content) + "\n"
                end += len(content_row)
                text.append(content_row)
                chunks.append(
                    Chunk(content=content_row, seq=len(chunks), start=start, end=end)
                )
                start = end

        return Document(content="".join(text), chunks=chunks)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    your_file = "/path/to/your/file.xlsx"
    parser = ExcelParser()
    with open(your_file, "rb") as f:
        content = f.read()
    document = parser.parse_into_text(content)
    logger.error(document.content)

    for chunk in document.chunks:
        logger.error(chunk.content)
        break
44 docreader/parser/image_parser.py Normal file
@@ -0,0 +1,44 @@
import base64
import logging
import os

from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser

# Set up logger for this module
logger = logging.getLogger(__name__)


class ImageParser(BaseParser):
    """
    Parser for image files with OCR capability.
    Extracts text from images and generates captions.

    This parser handles image processing by:
    1. Uploading the image to storage
    2. Generating a descriptive caption
    3. Performing OCR to extract text content
    4. Returning a combined result with both text and image reference
    """

    def parse_into_text(self, content: bytes) -> Document:
        """
        Parse image content into markdown text
        :param content: bytes content of the image
        :return: Document object
        """
        logger.info(f"Parsing image content, size: {len(content)} bytes")

        # Get file extension
        ext = os.path.splitext(self.file_name)[1].lower()

        # Upload image to storage
        image_url = self.storage.upload_bytes(content, file_ext=ext)
        logger.info(f"Successfully uploaded image, URL: {image_url[:50]}...")

        # Generate markdown text
        text = f""
        images = {image_url: base64.b64encode(content).decode()}

        # Create image object and add to map
        return Document(content=text, images=images)
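Unlike the other parsers in this change, ImageParser ships without a __main__ demo, so a minimal usage sketch follows. It assumes a storage backend is configured so `upload_bytes` can succeed; the file path is hypothetical.

```python
# Sketch only: running ImageParser on a local image file.
import logging

from docreader.parser.image_parser import ImageParser

logging.basicConfig(level=logging.INFO)

your_file = "/path/to/your/image.png"      # hypothetical path
parser = ImageParser(file_name=your_file)  # file_name supplies the extension
with open(your_file, "rb") as f:
    document = parser.parse_into_text(f.read())

print(document.content)        # markdown text referencing the uploaded image
print(list(document.images))   # uploaded image URL(s)
```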
228
docreader/parser/markdown_parser.py
Normal file
@@ -0,0 +1,228 @@
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
from typing import Dict, List, Match, Optional, Tuple
|
||||
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.parser.chain_parser import PipelineParser
|
||||
from docreader.utils import endecode
|
||||
|
||||
# Get logger object
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownTableUtil:
|
||||
def __init__(self):
|
||||
self.align_pattern = re.compile(
|
||||
r"^([\t ]*)\|[\t ]*[:-]+(?:[\t ]*\|[\t ]*[:-]+)*[\t ]*\|[\t ]*$",
|
||||
re.MULTILINE,
|
||||
)
|
||||
self.line_pattern = re.compile(
|
||||
r"^([\t ]*)\|[\t ]*[^|\r\n]*(?:[\t ]*\|[^|\r\n]*)*\|[\t ]*$",
|
||||
re.MULTILINE,
|
||||
)
|
||||
|
||||
def format_table(self, content: str) -> str:
|
||||
def process_align(match: Match[str]) -> str:
|
||||
columns = [col.strip() for col in match.group(0).split("|") if col.strip()]
|
||||
|
||||
processed = []
|
||||
for col in columns:
|
||||
left_colon = ":" if col.startswith(":") else ""
|
||||
right_colon = ":" if col.endswith(":") else ""
|
||||
processed.append(left_colon + "---" + right_colon)
|
||||
|
||||
prefix = match.group(1)
|
||||
return prefix + "| " + " | ".join(processed) + " |"
|
||||
|
||||
def process_line(match: Match[str]) -> str:
|
||||
columns = [col.strip() for col in match.group(0).split("|") if col.strip()]
|
||||
|
||||
prefix = match.group(1)
|
||||
return prefix + "| " + " | ".join(columns) + " |"
|
||||
|
||||
formatted_content = content
|
||||
formatted_content = self.line_pattern.sub(process_line, formatted_content)
|
||||
formatted_content = self.align_pattern.sub(process_align, formatted_content)
|
||||
|
||||
return formatted_content
|
||||
|
||||
@staticmethod
|
||||
def _self_test():
|
||||
test_content = """
|
||||
# 测试表格
|
||||
普通文本---不会被匹配
|
||||
|
||||
## 表格1(无前置空格)
|
||||
|
||||
| 姓名 | 年龄 | 城市 |
|
||||
| :---------- | -------: | :------ |
|
||||
| 张三 | 25 | 北京 |
|
||||
|
||||
## 表格3(前置4个空格+首尾|)
|
||||
| 产品 | 价格 | 库存 |
|
||||
| :-------------: | ----------- | :-----------: |
|
||||
| 手机 | 5999 | 100 |
|
||||
"""
|
||||
util = MarkdownTableUtil()
|
||||
format_content = util.format_table(test_content)
|
||||
print(format_content)
|
||||
|
||||
|
||||
class MarkdownTableFormatter(BaseParser):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.table_helper = MarkdownTableUtil()
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
text = endecode.decode_bytes(content)
|
||||
text = self.table_helper.format_table(text)
|
||||
return Document(content=text)
|
||||
|
||||
|
||||
class MarkdownImageUtil:
|
||||
def __init__(self):
|
||||
        self.b64_pattern = re.compile(
            r"!\[([^\]]*)\]\(data:image/(\w+)\+?\w*;base64,([^\)]+)\)"
        )
        self.image_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
        self.replace_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")

    def extract_image(
        self,
        content: str,
        path_prefix: Optional[str] = None,
        replace: bool = True,
    ) -> Tuple[str, List[str]]:
        """Extract image references from Markdown content"""

        # collected image paths
        images: List[str] = []

        def repl(match: Match[str]) -> str:
            title = match.group(1)
            image_path = match.group(2)
            if path_prefix:
                image_path = f"{path_prefix}/{image_path}"

            images.append(image_path)

            if not replace:
                return match.group(0)

            # Rewrite the image reference with the (possibly prefixed) path
            return f"![{title}]({image_path})"

        text = self.image_pattern.sub(repl, content)
        logger.debug(f"Extracted {len(images)} images from markdown")
        return text, images

    def extract_base64(
        self,
        content: str,
        path_prefix: Optional[str] = None,
        replace: bool = True,
    ) -> Tuple[str, Dict[str, bytes]]:
        """Extract base64 encoded images from Markdown content"""

        # image_path => base64 bytes
        images: Dict[str, bytes] = {}

        def repl(match: Match[str]) -> str:
            title = match.group(1)
            img_ext = match.group(2)
            img_b64 = match.group(3)

            image_byte = endecode.encode_image(img_b64, errors="ignore")
            if not image_byte:
                logger.error(f"Failed to decode base64 image, skip it: {img_b64}")
                return title

            image_path = f"{uuid.uuid4()}.{img_ext}"
            if path_prefix:
                image_path = f"{path_prefix}/{image_path}"
            images[image_path] = image_byte

            if not replace:
                return match.group(0)

            # Replace the inline base64 data with a plain image path
            return f"![{title}]({image_path})"

        text = self.b64_pattern.sub(repl, content)
        logger.debug(f"Extracted {len(images)} base64 images from markdown")
        return text, images

    def replace_path(self, content: str, images: Dict[str, str]) -> str:
        content_replace: set = set()

        def repl(match: Match[str]) -> str:
            title = match.group(1)
            image_path = match.group(2)
            if image_path not in images:
                return match.group(0)

            content_replace.add(image_path)
            image_path = images[image_path]
            # Point the image reference at the uploaded URL
            return f"![{title}]({image_path})"

        text = self.replace_pattern.sub(repl, content)
        logger.debug(f"Replaced {len(content_replace)} images in markdown")
        return text

    @staticmethod
    def _self_test():
        your_content = "testtest"
        image_handle = MarkdownImageUtil()
        text, images = image_handle.extract_base64(your_content)
        print(text)

        for image_url, image_byte in images.items():
            with open(image_url, "wb") as f:
                f.write(image_byte)


class MarkdownImageBase64(BaseParser):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.image_helper = MarkdownImageUtil()

    def parse_into_text(self, content: bytes) -> Document:
        # Convert byte content to string using universal decoding method
        text = endecode.decode_bytes(content)
        text, img_b64 = self.image_helper.extract_base64(text, path_prefix="images")

        images: Dict[str, str] = {}
        image_replace: Dict[str, str] = {}

        logger.debug(f"Uploading {len(img_b64)} images from markdown")
        for ipath, b64_bytes in img_b64.items():
            ext = os.path.splitext(ipath)[1].lower()
            image_url = self.storage.upload_bytes(b64_bytes, ext)

            image_replace[ipath] = image_url
            images[image_url] = base64.b64encode(b64_bytes).decode()

        text = self.image_helper.replace_path(text, image_replace)
        return Document(content=text, images=images)


class MarkdownParser(PipelineParser):
    _parser_cls = (MarkdownTableFormatter, MarkdownImageBase64)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    your_content = "testtest"
    parser = MarkdownParser()

    document = parser.parse_into_text(your_content.encode())
    logger.info(document.content)
    logger.info(f"Images: {len(document.images)}, name: {document.images.keys()}")

    MarkdownImageUtil._self_test()
    MarkdownTableUtil._self_test()
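The base64-image regex above is easiest to see on a tiny input. Below is a minimal, self-contained sketch (standalone Python, not part of the diff); the payload is fake sample data, not a real image:

import base64
import re
import uuid

B64_PATTERN = re.compile(r"!\[([^\]]*)\]\(data:image/(\w+)\+?\w*;base64,([^\)]+)\)")

# Fake payload, only to exercise the regex and the base64 round-trip
PIXEL = base64.b64encode(b"\x89PNG fake-bytes").decode()

md = f"before ![logo](data:image/png;base64,{PIXEL}) after"

def repl(match: re.Match) -> str:
    title, ext, payload = match.group(1), match.group(2), match.group(3)
    path = f"images/{uuid.uuid4()}.{ext}"
    # In the real parser the decoded bytes are kept for upload; here they are discarded
    base64.b64decode(payload)
    return f"![{title}]({path})"

print(B64_PATTERN.sub(repl, md))
# -> before ![logo](images/<uuid>.png) after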
31
docreader/parser/markitdown_parser.py
Normal file
@@ -0,0 +1,31 @@
import io
import logging

from markitdown import MarkItDown

from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownParser

logger = logging.getLogger(__name__)


class StdMarkitdownParser(BaseParser):
    """
    PDF Document Parser

    This parser handles PDF documents by extracting text content.
    It uses the markitdown library for simple text extraction.
    """

    def __init__(self, *args, **kwargs):
        self.markitdown = MarkItDown()

    def parse_into_text(self, content: bytes) -> Document:
        result = self.markitdown.convert(io.BytesIO(content), keep_data_uris=True)
        return Document(content=result.text_content)


class MarkitdownParser(PipelineParser):
    _parser_cls = (StdMarkitdownParser, MarkdownParser)
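A quick standalone sketch of the markitdown call used above (not part of the diff; "sample.pdf" is a placeholder path):

import io

from markitdown import MarkItDown

md = MarkItDown()
with open("sample.pdf", "rb") as f:
    # keep_data_uris=True preserves embedded images as data: URIs, so the
    # downstream MarkdownParser can extract and upload them.
    result = md.convert(io.BytesIO(f.read()), keep_data_uris=True)
print(result.text_content[:500])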
132
docreader/parser/mineru_parser.py
Normal file
@@ -0,0 +1,132 @@
import logging
import os
import re
from typing import Dict

import markdownify
import requests

from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownImageUtil, MarkdownTableFormatter
from docreader.utils import endecode

logger = logging.getLogger(__name__)


class StdMinerUParser(BaseParser):
    def __init__(
        self,
        enable_markdownify: bool = True,
        mineru_endpoint: str = "",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.minerU = os.getenv("MINERU_ENDPOINT", mineru_endpoint)
        self.enable_markdownify = enable_markdownify
        self.image_helper = MarkdownImageUtil()
        self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
        self.enable = self.ping()

    def ping(self, timeout: int = 5) -> bool:
        try:
            response = requests.get(
                self.minerU + "/docs", timeout=timeout, allow_redirects=True
            )
            response.raise_for_status()
            return True
        except Exception:
            return False

    def parse_into_text(self, content: bytes) -> Document:
        if not self.enable:
            logger.debug("MinerU API is not enabled")
            return Document()

        logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)")
        md_content: str = ""
        images_b64: Dict[str, str] = {}
        try:
            response = requests.post(
                url=self.minerU + "/file_parse",
                data={
                    "return_md": True,
                    "return_images": True,
                    "lang_list": ["ch", "en"],
                    "table_enable": True,
                    "formula_enable": True,
                    "parse_method": "auto",
                    "start_page_id": 0,
                    "end_page_id": 99999,
                    "backend": "pipeline",
                    "response_format_zip": False,
                    "return_middle_json": False,
                    "return_model_output": False,
                    "return_content_list": False,
                },
                files={"files": content},
                timeout=1000,
            )
            response.raise_for_status()
            result = response.json()["results"]["files"]
            md_content = result["md_content"]
            images_b64 = result.get("images", {})
        except Exception as e:
            logger.error(f"MinerU parsing failed: {e}", exc_info=True)
            return Document()

        # Convert HTML tables in the markdown to Markdown tables
        if self.enable_markdownify:
            logger.debug("Converting HTML to Markdown")
            md_content = markdownify.markdownify(md_content)

        images = {}
        image_replace = {}
        # Images in images_b64 may not be referenced in md_content
        # (e.g. crops that were recognized as tables), so filter them out.
        for ipath, b64_str in images_b64.items():
            if f"images/{ipath}" not in md_content:
                logger.debug(f"Image {ipath} not used in markdown")
                continue
            match = self.base64_pattern.match(b64_str)
            if match:
                file_ext = match.group(1)
                b64_str = match.group(2)

            image_bytes = endecode.encode_image(b64_str, errors="ignore")
            if not image_bytes:
                logger.error("Failed to decode base64 image, skip it")
                continue

            image_url = self.storage.upload_bytes(
                image_bytes, file_ext=f".{file_ext}"
            )

            images[image_url] = b64_str
            image_replace[f"images/{ipath}"] = image_url

        logger.info(f"Replaced {len(image_replace)} images in markdown")
        text = self.image_helper.replace_path(md_content, image_replace)

        logger.info(
            f"Successfully parsed PDF, text: {len(text)}, images: {len(images)}"
        )
        return Document(content=text, images=images)


class MinerUParser(PipelineParser):
    _parser_cls = (StdMinerUParser, MarkdownTableFormatter)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    your_file = "/path/to/your/file.pdf"
    your_mineru = "http://host.docker.internal:9987"
    parser = MinerUParser(mineru_endpoint=your_mineru)
    with open(your_file, "rb") as f:
        content = f.read()
    document = parser.parse_into_text(content)
    logger.error(document.content)
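The MinerU branch above expects a JSON body shaped like results.files.md_content plus an images map of data URIs keyed by file name. A hedged standalone sketch of that post-processing; the "resp" dict is a hypothetical example payload, not real MinerU output:

import re

resp = {
    "results": {
        "files": {
            "md_content": "# Page 1\n\n![](images/fig_0.jpg)\n",
            "images": {"fig_0.jpg": "data:image/jpeg;base64,QUJD"},
        }
    }
}

base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
files = resp["results"]["files"]
md_content = files["md_content"]

for ipath, b64_str in files.get("images", {}).items():
    if f"images/{ipath}" not in md_content:
        continue  # image not referenced in the markdown (e.g. a table crop)
    m = base64_pattern.match(b64_str)
    ext, payload = (m.group(1), m.group(2)) if m else ("png", b64_str)
    print(ipath, ext, len(payload))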
327
docreader/parser/ocr_engine.py
Normal file
@@ -0,0 +1,327 @@
import io
import logging
import os
import platform
import subprocess
from abc import ABC, abstractmethod
from typing import Dict, Union

import numpy as np
from openai import OpenAI
from PIL import Image

from docreader.utils import endecode

logger = logging.getLogger(__name__)


class OCRBackend(ABC):
    """Base class for OCR backends"""

    @abstractmethod
    def predict(self, image: Union[str, bytes, Image.Image]) -> str:
        """Extract text from an image

        Args:
            image: Image file path, bytes, or PIL Image object

        Returns:
            Extracted text
        """
        pass


class DummyOCRBackend(OCRBackend):
    """Dummy OCR backend implementation"""

    def predict(self, image: Union[str, bytes, Image.Image]) -> str:
        logger.warning("Dummy OCR backend is used")
        return ""


class PaddleOCRBackend(OCRBackend):
    """PaddleOCR backend implementation"""

    def __init__(self):
        """Initialize PaddleOCR backend"""
        self.ocr = None
        try:
            import paddle

            # Set PaddlePaddle to use CPU and disable GPU
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            paddle.device.set_device("cpu")

            # Try to detect whether the CPU supports the AVX instruction set
            try:
                if platform.system() == "Linux":
                    try:
                        result = subprocess.run(
                            ["grep", "-o", "avx", "/proc/cpuinfo"],
                            capture_output=True,
                            text=True,
                            timeout=5,
                        )
                        has_avx = "avx" in result.stdout.lower()
                        if not has_avx:
                            logger.warning(
                                "CPU does not support AVX instructions, "
                                "using compatibility mode"
                            )
                            # Further restrict the instruction sets used
                            os.environ["FLAGS_use_avx2"] = "0"
                            os.environ["FLAGS_use_avx"] = "1"
                    except (
                        subprocess.TimeoutExpired,
                        FileNotFoundError,
                        subprocess.SubprocessError,
                    ):
                        logger.warning(
                            "Could not detect AVX support, using compatibility mode"
                        )
                        os.environ["FLAGS_use_avx2"] = "0"
                        os.environ["FLAGS_use_avx"] = "1"
            except Exception as e:
                logger.warning(
                    f"Error detecting CPU capabilities: {e}, using compatibility mode"
                )
                os.environ["FLAGS_use_avx2"] = "0"
                os.environ["FLAGS_use_avx"] = "1"

            from paddleocr import PaddleOCR

            # OCR configuration with text orientation classification enabled
            ocr_config = {
                "use_gpu": False,
                "text_det_limit_type": "max",
                "text_det_limit_side_len": 960,
                "use_doc_orientation_classify": True,  # enable document orientation classification
                "use_doc_unwarping": False,
                "use_textline_orientation": True,  # enable text line orientation detection
                "text_recognition_model_name": "PP-OCRv4_server_rec",
                "text_detection_model_name": "PP-OCRv4_server_det",
                "text_det_thresh": 0.3,
                "text_det_box_thresh": 0.6,
                "text_det_unclip_ratio": 1.5,
                "text_rec_score_thresh": 0.0,
                "ocr_version": "PP-OCRv4",
                "lang": "ch",
                "show_log": False,
                "use_dilation": True,  # improves accuracy
                "det_db_score_mode": "slow",  # improves accuracy
            }

            self.ocr = PaddleOCR(**ocr_config)
            logger.info("PaddleOCR engine initialized successfully")

        except ImportError as e:
            logger.error(
                f"Failed to import paddleocr: {str(e)}. "
                "Please install it with 'pip install paddleocr'"
            )
        except OSError as e:
            if "Illegal instruction" in str(e) or "core dumped" in str(e):
                logger.error(
                    f"PaddlePaddle crashed due to CPU instruction set incompatibility: "
                    f"{e}"
                )
                logger.error(
                    "This happens when the CPU doesn't support AVX instructions. "
                    "Try installing a CPU-only version of PaddlePaddle, "
                    "or use a different OCR backend."
                )
            else:
                logger.error(
                    f"Failed to initialize PaddleOCR due to OS error: {str(e)}"
                )
        except Exception as e:
            logger.error(f"Failed to initialize PaddleOCR: {str(e)}")

    def predict(self, image: Union[str, bytes, Image.Image]) -> str:
        """Extract text from an image

        Args:
            image: Image file path, bytes, or PIL Image object

        Returns:
            Extracted text
        """
        if isinstance(image, str):
            image = Image.open(image)
        elif isinstance(image, bytes):
            image = Image.open(io.BytesIO(image))

        if not isinstance(image, Image.Image):
            raise TypeError("image must be a string, bytes, or PIL Image object")

        return self._predict(image)

    def _predict(self, image: Image.Image) -> str:
        """Perform OCR recognition on the image

        Args:
            image: Image object (PIL.Image or numpy array)

        Returns:
            Extracted text string
        """
        if self.ocr is None:
            logger.error("PaddleOCR engine not initialized")
            return ""
        try:
            # Ensure image is in RGB format
            if image.mode != "RGB":
                image = image.convert("RGB")

            # Convert to numpy array if needed
            image_array = np.array(image)

            # Perform OCR
            ocr_result = self.ocr.ocr(image_array, cls=False)

            # Extract text
            ocr_text = ""
            if ocr_result and ocr_result[0]:
                text = [
                    line[1][0] if line and len(line) >= 2 and line[1] else ""
                    for line in ocr_result[0]
                ]
                text = [t.strip() for t in text if t]
                ocr_text = " ".join(text)

            logger.info(f"OCR extracted {len(ocr_text)} characters")
            return ocr_text

        except Exception as e:
            logger.error(f"OCR recognition error: {str(e)}")
            return ""


class NanonetsOCRBackend(OCRBackend):
    """Nanonets OCR backend implementation using OpenAI API format"""

    def __init__(self):
        """Initialize Nanonets OCR backend

        Reads the API base URL, API key, and model name from the
        OCR_API_BASE_URL, OCR_API_KEY, and OCR_MODEL environment variables.
        """
        base_url = os.getenv("OCR_API_BASE_URL", "http://localhost:8000/v1")
        api_key = os.getenv("OCR_API_KEY", "123")
        timeout = 30
        self.client = OpenAI(api_key=api_key, base_url=base_url, timeout=timeout)

        self.model = os.getenv("OCR_MODEL", "nanonets/Nanonets-OCR-s")
        logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
        self.temperature = 0.0
        self.max_tokens = 15000
        self.prompt = """## 任务说明

请从上传的文档中提取文字内容,严格按自然阅读顺序(从上到下,从左到右)输出,并遵循以下格式规范。

### 1. **文本处理**

* 按正常阅读顺序提取文字,语句流畅自然。

### 2. **表格**

* 所有表格统一转换为 **Markdown 表格格式**。
* 内容保持清晰、对齐整齐,便于阅读。

### 3. **公式**

* 所有公式转换为 **LaTeX 格式**,使用 `$$公式$$` 包裹。

### 4. **图片**

* 忽略图片信息

### 5. **链接**

* 不要猜测或补全不确定的链接地址。
"""

    def predict(self, image: Union[str, bytes, Image.Image]) -> str:
        """Extract text from an image using Nanonets OCR

        Args:
            image: Image file path, bytes, or PIL Image object

        Returns:
            Extracted text
        """
        if self.client is None:
            logger.error("Nanonets OCR client not initialized")
            return ""

        try:
            # Encode image to base64
            img_base64 = endecode.decode_image(image)
            if not img_base64:
                return ""

            # Call Nanonets OCR API
            logger.info(f"Calling Nanonets OCR API with model: {self.model}")
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{img_base64}"
                                },
                            },
                            {
                                "type": "text",
                                "text": self.prompt,
                            },
                        ],
                    }
                ],
                temperature=self.temperature,
                max_tokens=self.max_tokens,
            )
            return response.choices[0].message.content or ""
        except Exception as e:
            logger.error(f"Nanonets OCR prediction error: {str(e)}")
            return ""


class OCREngine:
    """OCR Engine factory class"""

    _instance: Dict[str, OCRBackend] = {}

    @classmethod
    def get_instance(cls, backend_type: str) -> OCRBackend:
        """Get OCR engine instance

        Args:
            backend_type: OCR backend type, one of: "paddle", "nanonets"

        Returns:
            OCR engine instance (a DummyOCRBackend is returned for unknown types)
        """
        backend_type = backend_type.lower()
        if cls._instance.get(backend_type):
            return cls._instance[backend_type]

        logger.info(f"Initializing OCR engine with backend: {backend_type}")

        if backend_type == "paddle":
            cls._instance[backend_type] = PaddleOCRBackend()

        elif backend_type == "nanonets":
            cls._instance[backend_type] = NanonetsOCRBackend()

        else:
            cls._instance[backend_type] = DummyOCRBackend()

        return cls._instance[backend_type]
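Typical use of the factory above, as a short sketch; "page.png" is a placeholder path, and every backend degrades to an empty string if it failed to initialize:

from docreader.parser.ocr_engine import OCREngine

ocr = OCREngine.get_instance("paddle")     # instances are cached per backend type
text = ocr.predict("page.png")             # also accepts bytes or a PIL Image
if not text:
    # fall back to the OpenAI-compatible Nanonets backend
    text = OCREngine.get_instance("nanonets").predict("page.png")
print(text)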
163
docreader/parser/parser.py
Normal file
@@ -0,0 +1,163 @@
import logging
from typing import Dict, Type

from docreader.models.document import Document
from docreader.models.read_config import ChunkingConfig
from docreader.parser.base_parser import BaseParser
from docreader.parser.csv_parser import CSVParser
from docreader.parser.doc_parser import DocParser
from docreader.parser.docx2_parser import Docx2Parser
from docreader.parser.excel_parser import ExcelParser
from docreader.parser.image_parser import ImageParser
from docreader.parser.markdown_parser import MarkdownParser
from docreader.parser.pdf_parser import PDFParser
from docreader.parser.text_parser import TextParser
from docreader.parser.web_parser import WebParser

logger = logging.getLogger(__name__)


class Parser:
    """
    Document parser facade that integrates all specialized parsers.
    Provides a unified interface for parsing various document types.
    """

    def __init__(self):
        # Initialize all parser types
        self.parsers: Dict[str, Type[BaseParser]] = {
            "docx": Docx2Parser,
            "doc": DocParser,
            "pdf": PDFParser,
            "md": MarkdownParser,
            "txt": TextParser,
            "jpg": ImageParser,
            "jpeg": ImageParser,
            "png": ImageParser,
            "gif": ImageParser,
            "bmp": ImageParser,
            "tiff": ImageParser,
            "webp": ImageParser,
            "markdown": MarkdownParser,
            "csv": CSVParser,
            "xlsx": ExcelParser,
            "xls": ExcelParser,
        }
        logger.info(
            "Parser initialized with %d parsers: %s",
            len(self.parsers),
            ", ".join(self.parsers.keys()),
        )

    def get_parser(self, file_type: str) -> Type[BaseParser]:
        """
        Get parser class for the specified file type.

        Args:
            file_type: The file extension or type identifier

        Returns:
            Parser class for the file type

        Raises:
            ValueError: If the file type is unsupported
        """
        parser = self.parsers.get(file_type.lower())
        if not parser:
            raise ValueError(f"Unsupported file type: {file_type}")
        return parser

    def parse_file(
        self,
        file_name: str,
        file_type: str,
        content: bytes,
        config: ChunkingConfig,
    ) -> Document:
        """
        Parse file content using the appropriate parser based on file type.

        Args:
            file_name: Name of the file being parsed
            file_type: Type/extension of the file
            content: Raw file content as bytes
            config: Configuration for the chunking process

        Returns:
            Document containing chunks and metadata
        """
        logger.info(f"Parsing file: {file_name} with type: {file_type}")
        logger.info(
            f"Chunking config: size={config.chunk_size}, "
            f"overlap={config.chunk_overlap}, "
            f"multimodal={config.enable_multimodal}"
        )

        # Get appropriate parser for file type
        cls = self.get_parser(file_type)

        # Parse file content
        logger.info(f"Creating parser instance for {file_type} file")
        parser = cls(
            file_name=file_name,
            file_type=file_type,
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap,
            separators=config.separators,
            enable_multimodal=config.enable_multimodal,
            max_image_size=1920,  # Limit image size to 1920px
            max_concurrent_tasks=5,  # Limit concurrent tasks to 5
            chunking_config=config,  # Pass the entire chunking config
        )

        logger.info(f"Starting to parse file content, size: {len(content)} bytes")
        result = parser.parse(content)

        if not result.content:
            logger.warning(f"Parser returned empty content for file: {file_name}")
        elif not result.chunks:
            logger.warning(f"Parser returned empty chunks for file: {file_name}")
        elif result.chunks[0]:
            logger.info(f"First chunk content length: {len(result.chunks[0].content)}")
        logger.info(f"Parsed file {file_name}, with {len(result.chunks)} chunks")
        return result

    def parse_url(self, url: str, title: str, config: ChunkingConfig) -> Document:
        """
        Parse content from a URL using the WebParser.

        Args:
            url: URL to parse
            title: Title of the webpage (for metadata)
            config: Configuration for the chunking process

        Returns:
            Document containing chunks and metadata
        """
        logger.info(f"Parsing URL: {url}, title: {title}")
        logger.info(
            f"Chunking config: size={config.chunk_size}, "
            f"overlap={config.chunk_overlap}, multimodal={config.enable_multimodal}"
        )

        # Create web parser instance
        logger.info("Creating WebParser instance")
        parser = WebParser(
            title=title,
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap,
            separators=config.separators,
            enable_multimodal=config.enable_multimodal,
            max_image_size=1920,  # Limit image size
            max_concurrent_tasks=5,  # Limit concurrent tasks
            chunking_config=config,
        )

        logger.info("Starting to parse URL content")
        result = parser.parse(url.encode())

        if not result.content:
            logger.warning(f"Parser returned empty content for url: {url}")
        elif not result.chunks:
            logger.warning(f"Parser returned empty chunks for url: {url}")
        elif result.chunks[0]:
            logger.info(f"First chunk content length: {len(result.chunks[0].content)}")
        logger.info(f"Parsed url {url}, with {len(result.chunks)} chunks")
        return result
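A short usage sketch for the facade above; the ChunkingConfig constructor arguments are assumptions inferred from the fields parse_file reads, and "report.docx" is a placeholder file:

from docreader.models.read_config import ChunkingConfig
from docreader.parser.parser import Parser

config = ChunkingConfig(
    chunk_size=512,
    chunk_overlap=64,
    separators=["\n\n", "\n"],
    enable_multimodal=False,
)

parser = Parser()
with open("report.docx", "rb") as f:
    doc = parser.parse_file("report.docx", "docx", f.read(), config)
print(len(doc.chunks), "chunks")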
7
docreader/parser/pdf_parser.py
Normal file
@@ -0,0 +1,7 @@
from docreader.parser.chain_parser import FirstParser
from docreader.parser.markitdown_parser import MarkitdownParser
from docreader.parser.mineru_parser import MinerUParser


class PDFParser(FirstParser):
    _parser_cls = (MinerUParser, MarkitdownParser)
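PDFParser composes MinerU and markitdown as a fallback chain. chain_parser.py is not shown in this diff, so the sketch below only illustrates the presumed first-success pattern behind FirstParser, not its actual implementation:

from typing import Callable, Sequence


def first_success(parsers: Sequence[Callable[[bytes], str]], content: bytes) -> str:
    for parse in parsers:
        result = parse(content)
        if result:  # keep the first parser that produced non-empty output
            return result
    return ""


# e.g. try the remote, layout-aware MinerU first, fall back to markitdown
text = first_success([lambda b: "", lambda b: "# extracted markdown"], b"%PDF-1.7 ...")
print(text)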
@@ -1,64 +1,68 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import uuid
|
||||
import logging
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import traceback
|
||||
import uuid
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Tuple, Optional
|
||||
from typing import Dict
|
||||
|
||||
from qcloud_cos import CosConfig, CosS3Client
|
||||
from minio import Minio
|
||||
from qcloud_cos import CosConfig, CosS3Client
|
||||
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
class Storage(ABC):
|
||||
"""Abstract base class for object storage operations"""
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def upload_file(self, file_path: str) -> str:
|
||||
"""Upload file to object storage
|
||||
|
||||
|
||||
Args:
|
||||
file_path: File path
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
|
||||
"""Upload bytes to object storage
|
||||
|
||||
Args:
|
||||
content: Byte content to upload
|
||||
file_ext: File extension
|
||||
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
|
||||
"""Upload bytes to object storage
|
||||
|
||||
Args:
|
||||
content: Byte content to upload
|
||||
file_ext: File extension
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class CosStorage(Storage):
|
||||
"""Tencent Cloud COS storage implementation"""
|
||||
|
||||
|
||||
def __init__(self, storage_config=None):
|
||||
"""Initialize COS storage
|
||||
|
||||
|
||||
Args:
|
||||
storage_config: Storage configuration
|
||||
"""
|
||||
self.storage_config = storage_config
|
||||
self.client, self.bucket_name, self.region, self.prefix = self._init_cos_client()
|
||||
|
||||
self.client, self.bucket_name, self.region, self.prefix = (
|
||||
self._init_cos_client()
|
||||
)
|
||||
|
||||
def _init_cos_client(self):
|
||||
"""Initialize Tencent Cloud COS client"""
|
||||
try:
|
||||
# Use provided COS config if available, otherwise fall back to environment variables
|
||||
# Use provided COS config if available,
|
||||
# otherwise fall back to environment variables
|
||||
if self.storage_config and self.storage_config.get("access_key_id") != "":
|
||||
cos_config = self.storage_config
|
||||
secret_id = cos_config.get("access_key_id")
|
||||
@@ -75,15 +79,16 @@ class CosStorage(Storage):
|
||||
bucket_name = os.getenv("COS_BUCKET_NAME")
|
||||
appid = os.getenv("COS_APP_ID")
|
||||
prefix = os.getenv("COS_PATH_PREFIX")
|
||||
|
||||
|
||||
enable_old_domain = (
|
||||
os.getenv("COS_ENABLE_OLD_DOMAIN", "true").lower() == "true"
|
||||
)
|
||||
|
||||
if not all([secret_id, secret_key, region, bucket_name, appid]):
|
||||
logger.error(
|
||||
"Incomplete COS configuration, missing required environment variables"
|
||||
f"secret_id: {secret_id}, secret_key: {secret_key}, region: {region}, bucket_name: {bucket_name}, appid: {appid}"
|
||||
"Incomplete COS configuration, missing environment variables"
|
||||
f"secret_id: {secret_id}, secret_key: {secret_key}, "
|
||||
f"region: {region}, bucket_name: {bucket_name}, appid: {appid}"
|
||||
)
|
||||
return None, None, None, None
|
||||
|
||||
@@ -105,27 +110,26 @@ class CosStorage(Storage):
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize COS client: {str(e)}")
|
||||
return None, None, None, None
|
||||
|
||||
|
||||
def _get_download_url(self, bucket_name, region, object_key):
|
||||
"""Generate COS object URL
|
||||
|
||||
|
||||
Args:
|
||||
bucket_name: Bucket name
|
||||
region: Region
|
||||
object_key: Object key
|
||||
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
return f"https://{bucket_name}.cos.{region}.myqcloud.com/{object_key}"
|
||||
|
||||
|
||||
|
||||
def upload_file(self, file_path: str) -> str:
|
||||
"""Upload file to Tencent Cloud COS
|
||||
|
||||
|
||||
Args:
|
||||
file_path: File path
|
||||
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
@@ -135,16 +139,16 @@ class CosStorage(Storage):
|
||||
return ""
|
||||
|
||||
# Generate object key, use UUID to avoid conflicts
|
||||
file_name = os.path.basename(file_path)
|
||||
object_key = (
|
||||
f"{self.prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
|
||||
)
|
||||
file_ext = os.path.splitext(file_path)[1]
|
||||
object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
|
||||
logger.info(f"Generated object key: {object_key}")
|
||||
|
||||
# Upload file
|
||||
logger.info("Attempting to upload file to COS")
|
||||
response = self.client.upload_file(
|
||||
Bucket=self.bucket_name, LocalFilePath=file_path, Key=object_key
|
||||
self.client.upload_file(
|
||||
Bucket=self.bucket_name,
|
||||
LocalFilePath=file_path,
|
||||
Key=object_key,
|
||||
)
|
||||
|
||||
# Get file URL
|
||||
@@ -156,14 +160,14 @@ class CosStorage(Storage):
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to upload file to COS: {str(e)}")
|
||||
return ""
|
||||
|
||||
|
||||
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
|
||||
"""Upload bytes to Tencent Cloud COS
|
||||
|
||||
|
||||
Args:
|
||||
content: Byte content to upload
|
||||
file_ext: File extension
|
||||
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
@@ -171,10 +175,16 @@ class CosStorage(Storage):
|
||||
logger.info(f"Uploading bytes content to COS, size: {len(content)} bytes")
|
||||
if not self.client:
|
||||
return ""
|
||||
|
||||
object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.prefix else f"images/{uuid.uuid4().hex}{file_ext}"
|
||||
|
||||
object_key = (
|
||||
f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
|
||||
if self.prefix
|
||||
else f"images/{uuid.uuid4().hex}{file_ext}"
|
||||
)
|
||||
logger.info(f"Generated object key: {object_key}")
|
||||
self.client.put_object(Bucket=self.bucket_name, Body=content, Key=object_key)
|
||||
self.client.put_object(
|
||||
Bucket=self.bucket_name, Body=content, Key=object_key
|
||||
)
|
||||
file_url = self._get_download_url(self.bucket_name, self.region, object_key)
|
||||
logger.info(f"Successfully uploaded bytes to COS: {file_url}")
|
||||
return file_url
|
||||
@@ -186,16 +196,18 @@ class CosStorage(Storage):
|
||||
|
||||
class MinioStorage(Storage):
|
||||
"""MinIO storage implementation"""
|
||||
|
||||
|
||||
def __init__(self, storage_config=None):
|
||||
"""Initialize MinIO storage
|
||||
|
||||
|
||||
Args:
|
||||
storage_config: Storage configuration
|
||||
"""
|
||||
self.storage_config = storage_config
|
||||
self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = self._init_minio_client()
|
||||
|
||||
self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = (
|
||||
self._init_minio_client()
|
||||
)
|
||||
|
||||
def _init_minio_client(self):
|
||||
"""Initialize MinIO client from environment variables or injected config.
|
||||
|
||||
@@ -203,58 +215,69 @@ class MinioStorage(Storage):
|
||||
prefer those values to override envs.
|
||||
"""
|
||||
try:
|
||||
endpoint = os.getenv("MINIO_ENDPOINT")
|
||||
endpoint = os.getenv("MINIO_ENDPOINT", "")
|
||||
use_ssl = os.getenv("MINIO_USE_SSL", "false").lower() == "true"
|
||||
if self.storage_config and self.storage_config.get("bucket_name"):
|
||||
storage_config = self.storage_config
|
||||
bucket_name = storage_config.get("bucket_name")
|
||||
bucket_name = storage_config.get("bucket_name", "")
|
||||
path_prefix = storage_config.get("path_prefix").strip().strip("/")
|
||||
access_key = storage_config.get("access_key_id")
|
||||
secret_key = storage_config.get("secret_access_key")
|
||||
else:
|
||||
access_key = os.getenv("MINIO_ACCESS_KEY_ID")
|
||||
secret_key = os.getenv("MINIO_SECRET_ACCESS_KEY")
|
||||
bucket_name = os.getenv("MINIO_BUCKET_NAME")
|
||||
bucket_name = os.getenv("MINIO_BUCKET_NAME", "")
|
||||
path_prefix = os.getenv("MINIO_PATH_PREFIX", "").strip().strip("/")
|
||||
|
||||
if not all([endpoint, access_key, secret_key, bucket_name]):
|
||||
logger.error("Incomplete MinIO configuration, missing required environment variables")
|
||||
logger.error(
|
||||
"Incomplete MinIO configuration, missing environment variables"
|
||||
)
|
||||
return None, None, None, None, None
|
||||
|
||||
# Initialize client
|
||||
client = Minio(endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl)
|
||||
client = Minio(
|
||||
endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl
|
||||
)
|
||||
|
||||
# Ensure bucket exists
|
||||
found = client.bucket_exists(bucket_name)
|
||||
if not found:
|
||||
client.make_bucket(bucket_name)
|
||||
policy = '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}' % (bucket_name, bucket_name)
|
||||
policy = (
|
||||
'{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}'
|
||||
% (bucket_name, bucket_name)
|
||||
)
|
||||
client.set_bucket_policy(bucket_name, policy)
|
||||
|
||||
return client, bucket_name, use_ssl, endpoint, path_prefix
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize MinIO client: {str(e)}")
|
||||
return None, None, None, None, None
|
||||
|
||||
def _get_download_url(self, bucket_name: str, object_key: str, use_ssl: bool, endpoint: str, public_endpoint: str = None):
|
||||
|
||||
def _get_download_url(self, object_key: str):
|
||||
"""Construct a public URL for MinIO object.
|
||||
|
||||
If MINIO_PUBLIC_ENDPOINT is provided, use it; otherwise fallback to endpoint.
|
||||
"""
|
||||
if public_endpoint:
|
||||
base = public_endpoint
|
||||
else:
|
||||
scheme = "https" if use_ssl else "http"
|
||||
base = f"{scheme}://{endpoint}"
|
||||
# Path-style URL for MinIO
|
||||
return f"{base}/{bucket_name}/{object_key}"
|
||||
|
||||
# 1. Use public endpoint if provided
|
||||
endpoint = os.getenv("MINIO_PUBLIC_ENDPOINT")
|
||||
if endpoint:
|
||||
return f"{endpoint}/{self.bucket_name}/{object_key}"
|
||||
|
||||
# 2. Use SSL if enabled
|
||||
if self.use_ssl:
|
||||
return f"https://{self.endpoint}/{self.bucket_name}/{object_key}"
|
||||
|
||||
# 3. Use HTTP default
|
||||
return f"http://{self.endpoint}/{self.bucket_name}/{object_key}"
|
||||
|
||||
def upload_file(self, file_path: str) -> str:
|
||||
"""Upload file to MinIO
|
||||
|
||||
|
||||
Args:
|
||||
file_path: File path
|
||||
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
@@ -265,29 +288,27 @@ class MinioStorage(Storage):
|
||||
|
||||
# Generate object key, use UUID to avoid conflicts
|
||||
file_name = os.path.basename(file_path)
|
||||
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" if self.path_prefix else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
|
||||
object_key = (
|
||||
f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
|
||||
if self.path_prefix
|
||||
else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
|
||||
)
|
||||
logger.info(f"Generated MinIO object key: {object_key}")
|
||||
|
||||
# Upload file
|
||||
logger.info("Attempting to upload file to MinIO")
|
||||
with open(file_path, 'rb') as file_data:
|
||||
with open(file_path, "rb") as file_data:
|
||||
file_size = os.path.getsize(file_path)
|
||||
self.client.put_object(
|
||||
bucket_name=self.bucket_name,
|
||||
bucket_name=self.bucket_name or "",
|
||||
object_name=object_key,
|
||||
data=file_data,
|
||||
length=file_size,
|
||||
content_type='application/octet-stream'
|
||||
content_type="application/octet-stream",
|
||||
)
|
||||
|
||||
# Get file URL
|
||||
file_url = self._get_download_url(
|
||||
self.bucket_name,
|
||||
object_key,
|
||||
self.use_ssl,
|
||||
self.endpoint,
|
||||
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
|
||||
)
|
||||
file_url = self._get_download_url(object_key)
|
||||
|
||||
logger.info(f"Successfully uploaded file to MinIO: {file_url}")
|
||||
return file_url
|
||||
@@ -295,14 +316,14 @@ class MinioStorage(Storage):
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to upload file to MinIO: {str(e)}")
|
||||
return ""
|
||||
|
||||
|
||||
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
|
||||
"""Upload bytes to MinIO
|
||||
|
||||
|
||||
Args:
|
||||
content: Byte content to upload
|
||||
file_ext: File extension
|
||||
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
@@ -310,23 +331,21 @@ class MinioStorage(Storage):
|
||||
logger.info(f"Uploading bytes content to MinIO, size: {len(content)} bytes")
|
||||
if not self.client:
|
||||
return ""
|
||||
|
||||
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.path_prefix else f"images/{uuid.uuid4().hex}{file_ext}"
|
||||
|
||||
object_key = (
|
||||
f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}"
|
||||
if self.path_prefix
|
||||
else f"images/{uuid.uuid4().hex}{file_ext}"
|
||||
)
|
||||
logger.info(f"Generated MinIO object key: {object_key}")
|
||||
self.client.put_object(
|
||||
self.bucket_name,
|
||||
object_key,
|
||||
data=io.BytesIO(content),
|
||||
length=len(content),
|
||||
content_type="application/octet-stream"
|
||||
)
|
||||
file_url = self._get_download_url(
|
||||
self.bucket_name,
|
||||
object_key,
|
||||
self.use_ssl,
|
||||
self.endpoint,
|
||||
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
|
||||
self.bucket_name or "",
|
||||
object_key,
|
||||
data=io.BytesIO(content),
|
||||
length=len(content),
|
||||
content_type="application/octet-stream",
|
||||
)
|
||||
file_url = self._get_download_url(object_key)
|
||||
logger.info(f"Successfully uploaded bytes to MinIO: {file_url}")
|
||||
return file_url
|
||||
except Exception as e:
|
||||
@@ -335,26 +354,61 @@ class MinioStorage(Storage):
|
||||
return ""
|
||||
|
||||
|
||||
def create_storage(storage_config=None) -> Storage:
|
||||
class LocalStorage(Storage):
|
||||
"""Local file system storage implementation"""
|
||||
|
||||
def __init__(self, storage_config: Dict[str, str] = {}):
|
||||
self.storage_config = storage_config
|
||||
base_dir = storage_config.get(
|
||||
"base_dir", os.getenv("LOCAL_STORAGE_BASE_DIR", "")
|
||||
)
|
||||
self.image_dir = os.path.join(base_dir, "images")
|
||||
os.makedirs(self.image_dir, exist_ok=True)
|
||||
|
||||
def upload_file(self, file_path: str) -> str:
|
||||
logger.info(f"Uploading file to local storage: {file_path}")
|
||||
return file_path
|
||||
|
||||
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
|
||||
logger.info(f"Uploading file to local storage: {len(content)} bytes")
|
||||
fname = os.path.join(self.image_dir, f"{uuid.uuid4()}{file_ext}")
|
||||
with open(fname, "wb") as f:
|
||||
f.write(content)
|
||||
return fname
|
||||
|
||||
|
||||
class Base64Storage(Storage):
|
||||
def upload_file(self, file_path: str) -> str:
|
||||
logger.info(f"Uploading file to base64 storage: {file_path}")
|
||||
return file_path
|
||||
|
||||
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
|
||||
logger.info(f"Uploading file to base64 storage: {len(content)} bytes")
|
||||
file_ext = file_ext.lstrip(".")
|
||||
return f"data:image/{file_ext};base64,{endecode.decode_image(content)}"
|
||||
|
||||
|
||||
def create_storage(storage_config: Dict[str, str] | None = None) -> Storage:
|
||||
"""Create a storage instance based on configuration or environment variables
|
||||
|
||||
|
||||
Args:
|
||||
storage_config: Storage configuration dictionary
|
||||
|
||||
|
||||
Returns:
|
||||
Storage instance
|
||||
"""
|
||||
storage_type = os.getenv("STORAGE_TYPE", "cos").lower()
|
||||
|
||||
if storage_config:
|
||||
storage_type = str(storage_config.get("provider", storage_type)).lower()
|
||||
|
||||
logger.info(f"Creating {storage_type} storage instance")
|
||||
|
||||
|
||||
if storage_type == "minio":
|
||||
return MinioStorage(storage_config)
|
||||
elif storage_type == "cos":
|
||||
# Default to COS
|
||||
return CosStorage(storage_config)
|
||||
else:
|
||||
return None
|
||||
elif storage_type == "local":
|
||||
return LocalStorage(storage_config or {})
|
||||
elif storage_type == "base64":
|
||||
return Base64Storage()
|
||||
|
||||
raise ValueError(f"Invalid storage type: {storage_type}")
|
||||
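A hedged usage sketch for create_storage above; the module path in the import is assumed from context, the config keys mirror the ones the MinIO branch reads, and the credential values are placeholders:

import os

from docreader.utils.storage import create_storage  # module path assumed from context

os.environ.setdefault("STORAGE_TYPE", "local")
os.environ.setdefault("LOCAL_STORAGE_BASE_DIR", "/tmp/docreader")

storage = create_storage()  # falls back to environment configuration
url = storage.upload_bytes(b"\x89PNG fake-bytes", ".png")
print(url)

minio_storage = create_storage(
    {
        "provider": "minio",
        "bucket_name": "weknora",
        "path_prefix": "docreader",
        "access_key_id": "minioadmin",
        "secret_access_key": "minioadmin",
    }
)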
@@ -1,6 +1,8 @@
|
||||
import logging
|
||||
from .base_parser import BaseParser
|
||||
from typing import Dict, Any, Tuple, Union
|
||||
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -11,7 +13,7 @@ class TextParser(BaseParser):
|
||||
This parser handles text extraction and chunking from plain text documents.
|
||||
"""
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
"""
|
||||
Parse text document content by decoding bytes to string.
|
||||
|
||||
@@ -25,20 +27,15 @@ class TextParser(BaseParser):
|
||||
Parsed text content as string
|
||||
"""
|
||||
logger.info(f"Parsing text document, content size: {len(content)} bytes")
|
||||
text = self.decode_bytes(content)
|
||||
text = endecode.decode_bytes(content)
|
||||
logger.info(
|
||||
f"Successfully parsed text document, extracted {len(text)} characters"
|
||||
)
|
||||
return text
|
||||
return Document(content=text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
)
|
||||
logger.info("Running TextParser in standalone mode")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Sample text for testing
|
||||
text = """## 标题1
|
||||
104
docreader/parser/web_parser.py
Normal file
@@ -0,0 +1,104 @@
import asyncio
import logging
import os

from playwright.async_api import async_playwright
from trafilatura import extract

from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownParser
from docreader.utils import endecode

logger = logging.getLogger(__name__)


class StdWebParser(BaseParser):
    """Web page parser"""

    def __init__(self, title: str, **kwargs):
        self.title = title
        self.proxy = os.environ.get("WEB_PROXY", "")
        super().__init__(file_name=title, **kwargs)
        logger.info(f"Initialized WebParser with title: {title}")

    async def scrape(self, url: str) -> str:
        logger.info(f"Starting web page scraping for URL: {url}")
        try:
            async with async_playwright() as p:
                kwargs = {}
                if self.proxy:
                    kwargs["proxy"] = {"server": self.proxy}
                logger.info("Launching WebKit browser")
                browser = await p.webkit.launch(**kwargs)
                page = await browser.new_page()

                logger.info(f"Navigating to URL: {url}")
                try:
                    await page.goto(url, timeout=30000)
                    logger.info("Initial page load complete")
                except Exception as e:
                    logger.error(f"Error navigating to URL: {str(e)}")
                    await browser.close()
                    return ""

                logger.info("Retrieving page HTML content")
                content = await page.content()
                logger.info(f"Retrieved {len(content)} bytes of HTML content")

                await browser.close()
                logger.info("Browser closed")

                return content

        except Exception as e:
            logger.error(f"Failed to scrape web page: {str(e)}")
            # Return an empty string on error
            return ""

    def parse_into_text(self, content: bytes) -> Document:
        """Parse web page

        Args:
            content: URL of the web page, encoded as bytes

        Returns:
            Parse result
        """
        url = endecode.decode_bytes(content)

        logger.info(f"Scraping web page: {url}")
        chtml = asyncio.run(self.scrape(url))
        # Extract the main content as Markdown with trafilatura
        md_text = extract(
            chtml,
            output_format="markdown",
            with_metadata=True,
            include_images=True,
            include_tables=True,
            include_links=True,
            deduplicate=True,
        )
        if not md_text:
            logger.error("Failed to parse web page")
            return Document(content=f"Error parsing web page: {url}")
        return Document(content=md_text)


class WebParser(PipelineParser):
    _parser_cls = (StdWebParser, MarkdownParser)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    logger.setLevel(logging.DEBUG)

    url = "https://cloud.tencent.com/document/product/457/6759"

    parser = WebParser(title="")
    cc = parser.parse_into_text(url.encode())
    with open("./tencent.md", "w") as f:
        f.write(cc.content)
55
docreader/proto/docreader_pb2.py
Normal file
@@ -0,0 +1,55 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# NO CHECKED-IN PROTOBUF GENCODE
|
||||
# source: docreader.proto
|
||||
# Protobuf Python Version: 6.31.1
|
||||
"""Generated protocol buffer code."""
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import descriptor_pool as _descriptor_pool
|
||||
from google.protobuf import runtime_version as _runtime_version
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
from google.protobuf.internal import builder as _builder
|
||||
_runtime_version.ValidateProtobufRuntimeVersion(
|
||||
_runtime_version.Domain.PUBLIC,
|
||||
6,
|
||||
31,
|
||||
1,
|
||||
'',
|
||||
'docreader.proto'
|
||||
)
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
|
||||
|
||||
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0f\x64ocreader.proto\x12\tdocreader\"\xb9\x01\n\rStorageConfig\x12,\n\x08provider\x18\x01 \x01(\x0e\x32\x1a.docreader.StorageProvider\x12\x0e\n\x06region\x18\x02 \x01(\t\x12\x13\n\x0b\x62ucket_name\x18\x03 \x01(\t\x12\x15\n\raccess_key_id\x18\x04 \x01(\t\x12\x19\n\x11secret_access_key\x18\x05 \x01(\t\x12\x0e\n\x06\x61pp_id\x18\x06 \x01(\t\x12\x13\n\x0bpath_prefix\x18\x07 \x01(\t\"Z\n\tVLMConfig\x12\x12\n\nmodel_name\x18\x01 \x01(\t\x12\x10\n\x08\x62\x61se_url\x18\x02 \x01(\t\x12\x0f\n\x07\x61pi_key\x18\x03 \x01(\t\x12\x16\n\x0einterface_type\x18\x04 \x01(\t\"\xc2\x01\n\nReadConfig\x12\x12\n\nchunk_size\x18\x01 \x01(\x05\x12\x15\n\rchunk_overlap\x18\x02 \x01(\x05\x12\x12\n\nseparators\x18\x03 \x03(\t\x12\x19\n\x11\x65nable_multimodal\x18\x04 \x01(\x08\x12\x30\n\x0estorage_config\x18\x05 \x01(\x0b\x32\x18.docreader.StorageConfig\x12(\n\nvlm_config\x18\x06 \x01(\x0b\x32\x14.docreader.VLMConfig\"\x91\x01\n\x13ReadFromFileRequest\x12\x14\n\x0c\x66ile_content\x18\x01 \x01(\x0c\x12\x11\n\tfile_name\x18\x02 \x01(\t\x12\x11\n\tfile_type\x18\x03 \x01(\t\x12*\n\x0bread_config\x18\x04 \x01(\x0b\x32\x15.docreader.ReadConfig\x12\x12\n\nrequest_id\x18\x05 \x01(\t\"p\n\x12ReadFromURLRequest\x12\x0b\n\x03url\x18\x01 \x01(\t\x12\r\n\x05title\x18\x02 \x01(\t\x12*\n\x0bread_config\x18\x03 \x01(\x0b\x32\x15.docreader.ReadConfig\x12\x12\n\nrequest_id\x18\x04 \x01(\t\"i\n\x05Image\x12\x0b\n\x03url\x18\x01 \x01(\t\x12\x0f\n\x07\x63\x61ption\x18\x02 \x01(\t\x12\x10\n\x08ocr_text\x18\x03 \x01(\t\x12\x14\n\x0coriginal_url\x18\x04 \x01(\t\x12\r\n\x05start\x18\x05 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x06 \x01(\x05\"c\n\x05\x43hunk\x12\x0f\n\x07\x63ontent\x18\x01 \x01(\t\x12\x0b\n\x03seq\x18\x02 \x01(\x05\x12\r\n\x05start\x18\x03 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x04 \x01(\x05\x12 \n\x06images\x18\x05 \x03(\x0b\x32\x10.docreader.Image\"?\n\x0cReadResponse\x12 \n\x06\x63hunks\x18\x01 \x03(\x0b\x32\x10.docreader.Chunk\x12\r\n\x05\x65rror\x18\x02 \x01(\t*G\n\x0fStorageProvider\x12 \n\x1cSTORAGE_PROVIDER_UNSPECIFIED\x10\x00\x12\x07\n\x03\x43OS\x10\x01\x12\t\n\x05MINIO\x10\x02\x32\x9f\x01\n\tDocReader\x12I\n\x0cReadFromFile\x12\x1e.docreader.ReadFromFileRequest\x1a\x17.docreader.ReadResponse\"\x00\x12G\n\x0bReadFromURL\x12\x1d.docreader.ReadFromURLRequest\x1a\x17.docreader.ReadResponse\"\x00\x42\x35Z3github.com/Tencent/WeKnora/internal/docreader/protob\x06proto3')
|
||||
|
||||
_globals = globals()
|
||||
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
|
||||
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'docreader_pb2', _globals)
|
||||
if not _descriptor._USE_C_DESCRIPTORS:
|
||||
_globals['DESCRIPTOR']._loaded_options = None
|
||||
_globals['DESCRIPTOR']._serialized_options = b'Z3github.com/Tencent/WeKnora/internal/docreader/proto'
|
||||
_globals['_STORAGEPROVIDER']._serialized_start=1042
|
||||
_globals['_STORAGEPROVIDER']._serialized_end=1113
|
||||
_globals['_STORAGECONFIG']._serialized_start=31
|
||||
_globals['_STORAGECONFIG']._serialized_end=216
|
||||
_globals['_VLMCONFIG']._serialized_start=218
|
||||
_globals['_VLMCONFIG']._serialized_end=308
|
||||
_globals['_READCONFIG']._serialized_start=311
|
||||
_globals['_READCONFIG']._serialized_end=505
|
||||
_globals['_READFROMFILEREQUEST']._serialized_start=508
|
||||
_globals['_READFROMFILEREQUEST']._serialized_end=653
|
||||
_globals['_READFROMURLREQUEST']._serialized_start=655
|
||||
_globals['_READFROMURLREQUEST']._serialized_end=767
|
||||
_globals['_IMAGE']._serialized_start=769
|
||||
_globals['_IMAGE']._serialized_end=874
|
||||
_globals['_CHUNK']._serialized_start=876
|
||||
_globals['_CHUNK']._serialized_end=975
|
||||
_globals['_READRESPONSE']._serialized_start=977
|
||||
_globals['_READRESPONSE']._serialized_end=1040
|
||||
_globals['_DOCREADER']._serialized_start=1116
|
||||
_globals['_DOCREADER']._serialized_end=1275
|
||||
# @@protoc_insertion_point(module_scope)
|
||||
127
docreader/proto/docreader_pb2.pyi
Normal file
@@ -0,0 +1,127 @@
|
||||
from google.protobuf.internal import containers as _containers
|
||||
from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import message as _message
|
||||
from collections.abc import Iterable as _Iterable, Mapping as _Mapping
|
||||
from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
|
||||
|
||||
DESCRIPTOR: _descriptor.FileDescriptor
|
||||
|
||||
class StorageProvider(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
|
||||
__slots__ = ()
|
||||
STORAGE_PROVIDER_UNSPECIFIED: _ClassVar[StorageProvider]
|
||||
COS: _ClassVar[StorageProvider]
|
||||
MINIO: _ClassVar[StorageProvider]
|
||||
STORAGE_PROVIDER_UNSPECIFIED: StorageProvider
|
||||
COS: StorageProvider
|
||||
MINIO: StorageProvider
|
||||
|
||||
class StorageConfig(_message.Message):
|
||||
__slots__ = ("provider", "region", "bucket_name", "access_key_id", "secret_access_key", "app_id", "path_prefix")
|
||||
PROVIDER_FIELD_NUMBER: _ClassVar[int]
|
||||
REGION_FIELD_NUMBER: _ClassVar[int]
|
||||
BUCKET_NAME_FIELD_NUMBER: _ClassVar[int]
|
||||
ACCESS_KEY_ID_FIELD_NUMBER: _ClassVar[int]
|
||||
SECRET_ACCESS_KEY_FIELD_NUMBER: _ClassVar[int]
|
||||
APP_ID_FIELD_NUMBER: _ClassVar[int]
|
||||
PATH_PREFIX_FIELD_NUMBER: _ClassVar[int]
|
||||
provider: StorageProvider
|
||||
region: str
|
||||
bucket_name: str
|
||||
access_key_id: str
|
||||
secret_access_key: str
|
||||
app_id: str
|
||||
path_prefix: str
|
||||
def __init__(self, provider: _Optional[_Union[StorageProvider, str]] = ..., region: _Optional[str] = ..., bucket_name: _Optional[str] = ..., access_key_id: _Optional[str] = ..., secret_access_key: _Optional[str] = ..., app_id: _Optional[str] = ..., path_prefix: _Optional[str] = ...) -> None: ...
|
||||
|
||||
class VLMConfig(_message.Message):
|
||||
__slots__ = ("model_name", "base_url", "api_key", "interface_type")
|
||||
MODEL_NAME_FIELD_NUMBER: _ClassVar[int]
|
||||
BASE_URL_FIELD_NUMBER: _ClassVar[int]
|
||||
API_KEY_FIELD_NUMBER: _ClassVar[int]
|
||||
INTERFACE_TYPE_FIELD_NUMBER: _ClassVar[int]
|
||||
model_name: str
|
||||
base_url: str
|
||||
api_key: str
|
||||
interface_type: str
|
||||
def __init__(self, model_name: _Optional[str] = ..., base_url: _Optional[str] = ..., api_key: _Optional[str] = ..., interface_type: _Optional[str] = ...) -> None: ...
|
||||
|
||||
class ReadConfig(_message.Message):
|
||||
__slots__ = ("chunk_size", "chunk_overlap", "separators", "enable_multimodal", "storage_config", "vlm_config")
|
||||
CHUNK_SIZE_FIELD_NUMBER: _ClassVar[int]
|
||||
CHUNK_OVERLAP_FIELD_NUMBER: _ClassVar[int]
|
||||
SEPARATORS_FIELD_NUMBER: _ClassVar[int]
|
||||
ENABLE_MULTIMODAL_FIELD_NUMBER: _ClassVar[int]
|
||||
STORAGE_CONFIG_FIELD_NUMBER: _ClassVar[int]
|
||||
VLM_CONFIG_FIELD_NUMBER: _ClassVar[int]
|
||||
chunk_size: int
|
||||
chunk_overlap: int
|
||||
separators: _containers.RepeatedScalarFieldContainer[str]
|
||||
enable_multimodal: bool
|
||||
storage_config: StorageConfig
|
||||
vlm_config: VLMConfig
|
||||
def __init__(self, chunk_size: _Optional[int] = ..., chunk_overlap: _Optional[int] = ..., separators: _Optional[_Iterable[str]] = ..., enable_multimodal: bool = ..., storage_config: _Optional[_Union[StorageConfig, _Mapping]] = ..., vlm_config: _Optional[_Union[VLMConfig, _Mapping]] = ...) -> None: ...
|
||||
|
||||
class ReadFromFileRequest(_message.Message):
|
||||
__slots__ = ("file_content", "file_name", "file_type", "read_config", "request_id")
|
||||
FILE_CONTENT_FIELD_NUMBER: _ClassVar[int]
|
||||
FILE_NAME_FIELD_NUMBER: _ClassVar[int]
|
||||
FILE_TYPE_FIELD_NUMBER: _ClassVar[int]
|
||||
READ_CONFIG_FIELD_NUMBER: _ClassVar[int]
|
||||
REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
|
||||
file_content: bytes
|
||||
file_name: str
|
||||
file_type: str
|
||||
read_config: ReadConfig
|
||||
request_id: str
|
||||
def __init__(self, file_content: _Optional[bytes] = ..., file_name: _Optional[str] = ..., file_type: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ...
|
||||
|
||||
class ReadFromURLRequest(_message.Message):
|
||||
__slots__ = ("url", "title", "read_config", "request_id")
|
||||
URL_FIELD_NUMBER: _ClassVar[int]
|
||||
TITLE_FIELD_NUMBER: _ClassVar[int]
|
||||
READ_CONFIG_FIELD_NUMBER: _ClassVar[int]
|
||||
REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
|
||||
url: str
|
||||
title: str
|
||||
read_config: ReadConfig
|
||||
request_id: str
|
||||
def __init__(self, url: _Optional[str] = ..., title: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ...
|
||||
|
||||
class Image(_message.Message):
|
||||
__slots__ = ("url", "caption", "ocr_text", "original_url", "start", "end")
|
||||
URL_FIELD_NUMBER: _ClassVar[int]
|
||||
CAPTION_FIELD_NUMBER: _ClassVar[int]
|
||||
OCR_TEXT_FIELD_NUMBER: _ClassVar[int]
|
||||
ORIGINAL_URL_FIELD_NUMBER: _ClassVar[int]
|
||||
START_FIELD_NUMBER: _ClassVar[int]
|
||||
END_FIELD_NUMBER: _ClassVar[int]
|
||||
url: str
|
||||
caption: str
|
||||
ocr_text: str
|
||||
original_url: str
|
||||
start: int
|
||||
end: int
|
||||
def __init__(self, url: _Optional[str] = ..., caption: _Optional[str] = ..., ocr_text: _Optional[str] = ..., original_url: _Optional[str] = ..., start: _Optional[int] = ..., end: _Optional[int] = ...) -> None: ...
|
||||
|
||||
class Chunk(_message.Message):
|
||||
__slots__ = ("content", "seq", "start", "end", "images")
|
||||
CONTENT_FIELD_NUMBER: _ClassVar[int]
|
||||
SEQ_FIELD_NUMBER: _ClassVar[int]
|
||||
START_FIELD_NUMBER: _ClassVar[int]
|
||||
END_FIELD_NUMBER: _ClassVar[int]
|
||||
IMAGES_FIELD_NUMBER: _ClassVar[int]
|
||||
content: str
|
||||
seq: int
|
||||
start: int
|
||||
end: int
|
||||
images: _containers.RepeatedCompositeFieldContainer[Image]
|
||||
def __init__(self, content: _Optional[str] = ..., seq: _Optional[int] = ..., start: _Optional[int] = ..., end: _Optional[int] = ..., images: _Optional[_Iterable[_Union[Image, _Mapping]]] = ...) -> None: ...
|
||||
|
||||
class ReadResponse(_message.Message):
|
||||
__slots__ = ("chunks", "error")
|
||||
CHUNKS_FIELD_NUMBER: _ClassVar[int]
|
||||
ERROR_FIELD_NUMBER: _ClassVar[int]
|
||||
chunks: _containers.RepeatedCompositeFieldContainer[Chunk]
|
||||
error: str
|
||||
def __init__(self, chunks: _Optional[_Iterable[_Union[Chunk, _Mapping]]] = ..., error: _Optional[str] = ...) -> None: ...
|
||||
@@ -5,7 +5,7 @@ import warnings
|
||||
|
||||
import docreader_pb2 as docreader__pb2
|
||||
|
||||
GRPC_GENERATED_VERSION = '1.74.0'
|
||||
GRPC_GENERATED_VERSION = '1.76.0'
|
||||
GRPC_VERSION = grpc.__version__
|
||||
_version_not_supported = False
|
||||
|
||||
@@ -18,7 +18,7 @@ except ImportError:
|
||||
if _version_not_supported:
|
||||
raise RuntimeError(
|
||||
f'The grpc package installed is at version {GRPC_VERSION},'
|
||||
+ f' but the generated code in docreader_pb2_grpc.py depends on'
|
||||
+ ' but the generated code in docreader_pb2_grpc.py depends on'
|
||||
+ f' grpcio>={GRPC_GENERATED_VERSION}.'
|
||||
+ f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
|
||||
+ f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
|
||||
38
docreader/pyproject.toml
Normal file
@@ -0,0 +1,38 @@
[project]
name = "docreader"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10.18"
dependencies = [
    "antiword>=0.1.0",
    "asyncio>=4.0.0",
    "beautifulsoup4>=4.14.2",
    "cos-python-sdk-v5>=1.9.38",
    "goose3[all]>=3.1.20",
    "grpcio>=1.76.0",
    "grpcio-health-checking>=1.76.0",
    "grpcio-tools>=1.76.0",
    "lxml>=6.0.2",
    "markdown>=3.10",
    "markdownify>=1.2.0",
    "markitdown[docx,pdf,xls,xlsx]>=0.1.3",
    "minio>=7.2.18",
    "mistletoe>=1.5.0",
    "ollama>=0.6.0",
    "openai>=2.7.1",
    "paddleocr>=2.10.0,<3.0.0",
    "paddlepaddle>=3.0.0,<4.0.0",
    "pdfplumber>=0.11.7",
    "pillow>=12.0.0",
    "playwright>=1.55.0",
    "protobuf>=6.33.0",
    "pydantic>=2.12.3",
    "pypdf>=6.1.3",
    "pypdf2>=3.0.1",
    "python-docx>=1.2.0",
    "requests>=2.32.5",
    "textract==1.5.0",
    "trafilatura>=2.0.0",
    "urllib3>=2.5.0",
]
70
docreader/scripts/download_deps.py
Normal file
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import sys
|
||||
import os
|
||||
import logging
|
||||
from paddleocr import PaddleOCR
|
||||
|
||||
# 添加当前目录到Python路径
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
if current_dir not in sys.path:
|
||||
sys.path.append(current_dir)
|
||||
|
||||
# 导入ImageParser
|
||||
from parser.image_parser import ImageParser
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def init_ocr_model():
|
||||
"""Initialize PaddleOCR model to pre-download and cache models"""
|
||||
try:
|
||||
logger.info("Initializing PaddleOCR model for pre-download...")
|
||||
|
||||
# 使用与代码中相同的配置
|
||||
ocr_config = {
|
||||
"use_gpu": False,
|
||||
"text_det_limit_type": "max",
|
||||
"text_det_limit_side_len": 960,
|
||||
"use_doc_orientation_classify": True, # 启用文档方向分类
|
||||
"use_doc_unwarping": False,
|
||||
"use_textline_orientation": True, # 启用文本行方向检测
|
||||
"text_recognition_model_name": "PP-OCRv4_server_rec",
|
||||
"text_detection_model_name": "PP-OCRv4_server_det",
|
||||
"text_det_thresh": 0.3,
|
||||
"text_det_box_thresh": 0.6,
|
||||
"text_det_unclip_ratio": 1.5,
|
||||
"text_rec_score_thresh": 0.0,
|
||||
"ocr_version": "PP-OCRv4",
|
||||
"lang": "ch",
|
||||
"show_log": False,
|
||||
"use_dilation": True,
|
||||
"det_db_score_mode": "slow",
|
||||
}
|
||||
|
||||
# 初始化PaddleOCR,这会触发模型下载和缓存
|
||||
ocr = PaddleOCR(**ocr_config)
|
||||
logger.info("PaddleOCR model initialization completed successfully")
|
||||
|
||||
# 测试OCR功能以确保模型正常工作
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
# 创建一个简单的测试图像
|
||||
test_image = np.ones((100, 300, 3), dtype=np.uint8) * 255
|
||||
test_pil = Image.fromarray(test_image)
|
||||
|
||||
# 执行一次OCR测试
|
||||
result = ocr.ocr(np.array(test_pil), cls=False)
|
||||
logger.info("PaddleOCR test completed successfully")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize PaddleOCR model: {str(e)}")
|
||||
raise
|
||||
@@ -2,13 +2,14 @@
|
||||
set -x
|
||||
|
||||
# 设置目录
|
||||
PROTO_DIR="src/proto"
|
||||
PYTHON_OUT="src/proto"
|
||||
GO_OUT="src/proto"
|
||||
PROTO_DIR="docreader/proto"
|
||||
PYTHON_OUT="docreader/proto"
|
||||
GO_OUT="docreader/proto"
|
||||
|
||||
# 生成Python代码
|
||||
python3 -m grpc_tools.protoc -I${PROTO_DIR} \
|
||||
--python_out=${PYTHON_OUT} \
|
||||
--pyi_out=${PYTHON_OUT} \
|
||||
--grpc_python_out=${PYTHON_OUT} \
|
||||
${PROTO_DIR}/docreader.proto
|
||||
|
||||
@@ -22,10 +23,10 @@ protoc -I${PROTO_DIR} --go_out=${GO_OUT} \
|
||||
# 修复Python导入问题(MacOS兼容版本)
|
||||
if [ "$(uname)" == "Darwin" ]; then
|
||||
# MacOS版本
|
||||
sed -i '' 's/import docreader_pb2/from . import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
|
||||
sed -i '' 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
|
||||
else
|
||||
# Linux版本
|
||||
sed -i 's/import docreader_pb2/from . import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
|
||||
sed -i 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
|
||||
fi
|
||||
|
||||
echo "Proto files generated successfully!"
|
||||
112
docreader/splitter/header_hook.py
Normal file
@@ -0,0 +1,112 @@
|
||||
import re
|
||||
from typing import Callable, Dict, List, Match, Pattern, Union
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class HeaderTrackerHook(BaseModel):
|
||||
"""表头追踪Hook的配置类,支持多种场景的表头识别"""
|
||||
|
||||
start_pattern: Pattern[str] = Field(
|
||||
description="表头开始匹配(正则表达式或字符串)"
|
||||
)
|
||||
end_pattern: Pattern[str] = Field(description="表头结束匹配(正则表达式或字符串)")
|
||||
extract_header_fn: Callable[[Match[str]], str] = Field(
|
||||
default=lambda m: m.group(0),
|
||||
description="从开始匹配结果中提取表头内容的函数(默认取匹配到的整个内容)",
|
||||
)
|
||||
priority: int = Field(default=0, description="优先级(多个配置时,高优先级先匹配)")
|
||||
case_sensitive: bool = Field(
|
||||
default=True, description="是否大小写敏感(仅当传入字符串pattern时生效)"
|
||||
)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
start_pattern: Union[str, Pattern[str]],
|
||||
end_pattern: Union[str, Pattern[str]],
|
||||
**kwargs,
|
||||
):
|
||||
flags = 0 if kwargs.get("case_sensitive", True) else re.IGNORECASE
|
||||
if isinstance(start_pattern, str):
|
||||
start_pattern = re.compile(start_pattern, flags | re.DOTALL)
|
||||
if isinstance(end_pattern, str):
|
||||
end_pattern = re.compile(end_pattern, flags | re.DOTALL)
|
||||
super().__init__(
|
||||
start_pattern=start_pattern,
|
||||
end_pattern=end_pattern,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
# 初始化表头Hook配置(提供默认配置:支持Markdown表格、代码块)
|
||||
DEFAULT_CONFIGS = [
|
||||
# 代码块配置(```开头,```结尾)
|
||||
# HeaderTrackerHook(
|
||||
# # 代码块开始(支持语言指定)
|
||||
# start_pattern=r"^\s*```(\w+).*(?!```$)",
|
||||
# # 代码块结束
|
||||
# end_pattern=r"^\s*```.*$",
|
||||
# extract_header_fn=lambda m: f"```{m.group(1)}" if m.group(1) else "```",
|
||||
# priority=20, # 代码块优先级高于表格
|
||||
# case_sensitive=True,
|
||||
# ),
|
||||
# Markdown表格配置(表头带下划线)
|
||||
HeaderTrackerHook(
|
||||
# 表头行 + 分隔行
|
||||
start_pattern=r"^\s*(?:\|[^|\n]*)+[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|?[\r\n]+$",
|
||||
# 空行或非表格内容
|
||||
end_pattern=r"^\s*$|^\s*[^|\s].*$",
|
||||
priority=15,
|
||||
case_sensitive=False,
|
||||
),
|
||||
]
|
||||
DEFAULT_CONFIGS.sort(key=lambda x: -x.priority)
|
||||
|
||||
|
||||
# 定义Hook状态数据结构
|
||||
class HeaderTracker(BaseModel):
|
||||
"""表头追踪 Hook 的状态类"""
|
||||
|
||||
header_hook_configs: List[HeaderTrackerHook] = Field(default=DEFAULT_CONFIGS)
|
||||
active_headers: Dict[int, str] = Field(default_factory=dict)
|
||||
ended_headers: set[int] = Field(default_factory=set)
|
||||
|
||||
def update(self, split: str) -> Dict[int, str]:
|
||||
"""检测当前split中的表头开始/结束,更新Hook状态"""
|
||||
new_headers: Dict[int, str] = {}
|
||||
|
||||
# 1. 检查是否有表头结束标记
|
||||
for config in self.header_hook_configs:
|
||||
if config.priority in self.active_headers and config.end_pattern.search(
|
||||
split
|
||||
):
|
||||
self.ended_headers.add(config.priority)
|
||||
del self.active_headers[config.priority]
|
||||
|
||||
# 2. 检查是否有新的表头开始标记(只处理未活跃且未结束的)
|
||||
for config in self.header_hook_configs:
|
||||
if (
|
||||
config.priority not in self.active_headers
|
||||
and config.priority not in self.ended_headers
|
||||
):
|
||||
match = config.start_pattern.search(split)
|
||||
if match:
|
||||
header = config.extract_header_fn(match)
|
||||
self.active_headers[config.priority] = header
|
||||
new_headers[config.priority] = header
|
||||
|
||||
# 3. 检查是否所有活跃表头都已结束(清空结束标记)
|
||||
if not self.active_headers:
|
||||
self.ended_headers.clear()
|
||||
|
||||
return new_headers
|
||||
|
||||
def get_headers(self) -> str:
|
||||
"""获取当前所有活跃表头的拼接文本(按优先级排序)"""
|
||||
# 按优先级降序排列表头
|
||||
sorted_headers = sorted(self.active_headers.items(), key=lambda x: -x[0])
|
||||
return (
|
||||
"\n".join([header for _, header in sorted_headers])
|
||||
if sorted_headers
|
||||
else ""
|
||||
)
|
||||
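下面是 `HeaderTracker` 的一个最小使用示意(仅为示意草图,假设输入已经被切成若干 split,用于说明 Markdown 表格表头如何跨块追踪):

```python
# 示意用法:表头在表格持续期间保持活跃,遇到空行/非表格内容后结束
from docreader.splitter.header_hook import HeaderTracker

tracker = HeaderTracker()
splits = [
    "| 姓名 | 年龄 |\n|------|------|\n",  # 命中表头开始模式
    "| 张三 | 25 |\n",                      # 表头仍处于活跃状态,可被补进新块
    "\n这是表格后的普通文本。",              # 命中结束模式,表头失效
]
for s in splits:
    tracker.update(s)
    print(repr(tracker.get_headers()))
```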
311
docreader/splitter/splitter.py
Normal file
@@ -0,0 +1,311 @@
|
||||
"""Token splitter."""
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
import re
|
||||
from typing import Callable, Generic, List, Pattern, Tuple, TypeVar
|
||||
|
||||
from pydantic import BaseModel, Field, PrivateAttr
|
||||
|
||||
from docreader.splitter.header_hook import (
|
||||
HeaderTracker,
|
||||
)
|
||||
from docreader.utils.split import split_by_char, split_by_sep
|
||||
|
||||
DEFAULT_CHUNK_OVERLAP = 100
|
||||
DEFAULT_CHUNK_SIZE = 512
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextSplitter(BaseModel, Generic[T]):
|
||||
chunk_size: int = Field(description="The token chunk size for each chunk.")
|
||||
chunk_overlap: int = Field(
|
||||
description="The token overlap of each chunk when splitting."
|
||||
)
|
||||
separators: List[str] = Field(
|
||||
description="Default separators for splitting into words"
|
||||
)
|
||||
|
||||
# Try to keep the matched characters as a whole.
|
||||
# If it's too long, the content will be further segmented.
|
||||
protected_regex: List[str] = Field(
|
||||
description="Protected regex for splitting into words"
|
||||
)
|
||||
len_function: Callable[[str], int] = Field(description="The length function.")
|
||||
# Header tracking Hook related attributes
|
||||
header_hook: HeaderTracker = Field(default_factory=HeaderTracker, exclude=True)
|
||||
|
||||
_protected_fns: List[Pattern] = PrivateAttr()
|
||||
_split_fns: List[Callable] = PrivateAttr()
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
chunk_size: int = DEFAULT_CHUNK_SIZE,
|
||||
chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
|
||||
separators: List[str] = ["\n", "。", " "],
|
||||
protected_regex: List[str] = [
|
||||
# math formula
|
||||
r"\$\$[\s\S]*?\$\$",
|
||||
# image
|
||||
r"!\[.*?\]\(.*?\)",
|
||||
# link
|
||||
r"\[.*?\]\(.*?\)",
|
||||
# table header
|
||||
r"(?:\|[^|\n]*)+\|[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|[\r\n]+",
|
||||
# table body
|
||||
r"(?:\|[^|\n]*)+\|[\r\n]+",
|
||||
# code header
|
||||
r"```(?:\w+)[\r\n]+[^\r\n]*",
|
||||
],
|
||||
length_function: Callable[[str], int] = lambda x: len(x),
|
||||
):
|
||||
"""Initialize with parameters."""
|
||||
if chunk_overlap > chunk_size:
|
||||
raise ValueError(
|
||||
f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
|
||||
f"({chunk_size}), should be smaller."
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
separators=separators,
|
||||
protected_regex=protected_regex,
|
||||
len_function=length_function,
|
||||
)
|
||||
self._protected_fns = [re.compile(reg) for reg in protected_regex]
|
||||
self._split_fns = [split_by_sep(sep) for sep in separators] + [split_by_char()]
|
||||
|
||||
def split_text(self, text: str) -> List[Tuple[int, int, str]]:
|
||||
"""Split text into chunks."""
|
||||
if text == "":
|
||||
return []
|
||||
|
||||
splits = self._split(text)
|
||||
protect = self._split_protected(text)
|
||||
splits = self._join(splits, protect)
|
||||
|
||||
assert "".join(splits) == text
|
||||
|
||||
chunks = self._merge(splits)
|
||||
return chunks
|
||||
|
||||
def _split(self, text: str) -> List[str]:
|
||||
"""Break text into splits that are smaller than chunk size.
|
||||
|
||||
NOTE: the splits contain the separators.
|
||||
"""
|
||||
if self.len_function(text) <= self.chunk_size:
|
||||
return [text]
|
||||
|
||||
splits = []
|
||||
for split_fn in self._split_fns:
|
||||
splits = split_fn(text)
|
||||
if len(splits) > 1:
|
||||
break
|
||||
|
||||
new_splits = []
|
||||
for split in splits:
|
||||
split_len = self.len_function(split)
|
||||
if split_len <= self.chunk_size:
|
||||
new_splits.append(split)
|
||||
else:
|
||||
# recursively split
|
||||
new_splits.extend(self._split(split))
|
||||
return new_splits
|
||||
|
||||
def _merge(self, splits: List[str]) -> List[Tuple[int, int, str]]:
|
||||
"""Merge splits into chunks.
|
||||
|
||||
The high-level idea is to keep adding splits to a chunk until we
|
||||
exceed the chunk size, then we start a new chunk with overlap.
|
||||
|
||||
When we start a new chunk, we pop off the first element of the previous
|
||||
chunk until the total length is less than the chunk size.
|
||||
"""
|
||||
chunks: List[Tuple[int, int, str]] = []
|
||||
|
||||
cur_chunk: List[Tuple[int, int, str]] = []
|
||||
|
||||
cur_headers, cur_len = "", 0
|
||||
cur_start, cur_end = 0, 0
|
||||
for split in splits:
|
||||
cur_end = cur_start + len(split)
|
||||
split_len = self.len_function(split)
|
||||
if split_len > self.chunk_size:
|
||||
logger.error(
f"Got a split of size {split_len}, "
f"larger than chunk size {self.chunk_size}."
)
|
||||
|
||||
self.header_hook.update(split)
|
||||
cur_headers = self.header_hook.get_headers()
|
||||
cur_headers_len = self.len_function(cur_headers)
|
||||
|
||||
if cur_headers_len > self.chunk_size:
|
||||
logger.error(
f"Got headers of size {cur_headers_len}, "
f"larger than chunk size {self.chunk_size}."
)
|
||||
cur_headers, cur_headers_len = "", 0
|
||||
|
||||
# if we exceed the chunk size after adding the new split, then
|
||||
# we need to end the current chunk and start a new one
|
||||
if cur_len + split_len + cur_headers_len > self.chunk_size:
|
||||
# end the previous chunk
|
||||
if len(cur_chunk) > 0:
|
||||
chunks.append(
|
||||
(
|
||||
cur_chunk[0][0],
|
||||
cur_chunk[-1][1],
|
||||
"".join([c[2] for c in cur_chunk]),
|
||||
)
|
||||
)
|
||||
|
||||
# start a new chunk with overlap
|
||||
# keep popping off the first element of the previous chunk until:
|
||||
# 1. the current chunk length is less than chunk overlap
|
||||
# 2. the total length is less than chunk size
|
||||
while cur_chunk and (
|
||||
cur_len > self.chunk_overlap
|
||||
or cur_len + split_len + cur_headers_len > self.chunk_size
|
||||
):
|
||||
# pop off the first element
|
||||
first_chunk = cur_chunk.pop(0)
|
||||
cur_len -= self.len_function(first_chunk[2])
|
||||
|
||||
if (
|
||||
cur_headers
|
||||
and split_len + cur_headers_len < self.chunk_size
|
||||
and cur_headers not in split
|
||||
):
|
||||
cur_chunk.insert(
|
||||
0,
|
||||
(
|
||||
cur_chunk[0][0] if cur_chunk else cur_start,
|
||||
cur_chunk[0][1] if cur_chunk else cur_end,
|
||||
cur_headers,
|
||||
),
|
||||
)
|
||||
cur_len += cur_headers_len
|
||||
|
||||
cur_chunk.append((cur_start, cur_end, split))
|
||||
cur_len += split_len
|
||||
cur_start = cur_end
|
||||
|
||||
# handle the last chunk
|
||||
assert cur_chunk
|
||||
chunks.append(
|
||||
(
|
||||
cur_chunk[0][0],
|
||||
cur_chunk[-1][1],
|
||||
"".join([c[2] for c in cur_chunk]),
|
||||
)
|
||||
)
|
||||
|
||||
return chunks
|
||||
|
||||
def _split_protected(self, text: str) -> List[Tuple[int, str]]:
|
||||
matches = [
|
||||
(match.start(), match.end())
|
||||
for pattern in self._protected_fns
|
||||
for match in pattern.finditer(text)
|
||||
]
|
||||
matches.sort(key=lambda x: (x[0], -x[1]))
|
||||
|
||||
res = []
|
||||
|
||||
def fold(initial: int, current: Tuple[int, int]) -> int:
|
||||
if current[0] >= initial:
|
||||
if current[1] - current[0] < self.chunk_size:
|
||||
res.append((current[0], text[current[0] : current[1]]))
|
||||
else:
|
||||
logger.warning(f"Protected text ignore: {current}")
|
||||
return max(initial, current[1])
|
||||
|
||||
# filter overlapping matches
|
||||
list(itertools.accumulate(matches, fold, initial=-1))
|
||||
return res
|
||||
|
||||
def _join(self, splits: List[str], protect: List[Tuple[int, str]]) -> List[str]:
|
||||
"""
|
||||
Merges and splits elements in splits array based on protected substrings.
|
||||
|
||||
The function processes the input splits to ensure all protected substrings
|
||||
remain as single items. If a protected substring is concatenated with preceding
|
||||
or following content in any split element, it will be separated from
|
||||
the adjacent content. The final result maintains the original order of content
|
||||
while enforcing the integrity of protected substrings.
|
||||
|
||||
Key behaviors:
|
||||
1. Preserves the complete structure of each protected substring
|
||||
2. Separates protected substrings from any adjacent non-protected content
|
||||
3. Maintains the original sequence of all content except for necessary
|
||||
4. Handles cases where protected substrings are partially concatenated
|
||||
"""
|
||||
j = 0
|
||||
point, start = 0, 0
|
||||
res = []
|
||||
|
||||
for split in splits:
|
||||
end = start + len(split)
|
||||
|
||||
cur = split[point - start :]
|
||||
while j < len(protect):
|
||||
p_start, p_content = protect[j]
|
||||
p_end = p_start + len(p_content)
|
||||
|
||||
if end <= p_start:
|
||||
break
|
||||
|
||||
if point < p_start:
|
||||
local_end = p_start - point
|
||||
res.append(cur[:local_end])
|
||||
cur = cur[local_end:]
|
||||
point = p_start
|
||||
|
||||
res.append(p_content)
|
||||
j += 1
|
||||
|
||||
if point < p_end:
|
||||
local_start = p_end - point
|
||||
cur = cur[local_start:]
|
||||
point = p_end
|
||||
|
||||
if not cur:
|
||||
break
|
||||
|
||||
if cur:
|
||||
res.append(cur)
|
||||
point = end
|
||||
|
||||
start = end
|
||||
return res
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
s = """
|
||||
这是一些普通文本。
|
||||
|
||||
| 姓名 | 年龄 | 城市 |
|
||||
|------|------|------|
|
||||
| 张三 | 25 | 北京 |
|
||||
| 李四 | 30 | 上海 |
|
||||
| 王五 | 28 | 广州 |
|
||||
| 张三 | 25 | 北京 |
|
||||
| 李四 | 30 | 上海 |
|
||||
| 王五 | 28 | 广州 |
|
||||
|
||||
这是文本结束。
|
||||
|
||||
"""
|
||||
|
||||
sp = TextSplitter(chunk_size=200, chunk_overlap=2)
|
||||
ck = sp.split_text(s)
|
||||
for c in ck:
|
||||
print("------", len(c))
|
||||
print(c)
|
||||
pass
|
||||
|
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.8 KiB |
103
docreader/utils/endecode.py
Normal file
@@ -0,0 +1,103 @@
|
||||
import base64
|
||||
import binascii
|
||||
import io
|
||||
import logging
|
||||
from typing import List, Union
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def decode_image(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
|
||||
"""Convert image to base64 encoded string
|
||||
|
||||
Args:
|
||||
image: Image file path, bytes, PIL Image object, or numpy array
|
||||
|
||||
Returns:
|
||||
Base64 encoded image string, or empty string if conversion fails
|
||||
"""
|
||||
if isinstance(image, str):
|
||||
# It's a file path
|
||||
with open(image, "rb") as image_file:
|
||||
return base64.b64encode(image_file.read()).decode()
|
||||
|
||||
elif isinstance(image, bytes):
|
||||
# It's bytes data
|
||||
return base64.b64encode(image).decode()
|
||||
|
||||
elif isinstance(image, Image.Image):
|
||||
# It's a PIL Image
|
||||
buffer = io.BytesIO()
|
||||
image.save(buffer, format=image.format)
|
||||
return base64.b64encode(buffer.getvalue()).decode()
|
||||
|
||||
elif isinstance(image, np.ndarray):
|
||||
# It's a numpy array
|
||||
pil_image = Image.fromarray(image)
|
||||
buffer = io.BytesIO()
|
||||
pil_image.save(buffer, format="PNG")
|
||||
return base64.b64encode(buffer.getvalue()).decode()
|
||||
|
||||
raise ValueError(f"Unsupported image type: {type(image)}")
|
||||
|
||||
|
||||
def encode_image(image: str, errors="strict") -> bytes:
|
||||
"""
|
||||
Decode image bytes using base64.
|
||||
|
||||
errors
|
||||
The error handling scheme to use for the handling of decoding errors.
|
||||
The default is 'strict' meaning that decoding errors raise a
|
||||
UnicodeDecodeError. Other possible values are 'ignore' and '????'
|
||||
as well as any other name registered with codecs.register_error that
|
||||
can handle UnicodeDecodeErrors.
|
||||
"""
|
||||
try:
|
||||
image_bytes = base64.b64decode(image)
|
||||
except binascii.Error as e:
|
||||
if errors == "ignore":
|
||||
return b""
|
||||
else:
|
||||
raise e
|
||||
return image_bytes
|
||||
|
||||
|
||||
def encode_bytes(content: str) -> bytes:
|
||||
return content.encode()
|
||||
|
||||
|
||||
def decode_bytes(
|
||||
content: bytes,
|
||||
encodings: List[str] = [
|
||||
"utf-8",
|
||||
"gb18030",
|
||||
"gb2312",
|
||||
"gbk",
|
||||
"big5",
|
||||
"ascii",
|
||||
"latin-1",
|
||||
],
|
||||
) -> str:
|
||||
# Try decoding with each encoding format
|
||||
for encoding in encodings:
|
||||
try:
|
||||
text = content.decode(encoding)
|
||||
logger.debug(f"Decode content with {encoding}: {len(text)} characters")
|
||||
return text
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
|
||||
text = content.decode(encoding="latin-1", errors="replace")
|
||||
logger.warning(
|
||||
"Unable to determine correct encoding, using latin-1 as fallback. "
|
||||
"This may cause character issues."
|
||||
)
|
||||
return text
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
img = "testtest"
|
||||
encode_image(img, errors="ignore")
|
||||
@@ -1,10 +1,10 @@
|
||||
from contextvars import ContextVar
|
||||
import logging
|
||||
import uuid
|
||||
import contextlib
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
import uuid
|
||||
from contextvars import ContextVar
|
||||
from logging import LogRecord
|
||||
from typing import Optional
|
||||
|
||||
# 配置日志
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -26,21 +26,21 @@ def get_request_id() -> Optional[str]:
|
||||
|
||||
class MillisecondFormatter(logging.Formatter):
|
||||
"""自定义日志格式化器,只显示毫秒级时间戳(3位数字)而不是微秒(6位)"""
|
||||
|
||||
|
||||
def formatTime(self, record, datefmt=None):
|
||||
"""重写formatTime方法,将微秒格式化为毫秒"""
|
||||
# 先获取标准的格式化时间
|
||||
result = super().formatTime(record, datefmt)
|
||||
|
||||
|
||||
# 如果使用了包含.%f的格式,则将微秒(6位)截断为毫秒(3位)
|
||||
if datefmt and ".%f" in datefmt:
|
||||
# 格式化的时间字符串应该在最后有6位微秒数
|
||||
parts = result.split('.')
|
||||
parts = result.split(".")
|
||||
if len(parts) > 1 and len(parts[1]) >= 6:
|
||||
# 只保留前3位作为毫秒
|
||||
millis = parts[1][:3]
|
||||
result = f"{parts[0]}.{millis}"
|
||||
|
||||
|
||||
return result
|
||||
|
||||
|
||||
34
docreader/utils/split.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import re
|
||||
from typing import Callable, List
|
||||
|
||||
|
||||
def split_text_keep_separator(text: str, separator: str) -> List[str]:
|
||||
"""Split text with separator and keep the separator at the end of each split."""
|
||||
parts = text.split(separator)
|
||||
result = [separator + s if i > 0 else s for i, s in enumerate(parts)]
|
||||
return [s for s in result if s]
|
||||
|
||||
|
||||
def split_by_sep(sep: str, keep_sep: bool = True) -> Callable[[str], List[str]]:
|
||||
"""Split text by separator."""
|
||||
if keep_sep:
|
||||
return lambda text: split_text_keep_separator(text, sep)
|
||||
else:
|
||||
return lambda text: text.split(sep)
|
||||
|
||||
|
||||
def split_by_char() -> Callable[[str], List[str]]:
|
||||
"""Split text by character."""
|
||||
return lambda text: list(text)
|
||||
|
||||
|
||||
def split_by_regex(regex: str) -> Callable[[str], List[str]]:
|
||||
"""Split text by regex."""
|
||||
pattern = re.compile(f"({regex})")
|
||||
return lambda text: list(filter(None, pattern.split(text)))
|
||||
|
||||
|
||||
def match_by_regex(regex: str) -> Callable[[str], bool]:
|
||||
"""Split text by regex."""
|
||||
pattern = re.compile(regex)
|
||||
return lambda text: bool(pattern.match(text))
|
||||
77
docreader/utils/tempfile.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TempFileContext:
|
||||
def __init__(self, file_content: bytes, suffix: str):
|
||||
"""
|
||||
Initialize the context
|
||||
:param file_content: Byte data to write to file
|
||||
:param suffix: File suffix
|
||||
"""
|
||||
self.file_content = file_content
|
||||
self.suffix = suffix
|
||||
self.file = None
|
||||
|
||||
def __enter__(self):
|
||||
"""
|
||||
Create file when entering context
|
||||
"""
|
||||
self.temp_file = tempfile.NamedTemporaryFile(suffix=self.suffix, delete=False)
|
||||
self.temp_file.write(self.file_content)
|
||||
self.temp_file.flush()
|
||||
logger.info(
|
||||
f"Saved {self.suffix} content to temporary file: {self.temp_file.name}"
|
||||
)
|
||||
return self.temp_file.name
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
"""
|
||||
Delete file when exiting context
|
||||
"""
|
||||
if self.temp_file:
|
||||
self.temp_file.close()
|
||||
if os.path.exists(self.temp_file.name):
|
||||
os.remove(self.temp_file.name)
|
||||
logger.info(f"File {self.temp_file.name} has been deleted.")
|
||||
# Return False to propagate exception (if any exception occurred)
|
||||
return False
|
||||
|
||||
|
||||
class TempDirContext:
|
||||
def __init__(self):
|
||||
"""
|
||||
Initialize the context
|
||||
"""
|
||||
self.temp_dir = None
|
||||
|
||||
def __enter__(self):
|
||||
"""
|
||||
Create directory when entering context
|
||||
"""
|
||||
self.temp_dir = tempfile.TemporaryDirectory()
|
||||
logger.info(f"Created temporary directory: {self.temp_dir.name}")
|
||||
return self.temp_dir.name
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
"""
|
||||
Delete directory when exiting context
|
||||
"""
|
||||
if self.temp_dir and os.path.exists(self.temp_dir.name):
|
||||
self.temp_dir.cleanup()
|
||||
logger.info(f"Directory {self.temp_dir.name} has been deleted.")
|
||||
# Return False to propagate exception (if any exception occurred)
|
||||
return False
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
example_bytes = b"Hello, this is a test file."
|
||||
file_name = "test_file.txt"
|
||||
|
||||
# Using with statement
|
||||
with TempFileContext(example_bytes, file_name) as temp_file:
|
||||
# File operations can be performed within the context
|
||||
print(f"Does file {file_name} exist: {os.path.exists(file_name)}")
|
||||
3740
docreader/uv.lock
generated
Normal file
50
docs/API.md
@@ -44,9 +44,7 @@ X-Request-ID: unique_request_id
|
||||
|
||||
### 获取 API Key
|
||||
|
||||
获取 API Key 有以下方式:
|
||||
|
||||
**创建租户时获取**:通过 `POST /api/v1/tenants` 接口创建新租户时,响应中会自动返回生成的 API Key。
|
||||
在 web 页面完成账户注册后,请前往账户信息页面获取您的 API Key。
|
||||
|
||||
请妥善保管您的 API Key,避免泄露。API Key 代表您的账户身份,拥有完整的 API 访问权限。
|
||||
|
||||
@@ -336,7 +334,6 @@ curl --location 'http://localhost:8080/api/v1/tenants' \
|
||||
| GET | `/knowledge-bases/:id` | 获取知识库详情 |
|
||||
| PUT | `/knowledge-bases/:id` | 更新知识库 |
|
||||
| DELETE | `/knowledge-bases/:id` | 删除知识库 |
|
||||
| GET | `/knowledge-bases/:id/hybrid-search` | 混合搜索知识库内容 |
|
||||
| POST | `/knowledge-bases/copy` | 拷贝知识库 |
|
||||
|
||||
#### POST `/knowledge-bases` - 创建知识库
|
||||
@@ -658,51 +655,6 @@ curl --location --request DELETE 'http://localhost:8080/api/v1/knowledge-bases/b
|
||||
}
|
||||
```
|
||||
|
||||
#### GET `/knowledge-bases/:id/hybrid-search` - 混合搜索知识库内容
|
||||
|
||||
**请求**:
|
||||
|
||||
```curl
|
||||
curl --location --request GET 'http://localhost:8080/api/v1/knowledge-bases/kb-00000001/hybrid-search' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'X-API-Key: sk-vQHV2NZI_LK5W7wHQvH3yGYExX8YnhaHwZipUYbiZKCYJbBQ' \
|
||||
--data '{
|
||||
"query_text": "彗星",
|
||||
"vector_threshold": 0.1,
|
||||
"keyword_threshold": 0.1,
|
||||
"match_count": 1
|
||||
}'
|
||||
```
|
||||
|
||||
**响应**:
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"id": "7d955251-3f79-4fd5-a6aa-02f81e044091",
|
||||
"content": "有几位后来xxxxx",
|
||||
"knowledge_id": "a6790b93-4700-4676-bd48-0d4804e1456b",
|
||||
"chunk_index": 3,
|
||||
"knowledge_title": "彗星.txt",
|
||||
"start_at": 2287,
|
||||
"end_at": 2760,
|
||||
"seq": 3,
|
||||
"score": 0.7402352891601821,
|
||||
"match_type": 2,
|
||||
"sub_chunk_id": null,
|
||||
"metadata": {},
|
||||
"chunk_type": "text",
|
||||
"parent_chunk_id": "",
|
||||
"image_info": "",
|
||||
"knowledge_filename": "彗星.txt",
|
||||
"knowledge_source": ""
|
||||
}
|
||||
],
|
||||
"success": true
|
||||
}
|
||||
```
|
||||
|
||||
<div align="right"><a href="#weknora-api-文档">返回顶部 ↑</a></div>
|
||||
|
||||
### 知识管理API
|
||||
|
||||
28
docs/KnowledgeGraph.md
Normal file
@@ -0,0 +1,28 @@
|
||||
# WeKnora 知识图谱
|
||||
|
||||
## 快速开始
|
||||
|
||||
- .env 配置相关环境变量
|
||||
- 启用 Neo4j: `NEO4J_ENABLE=true`
|
||||
- Neo4j URI: `NEO4J_URI=bolt://neo4j:7687`
|
||||
- Neo4j 用户名: `NEO4J_USERNAME=neo4j`
|
||||
- Neo4j 密码: `NEO4J_PASSWORD=password`
|
||||
|
||||
- 启动 Neo4j
|
||||
```bash
|
||||
docker-compose --profile neo4j up -d
|
||||
```
|
||||
|
||||
- 在知识库设置页面启用实体和关系提取,并根据提示配置相关内容
|
||||
|
||||
## 生成图谱
|
||||
|
||||
上传任意文档后,系统会自动提取实体和关系,并生成对应的知识图谱。
|
||||
|
||||

|
||||
|
||||
## 查看图谱
|
||||
|
||||
登录 `http://localhost:7474`,执行 `match (n) return (n)` 即可查看生成的知识图谱。
|
||||
|
||||
在对话时,系统会自动查询知识图谱,并获取相关知识。
|
||||
@@ -2,11 +2,7 @@
|
||||
|
||||
## 1. 如何查看日志?
|
||||
```bash
|
||||
# 查看 主服务 日志
|
||||
docker exec -it WeKnora-app tail -f /var/log/WeKnora.log
|
||||
|
||||
# 查看 文档解析模块 日志
|
||||
docker exec -it WeKnora-docreader tail -f /var/log/docreader.log
|
||||
docker compose logs -f app docreader postgres
|
||||
```
|
||||
|
||||
## 2. 如何启动和停止服务?
|
||||
|
||||
319
docs/WeKnora.md
Normal file
@@ -0,0 +1,319 @@
|
||||
## 介绍
|
||||
WeKnora 是一个可直接投入生产环境的企业级 RAG 框架,实现智能文档理解和检索功能。该系统采用模块化设计,将文档理解、向量存储、推理等功能分离。
|
||||
|
||||

|
||||
|
||||
---
|
||||
|
||||
## Pipeline
|
||||
WeKnora 处理文档需要经过多个步骤:插入 → 知识提取 → 索引 → 检索 → 生成,整个流程支持多种检索方法。
|
||||
|
||||

|
||||
|
||||
|
||||
以用户上传的一张住宿流水单pdf文件为例,详细介绍下其数据流:
|
||||
|
||||
### 1. 接收请求与初始化
|
||||
+ **请求识别**: 系统收到一个请求,并为其分配了唯一的 `request_id=Lkq0OGLYu2fV`,用于追踪整个处理流程。
|
||||
+ **租户与会话验证**:
|
||||
- 系统首先验证了租户信息(ID: 1, Name: Default Tenant)。
|
||||
- 接着开始处理一个知识库问答(Knowledge QA)请求,该请求属于会话 `1f241340-ae75-40a5-8731-9a3a82e34fdd`。
|
||||
+ **用户问题**: 用户的原始问题是:“**入住的房型是什么**”。
|
||||
+ **消息创建**: 系统为用户的提问和即将生成的回答分别创建了消息记录,ID 分别为 `703ddf09-...` 和 `6f057649-...`。
|
||||
|
||||
### 2. 知识库问答流程启动
|
||||
系统正式调用知识库问答服务,并定义了将要按顺序执行的完整处理管道(Pipeline),包含以下9个事件:
|
||||
`[rewrite_query, preprocess_query, chunk_search, chunk_rerank, chunk_merge, filter_top_k, into_chat_message, chat_completion_stream, stream_filter]`
|
||||
|
||||
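这条管道本质上是按固定顺序执行的一组事件处理器,前一个事件的输出上下文会交给下一个事件继续处理。下面用一个极简的草图说明这种组织方式(仅为示意,实际实现位于 Go 后端):

```python
# 示意:按声明顺序依次执行管道事件,共享并更新同一个上下文
from typing import Any, Callable, Dict, List

Event = Callable[[Dict[str, Any]], Dict[str, Any]]


def run_pipeline(events: List[Event], ctx: Dict[str, Any]) -> Dict[str, Any]:
    for event in events:
        ctx = event(ctx)  # 每个事件读取并更新共享上下文
    return ctx


# 事件名与日志中的管道保持一致
pipeline_names = [
    "rewrite_query", "preprocess_query", "chunk_search", "chunk_rerank",
    "chunk_merge", "filter_top_k", "into_chat_message",
    "chat_completion_stream", "stream_filter",
]
```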
---
|
||||
|
||||
### 3. 事件执行详情
|
||||
#### 事件 1: `rewrite_query` - 问题改写
|
||||
+ **目的**: 为了让检索更精确,系统需要结合上下文来理解用户的真实意图。
|
||||
+ **操作**:
|
||||
1. 系统检索了当前会话最近的20条历史消息(实际检索到8条)作为上下文。
|
||||
2. 调用了一个名为 `deepseek-r1:7b` 的本地大语言模型。
|
||||
3. 模型根据聊天历史分析出提问者是“Liwx”,并将原问题“入住的房型是什么”改写得更具体。
|
||||
+ **结果**: 问题被成功改写为:“**Liwx本次入住的房型是什么**”。
|
||||
|
||||
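问题改写这一步的做法大致如下(仅为示意草图:假设通过 ollama 的 Python 客户端调用本地模型,提示词为示例写法,非系统实际使用的模板):

```python
# 示意:结合最近的聊天历史,让本地 LLM 把用户问题改写成自包含的完整问题
import ollama


def rewrite_query(question: str, history: list[str]) -> str:
    history_text = "\n".join(history)
    prompt = (
        "请结合以下聊天历史,把用户最新的问题改写成不依赖上下文、语义完整的问题,"
        "只输出改写后的问题本身。\n\n"
        f"聊天历史:\n{history_text}\n\n用户问题:{question}"
    )
    resp = ollama.chat(
        model="deepseek-r1:7b",
        messages=[{"role": "user", "content": prompt}],
    )
    return resp["message"]["content"].strip()
```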
#### 事件 2: `preprocess_query` - 问题预处理
|
||||
+ **目的**: 将改写后的问题进行分词,转换为适合搜索引擎处理的关键词序列。
|
||||
+ **操作**: 对改写后的问题进行了分词处理。
|
||||
+ **结果**: 生成了一串关键词:“`需要 改写 用户 问题 入住 房型 根据 提供 信息 入住 人 Liwx 选择 房型 双床 房 因此 改写 后 完整 问题 为 Liwx 本次 入住 房型`”。
|
||||
|
||||
#### 事件 3: `chunk_search` - 知识区块检索
|
||||
这是最核心的**检索(Retrieval)**步骤,系统执行了两次混合搜索(Hybrid Search)。
|
||||
|
||||
+ **第一次搜索 (使用改写后的完整问句)**:
|
||||
- **向量检索**:
|
||||
1. 加载嵌入模型 `bge-m3:latest` 将问句转换为一个1024维的向量。
|
||||
2. 在PostgreSQL数据库中进行向量相似度搜索,找到了2个相关的知识区块(chunk),ID 分别为 `e3bf6599-...` 和 `3989c6ce-...`。
|
||||
- **关键词检索**:
|
||||
1. 同时,系统也进行了关键词搜索。
|
||||
2. 同样找到了上述2个知识区块。
|
||||
- **结果合并**: 两种方法找到的4个结果(实际是2个重复的)被去重,最终得到2个唯一的知识区块。
|
||||
+ **第二次搜索 (使用预处理后的关键词序列)**:
|
||||
- 系统使用分词后的关键词重复了上述的**向量检索**和**关键词检索**过程。
|
||||
- 最终也得到了相同的2个知识区块。
|
||||
+ **最终结果**: 经过两次搜索和结果合并,系统锁定了2个最相关的知识区块,并将它们的内容提取出来,准备用于生成答案。
|
||||
|
||||
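两路检索结果的合并去重逻辑可以用下面的草图来理解(仅为示意,字段名为假设,并非后端实际实现):

```python
# 示意:合并向量检索与关键词检索的结果,按 chunk id 去重并保留较高分数
from typing import Dict, List


def merge_results(vector_hits: List[dict], keyword_hits: List[dict]) -> List[dict]:
    merged: Dict[str, dict] = {}
    for hit in vector_hits + keyword_hits:
        chunk_id = hit["id"]
        if chunk_id not in merged or hit["score"] > merged[chunk_id]["score"]:
            merged[chunk_id] = hit
    # 按相关性分数降序返回
    return sorted(merged.values(), key=lambda h: h["score"], reverse=True)


results = merge_results(
    vector_hits=[{"id": "e3bf6599", "score": 0.74}, {"id": "3989c6ce", "score": 0.69}],
    keyword_hits=[{"id": "e3bf6599", "score": 0.52}],
)
```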
#### 事件 4: `chunk_rerank` - 结果重排序
|
||||
+ **目的**: 使用一个更强大的模型对初步检索出的结果进行更精细的排序,以提高最终答案的质量。
|
||||
+ **操作**: 日志显示 `Rerank model ID is empty, skipping reranking`。这意味着系统配置了重排序步骤,但没有指定具体的重排序模型,因此**跳过了此步骤**。
|
||||
|
||||
#### 事件 5: `chunk_merge` - 区块合并
|
||||
+ **目的**: 将内容上相邻或相关的知识区块进行合并,形成更完整的上下文。
|
||||
+ **操作**: 系统分析了检索到的2个区块,并尝试进行合并。根据日志,最终处理后仍然是2个独立的区块,但已按相关性分数排好序。
|
||||
|
||||
#### 事件 6: `filter_top_k` - Top-K 过滤
|
||||
+ **目的**: 仅保留最相关的K个结果,防止过多无关信息干扰语言模型。
|
||||
+ **操作**: 系统配置保留前5个(Top-K = 5)最相关的区块。由于当前只有2个区块,它们全部通过了此过滤器。
|
||||
|
||||
#### 事件 7 & 8: `into_chat_message` & `chat_completion_stream` - 生成回答
|
||||
这是**生成(Generation)**步骤。
|
||||
|
||||
+ **目的**: 基于检索到的信息,生成自然流畅的回答。
|
||||
+ **操作**:
|
||||
1. 系统将检索到的2个知识区块的内容、用户的原始问题以及聊天历史整合在一起,形成一个完整的提示(Prompt)。
|
||||
2. 再次调用 `deepseek-r1:7b` 大语言模型,并以**流式(Stream)**的方式请求生成答案。流式输出可以实现打字机效果,提升用户体验。
|
||||
|
||||
#### 事件 9: `stream_filter` - 流式输出过滤
|
||||
+ **目的**: 对模型生成的实时文本流进行后处理,过滤掉不需要的特殊标记或内容。
|
||||
+ **操作**:
|
||||
- 系统设置了一个过滤器,用于移除模型在思考过程中可能产生的内部标记,如 `<think>` 和 `</think>`。
|
||||
- 日志显示,模型输出的第一个词块是 `<think> 根据`,过滤器成功拦截并移除了 `<think>` 标记,只将“根据”及之后的内容传递下去。
|
||||
|
||||
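流式过滤器的行为可以用一个简化的生成器来说明(仅为示意草图,实际实现位于 Go 后端;这里额外处理了标记被拆分到相邻词块的情况):

```python
# 示意:从流式词块中剔除 <think> / </think> 标记本身,其余内容按原顺序透传
from typing import Iterable, Iterator


def strip_think_tags(tokens: Iterable[str]) -> Iterator[str]:
    buffer = ""
    keep = len("</think>") - 1  # 末尾最多可能残留的半个标记长度
    for token in tokens:
        buffer += token
        for tag in ("<think>", "</think>"):
            buffer = buffer.replace(tag, "")
        if len(buffer) > keep:
            yield buffer[:-keep]
            buffer = buffer[-keep:]
    if buffer:
        yield buffer


print("".join(strip_think_tags(["<think> 根据", "检索到的文档</think>,房型是双床房"])))
```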
### 4. 完成与响应
|
||||
+ **发送引用**: 在生成答案的同时,系统将作为依据的2个知识区块作为“参考内容”发送给前端,以便用户查证来源。
|
||||
+ **更新消息**: 当模型生成完所有内容后,系统将完整的回答更新到之前创建的消息记录(ID: `6f057649-...`)中。
|
||||
+ **请求结束**: 服务器返回 `200` 成功状态码,标志着本次从提问到回答的完整流程结束。
|
||||
|
||||
### 总结
|
||||
这个日志完整地记录了一次典型的RAG流程:系统通过**问题改写**和**预处理**来精确理解用户意图,接着利用**向量与关键词混合检索**从知识库中找到相关信息,虽然跳过了**重排序**,但依然执行了**合并**与**过滤**,最后将检索到的知识作为上下文,交由大语言模型**生成**流畅、准确的回答,并通过**流式过滤**保证了输出的纯净性。
|
||||
|
||||
## 文档解析切分
|
||||
代码实现了一个独立的、通过gRPC通信的微服务,专门负责文档内容的深度解析、分块和多模态信息提取。它正是“异步处理”阶段的核心执行者。
|
||||
|
||||
### **整体架构**
|
||||
这是一个基于Python的gRPC服务,其核心职责是接收文件(或URL),并将其解析成结构化的、可供后续处理(如向量化)的文本块(Chunks)。
|
||||
|
||||
+ `server.py`: 服务的入口和网络层。它负责启动一个多进程、多线程的gRPC服务器,接收来自Go后端的请求,并将解析结果返回。
|
||||
+ `parser.py`: 设计模式中的**外观(Facade)模式**。它提供了一个统一的`Parser`类,屏蔽了内部多种具体解析器(如PDF、DOCX、Markdown等)的复杂性。外部调用者(`server.py`)只需与这个`Parser`类交互。
|
||||
+ `base_parser.py`: 解析器的基类,定义了所有具体解析器共享的核心逻辑和抽象方法。这是整个解析流程的“大脑”,包含了最复杂的文本分块、图片处理、OCR和图像描述生成等功能。
|
||||
|
||||
---
|
||||
|
||||
### **详细工作流程**
|
||||
当Go后端启动异步任务时,它会携带文件内容和配置信息,向这个Python服务发起一次gRPC调用。以下是完整的处理流程:
|
||||
|
||||
#### **第一步:请求接收与分发 (**`server.py`** & **`parser.py`**)**
|
||||
1. **gRPC服务入口 (**`server.py: serve`**)**:
|
||||
- 服务通过`serve()`函数启动。它会根据环境变量(`GRPC_WORKER_PROCESSES`, `GRPC_MAX_WORKERS`)启动一个**多进程、多线程**的服务器,以充分利用CPU资源,提高并发处理能力。
|
||||
- 每个工作进程都监听在指定的端口(如50051),准备接收请求。
|
||||
2. **请求处理 (**`server.py: ReadFromFile`**)**:
|
||||
- 当Go后端发起`ReadFromFile`请求时,其中一个工作进程会接收到该请求。
|
||||
- 该方法首先会解析请求中的参数,包括:
|
||||
* `file_name`, `file_type`, `file_content`:文件的基本信息和二进制内容。
|
||||
* `read_config`: 一个包含所有解析配置的复杂对象,如`chunk_size`(分块大小)、`chunk_overlap`(重叠大小)、`enable_multimodal`(是否启用多模态处理)、`storage_config`(对象存储配置)、`vlm_config`(视觉语言模型配置)等。
|
||||
- 它将这些配置整合成一个`ChunkingConfig`数据对象。
|
||||
- 最关键的一步是调用 `self.parser.parse_file(...)`,将解析任务交给`Parser`外观类处理。
|
||||
3. **解析器选择 (**`parser.py: Parser.parse_file`**)**:
|
||||
- `Parser`类接收到任务后,首先调用`get_parser(file_type)`方法。
|
||||
- 该方法会根据文件类型(例如 `'pdf'`)在一个字典 `self.parsers` 中查找对应的具体解析器类(例如 `PDFParser`)。
|
||||
- 找到后,它会**实例化**这个`PDFParser`类,并将`ChunkingConfig`等所有配置信息传递给构造函数。
|
||||
|
||||
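这种“按文件类型选择解析器”的外观模式,大致相当于下面的草图(类名与注册表均为示意,实际实现见 `parser.py`):

```python
# 示意:外观类按 file_type 查表选择具体解析器并实例化,再委托其完成解析
from typing import Dict, Type


class BaseParser:
    def __init__(self, chunking_config=None):
        self.chunking_config = chunking_config

    def parse_into_text(self, content: bytes) -> str:
        raise NotImplementedError


class PDFParser(BaseParser): ...
class DocxParser(BaseParser): ...
class MarkdownParser(BaseParser): ...


class Parser:
    parsers: Dict[str, Type[BaseParser]] = {
        "pdf": PDFParser,
        "docx": DocxParser,
        "md": MarkdownParser,
    }

    def get_parser(self, file_type: str) -> Type[BaseParser]:
        try:
            return self.parsers[file_type.lower()]
        except KeyError:
            raise ValueError(f"Unsupported file type: {file_type}")

    def parse_file(self, file_name: str, file_type: str, content: bytes, config):
        parser_cls = self.get_parser(file_type)
        parser = parser_cls(chunking_config=config)  # 将 ChunkingConfig 传入构造函数
        return parser.parse_into_text(content)
```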
#### **第二步:核心解析与分块 (**`base_parser.py`**)**
|
||||
它触及了整个流程的核心:**如何保证信息的上下文完整性和原始顺序**。
|
||||
|
||||
根据 `base_parser.py` 代码,**最终切分出的 Chunk 中的文本、表格和图像是按照它们在原始文档中的出现顺序来保存的**。
|
||||
|
||||
这个顺序得以保证,主要归功于 `BaseParser` 中几个设计精巧的方法相互协作。我们来详细追踪一下这个流程。
|
||||
|
||||
整个顺序的保证可以分为三个阶段:
|
||||
|
||||
1. **阶段一:统一的文本流创建 (**`pdf_parser.py`**)**:
|
||||
- 在 `parse_into_text` 方法中,您的代码会**逐页**处理PDF。
|
||||
- 在每一页内部,它会按照一定的逻辑(先提取非表格文本,再附加表格,最后附加图像占位符)将所有内容**拼接成一个长字符串** (`page_content_parts`)。
|
||||
- **关键点**: 虽然在这个阶段,文本、表格和图像占位符的拼接顺序可能不是100%精确到字符级别,但它保证了**同一页的内容会在一起**,并且大致遵循了从上到下的阅读顺序。
|
||||
- 最后,所有页面的内容被 `"\n\n--- Page Break ---\n\n"` 连接起来,形成一个**包含了所有信息(文本、Markdown表格、图像占位符)的、单一的、有序的文本流 (**`final_text`**)**。
|
||||
2. **阶段二:原子化与保护 (**`_split_into_units`**)**:
|
||||
- 这个单一的 `final_text` 被传递给 `_split_into_units` 方法。
|
||||
- 这个方法是**保证结构完整性的关键**。它使用正则表达式,将**整个Markdown表格**和**整个Markdown图像占位符**识别为**不可分割的原子单元 (atomic units)**。
|
||||
- 它会将这些原子单元(表格、图片)和它们之间的普通文本块,按照它们在 `final_text` 中出现的**原始顺序**,切分成一个列表 (`units`)。
|
||||
- **结果**: 我们现在有了一个列表,例如 `['一些文本', '', '另一些文本', '|...|...|\n|---|---|\n...', '更多文本']`。这个列表中的元素顺序**完全等同于它们在原始文档中的顺序**。
|
||||
3. **阶段三:顺序分块 (**`chunk_text`**)**:
|
||||
- `chunk_text` 方法接收到这个**有序的 **`units`** 列表**。
|
||||
- 它的工作机制非常简单直接:它会**按顺序**遍历这个列表中的每一个单元(`unit`)。
|
||||
- 它将这些单元**依次添加**到一个临时的 `current_chunk` 列表中,直到这个块的长度接近 `chunk_size` 的上限。
|
||||
- 当一个块满了之后,它就被保存下来,然后开始一个新的块(可能会带有上一个块的重叠部分)。
|
||||
- **关键点**: 因为 `chunk_text` **严格按照 **`units`** 列表的顺序进行处理**,所以它永远不会打乱表格、文本和图像之间的相对顺序。一个在文档中先出现的表格,也必然会出现在一个序号更靠前的 Chunk 中。
|
||||
4. **阶段四:图像信息附加 (**`process_chunks_images`**)**:
|
||||
- 在文本块被切分好之后,`process_chunks_images` 方法会被调用。
|
||||
- 它会处理**每一个**已经生成好的 Chunk。
|
||||
- 在每个 Chunk 内部,它会找到图像占位符,然后进行AI处理。
|
||||
- 最后,它会将处理好的图像信息(包含永久URL、OCR文本、图像描述等)附加到**该 Chunk 自己**的 `.images` 属性中。
|
||||
- **关键点**: 这个过程**不会改变 Chunk 的顺序或其 **`.content`** 的内容**。它只是为已经存在的、顺序正确的 Chunk 附加额外的信息。
|
||||
|
||||
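其中“把表格、图片当作不可分割原子单元”的切分方式,可以用下面的正则草图来说明(模式为简化写法,实际模式见 `_split_into_units` 以及 `splitter.py` 中的 protected_regex):

```python
# 示意:用带捕获组的正则切分文本,表格和图片占位符整体保留,顺序与原文一致
import re

ATOMIC_PATTERN = re.compile(
    r"(!\[[^\]]*\]\([^)]*\)"          # 图片占位符
    r"|(?:^\|[^\n]*\|[ \t]*\n)+)",    # 连续的表格行(含表头与分隔行)
    re.MULTILINE,
)


def split_into_units(text: str) -> list[str]:
    # re.split 搭配捕获组会把命中的原子单元也保留在结果列表中
    return [u for u in ATOMIC_PATTERN.split(text) if u]


text = "前文……\n| A | B |\n|---|---|\n| 1 | 2 |\n\n后文。"
print(split_into_units(text))
```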
#### **第三步:多模态处理(如果启用) (**`base_parser.py`**)**
|
||||
如果 `enable_multimodal` 为 `True`,在文本分块完成后,会进入最复杂的多模态处理阶段。
|
||||
|
||||
1. **并发任务启动 (**`BaseParser.process_chunks_images`**)**:
|
||||
- 该方法使用`asyncio`(Python的异步I/O框架)来**并发处理所有文本块中的图片**,以极大地提升效率。
|
||||
- 它为每个`Chunk`创建一个异步任务`process_chunk_images_async`。
|
||||
2. **处理单个块中的图片 (**`BaseParser.process_chunk_images_async`**)**:
|
||||
- **提取图片引用**: 首先,使用正则表达式 `extract_images_from_chunk` 从当前块的文本中找到所有的图片引用(例如,``)。
|
||||
- **图片持久化**: 对于找到的每个图片,并发地调用 `download_and_upload_image`。这个函数负责:
|
||||
* 从其原始位置(可能是PDF内部、本地路径或远程URL)获取图片数据。
|
||||
* 将图片**上传到配置好的对象存储(COS/MinIO)**。这一步至关重要,它将临时的、不稳定的图片引用转换成一个持久化、可通过URL公开访问的地址。
|
||||
* 返回持久化的URL和图片对象(PIL Image)。
|
||||
- **并发AI处理**: 将所有成功上传的图片收集起来,调用`process_multiple_images`。
|
||||
* 该方法内部使用`asyncio.Semaphore`来限制并发数量(例如最多同时处理5张图片),防止瞬间消耗过多内存或触发模型API的速率限制。
|
||||
* 对于每张图片,它会调用`process_image_async`。
|
||||
3. **处理单张图片 (**`BaseParser.process_image_async`**)**:
|
||||
- **OCR**: 调用`perform_ocr`,它会使用一个OCR引擎(如`PaddleOCR`)来识别图片中的所有文字。
|
||||
- **图像描述 (Caption)**: 调用`get_image_caption`,它会将图片数据(转为Base64)发送给配置的视觉语言模型(VLM),生成对图片内容的自然语言描述。
|
||||
- 该方法返回 `(ocr_text, caption, 持久化URL)`。
|
||||
4. **结果聚合**:
|
||||
- 所有图片处理完成后,包含持久化URL、OCR文本和图像描述的结构化信息,会被附加到对应`Chunk`对象的 `.images` 字段上。
|
||||
|
||||
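并发限流部分的核心写法大致如下(示意草图:`process_image_async` 的内部 OCR 与 Caption 逻辑省略,并发上限 5 与正文描述一致):

```python
# 示意:用 asyncio.Semaphore 限制同时处理的图片数量,避免内存压力与模型 API 限流
import asyncio


async def process_image_async(image_url: str) -> dict:
    # 实际实现中这里会执行 OCR 与图像描述(Caption)生成
    await asyncio.sleep(0.1)
    return {"url": image_url, "ocr_text": "...", "caption": "..."}


async def process_multiple_images(image_urls: list[str], limit: int = 5) -> list[dict]:
    semaphore = asyncio.Semaphore(limit)

    async def worker(url: str) -> dict:
        async with semaphore:  # 最多允许 limit 个任务同时进入
            return await process_image_async(url)

    return await asyncio.gather(*(worker(u) for u in image_urls))


results = asyncio.run(process_multiple_images([f"img-{i}" for i in range(12)]))
```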
#### **第四步:返回结果 (**`server.py`**)**
|
||||
1. **数据转换 (**`server.py: _convert_chunk_to_proto`**)**:
|
||||
- 当`parser.parse_file`执行完毕后,它返回一个包含所有处理过的`Chunk`对象的列表(`ParseResult`)。
|
||||
- `ReadFromFile`方法接收到这个结果,并调用`_convert_chunk_to_proto`,将Python的`Chunk`对象(包括其内部的图片信息)转换成gRPC定义的Protobuf消息格式。
|
||||
2. **响应返回**:
|
||||
- 最后,gRPC服务器将这个包含所有分块和多模态信息的`ReadResponse`消息发送回给调用方——Go后端服务。
|
||||
|
||||
至此,Go后端就拿到了结构化、信息丰富的文档数据,可以进行下一步的向量化和索引存储了。
|
||||
|
||||
|
||||
## 部署
|
||||
支持 Docker 镜像本地部署,并通过 API 端口对外提供接口服务。
|
||||
|
||||
## 性能和监控
|
||||
WeKnora 包含丰富的监控和测试组件:
|
||||
|
||||
+ 分布式跟踪:集成 Jaeger 用于跟踪请求在服务架构中的完整执行路径。本质上,Jaeger 是一种帮助用户“看见”请求在分布式系统中完整生命周期的技术。
|
||||
+ 健康监控:持续监控各服务是否处于健康状态
|
||||
+ 可扩展性:通过容器化部署,可通过多个服务满足大规模并发请求
|
||||
|
||||
## QA
|
||||
### 问题1: 在检索过程的执行了两次混合搜索的目的是什么?以及第一次和第二次搜索有什么不同?
|
||||
这是一个非常好的观察。系统执行两次混合搜索是为了**最大化检索的准确性和召回率**,本质上是一种**查询扩展(Query Expansion)和多策略检索**的组合方法。
|
||||
|
||||
#### 目的
|
||||
通过两种不同形式的查询(原始改写句 vs. 分词后的关键词序列)去搜索,系统可以结合两种查询方式的优点:
|
||||
|
||||
+ **语义检索的深度**: 使用完整的句子进行搜索,能更好地利用向量模型(如`bge-m3`)对句子整体含义的理解能力,找到语义上最接近的知识区块。
|
||||
+ **关键词检索的广度**: 使用分词后的关键词进行搜索,能确保即使知识区块的表述方式与原问题不同,但只要包含了核心关键词,就有机会被命中。这对于传统的关键词匹配算法(如BM25)尤其有效。
|
||||
|
||||
简单来说,就是**用两种不同的“问法”去问同一个问题**,然后将两边的结果汇总起来,确保最相关的知识不会被遗漏。
|
||||
|
||||
#### 两次搜索的不同点
|
||||
它们最核心的不同在于**输入的查询文本(Query Text)**:
|
||||
|
||||
1. **第一次混合搜索**
|
||||
- **输入**: 使用的是经过`rewrite_query`事件后生成的、**语法完整的自然语言问句**。
|
||||
- **日志证据**:
|
||||
|
||||
```plain
|
||||
INFO [2025-08-29 09:46:36.896] [request_id=Lkq0OGLYu2fV] knowledgebase.go:266[HybridSearch] | Hybrid search parameters, knowledge base ID: kb-00000001, query text: 需要改写的用户问题是:“入住的房型是什么”。根据提供的信息,入住人Liwx选择的房型是双床房。因此,改写后的完整问题为: “Liwx本次入住的房型是什么”
|
||||
```
|
||||
|
||||
2. **第二次混合搜索**
|
||||
- **输入**: 使用的是经过`preprocess_query`事件处理后生成的、**由空格隔开的关键词序列**。
|
||||
- **日志证据**:
|
||||
|
||||
```plain
|
||||
INFO [2025-08-29 09:46:37.257] [request_id=Lkq0OGLYu2fV] knowledgebase.go:266[HybridSearch] | Hybrid search parameters, knowledge base ID: kb-00000001, query text: 需要 改写 用户 问题 入住 房型 根据 提供 信息 入住 人 Liwx 选择 房型 双床 房 因此 改写 后 完整 问题 为 Liwx 本次 入住 房型
|
||||
```
|
||||
|
||||
最终,系统将这两次搜索的结果进行去重和合并(日志中显示每次都找到2个结果,去重后总共还是2个),从而得到一个更可靠的知识集合,用于后续的答案生成。
|
||||
|
||||
|
||||
|
||||
### 问题2:重排序模型分析
|
||||
Reranker(重排器)是目前RAG领域中非常先进的技术,它们在工作原理和适用场景上有着显著的区别。
|
||||
|
||||
简单来说,它们代表了从“**专门的判别模型**”到“**利用大语言模型(LLM)进行判别**”再到“**深度挖掘LLM内部信息进行判别**”的演进。
|
||||
|
||||
以下是它们的详细区别:
|
||||
|
||||
|
||||
|
||||
#### 1. Normal Reranker (常规重排器 / 交叉编码器)
|
||||
这是最经典也是最主流的重排方法。
|
||||
|
||||
+ **模型类型**: **序列分类模型 (Sequence Classification Model)**。本质上是一个**交叉编码器 (Cross-Encoder)**,通常基于BERT、RoBERTa等双向编码器架构。`BAAI/bge-reranker-base/large/v2-m3` 都属于这一类。
|
||||
+ **工作原理**:
|
||||
1. 它将**查询(Query)**和**待排序的文档(Passage)**拼接成一个单一的输入序列,例如:`[CLS] what is panda? [SEP] The giant panda is a bear species endemic to China. [SEP]`。
|
||||
2. 这个拼接后的序列被完整地送入模型中。模型内部的自注意力机制(Self-Attention)可以同时分析查询和文档中的每一个词,并计算它们之间**细粒度的交互关系**。
|
||||
3. 模型最终输出一个**单一的分数(Logit)**,这个分数直接代表了查询和文档的相关性。分数越高,相关性越强。
|
||||
+ **关键特性**:
|
||||
- **优点**: 由于查询和文档在模型内部进行了充分的、深度的交互,其**准确度通常非常高**,是衡量Reranker性能的黄金标准。
|
||||
- **缺点**: **速度较慢**。因为它必须为**每一个“查询-文档”对**都独立执行一次完整的、代价高昂的计算。如果初步检索返回了100个文档,它就需要运行100次。
|
||||
|
||||
|
||||
|
||||
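以 `BAAI/bge-reranker-v2-m3` 为例,交叉编码器式重排的典型用法大致如下(示意草图:这里借助 FlagEmbedding 库演示,实际集成方式以项目配置为准):

```python
# 示意:用交叉编码器对初检结果逐对打分并重排
from FlagEmbedding import FlagReranker

reranker = FlagReranker("BAAI/bge-reranker-v2-m3", use_fp16=True)

query = "Liwx本次入住的房型是什么"
passages = ["……入住人Liwx选择的房型是双床房……", "酒店提供免费早餐……"]

# 每个“查询-文档”对都要单独前向一次,精度高但开销随候选数线性增长
scores = reranker.compute_score([[query, p] for p in passages])
ranked = sorted(zip(passages, scores), key=lambda x: x[1], reverse=True)
```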
#### 2. LLM-based Reranker (基于LLM的重排器)
|
||||
这种方法创造性地利用了通用大语言模型(LLM)的能力来进行重排。
|
||||
|
||||
+ **模型类型**: **因果语言模型 (Causal Language Model)**,即我们常说的GPT、Llama、Gemma这类用于生成文本的LLM。`BAAI/bge-reranker-v2-gemma` 就是一个典型的例子。
|
||||
+ **工作原理**:
|
||||
1. 它**不是直接输出一个分数**,而是将重排任务**转化为一个问答或文本生成任务**。
|
||||
2. 它通过一个精心设计的**提示(Prompt)**来组织输入,例如:`"Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'. A: {query} B: {passage}"`。
|
||||
3. 它将这个完整的Prompt喂给LLM,然后**观察LLM在最后生成“Yes”这个词的概率**。
|
||||
4. 这个**生成“Yes”的概率(或其Logit值)就被当作是相关性分数**。如果模型非常确信答案是“Yes”,说明它认为文档B包含了查询A的答案,即相关性高。
|
||||
+ **关键特性**:
|
||||
- **优点**: 能够利用LLM强大的**语义理解、推理和世界知识**,对于需要深度理解和推理才能判断相关性的复杂查询,效果可能更好。
|
||||
- **缺点**: 计算开销可能非常大(取决于LLM的大小),并且性能**高度依赖于Prompt的设计**。
|
||||
|
||||
|
||||
|
||||
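这种“用生成 Yes 的概率当相关性分数”的思路,可以用 transformers 写成如下草图(提示词为示例写法,取最后一个位置上 “Yes” 的 logit 作为分数;真实模型的提示模板与取分细节请以其模型卡为准):

```python
# 示意:把重排转化为 Yes/No 预测,用 “Yes” 在下一个 token 上的 logit 作为相关性分数
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "BAAI/bge-reranker-v2-gemma"  # 示意用模型名
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
yes_id = tokenizer("Yes", add_special_tokens=False).input_ids[0]  # 简化取法


def score(query: str, passage: str) -> float:
    prompt = (
        "Given a query A and a passage B, determine whether the passage contains "
        "an answer to the query by providing a prediction of either 'Yes' or 'No'. "
        f"A: {query} B: {passage} Prediction:"
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits       # [1, seq_len, vocab]
    return logits[0, -1, yes_id].item()       # 取最后位置上 “Yes” 的得分
```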
#### 3. LLM-based Layerwise Reranker (基于LLM分层信息的重排器)
|
||||
这是第二种方法的“威力加强版”,是一种更前沿、更复杂的探究性技术。
|
||||
|
||||
+ **模型类型**: 同样是**因果语言模型 (Causal Language Model)**,例如`BAAI/bge-reranker-v2-minicpm-layerwise`。
|
||||
+ **工作原理**:
|
||||
1. 输入部分与第二种方法完全相同,也是使用“Yes/No”的Prompt。
|
||||
2. 核心区别在于**分数的提取方式**。它不再仅仅依赖LLM**最后一层**的输出(即最终的预测结果)。
|
||||
3. 它认为LLM在逐层处理信息的过程中,不同深度的网络层(Layer)可能捕获了不同层次的语义相关性信息。因此,它会从**模型的多个中间层**提取出关于“Yes”这个词的预测Logit。
|
||||
4. 代码中的 `cutoff_layers=[28]` 参数就是告诉模型:“请把第28层的输出给我”。最终,你会得到一个或多个来自不同网络层的分数,这些分数可以被平均或以其他方式组合,形成一个更鲁棒的最终相关性判断。
|
||||
+ **关键特性**:
|
||||
- **优点**: 理论上可以获得**更丰富、更全面的相关性信号**,可能达到比只看最后一层更高的精度,是目前探索性能极限的一种方法。
|
||||
- **缺点**: **复杂度最高**,需要对模型进行特定的修改才能提取中间层信息(代码中的`trust_remote_code=True`就是一个信号),计算开销也很大。
|
||||
|
||||
#### 总结对比
|
||||
| 特性 | 1. Normal Reranker (常规) | 2. LLM-based Reranker (基于LLM) | 3. LLM-based Layerwise Reranker (基于LLM分层) |
|
||||
| :--- | :--- | :--- | :--- |
|
||||
| **底层模型** | 交叉编码器 (如BERT) | 因果语言模型 (如Gemma) | 因果语言模型 (如MiniCPM) |
|
||||
| **工作原理** | 计算Query和Passage的深度交互,直接输出相关分 | 将排序任务转为"Yes/No"预测,用"Yes"的概率作为分数 | 与2类似,但从LLM的多个中间层提取"Yes"的概率 |
|
||||
| **输出** | 单一的相关性分数 | 单一的相关性分数(来自最后一层) | 多个相关性分数(来自不同层) |
|
||||
| **优点** | **速度与精度的最佳平衡点**,成熟稳定 | 利用LLM的推理能力,处理复杂问题 | 理论上精度最高,信号更丰富 |
|
||||
| **缺点** | 相比向量检索慢 | 计算开销大,依赖Prompt设计 | **复杂度最高**,计算开销最大 |
|
||||
| **推荐场景** | **大多数生产环境的首选**,效果好,易于部署 | 对答案质量有极致要求,且计算资源充足的场景 | 学术研究或追求SOTA(State-of-the-art)性能的场景 |
|
||||
|
||||
|
||||
#### 使用建议
|
||||
1. **开始阶段**: 强烈建议您**从 **`Normal Reranker`** 开始**,例如 `BAAI/bge-reranker-v2-m3`。它是目前综合表现最好的模型之一,能显著提升您的RAG系统性能,并且相对容易集成和部署。
|
||||
2. **进阶探索**: 如果您发现常规Reranker在处理某些非常微妙或需要复杂推理的查询时表现不佳,并且您拥有充足的GPU资源,可以尝试 `LLM-based Reranker`。
|
||||
3. **前沿研究**: `Layerwise Reranker` 更适合研究人员或希望在特定任务上压榨出最后一点性能的专家。
|
||||
|
||||
|
||||
### 问题3:粗过滤或细过滤后的知识(带重排)如何组装发送给大模型的?
|
||||
这一块主要是提示词(Prompt)设计,其核心任务是让模型严格根据组装好的上下文回答用户问题。组装上下文时需要指定典型的指令细节(具体组装方式可参考下方示意):

+ 关键约束:必须严格按照所提供文档回答,禁止使用你自己的知识回答
+ 未知情况处理:如果文档中没有足够的信息来回答问题,请告知“根据所掌握的资料,无法回答这个问题”
+ 引用要求:在回答时,如果引用了某个文档内容,请在句子末尾加上文档编号
|
||||
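组装后的提示词大致是下面这种形式(仅为示意草图,字段与措辞可按实际模板调整):

```python
# 示意:把检索到的区块编号后拼进提示词,并写明约束、未知情况处理与引用要求
def build_prompt(question: str, chunks: list[str]) -> str:
    context = "\n\n".join(f"[文档{i + 1}] {c}" for i, c in enumerate(chunks))
    return (
        "你是一个知识库问答助手,请严格按照所提供文档回答,禁止使用你自己的知识回答。\n"
        "如果文档中没有足够的信息来回答问题,请回答:根据所掌握的资料,无法回答这个问题。\n"
        "如果引用了某个文档内容,请在句子末尾加上对应的文档编号,如 [文档1]。\n\n"
        f"参考文档:\n{context}\n\n用户问题:{question}"
    )


prompt = build_prompt("Liwx本次入住的房型是什么", ["……房型是双床房……", "……入住日期……"])
```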
BIN
docs/WeKnora.pdf
Normal file
BIN
docs/images/arc.png
Normal file
|
After Width: | Height: | Size: 35 KiB |
BIN
docs/images/graph3.png
Normal file
|
After Width: | Height: | Size: 339 KiB |
BIN
docs/images/pipeline.png
Normal file
|
After Width: | Height: | Size: 504 KiB |
BIN
docs/images/pipeline2.jpeg
Normal file
|
After Width: | Height: | Size: 104 KiB |
@@ -2,6 +2,16 @@ server {
|
||||
listen 80;
|
||||
server_name localhost;
|
||||
client_max_body_size 50M;
|
||||
|
||||
# 安全头配置
|
||||
add_header X-Frame-Options "SAMEORIGIN" always;
|
||||
add_header X-Content-Type-Options "nosniff" always;
|
||||
add_header X-XSS-Protection "1; mode=block" always;
|
||||
add_header Referrer-Policy "strict-origin-when-cross-origin" always;
|
||||
|
||||
# 错误日志配置
|
||||
error_log /var/log/nginx/error.log warn;
|
||||
access_log /var/log/nginx/access.log;
|
||||
|
||||
# 前端静态文件
|
||||
location / {
|
||||
@@ -18,6 +28,12 @@ server {
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
|
||||
# 连接和重试配置
|
||||
proxy_connect_timeout 30s; # 连接超时时间
|
||||
proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
|
||||
proxy_next_upstream_tries 3; # 重试次数
|
||||
proxy_next_upstream_timeout 30s; # 重试超时时间
|
||||
|
||||
# SSE 相关配置
|
||||
proxy_http_version 1.1; # 使用 HTTP/1.1
|
||||
proxy_set_header Connection ""; # 禁用 Connection: close,保持连接打开
|
||||
|
||||
50
frontend/package-lock.json
generated
@@ -1,18 +1,21 @@
|
||||
{
|
||||
"name": "knowledage-base",
|
||||
"version": "0.0.0",
|
||||
"version": "0.1.3",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "knowledage-base",
|
||||
"version": "0.0.0",
|
||||
"version": "0.1.3",
|
||||
"dependencies": {
|
||||
"@microsoft/fetch-event-source": "^2.0.1",
|
||||
"@types/dompurify": "^3.0.5",
|
||||
"axios": "^1.8.4",
|
||||
"dompurify": "^3.2.6",
|
||||
"marked": "^5.1.2",
|
||||
"pagefind": "^1.1.1",
|
||||
"pinia": "^3.0.1",
|
||||
"tdesign-icons-vue-next": "^0.4.1",
|
||||
"tdesign-vue-next": "^1.11.5",
|
||||
"vue": "^3.5.13",
|
||||
"vue-router": "^4.5.0",
|
||||
@@ -1274,6 +1277,15 @@
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/dompurify": {
|
||||
"version": "3.0.5",
|
||||
"resolved": "https://mirrors.tencent.com/npm/@types/dompurify/-/dompurify-3.0.5.tgz",
|
||||
"integrity": "sha512-1Wg0g3BtQF7sSb27fJQAKck1HECM6zV1EB66j8JH9i3LCjYabJa0FSdiSgsD5K/RbrsR0SiraKacLB+T8ZVYAg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/trusted-types": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/eslint": {
|
||||
"version": "9.6.1",
|
||||
"resolved": "https://mirrors.tencent.com/npm/@types/eslint/-/eslint-9.6.1.tgz",
|
||||
@@ -1346,6 +1358,12 @@
|
||||
"resolved": "https://mirrors.tencent.com/npm/@types/tinycolor2/-/tinycolor2-1.4.6.tgz",
|
||||
"integrity": "sha512-iEN8J0BoMnsWBqjVbWH/c0G0Hh7O21lpR2/+PrvAVgWdzL7eexIFm4JN/Wn10PTcmNdtS6U67r499mlWMXOxNw=="
|
||||
},
|
||||
"node_modules/@types/trusted-types": {
|
||||
"version": "2.0.7",
|
||||
"resolved": "https://mirrors.tencent.com/npm/@types/trusted-types/-/trusted-types-2.0.7.tgz",
|
||||
"integrity": "sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/validator": {
|
||||
"version": "13.15.2",
|
||||
"resolved": "https://mirrors.tencent.com/npm/@types/validator/-/validator-13.15.2.tgz",
|
||||
@@ -2121,6 +2139,15 @@
|
||||
"node": ">=0.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/dompurify": {
|
||||
"version": "3.2.6",
|
||||
"resolved": "https://mirrors.tencent.com/npm/dompurify/-/dompurify-3.2.6.tgz",
|
||||
"integrity": "sha512-/2GogDQlohXPZe6D6NOgQvXLPSYBqIWMnZ8zzOhn09REE4eyAzb+Hed3jhoM9OkuaJ8P6ZGTTVWQKAi8ieIzfQ==",
|
||||
"license": "(MPL-2.0 OR Apache-2.0)",
|
||||
"optionalDependencies": {
|
||||
"@types/trusted-types": "^2.0.7"
|
||||
}
|
||||
},
|
||||
"node_modules/dunder-proto": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://mirrors.tencent.com/npm/dunder-proto/-/dunder-proto-1.0.1.tgz",
|
||||
@@ -3374,9 +3401,10 @@
|
||||
}
|
||||
},
|
||||
"node_modules/tdesign-icons-vue-next": {
|
||||
"version": "0.3.6",
|
||||
"resolved": "https://mirrors.tencent.com/npm/tdesign-icons-vue-next/-/tdesign-icons-vue-next-0.3.6.tgz",
|
||||
"integrity": "sha512-X9u90dBv8tPhfpguUyx+BzF8CU2ef2L4RXOO7MYOj1ufHCHwBXTF8L3GPfq6KZd/2u4vMLYAA8lGURn4PZZICw==",
|
||||
"version": "0.4.1",
|
||||
"resolved": "https://mirrors.tencent.com/npm/tdesign-icons-vue-next/-/tdesign-icons-vue-next-0.4.1.tgz",
|
||||
"integrity": "sha512-uDPuTLRORnGcTyVGNoentNaK4V+ZcBmhYwcY3KqDaQQ5rrPeLMxu0ZVmgOEf0JtF2QZiqAxY7vodNEiLUdoRKA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.16.3"
|
||||
},
|
||||
@@ -3410,6 +3438,18 @@
|
||||
"vue": ">=3.1.0"
|
||||
}
|
||||
},
|
||||
"node_modules/tdesign-vue-next/node_modules/tdesign-icons-vue-next": {
|
||||
"version": "0.3.7",
|
||||
"resolved": "https://mirrors.tencent.com/npm/tdesign-icons-vue-next/-/tdesign-icons-vue-next-0.3.7.tgz",
|
||||
"integrity": "sha512-Q5ebVty/TCqhBa0l/17kkhjC0pBAOGvn7C35MAt1xS+johKVM9QEDOy9R6XEl332AiwQ37MwqioczqjYC30ckw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.16.3"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"vue": "^3.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/terser": {
|
||||
"version": "5.43.1",
|
||||
"resolved": "https://mirrors.tencent.com/npm/terser/-/terser-5.43.1.tgz",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "knowledage-base",
|
||||
"version": "0.1.0",
|
||||
"version": "0.1.3",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
@@ -13,12 +13,16 @@
|
||||
},
|
||||
"dependencies": {
|
||||
"@microsoft/fetch-event-source": "^2.0.1",
|
||||
"@types/dompurify": "^3.0.5",
|
||||
"axios": "^1.8.4",
|
||||
"dompurify": "^3.2.6",
|
||||
"marked": "^5.1.2",
|
||||
"pagefind": "^1.1.1",
|
||||
"pinia": "^3.0.1",
|
||||
"tdesign-icons-vue-next": "^0.4.1",
|
||||
"tdesign-vue-next": "^1.11.5",
|
||||
"vue": "^3.5.13",
|
||||
"vue-i18n": "^9.9.0",
|
||||
"vue-router": "^4.5.0",
|
||||
"webpack": "^5.94.0"
|
||||
},
|
||||
|
||||
@@ -1,9 +1,31 @@
|
||||
<script setup lang="ts">
|
||||
import { computed } from 'vue'
|
||||
import { useI18n } from 'vue-i18n'
|
||||
import { ConfigProvider } from 'tdesign-vue-next'
|
||||
import enUS from 'tdesign-vue-next/es/locale/en_US'
|
||||
import zhCN from 'tdesign-vue-next/es/locale/zh_CN'
|
||||
import ruRU from 'tdesign-vue-next/es/locale/ru_RU'
|
||||
|
||||
const { locale } = useI18n()
|
||||
|
||||
const tdesignLocale = computed(() => {
|
||||
switch (locale.value) {
|
||||
case 'en-US':
|
||||
return enUS
|
||||
case 'ru-RU':
|
||||
return ruRU
|
||||
case 'zh-CN':
|
||||
default:
|
||||
return zhCN
|
||||
}
|
||||
})
|
||||
</script>
|
||||
<template>
|
||||
<div id="app">
|
||||
<RouterView />
|
||||
</div>
|
||||
<ConfigProvider :global-config="tdesignLocale">
|
||||
<div id="app">
|
||||
<RouterView />
|
||||
</div>
|
||||
</ConfigProvider>
|
||||
</template>
|
||||
<style>
|
||||
body,
|
||||
|
||||
239
frontend/src/api/auth/index.ts
Normal file
@@ -0,0 +1,239 @@
|
||||
import { post, get, put } from '@/utils/request'
|
||||
|
||||
// 用户登录接口
|
||||
export interface LoginRequest {
|
||||
email: string
|
||||
password: string
|
||||
}
|
||||
|
||||
export interface LoginResponse {
|
||||
success: boolean
|
||||
message?: string
|
||||
user?: {
|
||||
id: string
|
||||
username: string
|
||||
email: string
|
||||
avatar?: string
|
||||
tenant_id: number
|
||||
is_active: boolean
|
||||
created_at: string
|
||||
updated_at: string
|
||||
}
|
||||
tenant?: {
|
||||
id: number
|
||||
name: string
|
||||
description: string
|
||||
api_key: string
|
||||
status: string
|
||||
business: string
|
||||
storage_quota: number
|
||||
storage_used: number
|
||||
created_at: string
|
||||
updated_at: string
|
||||
}
|
||||
token?: string
|
||||
refresh_token?: string
|
||||
}
|
||||
|
||||
// 用户注册接口
|
||||
export interface RegisterRequest {
|
||||
username: string
|
||||
email: string
|
||||
password: string
|
||||
}
|
||||
|
||||
export interface RegisterResponse {
|
||||
success: boolean
|
||||
message?: string
|
||||
data?: {
|
||||
user: {
|
||||
id: string
|
||||
username: string
|
||||
email: string
|
||||
}
|
||||
tenant: {
|
||||
id: string
|
||||
name: string
|
||||
api_key: string
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 用户信息接口
|
||||
export interface UserInfo {
|
||||
id: string
|
||||
username: string
|
||||
email: string
|
||||
avatar?: string
|
||||
tenant_id: string
|
||||
created_at: string
|
||||
updated_at: string
|
||||
}
|
||||
|
||||
// 租户信息接口
|
||||
export interface TenantInfo {
|
||||
id: string
|
||||
name: string
|
||||
description?: string
|
||||
api_key: string
|
||||
status?: string
|
||||
business?: string
|
||||
owner_id: string
|
||||
storage_quota?: number
|
||||
storage_used?: number
|
||||
created_at: string
|
||||
updated_at: string
|
||||
knowledge_bases?: KnowledgeBaseInfo[]
|
||||
}
|
||||
|
||||
// 知识库信息接口
|
||||
export interface KnowledgeBaseInfo {
|
||||
id: string
|
||||
name: string
|
||||
description: string
  tenant_id: string
  created_at: string
  updated_at: string
  document_count?: number
  chunk_count?: number
}

// Model info interface
export interface ModelInfo {
  id: string
  name: string
  type: string
  source: string
  description?: string
  is_default?: boolean
  created_at: string
  updated_at: string
}

/**
 * User login
 */
export async function login(data: LoginRequest): Promise<LoginResponse> {
  try {
    const response = await post('/api/v1/auth/login', data)
    return response as unknown as LoginResponse
  } catch (error: any) {
    return {
      success: false,
      message: error.message || '登录失败'
    }
  }
}

/**
 * User registration
 */
export async function register(data: RegisterRequest): Promise<RegisterResponse> {
  try {
    const response = await post('/api/v1/auth/register', data)
    return response as unknown as RegisterResponse
  } catch (error: any) {
    return {
      success: false,
      message: error.message || '注册失败'
    }
  }
}

/**
 * Get the current user's info
 */
export async function getCurrentUser(): Promise<{ success: boolean; data?: { user: UserInfo; tenant: TenantInfo }; message?: string }> {
  try {
    const response = await get('/api/v1/auth/me')
    return response as unknown as { success: boolean; data?: { user: UserInfo; tenant: TenantInfo }; message?: string }
  } catch (error: any) {
    return {
      success: false,
      message: error.message || '获取用户信息失败'
    }
  }
}

/**
 * Get the current tenant's info
 */
export async function getCurrentTenant(): Promise<{ success: boolean; data?: TenantInfo; message?: string }> {
  try {
    const response = await get('/api/v1/auth/tenant')
    return response as unknown as { success: boolean; data?: TenantInfo; message?: string }
  } catch (error: any) {
    return {
      success: false,
      message: error.message || '获取租户信息失败'
    }
  }
}

/**
 * Refresh the access token
 */
export async function refreshToken(refreshToken: string): Promise<{ success: boolean; data?: { token: string; refreshToken: string }; message?: string }> {
  try {
    const response: any = await post('/api/v1/auth/refresh', { refreshToken })
    if (response && response.success) {
      if (response.access_token || response.refresh_token) {
        return {
          success: true,
          data: {
            token: response.access_token,
            refreshToken: response.refresh_token,
          }
        }
      }
    }

    // Otherwise return the original message as-is
    return {
      success: false,
      message: response?.message || '刷新Token失败'
    }
  } catch (error: any) {
    return {
      success: false,
      message: error.message || '刷新Token失败'
    }
  }
}

/**
 * User logout
 */
export async function logout(): Promise<{ success: boolean; message?: string }> {
  try {
    await post('/api/v1/auth/logout', {})
    return {
      success: true
    }
  } catch (error: any) {
    return {
      success: false,
      message: error.message || '登出失败'
    }
  }
}

/**
 * Validate the current token
 */
export async function validateToken(): Promise<{ success: boolean; valid?: boolean; message?: string }> {
  try {
    const response = await get('/api/v1/auth/validate')
    return response as unknown as { success: boolean; valid?: boolean; message?: string }
  } catch (error: any) {
    return {
      success: false,
      valid: false,
      message: error.message || 'Token验证失败'
    }
  }
}
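
For orientation, a minimal sketch of how the login helpers above could be wired together. The request payload fields, the `res.data.token` response shape, and the `@/api/auth` module path are assumptions; the `weknora_token` storage key is the one read elsewhere in this changeset by the stream hook.

// Hypothetical sign-in flow (response field names are assumed, not taken from this diff).
import { login, getCurrentUser } from '@/api/auth'   // assumed module path

async function signIn(username: string, password: string) {
  const res: any = await login({ username, password } as any)
  if (!res?.success) {
    throw new Error(res?.message || 'login failed')
  }
  // Other parts of this changeset read the JWT from localStorage under 'weknora_token'.
  if (res.data?.token) {
    localStorage.setItem('weknora_token', res.data.token)
  }
  return getCurrentUser()
}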
@@ -1,54 +1,24 @@
import { get, post, put, del, postChat } from "../../utils/request";
import { loadTestData } from "../test-data";

// Read settings from localStorage
function getSettings() {
  const settingsStr = localStorage.getItem("WeKnora_settings");
  if (settingsStr) {
    try {
      const settings = JSON.parse(settingsStr);
      if (settings.apiKey && settings.endpoint) {
        return settings;
      }
    } catch (e) {
      console.error("解析设置失败:", e);
    }
  }
  return null;
}

// Decide whether test data needs to be loaded, based on whether settings exist
async function ensureConfigured() {
  const settings = getSettings();
  // If no API key and endpoint are configured, load the test data
  if (!settings) {
    await loadTestData();
  }
}

export async function createSessions(data = {}) {
  await ensureConfigured();
  return post("/api/v1/sessions", data);
}

export async function getSessionsList(page: number, page_size: number) {
  await ensureConfigured();
  return get(`/api/v1/sessions?page=${page}&page_size=${page_size}`);
}

export async function generateSessionsTitle(session_id: string, data: any) {
  await ensureConfigured();
  return post(`/api/v1/sessions/${session_id}/generate_title`, data);
}

export async function knowledgeChat(data: { session_id: string; query: string; }) {
  await ensureConfigured();
  return postChat(`/api/v1/knowledge-chat/${data.session_id}`, { query: data.query });
}

export async function getMessageList(data: { session_id: string; limit: number, created_at: string }) {
  await ensureConfigured();

  if (data.created_at) {
    return get(`/api/v1/messages/${data.session_id}/load?before_time=${encodeURIComponent(data.created_at)}&limit=${data.limit}`);
  } else {
@@ -57,6 +27,5 @@ export async function getMessageList(data: { session_id: string; limit: number,
}

export async function delSession(session_id: string) {
  await ensureConfigured();
  return del(`/api/v1/sessions/${session_id}`);
}
@@ -1,22 +1,8 @@
|
||||
import { fetchEventSource } from '@microsoft/fetch-event-source'
|
||||
import { ref, type Ref, onUnmounted, nextTick } from 'vue'
|
||||
import { generateRandomString } from '@/utils/index';
|
||||
import { getTestData } from '@/utils/request';
|
||||
import { loadTestData } from '@/api/test-data';
|
||||
|
||||
// 从localStorage获取设置
|
||||
function getSettings() {
|
||||
const settingsStr = localStorage.getItem("WeKnora_settings");
|
||||
if (settingsStr) {
|
||||
try {
|
||||
const settings = JSON.parse(settingsStr);
|
||||
return settings;
|
||||
} catch (e) {
|
||||
console.error("解析设置失败:", e);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
interface StreamOptions {
|
||||
// 请求方法 (默认POST)
|
||||
@@ -49,26 +35,15 @@ export function useStream() {
|
||||
isStreaming.value = true;
|
||||
isLoading.value = true;
|
||||
|
||||
// 获取设置信息
|
||||
const settings = getSettings();
|
||||
let apiUrl = '';
|
||||
let apiKey = '';
|
||||
|
||||
// 如果有设置信息,优先使用设置信息
|
||||
if (settings && settings.endpoint && settings.apiKey) {
|
||||
apiUrl = settings.endpoint;
|
||||
apiKey = settings.apiKey;
|
||||
} else {
|
||||
// 否则加载测试数据
|
||||
await loadTestData();
|
||||
const testData = getTestData();
|
||||
if (!testData) {
|
||||
error.value = "测试数据未初始化,无法进行聊天";
|
||||
stopStream();
|
||||
return;
|
||||
}
|
||||
apiUrl = import.meta.env.VITE_IS_DOCKER ? "" : "http://localhost:8080";
|
||||
apiKey = testData.tenant.api_key;
|
||||
// 获取API配置
|
||||
const apiUrl = import.meta.env.VITE_IS_DOCKER ? "" : "http://localhost:8080";
|
||||
|
||||
// 获取JWT Token
|
||||
const token = localStorage.getItem('weknora_token');
|
||||
if (!token) {
|
||||
error.value = "未找到登录令牌,请重新登录";
|
||||
stopStream();
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
@@ -80,7 +55,7 @@ export function useStream() {
|
||||
method: params.method,
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
"X-API-Key": apiKey,
|
||||
"Authorization": `Bearer ${token}`,
|
||||
"X-Request-ID": `${generateRandomString(12)}`,
|
||||
},
|
||||
body:
|
||||
|
||||
@@ -19,6 +19,7 @@ export interface InitializationConfig {
|
||||
modelName: string;
|
||||
baseUrl: string;
|
||||
apiKey?: string;
|
||||
enabled: boolean;
|
||||
};
|
||||
multimodal: {
|
||||
enabled: boolean;
|
||||
@@ -49,6 +50,13 @@ export interface InitializationConfig {
|
||||
};
|
||||
// Frontend-only hint for storage selection UI
|
||||
storageType?: 'cos' | 'minio';
|
||||
nodeExtract: {
|
||||
enabled: boolean,
|
||||
text: string,
|
||||
tags: string[],
|
||||
nodes: Node[],
|
||||
relations: Relation[]
|
||||
}
|
||||
}
|
||||
|
||||
// 下载任务状态类型
|
||||
@@ -62,34 +70,18 @@ export interface DownloadTask {
|
||||
endTime?: string;
|
||||
}
|
||||
|
||||
// 系统初始化状态检查
|
||||
export function checkInitializationStatus(): Promise<{ initialized: boolean }> {
|
||||
// 根据知识库ID执行配置更新
|
||||
export function initializeSystemByKB(kbId: string, config: InitializationConfig): Promise<any> {
|
||||
return new Promise((resolve, reject) => {
|
||||
get('/api/v1/initialization/status')
|
||||
console.log('开始知识库配置更新...', kbId, config);
|
||||
post(`/api/v1/initialization/initialize/${kbId}`, config)
|
||||
.then((response: any) => {
|
||||
resolve(response.data || { initialized: false });
|
||||
})
|
||||
.catch((error: any) => {
|
||||
console.warn('检查初始化状态失败,假设需要初始化:', error);
|
||||
resolve({ initialized: false });
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// 执行系统初始化
|
||||
export function initializeSystem(config: InitializationConfig): Promise<any> {
|
||||
return new Promise((resolve, reject) => {
|
||||
console.log('开始系统初始化...', config);
|
||||
post('/api/v1/initialization/initialize', config)
|
||||
.then((response: any) => {
|
||||
console.log('系统初始化完成', response);
|
||||
// 设置本地初始化状态标记
|
||||
localStorage.setItem('system_initialized', 'true');
|
||||
console.log('知识库配置更新完成', response);
|
||||
resolve(response);
|
||||
})
|
||||
.catch((error: any) => {
|
||||
console.error('系统初始化失败:', error);
|
||||
reject(error);
|
||||
console.error('知识库配置更新失败:', error);
|
||||
reject(error.error || error);
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -178,15 +170,15 @@ export function listDownloadTasks(): Promise<DownloadTask[]> {
|
||||
});
|
||||
}
|
||||
|
||||
// 获取当前系统配置
|
||||
export function getCurrentConfig(): Promise<InitializationConfig & { hasFiles: boolean }> {
|
||||
|
||||
export function getCurrentConfigByKB(kbId: string): Promise<InitializationConfig & { hasFiles: boolean }> {
|
||||
return new Promise((resolve, reject) => {
|
||||
get('/api/v1/initialization/config')
|
||||
get(`/api/v1/initialization/config/${kbId}`)
|
||||
.then((response: any) => {
|
||||
resolve(response.data || {});
|
||||
})
|
||||
.catch((error: any) => {
|
||||
console.error('获取当前配置失败:', error);
|
||||
console.error('获取知识库配置失败:', error);
|
||||
reject(error);
|
||||
});
|
||||
});
|
||||
@@ -311,9 +303,17 @@ export function testMultimodalFunction(testData: {
|
||||
formData.append('chunk_overlap', testData.chunk_overlap.toString());
|
||||
formData.append('separators', JSON.stringify(testData.separators));
|
||||
|
||||
// 获取鉴权Token
|
||||
const token = localStorage.getItem('weknora_token');
|
||||
const headers: Record<string, string> = {};
|
||||
if (token) {
|
||||
headers['Authorization'] = `Bearer ${token}`;
|
||||
}
|
||||
|
||||
// 使用原生fetch因为需要发送FormData
|
||||
fetch('/api/v1/initialization/multimodal/test', {
|
||||
method: 'POST',
|
||||
headers,
|
||||
body: formData
|
||||
})
|
||||
.then(response => response.json())
|
||||
@@ -329,4 +329,93 @@ export function testMultimodalFunction(testData: {
|
||||
reject(error);
|
||||
});
|
||||
});
|
||||
}
|
||||
}

// Text relation extraction API
export interface TextRelationExtractionRequest {
  text: string;
  tags: string[];
  llmConfig: LLMConfig;
}

export interface Node {
  name: string;
  attributes: string[];
}

export interface Relation {
  node1: string;
  node2: string;
  type: string;
}

export interface LLMConfig {
  source: 'local' | 'remote';
  modelName: string;
  baseUrl: string;
  apiKey: string;
}

export interface TextRelationExtractionResponse {
  nodes: Node[];
  relations: Relation[];
}

// Extract relations from text content
export function extractTextRelations(request: TextRelationExtractionRequest): Promise<TextRelationExtractionResponse> {
  return new Promise((resolve, reject) => {
    post('/api/v1/initialization/extract/text-relation', request)
      .then((response: any) => {
        resolve(response.data || { nodes: [], relations: [] });
      })
      .catch((error: any) => {
        console.error('文本内容关系提取失败:', error);
        reject(error);
      });
  });
}

export interface FabriTextRequest {
  tags: string[];
  llmConfig: LLMConfig;
}

export interface FabriTextResponse {
  text: string;
}

// Generate text content
export function fabriText(request: FabriTextRequest): Promise<FabriTextResponse> {
  return new Promise((resolve, reject) => {
    post('/api/v1/initialization/extract/fabri-text', request)
      .then((response: any) => {
        resolve(response.data || { text: '' });
      })
      .catch((error: any) => {
        console.error('文本内容生成失败:', error);
        reject(error);
      });
  });
}

export interface FabriTagRequest {
  llmConfig: LLMConfig;
}

export interface FabriTagResponse {
  tags: string[];
}

// Generate tags
export function fabriTag(request: FabriTagRequest): Promise<FabriTagResponse> {
  return new Promise((resolve, reject) => {
    post('/api/v1/initialization/extract/fabri-tag', request)
      .then((response: any) => {
        resolve(response.data || { tags: [] as string[] });
      })
      .catch((error: any) => {
        console.error('标签生成失败:', error);
        reject(error);
      });
  });
}
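
For orientation, a small, hypothetical call into the extraction API above. The module path, model name, and base URL are placeholders, not values from this diff.

// Hypothetical example: generate tags, then extract a small graph from one paragraph of text.
import { extractTextRelations, fabriTag, type LLMConfig } from '@/api/initialization'  // assumed module path

const llmConfig: LLMConfig = {
  source: 'remote',
  modelName: 'qwen3:8b',                  // placeholder model
  baseUrl: 'http://localhost:11434/v1',   // placeholder endpoint
  apiKey: '',
}

async function buildGraphPreview(text: string) {
  const { tags } = await fabriTag({ llmConfig })
  const { nodes, relations } = await extractTextRelations({ text, tags, llmConfig })
  console.log(`extracted ${nodes.length} nodes and ${relations.length} relations`)
  return { nodes, relations }
}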
@@ -1,62 +1,55 @@
import { get, post, put, del, postUpload, getDown, getTestData } from "../../utils/request";
import { loadTestData } from "../test-data";
import { get, post, put, del, postUpload, getDown } from "../../utils/request";

// Get the knowledge base ID (preferring the one from settings)
async function getKnowledgeBaseID() {
  // Read the knowledge base ID from the settings in localStorage
  const settingsStr = localStorage.getItem("WeKnora_settings");
  let knowledgeBaseId = "";

  if (settingsStr) {
    try {
      const settings = JSON.parse(settingsStr);
      if (settings.knowledgeBaseId) {
        return settings.knowledgeBaseId;
      }
    } catch (e) {
      console.error("解析设置失败:", e);
    }
  }

  // If settings do not contain a knowledge base ID, fall back to the test data
  await loadTestData();

  const testData = getTestData();
  if (!testData || testData.knowledge_bases.length === 0) {
    console.error("测试数据未初始化或不包含知识库");
    throw new Error("测试数据未初始化或不包含知识库");
  }
  return testData.knowledge_bases[0].id;
// Knowledge base management API (list, create, get, update, delete, copy)
export function listKnowledgeBases() {
  return get(`/api/v1/knowledge-bases`);
}

export async function uploadKnowledgeBase(data = {}) {
  const kbId = await getKnowledgeBaseID();
export function createKnowledgeBase(data: { name: string; description?: string; chunking_config?: any }) {
  return post(`/api/v1/knowledge-bases`, data);
}

export function getKnowledgeBaseById(id: string) {
  return get(`/api/v1/knowledge-bases/${id}`);
}

export function updateKnowledgeBase(id: string, data: { name: string; description?: string; config: any }) {
  return put(`/api/v1/knowledge-bases/${id}`, data);
}

export function deleteKnowledgeBase(id: string) {
  return del(`/api/v1/knowledge-bases/${id}`);
}

export function copyKnowledgeBase(data: { source_id: string; target_id?: string }) {
  return post(`/api/v1/knowledge-bases/copy`, data);
}

// Knowledge file API (scoped to a specific knowledge base)
export function uploadKnowledgeFile(kbId: string, data = {}) {
  return postUpload(`/api/v1/knowledge-bases/${kbId}/knowledge/file`, data);
}

export async function getKnowledgeBase({page, page_size}) {
  const kbId = await getKnowledgeBaseID();
  return get(
    `/api/v1/knowledge-bases/${kbId}/knowledge?page=${page}&page_size=${page_size}`
  );
export function listKnowledgeFiles(kbId: string, { page, page_size }: { page: number; page_size: number }) {
  return get(`/api/v1/knowledge-bases/${kbId}/knowledge?page=${page}&page_size=${page_size}`);
}

export function getKnowledgeDetails(id: any) {
export function getKnowledgeDetails(id: string) {
  return get(`/api/v1/knowledge/${id}`);
}

export function delKnowledgeDetails(id: any) {
export function delKnowledgeDetails(id: string) {
  return del(`/api/v1/knowledge/${id}`);
}

export function downKnowledgeDetails(id: any) {
export function downKnowledgeDetails(id: string) {
  return getDown(`/api/v1/knowledge/${id}/download`);
}

export function batchQueryKnowledge(ids: any) {
  return get(`/api/v1/knowledge/batch?${ids}`);
export function batchQueryKnowledge(idsQueryString: string) {
  return get(`/api/v1/knowledge/batch?${idsQueryString}`);
}

export function getKnowledgeDetailsCon(id: any, page) {
export function getKnowledgeDetailsCon(id: string, page: number) {
  return get(`/api/v1/chunks/${id}?page=${page}&page_size=25`);
}
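
For orientation, a hedged sketch of how the reworked, kbId-scoped endpoints above might be used together. The module path, knowledge base name, and `created.data.id` response shape are illustrative assumptions; the `{ file }` upload payload matches the one used by the aside component later in this changeset.

// Hypothetical flow: create a knowledge base, upload one file into it, then page its files.
import { createKnowledgeBase, uploadKnowledgeFile, listKnowledgeFiles } from '@/api/knowledge-base'  // assumed path

async function bootstrapKb(file: File) {
  const created: any = await createKnowledgeBase({ name: 'demo-kb', description: 'example only' })
  const kbId = created?.data?.id             // assumed response shape
  await uploadKnowledgeFile(kbId, { file })  // same { file } payload as the sidebar upload
  return listKnowledgeFiles(kbId, { page: 1, page_size: 35 })
}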
12
frontend/src/api/system/index.ts
Normal file
@@ -0,0 +1,12 @@
import { get } from '@/utils/request'

export interface SystemInfo {
  version: string
  commit_id?: string
  build_time?: string
  go_version?: string
}

export function getSystemInfo(): Promise<{ data: SystemInfo }> {
  return get('/api/v1/system/info')
}
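
A minimal, hypothetical consumer of the new system-info endpoint above (the helper name is illustrative, not part of this changeset):

// Hypothetical: log the backend build info, e.g. for an "About" panel.
import { getSystemInfo } from '@/api/system'

async function logBuildInfo() {
  const { data } = await getSystemInfo()
  console.log(`WeKnora ${data.version}${data.commit_id ? ` (${data.commit_id})` : ''}`)
}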
@@ -1,55 +0,0 @@
|
||||
import { get, setTestData } from '../../utils/request';
|
||||
|
||||
export interface TestDataResponse {
|
||||
success: boolean;
|
||||
data: {
|
||||
tenant: {
|
||||
id: number;
|
||||
name: string;
|
||||
api_key: string;
|
||||
};
|
||||
knowledge_bases: Array<{
|
||||
id: string;
|
||||
name: string;
|
||||
description: string;
|
||||
}>;
|
||||
}
|
||||
}
|
||||
|
||||
// 是否已加载测试数据
|
||||
let isTestDataLoaded = false;
|
||||
|
||||
/**
|
||||
* 加载测试数据
|
||||
* 在API调用前调用此函数以确保测试数据已加载
|
||||
* @returns Promise<boolean> 是否成功加载
|
||||
*/
|
||||
export async function loadTestData(): Promise<boolean> {
|
||||
// 如果已经加载过,直接返回
|
||||
if (isTestDataLoaded) {
|
||||
return true;
|
||||
}
|
||||
|
||||
try {
|
||||
console.log('开始加载测试数据...');
|
||||
const response = await get('/api/v1/test-data');
|
||||
console.log('测试数据', response);
|
||||
|
||||
if (response && response.data) {
|
||||
// 设置测试数据
|
||||
setTestData({
|
||||
tenant: response.data.tenant,
|
||||
knowledge_bases: response.data.knowledge_bases
|
||||
});
|
||||
isTestDataLoaded = true;
|
||||
console.log('测试数据加载成功');
|
||||
return true;
|
||||
} else {
|
||||
console.warn('测试数据响应为空');
|
||||
return false;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('加载测试数据失败:', error);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
6
frontend/src/assets/img/logout.svg
Normal file
@@ -0,0 +1,6 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none">
|
||||
<path d="M10 3H6a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h4" stroke="#000" stroke-opacity="0.6" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
|
||||
<path d="M17 16l4-4-4-4" stroke="#000" stroke-opacity="0.6" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
|
||||
<path d="M21 12H10" stroke="#000" stroke-opacity="0.6" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
|
||||
</svg>
|
||||
|
||||
|
After Width: | Height: | Size: 509 B |
4
frontend/src/assets/img/user-green.svg
Normal file
@@ -0,0 +1,4 @@
|
||||
<svg width="20" height="20" viewBox="0 0 20 20" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
<circle cx="10" cy="6" r="3" stroke="#07C05F" stroke-width="1.5" fill="none"/>
|
||||
<path d="M4 16c0-3.314 2.686-6 6-6s6 2.686 6 6" stroke="#07C05F" stroke-width="1.5" fill="none"/>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 284 B |
4
frontend/src/assets/img/user.svg
Normal file
@@ -0,0 +1,4 @@
|
||||
<svg width="20" height="20" viewBox="0 0 20 20" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
<circle cx="10" cy="6" r="3" stroke="currentColor" stroke-width="1.5" fill="none"/>
|
||||
<path d="M4 16c0-3.314 2.686-6 6-6s6 2.686 6 6" stroke="currentColor" stroke-width="1.5" fill="none"/>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 294 B |
@@ -1,8 +1,11 @@
|
||||
<script setup lang="ts">
|
||||
import { ref, defineEmits, onMounted, defineProps, defineExpose } from "vue";
|
||||
import { useI18n } from 'vue-i18n';
|
||||
import useKnowledgeBase from '@/hooks/useKnowledgeBase';
|
||||
import { onBeforeRouteUpdate } from 'vue-router';
|
||||
import { MessagePlugin } from "tdesign-vue-next";
|
||||
|
||||
const { t } = useI18n();
|
||||
let { cardList, total, getKnowled } = useKnowledgeBase()
|
||||
let query = ref("");
|
||||
const props = defineProps({
|
||||
@@ -17,15 +20,15 @@ onMounted(() => {
|
||||
const emit = defineEmits(['send-msg']);
|
||||
const createSession = (val: string) => {
|
||||
if (!val.trim()) {
|
||||
MessagePlugin.info("请先输入内容!");
|
||||
MessagePlugin.info(t('chat.pleaseEnterContent'));
|
||||
return
|
||||
}
|
||||
if (!query.value && cardList.value.length == 0) {
|
||||
MessagePlugin.info("请先上传知识库!");
|
||||
MessagePlugin.info(t('chat.pleaseUploadKnowledgeBase'));
|
||||
return;
|
||||
}
|
||||
if (props.isReplying) {
|
||||
return MessagePlugin.error("正在回复中,请稍后再试!");
|
||||
return MessagePlugin.error(t('chat.replyingPleaseWait'));
|
||||
}
|
||||
emit('send-msg', val);
|
||||
clearvalue();
|
||||
@@ -50,9 +53,9 @@ onBeforeRouteUpdate((to, from, next) => {
|
||||
</script>
|
||||
<template>
|
||||
<div class="answers-input">
|
||||
<t-textarea v-model="query" placeholder="基于知识库提问" name="description" :autosize="true" @keydown="onKeydown" />
|
||||
<t-textarea v-model="query" :placeholder="t('chat.askKnowledgeBase')" name="description" :autosize="true" @keydown="onKeydown" />
|
||||
<div class="answers-input-source">
|
||||
<span>{{ total }}个来源</span>
|
||||
<span>{{ t('chat.sourcesCount', { count: total }) }}</span>
|
||||
</div>
|
||||
<div @click="createSession(query)" class="answers-input-send"
|
||||
:class="[query.length && total ? '' : 'grey-out']">
|
||||
|
||||
67
frontend/src/components/LanguageSwitcher.vue
Normal file
@@ -0,0 +1,67 @@
<template>
  <div class="language-switcher">
    <t-select
      v-model="selectedLanguage"
      :options="languageOptions"
      @change="handleLanguageChange"
      :popup-props="{ overlayClassName: 'language-select-popup' }"
      size="small"
    >
      <template #prefixIcon>
        <t-icon name="translate" />
      </template>
    </t-select>
  </div>
</template>

<script setup lang="ts">
import { ref, watch } from 'vue'
import { useI18n } from 'vue-i18n'

const { locale } = useI18n()

const languageOptions = [
  { label: '中文', value: 'zh-CN' },
  { label: 'English', value: 'en-US' },
  { label: 'Русский', value: 'ru-RU' }
]

const selectedLanguage = ref(localStorage.getItem('locale') || 'zh-CN')

const handleLanguageChange = (value: string) => {
  console.log('Язык изменен на:', value)
  if (value && ['ru-RU', 'en-US', 'zh-CN'].includes(value)) {
    locale.value = value
    localStorage.setItem('locale', value)
    // Reload the page so the new language takes effect
    setTimeout(() => {
      window.location.reload()
    }, 100)
  }
}

// Keep the selection in sync with i18n on initialization
watch(() => locale.value, (newLocale) => {
  if (selectedLanguage.value !== newLocale) {
    selectedLanguage.value = newLocale
  }
}, { immediate: true })
</script>

<style lang="less" scoped>
.language-switcher {
  .t-button {
    color: #666;
    font-size: 14px;

    &:hover {
      color: #333;
      background-color: rgba(0, 0, 0, 0.04);
    }
  }

  .t-icon {
    margin-right: 4px;
  }
}
</style>
@@ -4,6 +4,8 @@ import { onMounted, ref, nextTick, onUnmounted, onUpdated, watch } from "vue";
|
||||
import { downKnowledgeDetails } from "@/api/knowledge-base/index";
|
||||
import { MessagePlugin } from "tdesign-vue-next";
|
||||
import picturePreview from '@/components/picture-preview.vue';
|
||||
import { sanitizeHTML, safeMarkdownToHTML, createSafeImage, isValidImageURL } from '@/utils/security';
|
||||
|
||||
marked.use({
|
||||
mangle: false,
|
||||
headerIds: false,
|
||||
@@ -37,10 +39,16 @@ const checkImage = (url) => {
|
||||
});
|
||||
};
|
||||
renderer.image = function (href, title, text) {
|
||||
// 自定义HTML结构,图片展示带标题
|
||||
// 安全地处理图片链接
|
||||
if (!isValidImageURL(href)) {
|
||||
return `<p>无效的图片链接</p>`;
|
||||
}
|
||||
|
||||
// 使用安全的图片创建函数
|
||||
const safeImage = createSafeImage(href, text || '', title || '');
|
||||
return `<figure>
|
||||
<img class="markdown-image" src="${href}" alt="${title}" title="${text}">
|
||||
<figcaption style="text-align: left;">${text}</figcaption>
|
||||
${safeImage}
|
||||
<figcaption style="text-align: left;">${text || ''}</figcaption>
|
||||
</figure>`;
|
||||
};
|
||||
const props = defineProps(["visible", "details"]);
|
||||
@@ -66,14 +74,23 @@ watch(() => props.details.md, (newVal) => {
|
||||
deep: true
|
||||
})
|
||||
|
||||
// 处理 Markdown 中的图片
|
||||
// 安全地处理 Markdown 内容
|
||||
const processMarkdown = (markdownText) => {
|
||||
// 自定义渲染器处理图片
|
||||
if (!markdownText || typeof markdownText !== 'string') {
|
||||
return '';
|
||||
}
|
||||
|
||||
// 首先对 Markdown 内容进行安全处理
|
||||
const safeMarkdown = safeMarkdownToHTML(markdownText);
|
||||
|
||||
// 使用安全的渲染器
|
||||
marked.use({ renderer });
|
||||
let html = marked.parse(markdownText);
|
||||
const parser = new DOMParser();
|
||||
const doc = parser.parseFromString(html, 'text/html');
|
||||
return doc.body.innerHTML;
|
||||
let html = marked.parse(safeMarkdown);
|
||||
|
||||
// 使用 DOMPurify 进行最终的安全清理
|
||||
const sanitizedHTML = sanitizeHTML(html);
|
||||
|
||||
return sanitizedHTML;
|
||||
};
|
||||
const closePreImg = () => {
|
||||
reviewImg.value = false
|
||||
@@ -87,15 +104,19 @@ const downloadFile = () => {
|
||||
downKnowledgeDetails(props.details.id)
|
||||
.then((result) => {
|
||||
if (result) {
|
||||
if (url.value) {
|
||||
URL.revokeObjectURL(url.value);
|
||||
}
|
||||
url.value = URL.createObjectURL(result);
|
||||
down.value.click();
|
||||
// const link = document.createElement("a");
|
||||
// link.style.display = "none";
|
||||
// link.setAttribute("href", url);
|
||||
// link.setAttribute("download", props.details.title);
|
||||
// link.click();
|
||||
// document.body.removeChild(link);
|
||||
window.URL.revokeObjectURL(url);
|
||||
const link = document.createElement("a");
|
||||
link.style.display = "none";
|
||||
link.setAttribute("href", url.value);
|
||||
link.setAttribute("download", props.details.title);
|
||||
link.click();
|
||||
nextTick(() => {
|
||||
document.body.removeChild(link);
|
||||
URL.revokeObjectURL(url.value);
|
||||
})
|
||||
}
|
||||
})
|
||||
.catch((err) => {
|
||||
|
||||
@@ -1,11 +1,14 @@
<script setup lang="ts">
import { useI18n } from 'vue-i18n'

const { t } = useI18n()
</script>
<template>
  <div class="empty">
    <img class="empty-img" src="@/assets/img/upload.svg" alt="">
    <span class="empty-txt">知识为空,拖放上传</span>
    <span class="empty-type-txt">pdf、doc 格式文件,不超过10M</span>
    <span class="empty-type-txt">text、markdown格式文件,不超过200K</span>
    <span class="empty-txt">{{ t('knowledgeBase.emptyKnowledgeDragDrop') }}</span>
    <span class="empty-type-txt">{{ t('knowledgeBase.pdfDocFormat') }}</span>
    <span class="empty-type-txt">{{ t('knowledgeBase.textMarkdownFormat') }}</span>
  </div>
</template>
<style scoped lang="less">
@@ -1,68 +1,134 @@
|
||||
<template>
|
||||
<div class="aside_box">
|
||||
<div class="logo_box">
|
||||
<div class="logo_box" @click="router.push('/platform/knowledge-bases')" style="cursor: pointer;">
|
||||
<img class="logo" src="@/assets/img/weknora.png" alt="">
|
||||
</div>
|
||||
<div class="menu_box" v-for="(item, index) in menuArr" :key="index">
|
||||
<div @click="gotopage(item.path)"
|
||||
@mouseenter="mouseenteMenu(item.path)" @mouseleave="mouseleaveMenu(item.path)"
|
||||
:class="['menu_item', item.childrenPath && item.childrenPath == currentpath ? 'menu_item_c_active' : item.path == currentpath ? 'menu_item_active' : '']">
|
||||
<div class="menu_item-box">
|
||||
<div class="menu_icon">
|
||||
<img class="icon" :src="getImgSrc(item.icon == 'zhishiku' ? knowledgeIcon : item.icon == 'setting' ? settingIcon : prefixIcon)" alt="">
|
||||
|
||||
<!-- 上半部分:知识库和对话 -->
|
||||
<div class="menu_top">
|
||||
<div class="menu_box" :class="{ 'has-submenu': item.children }" v-for="(item, index) in topMenuItems" :key="index">
|
||||
<div @click="handleMenuClick(item.path)"
|
||||
@mouseenter="mouseenteMenu(item.path)" @mouseleave="mouseleaveMenu(item.path)"
|
||||
:class="['menu_item', item.childrenPath && item.childrenPath == currentpath ? 'menu_item_c_active' : isMenuItemActive(item.path) ? 'menu_item_active' : '']">
|
||||
<div class="menu_item-box">
|
||||
<div class="menu_icon">
|
||||
<img class="icon" :src="getImgSrc(item.icon == 'zhishiku' ? knowledgeIcon : item.icon == 'logout' ? logoutIcon : item.icon == 'tenant' ? tenantIcon : prefixIcon)" alt="">
|
||||
</div>
|
||||
<span class="menu_title" :title="item.path === 'knowledge-bases' && kbMenuItem?.title ? kbMenuItem.title : t(item.titleKey)">{{ item.path === 'knowledge-bases' && kbMenuItem?.title ? kbMenuItem.title : t(item.titleKey) }}</span>
|
||||
<!-- 知识库切换下拉箭头 -->
|
||||
<div v-if="item.path === 'knowledge-bases' && isInKnowledgeBase"
|
||||
class="kb-dropdown-icon"
|
||||
:class="{
|
||||
'rotate-180': showKbDropdown,
|
||||
'active': isMenuItemActive(item.path)
|
||||
}"
|
||||
@click.stop="toggleKbDropdown">
|
||||
<svg width="12" height="12" viewBox="0 0 12 12" fill="currentColor">
|
||||
<path d="M2.5 4.5L6 8L9.5 4.5H2.5Z"/>
|
||||
</svg>
|
||||
</div>
|
||||
</div>
|
||||
<span class="menu_title">{{ item.title }}</span>
|
||||
<!-- 知识库切换下拉菜单 -->
|
||||
<div v-if="item.path === 'knowledge-bases' && showKbDropdown && isInKnowledgeBase"
|
||||
class="kb-dropdown-menu">
|
||||
<div v-for="kb in initializedKnowledgeBases"
|
||||
:key="kb.id"
|
||||
class="kb-dropdown-item"
|
||||
:class="{ 'active': kb.name === currentKbName }"
|
||||
@click.stop="switchKnowledgeBase(kb.id)">
|
||||
{{ kb.name }}
|
||||
</div>
|
||||
</div>
|
||||
<t-popup overlayInnerClassName="upload-popup" class="placement top center" :content="t('menu.uploadKnowledge')"
|
||||
placement="top" show-arrow destroy-on-close>
|
||||
<div class="upload-file-wrap" @click.stop="uploadFile" variant="outline"
|
||||
v-if="item.path === 'knowledge-bases' && $route.name === 'knowledgeBaseDetail'">
|
||||
<img class="upload-file-icon" :class="[item.path == currentpath ? 'active-upload' : '']"
|
||||
:src="getImgSrc(fileAddIcon)" alt="">
|
||||
</div>
|
||||
</t-popup>
|
||||
</div>
|
||||
<t-popup overlayInnerClassName="upload-popup" class="placement top center" content="上传知识"
|
||||
placement="top" show-arrow destroy-on-close>
|
||||
<div class="upload-file-wrap" @click="uploadFile" variant="outline"
|
||||
v-if="item.path == 'knowledgeBase'">
|
||||
<img class="upload-file-icon" :class="[item.path == currentpath ? 'active-upload' : '']"
|
||||
:src="getImgSrc(fileAddIcon)" alt="">
|
||||
</div>
|
||||
</t-popup>
|
||||
</div>
|
||||
<div ref="submenuscrollContainer" @scroll="handleScroll" class="submenu" v-if="item.children">
|
||||
<div class="submenu_item_p" v-for="(subitem, subindex) in item.children" :key="subindex"
|
||||
@click="gotopage(subitem.path)">
|
||||
<div :class="['submenu_item', currentSecondpath == subitem.path ? 'submenu_item_active' : '']"
|
||||
@mouseenter="mouseenteBotDownr(subindex)" @mouseleave="mouseleaveBotDown">
|
||||
<i v-if="currentSecondpath == subitem.path" class="dot"></i>
|
||||
<span class="submenu_title"
|
||||
:style="currentSecondpath == subitem.path ? 'margin-left:14px;max-width:160px;' : 'margin-left:18px;max-width:173px;'">
|
||||
{{ subitem.title }}
|
||||
</span>
|
||||
<t-popup v-model:visible="subitem.isMore" @overlay-click="delCard(subindex, subitem)"
|
||||
@visible-change="onVisibleChange" overlayClassName="del-menu-popup" trigger="click"
|
||||
destroy-on-close placement="top-left">
|
||||
<div v-if="(activeSubmenu == subindex) || (currentSecondpath == subitem.path) || subitem.isMore"
|
||||
@click.stop="openMore(subindex)" variant="outline" class="menu-more-wrap">
|
||||
<t-icon name="ellipsis" class="menu-more" />
|
||||
</div>
|
||||
<template #content>
|
||||
<span class="del_submenu">删除记录</span>
|
||||
</template>
|
||||
</t-popup>
|
||||
<div ref="submenuscrollContainer" @scroll="handleScroll" class="submenu" v-if="item.children">
|
||||
<div class="submenu_item_p" v-for="(subitem, subindex) in item.children" :key="subindex"
|
||||
@click="gotopage(subitem.path)">
|
||||
<div :class="['submenu_item', currentSecondpath == subitem.path ? 'submenu_item_active' : '']"
|
||||
@mouseenter="mouseenteBotDownr(subindex)" @mouseleave="mouseleaveBotDown">
|
||||
<i v-if="currentSecondpath == subitem.path" class="dot"></i>
|
||||
<span class="submenu_title"
|
||||
:style="currentSecondpath == subitem.path ? 'margin-left:14px;max-width:160px;' : 'margin-left:18px;max-width:173px;'">
|
||||
{{ subitem.title }}
|
||||
</span>
|
||||
<t-popup v-model:visible="subitem.isMore" @overlay-click="delCard(subindex, subitem)"
|
||||
@visible-change="onVisibleChange" overlayClassName="del-menu-popup" trigger="click"
|
||||
destroy-on-close placement="top-left">
|
||||
<div v-if="(activeSubmenu == subindex) || (currentSecondpath == subitem.path) || subitem.isMore"
|
||||
@click.stop="openMore(subindex)" variant="outline" class="menu-more-wrap">
|
||||
<t-icon name="ellipsis" class="menu-more" />
|
||||
</div>
|
||||
<template #content>
|
||||
<span class="del_submenu">{{ t('menu.deleteRecord') }}</span>
|
||||
</template>
|
||||
</t-popup>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- 下半部分:账户信息、系统设置、退出登录 -->
|
||||
<div class="menu_bottom">
|
||||
<div class="menu_box" v-for="(item, index) in bottomMenuItems" :key="'bottom-' + index">
|
||||
<div v-if="item.path === 'logout'">
|
||||
<t-popconfirm
|
||||
:content="t('menu.confirmLogout')"
|
||||
@confirm="handleLogout"
|
||||
placement="top"
|
||||
:show-arrow="true"
|
||||
>
|
||||
<div @mouseenter="mouseenteMenu(item.path)" @mouseleave="mouseleaveMenu(item.path)"
|
||||
:class="['menu_item', 'logout-item']">
|
||||
<div class="menu_item-box">
|
||||
<div class="menu_icon">
|
||||
<img class="icon" :src="getImgSrc(logoutIcon)" alt="">
|
||||
</div>
|
||||
<span class="menu_title">{{ t(item.titleKey) }}</span>
|
||||
</div>
|
||||
</div>
|
||||
</t-popconfirm>
|
||||
</div>
|
||||
<div v-else @click="handleMenuClick(item.path)"
|
||||
@mouseenter="mouseenteMenu(item.path)" @mouseleave="mouseleaveMenu(item.path)"
|
||||
:class="['menu_item', item.childrenPath && item.childrenPath == currentpath ? 'menu_item_c_active' : (item.path == currentpath) ? 'menu_item_active' : '']">
|
||||
<div class="menu_item-box">
|
||||
<div class="menu_icon">
|
||||
<img class="icon" :src="getImgSrc(item.icon == 'zhishiku' ? knowledgeIcon : item.icon == 'tenant' ? tenantIcon : prefixIcon)" alt="">
|
||||
</div>
|
||||
<span class="menu_title">{{ item.path === 'knowledge-bases' && kbMenuItem?.title ? kbMenuItem.title : t(item.titleKey) }}</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<input type="file" @change="upload" style="display: none" ref="uploadInput"
|
||||
accept=".pdf,.docx,.doc,.txt,.md,.jpg,.jpeg,.png" />
|
||||
accept=".pdf,.docx,.doc,.txt,.md,.jpg,.jpeg,.png,.csv,.xls,.xlsx" />
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup>
|
||||
<script setup lang="ts">
|
||||
import { storeToRefs } from 'pinia';
|
||||
import { onMounted, watch, computed, ref, reactive } from 'vue';
|
||||
import { onMounted, watch, computed, ref, reactive, nextTick } from 'vue';
|
||||
import { useRoute, useRouter } from 'vue-router';
|
||||
import { useI18n } from 'vue-i18n';
|
||||
import { getSessionsList, delSession } from "@/api/chat/index";
|
||||
import { getKnowledgeBaseById, listKnowledgeBases, uploadKnowledgeFile } from '@/api/knowledge-base';
|
||||
import { kbFileTypeVerification } from '@/utils/index';
|
||||
import { useMenuStore } from '@/stores/menu';
|
||||
import useKnowledgeBase from '@/hooks/useKnowledgeBase';
|
||||
import { useAuthStore } from '@/stores/auth';
|
||||
import { MessagePlugin } from "tdesign-vue-next";
|
||||
let { requestMethod } = useKnowledgeBase()
|
||||
const { t } = useI18n();
|
||||
let uploadInput = ref();
|
||||
const usemenuStore = useMenuStore();
|
||||
const authStore = useAuthStore();
|
||||
const route = useRoute();
|
||||
const router = useRouter();
|
||||
const currentpath = ref('');
|
||||
@@ -74,39 +140,206 @@ const submenuscrollContainer = ref(null);
|
||||
// 计算总页数
|
||||
const totalPages = computed(() => Math.ceil(total.value / page_size.value));
|
||||
const hasMore = computed(() => currentPage.value < totalPages.value);
|
||||
type MenuItem = { title: string; icon: string; path: string; childrenPath?: string; children?: any[] };
|
||||
const { menuArr } = storeToRefs(usemenuStore);
|
||||
let activeSubmenu = ref(-1);
|
||||
let activeSubmenu = ref<number>(-1);
|
||||
|
||||
// 是否处于知识库详情页
|
||||
const isInKnowledgeBase = computed<boolean>(() => {
|
||||
return route.name === 'knowledgeBaseDetail' ||
|
||||
route.name === 'kbCreatChat' ||
|
||||
route.name === 'chat' ||
|
||||
route.name === 'knowledgeBaseSettings';
|
||||
});
|
||||
|
||||
// 统一的菜单项激活状态判断
|
||||
const isMenuItemActive = (itemPath: string): boolean => {
|
||||
const currentRoute = route.name;
|
||||
|
||||
switch (itemPath) {
|
||||
case 'knowledge-bases':
|
||||
return currentRoute === 'knowledgeBaseList' ||
|
||||
currentRoute === 'knowledgeBaseDetail' ||
|
||||
currentRoute === 'knowledgeBaseSettings';
|
||||
case 'creatChat':
|
||||
return currentRoute === 'kbCreatChat';
|
||||
case 'tenant':
|
||||
return currentRoute === 'tenant';
|
||||
default:
|
||||
return itemPath === currentpath.value;
|
||||
}
|
||||
};
|
||||
|
||||
// 统一的图标激活状态判断
|
||||
const getIconActiveState = (itemPath: string) => {
|
||||
const currentRoute = route.name;
|
||||
|
||||
return {
|
||||
isKbActive: itemPath === 'knowledge-bases' && (
|
||||
currentRoute === 'knowledgeBaseList' ||
|
||||
currentRoute === 'knowledgeBaseDetail' ||
|
||||
currentRoute === 'knowledgeBaseSettings'
|
||||
),
|
||||
isCreatChatActive: itemPath === 'creatChat' && currentRoute === 'kbCreatChat',
|
||||
isTenantActive: itemPath === 'tenant' && currentRoute === 'tenant',
|
||||
isChatActive: itemPath === 'chat' && currentRoute === 'chat'
|
||||
};
|
||||
};
|
||||
|
||||
// 分离上下两部分菜单
|
||||
const topMenuItems = computed<MenuItem[]>(() => {
|
||||
return (menuArr.value as unknown as MenuItem[]).filter((item: MenuItem) =>
|
||||
item.path === 'knowledge-bases' || (isInKnowledgeBase.value && item.path === 'creatChat')
|
||||
);
|
||||
});
|
||||
|
||||
const bottomMenuItems = computed<MenuItem[]>(() => {
|
||||
return (menuArr.value as unknown as MenuItem[]).filter((item: MenuItem) => {
|
||||
if (item.path === 'knowledge-bases' || item.path === 'creatChat') {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
});
|
||||
});
|
||||
|
||||
// 当前知识库名称和列表
|
||||
const currentKbName = ref<string>('')
|
||||
const allKnowledgeBases = ref<Array<{ id: string; name: string; embedding_model_id?: string; summary_model_id?: string }>>([])
|
||||
const showKbDropdown = ref<boolean>(false)
|
||||
|
||||
// 过滤已初始化的知识库
|
||||
const initializedKnowledgeBases = computed(() => {
|
||||
return allKnowledgeBases.value.filter(kb =>
|
||||
kb.embedding_model_id && kb.embedding_model_id !== '' &&
|
||||
kb.summary_model_id && kb.summary_model_id !== ''
|
||||
)
|
||||
})
|
||||
|
||||
// 动态更新知识库菜单项标题
|
||||
const kbMenuItem = computed(() => {
|
||||
const kbItem = topMenuItems.value.find(item => item.path === 'knowledge-bases')
|
||||
if (kbItem && isInKnowledgeBase.value && currentKbName.value) {
|
||||
return { ...kbItem, title: currentKbName.value }
|
||||
}
|
||||
return kbItem
|
||||
})
|
||||
|
||||
const loading = ref(false)
|
||||
const uploadFile = () => {
|
||||
const uploadFile = async () => {
|
||||
// 获取当前知识库ID
|
||||
const currentKbId = await getCurrentKbId();
|
||||
|
||||
// 检查当前知识库的初始化状态
|
||||
if (currentKbId) {
|
||||
try {
|
||||
const kbResponse = await getKnowledgeBaseById(currentKbId);
|
||||
const kb = kbResponse.data;
|
||||
|
||||
// 检查知识库是否已初始化(有 EmbeddingModelID 和 SummaryModelID)
|
||||
if (!kb.embedding_model_id || kb.embedding_model_id === '' ||
|
||||
!kb.summary_model_id || kb.summary_model_id === '') {
|
||||
MessagePlugin.warning(t('knowledgeBase.notInitialized'));
|
||||
return;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('获取知识库信息失败:', error);
|
||||
MessagePlugin.error(t('knowledgeBase.getInfoFailed'));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
uploadInput.value.click()
|
||||
}
|
||||
const upload = (e) => {
|
||||
requestMethod(e.target.files[0], uploadInput)
|
||||
const upload = async (e: any) => {
|
||||
const file = e.target.files[0];
|
||||
if (!file) return;
|
||||
|
||||
// 文件类型验证
|
||||
if (kbFileTypeVerification(file)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// 获取当前知识库ID
|
||||
const currentKbId = (route.params as any)?.kbId as string;
|
||||
if (!currentKbId) {
|
||||
MessagePlugin.error(t('knowledgeBase.missingId'));
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const result = await uploadKnowledgeFile(currentKbId, { file });
|
||||
const responseData = result as any;
|
||||
console.log('上传API返回结果:', responseData);
|
||||
|
||||
// 如果没有抛出异常,就认为上传成功,先触发刷新事件
|
||||
console.log('文件上传完成,发送事件通知页面刷新,知识库ID:', currentKbId);
|
||||
window.dispatchEvent(new CustomEvent('knowledgeFileUploaded', {
|
||||
detail: { kbId: currentKbId }
|
||||
}));
|
||||
|
||||
// 然后处理UI消息
|
||||
// 判断上传是否成功 - 检查多种可能的成功标识
|
||||
const isSuccess = responseData.success || responseData.code === 200 || responseData.status === 'success' || (!responseData.error && responseData);
|
||||
|
||||
if (isSuccess) {
|
||||
MessagePlugin.info(t('file.uploadSuccess'));
|
||||
} else {
|
||||
// 改进错误信息提取逻辑
|
||||
let errorMessage = t('file.uploadFailed');
|
||||
if (responseData.error && responseData.error.message) {
|
||||
errorMessage = responseData.error.message;
|
||||
} else if (responseData.message) {
|
||||
errorMessage = responseData.message;
|
||||
}
|
||||
if (responseData.code === 'duplicate_file' || (responseData.error && responseData.error.code === 'duplicate_file')) {
|
||||
errorMessage = t('file.fileExists');
|
||||
}
|
||||
MessagePlugin.error(errorMessage);
|
||||
}
|
||||
} catch (err: any) {
|
||||
let errorMessage = t('file.uploadFailed');
|
||||
if (err.code === 'duplicate_file') {
|
||||
errorMessage = t('file.fileExists');
|
||||
} else if (err.error && err.error.message) {
|
||||
errorMessage = err.error.message;
|
||||
} else if (err.message) {
|
||||
errorMessage = err.message;
|
||||
}
|
||||
MessagePlugin.error(errorMessage);
|
||||
} finally {
|
||||
uploadInput.value.value = "";
|
||||
}
|
||||
}
|
||||
const mouseenteBotDownr = (val) => {
|
||||
const mouseenteBotDownr = (val: number) => {
|
||||
activeSubmenu.value = val;
|
||||
}
|
||||
const mouseleaveBotDown = () => {
|
||||
activeSubmenu.value = -1;
|
||||
}
|
||||
const onVisibleChange = (e) => {
|
||||
const onVisibleChange = (_e: any) => {
|
||||
}
|
||||
|
||||
const delCard = (index, item) => {
|
||||
delSession(item.id).then(res => {
|
||||
if (res && res.success) {
|
||||
menuArr.value[1].children.splice(index, 1);
|
||||
const delCard = (index: number, item: any) => {
|
||||
delSession(item.id).then((res: any) => {
|
||||
if (res && (res as any).success) {
|
||||
(menuArr.value as any[])[1]?.children?.splice(index, 1);
|
||||
if (item.id == route.params.chatid) {
|
||||
router.push('/platform/creatChat');
|
||||
// 删除当前会话后,跳转到当前知识库的创建聊天页面
|
||||
const kbId = route.params.kbId;
|
||||
if (kbId) {
|
||||
router.push(`/platform/knowledge-bases/${kbId}/creatChat`);
|
||||
} else {
|
||||
router.push('/platform/knowledge-bases');
|
||||
}
|
||||
}
|
||||
} else {
|
||||
MessagePlugin.error("删除失败,请稍后再试!");
|
||||
MessagePlugin.error(t('knowledgeBase.deleteFailed'));
|
||||
}
|
||||
})
|
||||
}
|
||||
const debounce = (fn, delay) => {
|
||||
let timer
|
||||
return (...args) => {
|
||||
const debounce = (fn: (...args: any[]) => void, delay: number) => {
|
||||
let timer: ReturnType<typeof setTimeout>
|
||||
return (...args: any[]) => {
|
||||
clearTimeout(timer)
|
||||
timer = setTimeout(() => fn(...args), delay)
|
||||
}
|
||||
@@ -124,80 +357,221 @@ const checkScrollBottom = () => {
|
||||
}
|
||||
}
|
||||
const handleScroll = debounce(checkScrollBottom, 200)
|
||||
const getMessageList = () => {
|
||||
const getMessageList = async () => {
|
||||
// 仅在知识库内部显示对话列表
|
||||
if (!isInKnowledgeBase.value) {
|
||||
usemenuStore.clearMenuArr();
|
||||
currentKbName.value = '';
|
||||
return;
|
||||
}
|
||||
let kbId = (route.params as any)?.kbId as string
|
||||
// 新的路由格式:/platform/chat/:kbId/:chatid,直接从路由参数获取知识库ID
|
||||
if (!kbId) {
|
||||
usemenuStore.clearMenuArr();
|
||||
currentKbName.value = '';
|
||||
return;
|
||||
}
|
||||
|
||||
// 获取知识库名称和所有知识库列表
|
||||
try {
|
||||
const [kbRes, allKbRes]: any[] = await Promise.all([
|
||||
getKnowledgeBaseById(kbId),
|
||||
listKnowledgeBases()
|
||||
])
|
||||
if (kbRes?.data?.name) {
|
||||
currentKbName.value = kbRes.data.name
|
||||
}
|
||||
if (allKbRes?.data) {
|
||||
allKnowledgeBases.value = allKbRes.data
|
||||
}
|
||||
} catch {}
|
||||
|
||||
if (loading.value) return;
|
||||
loading.value = true;
|
||||
usemenuStore.clearMenuArr();
|
||||
getSessionsList(currentPage.value, page_size.value).then(res => {
|
||||
getSessionsList(currentPage.value, page_size.value).then((res: any) => {
|
||||
if (res.data && res.data.length) {
|
||||
res.data.forEach(item => {
|
||||
let obj = { title: item.title ? item.title : "新会话", path: `chat/${item.id}`, id: item.id, isMore: false, isNoTitle: item.title ? false : true }
|
||||
// 过滤出当前知识库的会话
|
||||
const filtered = res.data.filter((s: any) => s.knowledge_base_id === kbId)
|
||||
filtered.forEach((item: any) => {
|
||||
let obj = { title: item.title ? item.title : t('menu.newSession'), path: `chat/${kbId}/${item.id}`, id: item.id, isMore: false, isNoTitle: item.title ? false : true }
|
||||
usemenuStore.updatemenuArr(obj)
|
||||
});
|
||||
loading.value = false;
|
||||
}
|
||||
if (res.total) {
|
||||
total.value = res.total;
|
||||
if ((res as any).total) {
|
||||
total.value = (res as any).total;
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
const openMore = (e) => { }
|
||||
const openMore = (_e: any) => { }
|
||||
onMounted(() => {
|
||||
currentpath.value = route.name;
|
||||
if (route.params.chatid) {
|
||||
currentSecondpath.value = `${route.name}/${route.params.chatid}`;
|
||||
const routeName = typeof route.name === 'string' ? route.name : (route.name ? String(route.name) : '')
|
||||
currentpath.value = routeName;
|
||||
if (route.params.chatid && route.params.kbId) {
|
||||
currentSecondpath.value = `chat/${route.params.kbId}/${route.params.chatid}`;
|
||||
}
|
||||
getMessageList();
|
||||
});
|
||||
|
||||
watch([() => route.name, () => route.params], (newvalue) => {
|
||||
currentpath.value = newvalue[0];
|
||||
if (newvalue[1].chatid) {
|
||||
currentSecondpath.value = `${newvalue[0]}/${newvalue[1].chatid}`;
|
||||
const nameStr = typeof newvalue[0] === 'string' ? (newvalue[0] as string) : (newvalue[0] ? String(newvalue[0]) : '')
|
||||
currentpath.value = nameStr;
|
||||
if (newvalue[1].chatid && newvalue[1].kbId) {
|
||||
currentSecondpath.value = `chat/${newvalue[1].kbId}/${newvalue[1].chatid}`;
|
||||
} else {
|
||||
currentSecondpath.value = "";
|
||||
}
|
||||
|
||||
// 路由变化时刷新对话列表(仅在知识库内部)
|
||||
getMessageList();
|
||||
// 路由变化时更新图标状态
|
||||
getIcon(nameStr);
|
||||
});
|
||||
let fileAddIcon = ref('file-add-green.svg');
|
||||
let knowledgeIcon = ref('zhishiku-green.svg');
|
||||
let prefixIcon = ref('prefixIcon.svg');
|
||||
let settingIcon = ref('setting.svg');
|
||||
let logoutIcon = ref('logout.svg');
|
||||
let tenantIcon = ref('user.svg'); // 使用专门的用户图标
|
||||
let pathPrefix = ref(route.name)
|
||||
const getIcon = (path) => {
|
||||
fileAddIcon.value = path == 'knowledgeBase' ? 'file-add-green.svg' : 'file-add.svg';
|
||||
knowledgeIcon.value = path == 'knowledgeBase' ? 'zhishiku-green.svg' : 'zhishiku.svg';
|
||||
prefixIcon.value = path == 'creatChat' ? 'prefixIcon-green.svg' : path == 'knowledgeBase' ? 'prefixIcon-grey.svg' : 'prefixIcon.svg';
|
||||
settingIcon.value = path == 'settings' ? 'setting-green.svg' : 'setting.svg';
|
||||
const getIcon = (path: string) => {
|
||||
// 根据当前路由状态更新所有图标
|
||||
const kbActiveState = getIconActiveState('knowledge-bases');
|
||||
const creatChatActiveState = getIconActiveState('creatChat');
|
||||
const tenantActiveState = getIconActiveState('tenant');
|
||||
|
||||
// 上传图标:只在知识库相关页面显示绿色
|
||||
fileAddIcon.value = kbActiveState.isKbActive ? 'file-add-green.svg' : 'file-add.svg';
|
||||
|
||||
// 知识库图标:只在知识库页面显示绿色
|
||||
knowledgeIcon.value = kbActiveState.isKbActive ? 'zhishiku-green.svg' : 'zhishiku.svg';
|
||||
|
||||
// 对话图标:只在对话创建页面显示绿色,在知识库页面显示灰色,其他情况显示默认
|
||||
prefixIcon.value = creatChatActiveState.isCreatChatActive ? 'prefixIcon-green.svg' :
|
||||
kbActiveState.isKbActive ? 'prefixIcon-grey.svg' :
|
||||
'prefixIcon.svg';
|
||||
|
||||
// 租户图标:只在租户页面显示绿色
|
||||
tenantIcon.value = tenantActiveState.isTenantActive ? 'user-green.svg' : 'user.svg';
|
||||
|
||||
// 退出图标:始终显示默认
|
||||
logoutIcon.value = 'logout.svg';
|
||||
}
|
||||
getIcon(route.name)
|
||||
const gotopage = (path) => {
|
||||
pathPrefix.value = path;
|
||||
// 如果是系统设置,跳转到初始化配置页面
|
||||
if (path === 'settings') {
|
||||
router.push('/initialization');
|
||||
getIcon(typeof route.name === 'string' ? route.name as string : (route.name ? String(route.name) : ''))
|
||||
const handleMenuClick = async (path: string) => {
|
||||
if (path === 'knowledge-bases') {
|
||||
// 知识库菜单项:如果在知识库内部,跳转到当前知识库文件页;否则跳转到知识库列表
|
||||
const kbId = await getCurrentKbId()
|
||||
if (kbId) {
|
||||
router.push(`/platform/knowledge-bases/${kbId}`)
|
||||
} else {
|
||||
router.push('/platform/knowledge-bases')
|
||||
}
|
||||
} else {
|
||||
router.push(`/platform/${path}`);
|
||||
gotopage(path)
|
||||
}
|
||||
}
|
||||
|
||||
// 处理退出登录确认
|
||||
const handleLogout = () => {
|
||||
gotopage('logout')
|
||||
}
|
||||
|
||||
const getCurrentKbId = async (): Promise<string | null> => {
|
||||
let kbId = (route.params as any)?.kbId as string
|
||||
// 新的路由格式:/platform/chat/:kbId/:chatid,直接从路由参数获取
|
||||
if (!kbId && route.name === 'chat' && (route.params as any)?.kbId) {
|
||||
kbId = (route.params as any).kbId
|
||||
}
|
||||
return kbId || null
|
||||
}
|
||||
|
||||
const gotopage = async (path: string) => {
|
||||
pathPrefix.value = path;
|
||||
// 处理退出登录
|
||||
if (path === 'logout') {
|
||||
authStore.logout();
|
||||
router.push('/login');
|
||||
return;
|
||||
} else {
|
||||
if (path === 'creatChat') {
|
||||
const kbId = await getCurrentKbId()
|
||||
if (kbId) {
|
||||
router.push(`/platform/knowledge-bases/${kbId}/creatChat`)
|
||||
} else {
|
||||
router.push(`/platform/knowledge-bases`)
|
||||
}
|
||||
} else {
|
||||
router.push(`/platform/${path}`);
|
||||
}
|
||||
}
|
||||
getIcon(path)
|
||||
}
|
||||
|
||||
const getImgSrc = (url) => {
|
||||
const getImgSrc = (url: string) => {
|
||||
return new URL(`/src/assets/img/${url}`, import.meta.url).href;
|
||||
}
|
||||
|
||||
const mouseenteMenu = (path) => {
|
||||
if (pathPrefix.value != 'knowledgeBase' && pathPrefix.value != 'creatChat' && path != 'knowledgeBase') {
|
||||
const mouseenteMenu = (path: string) => {
|
||||
if (pathPrefix.value != 'knowledge-bases' && pathPrefix.value != 'creatChat' && path != 'knowledge-bases') {
|
||||
prefixIcon.value = 'prefixIcon-grey.svg';
|
||||
}
|
||||
}
|
||||
const mouseleaveMenu = (path) => {
|
||||
if (pathPrefix.value != 'knowledgeBase' && pathPrefix.value != 'creatChat' && path != 'knowledgeBase') {
|
||||
getIcon(route.name)
|
||||
const mouseleaveMenu = (path: string) => {
|
||||
if (pathPrefix.value != 'knowledge-bases' && pathPrefix.value != 'creatChat' && path != 'knowledge-bases') {
|
||||
const nameStr = typeof route.name === 'string' ? route.name as string : (route.name ? String(route.name) : '')
|
||||
getIcon(nameStr)
|
||||
}
|
||||
}
|
||||
|
||||
// 知识库下拉相关方法
|
||||
const toggleKbDropdown = (event?: Event) => {
|
||||
if (event) {
|
||||
event.stopPropagation()
|
||||
}
|
||||
showKbDropdown.value = !showKbDropdown.value
|
||||
}
|
||||
|
||||
const switchKnowledgeBase = (kbId: string, event?: Event) => {
|
||||
if (event) {
|
||||
event.stopPropagation()
|
||||
}
|
||||
showKbDropdown.value = false
|
||||
const currentRoute = route.name
|
||||
|
||||
// 路由跳转
|
||||
if (currentRoute === 'knowledgeBaseDetail') {
|
||||
router.push(`/platform/knowledge-bases/${kbId}`)
|
||||
} else if (currentRoute === 'kbCreatChat') {
|
||||
router.push(`/platform/knowledge-bases/${kbId}/creatChat`)
|
||||
} else if (currentRoute === 'knowledgeBaseSettings') {
|
||||
router.push(`/platform/knowledge-bases/${kbId}/settings`)
|
||||
} else {
|
||||
router.push(`/platform/knowledge-bases/${kbId}`)
|
||||
}
|
||||
|
||||
// 刷新右侧内容 - 通过触发页面重新加载或发送事件
|
||||
nextTick(() => {
|
||||
// 发送全局事件通知页面刷新知识库内容
|
||||
window.dispatchEvent(new CustomEvent('knowledgeBaseChanged', {
|
||||
detail: { kbId }
|
||||
}))
|
||||
})
|
||||
}
|
||||
|
||||
// 点击外部关闭下拉菜单
|
||||
const handleClickOutside = () => {
|
||||
showKbDropdown.value = false
|
||||
}
|
||||
|
||||
onMounted(() => {
|
||||
document.addEventListener('click', handleClickOutside)
|
||||
})
|
||||
|
||||
watch(() => route.params.kbId, () => {
|
||||
showKbDropdown.value = false
|
||||
})
|
||||
|
||||
</script>
|
||||
<style lang="less" scoped>
|
||||
.del_submenu {
|
||||
@@ -210,6 +584,10 @@ const mouseleaveMenu = (path) => {
|
||||
padding: 8px;
|
||||
background: #fff;
|
||||
box-sizing: border-box;
|
||||
height: 100vh;
|
||||
overflow: hidden;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
|
||||
.logo_box {
|
||||
height: 80px;
|
||||
@@ -239,9 +617,28 @@ const mouseleaveMenu = (path) => {
|
||||
line-height: 21.7px;
|
||||
}
|
||||
|
||||
.menu_top {
|
||||
flex: 1;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
overflow: hidden;
|
||||
min-height: 0;
|
||||
}
|
||||
|
||||
.menu_bottom {
|
||||
flex-shrink: 0;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.menu_box {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
|
||||
&.has-submenu {
|
||||
flex: 1;
|
||||
min-height: 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -341,18 +738,21 @@ const mouseleaveMenu = (path) => {
|
||||
font-style: normal;
|
||||
font-weight: 600;
|
||||
line-height: 22px;
|
||||
overflow: hidden;
|
||||
white-space: nowrap;
|
||||
max-width: 120px;
|
||||
flex: 1;
|
||||
}
|
||||
|
||||
.submenu {
|
||||
font-family: "PingFang SC";
|
||||
font-size: 14px;
|
||||
font-style: normal;
|
||||
font-family: "PingFang SC";
|
||||
font-size: 14px;
|
||||
font-style: normal;
|
||||
overflow-y: scroll;
|
||||
overflow-y: auto;
|
||||
scrollbar-width: none;
|
||||
height: calc(98vh - 276px);
|
||||
flex: 1;
|
||||
min-height: 0;
|
||||
margin-left: 4px;
|
||||
}
|
||||
|
||||
.submenu_item_p {
|
||||
@@ -427,6 +827,92 @@ const mouseleaveMenu = (path) => {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* 知识库下拉菜单样式 */
|
||||
.kb-dropdown-icon {
|
||||
margin-left: auto;
|
||||
color: #666;
|
||||
transition: transform 0.3s ease, color 0.2s ease;
|
||||
cursor: pointer;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
width: 16px;
|
||||
height: 16px;
|
||||
|
||||
&.rotate-180 {
|
||||
transform: rotate(180deg);
|
||||
}
|
||||
|
||||
&:hover {
|
||||
color: #07c05f;
|
||||
}
|
||||
|
||||
&.active {
|
||||
color: #07c05f;
|
||||
}
|
||||
|
||||
&.active:hover {
|
||||
color: #05a04f;
|
||||
}
|
||||
|
||||
svg {
|
||||
width: 12px;
|
||||
height: 12px;
|
||||
transition: inherit;
|
||||
}
|
||||
}
|
||||
|
||||
.kb-dropdown-menu {
|
||||
position: absolute;
|
||||
top: 100%;
|
||||
left: 0;
|
||||
right: 0;
|
||||
background: #fff;
|
||||
border: 1px solid #e5e7eb;
|
||||
border-radius: 6px;
|
||||
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
|
||||
z-index: 1000;
|
||||
max-height: 200px;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
.kb-dropdown-item {
|
||||
padding: 8px 16px;
|
||||
cursor: pointer;
|
||||
transition: background-color 0.2s ease;
|
||||
font-size: 14px;
|
||||
color: #333;
|
||||
|
||||
&:hover {
|
||||
background-color: #f5f5f5;
|
||||
}
|
||||
|
||||
&.active {
|
||||
background-color: #07c05f1a;
|
||||
color: #07c05f;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
&:first-child {
|
||||
border-radius: 6px 6px 0 0;
|
||||
}
|
||||
|
||||
&:last-child {
|
||||
border-radius: 0 0 6px 6px;
|
||||
}
|
||||
}
|
||||
|
||||
.menu_item-box {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
width: 100%;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.menu_box {
|
||||
position: relative;
|
||||
}
|
||||
</style>
|
||||
<style lang="less">
|
||||
.upload-popup {
|
||||
@@ -456,4 +942,48 @@ const mouseleaveMenu = (path) => {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// 退出登录确认框样式
|
||||
:deep(.t-popconfirm) {
|
||||
.t-popconfirm__content {
|
||||
background: #fff;
|
||||
border: 1px solid #e7e7e7;
|
||||
border-radius: 6px;
|
||||
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
|
||||
padding: 12px 16px;
|
||||
font-size: 14px;
|
||||
color: #333;
|
||||
max-width: 200px;
|
||||
}
|
||||
|
||||
.t-popconfirm__arrow {
|
||||
border-bottom-color: #e7e7e7;
|
||||
}
|
||||
|
||||
.t-popconfirm__arrow::after {
|
||||
border-bottom-color: #fff;
|
||||
}
|
||||
|
||||
.t-popconfirm__buttons {
|
||||
margin-top: 8px;
|
||||
display: flex;
|
||||
justify-content: flex-end;
|
||||
gap: 8px;
|
||||
}
|
||||
|
||||
.t-button--variant-outline {
|
||||
border-color: #d9d9d9;
|
||||
color: #666;
|
||||
}
|
||||
|
||||
.t-button--theme-danger {
|
||||
background-color: #ff4d4f;
|
||||
border-color: #ff4d4f;
|
||||
}
|
||||
|
||||
.t-button--theme-danger:hover {
|
||||
background-color: #ff7875;
|
||||
border-color: #ff7875;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
@@ -1,52 +1,54 @@
import { ref, reactive, onMounted } from "vue";
import { ref, reactive } from "vue";
import { storeToRefs } from "pinia";
import { formatStringDate, kbFileTypeVerification } from "../utils/index";
import { MessagePlugin } from "tdesign-vue-next";
import {
  uploadKnowledgeBase,
  getKnowledgeBase,
  uploadKnowledgeFile,
  listKnowledgeFiles,
  getKnowledgeDetails,
  delKnowledgeDetails,
  getKnowledgeDetailsCon,
} from "@/api/knowledge-base/index";
import { knowledgeStore } from "@/stores/knowledge";
import { useRoute } from 'vue-router';

const usemenuStore = knowledgeStore();
export default function () {
export default function (knowledgeBaseId?: string) {
  const route = useRoute();
  const { cardList, total } = storeToRefs(usemenuStore);
  let moreIndex = ref(-1);
  const details = reactive({
    title: "",
    time: "",
    md: [],
    md: [] as any[],
    id: "",
    total: 0
  });
  const getKnowled = (query = { page: 1, page_size: 35 }) => {
    getKnowledgeBase(query)
  const getKnowled = (query = { page: 1, page_size: 35 }, kbId?: string) => {
    const targetKbId = kbId || knowledgeBaseId;
    if (!targetKbId) return;

    listKnowledgeFiles(targetKbId, query)
      .then((result: any) => {
        let { data, total: totalResult } = result;
        let cardList_ = data.map((item) => {
          item["file_name"] = item.file_name.substring(
            0,
            item.file_name.lastIndexOf(".")
          );
          return {
            ...item,
            updated_at: formatStringDate(new Date(item.updated_at)),
            isMore: false,
            file_type: item.file_type.toLocaleUpperCase(),
          };
        });
        if (query.page == 1) {
        const { data, total: totalResult } = result;
        const cardList_ = data.map((item: any) => ({
          ...item,
          file_name: item.file_name.substring(0, item.file_name.lastIndexOf(".")),
          updated_at: formatStringDate(new Date(item.updated_at)),
          isMore: false,
          file_type: item.file_type.toLocaleUpperCase(),
        }));

        if (query.page === 1) {
          cardList.value = cardList_;
        } else {
          cardList.value.push(...cardList_);
        }
        total.value = totalResult;
      })
      .catch((err) => {});
      .catch(() => {});
  };
  const delKnowledge = (index: number, item) => {
  const delKnowledge = (index: number, item: any) => {
    cardList.value[index].isMore = false;
    moreIndex.value = -1;
    delKnowledgeDetails(item.id)
@@ -58,7 +60,7 @@ export default function () {
          MessagePlugin.error("知识删除失败!");
        }
      })
      .catch((err) => {
      .catch(() => {
        MessagePlugin.error("知识删除失败!");
      });
  };
@@ -70,56 +72,48 @@ export default function () {
      moreIndex.value = -1;
    }
  };
  const requestMethod = (file: any, uploadInput) => {
    if (file instanceof File && uploadInput) {
      if (kbFileTypeVerification(file)) {
        return;
      }
      uploadKnowledgeBase({ file })
        .then((result: any) => {
          if (result.success) {
            MessagePlugin.info("上传成功!");
            getKnowled();
          } else {
            // Improved error message extraction logic
            let errorMessage = "上传失败!";

            // Prefer the error message from the error object
            if (result.error && result.error.message) {
              errorMessage = result.error.message;
            } else if (result.message) {
              errorMessage = result.message;
            }

            // Check the error code; show a specific message for duplicate files
            if (result.code === 'duplicate_file' || (result.error && result.error.code === 'duplicate_file')) {
              errorMessage = "文件已存在";
            }

            MessagePlugin.error(errorMessage);
          }
          uploadInput.value.value = "";
        })
        .catch((err: any) => {
          // Improved error handling in catch
          let errorMessage = "上传失败!";

          if (err.code === 'duplicate_file') {
            errorMessage = "文件已存在";
          } else if (err.error && err.error.message) {
            errorMessage = err.error.message;
          } else if (err.message) {
            errorMessage = err.message;
          }

          MessagePlugin.error(errorMessage);
          uploadInput.value.value = "";
        });
    } else {
      MessagePlugin.error("file文件类型错误!");
  const requestMethod = (file: any, uploadInput: any) => {
    if (!(file instanceof File) || !uploadInput) {
      MessagePlugin.error("文件类型错误!");
      return;
    }

    if (kbFileTypeVerification(file)) {
      return;
    }

    // Get the current knowledge base ID
    let currentKbId: string | undefined = (route.params as any)?.kbId as string;
    if (!currentKbId && typeof window !== 'undefined') {
      const match = window.location.pathname.match(/knowledge-bases\/([^/]+)/);
      if (match?.[1]) currentKbId = match[1];
    }
    if (!currentKbId) {
      currentKbId = knowledgeBaseId;
    }
    if (!currentKbId) {
      MessagePlugin.error("缺少知识库ID");
      return;
    }

    uploadKnowledgeFile(currentKbId, { file })
      .then((result: any) => {
        if (result.success) {
          MessagePlugin.info("上传成功!");
          getKnowled({ page: 1, page_size: 35 }, currentKbId);
        } else {
          const errorMessage = result.error?.message || result.message || "上传失败!";
          MessagePlugin.error(result.code === 'duplicate_file' ? "文件已存在" : errorMessage);
        }
        uploadInput.value.value = "";
      })
      .catch((err: any) => {
        const errorMessage = err.error?.message || err.message || "上传失败!";
        MessagePlugin.error(err.code === 'duplicate_file' ? "文件已存在" : errorMessage);
        uploadInput.value.value = "";
      });
  };
  const getCardDetails = (item) => {
  const getCardDetails = (item: any) => {
    Object.assign(details, {
      title: "",
      time: "",
@@ -129,7 +123,7 @@ export default function () {
    getKnowledgeDetails(item.id)
      .then((result: any) => {
        if (result.success && result.data) {
          let { data } = result;
          const { data } = result;
          Object.assign(details, {
            title: data.file_name,
            time: formatStringDate(new Date(data.updated_at)),
@@ -137,15 +131,16 @@ export default function () {
        });
      }
    })
    .catch((err) => {});
    getfDetails(item.id, 1);
    .catch(() => {});
    getfDetails(item.id, 1);
  };
  const getfDetails = (id, page) => {

  const getfDetails = (id: string, page: number) => {
    getKnowledgeDetailsCon(id, page)
      .then((result: any) => {
        if (result.success && result.data) {
          let { data, total: totalResult } = result;
          if (page == 1) {
          const { data, total: totalResult } = result;
          if (page === 1) {
            details.md = data;
          } else {
            details.md.push(...data);
@@ -153,7 +148,7 @@ export default function () {
          details.total = totalResult;
        }
      })
      .catch((err) => {});
      .catch(() => {});
  };
  return {
    cardList,
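The refactor above threads a knowledge base ID through the composable: getKnowled now calls listKnowledgeFiles(targetKbId, query) instead of the old global getKnowledgeBase(query), and requestMethod resolves the ID from the route, the URL path, or the knowledgeBaseId argument before calling uploadKnowledgeFile. A minimal usage sketch follows; the hook's import path and the exact shape of its return value are assumptions, since the return statement is truncated in this hunk.

// Hypothetical component setup script; path and destructured names are assumptions.
import { useRoute } from 'vue-router';
import useKnowledge from '@/hooks/useKnowledge';

const route = useRoute();
const kbId = (route.params as any).kbId as string | undefined;

// Pass the current knowledge base ID once; getKnowled falls back to it
// whenever it is called without an explicit kbId argument.
const { cardList, total, getKnowled } = useKnowledge(kbId);

// Load the first page of files for this knowledge base.
getKnowled({ page: 1, page_size: 35 });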
24
frontend/src/i18n/index.ts
Normal file
@@ -0,0 +1,24 @@
import { createI18n } from 'vue-i18n'
import zhCN from './locales/zh-CN.ts'
import ruRU from './locales/ru-RU.ts'
import enUS from './locales/en-US.ts'

const messages = {
  'zh-CN': zhCN,
  'en-US': enUS,
  'ru-RU': ruRU
}

// Read the saved language from localStorage, falling back to Chinese by default
const savedLocale = localStorage.getItem('locale') || 'zh-CN'
console.log('i18n инициализация с языком:', savedLocale)

const i18n = createI18n({
  legacy: false,
  locale: savedLocale,
  fallbackLocale: 'zh-CN',
  globalInjection: true,
  messages
})

export default i18n
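For context, the i18n instance above only takes effect once it is installed on the app. A minimal sketch of the entry file is shown below; main.ts is not part of this diff, so the exact wiring is an assumption.

// Hypothetical main.ts (not shown in this diff):
import { createApp } from 'vue'
import App from './App.vue'
import i18n from './i18n'

const app = createApp(App)
app.use(i18n) // exposes $t globally (globalInjection: true) and useI18n() in components
app.mount('#app')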
553
frontend/src/i18n/locales/en-US.ts
Normal file
@@ -0,0 +1,553 @@
export default {
  menu: {
    knowledgeBase: 'Knowledge Base',
    chat: 'Chat',
    createChat: 'Create Chat',
    tenant: 'Account Info',
    settings: 'System Settings',
    logout: 'Logout',
    uploadKnowledge: 'Upload Knowledge',
    deleteRecord: 'Delete Record',
    newSession: 'New Chat',
    confirmLogout: 'Are you sure you want to logout?',
    systemInfo: 'System Information'
  },
  knowledgeBase: {
    title: 'Knowledge Base',
    list: 'Knowledge Base List',
    detail: 'Knowledge Base Details',
    create: 'Create Knowledge Base',
    edit: 'Edit Knowledge Base',
    delete: 'Delete Knowledge Base',
    name: 'Name',
    description: 'Description',
    files: 'Files',
    settings: 'Settings',
    upload: 'Upload File',
    uploadSuccess: 'File uploaded successfully!',
    uploadFailed: 'File upload failed!',
    fileExists: 'File already exists',
    notInitialized: 'Knowledge base is not initialized. Please configure models in settings before uploading files',
    getInfoFailed: 'Failed to get knowledge base information, file upload is not possible',
    missingId: 'Knowledge base ID is missing',
    deleteFailed: 'Delete failed. Please try again later!',
    createKnowledgeBase: 'Create Knowledge Base',
    knowledgeBaseName: 'Knowledge Base Name',
    enterName: 'Enter knowledge base name',
    embeddingModel: 'Embedding Model',
    selectEmbeddingModel: 'Select embedding model',
    summaryModel: 'Summary Model',
    selectSummaryModel: 'Select summary model',
    rerankModel: 'Rerank Model',
    selectRerankModel: 'Select rerank model (optional)',
    createSuccess: 'Knowledge base created successfully',
    createFailed: 'Failed to create knowledge base',
    updateSuccess: 'Knowledge base updated successfully',
    updateFailed: 'Failed to update knowledge base',
    deleteSuccess: 'Knowledge base deleted successfully',
    deleteConfirm: 'Are you sure you want to delete this knowledge base?',
    fileName: 'File Name',
    fileSize: 'File Size',
    uploadTime: 'Upload Time',
    status: 'Status',
    actions: 'Actions',
    processing: 'Processing',
    completed: 'Completed',
    failed: 'Failed',
    noFiles: 'No files',
    dragFilesHere: 'Drag files here or',
    clickToUpload: 'click to upload',
    supportedFormats: 'Supported formats',
    maxFileSize: 'Max file size',
    viewDetails: 'View Details',
    downloadFile: 'Download File',
    deleteFile: 'Delete File',
    confirmDeleteFile: 'Are you sure you want to delete this file?',
    totalFiles: 'Total files',
    totalSize: 'Total size',
    // Additional translations for KnowledgeBase.vue
    newSession: 'New Chat',
    deleteDocument: 'Delete Document',
    parsingFailed: 'Parsing failed',
    parsingInProgress: 'Parsing...',
    deleteConfirmation: 'Delete Confirmation',
    confirmDeleteDocument: 'Confirm deletion of document "{fileName}", recovery will be impossible after deletion',
    cancel: 'Cancel',
    confirmDelete: 'Confirm Delete',
    selectKnowledgeBaseFirst: 'Please select a knowledge base first',
    sessionCreationFailed: 'Failed to create chat session',
    sessionCreationError: 'Chat session creation error',
    settingsParsingFailed: 'Failed to parse settings',
    fileUploadEventReceived: 'File upload event received, uploaded knowledge base ID: {uploadedKbId}, current knowledge base ID: {currentKbId}',
    matchingKnowledgeBase: 'Matching knowledge base, starting file list update',
    routeParamChange: 'Route parameter change, re-fetching knowledge base content',
    fileUploadEventListening: 'Listening for file upload events',
    apiCallKnowledgeFiles: 'Direct API call to get knowledge base file list',
    responseInterceptorData: 'Since the response interceptor has already returned data, result is part of the response data',
    hookProcessing: 'Processing according to useKnowledgeBase hook method',
    errorHandling: 'Error handling',
    priorityCurrentPageKbId: 'Priority to use knowledge base ID of current page',
    fallbackLocalStorageKbId: 'If current page has no knowledge base ID, attempt to get knowledge base ID from settings in localStorage',
    // Additional translations for KnowledgeBaseList.vue
    createNewKnowledgeBase: 'Create Knowledge Base',
    uninitializedWarning: 'Some knowledge bases are not initialized, you need to configure model information in settings first to add knowledge documents',
    initializedStatus: 'Initialized',
    notInitializedStatus: 'Not Initialized',
    needSettingsFirst: 'You need to configure model information in settings first to add knowledge',
    documents: 'Documents',
    configureModelsFirst: 'Please configure model information in settings first',
    confirmDeleteKnowledgeBase: 'Confirm deletion of this knowledge base?',
    createKnowledgeBaseDialog: 'Create Knowledge Base',
    enterNameKb: 'Enter name',
    enterDescriptionKb: 'Enter description',
    createKb: 'Create',
    deleted: 'Deleted',
    deleteFailedKb: 'Delete failed',
    noDescription: 'No description',
    emptyKnowledgeDragDrop: 'Knowledge is empty, drag and drop to upload',
    pdfDocFormat: 'pdf, doc format files, max 10M',
    textMarkdownFormat: 'text, markdown format files, max 200K',
    dragFileNotText: 'Please drag files instead of text or links'
  },
  chat: {
    title: 'Chat',
    newChat: 'New Chat',
    inputPlaceholder: 'Enter your message...',
    send: 'Send',
    thinking: 'Thinking...',
    regenerate: 'Regenerate',
    copy: 'Copy',
    delete: 'Delete',
    reference: 'Reference',
    noMessages: 'No messages',
    // Additional translations for chat components
    waitingForAnswer: 'Waiting for answer...',
    cannotAnswer: 'Sorry, I cannot answer this question.',
    summarizingAnswer: 'Summarizing answer...',
    loading: 'Loading...',
    enterDescription: 'Enter description',
    referencedContent: '{count} related materials used',
    deepThinking: 'Deep thinking completed',
    knowledgeBaseQandA: 'Knowledge Base Q&A',
    askKnowledgeBase: 'Ask the knowledge base',
    sourcesCount: '{count} sources',
    pleaseEnterContent: 'Please enter content!',
    pleaseUploadKnowledgeBase: 'Please upload knowledge base first!',
    replyingPleaseWait: 'Replying, please try again later!',
    createSessionFailed: 'Failed to create session',
    createSessionError: 'Session creation error',
    unableToGetKnowledgeBaseId: 'Unable to get knowledge base ID'
  },
  settings: {
    title: 'Settings',
    system: 'System Settings',
    systemConfig: 'System Configuration',
    knowledgeBaseSettings: 'Knowledge Base Settings',
    configureKbModels: 'Configure models and document splitting parameters for this knowledge base',
    manageSystemModels: 'Manage and update system models and service configurations',
    basicInfo: 'Basic Information',
    documentSplitting: 'Document Splitting',
    apiEndpoint: 'API Endpoint',
    enterApiEndpoint: 'Enter API endpoint, e.g.: http://localhost',
    enterApiKey: 'Enter API key',
    enterKnowledgeBaseId: 'Enter knowledge base ID',
    saveConfig: 'Save Configuration',
    reset: 'Reset',
    configSaved: 'Configuration saved successfully',
    enterApiEndpointRequired: 'Enter API endpoint',
    enterApiKeyRequired: 'Enter API key',
    enterKnowledgeBaseIdRequired: 'Enter knowledge base ID',
    name: 'Name',
    enterName: 'Enter name',
    description: 'Description',
    chunkSize: 'Chunk Size',
    chunkOverlap: 'Chunk Overlap',
    save: 'Save',
    saving: 'Saving...',
    saveSuccess: 'Saved successfully',
    saveFailed: 'Failed to save',
    model: 'Model',
    llmModel: 'LLM Model',
    embeddingModel: 'Embedding Model',
    rerankModel: 'Rerank Model',
    vlmModel: 'Multimodal Model',
    modelName: 'Model Name',
    modelUrl: 'Model URL',
    apiKey: 'API Key',
    cancel: 'Cancel',
    saveFailedSettings: 'Failed to save settings',
    enterNameRequired: 'Enter name'
  },
  initialization: {
    title: 'Initialization',
    welcome: 'Welcome to WeKnora',
    description: 'Please configure the system before starting',
    step1: 'Step 1: Configure LLM Model',
    step2: 'Step 2: Configure Embedding Model',
    step3: 'Step 3: Configure Additional Models',
    complete: 'Complete Initialization',
    skip: 'Skip',
    next: 'Next',
    previous: 'Previous',
    // Ollama service
    ollamaServiceStatus: 'Ollama Service Status',
    refreshStatus: 'Refresh Status',
    ollamaServiceAddress: 'Ollama Service Address',
    notConfigured: 'Not Configured',
    notRunning: 'Not Running',
    normal: 'Normal',
    installedModels: 'Installed Models',
    none: 'None temporarily',
    // Knowledge base
    knowledgeBaseInfo: 'Knowledge Base Information',
    knowledgeBaseName: 'Knowledge Base Name',
    knowledgeBaseNamePlaceholder: 'Enter knowledge base name',
    knowledgeBaseDescription: 'Knowledge Base Description',
    knowledgeBaseDescriptionPlaceholder: 'Enter knowledge base description',
    // LLM model
    llmModelConfig: 'LLM Large Language Model Configuration',
    modelSource: 'Model Source',
    local: 'Ollama (Local)',
    remote: 'Remote API (Remote)',
    modelName: 'Model Name',
    modelNamePlaceholder: 'E.g.: qwen3:0.6b',
    baseUrl: 'Base URL',
    baseUrlPlaceholder: 'E.g.: https://api.openai.com/v1, remove /chat/completions from the end of URL',
    apiKey: 'API Key (Optional)',
    apiKeyPlaceholder: 'Enter API Key (Optional)',
    downloadModel: 'Download Model',
    installed: 'Installed',
    notInstalled: 'Not Installed',
    notChecked: 'Not Checked',
    checkConnection: 'Check Connection',
    connectionNormal: 'Connection Normal',
    connectionFailed: 'Connection Failed',
    checkingConnection: 'Checking Connection',
    // Embedding model
    embeddingModelConfig: 'Embedding Model Configuration',
    embeddingWarning: 'Knowledge base already has files, cannot change embedding model configuration',
    dimension: 'Dimension',
    dimensionPlaceholder: 'Enter vector dimension',
    detectDimension: 'Detect Dimension',
    // Rerank model
    rerankModelConfig: 'Rerank Model Configuration',
    enableRerank: 'Enable Rerank Model',
    // Multimodal settings
    multimodalConfig: 'Multimodal Configuration',
    enableMultimodal: 'Enable image information extraction',
    visualLanguageModelConfig: 'Visual Language Model Configuration',
    interfaceType: 'Interface Type',
    openaiCompatible: 'OpenAI Compatible Interface',
    // Storage settings
    storageServiceConfig: 'Storage Service Configuration',
    storageType: 'Storage Type',
    bucketName: 'Bucket Name',
    bucketNamePlaceholder: 'Enter Bucket name',
    pathPrefix: 'Path Prefix',
    pathPrefixPlaceholder: 'E.g.: images',
    secretId: 'Secret ID',
    secretIdPlaceholder: 'Enter COS Secret ID',
    secretKey: 'Secret Key',
    secretKeyPlaceholder: 'Enter COS Secret Key',
    region: 'Region',
    regionPlaceholder: 'E.g.: ap-beijing',
    appId: 'App ID',
    appIdPlaceholder: 'Enter App ID',
    // Multimodal function testing
    functionTest: 'Function Test',
    testDescription: 'Upload an image to test the model\'s image description and text recognition functions',
    selectImage: 'Select Image',
    startTest: 'Start Test',
    testResult: 'Test Result',
    imageDescription: 'Image Description:',
    textRecognition: 'Text Recognition:',
    processingTime: 'Processing Time:',
    testFailed: 'Test Failed',
    multimodalProcessingFailed: 'Multimodal processing failed',
    // Document splitting
    documentSplittingConfig: 'Document Splitting Configuration',
    splittingStrategy: 'Splitting Strategy',
    balancedMode: 'Balanced Mode',
    balancedModeDesc: 'Chunk size: 1000 / Overlap: 200',
    precisionMode: 'Precision Mode',
    precisionModeDesc: 'Chunk size: 512 / Overlap: 100',
    contextMode: 'Context Mode',
    contextModeDesc: 'Chunk size: 2048 / Overlap: 400',
    custom: 'Custom',
    customDesc: 'Configure parameters manually',
    chunkSize: 'Chunk Size',
    chunkOverlap: 'Chunk Overlap',
    separatorSettings: 'Separator Settings',
    selectOrCustomSeparators: 'Select or customize separators',
    characters: 'characters',
    separatorParagraph: 'Paragraph separator (\\n\\n)',
    separatorNewline: 'Newline (\\n)',
    separatorPeriod: 'Period (。)',
    separatorExclamation: 'Exclamation mark (!)',
    separatorQuestion: 'Question mark (?)',
    separatorSemicolon: 'Semicolon (;)',
    separatorChineseSemicolon: 'Chinese semicolon (;)',
    separatorComma: 'Comma (,)',
    separatorChineseComma: 'Chinese comma (,)',
    // Entity and relation extraction
    entityRelationExtraction: 'Entity and Relation Extraction',
    enableEntityRelationExtraction: 'Enable entity and relation extraction',
    relationTypeConfig: 'Relation Type Configuration',
    relationType: 'Relation Type',
    generateRandomTags: 'Generate Random Tags',
    completeModelConfig: 'Please complete model configuration',
    systemWillExtract: 'The system will extract corresponding entities and relations from the text according to the selected relation types',
    extractionExample: 'Extraction Example',
    sampleText: 'Sample Text',
    sampleTextPlaceholder: 'Enter text for analysis, e.g.: "Red Mansion", also known as "Dream of the Red Chamber", is one of the four great classical novels of Chinese literature, written by Cao Xueqin during the Qing Dynasty...',
    generateRandomText: 'Generate Random Text',
    entityList: 'Entity List',
    nodeName: 'Node Name',
    nodeNamePlaceholder: 'Node name',
    addAttribute: 'Add Attribute',
    attributeValue: 'Attribute Value',
    attributeValuePlaceholder: 'Attribute value',
    addEntity: 'Add Entity',
    completeEntityInfo: 'Please complete entity information',
    relationConnection: 'Relation Connection',
    selectEntity: 'Select Entity',
    addRelation: 'Add Relation',
    completeRelationInfo: 'Please complete relation information',
    startExtraction: 'Start Extraction',
    extracting: 'Extracting...',
    defaultExample: 'Default Example',
    clearExample: 'Clear Example',
    // Buttons and messages
    updateKnowledgeBaseSettings: 'Update Knowledge Base Settings',
    updateConfigInfo: 'Update Configuration Information',
    completeConfig: 'Complete Configuration',
    waitForDownloads: 'Please wait for all Ollama models to finish downloading before updating configuration',
    completeModelConfigInfo: 'Please complete model configuration information',
    knowledgeBaseIdMissing: 'Knowledge base ID is missing',
    knowledgeBaseSettingsUpdateSuccess: 'Knowledge base settings updated successfully',
    configUpdateSuccess: 'Configuration updated successfully',
    systemInitComplete: 'System initialization completed',
    operationFailed: 'Operation failed',
    updateKnowledgeBaseInfoFailed: 'Failed to update knowledge base basic information',
    knowledgeBaseIdMissingCannotSave: 'Knowledge base ID is missing, cannot save configuration',
    operationFailedCheckNetwork: 'Operation failed, please check network connection',
    imageUploadSuccess: 'Image uploaded successfully, testing can begin',
    multimodalConfigIncomplete: 'Multimodal configuration incomplete, please complete multimodal configuration before uploading images',
    pleaseSelectImage: 'Please select an image',
    multimodalTestSuccess: 'Multimodal test successful',
    multimodalTestFailed: 'Multimodal test failed',
    pleaseEnterSampleText: 'Please enter sample text',
    pleaseEnterRelationType: 'Please enter relation type',
    pleaseEnterLLMModelConfig: 'Please enter LLM large language model configuration',
    noValidNodesExtracted: 'No valid nodes extracted',
    noValidRelationsExtracted: 'No valid relations extracted',
    extractionFailedCheckNetwork: 'Extraction failed, please check network or text format',
    generateFailedRetry: 'Generation failed, please try again',
    pleaseCheckForm: 'Please check form correctness',
    detectionSuccessful: 'Detection successful, dimension automatically filled as',
    detectionFailed: 'Detection failed',
    detectionFailedCheckConfig: 'Detection failed, please check configuration',
    modelDownloadSuccess: 'Model downloaded successfully',
    modelDownloadFailed: 'Model download failed',
    downloadStartFailed: 'Download start failed',
    queryProgressFailed: 'Progress query failed',
    checkOllamaStatusFailed: 'Ollama status check failed',
    getKnowledgeBaseInfoFailed: 'Failed to get knowledge base information',
    textRelationExtractionFailed: 'Text relation extraction failed',
    // Validation
    pleaseEnterKnowledgeBaseName: 'Please enter knowledge base name',
    knowledgeBaseNameLength: 'Knowledge base name length must be 1-50 characters',
    knowledgeBaseDescriptionLength: 'Knowledge base description cannot exceed 200 characters',
    pleaseEnterLLMModelName: 'Please enter LLM model name',
    pleaseEnterBaseURL: 'Please enter BaseURL',
    pleaseEnterEmbeddingModelName: 'Please enter embedding model name',
    pleaseEnterEmbeddingDimension: 'Please enter embedding dimension',
    dimensionMustBeInteger: 'Dimension must be a valid integer, usually 768, 1024, 1536, 3584, etc.',
    pleaseEnterTextContent: 'Please enter text content',
    textContentMinLength: 'Text content must contain at least 10 characters',
    pleaseEnterValidTag: 'Please enter a valid tag',
    tagAlreadyExists: 'This tag already exists',
    // Additional translations for InitializationContent.vue
    checkFailed: 'Check failed',
    startingDownload: 'Starting download...',
    downloadStarted: 'Download started',
    model: 'Model',
    startModelDownloadFailed: 'Failed to start model download',
    downloadCompleted: 'Download completed',
    downloadFailed: 'Download failed',
    knowledgeBaseSettingsModeMissingId: 'Knowledge base settings mode missing ID',
    completeEmbeddingConfig: 'Please complete embedding configuration first',
    detectionSuccess: 'Detection successful,',
    dimensionAutoFilled: 'dimension automatically filled:',
    checkFormCorrectness: 'Please check form correctness',
    systemInitializationCompleted: 'System initialization completed',
    generationFailedRetry: 'Generation failed, please try again',
    chunkSizeDesc: 'Size of each text chunk. Larger chunks preserve more context but may reduce search accuracy.',
    chunkOverlapDesc: 'Number of characters overlapping between adjacent chunks. Helps maintain context at chunk boundaries.',
    selectRelationType: 'Select relation type'
  },
  auth: {
    login: 'Login',
    logout: 'Logout',
    username: 'Username',
    email: 'Email',
    password: 'Password',
    confirmPassword: 'Confirm Password',
    rememberMe: 'Remember Me',
    forgotPassword: 'Forgot Password?',
    loginSuccess: 'Login successful!',
    loginFailed: 'Login failed',
    loggingIn: 'Logging in...',
    register: 'Register',
    registering: 'Registering...',
    createAccount: 'Create Account',
    haveAccount: 'Already have an account?',
    noAccount: 'Don\'t have an account?',
    backToLogin: 'Back to Login',
    registerNow: 'Register Now',
    registerSuccess: 'Registration successful! The system has created an exclusive tenant for you, please login',
    registerFailed: 'Registration failed',
    subtitle: 'Document understanding and semantic search framework based on large models',
    registerSubtitle: 'The system will create an exclusive tenant for you after registration',
    emailPlaceholder: 'Enter email address',
    passwordPlaceholder: 'Enter password (8-32 characters, including letters and numbers)',
    confirmPasswordPlaceholder: 'Enter password again',
    usernamePlaceholder: 'Enter username',
    emailRequired: 'Enter email address',
    emailInvalid: 'Enter correct email format',
    passwordRequired: 'Enter password',
    passwordMinLength: 'Password must be at least 8 characters',
    passwordMaxLength: 'Password cannot exceed 32 characters',
    passwordMustContainLetter: 'Password must contain letters',
    passwordMustContainNumber: 'Password must contain numbers',
    usernameRequired: 'Enter username',
    usernameMinLength: 'Username must be at least 2 characters',
    usernameMaxLength: 'Username cannot exceed 20 characters',
    usernameInvalid: 'Username can only contain letters, numbers, underscores and Chinese characters',
    confirmPasswordRequired: 'Confirm password',
    passwordMismatch: 'Entered passwords do not match',
    loginError: 'Login error, please check email or password',
    loginErrorRetry: 'Login error, please try again later',
    registerError: 'Registration error, please try again later',
    forgotPasswordNotAvailable: 'Password recovery function is temporarily unavailable, please contact administrator'
  },
  common: {
    confirm: 'Confirm',
    cancel: 'Cancel',
    save: 'Save',
    delete: 'Delete',
    edit: 'Edit',
    create: 'Create',
    search: 'Search',
    filter: 'Filter',
    export: 'Export',
    import: 'Import',
    upload: 'Upload',
    download: 'Download',
    refresh: 'Refresh',
    loading: 'Loading...',
    noData: 'No data',
    error: 'Error',
    success: 'Success',
    warning: 'Warning',
    info: 'Information',
    yes: 'Yes',
    no: 'No',
    ok: 'OK',
    close: 'Close',
    back: 'Back',
    next: 'Next',
    finish: 'Finish',
    all: 'All',
    reset: 'Reset',
    clear: 'Clear'
  },
  file: {
    upload: 'Upload File',
    uploadSuccess: 'File uploaded successfully',
    uploadFailed: 'File upload failed',
    delete: 'Delete File',
    deleteSuccess: 'File deleted successfully',
    deleteFailed: 'File deletion failed',
    download: 'Download File',
    preview: 'Preview',
    unsupportedFormat: 'Unsupported file format',
    maxSizeExceeded: 'Maximum file size exceeded',
    selectFile: 'Select File'
  },
  tenant: {
    title: 'Tenant Information',
    name: 'Tenant Name',
    id: 'Tenant ID',
    createdAt: 'Created At',
    updatedAt: 'Updated At',
    status: 'Status',
    active: 'Active',
    inactive: 'Inactive',
    // Additional translations for TenantInfo.vue
    systemInfo: 'System Information',
    viewSystemInfo: 'View system version and user account configuration information',
    version: 'Version',
    buildTime: 'Build Time',
    goVersion: 'Go Version',
    userInfo: 'User Information',
    userId: 'User ID',
    username: 'Username',
    email: 'Email',
    tenantInfo: 'Tenant Information',
    tenantId: 'Tenant ID',
    tenantName: 'Tenant Name',
    description: 'Description',
    business: 'Business',
    noDescription: 'No description',
    noBusiness: 'None',
    statusActive: 'Active',
    statusInactive: 'Not activated',
    statusSuspended: 'Suspended',
    statusUnknown: 'Unknown',
    apiKey: 'API Key',
    keepApiKeySafe: 'Please keep your API Key safe, do not disclose it in public places or code repositories',
    storageInfo: 'Storage Information',
    storageQuota: 'Storage Quota',
    used: 'Used',
    usage: 'Usage',
    apiDevDocs: 'API Developer Documentation',
    useApiKey: 'Use your API Key to start development, view complete API documentation and code examples.',
    viewApiDoc: 'View API Documentation',
    loadingAccountInfo: 'Loading account information...',
    loadFailed: 'Load failed',
    retry: 'Retry',
    apiKeyCopied: 'API Key copied to clipboard',
    unknown: 'Unknown',
    formatError: 'Format error'
  },
  error: {
    network: 'Network error',
    server: 'Server error',
    notFound: 'Not found',
    unauthorized: 'Unauthorized',
    forbidden: 'Access forbidden',
    unknown: 'Unknown error',
    tryAgain: 'Please try again'
  },
  model: {
    llmModel: 'LLM Model',
    embeddingModel: 'Embedding Model',
    rerankModel: 'Rerank Model',
    vlmModel: 'Multimodal Model',
    modelName: 'Model Name',
    modelProvider: 'Model Provider',
    modelUrl: 'Model URL',
    apiKey: 'API Key',
    testConnection: 'Test Connection',
    connectionSuccess: 'Connection successful',
    connectionFailed: 'Connection failed',
    dimension: 'Dimension',
    maxTokens: 'Max Tokens',
    temperature: 'Temperature',
    topP: 'Top P',
    selectModel: 'Select Model',
    customModel: 'Custom Model',
    builtinModel: 'Built-in Model'
  }
}
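The locale files are plain nested message objects, so components read them through vue-i18n's Composition API. A small sketch of how these keys and the persisted locale would typically be consumed is shown below; the switchLocale helper is illustrative and not part of this diff.

// Hypothetical component logic (names are illustrative):
import { useI18n } from 'vue-i18n'

const { t, locale } = useI18n()

t('menu.knowledgeBase')               // 'Knowledge Base' when locale is 'en-US'
t('chat.sourcesCount', { count: 3 })  // named interpolation, e.g. '3 sources'

// Persist the choice so the createI18n() call above restores it on the next load.
function switchLocale(next: 'zh-CN' | 'en-US' | 'ru-RU') {
  locale.value = next
  localStorage.setItem('locale', next)
}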