Compare commits

64 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 3e08d3d9d8 | |
| | a0685a22db | |
| | b9f264b883 | |
| | 154025f723 | |
| | 587d1b2bd3 | |
| | 3a2c86df5b | |
| | a910bae6cd | |
| | ddbdae686f | |
| | 4fdbec17a7 | |
| | 2d66abedf0 | |
| | af620806e0 | |
| | a625eff525 | |
| | ef69e2aed5 | |
| | 07c3453e1a | |
| | 0032a9185c | |
| | da640d1d33 | |
| | 1fd2de5a64 | |
| | c1f731e026 | |
| | 0d790ffedc | |
| | 6d547131b6 | |
| | 8a40377a4a | |
| | a1d0ccaa09 | |
| | ca704fa054 | |
| | 02b78a5908 | |
| | de96a52d54 | |
| | f24cd817cb | |
| | 4824e41361 | |
| | bfd4fffbe3 | |
| | a2902de6ce | |
| | a5c3623a02 | |
| | 8f723b38fb | |
| | 7973128f4c | |
| | 8ed050b8ec | |
| | 4ccbd2a127 | |
| | 512910584b | |
| | cd7e02e54a | |
| | c9b1f43ed7 | |
| | 76fc64a807 | |
| | 947899ff10 | |
| | 5e0a99b127 | |
| | b04566be32 | |
| | 0157eb25bd | |
| | 91e65d6445 | |
| | c589a911dc | |
| | 66aec78960 | |
| | 76fbfdf8ac | |
| | 4137a63852 | |
| | d28f805707 | |
| | 2e395864b9 | |
| | 4005aa3ded | |
| | 5e22f96d37 | |
| | 2237e1ee55 | |
| | b11df52cfb | |
| | c3744866fd | |
| | c2d52a9374 | |
| | 81bd2e6c2c | |
| | 0908f9c487 | |
| | 1aac37d3fd | |
| | cd249df8c8 | |
| | 092b30af3e | |
| | 74c121f7fb | |
| | 78088057fb | |
| | bff0e742fa | |
| | 6598baab2e | |
@@ -0,0 +1,2 @@
**/.venv/
**/.python-version

93 .env.example
@@ -23,10 +23,6 @@ STORAGE_TYPE=local
# 流处理后端(memory/redis)
STREAM_MANAGER_TYPE=redis

# 主数据库配置
# 数据库端口,默认为5432
DB_PORT=5432

# 应用服务端口,默认为8080
APP_PORT=8080

@@ -46,9 +42,6 @@ DB_PASSWORD=postgres123!@#
DB_NAME=WeKnora

# 如果使用 redis 作为流处理后端,需要配置以下参数
# Redis端口,默认为6379
REDIS_PORT=6379

# Redis密码,如果没有设置密码,可以留空
REDIS_PASSWORD=redis123!@#

@@ -66,9 +59,11 @@ TENANT_AES_KEY=weknorarag-api-key-secret-secret
# 是否开启知识图谱构建和检索(构建阶段需调用大模型,耗时较长)
ENABLE_GRAPH_RAG=false

MINIO_PORT=9000
# MinIO端口
# MINIO_PORT=9000

MINIO_CONSOLE_PORT=9001
# MinIO控制台端口
# MINIO_CONSOLE_PORT=9001

# Embedding并发数,出现429错误时,可调小此参数
CONCURRENCY_POOL_SIZE=5
@@ -121,78 +116,14 @@ COS_ENABLE_OLD_DOMAIN=true
# 如果解析网络连接使用Web代理,需要配置以下参数
# WEB_PROXY=your_web_proxy

##############################################################
# Neo4j 开关
# NEO4J_ENABLE=false

###### 注意: 以下配置不再生效,已在Web“配置初始化”阶段完成 #########
# Neo4j的访问地址
# NEO4J_URI=neo4j://neo4j:7687

# Neo4j的用户名和密码
# NEO4J_USERNAME=neo4j

# # 初始化默认租户与知识库
# # 租户ID,通常是一个字符串
# INIT_TEST_TENANT_ID=1

# # 知识库ID,通常是一个字符串
# INIT_TEST_KNOWLEDGE_BASE_ID=kb-00000001

# # LLM Model
# # 使用的LLM模型名称
# # 默认使用 Ollama 的 Qwen3 8B 模型,ollama 会自动处理模型下载和加载
# # 如果需要使用其他模型,请替换为实际的模型名称
# INIT_LLM_MODEL_NAME=qwen3:8b

# # LLM模型的访问地址
# # 支持第三方模型服务的URL
# # 如果使用 Ollama 的本地服务,可以留空,ollama 会自动处理
# # INIT_LLM_MODEL_BASE_URL=your_llm_model_base_url

# # LLM模型的API密钥,如果需要身份验证,可以设置
# # 支持第三方模型服务的API密钥
# # 如果使用 Ollama 的本地服务,可以留空,ollama 会自动处理
# # INIT_LLM_MODEL_API_KEY=your_llm_model_api_key

# # Embedding Model
# # 使用的Embedding模型名称
# # 默认使用 nomic-embed-text 模型,支持文本嵌入
# # 如果需要使用其他模型,请替换为实际的模型名称
# INIT_EMBEDDING_MODEL_NAME=nomic-embed-text

# # Embedding模型向量维度
# INIT_EMBEDDING_MODEL_DIMENSION=768

# # Embedding模型的ID,通常是一个字符串
# INIT_EMBEDDING_MODEL_ID=builtin:nomic-embed-text:768

# # Embedding模型的访问地址
# # 支持第三方模型服务的URL
# # 如果使用 Ollama 的本地服务,可以留空,ollama 会自动处理
# # INIT_EMBEDDING_MODEL_BASE_URL=your_embedding_model_base_url

# # Embedding模型的API密钥,如果需要身份验证,可以设置
# # 支持第三方模型服务的API密钥
# # 如果使用 Ollama 的本地服务,可以留空,ollama 会自动处理
# # INIT_EMBEDDING_MODEL_API_KEY=your_embedding_model_api_key

# # Rerank Model(可选)
# # 对于rag来说,使用Rerank模型对提升文档搜索的准确度有着重要作用
# # 目前 ollama 暂不支持运行 Rerank 模型
# # 使用的Rerank模型名称
# # INIT_RERANK_MODEL_NAME=your_rerank_model_name

# # Rerank模型的访问地址
# # 支持第三方模型服务的URL
# # INIT_RERANK_MODEL_BASE_URL=your_rerank_model_base_url

# # Rerank模型的API密钥,如果需要身份验证,可以设置
# # 支持第三方模型服务的API密钥
# # INIT_RERANK_MODEL_API_KEY=your_rerank_model_api_key

# # VLM_MODEL_NAME 使用的多模态模型名称
# # 用于解析图片数据
# # VLM_MODEL_NAME=your_vlm_model_name

# # VLM_MODEL_BASE_URL 使用的多模态模型访问地址
# # 支持第三方模型服务的URL
# # VLM_MODEL_BASE_URL=your_vlm_model_base_url

# # VLM_MODEL_API_KEY 使用的多模态模型API密钥
# # 支持第三方模型服务的API密钥
# # VLM_MODEL_API_KEY=your_vlm_model_api_key
# Neo4j的密码
# NEO4J_PASSWORD=password

13 .github/ISSUE_TEMPLATE/bug_report.yml (vendored)
@@ -49,15 +49,7 @@ body:

请按照以下步骤收集相关日志:

**1. 应用模块日志:**
```bash
docker exec -it WeKnora-app tail -f /var/log/WeKnora.log
```

**2. 文档解析模块日志:**
```bash
docker exec -it WeKnora-docreader tail -f /var/log/docreader.log
```
docker compose logs -f --tail=1000 app docreader postgres

请重现问题并收集相关日志,然后粘贴到下面的日志字段中。

@@ -68,8 +60,7 @@ body:
description: 请按照上面的指南收集并粘贴相关日志
placeholder: |
请粘贴从以下命令收集的日志:
- docker exec -it WeKnora-app tail -f /var/log/WeKnora.log
- docker exec -it WeKnora-docreader tail -f /var/log/docreader.log
docker compose logs -f --tail=1000 app docreader postgres
render: shell

- type: input
8 .github/ISSUE_TEMPLATE/question.yml (vendored)
@@ -68,14 +68,8 @@ body:

如果问题涉及错误或需要调试,请收集相关日志:

**应用模块日志:**
```bash
docker exec -it WeKnora-app tail -f /var/log/WeKnora.log
```

**文档解析模块日志:**
```bash
docker exec -it WeKnora-docreader tail -f /var/log/docreader.log
docker compose logs -f --tail=1000 app docreader postgres
```

- type: textarea
228 .github/workflows/docker-image.yml (vendored)
@@ -1,6 +1,8 @@
|
||||
name: Build and Push Docker Image
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "v*"
|
||||
branches:
|
||||
- main
|
||||
|
||||
@@ -9,51 +11,217 @@ concurrency:
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
build-app:
|
||||
build-ui:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- service_name: ui
|
||||
file: frontend/Dockerfile
|
||||
context: ./frontend
|
||||
platform: linux/amd64,linux/arm64
|
||||
- service_name: app
|
||||
file: docker/Dockerfile.app
|
||||
context: .
|
||||
platform: linux/amd64,linux/arm64
|
||||
- service_name: docreader
|
||||
file: docker/Dockerfile.docreader
|
||||
context: .
|
||||
platform: linux/amd64,linux/arm64
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v2
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Read VERSION file
|
||||
run: echo "VERSION=$(cat VERSION)" >> $GITHUB_ENV
|
||||
- name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ secrets.DOCKERHUB_USERNAME }}/weknora-ui
|
||||
|
||||
- name: Build ui Image
|
||||
uses: docker/build-push-action@v3
|
||||
with:
|
||||
push: true
|
||||
platforms: linux/amd64,linux/arm64
|
||||
file: frontend/Dockerfile
|
||||
context: ./frontend
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
cache-from: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/weknora-ui:cache
|
||||
cache-to: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/weknora-ui:cache,mode=max
|
||||
|
||||
build-docreader:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Free Disk Space
|
||||
uses: jlumbroso/free-disk-space@main
|
||||
with:
|
||||
# this might remove tools that are actually needed,
|
||||
# if set to "true" but frees about 6 GB
|
||||
tool-cache: false
|
||||
|
||||
# all of these default to true, but feel free to set to
|
||||
# "false" if necessary for your workflow
|
||||
android: true
|
||||
dotnet: true
|
||||
haskell: true
|
||||
large-packages: true
|
||||
docker-images: true
|
||||
swap-storage: true
|
||||
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ secrets.DOCKERHUB_USERNAME }}/weknora-docreader
|
||||
|
||||
- name: Build docreader Image
|
||||
uses: docker/build-push-action@v3
|
||||
with:
|
||||
push: true
|
||||
platforms: linux/amd64,linux/arm64
|
||||
file: docker/Dockerfile.docreader
|
||||
context: .
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
cache-from: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/weknora-docreader:cache
|
||||
cache-to: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/weknora-docreader:cache,mode=max
|
||||
|
||||
build-app:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- arch: amd64
|
||||
platform: linux/amd64
|
||||
runs: ubuntu-latest
|
||||
- arch: arm64
|
||||
platform: linux/arm64
|
||||
runs: ubuntu-24.04-arm
|
||||
runs-on: ${{ matrix.runs }}
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
id: setup-buildx
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ secrets.DOCKERHUB_USERNAME }}/weknora-app
|
||||
|
||||
- name: Prepare version info
|
||||
id: version
|
||||
run: |
|
||||
# 使用统一的版本管理脚本
|
||||
eval "$(./scripts/get_version.sh env)"
|
||||
echo "version=$VERSION" >> $GITHUB_OUTPUT
|
||||
echo "commit_id=$COMMIT_ID" >> $GITHUB_OUTPUT
|
||||
echo "build_time=$BUILD_TIME" >> $GITHUB_OUTPUT
|
||||
echo "go_version=$GO_VERSION" >> $GITHUB_OUTPUT
|
||||
|
||||
platform=${{ matrix.platform }}
|
||||
echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
|
||||
# 显示版本信息
|
||||
./scripts/get_version.sh info
|
||||
|
||||
- name: Build ${{ matrix.service_name }} Image
|
||||
- name: Build Cache for Docker
|
||||
uses: actions/cache@v4
|
||||
id: cache
|
||||
with:
|
||||
path: go-pkg-mod
|
||||
key: ${{ env.PLATFORM_PAIR }}-go-build-cache-${{ hashFiles('**/go.sum') }}
|
||||
|
||||
- name: Inject go-build-cache
|
||||
uses: reproducible-containers/buildkit-cache-dance@v3
|
||||
with:
|
||||
builder: ${{ steps.setup-buildx.outputs.name }}
|
||||
cache-map: |
|
||||
{
|
||||
"go-pkg-mod": "/go/pkg/mod"
|
||||
}
|
||||
skip-extraction: ${{ steps.cache.outputs.cache-hit }}
|
||||
|
||||
- name: Build app Image
|
||||
id: build
|
||||
uses: docker/build-push-action@v3
|
||||
with:
|
||||
push: true
|
||||
platforms: ${{ matrix.platform }}
|
||||
file: ${{ matrix.file }}
|
||||
context: ${{ matrix.context }}
|
||||
tags: |
|
||||
${{ secrets.DOCKERHUB_USERNAME }}/weknora-${{ matrix.service_name }}:latest
|
||||
${{ secrets.DOCKERHUB_USERNAME }}/weknora-${{ matrix.service_name }}:${{ env.VERSION }}
|
||||
cache-from: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/weknora-${{ matrix.service_name }}:cache
|
||||
cache-to: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/weknora-${{ matrix.service_name }}:cache,mode=max
|
||||
file: docker/Dockerfile.app
|
||||
context: .
|
||||
build-args: |
|
||||
${{ format('VERSION_ARG={0}', steps.version.outputs.version) }}
|
||||
${{ format('COMMIT_ID_ARG={0}', steps.version.outputs.commit_id) }}
|
||||
${{ format('BUILD_TIME_ARG={0}', steps.version.outputs.build_time) }}
|
||||
${{ format('GO_VERSION_ARG={0}', steps.version.outputs.go_version) }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
tags: ${{ secrets.DOCKERHUB_USERNAME }}/weknora-app
|
||||
cache-from: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/weknora-app:cache-${{ env.PLATFORM_PAIR }}
|
||||
cache-to: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/weknora-app:cache-${{ env.PLATFORM_PAIR }},mode=max
|
||||
outputs: type=image,push-by-digest=true,name-canonical=true,push=true
|
||||
|
||||
- name: Export digest
|
||||
run: |
|
||||
mkdir -p ${{ runner.temp }}/digests
|
||||
digest="${{ steps.build.outputs.digest }}"
|
||||
touch "${{ runner.temp }}/digests/${digest#sha256:}"
|
||||
|
||||
- name: Upload digest
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: digests-${{ env.PLATFORM_PAIR }}
|
||||
path: ${{ runner.temp }}/digests/*
|
||||
if-no-files-found: error
|
||||
retention-days: 1
|
||||
|
||||
merge:
|
||||
runs-on: ubuntu-latest
|
||||
needs:
|
||||
- build-app
|
||||
steps:
|
||||
- name: Download digests
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: ${{ runner.temp }}/digests
|
||||
pattern: digests-*
|
||||
merge-multiple: true
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ secrets.DOCKERHUB_USERNAME }}/weknora-app
|
||||
|
||||
- name: Create manifest list and push
|
||||
working-directory: ${{ runner.temp }}/digests
|
||||
run: |
|
||||
docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
|
||||
$(printf '${{ secrets.DOCKERHUB_USERNAME }}/weknora-app@sha256:%s ' *)
|
||||
|
||||
- name: Inspect image
|
||||
run: |
|
||||
docker buildx imagetools inspect ${{ secrets.DOCKERHUB_USERNAME }}/weknora-app:${{ steps.meta.outputs.version }}
|
||||
|
||||
5 .gitignore (vendored)
@@ -26,13 +26,14 @@ temp/

WeKnora
/models/
services/docreader/src/proto/__pycache__
test/data/mswag.txt
data/files/

.python-version
.venv/
**/__pycache__
.python-version

### macOS
# General
.DS_Store
PROGRESS_RU.md
86 CHANGELOG.md
@@ -2,6 +2,89 @@

All notable changes to this project will be documented in this file.

## [0.1.4] - 2025-09-17

### 🚀 Major Features
- **NEW**: Multi-knowledgebases operation support
  - Added comprehensive multi-knowledgebase management functionality
  - Implemented multi-data source search engine configuration and optimization logic
  - Enhanced knowledge base switching and management in UI
- **NEW**: Enhanced tenant information management
  - Added dedicated tenant information page
  - Improved user and tenant management capabilities

### 🎨 UI/UX Improvements
- **REDESIGNED**: Settings page with improved layout and functionality
- **ENHANCED**: Menu component with multi-knowledgebase support
- **IMPROVED**: Initialization configuration page structure
- **OPTIMIZED**: Login page and authentication flow

### 🔒 Security Fixes
- **FIXED**: XSS attack vulnerabilities in thinking component
- **FIXED**: Content Security Policy (CSP) errors
- **ENHANCED**: Frontend security measures and input sanitization

### 🐛 Bug Fixes
- **FIXED**: Login direct page navigation issues
- **FIXED**: App LLM model check logic
- **FIXED**: Version script functionality
- **FIXED**: File download content errors
- **IMPROVED**: Document content component display

### 🧹 Code Cleanup
- **REMOVED**: Test data functionality and related APIs
- **SIMPLIFIED**: Initialization configuration components
- **CLEANED**: Redundant UI components and unused code

## [0.1.3] - 2025-09-16

### 🔒 Security Features
- **NEW**: Added login authentication functionality to enhance system security
  - Implemented user authentication and authorization mechanisms
  - Added session management and access control
  - Fixed XSS attack vulnerabilities in frontend components

### 📚 Documentation Updates
- Added security notices in all README files (English, Chinese, Japanese)
- Updated deployment recommendations emphasizing internal/private network deployment
- Enhanced security guidelines to prevent information leakage risks
- Fixed documentation spelling issues

### 🛡️ Security Improvements
- Hide API keys in UI for security purposes
- Enhanced input sanitization and XSS protection
- Added comprehensive security utilities

### 🐛 Bug Fixes
- Fixed OCR AVX support issues
- Improved frontend health check dependencies
- Enhanced Docker binary downloads for target architecture
- Fixed COS file service initialization parameters and URL processing logic

### 🚀 Features & Enhancements
- Improved application and docreader log output
- Enhanced frontend routing and authentication flow
- Added comprehensive user management system
- Improved initialization configuration handling

### 🛡️ Security Recommendations
- Deploy WeKnora services in internal/private network environments
- Avoid direct exposure to public internet
- Configure proper firewall rules and access controls
- Regular updates for security patches and improvements

## [0.1.2] - 2025-09-10

- Fixed health check implementation for docreader service
- Improved query handling for empty queries
- Enhanced knowledge base column value update methods
- Optimized logging throughout the application
- Added process parsing documentation for markdown files
- Fixed OCR model pre-fetching in Docker containers
- Resolved image parser concurrency errors
- Added support for modifying listening port configuration

## [0.1.0] - 2025-09-08

- Initial public release of WeKnora.
@@ -14,4 +97,7 @@ All notable changes to this project will be documented in this file.
- Docker Compose for quick startup and service orchestration.
- MCP server support for integrating with MCP-compatible clients.

[0.1.4]: https://github.com/Tencent/WeKnora/tree/v0.1.4
[0.1.3]: https://github.com/Tencent/WeKnora/tree/v0.1.3
[0.1.2]: https://github.com/Tencent/WeKnora/tree/v0.1.2
[0.1.0]: https://github.com/Tencent/WeKnora/tree/v0.1.0
17 Makefile
@@ -85,7 +85,15 @@ clean:

# Build Docker image
docker-build-app:
	docker build --platform $(PLATFORM) -f docker/Dockerfile.app -t $(DOCKER_IMAGE):$(DOCKER_TAG) .
	@echo "获取版本信息..."
	@eval $$(./scripts/get_version.sh env); \
	./scripts/get_version.sh info; \
	docker build --platform $(PLATFORM) \
		--build-arg VERSION_ARG="$$VERSION" \
		--build-arg COMMIT_ID_ARG="$$COMMIT_ID" \
		--build-arg BUILD_TIME_ARG="$$BUILD_TIME" \
		--build-arg GO_VERSION_ARG="$$GO_VERSION" \
		-f docker/Dockerfile.app -t $(DOCKER_IMAGE):$(DOCKER_TAG) .

# Build docreader Docker image
docker-build-docreader:
@@ -168,7 +176,12 @@ deps:

# Build for production
build-prod:
	GOOS=linux go build -a -installsuffix cgo -ldflags="-w -s" -o $(BINARY_NAME) $(MAIN_PATH)
	VERSION=$${VERSION:-unknown}; \
	COMMIT_ID=$${COMMIT_ID:-unknown}; \
	BUILD_TIME=$${BUILD_TIME:-unknown}; \
	GO_VERSION=$${GO_VERSION:-unknown}; \
	LDFLAGS="-X 'github.com/Tencent/WeKnora/internal/handler.Version=$$VERSION' -X 'github.com/Tencent/WeKnora/internal/handler.CommitID=$$COMMIT_ID' -X 'github.com/Tencent/WeKnora/internal/handler.BuildTime=$$BUILD_TIME' -X 'github.com/Tencent/WeKnora/internal/handler.GoVersion=$$GO_VERSION'"; \
	go build -ldflags="-w -s $$LDFLAGS" -o $(BINARY_NAME) $(MAIN_PATH)

clean-db:
	@echo "Cleaning database..."
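The `-X` flags in `build-prod` only take effect if matching string variables exist in `github.com/Tencent/WeKnora/internal/handler`. A minimal sketch of what those declarations presumably look like (the real file may name or initialize them differently):

```go
// Sketch only: variable names inferred from the -X ldflags in the Makefile above.
package handler

// Populated at build time via:
//   -X 'github.com/Tencent/WeKnora/internal/handler.Version=...'
// and the corresponding CommitID/BuildTime/GoVersion flags.
var (
	Version   = "unknown"
	CommitID  = "unknown"
	BuildTime = "unknown"
	GoVersion = "unknown"
)
```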
112 README.md
@@ -15,7 +15,7 @@
|
||||
<img src="https://img.shields.io/badge/License-MIT-ffffff?labelColor=d4eaf7&color=2e6cc4" alt="License">
|
||||
</a>
|
||||
<a href="./CHANGELOG.md">
|
||||
<img alt="Version" src="https://img.shields.io/badge/version-0.1.0-2e6cc4?labelColor=d4eaf7">
|
||||
<img alt="Version" src="https://img.shields.io/badge/version-0.1.3-2e6cc4?labelColor=d4eaf7">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
@@ -41,6 +41,15 @@ It adopts a modular architecture that combines multimodal preprocessing, semanti
|
||||
|
||||
**Website:** https://weknora.weixin.qq.com
|
||||
|
||||
## 🔒 Security Notice
|
||||
|
||||
**Important:** Starting from v0.1.3, WeKnora includes login authentication functionality to enhance system security. For production deployments, we strongly recommend:
|
||||
|
||||
- Deploy WeKnora services in internal/private network environments rather than public internet
|
||||
- Avoid exposing the service directly to public networks to prevent potential information leakage
|
||||
- Configure proper firewall rules and access controls for your deployment environment
|
||||
- Regularly update to the latest version for security patches and improvements
|
||||
|
||||
## 🏗️ Architecture
|
||||
|
||||

|
||||
@@ -110,25 +119,58 @@ cp .env.example .env
|
||||
# All variables are documented in the .env.example comments
|
||||
```
|
||||
|
||||
#### ③ Start the services
|
||||
#### ③ Start the services (include Ollama)
|
||||
|
||||
Check the images that need to be started in the .env file.
|
||||
|
||||
```bash
|
||||
# Start all services (Ollama + backend containers)
|
||||
./scripts/start_all.sh
|
||||
# Or
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```bash
|
||||
make start-all
|
||||
```
|
||||
|
||||
#### ③ Start the services (backup)
|
||||
#### ③.0 Start ollama services (Optional)
|
||||
|
||||
```bash
|
||||
# Start ollama services (Optional)
|
||||
ollama serve > /dev/null 2>&1 &
|
||||
```
|
||||
|
||||
# Start the service
|
||||
#### ③.1 Activate different combinations of features
|
||||
|
||||
- Minimum core services
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
- All features enabled
|
||||
```bash
|
||||
docker-compose --profile full up -d
|
||||
```
|
||||
|
||||
- Tracing logs required
|
||||
```bash
|
||||
docker-compose --profile jaeger up -d
|
||||
```
|
||||
|
||||
- Neo4j knowledge graph required
|
||||
```bash
|
||||
docker-compose --profile neo4j up -d
|
||||
```
|
||||
|
||||
- Minio file storage service required
|
||||
```bash
|
||||
docker-compose --profile minio up -d
|
||||
```
|
||||
|
||||
- Multiple options combination
|
||||
```bash
|
||||
docker-compose --profile neo4j --profile minio up -d
|
||||
```
|
||||
|
||||
#### ④ Stop the services
|
||||
|
||||
```bash
|
||||
@@ -161,6 +203,8 @@ git clone https://github.com/Tencent/WeKnora
|
||||
```
|
||||
|
||||
#### 2️⃣ Configure MCP Server
|
||||
> It is recommended to directly refer to the [MCP Configuration Guide](./mcp-server/MCP_CONFIG.md) for configuration.
|
||||
|
||||
Configure the MCP client to connect to the server:
|
||||
```json
|
||||
{
|
||||
@@ -212,9 +256,7 @@ make clean-db
|
||||
|
||||
http://localhost
|
||||
|
||||
On first access, it will automatically redirect to the initialization configuration page. After configuration is complete, it will automatically redirect to the knowledge base page. Please follow the page instructions to complete model configuration.
|
||||
|
||||

|
||||
On your first visit, you will be automatically redirected to the registration/login page. After completing registration, please create a new knowledge base and finish the relevant settings on its configuration page.
|
||||
|
||||
## 📱 Interface Showcase
|
||||
|
||||
@@ -234,17 +276,13 @@ On first access, it will automatically redirect to the initialization configurat
|
||||
|
||||
### Document Knowledge Graph
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src="./docs/images/graph2.png" alt="Knowledge Graph View 1"></td>
|
||||
<td><img src="./docs/images/graph1.png" alt="Knowledge Graph View 2"></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
WeKnora supports transforming documents into knowledge graphs, displaying the relationships between different sections of the documents. Once the knowledge graph feature is enabled, the system analyzes and constructs an internal semantic association network that not only helps users understand document content but also provides structured support for indexing and retrieval, enhancing the relevance and breadth of search results.
|
||||
|
||||
### MCP Server Integration Effects
|
||||
<img width="950" height="2063" alt="MCP Server Integration Demo" src="https://github.com/user-attachments/assets/09111ec8-0489-415c-969d-aa3835778e14" />
|
||||
For detailed configuration, please refer to the [Knowledge Graph Configuration Guide](./docs/KnowledgeGraph.md).
|
||||
|
||||
### MCP Server
|
||||
|
||||
Please refer to the [MCP Configuration Guide](./mcp-server/MCP_CONFIG.md) for the necessary setup.
|
||||
|
||||
## 📘 API Reference
|
||||
|
||||
@@ -258,21 +296,17 @@ Detailed API documentation is available at: [API Docs](./docs/API.md)
|
||||
|
||||
```
|
||||
WeKnora/
|
||||
├── client/ # go client
|
||||
├── cmd/ # Main entry point
|
||||
├── internal/ # Core business logic
|
||||
├── config/ # Configuration files
|
||||
├── migrations/ # DB migration scripts
|
||||
├── scripts/ # Shell scripts
|
||||
├── services/ # Microservice logic
|
||||
├── docker/ # docker images files
|
||||
├── docreader/ # Document parsing app
|
||||
├── docs/ # Project documentation
|
||||
├── frontend/ # Frontend app
|
||||
└── docs/ # Project documentation
|
||||
```
|
||||
|
||||
### 🔧 Common Commands
|
||||
|
||||
```bash
|
||||
# Wipe all data from DB (use with caution)
|
||||
make clean-db
|
||||
├── internal/ # Core business logic
|
||||
├── mcp-server/ # MCP server
|
||||
├── migrations/ # DB migration scripts
|
||||
└── scripts/ # Shell scripts
|
||||
```
|
||||
|
||||
## 🤝 Contributing
|
||||
@@ -314,7 +348,23 @@ test: Add retrieval engine test cases
|
||||
refactor: Restructure document parsing module
|
||||
```
|
||||
|
||||
## 👥 Contributors
|
||||
|
||||
Thanks to these excellent contributors:
|
||||
|
||||
[](https://github.com/Tencent/WeKnora/graphs/contributors)
|
||||
|
||||
## 📄 License
|
||||
|
||||
This project is licensed under the [MIT License](./LICENSE).
|
||||
You are free to use, modify, and distribute the code with proper attribution.
|
||||
|
||||
## 📈 Project Statistics
|
||||
|
||||
<a href="https://www.star-history.com/#Tencent/WeKnora&type=date&legend=top-left">
|
||||
<picture>
|
||||
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&theme=dark&legend=top-left" />
|
||||
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&legend=top-left" />
|
||||
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&legend=top-left" />
|
||||
</picture>
|
||||
</a>
|
||||
|
||||
121 README_CN.md
@@ -15,7 +15,7 @@
|
||||
<img src="https://img.shields.io/badge/License-MIT-ffffff?labelColor=d4eaf7&color=2e6cc4" alt="License">
|
||||
</a>
|
||||
<a href="./CHANGELOG.md">
|
||||
<img alt="版本" src="https://img.shields.io/badge/version-0.1.0-2e6cc4?labelColor=d4eaf7">
|
||||
<img alt="版本" src="https://img.shields.io/badge/version-0.1.3-2e6cc4?labelColor=d4eaf7">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
@@ -41,6 +41,15 @@
|
||||
|
||||
**官网:** https://weknora.weixin.qq.com
|
||||
|
||||
## 🔒 安全声明
|
||||
|
||||
**重要提示:** 从 v0.1.3 版本开始,WeKnora 提供了登录鉴权功能,以增强系统安全性。在生产环境部署时,我们强烈建议:
|
||||
|
||||
- 将 WeKnora 服务部署在内网/私有网络环境中,而非公网环境
|
||||
- 避免将服务直接暴露在公网上,以防止重要信息泄露风险
|
||||
- 为部署环境配置适当的防火墙规则和访问控制
|
||||
- 定期更新到最新版本以获取安全补丁和改进
|
||||
|
||||
## 🏗️ 架构设计
|
||||
|
||||

|
||||
@@ -110,25 +119,58 @@ cp .env.example .env
|
||||
# 所有变量说明详见 .env.example 注释
|
||||
```
|
||||
|
||||
#### ③ 启动服务
|
||||
#### ③ 启动服务 (含 Ollama)
|
||||
|
||||
检查 .env 文件中需要启动的镜像。
|
||||
|
||||
```bash
|
||||
# 启动全部服务(含 Ollama 与后端容器)
|
||||
./scripts/start_all.sh
|
||||
# 或
|
||||
```
|
||||
|
||||
或者
|
||||
|
||||
```bash
|
||||
make start-all
|
||||
```
|
||||
|
||||
#### ③ 启动服务备选
|
||||
#### ③.0 启动Ollama (可选)
|
||||
|
||||
```bash
|
||||
# 启动 ollama 服务 (可选)
|
||||
ollama serve > /dev/null 2>&1 &
|
||||
```
|
||||
|
||||
# 启动服务
|
||||
#### ③.1 激活不同组合的功能
|
||||
|
||||
- 启动最小功能
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
- 启动全部功能
|
||||
```bash
|
||||
docker-compose --profile full up -d
|
||||
```
|
||||
|
||||
- 需要 tracing 日志
|
||||
```bash
|
||||
docker-compose --profile jaeger up -d
|
||||
```
|
||||
|
||||
- 需要 neo4j 知识图谱
|
||||
```bash
|
||||
docker-compose --profile neo4j up -d
|
||||
```
|
||||
|
||||
- 需要 minio 文件存储服务
|
||||
```bash
|
||||
docker-compose --profile minio up -d
|
||||
```
|
||||
|
||||
- 多选项组合
|
||||
```bash
|
||||
docker-compose --profile neo4j --profile minio up -d
|
||||
```
|
||||
|
||||
#### ④ 停止服务
|
||||
|
||||
```bash
|
||||
@@ -152,12 +194,19 @@ WeKnora 作为[微信对话开放平台](https://chatbot.weixin.qq.com)的核心
|
||||
- **零代码部署**:只需上传知识,即可在微信生态中快速部署智能问答服务,实现"即问即答"的体验
|
||||
- **高效问题管理**:支持高频问题的独立分类管理,提供丰富的数据工具,确保回答精准可靠且易于维护
|
||||
- **微信生态覆盖**:通过微信对话开放平台,WeKnora 的智能问答能力可无缝集成到公众号、小程序等微信场景中,提升用户交互体验
|
||||
### 🔗MCP服务器访问已经部署好的WEKnora
|
||||
|
||||
### 🔗 MCP 服务器访问已经部署好的 WeKnora
|
||||
|
||||
#### 1️⃣克隆储存库
|
||||
|
||||
```
|
||||
git clone https://github.com/Tencent/WeKnora
|
||||
```
|
||||
|
||||
#### 2️⃣配置MCP服务器
|
||||
|
||||
> 推荐直接参考 [MCP配置说明](./mcp-server/MCP_CONFIG.md) 进行配置。
|
||||
|
||||
mcp客户端配置服务器
|
||||
```json
|
||||
{
|
||||
@@ -175,6 +224,7 @@ mcp客户端配置服务器
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
使用stdio命令直接运行
|
||||
```
|
||||
pip install weknora-mcp-server
|
||||
@@ -208,10 +258,7 @@ make clean-db
|
||||
|
||||
http://localhost
|
||||
|
||||
首次访问会自动跳转到初始化配置页面,配置完成后会自动跳转到知识库页面。请按照页面提示信息完成模型的配置。
|
||||
|
||||

|
||||
|
||||
首次访问会自动跳转到注册登录页面,完成注册后,请创建一个新的知识库,并在该知识库的设置页面完成相关设置。
|
||||
|
||||
## 📱 功能展示
|
||||
|
||||
@@ -231,17 +278,13 @@ http://localhost
|
||||
|
||||
### 文档知识图谱
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src="./docs/images/graph2.png" alt="知识图谱展示1"></td>
|
||||
<td><img src="./docs/images/graph1.png" alt="知识图谱展示2"></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
WeKnora 支持将文档转化为知识图谱,展示文档中不同段落之间的关联关系。开启知识图谱功能后,系统会分析并构建文档内部的语义关联网络,不仅帮助用户理解文档内容,还为索引和检索提供结构化支撑,提升检索结果的相关性和广度。
|
||||
### 配套MCP服务器调用效果
|
||||
<img width="950" height="2063" alt="118d078426f42f3d4983c13386085d7f" src="https://github.com/user-attachments/assets/09111ec8-0489-415c-969d-aa3835778e14" />
|
||||
|
||||
具体配置请参考 [知识图谱配置说明](./docs/KnowledgeGraph.md) 进行相关配置。
|
||||
|
||||
### 配套MCP服务器
|
||||
|
||||
请参考 [MCP配置说明](./mcp-server/MCP_CONFIG.md) 进行相关配置。
|
||||
|
||||
## 📘 文档
|
||||
|
||||
@@ -255,21 +298,17 @@ WeKnora 支持将文档转化为知识图谱,展示文档中不同段落之间
|
||||
|
||||
```
|
||||
WeKnora/
|
||||
├── client/ # go客户端
|
||||
├── cmd/ # 应用入口
|
||||
├── internal/ # 核心业务逻辑
|
||||
├── config/ # 配置文件
|
||||
├── migrations/ # 数据库迁移脚本
|
||||
├── scripts/ # 启动与工具脚本
|
||||
├── services/ # 各子服务实现
|
||||
├── docker/ # docker 镜像文件
|
||||
├── docreader/ # 文档解析项目
|
||||
├── docs/ # 项目文档
|
||||
├── frontend/ # 前端项目
|
||||
└── docs/ # 项目文档
|
||||
```
|
||||
|
||||
### 🔧 常用命令
|
||||
|
||||
```bash
|
||||
# 清空数据库(慎用!)
|
||||
make clean-db
|
||||
├── internal/ # 核心业务逻辑
|
||||
├── mcp-server/ # MCP服务器
|
||||
├── migrations/ # 数据库迁移脚本
|
||||
└── scripts/ # 启动与工具脚本
|
||||
```
|
||||
|
||||
## 🤝 贡献指南
|
||||
@@ -311,7 +350,23 @@ test: 添加检索引擎测试用例
|
||||
refactor: 重构文档解析模块
|
||||
```
|
||||
|
||||
## 👥 贡献者
|
||||
|
||||
感谢以下优秀的贡献者们:
|
||||
|
||||
[](https://github.com/Tencent/WeKnora/graphs/contributors)
|
||||
|
||||
## 📄 许可证
|
||||
|
||||
本项目基于 [MIT](./LICENSE) 协议发布。
|
||||
你可以自由使用、修改和分发本项目代码,但需保留原始版权声明。
|
||||
|
||||
## 📈 项目统计
|
||||
|
||||
<a href="https://www.star-history.com/#Tencent/WeKnora&type=date&legend=top-left">
|
||||
<picture>
|
||||
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&theme=dark&legend=top-left" />
|
||||
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&legend=top-left" />
|
||||
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&legend=top-left" />
|
||||
</picture>
|
||||
</a>
|
||||
|
||||
129 README_JA.md
@@ -15,7 +15,7 @@
|
||||
<img src="https://img.shields.io/badge/License-MIT-ffffff?labelColor=d4eaf7&color=2e6cc4" alt="License">
|
||||
</a>
|
||||
<a href="./CHANGELOG.md">
|
||||
<img alt="バージョン" src="https://img.shields.io/badge/version-0.1.0-2e6cc4?labelColor=d4eaf7">
|
||||
<img alt="バージョン" src="https://img.shields.io/badge/version-0.1.3-2e6cc4?labelColor=d4eaf7">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
@@ -41,6 +41,15 @@
|
||||
|
||||
**公式サイト:** https://weknora.weixin.qq.com
|
||||
|
||||
## 🔒 セキュリティ通知
|
||||
|
||||
**重要:** v0.1.3バージョンより、WeKnoraにはシステムセキュリティを強化するためのログイン認証機能が含まれています。本番環境でのデプロイメントにおいて、以下を強く推奨します:
|
||||
|
||||
- WeKnoraサービスはパブリックインターネットではなく、内部/プライベートネットワーク環境にデプロイしてください
|
||||
- 重要な情報漏洩を防ぐため、サービスを直接パブリックネットワークに公開することは避けてください
|
||||
- デプロイメント環境に適切なファイアウォールルールとアクセス制御を設定してください
|
||||
- セキュリティパッチと改善のため、定期的に最新バージョンに更新してください
|
||||
|
||||
## 🏗️ アーキテクチャ設計
|
||||
|
||||

|
||||
@@ -110,25 +119,58 @@ cp .env.example .env
|
||||
# すべての変数の説明は.env.exampleのコメントを参照
|
||||
```
|
||||
|
||||
#### ③ サービスの起動
|
||||
#### ③ サービスを起動します(Ollama を含む)
|
||||
|
||||
.env ファイルで、起動する必要があるイメージを確認します。
|
||||
|
||||
```bash
|
||||
# すべてのサービスを起動(Ollamaとバックエンドコンテナを含む)
|
||||
./scripts/start_all.sh
|
||||
# または
|
||||
```
|
||||
|
||||
または
|
||||
|
||||
```bash
|
||||
make start-all
|
||||
```
|
||||
|
||||
#### ③ サービス起動の代替方法
|
||||
#### ③.0 ollama サービスを起動する (オプション)
|
||||
|
||||
```bash
|
||||
# ollamaサービスを起動(オプション)
|
||||
ollama serve > /dev/null 2>&1 &
|
||||
```
|
||||
|
||||
# サービスを起動
|
||||
#### ③.1 さまざまな機能の組み合わせを有効にする
|
||||
|
||||
- 最小限のコアサービス
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
- すべての機能を有効にする
|
||||
```bash
|
||||
docker-compose --profile full up -d
|
||||
```
|
||||
|
||||
- トレースログが必要
|
||||
```bash
|
||||
docker-compose --profile jaeger up -d
|
||||
```
|
||||
|
||||
- Neo4j ナレッジグラフが必要
|
||||
```bash
|
||||
docker-compose --profile neo4j up -d
|
||||
```
|
||||
|
||||
- Minio ファイルストレージサービスが必要
|
||||
```bash
|
||||
docker-compose --profile minio up -d
|
||||
```
|
||||
|
||||
- 複数のオプションの組み合わせ
|
||||
```bash
|
||||
docker-compose --profile neo4j --profile minio up -d
|
||||
```
|
||||
|
||||
#### ④ サービスの停止
|
||||
|
||||
```bash
|
||||
@@ -153,12 +195,17 @@ WeKnoraは[WeChat対話オープンプラットフォーム](https://chatbot.wei
|
||||
- **効率的な問題管理**:高頻度の問題の独立した分類管理をサポートし、豊富なデータツールを提供して、正確で信頼性が高く、メンテナンスが容易な回答を保証
|
||||
- **WeChatエコシステムカバレッジ**:WeChat対話オープンプラットフォームを通じて、WeKnoraのインテリジェントQ&A能力を公式アカウント、ミニプログラムなどのWeChatシナリオにシームレスに統合し、ユーザーインタラクション体験を向上
|
||||
|
||||
### 🔗MCPサーバーを使用してデプロイ済みのWeKnoraにアクセス
|
||||
### 🔗 MCP サーバーを使用してデプロイ済みの WeKnora にアクセス
|
||||
|
||||
#### 1️⃣リポジトリのクローン
|
||||
```
|
||||
git clone https://github.com/Tencent/WeKnora
|
||||
```
|
||||
#### 2️⃣MCPサーバーの設定
|
||||
|
||||
#### 2️⃣ MCPサーバーの設定
|
||||
|
||||
> 設定には直接 [MCP設定説明](./mcp-server/MCP_CONFIG.md) を参照することをお勧めします。
|
||||
|
||||
MCPクライアントでサーバーを設定
|
||||
```json
|
||||
{
|
||||
@@ -176,6 +223,7 @@ MCPクライアントでサーバーを設定
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
stdioコマンドで直接実行
|
||||
```
|
||||
pip install weknora-mcp-server
|
||||
@@ -209,10 +257,7 @@ make clean-db
|
||||
|
||||
http://localhost
|
||||
|
||||
初回アクセス時は自動的に初期設定ページにリダイレクトされ、設定完了後は自動的にナレッジベースページにリダイレクトされます。ページの指示に従ってモデルの設定を完了してください。
|
||||
|
||||

|
||||
|
||||
初回アクセス時は自動的に登録・ログインページに遷移します。登録完了後、新規にナレッジベースを作成し、その設定画面で必要な項目を構成してください。
|
||||
|
||||
## 📱 機能デモ
|
||||
|
||||
@@ -232,17 +277,13 @@ http://localhost
|
||||
|
||||
### 文書ナレッジグラフ
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src="./docs/images/graph2.png" alt="ナレッジグラフ表示1"></td>
|
||||
<td><img src="./docs/images/graph1.png" alt="ナレッジグラフ表示2"></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
WeKnoraは文書をナレッジグラフに変換し、文書内の異なる段落間の関連関係を表示することをサポートします。ナレッジグラフ機能を有効にすると、システムは文書内部の意味関連ネットワークを分析・構築し、ユーザーが文書内容を理解するのを助けるだけでなく、インデックスと検索に構造化サポートを提供し、検索結果の関連性と幅を向上させます。
|
||||
|
||||
### 対応MCPサーバー呼び出し効果
|
||||
<img width="950" height="2063" alt="118d078426f42f3d4983c13386085d7f" src="https://github.com/user-attachments/assets/09111ec8-0489-415c-969d-aa3835778e14" />
|
||||
詳細な設定については、[ナレッジグラフ設定ガイド](./docs/KnowledgeGraph.md)をご参照ください。
|
||||
|
||||
### 対応するMCPサーバー
|
||||
|
||||
[MCP設定ガイド](./mcp-server/MCP_CONFIG.md) をご参照のうえ、必要な設定を行ってください。
|
||||
|
||||
|
||||
## 📘 ドキュメント
|
||||
@@ -256,22 +297,18 @@ WeKnoraは文書をナレッジグラフに変換し、文書内の異なる段
|
||||
### 📁 プロジェクトディレクトリ構造
|
||||
|
||||
```
|
||||
WeKnora/
|
||||
├── cmd/ # アプリケーションエントリー
|
||||
├── internal/ # コアビジネスロジック
|
||||
├── config/ # 設定ファイル
|
||||
├── migrations/ # データベースマイグレーションスクリプト
|
||||
├── scripts/ # 起動とツールスクリプト
|
||||
├── services/ # 各サブサービスの実装
|
||||
├── frontend/ # フロントエンドプロジェクト
|
||||
└── docs/ # プロジェクトドキュメント
|
||||
```
|
||||
|
||||
### 🔧 よく使うコマンド
|
||||
|
||||
```bash
|
||||
# データベースをクリア(注意して使用!)
|
||||
make clean-db
|
||||
WeKnora/
|
||||
├── client/ # Goクライアント
|
||||
├── cmd/ # アプリケーションエントリ
|
||||
├── config/ # 設定ファイル
|
||||
├── docker/ # Dockerイメージファイル
|
||||
├── docreader/ # 文書解析プロジェクト
|
||||
├── docs/ # プロジェクトドキュメント
|
||||
├── frontend/ # フロントエンドプロジェクト
|
||||
├── internal/ # コアビジネスロジック
|
||||
├── mcp-server/ # MCPサーバー
|
||||
├── migrations/ # データベースマイグレーションスクリプト
|
||||
└── scripts/ # 起動およびツールスクリプト
|
||||
```
|
||||
|
||||
## 🤝 貢献ガイド
|
||||
@@ -313,7 +350,23 @@ test: 検索エンジンテストケースを追加
|
||||
refactor: 文書解析モジュールをリファクタリング
|
||||
```
|
||||
|
||||
## 👥 コントリビューター
|
||||
|
||||
素晴らしいコントリビューターに感謝します:
|
||||
|
||||
[](https://github.com/Tencent/WeKnora/graphs/contributors )
|
||||
|
||||
## 📄 ライセンス
|
||||
|
||||
このプロジェクトは[MIT](./LICENSE)ライセンスの下で公開されています。
|
||||
このプロジェクトのコードを自由に使用、変更、配布できますが、元の著作権表示を保持する必要があります。
|
||||
|
||||
## 📈 プロジェクト統計
|
||||
|
||||
<a href="https://www.star-history.com/#Tencent/WeKnora&type=date&legend=top-left">
|
||||
<picture>
|
||||
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&theme=dark&legend=top-left" />
|
||||
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&legend=top-left" />
|
||||
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=Tencent/WeKnora&type=date&legend=top-left" />
|
||||
</picture>
|
||||
</a>
|
||||
@@ -74,6 +74,9 @@ type UpdateImageInfoRequest struct {
// ErrDuplicateFile is returned when attempting to create a knowledge entry with a file that already exists
var ErrDuplicateFile = errors.New("file already exists")

// ErrDuplicateURL is returned when attempting to create a knowledge entry with a URL that already exists
var ErrDuplicateURL = errors.New("URL already exists")

// CreateKnowledgeFromFile creates a knowledge entry from a local file path
func (c *Client) CreateKnowledgeFromFile(ctx context.Context,
	knowledgeBaseID string, filePath string, metadata map[string]string, enableMultimodel *bool,
@@ -186,7 +189,12 @@ func (c *Client) CreateKnowledgeFromURL(ctx context.Context, knowledgeBaseID str
	}

	var response KnowledgeResponse
	if err := parseResponse(resp, &response); err != nil {
	if resp.StatusCode == http.StatusConflict {
		if err := json.NewDecoder(resp.Body).Decode(&response); err != nil {
			return nil, fmt.Errorf("failed to parse response: %w", err)
		}
		return &response.Data, ErrDuplicateURL
	} else if err := parseResponse(resp, &response); err != nil {
		return nil, err
	}

@@ -9,7 +9,7 @@ conversation:
  keyword_threshold: 0.3
  embedding_top_k: 10
  vector_threshold: 0.5
  rerank_threshold: 0.7
  rerank_threshold: 0.5
  rerank_top_k: 5
  fallback_strategy: "fixed"
  fallback_response: "抱歉,我无法回答这个问题。"
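With the client change further above, a 409 Conflict now surfaces as `ErrDuplicateURL` together with the already-existing record. A hedged caller-side sketch, where the import path and the exact `CreateKnowledgeFromURL` parameter list are assumptions (check the client package for the real signature):

```go
package main

import (
	"context"
	"errors"
	"log"

	client "github.com/Tencent/WeKnora/client" // assumed import path
)

func importURL(ctx context.Context, c *client.Client, kbID, url string) {
	// Parameter list abridged; CreateKnowledgeFromURL may take additional arguments.
	k, err := c.CreateKnowledgeFromURL(ctx, kbID, url)
	if errors.Is(err, client.ErrDuplicateURL) {
		// Duplicates are not fatal: the existing knowledge entry is returned alongside the error.
		log.Printf("URL already imported, reusing knowledge entry: %+v", k)
		return
	}
	if err != nil {
		log.Fatalf("import failed: %v", err)
	}
	log.Printf("created knowledge entry: %+v", k)
}
```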
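The retrieval config above also lowers `rerank_threshold` from 0.7 to 0.5, so more borderline chunks survive reranking. A purely illustrative sketch of what such a threshold typically does, not the project's actual retrieval code:

```go
// Illustrative only: hypothetical types and filtering logic for a rerank cutoff.
package retrieval

type ScoredChunk struct {
	Content     string
	RerankScore float64
}

// filterByRerank keeps at most topK chunks whose rerank score clears the threshold.
// Input is assumed sorted by RerankScore, descending; a lower threshold keeps more chunks.
func filterByRerank(chunks []ScoredChunk, threshold float64, topK int) []ScoredChunk {
	kept := make([]ScoredChunk, 0, topK)
	for _, c := range chunks {
		if c.RerankScore < threshold || len(kept) == topK {
			break
		}
		kept = append(kept, c)
	}
	return kept
}
```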
@@ -534,3 +534,69 @@ knowledge_base:
|
||||
split_markers: ["\n\n", "\n", "。"]
|
||||
image_processing:
|
||||
enable_multimodal: true
|
||||
|
||||
extract:
|
||||
extract_graph:
|
||||
description: |
|
||||
请基于给定文本,按以下步骤完成信息提取任务,确保逻辑清晰、信息完整准确:
|
||||
|
||||
## 一、实体提取与属性补充
|
||||
1. **提取核心实体**:通读文本,按逻辑顺序(如文本叙述顺序、实体关联紧密程度)提取所有与任务相关的核心实体。
|
||||
2. **补充实体详细属性**:针对每个提取的实体,全面补充其在文本中明确提及的详细属性,确保无关键属性遗漏。
|
||||
|
||||
## 二、关系提取与验证
|
||||
1. **明确关系类型**:仅从指定关系列表中选择对应类型,限定关系类型为: %s。
|
||||
2. **提取有效关系**:基于已提取的实体及属性,识别文本中真实存在的关系,确保关系符合文本事实、无虚假关联。
|
||||
3. **明确关系主体**:对每一组提取的关系,清晰标注两个关联主体,避免主体混淆。
|
||||
4. **补充关联属性**:若文本中存在与该关系直接相关的补充信息,需将该信息作为关系的关联属性补充,进一步完善关系信息。
|
||||
tags:
|
||||
- "作者"
|
||||
- "别名"
|
||||
examples:
|
||||
- text: |
|
||||
《红楼梦》,又名《石头记》,是清代作家曹雪芹创作的中国古典四大名著之一,被誉为中国封建社会的百科全书。该书前80回由曹雪芹所著,后40回一般认为是高鹗所续。
|
||||
小说以贾、史、王、薛四大家族的兴衰为背景,以贾宝玉、林黛玉和薛宝钗的爱情悲剧为主线,刻画了以贾宝玉和金陵十二钗为中心的正邪两赋、贤愚并出的高度复杂的人物群像。
|
||||
成书于乾隆年间(1743年前后),是中国文学史上现实主义的高峰,对后世影响深远。
|
||||
node:
|
||||
- name: "红楼梦"
|
||||
attributes:
|
||||
- "中国古典四大名著之一"
|
||||
- "又名《石头记》"
|
||||
- "被誉为中国封建社会的百科全书"
|
||||
- name: "石头记"
|
||||
attributes:
|
||||
- "《红楼梦》的别名"
|
||||
- name: "曹雪芹"
|
||||
attributes:
|
||||
- "清代作家"
|
||||
- "《红楼梦》前 80 回的作者"
|
||||
- name: "高鹗"
|
||||
attributes:
|
||||
- "一般认为是《红楼梦》后 40 回的续写者"
|
||||
relation:
|
||||
- node1: "红楼梦"
|
||||
node2: "曹雪芹"
|
||||
type: "作者"
|
||||
- node1: "红楼梦"
|
||||
node2: "高鹗"
|
||||
type: "作者"
|
||||
- node1: "红楼梦"
|
||||
node2: "石头记"
|
||||
type: "别名"
|
||||
extract_entity:
|
||||
description: |
|
||||
请基于用户给的问题,按以下步骤处理关键信息提取任务:
|
||||
1. 梳理逻辑关联:首先完整分析文本内容,明确其核心逻辑关系,并简要标注该核心逻辑类型;
|
||||
2. 提取关键实体:围绕梳理出的逻辑关系,精准提取文本中的关键信息并归类为明确实体,确保不遗漏核心信息、不添加冗余内容;
|
||||
3. 排序实体优先级:按实体与文本核心主题的关联紧密程度排序,优先呈现对理解文本主旨最重要的实体;
|
||||
examples:
|
||||
- text: "《红楼梦》,又名《石头记》,是清代作家曹雪芹创作的中国古典四大名著之一,被誉为中国封建社会的百科全书。"
|
||||
node:
|
||||
- name: "红楼梦"
|
||||
- name: "曹雪芹"
|
||||
- name: "中国古典四大名著"
|
||||
fabri_text:
|
||||
with_tag: |
|
||||
请随机生成一段文本,要求内容与 %s 等相关,字数在 [50-200] 之间,并且尽量包含一些与这些标签相关的专业术语或典型元素,使文本更具针对性和相关性。
|
||||
with_no_tag: |
|
||||
请随机生成一段文本,内容请自由发挥,字数在 [50-200] 之间。
|
||||
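The `extract_graph` examples above imply a node/relation output shape. Hypothetical Go types mirroring that structure, for orientation only (the project's actual types and field tags may differ):

```go
// Hypothetical types inferred from the YAML examples above.
package graph

type Node struct {
	Name       string   `yaml:"name"`
	Attributes []string `yaml:"attributes,omitempty"`
}

type Relation struct {
	Node1 string `yaml:"node1"`
	Node2 string `yaml:"node2"`
	Type  string `yaml:"type"` // restricted to the configured tags, e.g. "作者", "别名"
}

type Extraction struct {
	Nodes     []Node     `yaml:"node"`
	Relations []Relation `yaml:"relation"`
}
```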
@@ -1,27 +1,49 @@
|
||||
services:
|
||||
frontend:
|
||||
image: wechatopenai/weknora-ui:latest
|
||||
build: ./frontend
|
||||
container_name: WeKnora-frontend
|
||||
ports:
|
||||
- "${FRONTEND_PORT:-80}:80"
|
||||
depends_on:
|
||||
app:
|
||||
condition: service_healthy
|
||||
networks:
|
||||
- WeKnora-network
|
||||
restart: unless-stopped
|
||||
|
||||
app:
|
||||
image: wechatopenai/weknora-app:latest
|
||||
build:
|
||||
context: .
|
||||
dockerfile: docker/Dockerfile.app
|
||||
container_name: WeKnora-app
|
||||
ports:
|
||||
- "${APP_PORT:-8080}:8080"
|
||||
volumes:
|
||||
- data-files:/data/files
|
||||
- ./config/config.yaml:/app/config/config.yaml
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 60s
|
||||
environment:
|
||||
- COS_SECRET_ID=${COS_SECRET_ID}
|
||||
- COS_SECRET_KEY=${COS_SECRET_KEY}
|
||||
- COS_REGION=${COS_REGION}
|
||||
- COS_BUCKET_NAME=${COS_BUCKET_NAME}
|
||||
- COS_APP_ID=${COS_APP_ID}
|
||||
- COS_PATH_PREFIX=${COS_PATH_PREFIX}
|
||||
- COS_ENABLE_OLD_DOMAIN=${COS_ENABLE_OLD_DOMAIN}
|
||||
- GIN_MODE=${GIN_MODE}
|
||||
- COS_SECRET_ID=${COS_SECRET_ID:-}
|
||||
- COS_SECRET_KEY=${COS_SECRET_KEY:-}
|
||||
- COS_REGION=${COS_REGION:-}
|
||||
- COS_BUCKET_NAME=${COS_BUCKET_NAME:-}
|
||||
- COS_APP_ID=${COS_APP_ID:-}
|
||||
- COS_PATH_PREFIX=${COS_PATH_PREFIX:-}
|
||||
- COS_ENABLE_OLD_DOMAIN=${COS_ENABLE_OLD_DOMAIN:-}
|
||||
- GIN_MODE=${GIN_MODE:-}
|
||||
- DB_DRIVER=postgres
|
||||
- DB_HOST=postgres
|
||||
- DB_PORT=5432
|
||||
- DB_USER=${DB_USER}
|
||||
- DB_PASSWORD=${DB_PASSWORD}
|
||||
- DB_NAME=${DB_NAME}
|
||||
- DB_USER=${DB_USER:-}
|
||||
- DB_PASSWORD=${DB_PASSWORD:-}
|
||||
- DB_NAME=${DB_NAME:-}
|
||||
- TZ=Asia/Shanghai
|
||||
- OTEL_EXPORTER_OTLP_ENDPOINT=jaeger:4317
|
||||
- OTEL_SERVICE_NAME=WeKnora
|
||||
@@ -29,45 +51,47 @@ services:
|
||||
- OTEL_METRICS_EXPORTER=none
|
||||
- OTEL_LOGS_EXPORTER=none
|
||||
- OTEL_PROPAGATORS=tracecontext,baggage
|
||||
- RETRIEVE_DRIVER=${RETRIEVE_DRIVER}
|
||||
- ELASTICSEARCH_ADDR=${ELASTICSEARCH_ADDR}
|
||||
- ELASTICSEARCH_USERNAME=${ELASTICSEARCH_USERNAME}
|
||||
- ELASTICSEARCH_PASSWORD=${ELASTICSEARCH_PASSWORD}
|
||||
- ELASTICSEARCH_INDEX=${ELASTICSEARCH_INDEX}
|
||||
- RETRIEVE_DRIVER=${RETRIEVE_DRIVER:-}
|
||||
- ELASTICSEARCH_ADDR=${ELASTICSEARCH_ADDR:-}
|
||||
- ELASTICSEARCH_USERNAME=${ELASTICSEARCH_USERNAME:-}
|
||||
- ELASTICSEARCH_PASSWORD=${ELASTICSEARCH_PASSWORD:-}
|
||||
- ELASTICSEARCH_INDEX=${ELASTICSEARCH_INDEX:-}
|
||||
- DOCREADER_ADDR=docreader:50051
|
||||
- STORAGE_TYPE=${STORAGE_TYPE}
|
||||
- LOCAL_STORAGE_BASE_DIR=${LOCAL_STORAGE_BASE_DIR}
|
||||
- STORAGE_TYPE=${STORAGE_TYPE:-}
|
||||
- LOCAL_STORAGE_BASE_DIR=${LOCAL_STORAGE_BASE_DIR:-}
|
||||
- MINIO_ENDPOINT=minio:9000
|
||||
- MINIO_ACCESS_KEY_ID=${MINIO_ACCESS_KEY_ID:-minioadmin}
|
||||
- MINIO_SECRET_ACCESS_KEY=${MINIO_SECRET_ACCESS_KEY:-minioadmin}
|
||||
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME}
|
||||
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME:-}
|
||||
- OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-http://host.docker.internal:11434}
|
||||
- STREAM_MANAGER_TYPE=${STREAM_MANAGER_TYPE}
|
||||
- STREAM_MANAGER_TYPE=${STREAM_MANAGER_TYPE:-}
|
||||
- REDIS_ADDR=redis:6379
|
||||
- REDIS_PASSWORD=${REDIS_PASSWORD}
|
||||
- REDIS_DB=${REDIS_DB}
|
||||
- REDIS_PREFIX=${REDIS_PREFIX}
|
||||
- ENABLE_GRAPH_RAG=${ENABLE_GRAPH_RAG}
|
||||
- TENANT_AES_KEY=${TENANT_AES_KEY}
|
||||
- REDIS_PASSWORD=${REDIS_PASSWORD:-}
|
||||
- REDIS_DB=${REDIS_DB:-}
|
||||
- REDIS_PREFIX=${REDIS_PREFIX:-}
|
||||
- ENABLE_GRAPH_RAG=${ENABLE_GRAPH_RAG:-}
|
||||
- NEO4J_ENABLE=${NEO4J_ENABLE:-}
|
||||
- NEO4J_URI=bolt://neo4j:7687
|
||||
- NEO4J_USERNAME=${NEO4J_USERNAME:-neo4j}
|
||||
- NEO4J_PASSWORD=${NEO4J_PASSWORD:-password}
|
||||
- TENANT_AES_KEY=${TENANT_AES_KEY:-}
|
||||
- CONCURRENCY_POOL_SIZE=${CONCURRENCY_POOL_SIZE:-5}
|
||||
- INIT_LLM_MODEL_NAME=${INIT_LLM_MODEL_NAME}
|
||||
- INIT_LLM_MODEL_BASE_URL=${INIT_LLM_MODEL_BASE_URL}
|
||||
- INIT_LLM_MODEL_API_KEY=${INIT_LLM_MODEL_API_KEY}
|
||||
- INIT_EMBEDDING_MODEL_NAME=${INIT_EMBEDDING_MODEL_NAME}
|
||||
- INIT_EMBEDDING_MODEL_BASE_URL=${INIT_EMBEDDING_MODEL_BASE_URL}
|
||||
- INIT_EMBEDDING_MODEL_API_KEY=${INIT_EMBEDDING_MODEL_API_KEY}
|
||||
- INIT_EMBEDDING_MODEL_DIMENSION=${INIT_EMBEDDING_MODEL_DIMENSION}
|
||||
- INIT_EMBEDDING_MODEL_ID=${INIT_EMBEDDING_MODEL_ID}
|
||||
- INIT_RERANK_MODEL_NAME=${INIT_RERANK_MODEL_NAME}
|
||||
- INIT_RERANK_MODEL_BASE_URL=${INIT_RERANK_MODEL_BASE_URL}
|
||||
- INIT_RERANK_MODEL_API_KEY=${INIT_RERANK_MODEL_API_KEY}
|
||||
- INIT_LLM_MODEL_NAME=${INIT_LLM_MODEL_NAME:-}
|
||||
- INIT_LLM_MODEL_BASE_URL=${INIT_LLM_MODEL_BASE_URL:-}
|
||||
- INIT_LLM_MODEL_API_KEY=${INIT_LLM_MODEL_API_KEY:-}
|
||||
- INIT_EMBEDDING_MODEL_NAME=${INIT_EMBEDDING_MODEL_NAME:-}
|
||||
- INIT_EMBEDDING_MODEL_BASE_URL=${INIT_EMBEDDING_MODEL_BASE_URL:-}
|
||||
- INIT_EMBEDDING_MODEL_API_KEY=${INIT_EMBEDDING_MODEL_API_KEY:-}
|
||||
- INIT_EMBEDDING_MODEL_DIMENSION=${INIT_EMBEDDING_MODEL_DIMENSION:-}
|
||||
- INIT_EMBEDDING_MODEL_ID=${INIT_EMBEDDING_MODEL_ID:-}
|
||||
- INIT_RERANK_MODEL_NAME=${INIT_RERANK_MODEL_NAME:-}
|
||||
- INIT_RERANK_MODEL_BASE_URL=${INIT_RERANK_MODEL_BASE_URL:-}
|
||||
- INIT_RERANK_MODEL_API_KEY=${INIT_RERANK_MODEL_API_KEY:-}
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_started
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
minio:
|
||||
condition: service_started
|
||||
docreader:
|
||||
condition: service_healthy
|
||||
networks:
|
||||
@@ -76,61 +100,34 @@ services:
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
|
||||
minio:
|
||||
image: minio/minio:latest
|
||||
container_name: WeKnora-minio
|
||||
ports:
|
||||
- "${MINIO_PORT:-9000}:9000"
|
||||
- "${MINIO_CONSOLE_PORT:-9001}:9001"
|
||||
environment:
|
||||
- MINIO_ROOT_USER=${MINIO_ACCESS_KEY_ID:-minioadmin}
|
||||
- MINIO_ROOT_PASSWORD=${MINIO_SECRET_ACCESS_KEY:-minioadmin}
|
||||
command: server --console-address ":9001" /data
|
||||
volumes:
|
||||
- minio_data:/data
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||
interval: 30s
|
||||
timeout: 20s
|
||||
retries: 3
|
||||
networks:
|
||||
- WeKnora-network
|
||||
|
||||
frontend:
|
||||
image: wechatopenai/weknora-ui:latest
|
||||
container_name: WeKnora-frontend
|
||||
ports:
|
||||
- "${FRONTEND_PORT:-80}:80"
|
||||
depends_on:
|
||||
- app
|
||||
networks:
|
||||
- WeKnora-network
|
||||
restart: unless-stopped
|
||||
|
||||
docreader:
|
||||
image: wechatopenai/weknora-docreader:latest
|
||||
build:
|
||||
context: .
|
||||
dockerfile: docker/Dockerfile.docreader
|
||||
container_name: WeKnora-docreader
|
||||
ports:
|
||||
- "${DOCREADER_PORT:-50051}:50051"
|
||||
environment:
|
||||
- COS_SECRET_ID=${COS_SECRET_ID}
|
||||
- COS_SECRET_KEY=${COS_SECRET_KEY}
|
||||
- COS_REGION=${COS_REGION}
|
||||
- COS_BUCKET_NAME=${COS_BUCKET_NAME}
|
||||
- COS_APP_ID=${COS_APP_ID}
|
||||
- COS_PATH_PREFIX=${COS_PATH_PREFIX}
|
||||
- COS_ENABLE_OLD_DOMAIN=${COS_ENABLE_OLD_DOMAIN}
|
||||
- VLM_MODEL_BASE_URL=${VLM_MODEL_BASE_URL}
|
||||
- VLM_MODEL_NAME=${VLM_MODEL_NAME}
|
||||
- VLM_MODEL_API_KEY=${VLM_MODEL_API_KEY}
|
||||
- STORAGE_TYPE=${STORAGE_TYPE}
|
||||
- COS_SECRET_ID=${COS_SECRET_ID:-}
|
||||
- COS_SECRET_KEY=${COS_SECRET_KEY:-}
|
||||
- COS_REGION=${COS_REGION:-}
|
||||
- COS_BUCKET_NAME=${COS_BUCKET_NAME:-}
|
||||
- COS_APP_ID=${COS_APP_ID:-}
|
||||
- COS_PATH_PREFIX=${COS_PATH_PREFIX:-}
|
||||
- COS_ENABLE_OLD_DOMAIN=${COS_ENABLE_OLD_DOMAIN:-}
|
||||
- VLM_MODEL_BASE_URL=${VLM_MODEL_BASE_URL:-}
|
||||
- VLM_MODEL_NAME=${VLM_MODEL_NAME:-}
|
||||
- VLM_MODEL_API_KEY=${VLM_MODEL_API_KEY:-}
|
||||
- STORAGE_TYPE=${STORAGE_TYPE:-}
|
||||
- MINIO_PUBLIC_ENDPOINT=http://localhost:${MINIO_PORT:-9000}
|
||||
- MINIO_ENDPOINT=minio:9000
|
||||
- MINIO_ACCESS_KEY_ID=${MINIO_ACCESS_KEY_ID:-minioadmin}
|
||||
- MINIO_SECRET_ACCESS_KEY=${MINIO_SECRET_ACCESS_KEY:-minioadmin}
|
||||
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME}
|
||||
- MINIO_USE_SSL=${MINIO_USE_SSL}
|
||||
- WEB_PROXY=${WEB_PROXY}
|
||||
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME:-}
|
||||
- MINIO_USE_SSL=${MINIO_USE_SSL:-}
|
||||
- WEB_PROXY=${WEB_PROXY:-}
|
||||
- MINERU_ENDPOINT=${MINERU_ENDPOINT:-}
|
||||
healthcheck:
|
||||
test: ["CMD", "grpc_health_probe", "-addr=:50051"]
|
||||
interval: 30s
|
||||
@@ -143,35 +140,12 @@ services:
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
|
||||
jaeger:
|
||||
image: jaegertracing/all-in-one:latest
|
||||
ports:
|
||||
- "6831:6831/udp" # Jaeger Thrift接收器
|
||||
- "6832:6832/udp" # Jaeger Thrift接收器(Compact)
|
||||
- "5778:5778" # 配置端口
|
||||
- "16686:16686" # Web UI
|
||||
- "4317:4317" # OTLP gRPC接收器
|
||||
- "4318:4318" # OTLP HTTP接收器
|
||||
- "14250:14250" # 接收模型端口
|
||||
- "14268:14268" # Jaeger HTTP接收器
|
||||
- "9411:9411" # Zipkin兼容性端口
|
||||
environment:
|
||||
- COLLECTOR_OTLP_ENABLED=true
|
||||
- COLLECTOR_ZIPKIN_HOST_PORT=:9411
|
||||
volumes:
|
||||
- jaeger_data:/var/lib/jaeger # 持久化 Jaeger 数据
|
||||
networks:
|
||||
- WeKnora-network
|
||||
restart: unless-stopped
|
||||
# 修改的PostgreSQL配置
|
||||
postgres:
|
||||
image: paradedb/paradedb:latest
|
||||
image: paradedb/paradedb:v0.18.9-pg17
|
||||
container_name: WeKnora-postgres
|
||||
ports:
|
||||
- "${DB_PORT}:5432"
|
||||
environment:
|
||||
- POSTGRES_USER=${DB_USER}
|
||||
# NOCC:hardcode-password(工具误报)
|
||||
- POSTGRES_PASSWORD=${DB_PASSWORD}
|
||||
- POSTGRES_DB=${DB_NAME}
|
||||
volumes:
|
||||
@@ -193,15 +167,79 @@ services:
|
||||
redis:
|
||||
image: redis:7.0-alpine
|
||||
container_name: WeKnora-redis
|
||||
ports:
|
||||
- "${REDIS_PORT}:6379"
|
||||
volumes:
|
||||
- redis_data:/data
|
||||
command: redis-server --appendonly yes --requirepass ${REDIS_PASSWORD}
|
||||
restart: always
|
||||
networks:
|
||||
- WeKnora-network
|
||||
|
||||
minio:
|
||||
image: minio/minio:latest
|
||||
container_name: WeKnora-minio
|
||||
ports:
|
||||
- "${MINIO_PORT:-9000}:9000"
|
||||
- "${MINIO_CONSOLE_PORT:-9001}:9001"
|
||||
environment:
|
||||
- MINIO_ROOT_USER=${MINIO_ACCESS_KEY_ID:-minioadmin}
|
||||
- MINIO_ROOT_PASSWORD=${MINIO_SECRET_ACCESS_KEY:-minioadmin}
|
||||
command: server --console-address ":9001" /data
|
||||
volumes:
|
||||
- minio_data:/data
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||
interval: 30s
|
||||
timeout: 20s
|
||||
retries: 3
|
||||
networks:
|
||||
- WeKnora-network
|
||||
profiles:
|
||||
- minio
|
||||
- full
|
||||
|
||||
jaeger:
|
||||
image: jaegertracing/all-in-one:latest
|
||||
ports:
|
||||
- "6831:6831/udp" # Jaeger Thrift接收器
|
||||
- "6832:6832/udp" # Jaeger Thrift接收器(Compact)
|
||||
- "5778:5778" # 配置端口
|
||||
- "16686:16686" # Web UI
|
||||
- "4317:4317" # OTLP gRPC接收器
|
||||
- "4318:4318" # OTLP HTTP接收器
|
||||
- "14250:14250" # 接收模型端口
|
||||
- "14268:14268" # Jaeger HTTP接收器
|
||||
- "9411:9411" # Zipkin兼容性端口
|
||||
environment:
|
||||
- COLLECTOR_OTLP_ENABLED=true
|
||||
- COLLECTOR_ZIPKIN_HOST_PORT=:9411
|
||||
volumes:
|
||||
- jaeger_data:/var/lib/jaeger # 持久化 Jaeger 数据
|
||||
networks:
|
||||
- WeKnora-network
|
||||
restart: unless-stopped
|
||||
profiles:
|
||||
- jaeger
|
||||
- full
|
||||
|
||||
neo4j:
|
||||
image: neo4j:latest
|
||||
container_name: WeKnora-neo4j
|
||||
volumes:
|
||||
- neo4j-data:/data
|
||||
environment:
|
||||
- NEO4J_AUTH=${NEO4J_USERNAME:-neo4j}/${NEO4J_PASSWORD:-password}
|
||||
- NEO4J_apoc_export_file_enabled=true
|
||||
- NEO4J_apoc_import_file_enabled=true
|
||||
- NEO4J_apoc_import_file_use__neo4j__config=true
|
||||
- NEO4JLABS_PLUGINS=["apoc"]
|
||||
ports:
|
||||
- "7474:7474"
|
||||
- "7687:7687"
|
||||
restart: always
|
||||
networks:
|
||||
- WeKnora-network
|
||||
profiles:
|
||||
- neo4j
|
||||
- full
|
||||
|
||||
networks:
|
||||
WeKnora-network:
|
||||
driver: bridge
|
||||
@@ -210,5 +248,5 @@ volumes:
|
||||
postgres-data:
|
||||
data-files:
|
||||
jaeger_data:
|
||||
redis_data:
|
||||
minio_data:
|
||||
neo4j-data:
|
||||
|
||||
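The compose changes above switch env interpolation to the `${VAR:-}` form, so unset variables expand to empty strings instead of producing warnings. That is only safe when the application falls back to sensible defaults for empty values; a minimal sketch of that pattern, where the helper name and defaults are illustrative rather than the project's API:

```go
// Sketch of an env-with-default helper; illustrative only.
package config

import "os"

func getEnvOr(key, fallback string) string {
	if v := os.Getenv(key); v != "" {
		return v
	}
	return fallback
}

// Example usage: an empty APP_PORT coming from "${APP_PORT:-}" falls back to 8080.
// port := getEnvOr("APP_PORT", "8080")
```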
@@ -3,10 +3,6 @@ FROM golang:1.24-alpine AS builder

WORKDIR /app

# Install dependencies
RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.tuna.tsinghua.edu.cn/g' /etc/apk/repositories && \
    apk add --no-cache git build-base

# Receive sensitive values via build arguments
ARG GOPRIVATE_ARG
ARG GOPROXY_ARG
@@ -17,19 +13,33 @@ ENV GOPRIVATE=${GOPRIVATE_ARG}
ENV GOPROXY=${GOPROXY_ARG}
ENV GOSUMDB=${GOSUMDB_ARG}

# Copy go mod and sum files
COPY go.mod go.sum ./
RUN go mod download
# Install dependencies
RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.tuna.tsinghua.edu.cn/g' /etc/apk/repositories && \
    apk add --no-cache git build-base

ENV CGO_ENABLED=1
# Install migrate tool
RUN go install -tags 'postgres' github.com/golang-migrate/migrate/v4/cmd/migrate@latest

# Copy source code
# Copy go mod and sum files
COPY go.mod go.sum ./
RUN --mount=type=cache,target=/go/pkg/mod go mod download
COPY . .

# Build the application
RUN make build-prod
# Get version and commit info for build injection
ARG VERSION_ARG
ARG COMMIT_ID_ARG
ARG BUILD_TIME_ARG
ARG GO_VERSION_ARG

# Set build-time variables
ENV VERSION=${VERSION_ARG}
ENV COMMIT_ID=${COMMIT_ID_ARG}
ENV BUILD_TIME=${BUILD_TIME_ARG}
ENV GO_VERSION=${GO_VERSION_ARG}

# Build the application with version info
RUN --mount=type=cache,target=/go/pkg/mod make build-prod
RUN --mount=type=cache,target=/go/pkg/mod cp -r /go/pkg/mod/github.com/yanyiwu/ /app/yanyiwu/

# Final stage
FROM alpine:3.17
@@ -39,36 +49,31 @@ WORKDIR /app
# Install runtime dependencies
RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.tuna.tsinghua.edu.cn/g' /etc/apk/repositories && \
    apk update && apk upgrade && \
    apk add --no-cache build-base postgresql-client mysql-client ca-certificates tzdata sed curl bash supervisor vim wget

# Copy the binary from the builder stage
COPY --from=builder /app/WeKnora .
COPY --from=builder /app/config ./config
COPY --from=builder /app/scripts ./scripts
COPY --from=builder /app/migrations ./migrations
COPY --from=builder /app/dataset/samples ./dataset/samples

# Copy migrate tool from builder stage
COPY --from=builder /go/bin/migrate /usr/local/bin/
COPY --from=builder /go/pkg/mod/github.com/yanyiwu /go/pkg/mod/github.com/yanyiwu/

# Make scripts executable
RUN chmod +x ./scripts/*.sh

# Setup supervisor configuration
RUN mkdir -p /etc/supervisor.d/
COPY docker/config/supervisord.conf /etc/supervisor.d/supervisord.conf

# Expose ports
EXPOSE 8080

# Set environment variables
ENV CGO_ENABLED=1
    apk add --no-cache build-base postgresql-client mysql-client ca-certificates tzdata sed curl bash vim wget

# Create a non-root user and switch to it
RUN mkdir -p /data/files && \
    adduser -D -g '' appuser && \
    chown -R appuser:appuser /app /data/files

# Run supervisor instead of direct application start
CMD ["supervisord", "-c", "/etc/supervisor.d/supervisord.conf"]
# Copy migrate tool from builder stage
COPY --from=builder /go/bin/migrate /usr/local/bin/
COPY --from=builder /app/yanyiwu/ /go/pkg/mod/github.com/yanyiwu/

# Copy the binary from the builder stage
COPY --from=builder /app/config ./config
COPY --from=builder /app/scripts ./scripts
COPY --from=builder /app/migrations ./migrations
COPY --from=builder /app/dataset/samples ./dataset/samples
COPY --from=builder /app/WeKnora .

# Make scripts executable
RUN chmod +x ./scripts/*.sh

# Expose ports
EXPOSE 8080

# Switch to non-root user and run the application directly
USER appuser

CMD ["./WeKnora"]
@@ -26,30 +26,33 @@ RUN apt-get update && apt-get install -y \
    && rm -rf /var/lib/apt/lists/*

# Check for a local protoc package: install offline if present, otherwise install online; add other packages as needed
ARG TARGETARCH
COPY packages/ /app/packages/
RUN echo "Checking for local protoc package..." && \
    if [ -f "/app/packages/protoc-3.19.4-linux-x86_64.zip" ]; then \
    # Pick the correct protoc package name for the target architecture
    case ${TARGETARCH} in \
        "amd64") PROTOC_ARCH="x86_64" ;; \
        "arm64") PROTOC_ARCH="aarch_64" ;; \
        "arm") PROTOC_ARCH="arm" ;; \
        *) echo "Unsupported architecture for protoc: ${TARGETARCH}" && exit 1 ;; \
    esac && \
    PROTOC_PACKAGE="protoc-3.19.4-linux-${PROTOC_ARCH}.zip" && \
    if [ -f "/app/packages/${PROTOC_PACKAGE}" ]; then \
        echo "Found local protoc package, installing offline"; \
        # Offline install: use the local package (exact path to avoid ambiguity)
        cp /app/packages/protoc-*.zip /app/ && \
        unzip -o /app/protoc-*.zip -d /usr/local && \
        cp /app/packages/${PROTOC_PACKAGE} /app/ && \
        unzip -o /app/${PROTOC_PACKAGE} -d /usr/local && \
        chmod +x /usr/local/bin/protoc && \
        rm -f /app/protoc-*.zip; \
        rm -f /app/${PROTOC_PACKAGE}; \
    else \
        echo "No local protoc package found, installing online"; \
        # Online install: download from the network
        curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/protoc-3.19.4-linux-x86_64.zip && \
        unzip -o protoc-3.19.4-linux-x86_64.zip -d /usr/local && \
        curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/${PROTOC_PACKAGE} && \
        unzip -o ${PROTOC_PACKAGE} -d /usr/local && \
        chmod +x /usr/local/bin/protoc && \
        rm -f protoc-3.19.4-linux-x86_64.zip; \
        rm -f ${PROTOC_PACKAGE}; \
    fi

# Copy dependency files
COPY services/docreader/requirements.txt .

# Install dependencies
RUN pip cache purge && pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple

# Pre-download PP-OCRv4 models
RUN mkdir -p /root/.paddleocr/whl/det/ch && \
    mkdir -p /root/.paddleocr/whl/rec/ch && \
@@ -71,17 +74,21 @@ RUN mkdir -p /root/.paddleocr/whl/det/ch && \
    rm -f /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar && \
    rm -f /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar

# Copy dependency files
COPY docreader/pyproject.toml docreader/uv.lock ./
RUN pip install uv --break-system-packages && \
    python -m uv sync --locked --no-dev

# Copy source code and generation scripts
COPY services/docreader/src/ /app/src/
COPY services/docreader/scripts/ /app/scripts/
COPY docreader docreader

# Generate protobuf code
RUN chmod +x docreader/scripts/generate_proto.sh && \
    bash docreader/scripts/generate_proto.sh

# Make sure the model directory exists
RUN ls -la /root/.paddleocr/whl/

# Generate protobuf code
RUN chmod +x /app/scripts/generate_proto.sh && bash /app/scripts/generate_proto.sh


# =========================
# Runtime stage
# =========================
@@ -102,7 +109,6 @@ RUN apt-get update && apt-get install -y \
    libgl1 \
    libglib2.0-0 \
    antiword \
    supervisor \
    vim \
    tar \
    dpkg \
@@ -118,30 +124,38 @@ RUN apt-get update && apt-get install -y \
    && rm -rf /var/lib/apt/lists/*

# Install grpc_health_probe
ARG TARGETARCH
RUN GRPC_HEALTH_PROBE_VERSION=v0.4.24 && \
    wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-amd64 && \
    # Pick the correct binary for the target architecture
    case ${TARGETARCH} in \
        "amd64") ARCH="amd64" ;; \
        "arm64") ARCH="arm64" ;; \
        "arm") ARCH="arm" ;; \
        *) echo "Unsupported architecture: ${TARGETARCH}" && exit 1 ;; \
    esac && \
    wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-${ARCH} && \
    chmod +x /bin/grpc_health_probe

# Copy installed dependencies and generated code from the build stage
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
ENV VIRTUAL_ENV=/app/.venv
COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"

COPY --from=builder /usr/local/bin /usr/local/bin
COPY --from=builder /root/.paddleocr /root/.paddleocr
COPY --from=builder /app/src /app/src

# Install Playwright browser
RUN python -m playwright install webkit
RUN python -m playwright install-deps webkit

# Set Python path
ENV PYTHONPATH=/app/src
RUN cd /app/src && python -m download_deps
# COPY docreader/scripts/download_deps.py download_deps.py
# RUN python -m download_deps

# Create supervisor configuration
RUN mkdir -p /etc/supervisor/conf.d
COPY services/docreader/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
COPY docreader/pyproject.toml docreader/uv.lock ./
COPY --from=builder /app/docreader docreader

# Expose gRPC port
EXPOSE 50051

# Start the service with supervisor
CMD ["supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
# Run the Python service directly (logs go to stdout/stderr)
CMD ["uv", "run", "-m", "docreader.main"]
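The image ships `grpc_health_probe` for in-container checks, and the same health service can be probed from the host once port 50051 is published. A minimal sketch using the `grpcio-health-checking` package, assuming the default port mapping:

```python
# Minimal health probe for the docreader gRPC service.
# Assumes the container's port 50051 is published to localhost.
import grpc
from grpc_health.v1 import health_pb2, health_pb2_grpc

channel = grpc.insecure_channel("localhost:50051")
stub = health_pb2_grpc.HealthStub(channel)
response = stub.Check(health_pb2.HealthCheckRequest(service=""))
# Prints e.g. SERVING when the server registered by docreader/main.py is up
print(health_pb2.HealthCheckResponse.ServingStatus.Name(response.status))
```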
@@ -3,6 +3,9 @@ version: "3.8"
services:
  minio:
    image: minio/minio:latest
    read_only: true
    tmpfs:
      - /tmp
    container_name: WeKnora-minio
    ports:
      - "9000:9000"
@@ -26,4 +29,4 @@ volumes:

networks:
  WeKnora-network:
    external: true
    external: true

docreader/.pylintrc (new file, +5)
@@ -0,0 +1,5 @@
[LOGGING]
logging-format-style=fstr

[MESSAGES CONTROL]
; disable=W1203
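The new `.pylintrc` switches pylint's logging-format check to f-string style (the W1203 suppression is left commented out), so log calls written the way the docreader code writes them are accepted. A small illustration of the style this configuration allows:

```python
# With logging-format-style=fstr, pylint accepts f-string interpolation
# in logging calls instead of %-style placeholders.
import logging

logger = logging.getLogger("docreader")
file_name, n_chunks = "report.pdf", 12  # example values
logger.info(f"Parsed file {file_name}, with {n_chunks} chunks")
```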
@@ -6,7 +6,7 @@ import (
	"os"
	"time"

	"github.com/Tencent/WeKnora/services/docreader/src/proto"
	"github.com/Tencent/WeKnora/docreader/proto"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/resolver"
@@ -16,10 +16,8 @@ const (
	maxMessageSize = 50 * 1024 * 1024 // 50MB
)

var (
	// Logger is the default logger used by the client
	Logger = log.New(os.Stdout, "[DocReader] ", log.LstdFlags|log.Lmicroseconds)
)
// Logger is the default logger used by the client
var Logger = log.New(os.Stdout, "[DocReader] ", log.LstdFlags|log.Lmicroseconds)

// ImageInfo describes the information of a single image
type ImageInfo struct {
@@ -7,7 +7,7 @@ import (
	"testing"
	"time"

	"github.com/Tencent/WeKnora/services/docreader/src/proto"
	"github.com/Tencent/WeKnora/docreader/proto"
)

func init() {
@@ -1,39 +1,28 @@
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
from concurrent import futures
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import traceback
|
||||
import grpc
|
||||
import uuid
|
||||
import atexit
|
||||
from concurrent import futures
|
||||
from typing import Optional
|
||||
|
||||
import grpc
|
||||
from grpc_health.v1 import health_pb2_grpc
|
||||
from grpc_health.v1.health import HealthServicer
|
||||
|
||||
# Add parent directory to Python path
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
parent_dir = os.path.dirname(current_dir)
|
||||
if parent_dir not in sys.path:
|
||||
sys.path.insert(0, parent_dir)
|
||||
from docreader.models.read_config import ChunkingConfig
|
||||
from docreader.parser import Parser
|
||||
from docreader.parser.ocr_engine import OCREngine
|
||||
from docreader.proto import docreader_pb2_grpc
|
||||
from docreader.proto.docreader_pb2 import Chunk, Image, ReadResponse
|
||||
from docreader.utils.request import init_logging_request_id, request_id_context
|
||||
|
||||
from proto.docreader_pb2 import ReadResponse, Chunk, Image
|
||||
from proto import docreader_pb2_grpc
|
||||
from parser import Parser, OCREngine
|
||||
from parser.config import ChunkingConfig
|
||||
from utils.request import request_id_context, init_logging_request_id
|
||||
|
||||
# --- Encoding utilities: sanitize strings to valid UTF-8 and (optionally) multi-encoding read ---
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
try:
|
||||
# Optional dependency for charset detection; install via `pip install charset-normalizer`
|
||||
from charset_normalizer import from_bytes as _cn_from_bytes # type: ignore
|
||||
except Exception: # pragma: no cover
|
||||
_cn_from_bytes = None # type: ignore
|
||||
|
||||
# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values and cannot be encoded to UTF-8
|
||||
# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values
|
||||
# cannot be encoded to UTF-8
|
||||
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]")
|
||||
|
||||
|
||||
def to_valid_utf8_text(s: Optional[str]) -> str:
|
||||
"""Return a UTF-8 safe string for protobuf.
|
||||
|
||||
@@ -42,30 +31,9 @@ def to_valid_utf8_text(s: Optional[str]) -> str:
|
||||
"""
|
||||
if not s:
|
||||
return ""
|
||||
s = _SURROGATE_RE.sub("\uFFFD", s)
|
||||
s = _SURROGATE_RE.sub("\ufffd", s)
|
||||
return s.encode("utf-8", errors="replace").decode("utf-8")
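A quick check of the sanitizer's behaviour: `to_valid_utf8_text` maps lone surrogates to U+FFFD, so the result can always be encoded as UTF-8 for protobuf. The sample string below is made up for illustration:

```python
# Illustration only: lone surrogates cannot be UTF-8 encoded, so the
# sanitizer replaces them with the replacement character.
sample = "ok\ud800ok"            # hypothetical input containing a lone surrogate
cleaned = to_valid_utf8_text(sample)
assert "\ufffd" in cleaned
cleaned.encode("utf-8")          # no longer raises UnicodeEncodeError
```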
|
||||
|
||||
def read_text_with_fallback(file_path: str) -> str:
|
||||
"""Read text from file supporting multiple encodings with graceful fallback.
|
||||
|
||||
This server currently receives bytes over gRPC and delegates decoding to the parser.
|
||||
This helper is provided for future local-file reads if needed.
|
||||
"""
|
||||
with open(file_path, "rb") as f:
|
||||
raw = f.read()
|
||||
if _cn_from_bytes is not None:
|
||||
try:
|
||||
result = _cn_from_bytes(raw).best()
|
||||
if result:
|
||||
return str(result)
|
||||
except Exception:
|
||||
pass
|
||||
for enc in ("utf-8", "gb18030", "latin-1"):
|
||||
try:
|
||||
return raw.decode(enc, errors="replace")
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
return raw.decode("utf-8", errors="replace")
|
||||
|
||||
# Ensure no existing handlers
|
||||
for handler in logging.root.handlers[:]:
|
||||
@@ -88,6 +56,7 @@ MAX_MESSAGE_LENGTH = 50 * 1024 * 1024
|
||||
|
||||
parser = Parser()
|
||||
|
||||
|
||||
class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
@@ -109,7 +78,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
request.file_type or os.path.splitext(request.file_name)[1][1:]
|
||||
)
|
||||
logger.info(
|
||||
f"Received ReadFromFile request for file: {request.file_name}, type: {file_type}"
|
||||
f"ReadFromFile for file: {request.file_name}, type: {file_type}"
|
||||
)
|
||||
logger.info(f"File content size: {len(request.file_content)} bytes")
|
||||
|
||||
@@ -120,36 +89,42 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
enable_multimodal = request.read_config.enable_multimodal or False
|
||||
|
||||
logger.info(
|
||||
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
|
||||
f"multimodal={enable_multimodal}"
|
||||
f"Using chunking config: size={chunk_size}, "
|
||||
f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
|
||||
)
|
||||
|
||||
# Get Storage and VLM config from request
|
||||
storage_config = None
|
||||
vlm_config = None
|
||||
|
||||
|
||||
sc = request.read_config.storage_config
|
||||
# Keep parser-side key name as cos_config for backward compatibility
|
||||
storage_config = {
|
||||
'provider': 'minio' if sc.provider == 2 else 'cos',
|
||||
'region': sc.region,
|
||||
'bucket_name': sc.bucket_name,
|
||||
'access_key_id': sc.access_key_id,
|
||||
'secret_access_key': sc.secret_access_key,
|
||||
'app_id': sc.app_id,
|
||||
'path_prefix': sc.path_prefix,
|
||||
"provider": "minio" if sc.provider == 2 else "cos",
|
||||
"region": sc.region,
|
||||
"bucket_name": sc.bucket_name,
|
||||
"access_key_id": sc.access_key_id,
|
||||
"secret_access_key": sc.secret_access_key,
|
||||
"app_id": sc.app_id,
|
||||
"path_prefix": sc.path_prefix,
|
||||
}
|
||||
logger.info(f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}")
|
||||
|
||||
logger.info(
|
||||
f"Using Storage config: provider={storage_config.get('provider')}, "
|
||||
f"bucket={storage_config['bucket_name']}"
|
||||
)
|
||||
|
||||
vlm_config = {
|
||||
'model_name': request.read_config.vlm_config.model_name,
|
||||
'base_url': request.read_config.vlm_config.base_url,
|
||||
'api_key': request.read_config.vlm_config.api_key or '',
|
||||
'interface_type': request.read_config.vlm_config.interface_type or 'openai',
|
||||
"model_name": request.read_config.vlm_config.model_name,
|
||||
"base_url": request.read_config.vlm_config.base_url,
|
||||
"api_key": request.read_config.vlm_config.api_key or "",
|
||||
"interface_type": request.read_config.vlm_config.interface_type
|
||||
or "openai",
|
||||
}
|
||||
logger.info(f"Using VLM config: model={vlm_config['model_name']}, "
|
||||
f"base_url={vlm_config['base_url']}, "
|
||||
f"interface_type={vlm_config['interface_type']}")
|
||||
logger.info(
|
||||
f"Using VLM config: model={vlm_config['model_name']}, "
|
||||
f"base_url={vlm_config['base_url']}, "
|
||||
f"interface_type={vlm_config['interface_type']}"
|
||||
)
|
||||
|
||||
chunking_config = ChunkingConfig(
|
||||
chunk_size=chunk_size,
|
||||
@@ -161,7 +136,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
)
|
||||
|
||||
# Parse file
|
||||
logger.info(f"Starting file parsing process")
|
||||
logger.info("Starting file parsing process")
|
||||
result = self.parser.parse_file(
|
||||
request.file_name, file_type, request.file_content, chunking_config
|
||||
)
|
||||
@@ -175,12 +150,14 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
|
||||
# Convert to protobuf message
|
||||
logger.info(
|
||||
f"Successfully parsed file {request.file_name}, returning {len(result.chunks)} chunks"
|
||||
f"Parsed file {request.file_name}, with {len(result.chunks)} chunks"
|
||||
)
|
||||
|
||||
|
||||
# Build response, including image info
|
||||
response = ReadResponse(
|
||||
chunks=[self._convert_chunk_to_proto(chunk) for chunk in result.chunks]
|
||||
chunks=[
|
||||
self._convert_chunk_to_proto(chunk) for chunk in result.chunks
|
||||
]
|
||||
)
|
||||
logger.info(f"Response size: {response.ByteSize()} bytes")
|
||||
return response
|
||||
@@ -213,36 +190,42 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
enable_multimodal = request.read_config.enable_multimodal or False
|
||||
|
||||
logger.info(
|
||||
f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
|
||||
f"multimodal={enable_multimodal}"
|
||||
f"Using chunking config: size={chunk_size}, "
|
||||
f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
|
||||
)
|
||||
|
||||
# Get Storage and VLM config from request
|
||||
storage_config = None
|
||||
vlm_config = None
|
||||
|
||||
|
||||
sc = request.read_config.storage_config
|
||||
storage_config = {
|
||||
'provider': 'minio' if sc.provider == 2 else 'cos',
|
||||
'region': sc.region,
|
||||
'bucket_name': sc.bucket_name,
|
||||
'access_key_id': sc.access_key_id,
|
||||
'secret_access_key': sc.secret_access_key,
|
||||
'app_id': sc.app_id,
|
||||
'path_prefix': sc.path_prefix,
|
||||
"provider": "minio" if sc.provider == 2 else "cos",
|
||||
"region": sc.region,
|
||||
"bucket_name": sc.bucket_name,
|
||||
"access_key_id": sc.access_key_id,
|
||||
"secret_access_key": sc.secret_access_key,
|
||||
"app_id": sc.app_id,
|
||||
"path_prefix": sc.path_prefix,
|
||||
}
|
||||
logger.info(f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}")
|
||||
logger.info(
|
||||
f"Using Storage config: provider={storage_config.get('provider')}, "
|
||||
f"bucket={storage_config['bucket_name']}"
|
||||
)
|
||||
|
||||
vlm_config = {
|
||||
'model_name': request.read_config.vlm_config.model_name,
|
||||
'base_url': request.read_config.vlm_config.base_url,
|
||||
'api_key': request.read_config.vlm_config.api_key or '',
|
||||
'interface_type': request.read_config.vlm_config.interface_type or 'openai',
|
||||
"model_name": request.read_config.vlm_config.model_name,
|
||||
"base_url": request.read_config.vlm_config.base_url,
|
||||
"api_key": request.read_config.vlm_config.api_key or "",
|
||||
"interface_type": request.read_config.vlm_config.interface_type
|
||||
or "openai",
|
||||
}
|
||||
logger.info(f"Using VLM config: model={vlm_config['model_name']}, "
|
||||
f"base_url={vlm_config['base_url']}, "
|
||||
f"interface_type={vlm_config['interface_type']}")
|
||||
|
||||
logger.info(
|
||||
f"Using VLM config: model={vlm_config['model_name']}, "
|
||||
f"base_url={vlm_config['base_url']}, "
|
||||
f"interface_type={vlm_config['interface_type']}"
|
||||
)
|
||||
|
||||
chunking_config = ChunkingConfig(
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
@@ -253,8 +236,10 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
)
|
||||
|
||||
# Parse URL
|
||||
logger.info(f"Starting URL parsing process")
|
||||
result = self.parser.parse_url(request.url, request.title, chunking_config)
|
||||
logger.info("Starting URL parsing process")
|
||||
result = self.parser.parse_url(
|
||||
request.url, request.title, chunking_config
|
||||
)
|
||||
if not result:
|
||||
error_msg = "Failed to parse URL"
|
||||
logger.error(error_msg)
|
||||
@@ -264,11 +249,13 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
|
||||
# Convert to protobuf message, including image info
|
||||
logger.info(
|
||||
f"Successfully parsed URL {request.url}, returning {len(result.chunks)} chunks"
|
||||
f"Parsed URL {request.url}, returning {len(result.chunks)} chunks"
|
||||
)
|
||||
|
||||
|
||||
response = ReadResponse(
|
||||
chunks=[self._convert_chunk_to_proto(chunk) for chunk in result.chunks]
|
||||
chunks=[
|
||||
self._convert_chunk_to_proto(chunk) for chunk in result.chunks
|
||||
]
|
||||
)
|
||||
logger.info(f"Response size: {response.ByteSize()} bytes")
|
||||
return response
|
||||
@@ -280,7 +267,7 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
context.set_code(grpc.StatusCode.INTERNAL)
|
||||
context.set_details(str(e))
|
||||
return ReadResponse(error=str(e))
|
||||
|
||||
|
||||
def _convert_chunk_to_proto(self, chunk):
|
||||
"""Convert internal Chunk object to protobuf Chunk message
|
||||
Ensures all string fields are valid UTF-8 for protobuf (no lone surrogates).
|
||||
@@ -294,10 +281,12 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
start=getattr(chunk, "start", 0),
|
||||
end=getattr(chunk, "end", 0),
|
||||
)
|
||||
|
||||
|
||||
# If chunk has images attribute and is not empty, add image info
|
||||
if hasattr(chunk, "images") and chunk.images:
|
||||
logger.info(f"Adding {len(chunk.images)} images to chunk {getattr(chunk, 'seq', 0)}")
|
||||
logger.info(
|
||||
f"Adding {len(chunk.images)} images to chunk {getattr(chunk, 'seq', 0)}"
|
||||
)
|
||||
for img_info in chunk.images:
|
||||
# img_info expected as dict
|
||||
proto_image = Image(
|
||||
@@ -309,63 +298,52 @@ class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
|
||||
end=int(img_info.get("end", 0) or 0),
|
||||
)
|
||||
proto_chunk.images.append(proto_image)
|
||||
|
||||
|
||||
return proto_chunk
|
||||
|
||||
def init_ocr_engine(ocr_backend, ocr_config):
|
||||
|
||||
def init_ocr_engine(ocr_backend: Optional[str] = None, **kwargs):
|
||||
"""Initialize OCR engine"""
|
||||
try:
|
||||
logger.info(f"Initializing OCR engine with backend: {ocr_backend}")
|
||||
ocr_engine = OCREngine.get_instance(backend_type=ocr_backend, **ocr_config)
|
||||
if ocr_engine:
|
||||
logger.info("OCR engine initialized successfully")
|
||||
return True
|
||||
else:
|
||||
logger.error("OCR engine initialization failed")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.error(f"Error initializing OCR engine: {str(e)}")
|
||||
return False
|
||||
backend_type = ocr_backend or os.getenv("OCR_BACKEND", "paddle")
|
||||
logger.info(f"Initializing OCR engine with backend: {backend_type}")
|
||||
OCREngine.get_instance(backend_type=backend_type, **kwargs)
|
||||
|
||||
|
||||
def serve():
|
||||
|
||||
init_ocr_engine(os.getenv("OCR_BACKEND", "paddle"), {
|
||||
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
|
||||
})
|
||||
|
||||
def main():
|
||||
init_ocr_engine()
|
||||
|
||||
# Set max number of worker threads
|
||||
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
|
||||
logger.info(f"Starting DocReader service with {max_workers} worker threads")
|
||||
|
||||
|
||||
# Get port number
|
||||
port = os.environ.get("GRPC_PORT", "50051")
|
||||
|
||||
|
||||
# Create server
|
||||
server = grpc.server(
|
||||
futures.ThreadPoolExecutor(max_workers=max_workers),
|
||||
options=[
|
||||
('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
|
||||
('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
|
||||
("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
|
||||
("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
# Register services
|
||||
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
|
||||
|
||||
|
||||
# Register health check service
|
||||
health_servicer = HealthServicer()
|
||||
health_pb2_grpc.add_HealthServicer_to_server(health_servicer, server)
|
||||
|
||||
|
||||
# Set listen address
|
||||
server.add_insecure_port(f"[::]:{port}")
|
||||
|
||||
|
||||
# Start service
|
||||
server.start()
|
||||
|
||||
|
||||
logger.info(f"Server started on port {port}")
|
||||
logger.info("Server is ready to accept connections")
|
||||
|
||||
|
||||
try:
|
||||
# Wait for service termination
|
||||
server.wait_for_termination()
|
||||
@@ -373,5 +351,6 @@ def serve():
|
||||
logger.info("Received termination signal, shutting down server")
|
||||
server.stop(0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
serve()
|
||||
main()
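For local testing, a client can call the service started by `main()` above. The sketch below is only an outline under stated assumptions: the stub name `DocReaderStub` and request message `ReadFromFileRequest` are guesses based on the generated `docreader.proto` code and may differ from the actual definitions; the 50MB option values mirror the server's MAX_MESSAGE_LENGTH.

```python
# Hypothetical smoke-test client for the DocReader gRPC service.
import grpc
from docreader.proto import docreader_pb2, docreader_pb2_grpc

MAX_LEN = 50 * 1024 * 1024  # match the server's message size limit

def read_file(path: str, host: str = "localhost:50051"):
    with open(path, "rb") as f:
        content = f.read()
    channel = grpc.insecure_channel(
        host,
        options=[
            ("grpc.max_send_message_length", MAX_LEN),
            ("grpc.max_receive_message_length", MAX_LEN),
        ],
    )
    stub = docreader_pb2_grpc.DocReaderStub(channel)      # assumed stub name
    request = docreader_pb2.ReadFromFileRequest(          # assumed message name
        file_name=path,
        file_content=content,
    )
    return stub.ReadFromFile(request)
```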
|
||||
docreader/models/__init__.py (new file, +0)

docreader/models/document.py (new file, +87)
@@ -0,0 +1,87 @@
"""Chunk document schema."""

import json
from typing import Any, Dict, List

from pydantic import BaseModel, Field


class Chunk(BaseModel):
    """Document Chunk including chunk content, chunk metadata."""

    content: str = Field(default="", description="chunk text content")
    seq: int = Field(default=0, description="Chunk sequence number")
    start: int = Field(default=0, description="Chunk start position")
    end: int = Field(description="Chunk end position")
    images: List[Dict[str, Any]] = Field(
        default_factory=list, description="Images in the chunk"
    )

    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="metadata fields",
    )

    def to_dict(self, **kwargs: Any) -> Dict[str, Any]:
        """Convert Chunk to dict."""

        data = self.model_dump()
        data.update(kwargs)
        data["class_name"] = self.__class__.__name__
        return data

    def to_json(self, **kwargs: Any) -> str:
        """Convert Chunk to json."""
        data = self.to_dict(**kwargs)
        return json.dumps(data)

    def __hash__(self):
        """Hash function."""
        return hash((self.content,))

    def __eq__(self, other):
        """Equal function."""
        return self.content == other.content

    @classmethod
    def from_dict(cls, data: Dict[str, Any], **kwargs: Any):  # type: ignore
        """Create Chunk from dict."""
        if isinstance(kwargs, dict):
            data.update(kwargs)

        data.pop("class_name", None)
        return cls(**data)

    @classmethod
    def from_json(cls, data_str: str, **kwargs: Any):  # type: ignore
        """Create Chunk from json."""
        data = json.loads(data_str)
        return cls.from_dict(data, **kwargs)


class Document(BaseModel):
    """Document including document content, document metadata."""

    model_config = {"arbitrary_types_allowed": True}

    content: str = Field(default="", description="document text content")
    images: Dict[str, str] = Field(
        default_factory=dict, description="Images in the document"
    )

    chunks: List[Chunk] = Field(default_factory=list, description="document chunks")
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="metadata fields",
    )

    def set_content(self, content: str) -> None:
        """Set document content."""
        self.content = content

    def get_content(self) -> str:
        """Get document content."""
        return self.content

    def is_valid(self) -> bool:
        return self.content != ""
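Since `Chunk` is a plain Pydantic model, round-tripping through the helpers above is straightforward; a small usage sketch:

```python
# Round-trip a Chunk through its dict/JSON helpers.
from docreader.models.document import Chunk, Document

chunk = Chunk(content="hello world", seq=0, start=0, end=11)
payload = chunk.to_json()
restored = Chunk.from_json(payload)
assert restored == chunk          # equality is defined on content only

doc = Document(content="hello world", chunks=[chunk])
print(doc.is_valid(), len(doc.chunks))   # True 1
```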
docreader/models/read_config.py (new file, +27)
@@ -0,0 +1,27 @@
from dataclasses import dataclass, field


@dataclass
class ChunkingConfig:
    """
    Configuration for text chunking process.
    Controls how documents are split into smaller pieces for processing.
    """

    # Maximum size of each chunk in tokens/chars
    chunk_size: int = 512

    # Number of tokens/chars to overlap between chunks
    chunk_overlap: int = 50

    # Text separators in order of priority
    separators: list = field(default_factory=lambda: ["\n\n", "\n", "。"])

    # Whether to enable multimodal processing (text + images)
    enable_multimodal: bool = False

    # Preferred field name going forward
    storage_config: dict[str, str] = field(default_factory=dict)

    # VLM configuration for image captioning
    vlm_config: dict[str, str] = field(default_factory=dict)
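A `ChunkingConfig` is what both gRPC handlers build from the request before calling the parser; constructing one directly looks like this (the values are illustrative):

```python
# Example configuration: smaller chunks, multimodal processing enabled.
from docreader.models.read_config import ChunkingConfig

config = ChunkingConfig(
    chunk_size=256,
    chunk_overlap=32,
    separators=["\n\n", "\n", "。"],
    enable_multimodal=True,
)
print(config.chunk_size, config.enable_multimodal)
```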
@@ -13,22 +13,20 @@ The parsers extract content from documents and can split them into
meaningful chunks for further processing and indexing.
"""

from .base_parser import BaseParser, ParseResult
from .docx_parser import DocxParser
from .csv_parser import CSVParser
from .doc_parser import DocParser
from .pdf_parser import PDFParser
from .markdown_parser import MarkdownParser
from .text_parser import TextParser
from .docx2_parser import Docx2Parser
from .excel_parser import ExcelParser
from .image_parser import ImageParser
from .web_parser import WebParser
from .markdown_parser import MarkdownParser
from .parser import Parser
from .config import ChunkingConfig
from .ocr_engine import OCREngine
from .pdf_parser import PDFParser
from .text_parser import TextParser
from .web_parser import WebParser

# Export public classes and modules
__all__ = [
    "BaseParser",  # Base parser class that all format parsers inherit from
    "DocxParser",  # Parser for .docx files (modern Word documents)
    "Docx2Parser",  # Parser for .docx files (modern Word documents)
    "DocParser",  # Parser for .doc files (legacy Word documents)
    "PDFParser",  # Parser for PDF documents
    "MarkdownParser",  # Parser for Markdown text files
@@ -36,7 +34,6 @@ __all__ = [
    "ImageParser",  # Parser for images with text content
    "WebParser",  # Parser for web pages
    "Parser",  # Main parser factory that selects the appropriate parser
    "ChunkingConfig",  # Configuration for text chunking behavior
    "ParseResult",  # Standard result format returned by all parsers
    "OCREngine",  # OCR engine for extracting text from images
    "CSVParser",  # Parser for CSV files
    "ExcelParser",  # Parser for Excel files
]
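Pulling the pieces together, the `Parser` facade re-exported here is what the gRPC service drives; the call below mirrors the `parse_file` invocation in `docreader/main.py` (the file name and content are illustrative):

```python
# Sketch of driving the parser package directly, mirroring main.py.
from docreader.models.read_config import ChunkingConfig
from docreader.parser import Parser

parser = Parser()
config = ChunkingConfig(chunk_size=512, chunk_overlap=50)

with open("example.md", "rb") as f:          # illustrative input file
    result = parser.parse_file("example.md", "md", f.read(), config)

if result:
    print(f"{len(result.chunks)} chunks extracted")
```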
@@ -1,65 +1,28 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import os
|
||||
import asyncio
|
||||
from typing import List, Dict, Any, Optional, Tuple, Union
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
import logging
|
||||
import sys
|
||||
import traceback
|
||||
import numpy as np
|
||||
import time
|
||||
import io
|
||||
import json
|
||||
from .ocr_engine import OCREngine
|
||||
from .image_utils import image_to_base64
|
||||
from .config import ChunkingConfig
|
||||
from .storage import create_storage
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
# Add parent directory to Python path for src imports
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
parent_dir = os.path.dirname(current_dir)
|
||||
if parent_dir not in sys.path:
|
||||
sys.path.insert(0, parent_dir)
|
||||
|
||||
try:
|
||||
from services.docreader.src.parser.caption import Caption
|
||||
except ImportError:
|
||||
# Fallback: try relative import
|
||||
try:
|
||||
from .caption import Caption
|
||||
except ImportError:
|
||||
# If both imports fail, set to None
|
||||
Caption = None
|
||||
logging.warning(
|
||||
"Failed to import Caption, image captioning will be unavailable"
|
||||
)
|
||||
from docreader.models.document import Chunk, Document
|
||||
from docreader.models.read_config import ChunkingConfig
|
||||
from docreader.parser.caption import Caption
|
||||
from docreader.parser.ocr_engine import OCREngine
|
||||
from docreader.parser.storage import create_storage
|
||||
from docreader.splitter.splitter import TextSplitter
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Chunk:
|
||||
"""Chunk result"""
|
||||
|
||||
content: str # Chunk content
|
||||
seq: int # Chunk sequence number
|
||||
start: int # Chunk start position
|
||||
end: int # Chunk end position
|
||||
images: List[Dict[str, Any]] = field(default_factory=list) # Images in the chunk
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParseResult:
|
||||
"""Parse result"""
|
||||
|
||||
text: str # Extracted text content
|
||||
chunks: Optional[List[Chunk]] = None # Chunk results
|
||||
|
||||
|
||||
class BaseParser(ABC):
|
||||
"""Base parser interface"""
|
||||
|
||||
@@ -97,17 +60,18 @@ class BaseParser(ABC):
|
||||
def __init__(
|
||||
self,
|
||||
file_name: str = "",
|
||||
file_type: str = None,
|
||||
file_type: Optional[str] = None,
|
||||
enable_multimodal: bool = True,
|
||||
chunk_size: int = 1000,
|
||||
chunk_overlap: int = 200,
|
||||
separators: list = ["\n\n", "\n", "。"],
|
||||
separators: list[str] = ["\n\n", "\n", "。"],
|
||||
ocr_backend: str = "paddle",
|
||||
ocr_config: dict = None,
|
||||
ocr_config: dict = {},
|
||||
max_image_size: int = 1920, # Maximum image size
|
||||
max_concurrent_tasks: int = 5, # Max concurrent tasks
|
||||
max_chunks: int = 1000, # Max number of returned chunks
|
||||
chunking_config: ChunkingConfig = None, # Chunking configuration object
|
||||
chunking_config: Optional[ChunkingConfig] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize parser
|
||||
|
||||
@@ -125,7 +89,6 @@ class BaseParser(ABC):
|
||||
max_chunks: Max number of returned chunks
|
||||
"""
|
||||
# Storage client instance
|
||||
self._storage = None
|
||||
self.file_name = file_name
|
||||
self.file_type = file_type or os.path.splitext(file_name)[1]
|
||||
self.enable_multimodal = enable_multimodal
|
||||
@@ -133,15 +96,16 @@ class BaseParser(ABC):
|
||||
self.chunk_overlap = chunk_overlap
|
||||
self.separators = separators
|
||||
self.ocr_backend = os.getenv("OCR_BACKEND", ocr_backend)
|
||||
self.ocr_config = ocr_config or {}
|
||||
self.ocr_config = ocr_config
|
||||
self.max_image_size = max_image_size
|
||||
self.max_concurrent_tasks = max_concurrent_tasks
|
||||
self.max_chunks = max_chunks
|
||||
self.chunking_config = chunking_config
|
||||
|
||||
logger.info(
|
||||
f"Initializing {self.__class__.__name__} for file: {file_name}, type: {self.file_type}"
|
||||
self.storage = create_storage(
|
||||
self.chunking_config.storage_config if self.chunking_config else None
|
||||
)
|
||||
|
||||
logger.info(f"Initializing parser for file: {file_name}, type: {file_type}")
|
||||
logger.info(
|
||||
f"Parser config: chunk_size={chunk_size}, "
|
||||
f"overlap={chunk_overlap}, "
|
||||
@@ -150,16 +114,24 @@ class BaseParser(ABC):
|
||||
f"max_chunks={max_chunks}"
|
||||
)
|
||||
# Only initialize Caption service if multimodal is enabled
|
||||
if self.enable_multimodal:
|
||||
try:
|
||||
self.caption_parser = Caption(self.chunking_config.vlm_config)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to initialize Caption service: {str(e)}")
|
||||
self.caption_parser = None
|
||||
else:
|
||||
self.caption_parser = None
|
||||
vlm_config = self.chunking_config.vlm_config if self.chunking_config else None
|
||||
self.caption_parser = (
|
||||
Caption(vlm_config=vlm_config) if self.enable_multimodal else None
|
||||
)
|
||||
|
||||
def perform_ocr(self, image):
|
||||
@abstractmethod
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
"""Parse document content
|
||||
|
||||
Args:
|
||||
content: Document content
|
||||
|
||||
Returns:
|
||||
Either a string containing the parsed text, or a tuple of (text, image_map)
|
||||
where image_map is a dict mapping image URLs to Image objects
|
||||
"""
|
||||
|
||||
def perform_ocr(self, image: Image.Image):
|
||||
"""Execute OCR recognition on the image
|
||||
|
||||
Args:
|
||||
@@ -170,53 +142,23 @@ class BaseParser(ABC):
|
||||
"""
|
||||
start_time = time.time()
|
||||
logger.info("Starting OCR recognition")
|
||||
resized_image = None
|
||||
|
||||
try:
|
||||
# Resize image to avoid processing large images
|
||||
resized_image = self._resize_image_if_needed(image)
|
||||
# Resize image to avoid processing large images
|
||||
resized_image = self._resize_image_if_needed(image)
|
||||
|
||||
# Get OCR engine
|
||||
ocr_engine = self.get_ocr_engine(
|
||||
backend_type=self.ocr_backend, **self.ocr_config
|
||||
)
|
||||
if ocr_engine is None:
|
||||
logger.error(
|
||||
f"OCR engine ({self.ocr_backend}) initialization failed or unavailable, "
|
||||
"skipping OCR recognition"
|
||||
)
|
||||
return ""
|
||||
# Get OCR engine
|
||||
ocr_engine = OCREngine.get_instance(self.ocr_backend)
|
||||
|
||||
# Execute OCR prediction
|
||||
logger.info(f"Executing OCR prediction (using {self.ocr_backend} engine)")
|
||||
# Add extra exception handling
|
||||
try:
|
||||
ocr_result = ocr_engine.predict(resized_image)
|
||||
except RuntimeError as e:
|
||||
# Handle common CUDA memory issues or other runtime errors
|
||||
logger.error(f"OCR prediction runtime error: {str(e)}")
|
||||
return ""
|
||||
except Exception as e:
|
||||
# Handle other prediction errors
|
||||
logger.error(f"Unexpected OCR prediction error: {str(e)}")
|
||||
return ""
|
||||
# Execute OCR prediction
|
||||
logger.info(f"Executing OCR prediction (using {self.ocr_backend} engine)")
|
||||
ocr_result = ocr_engine.predict(resized_image)
|
||||
|
||||
process_time = time.time() - start_time
|
||||
logger.info(f"OCR recognition completed, time: {process_time:.2f} seconds")
|
||||
return ocr_result
|
||||
except Exception as e:
|
||||
process_time = time.time() - start_time
|
||||
logger.error(
|
||||
f"OCR recognition error: {str(e)}, time: {process_time:.2f} seconds"
|
||||
)
|
||||
return ""
|
||||
finally:
|
||||
# Release image resources
|
||||
if resized_image is not image and hasattr(resized_image, "close"):
|
||||
# Only close the new image we created, not the original image
|
||||
resized_image.close()
|
||||
process_time = time.time() - start_time
|
||||
logger.info(f"OCR recognition completed, time: {process_time:.2f} seconds")
|
||||
|
||||
def _resize_image_if_needed(self, image):
|
||||
return ocr_result
|
||||
|
||||
def _resize_image_if_needed(self, image: Image.Image) -> Image.Image:
|
||||
"""Resize image if it exceeds maximum size limit
|
||||
|
||||
Args:
|
||||
@@ -225,102 +167,21 @@ class BaseParser(ABC):
|
||||
Returns:
|
||||
Resized image object
|
||||
"""
|
||||
try:
|
||||
# If it's a PIL Image
|
||||
if hasattr(image, "size"):
|
||||
width, height = image.size
|
||||
if width > self.max_image_size or height > self.max_image_size:
|
||||
logger.info(f"Resizing PIL image, original size: {width}x{height}")
|
||||
scale = min(
|
||||
self.max_image_size / width, self.max_image_size / height
|
||||
)
|
||||
new_width = int(width * scale)
|
||||
new_height = int(height * scale)
|
||||
resized_image = image.resize((new_width, new_height))
|
||||
logger.info(f"Resized to: {new_width}x{new_height}")
|
||||
return resized_image
|
||||
else:
|
||||
logger.info(
|
||||
f"PIL image size {width}x{height} is within limits, no resizing needed"
|
||||
)
|
||||
return image
|
||||
# If it's a numpy array
|
||||
elif hasattr(image, "shape"):
|
||||
height, width = image.shape[:2]
|
||||
if width > self.max_image_size or height > self.max_image_size:
|
||||
logger.info(
|
||||
f"Resizing numpy image, original size: {width}x{height}"
|
||||
)
|
||||
scale = min(
|
||||
self.max_image_size / width, self.max_image_size / height
|
||||
)
|
||||
new_width = int(width * scale)
|
||||
new_height = int(height * scale)
|
||||
# Use PIL for resizing numpy arrays
|
||||
pil_image = Image.fromarray(image)
|
||||
resized_pil = pil_image.resize((new_width, new_height))
|
||||
resized_image = np.array(resized_pil)
|
||||
logger.info(f"Resized to: {new_width}x{new_height}")
|
||||
return resized_image
|
||||
else:
|
||||
logger.info(
|
||||
f"Numpy image size {width}x{height} is within limits, no resizing needed"
|
||||
)
|
||||
return image
|
||||
else:
|
||||
logger.warning(f"Unknown image type: {type(image)}, cannot resize")
|
||||
return image
|
||||
except Exception as e:
|
||||
logger.error(f"Error resizing image: {str(e)}")
|
||||
return image
|
||||
width, height = image.size
|
||||
if width > self.max_image_size or height > self.max_image_size:
|
||||
logger.info(f"Resizing PIL image, original size: {width}x{height}")
|
||||
scale = min(self.max_image_size / width, self.max_image_size / height)
|
||||
new_width = int(width * scale)
|
||||
new_height = int(height * scale)
|
||||
resized_image = image.resize((new_width, new_height))
|
||||
logger.info(f"Resized to: {new_width}x{new_height}")
|
||||
return resized_image
|
||||
|
||||
def process_image(self, image, image_url=None):
|
||||
"""Process image: first perform OCR, then get caption if text is available
|
||||
logger.info(f"PIL image size is {width}x{height}, no resizing needed")
|
||||
return image
|
||||
|
||||
Args:
|
||||
image: Image object (PIL.Image or numpy array)
|
||||
image_url: Image URL (if uploaded)
|
||||
|
||||
Returns:
|
||||
tuple: (ocr_text, caption, image_url)
|
||||
- ocr_text: OCR extracted text
|
||||
- caption: Image description (if OCR has text) or empty string
|
||||
- image_url: Image URL (if provided)
|
||||
"""
|
||||
logger.info("Starting image processing (OCR + optional caption)")
|
||||
|
||||
# Resize image
|
||||
image = self._resize_image_if_needed(image)
|
||||
|
||||
# Perform OCR recognition
|
||||
ocr_text = self.perform_ocr(image)
|
||||
caption = ""
|
||||
|
||||
if self.caption_parser:
|
||||
logger.info(
|
||||
f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption"
|
||||
)
|
||||
# Convert image to base64 for caption generation
|
||||
img_base64 = image_to_base64(image)
|
||||
if img_base64:
|
||||
caption = self.get_image_caption(img_base64)
|
||||
if caption:
|
||||
logger.info(f"Successfully obtained image caption: {caption}")
|
||||
else:
|
||||
logger.warning("Failed to get caption")
|
||||
else:
|
||||
logger.warning("Failed to convert image to base64")
|
||||
caption = ""
|
||||
else:
|
||||
logger.info("Caption service not initialized, skipping caption retrieval")
|
||||
|
||||
# Release image resources
|
||||
del image
|
||||
|
||||
return ocr_text, caption, image_url
|
||||
|
||||
async def process_image_async(self, image, image_url=None):
|
||||
"""Asynchronously process image: first perform OCR, then get caption if text is available
|
||||
async def process_image_async(self, image: Image.Image, image_url: str):
|
||||
"""Asynchronously process image: first perform OCR, then get caption
|
||||
|
||||
Args:
|
||||
image: Image object (PIL.Image or numpy array)
|
||||
@@ -333,84 +194,47 @@ class BaseParser(ABC):
|
||||
- image_url: Image URL (if provided)
|
||||
"""
|
||||
logger.info("Starting asynchronous image processing (OCR + optional caption)")
|
||||
resized_image = None
|
||||
|
||||
# Resize image
|
||||
resized_image = self._resize_image_if_needed(image)
|
||||
try:
|
||||
# Resize image
|
||||
resized_image = self._resize_image_if_needed(image)
|
||||
|
||||
# Perform OCR recognition (using run_in_executor to execute synchronous operations in the event loop)
|
||||
# Perform OCR recognition
|
||||
loop = asyncio.get_event_loop()
|
||||
try:
|
||||
# Add timeout mechanism to avoid infinite blocking (30 seconds timeout)
|
||||
ocr_task = loop.run_in_executor(None, self.perform_ocr, resized_image)
|
||||
ocr_text = await asyncio.wait_for(ocr_task, timeout=30.0)
|
||||
except asyncio.TimeoutError:
|
||||
logger.error(
|
||||
"OCR processing timed out (30 seconds), skipping this image"
|
||||
)
|
||||
ocr_text = ""
|
||||
except Exception as e:
|
||||
logger.error(f"OCR processing error: {str(e)}")
|
||||
logger.error(f"OCR processing error, skipping this image: {str(e)}")
|
||||
ocr_text = ""
|
||||
|
||||
logger.info(
|
||||
f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption"
|
||||
)
|
||||
caption = ""
|
||||
if self.caption_parser:
|
||||
try:
|
||||
# Convert image to base64 for caption generation
|
||||
img_base64 = image_to_base64(resized_image)
|
||||
if img_base64:
|
||||
# Add timeout to avoid blocking caption retrieval (30 seconds timeout)
|
||||
caption_task = self.get_image_caption_async(img_base64)
|
||||
image_data, caption = await asyncio.wait_for(
|
||||
caption_task, timeout=30.0
|
||||
)
|
||||
if caption:
|
||||
logger.info(
|
||||
f"Successfully obtained image caption: {caption}"
|
||||
)
|
||||
else:
|
||||
logger.warning("Failed to get caption")
|
||||
else:
|
||||
logger.warning("Failed to convert image to base64")
|
||||
caption = ""
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("Caption retrieval timed out, skipping")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get caption: {str(e)}")
|
||||
else:
|
||||
logger.info(
|
||||
"Caption service not initialized, skipping caption retrieval"
|
||||
)
|
||||
|
||||
logger.info(f"Successfully obtained image ocr: {ocr_text}")
|
||||
img_base64 = endecode.decode_image(resized_image)
|
||||
caption = self.get_image_caption(img_base64)
|
||||
logger.info(f"Successfully obtained image caption: {caption}")
|
||||
return ocr_text, caption, image_url
|
||||
finally:
|
||||
# Release image resources
|
||||
if resized_image is not image and hasattr(resized_image, "close"):
|
||||
# Only close the new image we created, not the original image
|
||||
resized_image.close()
|
||||
resized_image.close()
|
||||
|
||||
async def process_with_limit(self, idx, image, url, semaphore):
|
||||
async def process_with_limit(
|
||||
self, idx: int, image: Image.Image, url: str, semaphore: asyncio.Semaphore
|
||||
):
|
||||
"""Function to process a single image using a semaphore"""
|
||||
try:
|
||||
logger.info(f"Waiting to process image {idx+1}")
|
||||
logger.info(f"Waiting to process image {idx + 1}")
|
||||
async with semaphore: # Use semaphore to control concurrency
|
||||
logger.info(f"Starting to process image {idx+1}")
|
||||
logger.info(f"Starting to process image {idx + 1}")
|
||||
result = await self.process_image_async(image, url)
|
||||
logger.info(f"Completed processing image {idx+1}")
|
||||
logger.info(f"Completed processing image {idx + 1}")
|
||||
return result
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing image {idx+1}: {str(e)}")
|
||||
logger.error(f"Error processing image {idx + 1}: {str(e)}")
|
||||
return ("", "", url) # Return empty result to avoid overall failure
|
||||
finally:
|
||||
# Manually release image resources
|
||||
if hasattr(image, "close"):
|
||||
image.close()
|
||||
image.close()
|
||||
|
||||
async def process_multiple_images(self, images_data):
|
||||
async def process_multiple_images(self, images_data: List[Tuple[Image.Image, str]]):
|
||||
"""Process multiple images concurrently
|
||||
|
||||
Args:
|
||||
@@ -450,7 +274,7 @@ class BaseParser(ABC):
|
||||
for i, result in enumerate(completed_results):
|
||||
if isinstance(result, Exception):
|
||||
logger.error(
|
||||
f"Image {i+1} processing returned an exception: {str(result)}"
|
||||
f"Image {i + 1} processing returned an exception: {str(result)}"
|
||||
)
|
||||
# For exceptions, add empty results
|
||||
if i < len(images_data):
|
||||
@@ -467,47 +291,10 @@ class BaseParser(ABC):
|
||||
logger.info("Image processing resource cleanup complete")
|
||||
|
||||
logger.info(
|
||||
f"Completed concurrent processing of {len(results)}/{len(images_data)} images"
|
||||
f"Concurrent processing of {len(results)}/{len(images_data)} images"
|
||||
)
|
||||
return results
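The helper above fans work out through `process_with_limit`, so at most `max_concurrent_tasks` images are in flight at once. A self-contained sketch of the same semaphore pattern, independent of the parser classes:

```python
# Standalone sketch of the concurrency pattern used above: a semaphore caps
# how many images are processed at once. process_one stands in for
# process_image_async.
import asyncio

async def process_one(idx: int) -> str:
    await asyncio.sleep(0.1)      # placeholder for OCR + captioning work
    return f"image-{idx}"

async def process_all(n: int, limit: int = 5) -> list[str]:
    semaphore = asyncio.Semaphore(limit)

    async def with_limit(idx: int) -> str:
        async with semaphore:     # only `limit` tasks run concurrently
            return await process_one(idx)

    return await asyncio.gather(*(with_limit(i) for i in range(n)))

# asyncio.run(process_all(12))
```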
|
||||
|
||||
def decode_bytes(self, content: bytes) -> str:
|
||||
"""Intelligently decode byte stream, supports multiple encodings
|
||||
|
||||
Tries to decode in common encodings, if all fail, uses latin-1 as fallback
|
||||
|
||||
Args:
|
||||
content: Byte stream to decode
|
||||
|
||||
Returns:
|
||||
Decoded string
|
||||
"""
|
||||
logger.info(f"Attempting to decode bytes of length: {len(content)}")
|
||||
# Common encodings, sorted by priority
|
||||
encodings = ["utf-8", "gb18030", "gb2312", "gbk", "big5", "ascii", "latin-1"]
|
||||
text = None
|
||||
|
||||
# Try decoding with each encoding format
|
||||
for encoding in encodings:
|
||||
try:
|
||||
text = content.decode(encoding)
|
||||
logger.info(f"Successfully decoded content using {encoding} encoding")
|
||||
break
|
||||
except UnicodeDecodeError:
|
||||
logger.info(f"Failed to decode using {encoding} encoding")
|
||||
continue
|
||||
|
||||
# If all encodings fail, use latin-1 as fallback
|
||||
if text is None:
|
||||
text = content.decode("latin-1")
|
||||
logger.warning(
|
||||
f"Unable to determine correct encoding, using latin-1 as fallback. "
|
||||
f"This may cause character issues."
|
||||
)
|
||||
|
||||
logger.info(f"Decoded text length: {len(text)} characters")
|
||||
return text
|
||||
|
||||
def get_image_caption(self, image_data: str) -> str:
|
||||
"""Get image description
|
||||
|
||||
@@ -517,6 +304,9 @@ class BaseParser(ABC):
|
||||
Returns:
|
||||
Image description
|
||||
"""
|
||||
if not self.caption_parser:
|
||||
logger.warning("Caption parser not initialized")
|
||||
return ""
|
||||
start_time = time.time()
|
||||
logger.info(
|
||||
f"Getting caption for image: {image_data[:250]}..."
|
||||
@@ -533,80 +323,7 @@ class BaseParser(ABC):
|
||||
logger.warning("Failed to get caption for image")
|
||||
return caption
|
||||
|
||||
async def get_image_caption_async(self, image_data: str) -> Tuple[str, str]:
|
||||
"""Asynchronously get image description
|
||||
|
||||
Args:
|
||||
image_data: Image data (base64 encoded string or URL)
|
||||
|
||||
Returns:
|
||||
Tuple[str, str]: Image data and corresponding description
|
||||
"""
|
||||
caption = self.get_image_caption(image_data)
|
||||
return image_data, caption
|
||||
|
||||
def __init_storage(self):
|
||||
"""Initialize storage client based on configuration"""
|
||||
if self._storage is None:
|
||||
storage_config = (
|
||||
self.chunking_config.storage_config if self.chunking_config else None
|
||||
)
|
||||
self._storage = create_storage(storage_config)
|
||||
logger.info(
|
||||
f"Initialized storage client: {self._storage.__class__.__name__}"
|
||||
)
|
||||
return self._storage
|
||||
|
||||
def upload_file(self, file_path: str) -> str:
|
||||
"""Upload file to object storage
|
||||
|
||||
Args:
|
||||
file_path: File path
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
logger.info(f"Uploading file: {file_path}")
|
||||
try:
|
||||
storage = self.__init_storage()
|
||||
return storage.upload_file(file_path)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to upload file: {str(e)}")
|
||||
return ""
|
||||
|
||||
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
|
||||
"""Upload bytes to object storage
|
||||
|
||||
Args:
|
||||
content: Byte content to upload
|
||||
file_ext: File extension
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
logger.info(f"Uploading bytes content, size: {len(content)} bytes")
|
||||
try:
|
||||
storage = self.__init_storage()
|
||||
return storage.upload_bytes(content, file_ext)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to upload bytes to storage: {str(e)}")
|
||||
traceback.print_exc()
|
||||
return ""
|
||||
|
||||
@abstractmethod
|
||||
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
|
||||
"""Parse document content
|
||||
|
||||
Args:
|
||||
content: Document content
|
||||
|
||||
Returns:
|
||||
Either a string containing the parsed text, or a tuple of (text, image_map)
|
||||
where image_map is a dict mapping image URLs to Image objects
|
||||
"""
|
||||
pass
|
||||
|
||||
def parse(self, content: bytes) -> ParseResult:
|
||||
def parse(self, content: bytes) -> Document:
|
||||
"""Parse document content
|
||||
|
||||
Args:
|
||||
@@ -616,17 +333,22 @@ class BaseParser(ABC):
|
||||
Parse result
|
||||
"""
|
||||
logger.info(
|
||||
f"Parsing document with {self.__class__.__name__}, content size: {len(content)} bytes"
|
||||
f"Parsing document with {self.__class__.__name__}, bytes: {len(content)}"
|
||||
)
|
||||
parse_result = self.parse_into_text(content)
|
||||
if isinstance(parse_result, tuple):
|
||||
text, image_map = parse_result
|
||||
else:
|
||||
text = parse_result
|
||||
image_map = {}
|
||||
logger.info(f"Extracted {len(text)} characters of text from {self.file_name}")
|
||||
logger.info(f"Beginning chunking process for text")
|
||||
chunks = self.chunk_text(text)
|
||||
document = self.parse_into_text(content)
|
||||
logger.info(
|
||||
f"Extracted {len(document.content)} characters from {self.file_name}"
|
||||
)
|
||||
if document.chunks:
|
||||
return document
|
||||
|
||||
splitter = TextSplitter(
|
||||
chunk_size=self.chunk_size,
|
||||
chunk_overlap=self.chunk_overlap,
|
||||
separators=self.separators,
|
||||
)
|
||||
chunk_str = splitter.split_text(document.content)
|
||||
chunks = self._str_to_chunk(chunk_str)
|
||||
logger.info(f"Created {len(chunks)} chunks from document")
|
||||
|
||||
# Limit the number of returned chunks
|
||||
@@ -636,7 +358,7 @@ class BaseParser(ABC):
|
||||
)
|
||||
chunks = chunks[: self.max_chunks]
|
||||
|
||||
# If multimodal is enabled and file type is supported, process images in each chunk
|
||||
# If multimodal is enabled and file type is supported, process images
|
||||
if self.enable_multimodal:
|
||||
# Get file extension and convert to lowercase
|
||||
file_ext = (
|
||||
@@ -647,11 +369,12 @@ class BaseParser(ABC):
|
||||
|
||||
# Define allowed file types for image processing
|
||||
allowed_types = [
|
||||
".pdf", # PDF files
|
||||
# Text files
|
||||
".pdf",
|
||||
".md",
|
||||
".markdown", # Markdown files
|
||||
".markdown",
|
||||
".doc",
|
||||
".docx", # Word documents
|
||||
".docx",
|
||||
# Image files
|
||||
".jpg",
|
||||
".jpeg",
|
||||
@@ -666,13 +389,21 @@ class BaseParser(ABC):
|
||||
logger.info(
|
||||
f"Processing images in each chunk for file type: {file_ext}"
|
||||
)
|
||||
chunks = self.process_chunks_images(chunks, image_map)
|
||||
chunks = self.process_chunks_images(chunks, document.images)
|
||||
else:
|
||||
logger.info(
|
||||
f"Skipping image processing for unsupported file type: {file_ext}"
|
||||
)
|
||||
|
||||
return ParseResult(text=text, chunks=chunks)
|
||||
document.chunks = chunks
|
||||
return document
|
||||
|
||||
def _str_to_chunk(self, text: List[Tuple[int, int, str]]) -> List[Chunk]:
|
||||
"""Convert string to Chunk object"""
|
||||
return [
|
||||
Chunk(seq=i, content=t, start=start, end=end)
|
||||
for i, (start, end, t) in enumerate(text)
|
||||
]
|
||||
|
||||
def _split_into_units(self, text: str) -> List[str]:
|
||||
"""
|
||||
@@ -682,9 +413,7 @@ class BaseParser(ABC):
|
||||
Returns:
|
||||
基本单元的列表
|
||||
"""
|
||||
logger.info(
|
||||
f"Splitting text into basic units with robust structure protection, text length: {len(text)}"
|
||||
)
|
||||
logger.info(f"Splitting text into basic units, text length: {len(text)}")
|
||||
|
||||
# 定义所有需要作为整体保护的结构模式 ---
|
||||
table_pattern = r"(?m)(^\|.*\|[ \t]*\r?\n(?:[ \t]*\r?\n)?^\|\s*:?--+.*\r?\n(?:^\|.*\|\r?\n?)*)"
|
||||
@@ -710,7 +439,8 @@ class BaseParser(ABC):
|
||||
# 按起始位置排序
|
||||
protected_ranges.sort(key=lambda x: x[0])
|
||||
logger.info(
|
||||
f"Found {len(protected_ranges)} protected structures (tables, code, formulas, images, links)."
|
||||
f"Found {len(protected_ranges)} protected structures "
|
||||
"(tables, code, formulas, images, links)."
|
||||
)
|
||||
|
||||
# 合并可能重叠的保护范围 ---
|
||||
@@ -731,7 +461,7 @@ class BaseParser(ABC):
|
||||
merged_ranges.append((current_start, current_end))
|
||||
protected_ranges = merged_ranges
|
||||
logger.info(
|
||||
f"After merging overlaps, {len(protected_ranges)} protected ranges remain."
|
||||
f"After overlaps, {len(protected_ranges)} protected ranges remain."
|
||||
)
|
||||
|
||||
# 根据保护范围和分隔符来分割文本 ---
|
||||
@@ -749,7 +479,7 @@ class BaseParser(ABC):
|
||||
segments = re.split(separator_pattern, pre_text)
|
||||
units.extend([s for s in segments if s]) # 添加所有非空部分
|
||||
|
||||
# b. 将整个受保护的块(例如,一个完整的表格)作为一个单独的、不可分割的单元添加
|
||||
# b. 将整个受保护的块(例如,一个完整的表格)作为一个不可分割的单元添加
|
||||
protected_text = text[start:end]
|
||||
units.append(protected_text)
|
||||
|
||||
@@ -764,38 +494,6 @@ class BaseParser(ABC):
|
||||
logger.info(f"Text splitting complete, created {len(units)} final basic units.")
|
||||
return units
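For reference, a condensed sketch of the protect-merge-split strategy this method implements; the pattern list and separator below are illustrative placeholders, not the exact regexes above:

```python
# Minimal sketch of the range-protection idea used by _split_into_units.
import re
from typing import List, Tuple

def split_protecting_ranges(text: str, patterns: List[str], separator: str = "\n\n") -> List[str]:
    # 1. Collect spans that must stay intact (e.g. Markdown tables, code fences).
    ranges = sorted(
        (m.start(), m.end()) for p in patterns for m in re.finditer(p, text)
    )
    # 2. Merge overlapping spans.
    merged: List[Tuple[int, int]] = []
    for start, end in ranges:
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    # 3. Split the unprotected text by the separator, keep protected spans whole.
    units, cursor = [], 0
    for start, end in merged:
        units.extend(s for s in text[cursor:start].split(separator) if s)
        units.append(text[start:end])
        cursor = end
    units.extend(s for s in text[cursor:].split(separator) if s)
    return units
```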
|
||||
|
||||
def _find_complete_units(self, units: List[str], target_size: int) -> List[str]:
|
||||
"""Find a list of complete units that do not exceed the target size
|
||||
|
||||
Args:
|
||||
units: List of units
|
||||
target_size: Target size
|
||||
|
||||
Returns:
|
||||
List of complete units
|
||||
"""
|
||||
logger.info(f"Finding complete units with target size: {target_size}")
|
||||
result = []
|
||||
current_size = 0
|
||||
|
||||
for unit in units:
|
||||
unit_size = len(unit)
|
||||
if current_size + unit_size > target_size and result:
|
||||
logger.info(
|
||||
f"Reached target size limit at {current_size} characters, stopping"
|
||||
)
|
||||
break
|
||||
result.append(unit)
|
||||
current_size += unit_size
|
||||
logger.info(
|
||||
f"Added unit of size {unit_size}, current total: {current_size}/{target_size}"
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Found {len(result)} complete units totaling {current_size} characters"
|
||||
)
|
||||
return result
|
||||
|
||||
def chunk_text(self, text: str) -> List[Chunk]:
|
||||
"""Chunk text, preserving Markdown structure
|
||||
|
||||
@@ -825,7 +523,7 @@ class BaseParser(ABC):
|
||||
|
||||
for i, unit in enumerate(units):
|
||||
unit_size = len(unit)
|
||||
logger.info(f"Processing unit {i+1}/{len(units)}, size: {unit_size}")
|
||||
logger.info(f"Processing unit {i + 1}/{len(units)}, size: {unit_size}")
|
||||
|
||||
# If current chunk plus new unit exceeds size limit, create new chunk
|
||||
if current_size + unit_size > self.chunk_size and current_chunk:
|
||||
@@ -855,14 +553,12 @@ class BaseParser(ABC):
|
||||
for u in reversed(current_chunk):
|
||||
if overlap_size + len(u) > overlap_target:
|
||||
logger.info(
|
||||
f"Reached overlap target ({overlap_size}/{overlap_target})"
|
||||
f"Overlap target ({overlap_size}/{overlap_target})"
|
||||
)
|
||||
break
|
||||
overlap_units.insert(0, u)
|
||||
overlap_size += len(u)
|
||||
logger.info(
|
||||
f"Added unit to overlap, current overlap size: {overlap_size}"
|
||||
)
|
||||
logger.info(f"Added unit to overlap, size: {overlap_size}")
|
||||
|
||||
# Remove elements from overlap that are included in separators
|
||||
start_index = 0
|
||||
@@ -883,7 +579,7 @@ class BaseParser(ABC):
|
||||
|
||||
overlap_units = overlap_units[start_index:]
|
||||
logger.info(
|
||||
f"Final overlap: {len(overlap_units)} units, {overlap_size} characters"
|
||||
f"Overlap: {len(overlap_units)} units, {overlap_size} size"
|
||||
)
|
||||
|
||||
current_chunk = overlap_units
|
||||
@@ -899,7 +595,7 @@ class BaseParser(ABC):
|
||||
current_chunk.append(unit)
|
||||
current_size += unit_size
|
||||
logger.info(
|
||||
f"Added unit to current chunk, now at {current_size}/{self.chunk_size} characters"
|
||||
f"Added unit to current chunk, at {current_size}/{self.chunk_size}"
|
||||
)
|
||||
|
||||
# Add the last chunk
|
||||
@@ -925,12 +621,13 @@ class BaseParser(ABC):
|
||||
chunk: Document chunk
|
||||
|
||||
Returns:
|
||||
List of image information, each element contains image URL and match position
|
||||
List of image information
|
||||
"""
|
||||
logger.info(f"Extracting image information from Chunk #{chunk.seq}")
|
||||
text = chunk.content
|
||||
|
||||
# Regex to extract image information from text, supporting Markdown images and HTML images
|
||||
# Regex to extract image information from text,
|
||||
# support: Markdown images, HTML images
|
||||
img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)|<img [^>]*src="([^"]+)" [^>]*>'
|
||||
|
||||
# Extract image information
|
||||
@@ -954,28 +651,28 @@ class BaseParser(ABC):
|
||||
images_info.append(image_info)
|
||||
|
||||
logger.info(
|
||||
f"Image in Chunk #{chunk.seq} {match_idx+1}: " f"URL={img_url[:50]}..."
|
||||
f"Image in Chunk #{chunk.seq} {match_idx + 1}: URL={img_url[:50]}..."
|
||||
if len(img_url) > 50
|
||||
else f"Image in Chunk #{chunk.seq} {match_idx+1}: URL={img_url}"
|
||||
else f"Image in Chunk #{chunk.seq} {match_idx + 1}: URL={img_url}"
|
||||
)
|
||||
|
||||
return images_info
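A quick illustration of what the `img_pattern` above captures, using made-up URLs:

```python
# Group 2 carries the Markdown image URL, group 3 the HTML <img> src.
import re

img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)|<img [^>]*src="([^"]+)" [^>]*>'
sample = (
    "Intro text ![diagram](https://example.com/a.png) more text "
    '<img alt="x" src="https://example.com/b.jpg" width="10">'
)
for m in re.finditer(img_pattern, sample):
    print(m.group(2) or m.group(3))
# -> https://example.com/a.png
# -> https://example.com/b.jpg
```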
|
||||
|
||||
async def download_and_upload_image(self, img_url: str):
|
||||
"""Download image and upload to object storage, if it's already an object storage path or local path, use directly
|
||||
async def download_and_upload_image(
|
||||
self, img_url: str
|
||||
) -> Tuple[str, str, Image.Image | None]:
|
||||
"""Download image and upload to object storage,
|
||||
if it's already an object storage path or local path, use directly
|
||||
|
||||
Args:
|
||||
img_url: Image URL or local path
|
||||
|
||||
Returns:
|
||||
tuple: (original URL, storage URL, image object), if failed returns (original URL, None, None)
|
||||
tuple: (original URL, storage URL, image object),
|
||||
if failed returns (original URL, None, None)
|
||||
"""
|
||||
|
||||
try:
|
||||
import requests
|
||||
from PIL import Image
|
||||
import io
|
||||
|
||||
# Check if it's already a storage URL (COS or MinIO)
|
||||
is_storage_url = any(
|
||||
pattern in img_url
|
||||
@@ -997,12 +694,7 @@ class BaseParser(ABC):
|
||||
response = requests.get(img_url, timeout=5, proxies=proxies)
|
||||
if response.status_code == 200:
|
||||
image = Image.open(io.BytesIO(response.content))
|
||||
try:
|
||||
return img_url, img_url, image
|
||||
finally:
|
||||
# Ensure image resources are also released after the function returns
|
||||
# Image will be closed by the caller
|
||||
pass
|
||||
return img_url, img_url, image
|
||||
else:
|
||||
logger.warning(
|
||||
f"Failed to get storage image: {response.status_code}"
|
||||
@@ -1022,7 +714,7 @@ class BaseParser(ABC):
|
||||
# Upload to storage
|
||||
with open(img_url, "rb") as f:
|
||||
content = f.read()
|
||||
storage_url = self.upload_bytes(content)
|
||||
storage_url = self.storage.upload_bytes(content)
|
||||
logger.info(
|
||||
f"Successfully uploaded local image to storage: {storage_url}"
|
||||
)
|
||||
@@ -1031,7 +723,7 @@ class BaseParser(ABC):
|
||||
logger.error(f"Error processing local image: {str(e)}")
|
||||
if image and hasattr(image, "close"):
|
||||
image.close()
|
||||
return img_url, None, None
|
||||
return img_url, img_url, None
|
||||
|
||||
# Normal remote URL download handling
|
||||
else:
|
||||
@@ -1044,9 +736,7 @@ class BaseParser(ABC):
|
||||
if https_proxy:
|
||||
proxies["https"] = https_proxy
|
||||
|
||||
logger.info(
|
||||
f"Downloading image {img_url}, using proxy: {proxies if proxies else 'None'}"
|
||||
)
|
||||
logger.info(f"Downloading image {img_url}, using proxy: {proxies}")
|
||||
response = requests.get(img_url, timeout=5, proxies=proxies)
|
||||
|
||||
if response.status_code == 200:
|
||||
@@ -1054,7 +744,7 @@ class BaseParser(ABC):
|
||||
image = Image.open(io.BytesIO(response.content))
|
||||
try:
|
||||
# Upload to storage using the method in BaseParser
|
||||
storage_url = self.upload_bytes(response.content)
|
||||
storage_url = self.storage.upload_bytes(response.content)
|
||||
logger.info(
|
||||
f"Successfully uploaded image to storage: {storage_url}"
|
||||
)
|
||||
@@ -1064,11 +754,11 @@ class BaseParser(ABC):
|
||||
pass
|
||||
else:
|
||||
logger.warning(f"Failed to download image: {response.status_code}")
|
||||
return img_url, None, None
|
||||
return img_url, img_url, None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error downloading or processing image: {str(e)}")
|
||||
return img_url, None, None
|
||||
return img_url, img_url, None
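Callers treat the returned triple as `(original URL, storage URL, PIL image)`, with the image set to `None` on failure. A small consumption sketch (the parser instance and URLs are placeholders):

```python
# Sketch: gather the download coroutines and keep only successful images,
# mirroring what process_chunk_images_async does below.
import asyncio

async def fetch_all(parser, urls):
    tasks = [parser.download_and_upload_image(u) for u in urls]
    results = await asyncio.gather(*tasks)
    ready = []
    for original_url, storage_url, image in results:
        if image is None:  # failed download keeps the original URL but no object
            continue
        ready.append((image, storage_url))
    return ready

# ready = asyncio.run(fetch_all(parser, ["https://example.com/a.png"]))
```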
|
||||
|
||||
async def process_chunk_images_async(
|
||||
self, chunk, chunk_idx, total_chunks, image_map=None
|
||||
@@ -1086,18 +776,19 @@ class BaseParser(ABC):
|
||||
"""
|
||||
|
||||
logger.info(
|
||||
f"Starting to process images in Chunk #{chunk_idx+1}/{total_chunks}"
|
||||
f"Starting to process images in Chunk #{chunk_idx + 1}/{total_chunks}"
|
||||
)
|
||||
|
||||
# Extract image information from the Chunk
|
||||
images_info = self.extract_images_from_chunk(chunk)
|
||||
if not images_info:
|
||||
logger.info(f"Chunk #{chunk_idx+1} found no images")
|
||||
logger.info(f"Chunk #{chunk_idx + 1} found no images")
|
||||
return chunk
|
||||
|
||||
# Prepare images that need to be downloaded and processed
|
||||
images_to_process = []
|
||||
url_to_info_map = {} # Map URL to image information
|
||||
# Map URL to image information
|
||||
url_to_info_map = {}
|
||||
|
||||
# Record all image URLs that need to be processed
|
||||
for img_info in images_info:
|
||||
@@ -1106,14 +797,21 @@ class BaseParser(ABC):
|
||||
|
||||
results = []
|
||||
download_tasks = []
|
||||
for img_url in url_to_info_map.keys(): # Check if image is already in the image_map
|
||||
# Check if image is already in the image_map
|
||||
for img_url in url_to_info_map.keys():
|
||||
if image_map and img_url in image_map:
|
||||
logger.info(f"Image already in image_map: {img_url}, using cached object")
|
||||
results.append((img_url, img_url, image_map[img_url]))
|
||||
logger.info(
|
||||
f"Image already in image_map: {img_url}, using cached object"
|
||||
)
|
||||
image = Image.open(
|
||||
io.BytesIO(endecode.encode_image(image_map[img_url]))
|
||||
)
|
||||
results.append((img_url, img_url, image))
|
||||
else:
|
||||
download_task = self.download_and_upload_image(img_url)
|
||||
download_tasks.append(download_task)
|
||||
# Concurrent download and upload of images, ignore images that are already in the image_map
|
||||
# Concurrent download and upload of images,
|
||||
# ignore images that are already in the image_map
|
||||
results.extend(await asyncio.gather(*download_tasks))
|
||||
|
||||
# Process download results, prepare for OCR processing
|
||||
@@ -1123,16 +821,17 @@ class BaseParser(ABC):
|
||||
img_info["cos_url"] = cos_url
|
||||
images_to_process.append((image, cos_url))
|
||||
|
||||
# If no images were successfully downloaded and uploaded, return the original Chunk
|
||||
# If no images were successfully downloaded and uploaded,
|
||||
# return the original Chunk
|
||||
if not images_to_process:
|
||||
logger.info(
|
||||
f"Chunk #{chunk_idx+1} found no successfully downloaded and uploaded images"
|
||||
f"Chunk #{chunk_idx + 1} not found downloaded and uploaded images"
|
||||
)
|
||||
return chunk
|
||||
|
||||
# Concurrent processing of all images (OCR + caption)
|
||||
logger.info(
|
||||
f"Processing {len(images_to_process)} images in Chunk #{chunk_idx+1}"
|
||||
f"Processing {len(images_to_process)} images in Chunk #{chunk_idx + 1}"
|
||||
)
|
||||
|
||||
# Concurrent processing of all images
|
||||
@@ -1163,10 +862,12 @@ class BaseParser(ABC):
|
||||
# Update image information in the Chunk
|
||||
chunk.images = processed_images
|
||||
|
||||
logger.info(f"Completed image processing in Chunk #{chunk_idx+1}")
|
||||
logger.info(f"Completed image processing in Chunk #{chunk_idx + 1}")
|
||||
return chunk
|
||||
|
||||
def process_chunks_images(self, chunks: List[Chunk], image_map=None) -> List[Chunk]:
|
||||
def process_chunks_images(
|
||||
self, chunks: List[Chunk], image_map: Dict[str, str] = {}
|
||||
) -> List[Chunk]:
|
||||
"""Concurrent processing of images in all Chunks
|
||||
|
||||
Args:
|
||||
@@ -1210,7 +911,7 @@ class BaseParser(ABC):
|
||||
processed_chunks = []
|
||||
for i, result in enumerate(results):
|
||||
if isinstance(result, Exception):
|
||||
logger.error(f"Error processing Chunk {i+1}: {str(result)}")
|
||||
logger.error(f"Error processing Chunk {i + 1}: {str(result)}")
|
||||
# Keep original Chunk
|
||||
if i < len(chunks):
|
||||
processed_chunks.append(chunks[i])
|
||||
@@ -1235,7 +936,7 @@ class BaseParser(ABC):
|
||||
# Execute processing for all Chunks
|
||||
processed_chunks = loop.run_until_complete(process_all_chunks())
|
||||
logger.info(
|
||||
f"Successfully completed concurrent processing of {len(processed_chunks)}/{len(chunks)} chunks"
|
||||
f"Completed processing of {len(processed_chunks)}/{len(chunks)} chunks"
|
||||
)
|
||||
|
||||
return processed_chunks
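A condensed sketch of the concurrency pattern driven by this method, assuming the `process_chunk_images_async` coroutine and a `max_concurrent_tasks` attribute as shown in this diff:

```python
# Sketch only: bounds concurrent per-chunk image work with a semaphore and
# falls back to the original chunk when a task raises, like the loop above.
import asyncio

def process_chunks_images_sketch(parser, chunks, image_map=None):
    semaphore = asyncio.Semaphore(getattr(parser, "max_concurrent_tasks", 5))

    async def guarded(i, chunk):
        async with semaphore:  # limit concurrent downloads / OCR / caption calls
            return await parser.process_chunk_images_async(chunk, i, len(chunks), image_map)

    async def run_all():
        return await asyncio.gather(
            *(guarded(i, c) for i, c in enumerate(chunks)), return_exceptions=True
        )

    results = asyncio.run(run_all())
    return [c if isinstance(r, Exception) else r for c, r in zip(chunks, results)]
```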
|
||||
@@ -3,11 +3,10 @@ import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Union
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import requests
|
||||
import ollama
|
||||
|
||||
import requests
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -158,11 +157,16 @@ class CaptionChatResp:
|
||||
Returns:
|
||||
The content string from the first choice, or empty string if no choices
|
||||
"""
|
||||
if self.choices:
|
||||
logger.info("Retrieving content from first choice")
|
||||
return self.choices[0].message.content
|
||||
logger.warning("No choices available in response")
|
||||
return ""
|
||||
if (
|
||||
not self.choices
|
||||
or not self.choices[0]
|
||||
or not self.choices[0].message
|
||||
or not self.choices[0].message.content
|
||||
):
|
||||
logger.warning("No choices available in response")
|
||||
return ""
|
||||
logger.info("Retrieving content from first choice")
|
||||
return self.choices[0].message.content
|
||||
|
||||
|
||||
class Caption:
|
||||
@@ -171,33 +175,43 @@ class Caption:
|
||||
Uses an external API to process images and return textual descriptions.
|
||||
"""
|
||||
|
||||
def __init__(self, vlm_config=None):
|
||||
"""Initialize the Caption service with configuration from parameters or environment variables."""
|
||||
def __init__(self, vlm_config: Optional[Dict[str, str]] = None):
|
||||
"""
|
||||
Initialize the Caption service with configuration
|
||||
from parameters or environment variables.
|
||||
"""
|
||||
logger.info("Initializing Caption service")
|
||||
self.prompt = """简单凝炼的描述图片的主要内容"""
|
||||
|
||||
# Use provided VLM config if available, otherwise fall back to environment variables
|
||||
self.timeout = 30
|
||||
|
||||
# Use provided VLM config if available,
|
||||
# otherwise fall back to environment variables
|
||||
if vlm_config and vlm_config.get("base_url") and vlm_config.get("model_name"):
|
||||
self.completion_url = vlm_config.get("base_url", "") + "/chat/completions"
|
||||
self.model = vlm_config.get("model_name", "")
|
||||
self.api_key = vlm_config.get("api_key", "")
|
||||
self.interface_type = vlm_config.get("interface_type", "openai").lower()
|
||||
else:
|
||||
if os.getenv("VLM_MODEL_BASE_URL") == "" or os.getenv("VLM_MODEL_NAME") == "":
|
||||
base_url = os.getenv("VLM_MODEL_BASE_URL")
|
||||
model_name = os.getenv("VLM_MODEL_NAME")
|
||||
if not base_url or not model_name:
|
||||
logger.error("VLM_MODEL_BASE_URL or VLM_MODEL_NAME is not set")
|
||||
return
|
||||
self.completion_url = os.getenv("VLM_MODEL_BASE_URL") + "/chat/completions"
|
||||
self.model = os.getenv("VLM_MODEL_NAME")
|
||||
self.api_key = os.getenv("VLM_MODEL_API_KEY")
|
||||
self.completion_url = base_url + "/chat/completions"
|
||||
self.model = model_name
|
||||
self.api_key = os.getenv("VLM_MODEL_API_KEY", "")
|
||||
self.interface_type = os.getenv("VLM_INTERFACE_TYPE", "openai").lower()
|
||||
|
||||
|
||||
# 验证接口类型
|
||||
if self.interface_type not in ["ollama", "openai"]:
|
||||
logger.warning(f"Unknown interface type: {self.interface_type}, defaulting to openai")
|
||||
logger.warning(
|
||||
f"Unknown interface type: {self.interface_type}, defaulting to openai"
|
||||
)
|
||||
self.interface_type = "openai"
|
||||
|
||||
|
||||
logger.info(
|
||||
f"Service configured with model: {self.model}, endpoint: {self.completion_url}, interface: {self.interface_type}"
|
||||
f"Configured with model: {self.model}, "
|
||||
f"endpoint: {self.completion_url}, interface: {self.interface_type}"
|
||||
)
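Two equivalent ways to configure the service, matching the two branches above; the endpoint and model name are placeholders:

```python
# Explicit config dict (keys as read by __init__ above).
caption = Caption(
    vlm_config={
        "base_url": "http://localhost:11434/v1",
        "model_name": "qwen2.5vl:7b",
        "api_key": "",                # optional
        "interface_type": "ollama",   # "ollama" or "openai"
    }
)

# Or rely on environment variables instead of an explicit config:
# export VLM_MODEL_BASE_URL=https://api.example.com/v1
# export VLM_MODEL_NAME=your-vision-model
# export VLM_MODEL_API_KEY=sk-...
# export VLM_INTERFACE_TYPE=openai
caption_from_env = Caption()
```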
|
||||
|
||||
def _call_caption_api(self, image_data: str) -> Optional[CaptionChatResp]:
|
||||
@@ -210,8 +224,8 @@ class Caption:
|
||||
Returns:
|
||||
CaptionChatResp object if successful, None otherwise
|
||||
"""
|
||||
logger.info(f"Calling Caption API for image captioning")
|
||||
logger.info(f"Processing image data: {image_data[:50] if len(image_data) > 50 else image_data}")
|
||||
logger.info("Calling Caption API for image captioning")
|
||||
logger.info(f"Processing image data: {image_data[:50]}...")
|
||||
|
||||
# 根据接口类型选择调用方式
|
||||
if self.interface_type == "ollama":
|
||||
@@ -226,39 +240,35 @@ class Caption:
|
||||
|
||||
client = ollama.Client(
|
||||
host=host,
|
||||
timeout=self.timeout,
|
||||
)
|
||||
|
||||
|
||||
try:
|
||||
logger.info(f"Calling Ollama API with model: {self.model}")
|
||||
|
||||
|
||||
# 调用Ollama API,使用images参数传递base64编码的图片
|
||||
response = client.generate(
|
||||
model=self.model,
|
||||
prompt="简单凝炼的描述图片的主要内容",
|
||||
images=[image_base64], # image_base64是base64编码的图片数据
|
||||
images=[image_base64], # image_base64是base64编码的图片数据
|
||||
options={"temperature": 0.1},
|
||||
stream=False,
|
||||
)
|
||||
|
||||
|
||||
# 构造响应对象
|
||||
caption_resp = CaptionChatResp(
|
||||
id="ollama_response",
|
||||
created=int(time.time()),
|
||||
model=self.model,
|
||||
model=Model(id=self.model),
|
||||
object="chat.completion",
|
||||
choices=[
|
||||
Choice(
|
||||
message=Message(
|
||||
role="assistant",
|
||||
content=response.response
|
||||
)
|
||||
)
|
||||
]
|
||||
Choice(message=Message(role="assistant", content=response.response))
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
logger.info("Successfully received response from Ollama API")
|
||||
return caption_resp
|
||||
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error calling Ollama API: {e}")
|
||||
return None
|
||||
@@ -266,13 +276,16 @@ class Caption:
|
||||
def _call_openai_api(self, image_base64: str) -> Optional[CaptionChatResp]:
|
||||
"""Call OpenAI-compatible API for image captioning."""
|
||||
logger.info(f"Calling OpenAI-compatible API with model: {self.model}")
|
||||
|
||||
|
||||
user_msg = UserMessage(
|
||||
role="user",
|
||||
content=[
|
||||
Content(type="text", text=self.prompt),
|
||||
Content(
|
||||
type="image_url", image_url=ImageUrl(url="data:image/png;base64," + image_base64, detail="auto")
|
||||
type="image_url",
|
||||
image_url=ImageUrl(
|
||||
url="data:image/png;base64," + image_base64, detail="auto"
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
@@ -295,23 +308,23 @@ class Caption:
|
||||
headers["Authorization"] = f"Bearer {self.api_key}"
|
||||
|
||||
try:
|
||||
logger.info(f"Sending request to OpenAI-compatible API with model: {self.model}")
|
||||
logger.info(
|
||||
f"Sending request to OpenAI-compatible API with model: {self.model}"
|
||||
)
|
||||
response = requests.post(
|
||||
self.completion_url,
|
||||
data=json.dumps(gpt_req, default=lambda o: o.__dict__, indent=4),
|
||||
headers=headers,
|
||||
timeout=30,
|
||||
timeout=self.timeout,
|
||||
)
|
||||
if response.status_code != 200:
|
||||
logger.error(
|
||||
f"OpenAI-compatible API returned non-200 status code: {response.status_code}"
|
||||
f"OpenAI API returned non-200 status code: {response.status_code}"
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
logger.info(
|
||||
f"Successfully received response from OpenAI-compatible API with status: {response.status_code}"
|
||||
)
|
||||
logger.info(f"Converting response to CaptionChatResp object")
|
||||
logger.info(f"Received from OpenAI with status: {response.status_code}")
|
||||
logger.info("Converting response to CaptionChatResp object")
|
||||
caption_resp = CaptionChatResp.from_json(response.json())
|
||||
|
||||
if caption_resp.usage:
|
||||
@@ -322,7 +335,7 @@ class Caption:
|
||||
|
||||
return caption_resp
|
||||
except requests.exceptions.Timeout:
|
||||
logger.error(f"Timeout while calling OpenAI-compatible API after 30 seconds")
|
||||
logger.error("Timeout while calling OpenAI-compatible API after 30 seconds")
|
||||
return None
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.error(f"Request error calling OpenAI-compatible API: {e}")
|
||||
71
docreader/parser/chain_parser.py
Normal file
@@ -0,0 +1,71 @@
import logging
from typing import Dict, List, Tuple, Type

from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class FirstParser(BaseParser):
    _parser_cls: Tuple[Type["BaseParser"], ...] = ()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self._parsers: List[BaseParser] = []
        for parser_cls in self._parser_cls:
            parser = parser_cls(*args, **kwargs)
            self._parsers.append(parser)

    def parse_into_text(self, content: bytes) -> Document:
        for p in self._parsers:
            logger.info(f"FirstParser: using parser {p.__class__.__name__}")
            document = p.parse_into_text(content)
            if document.is_valid():
                logger.info(f"FirstParser: parser {p.__class__.__name__} succeeded")
                return document
        return Document()

    @classmethod
    def create(cls, *parser_classes: Type["BaseParser"]) -> Type["FirstParser"]:
        names = "_".join([p.__name__ for p in parser_classes])
        return type(f"FirstParser_{names}", (cls,), {"_parser_cls": parser_classes})


class PipelineParser(BaseParser):
    _parser_cls: Tuple[Type["BaseParser"], ...] = ()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self._parsers: List[BaseParser] = []
        for parser_cls in self._parser_cls:
            parser = parser_cls(*args, **kwargs)
            self._parsers.append(parser)

    def parse_into_text(self, content: bytes) -> Document:
        images: Dict[str, str] = {}
        document = Document()
        for p in self._parsers:
            logger.info(f"PipelineParser: using parser {p.__class__.__name__}")
            document = p.parse_into_text(content)
            content = endecode.encode_bytes(document.content)
            images.update(document.images)
        document.images.update(images)
        return document

    @classmethod
    def create(cls, *parser_classes: Type["BaseParser"]) -> Type["PipelineParser"]:
        names = "_".join([p.__name__ for p in parser_classes])
        return type(f"PipelineParser_{names}", (cls,), {"_parser_cls": parser_classes})


if __name__ == "__main__":
    from docreader.parser.markdown_parser import MarkdownParser

    cls = FirstParser.create(MarkdownParser)
    parser = cls()
    print(parser.parse_into_text(b"aaa"))
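A short composition sketch for the two factories above, assuming the Markdown sub-parsers added later in this changeset:

```python
# PipelineParser: every stage runs, each feeding its output text to the next.
# FirstParser: stages are tried in order and the first valid Document wins
# (Docx2Parser later in this diff is exactly FirstParser over MarkitdownParser + DocxParser).
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownImageBase64, MarkdownTableFormatter

MarkdownPipeline = PipelineParser.create(MarkdownTableFormatter, MarkdownImageBase64)
document = MarkdownPipeline().parse_into_text(b"| a | b |\n| --- | --- |\n| 1 | 2 |")
print(document.content)
```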
50
docreader/parser/csv_parser.py
Normal file
@@ -0,0 +1,50 @@
import logging
from io import BytesIO
from typing import List

import pandas as pd

from docreader.models.document import Chunk, Document
from docreader.parser.base_parser import BaseParser

logger = logging.getLogger(__name__)


class CSVParser(BaseParser):
    def parse_into_text(self, content: bytes) -> Document:
        chunks: List[Chunk] = []
        text: List[str] = []
        start, end = 0, 0

        df = pd.read_csv(BytesIO(content), on_bad_lines="skip")

        for i, (idx, row) in enumerate(df.iterrows()):
            content_row = (
                ",".join(
                    f"{col.strip()}: {str(row[col]).strip()}" for col in df.columns
                )
                + "\n"
            )
            end += len(content_row)
            text.append(content_row)
            chunks.append(Chunk(content=content_row, seq=i, start=start, end=end))
            start = end

        return Document(
            content="".join(text),
            chunks=chunks,
        )


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    your_file = "/path/to/your/file.csv"
    parser = CSVParser()
    with open(your_file, "rb") as f:
        content = f.read()
    document = parser.parse_into_text(content)
    logger.error(document.content)

    for chunk in document.chunks:
        logger.error(chunk.content)
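A worked example of the row-to-chunk mapping above, using a hypothetical two-row CSV; offsets are cumulative over the joined text (assuming pandas' default parsing of the columns):

```python
from docreader.parser.csv_parser import CSVParser

csv_bytes = b"name,age\nAlice,30\nBob,25\n"
document = CSVParser().parse_into_text(csv_bytes)

print(document.content)
# name: Alice,age: 30
# name: Bob,age: 25
for chunk in document.chunks:
    print(chunk.seq, chunk.start, chunk.end)
# 0 0 20
# 1 20 38
```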
247
docreader/parser/doc_parser.py
Normal file
@@ -0,0 +1,247 @@
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
from typing import List, Optional
|
||||
|
||||
import textract
|
||||
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.docx2_parser import Docx2Parser
|
||||
from docreader.utils.tempfile import TempDirContext, TempFileContext
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocParser(Docx2Parser):
|
||||
"""DOC document parser"""
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
logger.info(f"Parsing DOC document, content size: {len(content)} bytes")
|
||||
|
||||
handle_chain = [
|
||||
# 1. Try to convert to docx format to extract images
|
||||
self._parse_with_docx,
|
||||
# 2. If image extraction is not needed or conversion failed,
|
||||
# try using antiword to extract text
|
||||
self._parse_with_antiword,
|
||||
# 3. If antiword extraction fails, use textract
|
||||
self._parse_with_textract,
|
||||
]
|
||||
|
||||
# Save byte content as a temporary file
|
||||
with TempFileContext(content, ".doc") as temp_file_path:
|
||||
for handle in handle_chain:
|
||||
try:
|
||||
document = handle(temp_file_path)
|
||||
if document:
|
||||
return document
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse DOC with {handle.__name__} {e}")
|
||||
|
||||
return Document(content="")
|
||||
|
||||
def _parse_with_docx(self, temp_file_path: str) -> Document:
|
||||
logger.info("Multimodal enabled, attempting to extract images from DOC")
|
||||
|
||||
docx_content = self._try_convert_doc_to_docx(temp_file_path)
|
||||
if not docx_content:
|
||||
raise RuntimeError("Failed to convert DOC to DOCX")
|
||||
|
||||
logger.info("Successfully converted DOC to DOCX, using DocxParser")
|
||||
# Use existing DocxParser to parse the converted docx
|
||||
document = super(Docx2Parser, self).parse_into_text(docx_content)
|
||||
logger.info(f"Extracted {len(document.content)} characters using DocxParser")
|
||||
return document
|
||||
|
||||
def _parse_with_antiword(self, temp_file_path: str) -> Document:
|
||||
logger.info("Attempting to parse DOC file with antiword")
|
||||
|
||||
# Check if antiword is installed
|
||||
antiword_path = self._try_find_antiword()
|
||||
if not antiword_path:
|
||||
raise RuntimeError("antiword not found in PATH")
|
||||
|
||||
# Use antiword to extract text directly
|
||||
process = subprocess.Popen(
|
||||
[antiword_path, temp_file_path],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
stdout, stderr = process.communicate()
|
||||
if process.returncode != 0:
|
||||
raise RuntimeError(
|
||||
f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
|
||||
)
|
||||
text = stdout.decode("utf-8", errors="ignore")
|
||||
logger.info(f"Successfully extracted {len(text)} characters using antiword")
|
||||
return Document(content=text)
|
||||
|
||||
def _parse_with_textract(self, temp_file_path: str) -> Document:
|
||||
logger.info(f"Parsing DOC file with textract: {temp_file_path}")
|
||||
text = textract.process(temp_file_path, method="antiword").decode("utf-8")
|
||||
logger.info(f"Successfully extracted {len(text)} bytes of DOC using textract")
|
||||
return Document(content=str(text))
|
||||
|
||||
def _try_convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
|
||||
"""Convert DOC file to DOCX format
|
||||
|
||||
Uses LibreOffice/OpenOffice for conversion
|
||||
|
||||
Args:
|
||||
doc_path: DOC file path
|
||||
|
||||
Returns:
|
||||
Byte stream of DOCX file content, or None if conversion fails
|
||||
"""
|
||||
logger.info(f"Converting DOC to DOCX: {doc_path}")
|
||||
|
||||
# Check if LibreOffice or OpenOffice is installed
|
||||
soffice_path = self._try_find_soffice()
|
||||
if not soffice_path:
|
||||
return None
|
||||
|
||||
# Execute conversion command
|
||||
logger.info(f"Using {soffice_path} to convert DOC to DOCX")
|
||||
|
||||
# Create a temporary directory to store the converted file
|
||||
with TempDirContext() as temp_dir:
|
||||
cmd = [
|
||||
soffice_path,
|
||||
"--headless",
|
||||
"--convert-to",
|
||||
"docx",
|
||||
"--outdir",
|
||||
temp_dir,
|
||||
doc_path,
|
||||
]
|
||||
logger.info(f"Running command: {' '.join(cmd)}")
|
||||
process = subprocess.Popen(
|
||||
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
)
|
||||
stdout, stderr = process.communicate()
|
||||
|
||||
if process.returncode != 0:
|
||||
logger.warning(
|
||||
f"Error converting DOC to DOCX: {stderr.decode('utf-8')}"
|
||||
)
|
||||
return None
|
||||
|
||||
# Find the converted file
|
||||
docx_file = [
|
||||
file for file in os.listdir(temp_dir) if file.endswith(".docx")
|
||||
]
|
||||
logger.info(f"Found {len(docx_file)} DOCX file(s) in temporary directory")
|
||||
for file in docx_file:
|
||||
converted_file = os.path.join(temp_dir, file)
|
||||
logger.info(f"Found converted file: {converted_file}")
|
||||
|
||||
# Read the converted file content
|
||||
with open(converted_file, "rb") as f:
|
||||
docx_content = f.read()
|
||||
logger.info(
|
||||
f"Successfully read DOCX file, size: {len(docx_content)}"
|
||||
)
|
||||
return docx_content
|
||||
return None
|
||||
|
||||
def _try_find_executable_path(
|
||||
self,
|
||||
executable_name: str,
|
||||
possible_path: List[str] = [],
|
||||
environment_variable: List[str] = [],
|
||||
) -> Optional[str]:
|
||||
"""Find executable path
|
||||
Args:
|
||||
executable_name: Executable name
|
||||
possible_path: List of possible paths
|
||||
environment_variable: List of environment variables to check
|
||||
Returns:
|
||||
Executable path, or None if not found
|
||||
"""
|
||||
# Common executable paths
|
||||
paths: List[str] = []
|
||||
paths.extend(possible_path)
|
||||
paths.extend(os.environ.get(env_var, "") for env_var in environment_variable)
|
||||
paths = list(set(paths))
|
||||
|
||||
# Check if path is set in environment variable
|
||||
for path in paths:
|
||||
if os.path.exists(path):
|
||||
logger.info(f"Found {executable_name} at {path}")
|
||||
return path
|
||||
|
||||
# Try to find in PATH
|
||||
result = subprocess.run(
|
||||
["which", executable_name], capture_output=True, text=True
|
||||
)
|
||||
if result.returncode == 0 and result.stdout.strip():
|
||||
path = result.stdout.strip()
|
||||
logger.info(f"Found {executable_name} at {path}")
|
||||
return path
|
||||
|
||||
logger.warning(f"Failed to find {executable_name}")
|
||||
return None
|
||||
|
||||
def _try_find_soffice(self) -> Optional[str]:
|
||||
"""Find LibreOffice/OpenOffice executable path
|
||||
|
||||
Returns:
|
||||
Executable path, or None if not found
|
||||
"""
|
||||
# Common LibreOffice/OpenOffice executable paths
|
||||
possible_paths = [
|
||||
# Linux
|
||||
"/usr/bin/soffice",
|
||||
"/usr/lib/libreoffice/program/soffice",
|
||||
"/opt/libreoffice25.2/program/soffice",
|
||||
# macOS
|
||||
"/Applications/LibreOffice.app/Contents/MacOS/soffice",
|
||||
# Windows
|
||||
"C:\\Program Files\\LibreOffice\\program\\soffice.exe",
|
||||
"C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe",
|
||||
]
|
||||
return self._try_find_executable_path(
|
||||
executable_name="soffice",
|
||||
possible_path=possible_paths,
|
||||
environment_variable=["LIBREOFFICE_PATH"],
|
||||
)
|
||||
|
||||
def _try_find_antiword(self) -> Optional[str]:
|
||||
"""Find antiword executable path
|
||||
|
||||
Returns:
|
||||
Executable path, or None if not found
|
||||
"""
|
||||
# Common antiword executable paths
|
||||
possible_paths = [
|
||||
# Linux/macOS
|
||||
"/usr/bin/antiword",
|
||||
"/usr/local/bin/antiword",
|
||||
# Windows
|
||||
"C:\\Program Files\\Antiword\\antiword.exe",
|
||||
"C:\\Program Files (x86)\\Antiword\\antiword.exe",
|
||||
]
|
||||
return self._try_find_executable_path(
|
||||
executable_name="antiword",
|
||||
possible_path=possible_paths,
|
||||
environment_variable=["ANTIWORD_PATH"],
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
file_name = "/path/to/your/test.doc"
|
||||
logger.info(f"Processing file: {file_name}")
|
||||
doc_parser = DocParser(
|
||||
file_name=file_name,
|
||||
enable_multimodal=True,
|
||||
chunk_size=512,
|
||||
chunk_overlap=60,
|
||||
)
|
||||
with open(file_name, "rb") as f:
|
||||
content = f.read()
|
||||
|
||||
document = doc_parser.parse_into_text(content)
|
||||
logger.info(f"Processing complete, extracted text length: {len(document.content)}")
|
||||
logger.info(f"Sample text: {document.content[:200]}...")
|
||||
28
docreader/parser/docx2_parser.py
Normal file
@@ -0,0 +1,28 @@
import logging

from docreader.parser.chain_parser import FirstParser
from docreader.parser.docx_parser import DocxParser
from docreader.parser.markitdown_parser import MarkitdownParser

logger = logging.getLogger(__name__)


class Docx2Parser(FirstParser):
    _parser_cls = (MarkitdownParser, DocxParser)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    your_file = "/path/to/your/file.docx"
    parser = Docx2Parser(separators=[".", "?", "!", "。", "?", "!"])
    with open(your_file, "rb") as f:
        content = f.read()

    document = parser.parse(content)
    for cc in document.chunks:
        logger.info(f"chunk: {cc}")

    # document = parser.parse_into_text(content)
    # logger.info(f"docx content: {document.content}")
    # logger.info(f"find images {document.images.keys()}")
@@ -1,37 +1,36 @@
|
||||
import logging
|
||||
import tempfile
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from io import BytesIO
|
||||
from typing import Optional, Dict, Any, Tuple, List, Union
|
||||
from dataclasses import dataclass, field
|
||||
from PIL import Image
|
||||
from docx import Document
|
||||
from docx.image.exceptions import (
|
||||
UnrecognizedImageError,
|
||||
UnexpectedEndOfFileError,
|
||||
InvalidImageStreamError,
|
||||
)
|
||||
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
|
||||
import re
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
from dataclasses import dataclass, field
|
||||
from io import BytesIO
|
||||
from multiprocessing import Manager
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from .base_parser import BaseParser
|
||||
from docx import Document
|
||||
from docx.image.exceptions import (
|
||||
InvalidImageStreamError,
|
||||
UnexpectedEndOfFileError,
|
||||
UnrecognizedImageError,
|
||||
)
|
||||
from PIL import Image
|
||||
|
||||
from docreader.models.document import Document as DocumentModel
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
# Add thread local storage to track the processing status of each thread
|
||||
thread_local = threading.local()
|
||||
|
||||
|
||||
class ImageData:
|
||||
"""Represents a processed image of document content"""
|
||||
|
||||
local_path: str = ""
|
||||
object: Image.Image = None
|
||||
object: Optional[Image.Image] = None
|
||||
url: str = ""
|
||||
|
||||
|
||||
@@ -40,7 +39,9 @@ class LineData:
|
||||
"""Represents a processed line of document content with associated images"""
|
||||
|
||||
text: str = "" # Extracted text content
|
||||
images: List[ImageData] = field(default_factory=list) # List of images or image paths
|
||||
images: List[ImageData] = field(
|
||||
default_factory=list
|
||||
) # List of images or image paths
|
||||
extra_info: str = "" # Placeholder for additional info (currently unused)
|
||||
page_num: int = 0 # Page number
|
||||
content_sequence: List[Tuple[str, Any]] = field(
|
||||
@@ -53,18 +54,8 @@ class DocxParser(BaseParser):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_name: str = "",
|
||||
file_type: str = None,
|
||||
enable_multimodal: bool = True,
|
||||
chunk_size: int = 1000,
|
||||
chunk_overlap: int = 200,
|
||||
separators: list = ["\n\n", "\n", "。"],
|
||||
ocr_backend: str = "paddle",
|
||||
ocr_config: dict = None,
|
||||
max_image_size: int = 1920,
|
||||
max_concurrent_tasks: int = 5,
|
||||
max_pages: int = 100, # Maximum number of pages to process, default to 50 pages
|
||||
chunking_config=None,
|
||||
max_pages: int = 100, # Maximum number of pages to process
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize DOCX document parser
|
||||
|
||||
@@ -79,37 +70,16 @@ class DocxParser(BaseParser):
|
||||
ocr_config: OCR engine configuration
|
||||
max_image_size: Maximum image size limit
|
||||
max_concurrent_tasks: Maximum number of concurrent tasks
|
||||
max_pages: Maximum number of pages to process, if more than this, only process the first max_pages pages
|
||||
max_pages: Maximum number of pages to process
|
||||
"""
|
||||
super().__init__(
|
||||
file_name=file_name,
|
||||
file_type=file_type,
|
||||
enable_multimodal=enable_multimodal,
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
separators=separators,
|
||||
ocr_backend=ocr_backend,
|
||||
ocr_config=ocr_config,
|
||||
max_image_size=max_image_size,
|
||||
max_concurrent_tasks=max_concurrent_tasks,
|
||||
chunking_config=chunking_config,
|
||||
)
|
||||
super().__init__(**kwargs)
|
||||
self.max_pages = max_pages
|
||||
logger.info(f"DocxParser initialized with max_pages={max_pages}")
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
|
||||
"""Parse DOCX document, extract text content and image Markdown links
|
||||
|
||||
Args:
|
||||
content: DOCX document content
|
||||
|
||||
Returns:
|
||||
Tuple of (parsed_text, image_map) where image_map maps image URLs to Image objects
|
||||
All LineData objects are used internally but not returned directly through this interface
|
||||
"""
|
||||
def parse_into_text(self, content: bytes) -> DocumentModel:
|
||||
"""Parse DOCX document, extract text content and image Markdown links"""
|
||||
logger.info(f"Parsing DOCX document, content size: {len(content)} bytes")
|
||||
logger.info(f"Max pages limit set to: {self.max_pages}")
|
||||
logger.info("Converting DOCX content to sections and tables")
|
||||
|
||||
start_time = time.time()
|
||||
# Use concurrent processing to handle the document
|
||||
@@ -123,7 +93,7 @@ class DocxParser(BaseParser):
|
||||
docx_processor = Docx(
|
||||
max_image_size=self.max_image_size,
|
||||
enable_multimodal=self.enable_multimodal,
|
||||
upload_file=self.upload_file,
|
||||
upload_file=self.storage.upload_file,
|
||||
)
|
||||
all_lines, tables = docx_processor(
|
||||
binary=content,
|
||||
@@ -140,7 +110,7 @@ class DocxParser(BaseParser):
|
||||
section_start_time = time.time()
|
||||
|
||||
text_parts = []
|
||||
image_parts = {}
|
||||
image_parts: Dict[str, str] = {}
|
||||
|
||||
for sec_idx, line in enumerate(all_lines):
|
||||
try:
|
||||
@@ -148,16 +118,19 @@ class DocxParser(BaseParser):
|
||||
text_parts.append(line.text)
|
||||
if sec_idx < 3 or sec_idx % 50 == 0:
|
||||
logger.info(
|
||||
f"Added section {sec_idx+1} text: {line.text[:50]}..."
|
||||
f"Added section {sec_idx + 1} text: {line.text[:50]}..."
|
||||
if len(line.text) > 50
|
||||
else f"Added section {sec_idx+1} text: {line.text}"
|
||||
else f"Added section {sec_idx + 1} text: {line.text}"
|
||||
)
|
||||
if line.images:
|
||||
for image_data in line.images:
|
||||
if image_data.url:
|
||||
image_parts[image_data.url] = image_data.object
|
||||
if image_data.url and image_data.object:
|
||||
image_parts[image_data.url] = endecode.decode_image(
|
||||
image_data.object
|
||||
)
|
||||
image_data.object.close()
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing section {sec_idx+1}: {str(e)}")
|
||||
logger.error(f"Error processing section {sec_idx + 1}: {str(e)}")
|
||||
logger.error(f"Detailed stack trace: {traceback.format_exc()}")
|
||||
continue
|
||||
|
||||
@@ -176,17 +149,17 @@ class DocxParser(BaseParser):
|
||||
|
||||
total_processing_time = time.time() - start_time
|
||||
logger.info(
|
||||
f"Parsing complete in {total_processing_time:.2f}s, generated {len(text)} characters of text "
|
||||
f"Parsing complete in {total_processing_time:.2f}s, "
|
||||
f"generated {len(text)} characters of text"
|
||||
)
|
||||
|
||||
return text, image_parts
|
||||
return DocumentModel(content=text, images=image_parts)
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing DOCX document: {str(e)}")
|
||||
logger.error(f"Detailed stack trace: {traceback.format_exc()}")
|
||||
fallback_text = self._parse_using_simple_method(content)
|
||||
return fallback_text, {}
|
||||
return self._parse_using_simple_method(content)
|
||||
|
||||
def _parse_using_simple_method(self, content: bytes) -> str:
|
||||
def _parse_using_simple_method(self, content: bytes) -> DocumentModel:
|
||||
"""Parse document using a simplified method, as a fallback
|
||||
|
||||
Args:
|
||||
@@ -201,7 +174,8 @@ class DocxParser(BaseParser):
|
||||
doc = Document(BytesIO(content))
|
||||
logger.info(
|
||||
f"Successfully loaded document in simplified method, "
|
||||
f"contains {len(doc.paragraphs)} paragraphs and {len(doc.tables)} tables"
|
||||
f"contains {len(doc.paragraphs)} paragraphs "
|
||||
f"and {len(doc.tables)} tables"
|
||||
)
|
||||
text_parts = []
|
||||
|
||||
@@ -211,7 +185,7 @@ class DocxParser(BaseParser):
|
||||
para_with_text = 0
|
||||
for i, para in enumerate(doc.paragraphs):
|
||||
if i % 100 == 0:
|
||||
logger.info(f"Processing paragraph {i+1}/{para_count}")
|
||||
logger.info(f"Processing paragraph {i + 1}/{para_count}")
|
||||
if para.text.strip():
|
||||
text_parts.append(para.text.strip())
|
||||
para_with_text += 1
|
||||
@@ -225,7 +199,7 @@ class DocxParser(BaseParser):
|
||||
rows_processed = 0
|
||||
for i, table in enumerate(doc.tables):
|
||||
if i % 10 == 0:
|
||||
logger.info(f"Processing table {i+1}/{table_count}")
|
||||
logger.info(f"Processing table {i + 1}/{table_count}")
|
||||
|
||||
table_has_content = False
|
||||
for row in table.rows:
|
||||
@@ -256,25 +230,24 @@ class DocxParser(BaseParser):
|
||||
# If the result is still empty, return an error message
|
||||
if not result_text:
|
||||
logger.warning("No text extracted using simplified method")
|
||||
return "", {}
|
||||
return DocumentModel()
|
||||
|
||||
return result_text, {}
|
||||
return DocumentModel(content=result_text)
|
||||
except Exception as backup_error:
|
||||
processing_time = time.time() - start_time
|
||||
logger.error(
|
||||
f"Simplified parsing failed after {processing_time:.2f}s: {str(backup_error)}"
|
||||
f"Simplified parsing failed {processing_time:.2f}s: {backup_error}"
|
||||
)
|
||||
logger.error(f"Detailed traceback: {traceback.format_exc()}")
|
||||
return "", {}
|
||||
return DocumentModel()
|
||||
|
||||
|
||||
class Docx:
|
||||
def __init__(self, max_image_size=1920, enable_multimodal=False, upload_file=None):
|
||||
logger.info("Initializing DOCX processor")
|
||||
self.max_image_size = max_image_size # Maximum image size limit
|
||||
self.picture_cache = (
|
||||
{}
|
||||
) # Image cache to avoid processing the same image repeatedly
|
||||
# Image cache to avoid processing the same image repeatedly
|
||||
self.picture_cache = {}
|
||||
self.enable_multimodal = enable_multimodal
|
||||
self.upload_file = upload_file
|
||||
|
||||
@@ -454,7 +427,6 @@ class Docx:
|
||||
|
||||
return page_to_paragraphs
|
||||
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
binary: Optional[bytes] = None,
|
||||
@@ -611,7 +583,6 @@ class Docx:
|
||||
|
||||
return pages_to_process
|
||||
|
||||
|
||||
def _process_document(
|
||||
self,
|
||||
binary,
|
||||
@@ -806,7 +777,9 @@ class Docx:
|
||||
# Collect temporary image paths for later cleanup
|
||||
for line in page_lines:
|
||||
for image_data in line.images:
|
||||
if image_data.local_path and image_data.local_path.startswith("/tmp/docx_img_"):
|
||||
if image_data.local_path and image_data.local_path.startswith(
|
||||
"/tmp/docx_img_"
|
||||
):
|
||||
temp_img_paths.add(image_data.local_path)
|
||||
|
||||
results.extend(page_lines)
|
||||
@@ -876,7 +849,11 @@ class Docx:
|
||||
|
||||
# Process all image data objects
|
||||
for image_data in image_paths:
|
||||
if image_data.local_path and os.path.exists(image_data.local_path) and image_data.local_path not in image_url_map:
|
||||
if (
|
||||
image_data.local_path
|
||||
and os.path.exists(image_data.local_path)
|
||||
and image_data.local_path not in image_url_map
|
||||
):
|
||||
try:
|
||||
# Upload the image if it doesn't have a URL yet
|
||||
if not image_data.url:
|
||||
@@ -886,12 +863,16 @@ class Docx:
|
||||
image_data.url = image_url
|
||||
# Add image URL as Markdown format
|
||||
markdown_image = f""
|
||||
image_url_map[image_data.local_path] = markdown_image
|
||||
image_url_map[image_data.local_path] = (
|
||||
markdown_image
|
||||
)
|
||||
logger.info(
|
||||
f"Added image URL for {image_data.local_path}: {image_url}"
|
||||
)
|
||||
else:
|
||||
logger.warning(f"Failed to upload image: {image_data.local_path}")
|
||||
logger.warning(
|
||||
f"Failed to upload image: {image_data.local_path}"
|
||||
)
|
||||
else:
|
||||
# Already has a URL, use it
|
||||
markdown_image = f""
|
||||
@@ -925,12 +906,19 @@ class Docx:
|
||||
# For ImageData objects, use the URL
|
||||
if isinstance(content, str) and content in image_url_map:
|
||||
combined_parts.append(image_url_map[content])
|
||||
elif hasattr(content, 'local_path') and content.local_path in image_url_map:
|
||||
elif (
|
||||
hasattr(content, "local_path")
|
||||
and content.local_path in image_url_map
|
||||
):
|
||||
combined_parts.append(image_url_map[content.local_path])
|
||||
|
||||
# Create the final text with proper ordering
|
||||
final_text = "\n\n".join(part for part in combined_parts if part)
|
||||
processed_lines.append(LineData(text=final_text, page_num=page_num, images=line_data.images))
|
||||
processed_lines.append(
|
||||
LineData(
|
||||
text=final_text, page_num=page_num, images=line_data.images
|
||||
)
|
||||
)
|
||||
else:
|
||||
processed_lines = lines
|
||||
|
||||
@@ -1003,11 +991,11 @@ class Docx:
|
||||
logger.info(f"Processing {table_count} tables")
|
||||
for tb_idx, tb in enumerate(self.doc.tables):
|
||||
if tb_idx % 10 == 0: # Log only every 10 tables to reduce log volume
|
||||
logger.info(f"Processing table {tb_idx+1}/{table_count}")
|
||||
logger.info(f"Processing table {tb_idx + 1}/{table_count}")
|
||||
|
||||
# Optimize: Check if table is empty
|
||||
if len(tb.rows) == 0 or all(len(r.cells) == 0 for r in tb.rows):
|
||||
logger.info(f"Skipping empty table {tb_idx+1}")
|
||||
logger.info(f"Skipping empty table {tb_idx + 1}")
|
||||
continue
|
||||
|
||||
table_html = self._convert_table_to_html(tb)
|
||||
@@ -1111,8 +1099,8 @@ def _save_image_to_temp(logger, image, page_num, img_idx):
|
||||
if not image:
|
||||
return None
|
||||
|
||||
import tempfile
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
try:
|
||||
# Create a temporary file
|
||||
@@ -1187,8 +1175,15 @@ def process_page_multiprocess(
|
||||
return []
|
||||
|
||||
# Extract page content
|
||||
combined_text, image_objects, content_sequence = _extract_page_content_in_process(
|
||||
process_logger, doc, page_num, paragraphs, enable_multimodal, max_image_size
|
||||
combined_text, image_objects, content_sequence = (
|
||||
_extract_page_content_in_process(
|
||||
process_logger,
|
||||
doc,
|
||||
page_num,
|
||||
paragraphs,
|
||||
enable_multimodal,
|
||||
max_image_size,
|
||||
)
|
||||
)
|
||||
|
||||
# Process content sequence to maintain order between processes
|
||||
@@ -1199,7 +1194,9 @@ def process_page_multiprocess(
|
||||
if enable_multimodal:
|
||||
# First pass: save all images to temporary files
|
||||
for i, image_object in enumerate(image_objects):
|
||||
img_path = _save_image_to_temp(process_logger, image_object, page_num, i)
|
||||
img_path = _save_image_to_temp(
|
||||
process_logger, image_object, page_num, i
|
||||
)
|
||||
if img_path:
|
||||
# Create ImageData object
|
||||
image_data = ImageData()
|
||||
54
docreader/parser/excel_parser.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from typing import List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from docreader.models.document import Chunk, Document
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ExcelParser(BaseParser):
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
chunks: List[Chunk] = []
|
||||
text: List[str] = []
|
||||
start, end = 0, 0
|
||||
|
||||
excel_file = pd.ExcelFile(BytesIO(content))
|
||||
for excel_sheet_name in excel_file.sheet_names:
|
||||
df = excel_file.parse(sheet_name=excel_sheet_name)
|
||||
df.dropna(how="all", inplace=True)
|
||||
|
||||
for _, row in df.iterrows():
|
||||
page_content = []
|
||||
for k, v in row.items():
|
||||
if pd.notna(v):
|
||||
page_content.append(f"{k}: {v}")
|
||||
if not page_content:
|
||||
continue
|
||||
content_row = ",".join(page_content) + "\n"
|
||||
end += len(content_row)
|
||||
text.append(content_row)
|
||||
chunks.append(
|
||||
Chunk(content=content_row, seq=len(chunks), start=start, end=end)
|
||||
)
|
||||
start = end
|
||||
|
||||
return Document(content="".join(text), chunks=chunks)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
your_file = "/path/to/your/file.xlsx"
|
||||
parser = ExcelParser()
|
||||
with open(your_file, "rb") as f:
|
||||
content = f.read()
|
||||
document = parser.parse_into_text(content)
|
||||
logger.error(document.content)
|
||||
|
||||
for chunk in document.chunks:
|
||||
logger.error(chunk.content)
|
||||
break
|
||||
44
docreader/parser/image_parser.py
Normal file
@@ -0,0 +1,44 @@
import base64
import logging
import os

from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser

# Set up logger for this module
logger = logging.getLogger(__name__)


class ImageParser(BaseParser):
    """
    Parser for image files with OCR capability.
    Extracts text from images and generates captions.

    This parser handles image processing by:
    1. Uploading the image to storage
    2. Generating a descriptive caption
    3. Performing OCR to extract text content
    4. Returning a combined result with both text and image reference
    """

    def parse_into_text(self, content: bytes) -> Document:
        """
        Parse image content into markdown text
        :param content: bytes content of the image
        :return: Document object
        """
        logger.info(f"Parsing image content, size: {len(content)} bytes")

        # Get file extension
        ext = os.path.splitext(self.file_name)[1].lower()

        # Upload image to storage
        image_url = self.storage.upload_bytes(content, file_ext=ext)
        logger.info(f"Successfully uploaded image, URL: {image_url[:50]}...")

        # Generate markdown text
        text = f""
        images = {image_url: base64.b64encode(content).decode()}

        # Create image object and add to map
        return Document(content=text, images=images)
228
docreader/parser/markdown_parser.py
Normal file
@@ -0,0 +1,228 @@
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
from typing import Dict, List, Match, Optional, Tuple
|
||||
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.parser.chain_parser import PipelineParser
|
||||
from docreader.utils import endecode
|
||||
|
||||
# Get logger object
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownTableUtil:
|
||||
def __init__(self):
|
||||
self.align_pattern = re.compile(
|
||||
r"^([\t ]*)\|[\t ]*[:-]+(?:[\t ]*\|[\t ]*[:-]+)*[\t ]*\|[\t ]*$",
|
||||
re.MULTILINE,
|
||||
)
|
||||
self.line_pattern = re.compile(
|
||||
r"^([\t ]*)\|[\t ]*[^|\r\n]*(?:[\t ]*\|[^|\r\n]*)*\|[\t ]*$",
|
||||
re.MULTILINE,
|
||||
)
|
||||
|
||||
def format_table(self, content: str) -> str:
|
||||
def process_align(match: Match[str]) -> str:
|
||||
columns = [col.strip() for col in match.group(0).split("|") if col.strip()]
|
||||
|
||||
processed = []
|
||||
for col in columns:
|
||||
left_colon = ":" if col.startswith(":") else ""
|
||||
right_colon = ":" if col.endswith(":") else ""
|
||||
processed.append(left_colon + "---" + right_colon)
|
||||
|
||||
prefix = match.group(1)
|
||||
return prefix + "| " + " | ".join(processed) + " |"
|
||||
|
||||
def process_line(match: Match[str]) -> str:
|
||||
columns = [col.strip() for col in match.group(0).split("|") if col.strip()]
|
||||
|
||||
prefix = match.group(1)
|
||||
return prefix + "| " + " | ".join(columns) + " |"
|
||||
|
||||
formatted_content = content
|
||||
formatted_content = self.line_pattern.sub(process_line, formatted_content)
|
||||
formatted_content = self.align_pattern.sub(process_align, formatted_content)
|
||||
|
||||
return formatted_content
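An illustration of what `format_table` normalizes (cell spacing and the alignment row), on a deliberately messy table:

```python
util = MarkdownTableUtil()
messy = "|名称 |值|\n|:--------|-------:|\n| a|1 |\n"
print(util.format_table(messy))
# | 名称 | 值 |
# | :--- | ---: |
# | a | 1 |
```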

    @staticmethod
    def _self_test():
        test_content = """
# 测试表格
普通文本---不会被匹配

## 表格1(无前置空格)

| 姓名 | 年龄 | 城市 |
| :---------- | -------: | :------ |
| 张三 | 25 | 北京 |

## 表格3(前置4个空格+首尾|)
    | 产品 | 价格 | 库存 |
    | :-------------: | ----------- | :-----------: |
    | 手机 | 5999 | 100 |
"""
        util = MarkdownTableUtil()
        format_content = util.format_table(test_content)
        print(format_content)


class MarkdownTableFormatter(BaseParser):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.table_helper = MarkdownTableUtil()

    def parse_into_text(self, content: bytes) -> Document:
        text = endecode.decode_bytes(content)
        text = self.table_helper.format_table(text)
        return Document(content=text)


class MarkdownImageUtil:
    def __init__(self):
        self.b64_pattern = re.compile(
            r"!\[([^\]]*)\]\(data:image/(\w+)\+?\w*;base64,([^\)]+)\)"
        )
        self.image_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
        self.replace_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")

    def extract_image(
        self,
        content: str,
        path_prefix: Optional[str] = None,
        replace: bool = True,
    ) -> Tuple[str, List[str]]:
        """Extract image references from Markdown content"""

        # collected image paths
        images: List[str] = []

        def repl(match: Match[str]) -> str:
            title = match.group(1)
            image_path = match.group(2)
            if path_prefix:
                image_path = f"{path_prefix}/{image_path}"

            images.append(image_path)

            if not replace:
                return match.group(0)

            # Rewrite the image reference with the (possibly prefixed) path
            return f"![{title}]({image_path})"

        text = self.image_pattern.sub(repl, content)
        logger.debug(f"Extracted {len(images)} images from markdown")
        return text, images

    def extract_base64(
        self,
        content: str,
        path_prefix: Optional[str] = None,
        replace: bool = True,
    ) -> Tuple[str, Dict[str, bytes]]:
        """Extract base64 encoded images from Markdown content"""

        # image_path => base64 bytes
        images: Dict[str, bytes] = {}

        def repl(match: Match[str]) -> str:
            title = match.group(1)
            img_ext = match.group(2)
            img_b64 = match.group(3)

            image_byte = endecode.encode_image(img_b64, errors="ignore")
            if not image_byte:
                logger.error(f"Failed to decode base64 image, skipping it: {img_b64}")
                return title

            image_path = f"{uuid.uuid4()}.{img_ext}"
            if path_prefix:
                image_path = f"{path_prefix}/{image_path}"
            images[image_path] = image_byte

            if not replace:
                return match.group(0)

            # Rewrite the image reference to point at the extracted file path
            return f"![{title}]({image_path})"

        text = self.b64_pattern.sub(repl, content)
        logger.debug(f"Extracted {len(images)} base64 images from markdown")
        return text, images

    def replace_path(self, content: str, images: Dict[str, str]) -> str:
        content_replace: set = set()

        def repl(match: Match[str]) -> str:
            title = match.group(1)
            image_path = match.group(2)
            if image_path not in images:
                return match.group(0)

            content_replace.add(image_path)
            image_path = images[image_path]
            return f"![{title}]({image_path})"

        text = self.replace_pattern.sub(repl, content)
        logger.debug(f"Replaced {len(content_replace)} images in markdown")
        return text

    @staticmethod
    def _self_test():
        your_content = "testtest"
        image_handle = MarkdownImageUtil()
        text, images = image_handle.extract_base64(your_content)
        print(text)

        for image_url, image_byte in images.items():
            with open(image_url, "wb") as f:
                f.write(image_byte)


class MarkdownImageBase64(BaseParser):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.image_helper = MarkdownImageUtil()

    def parse_into_text(self, content: bytes) -> Document:
        # Convert byte content to string using universal decoding method
        text = endecode.decode_bytes(content)
        text, img_b64 = self.image_helper.extract_base64(text, path_prefix="images")

        images: Dict[str, str] = {}
        image_replace: Dict[str, str] = {}

        logger.debug(f"Uploading {len(img_b64)} images from markdown")
        for ipath, b64_bytes in img_b64.items():
            ext = os.path.splitext(ipath)[1].lower()
            image_url = self.storage.upload_bytes(b64_bytes, ext)

            image_replace[ipath] = image_url
            images[image_url] = base64.b64encode(b64_bytes).decode()

        text = self.image_helper.replace_path(text, image_replace)
        return Document(content=text, images=images)


class MarkdownParser(PipelineParser):
    _parser_cls = (MarkdownTableFormatter, MarkdownImageBase64)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    your_content = "testtest"
    parser = MarkdownParser()

    document = parser.parse_into_text(your_content.encode())
    logger.info(document.content)
    logger.info(f"Images: {len(document.images)}, name: {document.images.keys()}")

    MarkdownImageUtil._self_test()
    MarkdownTableUtil._self_test()
31
docreader/parser/markitdown_parser.py
Normal file
@@ -0,0 +1,31 @@
import io
import logging

from markitdown import MarkItDown

from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownParser

logger = logging.getLogger(__name__)


class StdMarkitdownParser(BaseParser):
    """
    PDF Document Parser

    This parser handles PDF documents by extracting text content.
    It uses the markitdown library for simple text extraction.
    """

    def __init__(self, *args, **kwargs):
        self.markitdown = MarkItDown()

    def parse_into_text(self, content: bytes) -> Document:
        result = self.markitdown.convert(io.BytesIO(content), keep_data_uris=True)
        return Document(content=result.text_content)


class MarkitdownParser(PipelineParser):
    _parser_cls = (StdMarkitdownParser, MarkdownParser)
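# Added usage sketch (assumption, not part of the original file): PipelineParser is
# assumed to run its _parser_cls entries in order, so StdMarkitdownParser first converts
# the raw bytes to Markdown (keeping data: URIs), and MarkdownParser then normalizes
# tables and uploads the embedded base64 images, e.g.:
#
#     doc = MarkitdownParser().parse_into_text(open("report.pdf", "rb").read())
#     print(doc.content)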
132
docreader/parser/mineru_parser.py
Normal file
@@ -0,0 +1,132 @@
import logging
import os
import re
from typing import Dict

import markdownify
import requests

from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownImageUtil, MarkdownTableFormatter
from docreader.utils import endecode

logger = logging.getLogger(__name__)


class StdMinerUParser(BaseParser):
    def __init__(
        self,
        enable_markdownify: bool = True,
        mineru_endpoint: str = "",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.minerU = os.getenv("MINERU_ENDPOINT", mineru_endpoint)
        self.enable_markdownify = enable_markdownify
        self.image_helper = MarkdownImageUtil()
        self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
        self.enable = self.ping()

    def ping(self, timeout: int = 5) -> bool:
        try:
            response = requests.get(
                self.minerU + "/docs", timeout=timeout, allow_redirects=True
            )
            response.raise_for_status()
            return True
        except Exception:
            return False

    def parse_into_text(self, content: bytes) -> Document:
        if not self.enable:
            logger.debug("MinerU API is not enabled")
            return Document()

        logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)")
        md_content: str = ""
        images_b64: Dict[str, str] = {}
        try:
            response = requests.post(
                url=self.minerU + "/file_parse",
                data={
                    "return_md": True,
                    "return_images": True,
                    "lang_list": ["ch", "en"],
                    "table_enable": True,
                    "formula_enable": True,
                    "parse_method": "auto",
                    "start_page_id": 0,
                    "end_page_id": 99999,
                    "backend": "pipeline",
                    "response_format_zip": False,
                    "return_middle_json": False,
                    "return_model_output": False,
                    "return_content_list": False,
                },
                files={"files": content},
                timeout=1000,
            )
            response.raise_for_status()
            result = response.json()["results"]["files"]
            md_content = result["md_content"]
            images_b64 = result.get("images", {})
        except Exception as e:
            logger.error(f"MinerU parsing failed: {e}", exc_info=True)
            return Document()

        # convert table(HTML) in markdown to markdown table
        if self.enable_markdownify:
            logger.debug("Converting HTML to Markdown")
            md_content = markdownify.markdownify(md_content)

        images = {}
        image_replace = {}
        # an image in images_b64 may not actually be referenced in md_content
        # (for example, images that were rendered as tables), so filter those out
        for ipath, b64_str in images_b64.items():
            if f"images/{ipath}" not in md_content:
                logger.debug(f"Image {ipath} not used in markdown")
                continue
            match = self.base64_pattern.match(b64_str)
            if match:
                file_ext = match.group(1)
                b64_str = match.group(2)

            image_bytes = endecode.encode_image(b64_str, errors="ignore")
            if not image_bytes:
                logger.error("Failed to decode base64 image, skipping it")
                continue

            image_url = self.storage.upload_bytes(
                image_bytes, file_ext=f".{file_ext}"
            )

            images[image_url] = b64_str
            image_replace[f"images/{ipath}"] = image_url

        logger.info(f"Replaced {len(image_replace)} images in markdown")
        text = self.image_helper.replace_path(md_content, image_replace)

        logger.info(
            f"Successfully parsed PDF, text: {len(text)}, images: {len(images)}"
        )
        return Document(content=text, images=images)


class MinerUParser(PipelineParser):
    _parser_cls = (StdMinerUParser, MarkdownTableFormatter)
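# Added note (assumption, not part of the original file): MarkdownTableFormatter is
# presumably chained after StdMinerUParser so that tables produced by markdownify from
# MinerU's HTML output are re-normalized into tidy "|"-delimited Markdown rows.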


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    your_file = "/path/to/your/file.pdf"
    your_mineru = "http://host.docker.internal:9987"
    parser = MinerUParser(mineru_endpoint=your_mineru)
    with open(your_file, "rb") as f:
        content = f.read()
    document = parser.parse_into_text(content)
    logger.error(document.content)
327
docreader/parser/ocr_engine.py
Normal file
@@ -0,0 +1,327 @@
import io
import logging
import os
import platform
import subprocess
from abc import ABC, abstractmethod
from typing import Dict, Union

import numpy as np
from openai import OpenAI
from PIL import Image

from docreader.utils import endecode

logger = logging.getLogger(__name__)


class OCRBackend(ABC):
    """Base class for OCR backends"""

    @abstractmethod
    def predict(self, image: Union[str, bytes, Image.Image]) -> str:
        """Extract text from an image

        Args:
            image: Image file path, bytes, or PIL Image object

        Returns:
            Extracted text
        """
        pass


class DummyOCRBackend(OCRBackend):
    """Dummy OCR backend implementation"""

    def predict(self, image: Union[str, bytes, Image.Image]) -> str:
        logger.warning("Dummy OCR backend is used")
        return ""


class PaddleOCRBackend(OCRBackend):
    """PaddleOCR backend implementation"""

    def __init__(self):
        """Initialize PaddleOCR backend"""
        self.ocr = None
        try:
            import paddle

            # Set PaddlePaddle to use CPU and disable GPU
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            paddle.device.set_device("cpu")

            # Try to detect whether the CPU supports the AVX instruction set
            try:
                # Check for AVX support
                if platform.system() == "Linux":
                    try:
                        result = subprocess.run(
                            ["grep", "-o", "avx", "/proc/cpuinfo"],
                            capture_output=True,
                            text=True,
                            timeout=5,
                        )
                        has_avx = "avx" in result.stdout.lower()
                        if not has_avx:
                            logger.warning(
                                "CPU does not support AVX instructions, "
                                "using compatibility mode"
                            )
                            # Further restrict the instruction sets Paddle may use
                            os.environ["FLAGS_use_avx2"] = "0"
                            os.environ["FLAGS_use_avx"] = "1"
                    except (
                        subprocess.TimeoutExpired,
                        FileNotFoundError,
                        subprocess.SubprocessError,
                    ):
                        logger.warning(
                            "Could not detect AVX support, using compatibility mode"
                        )
                        os.environ["FLAGS_use_avx2"] = "0"
                        os.environ["FLAGS_use_avx"] = "1"
            except Exception as e:
                logger.warning(
                    f"Error detecting CPU capabilities: {e}, using compatibility mode"
                )
                os.environ["FLAGS_use_avx2"] = "0"
                os.environ["FLAGS_use_avx"] = "1"

            from paddleocr import PaddleOCR

            # OCR configuration with text orientation classification enabled
            ocr_config = {
                "use_gpu": False,
                "text_det_limit_type": "max",
                "text_det_limit_side_len": 960,
                "use_doc_orientation_classify": True,  # enable document orientation classification
                "use_doc_unwarping": False,
                "use_textline_orientation": True,  # enable text line orientation detection
                "text_recognition_model_name": "PP-OCRv4_server_rec",
                "text_detection_model_name": "PP-OCRv4_server_det",
                "text_det_thresh": 0.3,
                "text_det_box_thresh": 0.6,
                "text_det_unclip_ratio": 1.5,
                "text_rec_score_thresh": 0.0,
                "ocr_version": "PP-OCRv4",
                "lang": "ch",
                "show_log": False,
                "use_dilation": True,  # improves accuracy
                "det_db_score_mode": "slow",  # improves accuracy
            }

            self.ocr = PaddleOCR(**ocr_config)
            logger.info("PaddleOCR engine initialized successfully")

        except ImportError as e:
            logger.error(
                f"Failed to import paddleocr: {str(e)}. "
                "Please install it with 'pip install paddleocr'"
            )
        except OSError as e:
            if "Illegal instruction" in str(e) or "core dumped" in str(e):
                logger.error(
                    f"PaddlePaddle crashed due to CPU instruction set incompatibility:"
                    f"{e}"
                )
                logger.error(
                    "This happens when the CPU doesn't support AVX instructions. "
                    "Try installing the CPU-only version of PaddlePaddle, "
                    "or use a different OCR backend."
                )
            else:
                logger.error(
                    f"Failed to initialize PaddleOCR due to OS error: {str(e)}"
                )
        except Exception as e:
            logger.error(f"Failed to initialize PaddleOCR: {str(e)}")

    def predict(self, image: Union[str, bytes, Image.Image]) -> str:
        """Extract text from an image

        Args:
            image: Image file path, bytes, or PIL Image object

        Returns:
            Extracted text
        """
        if isinstance(image, str):
            image = Image.open(image)
        elif isinstance(image, bytes):
            image = Image.open(io.BytesIO(image))

        if not isinstance(image, Image.Image):
            raise TypeError("image must be a string, bytes, or PIL Image object")

        return self._predict(image)

    def _predict(self, image: Image.Image) -> str:
        """Perform OCR recognition on the image

        Args:
            image: Image object (PIL.Image or numpy array)

        Returns:
            Extracted text string
        """
        if self.ocr is None:
            logger.error("PaddleOCR engine not initialized")
            return ""
        try:
            # Ensure image is in RGB format
            if image.mode != "RGB":
                image = image.convert("RGB")

            # Convert to numpy array if needed
            image_array = np.array(image)

            # Perform OCR
            ocr_result = self.ocr.ocr(image_array, cls=False)

            # Extract text
            ocr_text = ""
            if ocr_result and ocr_result[0]:
                text = [
                    line[1][0] if line and len(line) >= 2 and line[1] else ""
                    for line in ocr_result[0]
                ]
                text = [t.strip() for t in text if t]
                ocr_text = " ".join(text)

            logger.info(f"OCR extracted {len(ocr_text)} characters")
            return ocr_text

        except Exception as e:
            logger.error(f"OCR recognition error: {str(e)}")
            return ""


class NanonetsOCRBackend(OCRBackend):
    """Nanonets OCR backend implementation using OpenAI API format"""

    def __init__(self):
        """Initialize Nanonets OCR backend.

        Connection settings are read from the OCR_API_BASE_URL, OCR_API_KEY and
        OCR_MODEL environment variables.
        """
        base_url = os.getenv("OCR_API_BASE_URL", "http://localhost:8000/v1")
        api_key = os.getenv("OCR_API_KEY", "123")
        timeout = 30
        self.client = OpenAI(api_key=api_key, base_url=base_url, timeout=timeout)

        self.model = os.getenv("OCR_MODEL", "nanonets/Nanonets-OCR-s")
        logger.info(f"Nanonets OCR engine initialized with model: {self.model}")
        self.temperature = 0.0
        self.max_tokens = 15000
        self.prompt = """## 任务说明

请从上传的文档中提取文字内容,严格按自然阅读顺序(从上到下,从左到右)输出,并遵循以下格式规范。

### 1. **文本处理**

* 按正常阅读顺序提取文字,语句流畅自然。

### 2. **表格**

* 所有表格统一转换为 **Markdown 表格格式**。
* 内容保持清晰、对齐整齐,便于阅读。

### 3. **公式**

* 所有公式转换为 **LaTeX 格式**,使用 `$$公式$$` 包裹。

### 4. **图片**

* 忽略图片信息

### 5. **链接**

* 不要猜测或补全不确定的链接地址。
"""

    def predict(self, image: Union[str, bytes, Image.Image]) -> str:
        """Extract text from an image using Nanonets OCR

        Args:
            image: Image file path, bytes, or PIL Image object

        Returns:
            Extracted text
        """
        if self.client is None:
            logger.error("Nanonets OCR client not initialized")
            return ""

        try:
            # Encode image to base64
            img_base64 = endecode.decode_image(image)
            if not img_base64:
                return ""

            # Call Nanonets OCR API
            logger.info(f"Calling Nanonets OCR API with model: {self.model}")
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{img_base64}"
                                },
                            },
                            {
                                "type": "text",
                                "text": self.prompt,
                            },
                        ],
                    }
                ],
                temperature=self.temperature,
                max_tokens=self.max_tokens,
            )
            return response.choices[0].message.content or ""
        except Exception as e:
            logger.error(f"Nanonets OCR prediction error: {str(e)}")
            return ""


class OCREngine:
    """OCR Engine factory class"""

    _instance: Dict[str, OCRBackend] = {}

    @classmethod
    def get_instance(cls, backend_type: str) -> OCRBackend:
        """Get OCR engine instance

        Args:
            backend_type: OCR backend type, one of: "paddle", "nanonets"

        Returns:
            OCR engine instance (a DummyOCRBackend is returned for unknown types)
        """
        backend_type = backend_type.lower()
        if cls._instance.get(backend_type):
            return cls._instance[backend_type]

        logger.info(f"Initializing OCR engine with backend: {backend_type}")

        if backend_type == "paddle":
            cls._instance[backend_type] = PaddleOCRBackend()

        elif backend_type == "nanonets":
            cls._instance[backend_type] = NanonetsOCRBackend()

        else:
            cls._instance[backend_type] = DummyOCRBackend()

        return cls._instance[backend_type]
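# Illustrative usage sketch (added; grounded in the predict() signature above, the
# backend name "paddle" is just an example):
#
#     ocr = OCREngine.get_instance("paddle")
#     page_text = ocr.predict("/path/to/page.png")  # also accepts bytes or a PIL.Image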
163
docreader/parser/parser.py
Normal file
@@ -0,0 +1,163 @@
import logging
from typing import Dict, Type

from docreader.models.document import Document
from docreader.models.read_config import ChunkingConfig
from docreader.parser.base_parser import BaseParser
from docreader.parser.csv_parser import CSVParser
from docreader.parser.doc_parser import DocParser
from docreader.parser.docx2_parser import Docx2Parser
from docreader.parser.excel_parser import ExcelParser
from docreader.parser.image_parser import ImageParser
from docreader.parser.markdown_parser import MarkdownParser
from docreader.parser.pdf_parser import PDFParser
from docreader.parser.text_parser import TextParser
from docreader.parser.web_parser import WebParser

logger = logging.getLogger(__name__)


class Parser:
    """
    Document parser facade that integrates all specialized parsers.
    Provides a unified interface for parsing various document types.
    """

    def __init__(self):
        # Initialize all parser types
        self.parsers: Dict[str, Type[BaseParser]] = {
            "docx": Docx2Parser,
            "doc": DocParser,
            "pdf": PDFParser,
            "md": MarkdownParser,
            "txt": TextParser,
            "jpg": ImageParser,
            "jpeg": ImageParser,
            "png": ImageParser,
            "gif": ImageParser,
            "bmp": ImageParser,
            "tiff": ImageParser,
            "webp": ImageParser,
            "markdown": MarkdownParser,
            "csv": CSVParser,
            "xlsx": ExcelParser,
            "xls": ExcelParser,
        }
        logger.info(
            "Parser initialized with %d parsers: %s",
            len(self.parsers),
            ", ".join(self.parsers.keys()),
        )

    def get_parser(self, file_type: str) -> Type[BaseParser]:
        """
        Get parser class for the specified file type.

        Args:
            file_type: The file extension or type identifier

        Returns:
            Parser class for the file type

        Raises:
            ValueError: If the file type is not supported
        """
        parser = self.parsers.get(file_type.lower())
        if not parser:
            raise ValueError(f"Unsupported file type: {file_type}")
        return parser

    def parse_file(
        self,
        file_name: str,
        file_type: str,
        content: bytes,
        config: ChunkingConfig,
    ) -> Document:
        """
        Parse file content using appropriate parser based on file type.

        Args:
            file_name: Name of the file being parsed
            file_type: Type/extension of the file
            content: Raw file content as bytes
            config: Configuration for chunking process

        Returns:
            Document containing the parsed content, chunks and metadata
        """
        logger.info(f"Parsing file: {file_name} with type: {file_type}")
        logger.info(
            f"Chunking config: size={config.chunk_size}, "
            f"overlap={config.chunk_overlap}, "
            f"multimodal={config.enable_multimodal}"
        )

        # Get appropriate parser for file type
        cls = self.get_parser(file_type)

        # Parse file content
        logger.info(f"Creating parser instance for {file_type} file")
        parser = cls(
            file_name=file_name,
            file_type=file_type,
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap,
            separators=config.separators,
            enable_multimodal=config.enable_multimodal,
            max_image_size=1920,  # Limit image size to 1920px
            max_concurrent_tasks=5,  # Limit concurrent tasks to 5
            chunking_config=config,  # Pass the entire chunking config
        )

        logger.info(f"Starting to parse file content, size: {len(content)} bytes")
        result = parser.parse(content)

        if not result.content:
            logger.warning(f"Parser returned empty content for file: {file_name}")
        elif not result.chunks:
            logger.warning(f"Parser returned empty chunks for file: {file_name}")
        elif result.chunks[0]:
            logger.info(f"First chunk content length: {len(result.chunks[0].content)}")
        logger.info(f"Parsed file {file_name}, with {len(result.chunks)} chunks")
        return result

    def parse_url(self, url: str, title: str, config: ChunkingConfig) -> Document:
        """
        Parse content from a URL using the WebParser.

        Args:
            url: URL to parse
            title: Title of the webpage (for metadata)
            config: Configuration for chunking process

        Returns:
            Document containing the parsed content, chunks and metadata
        """
        logger.info(f"Parsing URL: {url}, title: {title}")
        logger.info(
            f"Chunking config: size={config.chunk_size}, "
            f"overlap={config.chunk_overlap}, multimodal={config.enable_multimodal}"
        )

        # Create web parser instance
        logger.info("Creating WebParser instance")
        parser = WebParser(
            title=title,
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap,
            separators=config.separators,
            enable_multimodal=config.enable_multimodal,
            max_image_size=1920,  # Limit image size
            max_concurrent_tasks=5,  # Limit concurrent tasks
            chunking_config=config,
        )

        logger.info("Starting to parse URL content")
        result = parser.parse(url.encode())

        if not result.content:
            logger.warning(f"Parser returned empty content for url: {url}")
        elif not result.chunks:
            logger.warning(f"Parser returned empty chunks for url: {url}")
        elif result.chunks[0]:
            logger.info(f"First chunk content length: {len(result.chunks[0].content)}")
        logger.info(f"Parsed url {url}, with {len(result.chunks)} chunks")
        return result
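# Illustrative usage sketch (added; the ChunkingConfig field names are taken from the
# attributes referenced above, the constructor call itself is an assumption):
#
#     config = ChunkingConfig(chunk_size=512, chunk_overlap=64,
#                             separators=["\n\n", "\n"], enable_multimodal=False)
#     with open("report.pdf", "rb") as f:
#         doc = Parser().parse_file("report.pdf", "pdf", f.read(), config)
#     print(len(doc.chunks))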
7
docreader/parser/pdf_parser.py
Normal file
@@ -0,0 +1,7 @@
from docreader.parser.chain_parser import FirstParser
from docreader.parser.markitdown_parser import MarkitdownParser
from docreader.parser.mineru_parser import MinerUParser


class PDFParser(FirstParser):
    _parser_cls = (MinerUParser, MarkitdownParser)
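# Added note (assumption, not part of the original file): FirstParser is assumed to try
# its _parser_cls entries in order and keep the first non-empty result, so MinerU handles
# the PDF when its endpoint is reachable and MarkitdownParser acts as the local fallback.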
@@ -1,64 +1,68 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import uuid
|
||||
import logging
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import traceback
|
||||
import uuid
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Tuple, Optional
|
||||
from typing import Dict
|
||||
|
||||
from qcloud_cos import CosConfig, CosS3Client
|
||||
from minio import Minio
|
||||
from qcloud_cos import CosConfig, CosS3Client
|
||||
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
class Storage(ABC):
|
||||
"""Abstract base class for object storage operations"""
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def upload_file(self, file_path: str) -> str:
|
||||
"""Upload file to object storage
|
||||
|
||||
|
||||
Args:
|
||||
file_path: File path
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
|
||||
"""Upload bytes to object storage
|
||||
|
||||
Args:
|
||||
content: Byte content to upload
|
||||
file_ext: File extension
|
||||
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
|
||||
"""Upload bytes to object storage
|
||||
|
||||
Args:
|
||||
content: Byte content to upload
|
||||
file_ext: File extension
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class CosStorage(Storage):
|
||||
"""Tencent Cloud COS storage implementation"""
|
||||
|
||||
|
||||
def __init__(self, storage_config=None):
|
||||
"""Initialize COS storage
|
||||
|
||||
|
||||
Args:
|
||||
storage_config: Storage configuration
|
||||
"""
|
||||
self.storage_config = storage_config
|
||||
self.client, self.bucket_name, self.region, self.prefix = self._init_cos_client()
|
||||
|
||||
self.client, self.bucket_name, self.region, self.prefix = (
|
||||
self._init_cos_client()
|
||||
)
|
||||
|
||||
def _init_cos_client(self):
|
||||
"""Initialize Tencent Cloud COS client"""
|
||||
try:
|
||||
# Use provided COS config if available, otherwise fall back to environment variables
|
||||
# Use provided COS config if available,
|
||||
# otherwise fall back to environment variables
|
||||
if self.storage_config and self.storage_config.get("access_key_id") != "":
|
||||
cos_config = self.storage_config
|
||||
secret_id = cos_config.get("access_key_id")
|
||||
@@ -75,15 +79,16 @@ class CosStorage(Storage):
|
||||
bucket_name = os.getenv("COS_BUCKET_NAME")
|
||||
appid = os.getenv("COS_APP_ID")
|
||||
prefix = os.getenv("COS_PATH_PREFIX")
|
||||
|
||||
|
||||
enable_old_domain = (
|
||||
os.getenv("COS_ENABLE_OLD_DOMAIN", "true").lower() == "true"
|
||||
)
|
||||
|
||||
if not all([secret_id, secret_key, region, bucket_name, appid]):
|
||||
logger.error(
|
||||
"Incomplete COS configuration, missing required environment variables"
|
||||
f"secret_id: {secret_id}, secret_key: {secret_key}, region: {region}, bucket_name: {bucket_name}, appid: {appid}"
|
||||
"Incomplete COS configuration, missing environment variables"
|
||||
f"secret_id: {secret_id}, secret_key: {secret_key}, "
|
||||
f"region: {region}, bucket_name: {bucket_name}, appid: {appid}"
|
||||
)
|
||||
return None, None, None, None
|
||||
|
||||
@@ -105,27 +110,26 @@ class CosStorage(Storage):
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize COS client: {str(e)}")
|
||||
return None, None, None, None
|
||||
|
||||
|
||||
def _get_download_url(self, bucket_name, region, object_key):
|
||||
"""Generate COS object URL
|
||||
|
||||
|
||||
Args:
|
||||
bucket_name: Bucket name
|
||||
region: Region
|
||||
object_key: Object key
|
||||
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
return f"https://{bucket_name}.cos.{region}.myqcloud.com/{object_key}"
|
||||
|
||||
|
||||
|
||||
def upload_file(self, file_path: str) -> str:
|
||||
"""Upload file to Tencent Cloud COS
|
||||
|
||||
|
||||
Args:
|
||||
file_path: File path
|
||||
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
@@ -135,16 +139,16 @@ class CosStorage(Storage):
|
||||
return ""
|
||||
|
||||
# Generate object key, use UUID to avoid conflicts
|
||||
file_name = os.path.basename(file_path)
|
||||
object_key = (
|
||||
f"{self.prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
|
||||
)
|
||||
file_ext = os.path.splitext(file_path)[1]
|
||||
object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
|
||||
logger.info(f"Generated object key: {object_key}")
|
||||
|
||||
# Upload file
|
||||
logger.info("Attempting to upload file to COS")
|
||||
response = self.client.upload_file(
|
||||
Bucket=self.bucket_name, LocalFilePath=file_path, Key=object_key
|
||||
self.client.upload_file(
|
||||
Bucket=self.bucket_name,
|
||||
LocalFilePath=file_path,
|
||||
Key=object_key,
|
||||
)
|
||||
|
||||
# Get file URL
|
||||
@@ -156,14 +160,14 @@ class CosStorage(Storage):
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to upload file to COS: {str(e)}")
|
||||
return ""
|
||||
|
||||
|
||||
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
|
||||
"""Upload bytes to Tencent Cloud COS
|
||||
|
||||
|
||||
Args:
|
||||
content: Byte content to upload
|
||||
file_ext: File extension
|
||||
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
@@ -171,10 +175,16 @@ class CosStorage(Storage):
|
||||
logger.info(f"Uploading bytes content to COS, size: {len(content)} bytes")
|
||||
if not self.client:
|
||||
return ""
|
||||
|
||||
object_key = f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.prefix else f"images/{uuid.uuid4().hex}{file_ext}"
|
||||
|
||||
object_key = (
|
||||
f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
|
||||
if self.prefix
|
||||
else f"images/{uuid.uuid4().hex}{file_ext}"
|
||||
)
|
||||
logger.info(f"Generated object key: {object_key}")
|
||||
self.client.put_object(Bucket=self.bucket_name, Body=content, Key=object_key)
|
||||
self.client.put_object(
|
||||
Bucket=self.bucket_name, Body=content, Key=object_key
|
||||
)
|
||||
file_url = self._get_download_url(self.bucket_name, self.region, object_key)
|
||||
logger.info(f"Successfully uploaded bytes to COS: {file_url}")
|
||||
return file_url
|
||||
@@ -186,16 +196,18 @@ class CosStorage(Storage):
|
||||
|
||||
class MinioStorage(Storage):
|
||||
"""MinIO storage implementation"""
|
||||
|
||||
|
||||
def __init__(self, storage_config=None):
|
||||
"""Initialize MinIO storage
|
||||
|
||||
|
||||
Args:
|
||||
storage_config: Storage configuration
|
||||
"""
|
||||
self.storage_config = storage_config
|
||||
self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = self._init_minio_client()
|
||||
|
||||
self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = (
|
||||
self._init_minio_client()
|
||||
)
|
||||
|
||||
def _init_minio_client(self):
|
||||
"""Initialize MinIO client from environment variables or injected config.
|
||||
|
||||
@@ -203,58 +215,69 @@ class MinioStorage(Storage):
|
||||
prefer those values to override envs.
|
||||
"""
|
||||
try:
|
||||
endpoint = os.getenv("MINIO_ENDPOINT")
|
||||
endpoint = os.getenv("MINIO_ENDPOINT", "")
|
||||
use_ssl = os.getenv("MINIO_USE_SSL", "false").lower() == "true"
|
||||
if self.storage_config and self.storage_config.get("bucket_name"):
|
||||
storage_config = self.storage_config
|
||||
bucket_name = storage_config.get("bucket_name")
|
||||
bucket_name = storage_config.get("bucket_name", "")
|
||||
path_prefix = storage_config.get("path_prefix").strip().strip("/")
|
||||
access_key = storage_config.get("access_key_id")
|
||||
secret_key = storage_config.get("secret_access_key")
|
||||
else:
|
||||
access_key = os.getenv("MINIO_ACCESS_KEY_ID")
|
||||
secret_key = os.getenv("MINIO_SECRET_ACCESS_KEY")
|
||||
bucket_name = os.getenv("MINIO_BUCKET_NAME")
|
||||
bucket_name = os.getenv("MINIO_BUCKET_NAME", "")
|
||||
path_prefix = os.getenv("MINIO_PATH_PREFIX", "").strip().strip("/")
|
||||
|
||||
if not all([endpoint, access_key, secret_key, bucket_name]):
|
||||
logger.error("Incomplete MinIO configuration, missing required environment variables")
|
||||
logger.error(
|
||||
"Incomplete MinIO configuration, missing environment variables"
|
||||
)
|
||||
return None, None, None, None, None
|
||||
|
||||
# Initialize client
|
||||
client = Minio(endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl)
|
||||
client = Minio(
|
||||
endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl
|
||||
)
|
||||
|
||||
# Ensure bucket exists
|
||||
found = client.bucket_exists(bucket_name)
|
||||
if not found:
|
||||
client.make_bucket(bucket_name)
|
||||
policy = '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}' % (bucket_name, bucket_name)
|
||||
policy = (
|
||||
'{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetBucketLocation","s3:ListBucket"],"Resource":["arn:aws:s3:::%s"]},{"Effect":"Allow","Principal":{"AWS":["*"]},"Action":["s3:GetObject"],"Resource":["arn:aws:s3:::%s/*"]}]}'
|
||||
% (bucket_name, bucket_name)
|
||||
)
|
||||
client.set_bucket_policy(bucket_name, policy)
|
||||
|
||||
return client, bucket_name, use_ssl, endpoint, path_prefix
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize MinIO client: {str(e)}")
|
||||
return None, None, None, None, None
|
||||
|
||||
def _get_download_url(self, bucket_name: str, object_key: str, use_ssl: bool, endpoint: str, public_endpoint: str = None):
|
||||
|
||||
def _get_download_url(self, object_key: str):
|
||||
"""Construct a public URL for MinIO object.
|
||||
|
||||
If MINIO_PUBLIC_ENDPOINT is provided, use it; otherwise fallback to endpoint.
|
||||
"""
|
||||
if public_endpoint:
|
||||
base = public_endpoint
|
||||
else:
|
||||
scheme = "https" if use_ssl else "http"
|
||||
base = f"{scheme}://{endpoint}"
|
||||
# Path-style URL for MinIO
|
||||
return f"{base}/{bucket_name}/{object_key}"
|
||||
|
||||
# 1. Use public endpoint if provided
|
||||
endpoint = os.getenv("MINIO_PUBLIC_ENDPOINT")
|
||||
if endpoint:
|
||||
return f"{endpoint}/{self.bucket_name}/{object_key}"
|
||||
|
||||
# 2. Use SSL if enabled
|
||||
if self.use_ssl:
|
||||
return f"https://{self.endpoint}/{self.bucket_name}/{object_key}"
|
||||
|
||||
# 3. Use HTTP default
|
||||
return f"http://{self.endpoint}/{self.bucket_name}/{object_key}"
|
||||
|
||||
def upload_file(self, file_path: str) -> str:
|
||||
"""Upload file to MinIO
|
||||
|
||||
|
||||
Args:
|
||||
file_path: File path
|
||||
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
@@ -265,29 +288,27 @@ class MinioStorage(Storage):
|
||||
|
||||
# Generate object key, use UUID to avoid conflicts
|
||||
file_name = os.path.basename(file_path)
|
||||
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}" if self.path_prefix else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
|
||||
object_key = (
|
||||
f"{self.path_prefix}/images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
|
||||
if self.path_prefix
|
||||
else f"images/{uuid.uuid4().hex}{os.path.splitext(file_name)[1]}"
|
||||
)
|
||||
logger.info(f"Generated MinIO object key: {object_key}")
|
||||
|
||||
# Upload file
|
||||
logger.info("Attempting to upload file to MinIO")
|
||||
with open(file_path, 'rb') as file_data:
|
||||
with open(file_path, "rb") as file_data:
|
||||
file_size = os.path.getsize(file_path)
|
||||
self.client.put_object(
|
||||
bucket_name=self.bucket_name,
|
||||
bucket_name=self.bucket_name or "",
|
||||
object_name=object_key,
|
||||
data=file_data,
|
||||
length=file_size,
|
||||
content_type='application/octet-stream'
|
||||
content_type="application/octet-stream",
|
||||
)
|
||||
|
||||
# Get file URL
|
||||
file_url = self._get_download_url(
|
||||
self.bucket_name,
|
||||
object_key,
|
||||
self.use_ssl,
|
||||
self.endpoint,
|
||||
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
|
||||
)
|
||||
file_url = self._get_download_url(object_key)
|
||||
|
||||
logger.info(f"Successfully uploaded file to MinIO: {file_url}")
|
||||
return file_url
|
||||
@@ -295,14 +316,14 @@ class MinioStorage(Storage):
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to upload file to MinIO: {str(e)}")
|
||||
return ""
|
||||
|
||||
|
||||
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
|
||||
"""Upload bytes to MinIO
|
||||
|
||||
|
||||
Args:
|
||||
content: Byte content to upload
|
||||
file_ext: File extension
|
||||
|
||||
|
||||
Returns:
|
||||
File URL
|
||||
"""
|
||||
@@ -310,23 +331,21 @@ class MinioStorage(Storage):
|
||||
logger.info(f"Uploading bytes content to MinIO, size: {len(content)} bytes")
|
||||
if not self.client:
|
||||
return ""
|
||||
|
||||
object_key = f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}" if self.path_prefix else f"images/{uuid.uuid4().hex}{file_ext}"
|
||||
|
||||
object_key = (
|
||||
f"{self.path_prefix}/images/{uuid.uuid4().hex}{file_ext}"
|
||||
if self.path_prefix
|
||||
else f"images/{uuid.uuid4().hex}{file_ext}"
|
||||
)
|
||||
logger.info(f"Generated MinIO object key: {object_key}")
|
||||
self.client.put_object(
|
||||
self.bucket_name,
|
||||
object_key,
|
||||
data=io.BytesIO(content),
|
||||
length=len(content),
|
||||
content_type="application/octet-stream"
|
||||
)
|
||||
file_url = self._get_download_url(
|
||||
self.bucket_name,
|
||||
object_key,
|
||||
self.use_ssl,
|
||||
self.endpoint,
|
||||
os.getenv("MINIO_PUBLIC_ENDPOINT", None)
|
||||
self.bucket_name or "",
|
||||
object_key,
|
||||
data=io.BytesIO(content),
|
||||
length=len(content),
|
||||
content_type="application/octet-stream",
|
||||
)
|
||||
file_url = self._get_download_url(object_key)
|
||||
logger.info(f"Successfully uploaded bytes to MinIO: {file_url}")
|
||||
return file_url
|
||||
except Exception as e:
|
||||
@@ -335,26 +354,61 @@ class MinioStorage(Storage):
|
||||
return ""
|
||||
|
||||
|
||||
def create_storage(storage_config=None) -> Storage:
|
||||
class LocalStorage(Storage):
|
||||
"""Local file system storage implementation"""
|
||||
|
||||
def __init__(self, storage_config: Dict[str, str] = {}):
|
||||
self.storage_config = storage_config
|
||||
base_dir = storage_config.get(
|
||||
"base_dir", os.getenv("LOCAL_STORAGE_BASE_DIR", "")
|
||||
)
|
||||
self.image_dir = os.path.join(base_dir, "images")
|
||||
os.makedirs(self.image_dir, exist_ok=True)
|
||||
|
||||
def upload_file(self, file_path: str) -> str:
|
||||
logger.info(f"Uploading file to local storage: {file_path}")
|
||||
return file_path
|
||||
|
||||
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
|
||||
logger.info(f"Uploading file to local storage: {len(content)} bytes")
|
||||
fname = os.path.join(self.image_dir, f"{uuid.uuid4()}{file_ext}")
|
||||
with open(fname, "wb") as f:
|
||||
f.write(content)
|
||||
return fname
|
||||
|
||||
|
||||
class Base64Storage(Storage):
|
||||
def upload_file(self, file_path: str) -> str:
|
||||
logger.info(f"Uploading file to base64 storage: {file_path}")
|
||||
return file_path
|
||||
|
||||
def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
|
||||
logger.info(f"Uploading file to base64 storage: {len(content)} bytes")
|
||||
file_ext = file_ext.lstrip(".")
|
||||
return f"data:image/{file_ext};base64,{endecode.decode_image(content)}"
|
||||
|
||||
|
||||
def create_storage(storage_config: Dict[str, str] | None = None) -> Storage:
|
||||
"""Create a storage instance based on configuration or environment variables
|
||||
|
||||
|
||||
Args:
|
||||
storage_config: Storage configuration dictionary
|
||||
|
||||
|
||||
Returns:
|
||||
Storage instance
|
||||
"""
|
||||
storage_type = os.getenv("STORAGE_TYPE", "cos").lower()
|
||||
|
||||
if storage_config:
|
||||
storage_type = str(storage_config.get("provider", storage_type)).lower()
|
||||
|
||||
logger.info(f"Creating {storage_type} storage instance")
|
||||
|
||||
|
||||
if storage_type == "minio":
|
||||
return MinioStorage(storage_config)
|
||||
elif storage_type == "cos":
|
||||
# Default to COS
|
||||
return CosStorage(storage_config)
|
||||
else:
|
||||
return None
|
||||
elif storage_type == "local":
|
||||
return LocalStorage(storage_config or {})
|
||||
elif storage_type == "base64":
|
||||
return Base64Storage()
|
||||
|
||||
raise ValueError(f"Invalid storage type: {storage_type}")
|
||||
@@ -1,6 +1,8 @@
import logging
from .base_parser import BaseParser
from typing import Dict, Any, Tuple, Union

from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode

logger = logging.getLogger(__name__)

@@ -11,7 +13,7 @@ class TextParser(BaseParser):
    This parser handles text extraction and chunking from plain text documents.
    """

    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
    def parse_into_text(self, content: bytes) -> Document:
        """
        Parse text document content by decoding bytes to string.

@@ -25,20 +27,15 @@
            Parsed text content as string
        """
        logger.info(f"Parsing text document, content size: {len(content)} bytes")
        text = self.decode_bytes(content)
        text = endecode.decode_bytes(content)
        logger.info(
            f"Successfully parsed text document, extracted {len(text)} characters"
        )
        return text
        return Document(content=text)


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    logger.info("Running TextParser in standalone mode")
    logger = logging.getLogger(__name__)

    # Sample text for testing
    text = """## 标题1
104
docreader/parser/web_parser.py
Normal file
@@ -0,0 +1,104 @@
import asyncio
import logging
import os

from playwright.async_api import async_playwright
from trafilatura import extract

from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.markdown_parser import MarkdownParser
from docreader.utils import endecode

logger = logging.getLogger(__name__)


class StdWebParser(BaseParser):
    """Web page parser"""

    def __init__(self, title: str, **kwargs):
        self.title = title
        self.proxy = os.environ.get("WEB_PROXY", "")
        super().__init__(file_name=title, **kwargs)
        logger.info(f"Initialized WebParser with title: {title}")

    async def scrape(self, url: str) -> str:
        logger.info(f"Starting web page scraping for URL: {url}")
        try:
            async with async_playwright() as p:
                kwargs = {}
                if self.proxy:
                    kwargs["proxy"] = {"server": self.proxy}
                logger.info("Launching WebKit browser")
                browser = await p.webkit.launch(**kwargs)
                page = await browser.new_page()

                logger.info(f"Navigating to URL: {url}")
                try:
                    await page.goto(url, timeout=30000)
                    logger.info("Initial page load complete")
                except Exception as e:
                    logger.error(f"Error navigating to URL: {str(e)}")
                    await browser.close()
                    return ""

                logger.info("Retrieving page HTML content")
                content = await page.content()
                logger.info(f"Retrieved {len(content)} bytes of HTML content")

                await browser.close()
                logger.info("Browser closed")

                # Hand back the raw HTML; Markdown extraction happens in parse_into_text
                logger.info("Successfully retrieved HTML content")
                return content

        except Exception as e:
            logger.error(f"Failed to scrape web page: {str(e)}")
            # Return an empty string on error
            return ""

    def parse_into_text(self, content: bytes) -> Document:
        """Parse web page

        Args:
            content: Web page URL encoded as bytes

        Returns:
            Parse result
        """
        url = endecode.decode_bytes(content)

        logger.info(f"Scraping web page: {url}")
        chtml = asyncio.run(self.scrape(url))
        md_text = extract(
            chtml,
            output_format="markdown",
            with_metadata=True,
            include_images=True,
            include_tables=True,
            include_links=True,
            deduplicate=True,
        )
        if not md_text:
            logger.error("Failed to parse web page")
            return Document(content=f"Error parsing web page: {url}")
        return Document(content=md_text)


class WebParser(PipelineParser):
    _parser_cls = (StdWebParser, MarkdownParser)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    logger.setLevel(logging.DEBUG)

    url = "https://cloud.tencent.com/document/product/457/6759"

    parser = WebParser(title="")
    cc = parser.parse_into_text(url.encode())
    with open("./tencent.md", "w") as f:
        f.write(cc.content)
55
docreader/proto/docreader_pb2.py
Normal file
@@ -0,0 +1,55 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# NO CHECKED-IN PROTOBUF GENCODE
|
||||
# source: docreader.proto
|
||||
# Protobuf Python Version: 6.31.1
|
||||
"""Generated protocol buffer code."""
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import descriptor_pool as _descriptor_pool
|
||||
from google.protobuf import runtime_version as _runtime_version
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
from google.protobuf.internal import builder as _builder
|
||||
_runtime_version.ValidateProtobufRuntimeVersion(
|
||||
_runtime_version.Domain.PUBLIC,
|
||||
6,
|
||||
31,
|
||||
1,
|
||||
'',
|
||||
'docreader.proto'
|
||||
)
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
|
||||
|
||||
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0f\x64ocreader.proto\x12\tdocreader\"\xb9\x01\n\rStorageConfig\x12,\n\x08provider\x18\x01 \x01(\x0e\x32\x1a.docreader.StorageProvider\x12\x0e\n\x06region\x18\x02 \x01(\t\x12\x13\n\x0b\x62ucket_name\x18\x03 \x01(\t\x12\x15\n\raccess_key_id\x18\x04 \x01(\t\x12\x19\n\x11secret_access_key\x18\x05 \x01(\t\x12\x0e\n\x06\x61pp_id\x18\x06 \x01(\t\x12\x13\n\x0bpath_prefix\x18\x07 \x01(\t\"Z\n\tVLMConfig\x12\x12\n\nmodel_name\x18\x01 \x01(\t\x12\x10\n\x08\x62\x61se_url\x18\x02 \x01(\t\x12\x0f\n\x07\x61pi_key\x18\x03 \x01(\t\x12\x16\n\x0einterface_type\x18\x04 \x01(\t\"\xc2\x01\n\nReadConfig\x12\x12\n\nchunk_size\x18\x01 \x01(\x05\x12\x15\n\rchunk_overlap\x18\x02 \x01(\x05\x12\x12\n\nseparators\x18\x03 \x03(\t\x12\x19\n\x11\x65nable_multimodal\x18\x04 \x01(\x08\x12\x30\n\x0estorage_config\x18\x05 \x01(\x0b\x32\x18.docreader.StorageConfig\x12(\n\nvlm_config\x18\x06 \x01(\x0b\x32\x14.docreader.VLMConfig\"\x91\x01\n\x13ReadFromFileRequest\x12\x14\n\x0c\x66ile_content\x18\x01 \x01(\x0c\x12\x11\n\tfile_name\x18\x02 \x01(\t\x12\x11\n\tfile_type\x18\x03 \x01(\t\x12*\n\x0bread_config\x18\x04 \x01(\x0b\x32\x15.docreader.ReadConfig\x12\x12\n\nrequest_id\x18\x05 \x01(\t\"p\n\x12ReadFromURLRequest\x12\x0b\n\x03url\x18\x01 \x01(\t\x12\r\n\x05title\x18\x02 \x01(\t\x12*\n\x0bread_config\x18\x03 \x01(\x0b\x32\x15.docreader.ReadConfig\x12\x12\n\nrequest_id\x18\x04 \x01(\t\"i\n\x05Image\x12\x0b\n\x03url\x18\x01 \x01(\t\x12\x0f\n\x07\x63\x61ption\x18\x02 \x01(\t\x12\x10\n\x08ocr_text\x18\x03 \x01(\t\x12\x14\n\x0coriginal_url\x18\x04 \x01(\t\x12\r\n\x05start\x18\x05 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x06 \x01(\x05\"c\n\x05\x43hunk\x12\x0f\n\x07\x63ontent\x18\x01 \x01(\t\x12\x0b\n\x03seq\x18\x02 \x01(\x05\x12\r\n\x05start\x18\x03 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x04 \x01(\x05\x12 \n\x06images\x18\x05 \x03(\x0b\x32\x10.docreader.Image\"?\n\x0cReadResponse\x12 \n\x06\x63hunks\x18\x01 \x03(\x0b\x32\x10.docreader.Chunk\x12\r\n\x05\x65rror\x18\x02 \x01(\t*G\n\x0fStorageProvider\x12 \n\x1cSTORAGE_PROVIDER_UNSPECIFIED\x10\x00\x12\x07\n\x03\x43OS\x10\x01\x12\t\n\x05MINIO\x10\x02\x32\x9f\x01\n\tDocReader\x12I\n\x0cReadFromFile\x12\x1e.docreader.ReadFromFileRequest\x1a\x17.docreader.ReadResponse\"\x00\x12G\n\x0bReadFromURL\x12\x1d.docreader.ReadFromURLRequest\x1a\x17.docreader.ReadResponse\"\x00\x42\x35Z3github.com/Tencent/WeKnora/internal/docreader/protob\x06proto3')
|
||||
|
||||
_globals = globals()
|
||||
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
|
||||
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'docreader_pb2', _globals)
|
||||
if not _descriptor._USE_C_DESCRIPTORS:
|
||||
_globals['DESCRIPTOR']._loaded_options = None
|
||||
_globals['DESCRIPTOR']._serialized_options = b'Z3github.com/Tencent/WeKnora/internal/docreader/proto'
|
||||
_globals['_STORAGEPROVIDER']._serialized_start=1042
|
||||
_globals['_STORAGEPROVIDER']._serialized_end=1113
|
||||
_globals['_STORAGECONFIG']._serialized_start=31
|
||||
_globals['_STORAGECONFIG']._serialized_end=216
|
||||
_globals['_VLMCONFIG']._serialized_start=218
|
||||
_globals['_VLMCONFIG']._serialized_end=308
|
||||
_globals['_READCONFIG']._serialized_start=311
|
||||
_globals['_READCONFIG']._serialized_end=505
|
||||
_globals['_READFROMFILEREQUEST']._serialized_start=508
|
||||
_globals['_READFROMFILEREQUEST']._serialized_end=653
|
||||
_globals['_READFROMURLREQUEST']._serialized_start=655
|
||||
_globals['_READFROMURLREQUEST']._serialized_end=767
|
||||
_globals['_IMAGE']._serialized_start=769
|
||||
_globals['_IMAGE']._serialized_end=874
|
||||
_globals['_CHUNK']._serialized_start=876
|
||||
_globals['_CHUNK']._serialized_end=975
|
||||
_globals['_READRESPONSE']._serialized_start=977
|
||||
_globals['_READRESPONSE']._serialized_end=1040
|
||||
_globals['_DOCREADER']._serialized_start=1116
|
||||
_globals['_DOCREADER']._serialized_end=1275
|
||||
# @@protoc_insertion_point(module_scope)
|
||||
127
docreader/proto/docreader_pb2.pyi
Normal file
@@ -0,0 +1,127 @@
|
||||
from google.protobuf.internal import containers as _containers
|
||||
from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import message as _message
|
||||
from collections.abc import Iterable as _Iterable, Mapping as _Mapping
|
||||
from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
|
||||
|
||||
DESCRIPTOR: _descriptor.FileDescriptor
|
||||
|
||||
class StorageProvider(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
|
||||
__slots__ = ()
|
||||
STORAGE_PROVIDER_UNSPECIFIED: _ClassVar[StorageProvider]
|
||||
COS: _ClassVar[StorageProvider]
|
||||
MINIO: _ClassVar[StorageProvider]
|
||||
STORAGE_PROVIDER_UNSPECIFIED: StorageProvider
|
||||
COS: StorageProvider
|
||||
MINIO: StorageProvider
|
||||
|
||||
class StorageConfig(_message.Message):
|
||||
__slots__ = ("provider", "region", "bucket_name", "access_key_id", "secret_access_key", "app_id", "path_prefix")
|
||||
PROVIDER_FIELD_NUMBER: _ClassVar[int]
|
||||
REGION_FIELD_NUMBER: _ClassVar[int]
|
||||
BUCKET_NAME_FIELD_NUMBER: _ClassVar[int]
|
||||
ACCESS_KEY_ID_FIELD_NUMBER: _ClassVar[int]
|
||||
SECRET_ACCESS_KEY_FIELD_NUMBER: _ClassVar[int]
|
||||
APP_ID_FIELD_NUMBER: _ClassVar[int]
|
||||
PATH_PREFIX_FIELD_NUMBER: _ClassVar[int]
|
||||
provider: StorageProvider
|
||||
region: str
|
||||
bucket_name: str
|
||||
access_key_id: str
|
||||
secret_access_key: str
|
||||
app_id: str
|
||||
path_prefix: str
|
||||
def __init__(self, provider: _Optional[_Union[StorageProvider, str]] = ..., region: _Optional[str] = ..., bucket_name: _Optional[str] = ..., access_key_id: _Optional[str] = ..., secret_access_key: _Optional[str] = ..., app_id: _Optional[str] = ..., path_prefix: _Optional[str] = ...) -> None: ...
|
||||
|
||||
class VLMConfig(_message.Message):
|
||||
__slots__ = ("model_name", "base_url", "api_key", "interface_type")
|
||||
MODEL_NAME_FIELD_NUMBER: _ClassVar[int]
|
||||
BASE_URL_FIELD_NUMBER: _ClassVar[int]
|
||||
API_KEY_FIELD_NUMBER: _ClassVar[int]
|
||||
INTERFACE_TYPE_FIELD_NUMBER: _ClassVar[int]
|
||||
model_name: str
|
||||
base_url: str
|
||||
api_key: str
|
||||
interface_type: str
|
||||
def __init__(self, model_name: _Optional[str] = ..., base_url: _Optional[str] = ..., api_key: _Optional[str] = ..., interface_type: _Optional[str] = ...) -> None: ...
|
||||
|
||||
class ReadConfig(_message.Message):
|
||||
__slots__ = ("chunk_size", "chunk_overlap", "separators", "enable_multimodal", "storage_config", "vlm_config")
|
||||
CHUNK_SIZE_FIELD_NUMBER: _ClassVar[int]
|
||||
CHUNK_OVERLAP_FIELD_NUMBER: _ClassVar[int]
|
||||
SEPARATORS_FIELD_NUMBER: _ClassVar[int]
|
||||
ENABLE_MULTIMODAL_FIELD_NUMBER: _ClassVar[int]
|
||||
STORAGE_CONFIG_FIELD_NUMBER: _ClassVar[int]
|
||||
VLM_CONFIG_FIELD_NUMBER: _ClassVar[int]
|
||||
chunk_size: int
|
||||
chunk_overlap: int
|
||||
separators: _containers.RepeatedScalarFieldContainer[str]
|
||||
enable_multimodal: bool
|
||||
storage_config: StorageConfig
|
||||
vlm_config: VLMConfig
|
||||
def __init__(self, chunk_size: _Optional[int] = ..., chunk_overlap: _Optional[int] = ..., separators: _Optional[_Iterable[str]] = ..., enable_multimodal: bool = ..., storage_config: _Optional[_Union[StorageConfig, _Mapping]] = ..., vlm_config: _Optional[_Union[VLMConfig, _Mapping]] = ...) -> None: ...
|
||||
|
||||
class ReadFromFileRequest(_message.Message):
|
||||
__slots__ = ("file_content", "file_name", "file_type", "read_config", "request_id")
|
||||
FILE_CONTENT_FIELD_NUMBER: _ClassVar[int]
|
||||
FILE_NAME_FIELD_NUMBER: _ClassVar[int]
|
||||
FILE_TYPE_FIELD_NUMBER: _ClassVar[int]
|
||||
READ_CONFIG_FIELD_NUMBER: _ClassVar[int]
|
||||
REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
|
||||
file_content: bytes
|
||||
file_name: str
|
||||
file_type: str
|
||||
read_config: ReadConfig
|
||||
request_id: str
|
||||
def __init__(self, file_content: _Optional[bytes] = ..., file_name: _Optional[str] = ..., file_type: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ...
|
||||
|
||||
class ReadFromURLRequest(_message.Message):
|
||||
__slots__ = ("url", "title", "read_config", "request_id")
|
||||
URL_FIELD_NUMBER: _ClassVar[int]
|
||||
TITLE_FIELD_NUMBER: _ClassVar[int]
|
||||
READ_CONFIG_FIELD_NUMBER: _ClassVar[int]
|
||||
REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
|
||||
url: str
|
||||
title: str
|
||||
read_config: ReadConfig
|
||||
request_id: str
|
||||
def __init__(self, url: _Optional[str] = ..., title: _Optional[str] = ..., read_config: _Optional[_Union[ReadConfig, _Mapping]] = ..., request_id: _Optional[str] = ...) -> None: ...
|
||||
|
||||
class Image(_message.Message):
|
||||
__slots__ = ("url", "caption", "ocr_text", "original_url", "start", "end")
|
||||
URL_FIELD_NUMBER: _ClassVar[int]
|
||||
CAPTION_FIELD_NUMBER: _ClassVar[int]
|
||||
OCR_TEXT_FIELD_NUMBER: _ClassVar[int]
|
||||
ORIGINAL_URL_FIELD_NUMBER: _ClassVar[int]
|
||||
START_FIELD_NUMBER: _ClassVar[int]
|
||||
END_FIELD_NUMBER: _ClassVar[int]
|
||||
url: str
|
||||
caption: str
|
||||
ocr_text: str
|
||||
original_url: str
|
||||
start: int
|
||||
end: int
|
||||
def __init__(self, url: _Optional[str] = ..., caption: _Optional[str] = ..., ocr_text: _Optional[str] = ..., original_url: _Optional[str] = ..., start: _Optional[int] = ..., end: _Optional[int] = ...) -> None: ...
|
||||
|
||||
class Chunk(_message.Message):
|
||||
__slots__ = ("content", "seq", "start", "end", "images")
|
||||
CONTENT_FIELD_NUMBER: _ClassVar[int]
|
||||
SEQ_FIELD_NUMBER: _ClassVar[int]
|
||||
START_FIELD_NUMBER: _ClassVar[int]
|
||||
END_FIELD_NUMBER: _ClassVar[int]
|
||||
IMAGES_FIELD_NUMBER: _ClassVar[int]
|
||||
content: str
|
||||
seq: int
|
||||
start: int
|
||||
end: int
|
||||
images: _containers.RepeatedCompositeFieldContainer[Image]
|
||||
def __init__(self, content: _Optional[str] = ..., seq: _Optional[int] = ..., start: _Optional[int] = ..., end: _Optional[int] = ..., images: _Optional[_Iterable[_Union[Image, _Mapping]]] = ...) -> None: ...
|
||||
|
||||
class ReadResponse(_message.Message):
|
||||
__slots__ = ("chunks", "error")
|
||||
CHUNKS_FIELD_NUMBER: _ClassVar[int]
|
||||
ERROR_FIELD_NUMBER: _ClassVar[int]
|
||||
chunks: _containers.RepeatedCompositeFieldContainer[Chunk]
|
||||
error: str
|
||||
def __init__(self, chunks: _Optional[_Iterable[_Union[Chunk, _Mapping]]] = ..., error: _Optional[str] = ...) -> None: ...
|
||||
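For reference, a minimal client-side sketch built from the generated stubs above (illustrative only, not part of the diff; the `docreader.proto` import path follows the generation script further below, and all field values are placeholders):

```python
# Build a ReadConfig and wrap it in a ReadFromFileRequest, matching the
# message definitions exposed by the generated docreader_pb2 module.
from docreader.proto import docreader_pb2

cfg = docreader_pb2.ReadConfig(
    chunk_size=512,          # matches DEFAULT_CHUNK_SIZE in the splitter
    chunk_overlap=100,       # matches DEFAULT_CHUNK_OVERLAP
    separators=["\n", "。", " "],
    enable_multimodal=False,
)
req = docreader_pb2.ReadFromFileRequest(
    file_content=b"hello world",
    file_name="hello.txt",
    file_type="txt",
    read_config=cfg,
    request_id="req-123",
)
```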
@@ -5,7 +5,7 @@ import warnings
|
||||
|
||||
import docreader_pb2 as docreader__pb2
|
||||
|
||||
GRPC_GENERATED_VERSION = '1.74.0'
|
||||
GRPC_GENERATED_VERSION = '1.76.0'
|
||||
GRPC_VERSION = grpc.__version__
|
||||
_version_not_supported = False
|
||||
|
||||
@@ -18,7 +18,7 @@ except ImportError:
|
||||
if _version_not_supported:
|
||||
raise RuntimeError(
|
||||
f'The grpc package installed is at version {GRPC_VERSION},'
|
||||
+ f' but the generated code in docreader_pb2_grpc.py depends on'
|
||||
+ ' but the generated code in docreader_pb2_grpc.py depends on'
|
||||
+ f' grpcio>={GRPC_GENERATED_VERSION}.'
|
||||
+ f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
|
||||
+ f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
|
||||
38
docreader/pyproject.toml
Normal file
@@ -0,0 +1,38 @@
|
||||
[project]
|
||||
name = "docreader"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10.18"
|
||||
dependencies = [
|
||||
"antiword>=0.1.0",
|
||||
"asyncio>=4.0.0",
|
||||
"beautifulsoup4>=4.14.2",
|
||||
"cos-python-sdk-v5>=1.9.38",
|
||||
"goose3[all]>=3.1.20",
|
||||
"grpcio>=1.76.0",
|
||||
"grpcio-health-checking>=1.76.0",
|
||||
"grpcio-tools>=1.76.0",
|
||||
"lxml>=6.0.2",
|
||||
"markdown>=3.10",
|
||||
"markdownify>=1.2.0",
|
||||
"markitdown[docx,pdf,xls,xlsx]>=0.1.3",
|
||||
"minio>=7.2.18",
|
||||
"mistletoe>=1.5.0",
|
||||
"ollama>=0.6.0",
|
||||
"openai>=2.7.1",
|
||||
"paddleocr>=2.10.0,<3.0.0",
|
||||
"paddlepaddle>=3.0.0,<4.0.0",
|
||||
"pdfplumber>=0.11.7",
|
||||
"pillow>=12.0.0",
|
||||
"playwright>=1.55.0",
|
||||
"protobuf>=6.33.0",
|
||||
"pydantic>=2.12.3",
|
||||
"pypdf>=6.1.3",
|
||||
"pypdf2>=3.0.1",
|
||||
"python-docx>=1.2.0",
|
||||
"requests>=2.32.5",
|
||||
"textract==1.5.0",
|
||||
"trafilatura>=2.0.0",
|
||||
"urllib3>=2.5.0",
|
||||
]
|
||||
@@ -2,13 +2,14 @@
|
||||
set -x
|
||||
|
||||
# 设置目录
|
||||
PROTO_DIR="src/proto"
|
||||
PYTHON_OUT="src/proto"
|
||||
GO_OUT="src/proto"
|
||||
PROTO_DIR="docreader/proto"
|
||||
PYTHON_OUT="docreader/proto"
|
||||
GO_OUT="docreader/proto"
|
||||
|
||||
# 生成Python代码
|
||||
python3 -m grpc_tools.protoc -I${PROTO_DIR} \
|
||||
--python_out=${PYTHON_OUT} \
|
||||
--pyi_out=${PYTHON_OUT} \
|
||||
--grpc_python_out=${PYTHON_OUT} \
|
||||
${PROTO_DIR}/docreader.proto
|
||||
|
||||
@@ -22,10 +23,10 @@ protoc -I${PROTO_DIR} --go_out=${GO_OUT} \
|
||||
# 修复Python导入问题(MacOS兼容版本)
|
||||
if [ "$(uname)" == "Darwin" ]; then
|
||||
# MacOS版本
|
||||
sed -i '' 's/import docreader_pb2/from . import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
|
||||
sed -i '' 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
|
||||
else
|
||||
# Linux版本
|
||||
sed -i 's/import docreader_pb2/from . import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
|
||||
sed -i 's/import docreader_pb2/from docreader.proto import docreader_pb2/g' ${PYTHON_OUT}/docreader_pb2_grpc.py
|
||||
fi
|
||||
|
||||
echo "Proto files generated successfully!"
|
||||
112
docreader/splitter/header_hook.py
Normal file
@@ -0,0 +1,112 @@
|
||||
import re
|
||||
from typing import Callable, Dict, List, Match, Pattern, Union
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class HeaderTrackerHook(BaseModel):
|
||||
"""表头追踪Hook的配置类,支持多种场景的表头识别"""
|
||||
|
||||
start_pattern: Pattern[str] = Field(
|
||||
description="表头开始匹配(正则表达式或字符串)"
|
||||
)
|
||||
end_pattern: Pattern[str] = Field(description="表头结束匹配(正则表达式或字符串)")
|
||||
extract_header_fn: Callable[[Match[str]], str] = Field(
|
||||
default=lambda m: m.group(0),
|
||||
description="从开始匹配结果中提取表头内容的函数(默认取匹配到的整个内容)",
|
||||
)
|
||||
priority: int = Field(default=0, description="优先级(多个配置时,高优先级先匹配)")
|
||||
case_sensitive: bool = Field(
|
||||
default=True, description="是否大小写敏感(仅当传入字符串pattern时生效)"
|
||||
)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
start_pattern: Union[str, Pattern[str]],
|
||||
end_pattern: Union[str, Pattern[str]],
|
||||
**kwargs,
|
||||
):
|
||||
flags = 0 if kwargs.get("case_sensitive", True) else re.IGNORECASE
|
||||
if isinstance(start_pattern, str):
|
||||
start_pattern = re.compile(start_pattern, flags | re.DOTALL)
|
||||
if isinstance(end_pattern, str):
|
||||
end_pattern = re.compile(end_pattern, flags | re.DOTALL)
|
||||
super().__init__(
|
||||
start_pattern=start_pattern,
|
||||
end_pattern=end_pattern,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
# 初始化表头Hook配置(提供默认配置:支持Markdown表格、代码块)
|
||||
DEFAULT_CONFIGS = [
|
||||
# 代码块配置(```开头,```结尾)
|
||||
# HeaderTrackerHook(
|
||||
# # 代码块开始(支持语言指定)
|
||||
# start_pattern=r"^\s*```(\w+).*(?!```$)",
|
||||
# # 代码块结束
|
||||
# end_pattern=r"^\s*```.*$",
|
||||
# extract_header_fn=lambda m: f"```{m.group(1)}" if m.group(1) else "```",
|
||||
# priority=20, # 代码块优先级高于表格
|
||||
# case_sensitive=True,
|
||||
# ),
|
||||
# Markdown表格配置(表头带下划线)
|
||||
HeaderTrackerHook(
|
||||
# 表头行 + 分隔行
|
||||
start_pattern=r"^\s*(?:\|[^|\n]*)+[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|?[\r\n]+$",
|
||||
# 空行或非表格内容
|
||||
end_pattern=r"^\s*$|^\s*[^|\s].*$",
|
||||
priority=15,
|
||||
case_sensitive=False,
|
||||
),
|
||||
]
|
||||
DEFAULT_CONFIGS.sort(key=lambda x: -x.priority)
|
||||
|
||||
|
||||
# 定义Hook状态数据结构
|
||||
class HeaderTracker(BaseModel):
|
||||
"""表头追踪 Hook 的状态类"""
|
||||
|
||||
header_hook_configs: List[HeaderTrackerHook] = Field(default=DEFAULT_CONFIGS)
|
||||
active_headers: Dict[int, str] = Field(default_factory=dict)
|
||||
ended_headers: set[int] = Field(default_factory=set)
|
||||
|
||||
def update(self, split: str) -> Dict[int, str]:
|
||||
"""检测当前split中的表头开始/结束,更新Hook状态"""
|
||||
new_headers: Dict[int, str] = {}
|
||||
|
||||
# 1. 检查是否有表头结束标记
|
||||
for config in self.header_hook_configs:
|
||||
if config.priority in self.active_headers and config.end_pattern.search(
|
||||
split
|
||||
):
|
||||
self.ended_headers.add(config.priority)
|
||||
del self.active_headers[config.priority]
|
||||
|
||||
# 2. 检查是否有新的表头开始标记(只处理未活跃且未结束的)
|
||||
for config in self.header_hook_configs:
|
||||
if (
|
||||
config.priority not in self.active_headers
|
||||
and config.priority not in self.ended_headers
|
||||
):
|
||||
match = config.start_pattern.search(split)
|
||||
if match:
|
||||
header = config.extract_header_fn(match)
|
||||
self.active_headers[config.priority] = header
|
||||
new_headers[config.priority] = header
|
||||
|
||||
# 3. 检查是否所有活跃表头都已结束(清空结束标记)
|
||||
if not self.active_headers:
|
||||
self.ended_headers.clear()
|
||||
|
||||
return new_headers
|
||||
|
||||
def get_headers(self) -> str:
|
||||
"""获取当前所有活跃表头的拼接文本(按优先级排序)"""
|
||||
# 按优先级降序排列表头
|
||||
sorted_headers = sorted(self.active_headers.items(), key=lambda x: -x[0])
|
||||
return (
|
||||
"\n".join([header for _, header in sorted_headers])
|
||||
if sorted_headers
|
||||
else ""
|
||||
)
|
||||
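A minimal usage sketch of the hook above (illustrative only, not part of the diff; it relies on the default Markdown-table config in `DEFAULT_CONFIGS`):

```python
# Feed splits to HeaderTracker one at a time; while a table is active,
# get_headers() returns the header rows so they can be re-attached to
# follow-up chunks by the splitter.
from docreader.splitter.header_hook import HeaderTracker

tracker = HeaderTracker()
splits = [
    "| 姓名 | 年龄 |\n|------|------|\n",  # header + separator row: table starts
    "| 张三 | 25 |\n",                     # table body: header stays active
    "\n这是表格之后的文本。\n",              # non-table content: table ends
]
for s in splits:
    tracker.update(s)
    print(repr(tracker.get_headers()))  # header, header, then ''
```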
311
docreader/splitter/splitter.py
Normal file
@@ -0,0 +1,311 @@
|
||||
"""Token splitter."""
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
import re
|
||||
from typing import Callable, Generic, List, Pattern, Tuple, TypeVar
|
||||
|
||||
from pydantic import BaseModel, Field, PrivateAttr
|
||||
|
||||
from docreader.splitter.header_hook import (
|
||||
HeaderTracker,
|
||||
)
|
||||
from docreader.utils.split import split_by_char, split_by_sep
|
||||
|
||||
DEFAULT_CHUNK_OVERLAP = 100
|
||||
DEFAULT_CHUNK_SIZE = 512
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextSplitter(BaseModel, Generic[T]):
|
||||
chunk_size: int = Field(description="The token chunk size for each chunk.")
|
||||
chunk_overlap: int = Field(
|
||||
description="The token overlap of each chunk when splitting."
|
||||
)
|
||||
separators: List[str] = Field(
|
||||
description="Default separators for splitting into words"
|
||||
)
|
||||
|
||||
# Try to keep the matched characters as a whole.
|
||||
# If it's too long, the content will be further segmented.
|
||||
protected_regex: List[str] = Field(
|
||||
description="Protected regex for splitting into words"
|
||||
)
|
||||
len_function: Callable[[str], int] = Field(description="The length function.")
|
||||
# Header tracking Hook related attributes
|
||||
header_hook: HeaderTracker = Field(default_factory=HeaderTracker, exclude=True)
|
||||
|
||||
_protected_fns: List[Pattern] = PrivateAttr()
|
||||
_split_fns: List[Callable] = PrivateAttr()
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
chunk_size: int = DEFAULT_CHUNK_SIZE,
|
||||
chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
|
||||
separators: List[str] = ["\n", "。", " "],
|
||||
protected_regex: List[str] = [
|
||||
# math formula
|
||||
r"\$\$[\s\S]*?\$\$",
|
||||
# image
|
||||
r"!\[.*?\]\(.*?\)",
|
||||
# link
|
||||
r"\[.*?\]\(.*?\)",
|
||||
# table header
|
||||
r"(?:\|[^|\n]*)+\|[\r\n]+\s*(?:\|\s*:?-{3,}:?\s*)+\|[\r\n]+",
|
||||
# table body
|
||||
r"(?:\|[^|\n]*)+\|[\r\n]+",
|
||||
# code header
|
||||
r"```(?:\w+)[\r\n]+[^\r\n]*",
|
||||
],
|
||||
length_function: Callable[[str], int] = lambda x: len(x),
|
||||
):
|
||||
"""Initialize with parameters."""
|
||||
if chunk_overlap > chunk_size:
|
||||
raise ValueError(
|
||||
f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
|
||||
f"({chunk_size}), should be smaller."
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
separators=separators,
|
||||
protected_regex=protected_regex,
|
||||
len_function=length_function,
|
||||
)
|
||||
self._protected_fns = [re.compile(reg) for reg in protected_regex]
|
||||
self._split_fns = [split_by_sep(sep) for sep in separators] + [split_by_char()]
|
||||
|
||||
def split_text(self, text: str) -> List[Tuple[int, int, str]]:
|
||||
"""Split text into chunks."""
|
||||
if text == "":
|
||||
return []
|
||||
|
||||
splits = self._split(text)
|
||||
protect = self._split_protected(text)
|
||||
splits = self._join(splits, protect)
|
||||
|
||||
assert "".join(splits) == text
|
||||
|
||||
chunks = self._merge(splits)
|
||||
return chunks
|
||||
|
||||
def _split(self, text: str) -> List[str]:
|
||||
"""Break text into splits that are smaller than chunk size.
|
||||
|
||||
NOTE: the splits contain the separators.
|
||||
"""
|
||||
if self.len_function(text) <= self.chunk_size:
|
||||
return [text]
|
||||
|
||||
splits = []
|
||||
for split_fn in self._split_fns:
|
||||
splits = split_fn(text)
|
||||
if len(splits) > 1:
|
||||
break
|
||||
|
||||
new_splits = []
|
||||
for split in splits:
|
||||
split_len = self.len_function(split)
|
||||
if split_len <= self.chunk_size:
|
||||
new_splits.append(split)
|
||||
else:
|
||||
# recursively split
|
||||
new_splits.extend(self._split(split))
|
||||
return new_splits
|
||||
|
||||
def _merge(self, splits: List[str]) -> List[Tuple[int, int, str]]:
|
||||
"""Merge splits into chunks.
|
||||
|
||||
The high-level idea is to keep adding splits to a chunk until we
|
||||
exceed the chunk size, then we start a new chunk with overlap.
|
||||
|
||||
When we start a new chunk, we pop off the first element of the previous
|
||||
chunk until the total length is less than the chunk size.
|
||||
"""
|
||||
chunks: List[Tuple[int, int, str]] = []
|
||||
|
||||
cur_chunk: List[Tuple[int, int, str]] = []
|
||||
|
||||
cur_headers, cur_len = "", 0
|
||||
cur_start, cur_end = 0, 0
|
||||
for split in splits:
|
||||
cur_end = cur_start + len(split)
|
||||
split_len = self.len_function(split)
|
||||
if split_len > self.chunk_size:
|
||||
logger.error(
|
||||
f"Got a split of size {split_len}, ",
|
||||
f"larger than chunk size {self.chunk_size}.",
|
||||
)
|
||||
|
||||
self.header_hook.update(split)
|
||||
cur_headers = self.header_hook.get_headers()
|
||||
cur_headers_len = self.len_function(cur_headers)
|
||||
|
||||
if cur_headers_len > self.chunk_size:
|
||||
logger.error(
|
||||
f"Got headers of size {cur_headers_len}, ",
|
||||
f"larger than chunk size {self.chunk_size}.",
|
||||
)
|
||||
cur_headers, cur_headers_len = "", 0
|
||||
|
||||
# if we exceed the chunk size after adding the new split, then
|
||||
# we need to end the current chunk and start a new one
|
||||
if cur_len + split_len + cur_headers_len > self.chunk_size:
|
||||
# end the previous chunk
|
||||
if len(cur_chunk) > 0:
|
||||
chunks.append(
|
||||
(
|
||||
cur_chunk[0][0],
|
||||
cur_chunk[-1][1],
|
||||
"".join([c[2] for c in cur_chunk]),
|
||||
)
|
||||
)
|
||||
|
||||
# start a new chunk with overlap
|
||||
# keep popping off the first element of the previous chunk until:
|
||||
# 1. the current chunk length is less than chunk overlap
|
||||
# 2. the total length is less than chunk size
|
||||
while cur_chunk and (
|
||||
cur_len > self.chunk_overlap
|
||||
or cur_len + split_len + cur_headers_len > self.chunk_size
|
||||
):
|
||||
# pop off the first element
|
||||
first_chunk = cur_chunk.pop(0)
|
||||
cur_len -= self.len_function(first_chunk[2])
|
||||
|
||||
if (
|
||||
cur_headers
|
||||
and split_len + cur_headers_len < self.chunk_size
|
||||
and cur_headers not in split
|
||||
):
|
||||
cur_chunk.insert(
|
||||
0,
|
||||
(
|
||||
cur_chunk[0][0] if cur_chunk else cur_start,
|
||||
cur_chunk[0][1] if cur_chunk else cur_end,
|
||||
cur_headers,
|
||||
),
|
||||
)
|
||||
cur_len += cur_headers_len
|
||||
|
||||
cur_chunk.append((cur_start, cur_end, split))
|
||||
cur_len += split_len
|
||||
cur_start = cur_end
|
||||
|
||||
# handle the last chunk
|
||||
assert cur_chunk
|
||||
chunks.append(
|
||||
(
|
||||
cur_chunk[0][0],
|
||||
cur_chunk[-1][1],
|
||||
"".join([c[2] for c in cur_chunk]),
|
||||
)
|
||||
)
|
||||
|
||||
return chunks
|
||||
|
||||
def _split_protected(self, text: str) -> List[Tuple[int, str]]:
|
||||
matches = [
|
||||
(match.start(), match.end())
|
||||
for pattern in self._protected_fns
|
||||
for match in pattern.finditer(text)
|
||||
]
|
||||
matches.sort(key=lambda x: (x[0], -x[1]))
|
||||
|
||||
res = []
|
||||
|
||||
def fold(initial: int, current: Tuple[int, int]) -> int:
|
||||
if current[0] >= initial:
|
||||
if current[1] - current[0] < self.chunk_size:
|
||||
res.append((current[0], text[current[0] : current[1]]))
|
||||
else:
|
||||
logger.warning(f"Protected text ignore: {current}")
|
||||
return max(initial, current[1])
|
||||
|
||||
# filter overlapping matches
|
||||
list(itertools.accumulate(matches, fold, initial=-1))
|
||||
return res
|
||||
|
||||
def _join(self, splits: List[str], protect: List[Tuple[int, str]]) -> List[str]:
|
||||
"""
|
||||
Merges and splits elements in splits array based on protected substrings.
|
||||
|
||||
The function processes the input splits to ensure all protected substrings
|
||||
remain as single items. If a protected substring is concatenated with preceding
|
||||
or following content in any split element, it will be separated from
|
||||
the adjacent content. The final result maintains the original order of content
|
||||
while enforcing the integrity of protected substrings.
|
||||
|
||||
Key behaviors:
|
||||
1. Preserves the complete structure of each protected substring
|
||||
2. Separates protected substrings from any adjacent non-protected content
|
||||
3. Maintains the original sequence of all content except for the splits needed to isolate protected substrings
|
||||
4. Handles cases where protected substrings are partially concatenated with adjacent content
|
||||
"""
|
||||
j = 0
|
||||
point, start = 0, 0
|
||||
res = []
|
||||
|
||||
for split in splits:
|
||||
end = start + len(split)
|
||||
|
||||
cur = split[point - start :]
|
||||
while j < len(protect):
|
||||
p_start, p_content = protect[j]
|
||||
p_end = p_start + len(p_content)
|
||||
|
||||
if end <= p_start:
|
||||
break
|
||||
|
||||
if point < p_start:
|
||||
local_end = p_start - point
|
||||
res.append(cur[:local_end])
|
||||
cur = cur[local_end:]
|
||||
point = p_start
|
||||
|
||||
res.append(p_content)
|
||||
j += 1
|
||||
|
||||
if point < p_end:
|
||||
local_start = p_end - point
|
||||
cur = cur[local_start:]
|
||||
point = p_end
|
||||
|
||||
if not cur:
|
||||
break
|
||||
|
||||
if cur:
|
||||
res.append(cur)
|
||||
point = end
|
||||
|
||||
start = end
|
||||
return res
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
s = """
|
||||
这是一些普通文本。
|
||||
|
||||
| 姓名 | 年龄 | 城市 |
|
||||
|------|------|------|
|
||||
| 张三 | 25 | 北京 |
|
||||
| 李四 | 30 | 上海 |
|
||||
| 王五 | 28 | 广州 |
|
||||
| 张三 | 25 | 北京 |
|
||||
| 李四 | 30 | 上海 |
|
||||
| 王五 | 28 | 广州 |
|
||||
|
||||
这是文本结束。
|
||||
|
||||
"""
|
||||
|
||||
sp = TextSplitter(chunk_size=200, chunk_overlap=2)
|
||||
ck = sp.split_text(s)
|
||||
for c in ck:
|
||||
print("------", len(c))
|
||||
print(c)
|
||||
pass
|
||||
|
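A short sketch of how the default `protected_regex` patterns behave (illustrative only, not part of the diff; assumes the `docreader` package is importable):

```python
# Markdown image markup matches one of the protected patterns, so the
# splitter isolates it as a whole split and never cuts it mid-pattern,
# even though chunks themselves are limited to chunk_size characters.
from docreader.splitter.splitter import TextSplitter

text = "前言。" + "这是一张图 ![logo](http://example.com/logo.png) 的说明。" * 5
sp = TextSplitter(chunk_size=60, chunk_overlap=5)
for start, end, chunk in sp.split_text(text):
    # a chunk containing the start of the image markup contains all of it
    assert ("![logo](" not in chunk) or ("logo.png)" in chunk)
    print(start, end, len(chunk))
```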
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.8 KiB |
103
docreader/utils/endecode.py
Normal file
@@ -0,0 +1,103 @@
|
||||
import base64
|
||||
import binascii
|
||||
import io
|
||||
import logging
|
||||
from typing import List, Union
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def decode_image(image: Union[str, bytes, Image.Image, np.ndarray]) -> str:
|
||||
"""Convert image to base64 encoded string
|
||||
|
||||
Args:
|
||||
image: Image file path, bytes, PIL Image object, or numpy array
|
||||
|
||||
Returns:
|
||||
Base64 encoded image string, or empty string if conversion fails
|
||||
"""
|
||||
if isinstance(image, str):
|
||||
# It's a file path
|
||||
with open(image, "rb") as image_file:
|
||||
return base64.b64encode(image_file.read()).decode()
|
||||
|
||||
elif isinstance(image, bytes):
|
||||
# It's bytes data
|
||||
return base64.b64encode(image).decode()
|
||||
|
||||
elif isinstance(image, Image.Image):
|
||||
# It's a PIL Image
|
||||
buffer = io.BytesIO()
|
||||
image.save(buffer, format=image.format)
|
||||
return base64.b64encode(buffer.getvalue()).decode()
|
||||
|
||||
elif isinstance(image, np.ndarray):
|
||||
# It's a numpy array
|
||||
pil_image = Image.fromarray(image)
|
||||
buffer = io.BytesIO()
|
||||
pil_image.save(buffer, format="PNG")
|
||||
return base64.b64encode(buffer.getvalue()).decode()
|
||||
|
||||
raise ValueError(f"Unsupported image type: {type(image)}")
|
||||
|
||||
|
||||
def encode_image(image: str, errors="strict") -> bytes:
|
||||
"""
|
||||
Decode image bytes using base64.
|
||||
|
||||
errors
|
||||
The error handling scheme to use for the handling of decoding errors.
|
||||
The default is 'strict' meaning that decoding errors raise a
|
||||
UnicodeDecodeError. Other possible values are 'ignore' and '????'
|
||||
as well as any other name registered with codecs.register_error that
|
||||
can handle UnicodeDecodeErrors.
|
||||
"""
|
||||
try:
|
||||
image_bytes = base64.b64decode(image)
|
||||
except binascii.Error as e:
|
||||
if errors == "ignore":
|
||||
return b""
|
||||
else:
|
||||
raise e
|
||||
return image_bytes
|
||||
|
||||
|
||||
def encode_bytes(content: str) -> bytes:
|
||||
return content.encode()
|
||||
|
||||
|
||||
def decode_bytes(
|
||||
content: bytes,
|
||||
encodings: List[str] = [
|
||||
"utf-8",
|
||||
"gb18030",
|
||||
"gb2312",
|
||||
"gbk",
|
||||
"big5",
|
||||
"ascii",
|
||||
"latin-1",
|
||||
],
|
||||
) -> str:
|
||||
# Try decoding with each encoding format
|
||||
for encoding in encodings:
|
||||
try:
|
||||
text = content.decode(encoding)
|
||||
logger.debug(f"Decode content with {encoding}: {len(text)} characters")
|
||||
return text
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
|
||||
text = content.decode(encoding="latin-1", errors="replace")
|
||||
logger.warning(
|
||||
"Unable to determine correct encoding, using latin-1 as fallback. "
|
||||
"This may cause character issues."
|
||||
)
|
||||
return text
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
img = "testtest"
|
||||
encode_image(img, errors="ignore")
|
||||
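A small round-trip sketch for the helpers above (illustrative only, not part of the diff; assumes the `docreader` package is importable). Note that, despite the names, `decode_image` produces base64 text and `encode_image` turns it back into bytes:

```python
from docreader.utils.endecode import decode_image, encode_image, decode_bytes

raw = b"\x89PNG\r\n\x1a\nfake-image-bytes"
b64 = decode_image(raw)           # bytes -> base64 string
assert encode_image(b64) == raw   # base64 string -> original bytes

# decode_bytes tries utf-8, gb18030, gbk, ... and only falls back to
# latin-1 with replacement characters when none of them fit.
print(decode_bytes("你好".encode("gb18030")))  # -> 你好
```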
@@ -1,10 +1,10 @@
|
||||
from contextvars import ContextVar
|
||||
import logging
|
||||
import uuid
|
||||
import contextlib
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
import uuid
|
||||
from contextvars import ContextVar
|
||||
from logging import LogRecord
|
||||
from typing import Optional
|
||||
|
||||
# 配置日志
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -26,21 +26,21 @@ def get_request_id() -> Optional[str]:
|
||||
|
||||
class MillisecondFormatter(logging.Formatter):
|
||||
"""自定义日志格式化器,只显示毫秒级时间戳(3位数字)而不是微秒(6位)"""
|
||||
|
||||
|
||||
def formatTime(self, record, datefmt=None):
|
||||
"""重写formatTime方法,将微秒格式化为毫秒"""
|
||||
# 先获取标准的格式化时间
|
||||
result = super().formatTime(record, datefmt)
|
||||
|
||||
|
||||
# 如果使用了包含.%f的格式,则将微秒(6位)截断为毫秒(3位)
|
||||
if datefmt and ".%f" in datefmt:
|
||||
# 格式化的时间字符串应该在最后有6位微秒数
|
||||
parts = result.split('.')
|
||||
parts = result.split(".")
|
||||
if len(parts) > 1 and len(parts[1]) >= 6:
|
||||
# 只保留前3位作为毫秒
|
||||
millis = parts[1][:3]
|
||||
result = f"{parts[0]}.{millis}"
|
||||
|
||||
|
||||
return result
|
||||
|
||||
|
||||
34
docreader/utils/split.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import re
|
||||
from typing import Callable, List
|
||||
|
||||
|
||||
def split_text_keep_separator(text: str, separator: str) -> List[str]:
|
||||
"""Split text with separator and keep the separator at the end of each split."""
|
||||
parts = text.split(separator)
|
||||
result = [separator + s if i > 0 else s for i, s in enumerate(parts)]
|
||||
return [s for s in result if s]
|
||||
|
||||
|
||||
def split_by_sep(sep: str, keep_sep: bool = True) -> Callable[[str], List[str]]:
|
||||
"""Split text by separator."""
|
||||
if keep_sep:
|
||||
return lambda text: split_text_keep_separator(text, sep)
|
||||
else:
|
||||
return lambda text: text.split(sep)
|
||||
|
||||
|
||||
def split_by_char() -> Callable[[str], List[str]]:
|
||||
"""Split text by character."""
|
||||
return lambda text: list(text)
|
||||
|
||||
|
||||
def split_by_regex(regex: str) -> Callable[[str], List[str]]:
|
||||
"""Split text by regex."""
|
||||
pattern = re.compile(f"({regex})")
|
||||
return lambda text: list(filter(None, pattern.split(text)))
|
||||
|
||||
|
||||
def match_by_regex(regex: str) -> Callable[[str], bool]:
|
||||
"""Split text by regex."""
|
||||
pattern = re.compile(regex)
|
||||
return lambda text: bool(pattern.match(text))
|
||||
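A quick sketch of the split helpers (illustrative only, not part of the diff): separators are preserved so that joining the pieces reproduces the original text, which is what the splitter's `_split` step relies on:

```python
from docreader.utils.split import split_by_sep, split_by_regex

parts = split_by_sep("。")("你好。世界。")
print(parts)                         # ['你好', '。世界', '。']
assert "".join(parts) == "你好。世界。"  # lossless split

print(split_by_regex(r"\n+")("a\n\nb"))  # ['a', '\n\n', 'b'], separator kept as an item
```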
77
docreader/utils/tempfile.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TempFileContext:
|
||||
def __init__(self, file_content: bytes, suffix: str):
|
||||
"""
|
||||
Initialize the context
|
||||
:param file_content: Byte data to write to file
|
||||
:param suffix: File suffix
|
||||
"""
|
||||
self.file_content = file_content
|
||||
self.suffix = suffix
|
||||
self.temp_file = None
|
||||
|
||||
def __enter__(self):
|
||||
"""
|
||||
Create file when entering context
|
||||
"""
|
||||
self.temp_file = tempfile.NamedTemporaryFile(suffix=self.suffix, delete=False)
|
||||
self.temp_file.write(self.file_content)
|
||||
self.temp_file.flush()
|
||||
logger.info(
|
||||
f"Saved {self.suffix} content to temporary file: {self.temp_file.name}"
|
||||
)
|
||||
return self.temp_file.name
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
"""
|
||||
Delete file when exiting context
|
||||
"""
|
||||
if self.temp_file:
|
||||
self.temp_file.close()
|
||||
if os.path.exists(self.temp_file.name):
|
||||
os.remove(self.temp_file.name)
|
||||
logger.info(f"File {self.temp_file.name} has been deleted.")
|
||||
# Return False to propagate exception (if any exception occurred)
|
||||
return False
|
||||
|
||||
|
||||
class TempDirContext:
|
||||
def __init__(self):
|
||||
"""
|
||||
Initialize the context
|
||||
"""
|
||||
self.temp_dir = None
|
||||
|
||||
def __enter__(self):
|
||||
"""
|
||||
Create directory when entering context
|
||||
"""
|
||||
self.temp_dir = tempfile.TemporaryDirectory()
|
||||
logger.info(f"Created temporary directory: {self.temp_dir.name}")
|
||||
return self.temp_dir.name
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
"""
|
||||
Delete directory when exiting context
|
||||
"""
|
||||
if self.temp_dir and os.path.exists(self.temp_dir.name):
|
||||
self.temp_dir.cleanup()
|
||||
logger.info(f"Directory {self.temp_dir.name} has been deleted.")
|
||||
# Return False to propagate exception (if any exception occurred)
|
||||
return False
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
example_bytes = b"Hello, this is a test file."
|
||||
file_name = "test_file.txt"
|
||||
|
||||
# Using with statement
|
||||
with TempFileContext(example_bytes, suffix) as temp_file:
|
||||
# File operations can be performed within the context
|
||||
print(f"Does file {file_name} exist: {os.path.exists(file_name)}")
|
||||
3740
docreader/uv.lock
generated
Normal file
50
docs/API.md
@@ -44,9 +44,7 @@ X-Request-ID: unique_request_id
|
||||
|
||||
### 获取 API Key
|
||||
|
||||
获取 API Key 有以下方式:
|
||||
|
||||
**创建租户时获取**:通过 `POST /api/v1/tenants` 接口创建新租户时,响应中会自动返回生成的 API Key。
|
||||
在 web 页面完成账户注册后,请前往账户信息页面获取您的 API Key。
|
||||
|
||||
请妥善保管您的 API Key,避免泄露。API Key 代表您的账户身份,拥有完整的 API 访问权限。
|
||||
|
||||
@@ -336,7 +334,6 @@ curl --location 'http://localhost:8080/api/v1/tenants' \
|
||||
| GET | `/knowledge-bases/:id` | 获取知识库详情 |
|
||||
| PUT | `/knowledge-bases/:id` | 更新知识库 |
|
||||
| DELETE | `/knowledge-bases/:id` | 删除知识库 |
|
||||
| GET | `/knowledge-bases/:id/hybrid-search` | 混合搜索知识库内容 |
|
||||
| POST | `/knowledge-bases/copy` | 拷贝知识库 |
|
||||
|
||||
#### POST `/knowledge-bases` - 创建知识库
|
||||
@@ -658,51 +655,6 @@ curl --location --request DELETE 'http://localhost:8080/api/v1/knowledge-bases/b
|
||||
}
|
||||
```
|
||||
|
||||
#### GET `/knowledge-bases/:id/hybrid-search` - 混合搜索知识库内容
|
||||
|
||||
**请求**:
|
||||
|
||||
```curl
|
||||
curl --location --request GET 'http://localhost:8080/api/v1/knowledge-bases/kb-00000001/hybrid-search' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'X-API-Key: sk-vQHV2NZI_LK5W7wHQvH3yGYExX8YnhaHwZipUYbiZKCYJbBQ' \
|
||||
--data '{
|
||||
"query_text": "彗星",
|
||||
"vector_threshold": 0.1,
|
||||
"keyword_threshold": 0.1,
|
||||
"match_count": 1
|
||||
}'
|
||||
```
|
||||
|
||||
**响应**:
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"id": "7d955251-3f79-4fd5-a6aa-02f81e044091",
|
||||
"content": "有几位后来xxxxx",
|
||||
"knowledge_id": "a6790b93-4700-4676-bd48-0d4804e1456b",
|
||||
"chunk_index": 3,
|
||||
"knowledge_title": "彗星.txt",
|
||||
"start_at": 2287,
|
||||
"end_at": 2760,
|
||||
"seq": 3,
|
||||
"score": 0.7402352891601821,
|
||||
"match_type": 2,
|
||||
"sub_chunk_id": null,
|
||||
"metadata": {},
|
||||
"chunk_type": "text",
|
||||
"parent_chunk_id": "",
|
||||
"image_info": "",
|
||||
"knowledge_filename": "彗星.txt",
|
||||
"knowledge_source": ""
|
||||
}
|
||||
],
|
||||
"success": true
|
||||
}
|
||||
```
|
||||
|
||||
<div align="right"><a href="#weknora-api-文档">返回顶部 ↑</a></div>
|
||||
|
||||
### 知识管理API
|
||||
|
||||
28
docs/KnowledgeGraph.md
Normal file
@@ -0,0 +1,28 @@
|
||||
# WeKnora 知识图谱
|
||||
|
||||
## 快速开始
|
||||
|
||||
- .env 配置相关环境变量
|
||||
- 启用 Neo4j: `NEO4J_ENABLE=true`
|
||||
- Neo4j URI: `NEO4J_URI=bolt://neo4j:7687`
|
||||
- Neo4j 用户名: `NEO4J_USERNAME=neo4j`
|
||||
- Neo4j 密码: `NEO4J_PASSWORD=password`
|
||||
|
||||
- 启动 Neo4j
|
||||
```bash
|
||||
docker-compose --profile neo4j up -d
|
||||
```
|
||||
|
||||
- 在知识库设置页面启用实体和关系提取,并根据提示配置相关内容
|
||||
|
||||
## 生成图谱
|
||||
|
||||
上传任意文档后,系统会自动提取实体和关系,并生成对应的知识图谱。
|
||||
|
||||

|
||||
|
||||
## 查看图谱
|
||||
|
||||
登录 `http://localhost:7474`,执行 `match (n) return (n)` 即可查看生成的知识图谱。
|
||||
|
||||
在对话时,系统会自动查询知识图谱,并获取相关知识。
|
||||
@@ -2,11 +2,7 @@
|
||||
|
||||
## 1. 如何查看日志?
|
||||
```bash
|
||||
# 查看 主服务 日志
|
||||
docker exec -it WeKnora-app tail -f /var/log/WeKnora.log
|
||||
|
||||
# 查看 文档解析模块 日志
|
||||
docker exec -it WeKnora-docreader tail -f /var/log/docreader.log
|
||||
docker compose logs -f app docreader postgres
|
||||
```
|
||||
|
||||
## 2. 如何启动和停止服务?
|
||||
|
||||
BIN
docs/images/graph3.png
Normal file
|
After Width: | Height: | Size: 339 KiB |
BIN
docs/images/pipeline.png
Normal file
|
After Width: | Height: | Size: 504 KiB |
@@ -2,6 +2,16 @@ server {
|
||||
listen 80;
|
||||
server_name localhost;
|
||||
client_max_body_size 50M;
|
||||
|
||||
# 安全头配置
|
||||
add_header X-Frame-Options "SAMEORIGIN" always;
|
||||
add_header X-Content-Type-Options "nosniff" always;
|
||||
add_header X-XSS-Protection "1; mode=block" always;
|
||||
add_header Referrer-Policy "strict-origin-when-cross-origin" always;
|
||||
|
||||
# 错误日志配置
|
||||
error_log /var/log/nginx/error.log warn;
|
||||
access_log /var/log/nginx/access.log;
|
||||
|
||||
# 前端静态文件
|
||||
location / {
|
||||
@@ -18,6 +28,12 @@ server {
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
|
||||
# 连接和重试配置
|
||||
proxy_connect_timeout 30s; # 连接超时时间
|
||||
proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
|
||||
proxy_next_upstream_tries 3; # 重试次数
|
||||
proxy_next_upstream_timeout 30s; # 重试超时时间
|
||||
|
||||
# SSE 相关配置
|
||||
proxy_http_version 1.1; # 使用 HTTP/1.1
|
||||
proxy_set_header Connection ""; # 禁用 Connection: close,保持连接打开
|
||||
|
||||
50
frontend/package-lock.json
generated
@@ -1,18 +1,21 @@
|
||||
{
|
||||
"name": "knowledage-base",
|
||||
"version": "0.0.0",
|
||||
"version": "0.1.3",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "knowledage-base",
|
||||
"version": "0.0.0",
|
||||
"version": "0.1.3",
|
||||
"dependencies": {
|
||||
"@microsoft/fetch-event-source": "^2.0.1",
|
||||
"@types/dompurify": "^3.0.5",
|
||||
"axios": "^1.8.4",
|
||||
"dompurify": "^3.2.6",
|
||||
"marked": "^5.1.2",
|
||||
"pagefind": "^1.1.1",
|
||||
"pinia": "^3.0.1",
|
||||
"tdesign-icons-vue-next": "^0.4.1",
|
||||
"tdesign-vue-next": "^1.11.5",
|
||||
"vue": "^3.5.13",
|
||||
"vue-router": "^4.5.0",
|
||||
@@ -1274,6 +1277,15 @@
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/dompurify": {
|
||||
"version": "3.0.5",
|
||||
"resolved": "https://mirrors.tencent.com/npm/@types/dompurify/-/dompurify-3.0.5.tgz",
|
||||
"integrity": "sha512-1Wg0g3BtQF7sSb27fJQAKck1HECM6zV1EB66j8JH9i3LCjYabJa0FSdiSgsD5K/RbrsR0SiraKacLB+T8ZVYAg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/trusted-types": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/eslint": {
|
||||
"version": "9.6.1",
|
||||
"resolved": "https://mirrors.tencent.com/npm/@types/eslint/-/eslint-9.6.1.tgz",
|
||||
@@ -1346,6 +1358,12 @@
|
||||
"resolved": "https://mirrors.tencent.com/npm/@types/tinycolor2/-/tinycolor2-1.4.6.tgz",
|
||||
"integrity": "sha512-iEN8J0BoMnsWBqjVbWH/c0G0Hh7O21lpR2/+PrvAVgWdzL7eexIFm4JN/Wn10PTcmNdtS6U67r499mlWMXOxNw=="
|
||||
},
|
||||
"node_modules/@types/trusted-types": {
|
||||
"version": "2.0.7",
|
||||
"resolved": "https://mirrors.tencent.com/npm/@types/trusted-types/-/trusted-types-2.0.7.tgz",
|
||||
"integrity": "sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/validator": {
|
||||
"version": "13.15.2",
|
||||
"resolved": "https://mirrors.tencent.com/npm/@types/validator/-/validator-13.15.2.tgz",
|
||||
@@ -2121,6 +2139,15 @@
|
||||
"node": ">=0.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/dompurify": {
|
||||
"version": "3.2.6",
|
||||
"resolved": "https://mirrors.tencent.com/npm/dompurify/-/dompurify-3.2.6.tgz",
|
||||
"integrity": "sha512-/2GogDQlohXPZe6D6NOgQvXLPSYBqIWMnZ8zzOhn09REE4eyAzb+Hed3jhoM9OkuaJ8P6ZGTTVWQKAi8ieIzfQ==",
|
||||
"license": "(MPL-2.0 OR Apache-2.0)",
|
||||
"optionalDependencies": {
|
||||
"@types/trusted-types": "^2.0.7"
|
||||
}
|
||||
},
|
||||
"node_modules/dunder-proto": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://mirrors.tencent.com/npm/dunder-proto/-/dunder-proto-1.0.1.tgz",
|
||||
@@ -3374,9 +3401,10 @@
|
||||
}
|
||||
},
|
||||
"node_modules/tdesign-icons-vue-next": {
|
||||
"version": "0.3.6",
|
||||
"resolved": "https://mirrors.tencent.com/npm/tdesign-icons-vue-next/-/tdesign-icons-vue-next-0.3.6.tgz",
|
||||
"integrity": "sha512-X9u90dBv8tPhfpguUyx+BzF8CU2ef2L4RXOO7MYOj1ufHCHwBXTF8L3GPfq6KZd/2u4vMLYAA8lGURn4PZZICw==",
|
||||
"version": "0.4.1",
|
||||
"resolved": "https://mirrors.tencent.com/npm/tdesign-icons-vue-next/-/tdesign-icons-vue-next-0.4.1.tgz",
|
||||
"integrity": "sha512-uDPuTLRORnGcTyVGNoentNaK4V+ZcBmhYwcY3KqDaQQ5rrPeLMxu0ZVmgOEf0JtF2QZiqAxY7vodNEiLUdoRKA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.16.3"
|
||||
},
|
||||
@@ -3410,6 +3438,18 @@
|
||||
"vue": ">=3.1.0"
|
||||
}
|
||||
},
|
||||
"node_modules/tdesign-vue-next/node_modules/tdesign-icons-vue-next": {
|
||||
"version": "0.3.7",
|
||||
"resolved": "https://mirrors.tencent.com/npm/tdesign-icons-vue-next/-/tdesign-icons-vue-next-0.3.7.tgz",
|
||||
"integrity": "sha512-Q5ebVty/TCqhBa0l/17kkhjC0pBAOGvn7C35MAt1xS+johKVM9QEDOy9R6XEl332AiwQ37MwqioczqjYC30ckw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.16.3"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"vue": "^3.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/terser": {
|
||||
"version": "5.43.1",
|
||||
"resolved": "https://mirrors.tencent.com/npm/terser/-/terser-5.43.1.tgz",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "knowledage-base",
|
||||
"version": "0.1.0",
|
||||
"version": "0.1.3",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
@@ -13,12 +13,16 @@
|
||||
},
|
||||
"dependencies": {
|
||||
"@microsoft/fetch-event-source": "^2.0.1",
|
||||
"@types/dompurify": "^3.0.5",
|
||||
"axios": "^1.8.4",
|
||||
"dompurify": "^3.2.6",
|
||||
"marked": "^5.1.2",
|
||||
"pagefind": "^1.1.1",
|
||||
"pinia": "^3.0.1",
|
||||
"tdesign-icons-vue-next": "^0.4.1",
|
||||
"tdesign-vue-next": "^1.11.5",
|
||||
"vue": "^3.5.13",
|
||||
"vue-i18n": "^9.9.0",
|
||||
"vue-router": "^4.5.0",
|
||||
"webpack": "^5.94.0"
|
||||
},
|
||||
|
||||
@@ -1,9 +1,31 @@
|
||||
<script setup lang="ts">
|
||||
import { computed } from 'vue'
|
||||
import { useI18n } from 'vue-i18n'
|
||||
import { ConfigProvider } from 'tdesign-vue-next'
|
||||
import enUS from 'tdesign-vue-next/es/locale/en_US'
|
||||
import zhCN from 'tdesign-vue-next/es/locale/zh_CN'
|
||||
import ruRU from 'tdesign-vue-next/es/locale/ru_RU'
|
||||
|
||||
const { locale } = useI18n()
|
||||
|
||||
const tdesignLocale = computed(() => {
|
||||
switch (locale.value) {
|
||||
case 'en-US':
|
||||
return enUS
|
||||
case 'ru-RU':
|
||||
return ruRU
|
||||
case 'zh-CN':
|
||||
default:
|
||||
return zhCN
|
||||
}
|
||||
})
|
||||
</script>
|
||||
<template>
|
||||
<div id="app">
|
||||
<RouterView />
|
||||
</div>
|
||||
<ConfigProvider :global-config="tdesignLocale">
|
||||
<div id="app">
|
||||
<RouterView />
|
||||
</div>
|
||||
</ConfigProvider>
|
||||
</template>
|
||||
<style>
|
||||
body,
|
||||
|
||||
239
frontend/src/api/auth/index.ts
Normal file
@@ -0,0 +1,239 @@
|
||||
import { post, get, put } from '@/utils/request'
|
||||
|
||||
// 用户登录接口
|
||||
export interface LoginRequest {
|
||||
email: string
|
||||
password: string
|
||||
}
|
||||
|
||||
export interface LoginResponse {
|
||||
success: boolean
|
||||
message?: string
|
||||
user?: {
|
||||
id: string
|
||||
username: string
|
||||
email: string
|
||||
avatar?: string
|
||||
tenant_id: number
|
||||
is_active: boolean
|
||||
created_at: string
|
||||
updated_at: string
|
||||
}
|
||||
tenant?: {
|
||||
id: number
|
||||
name: string
|
||||
description: string
|
||||
api_key: string
|
||||
status: string
|
||||
business: string
|
||||
storage_quota: number
|
||||
storage_used: number
|
||||
created_at: string
|
||||
updated_at: string
|
||||
}
|
||||
token?: string
|
||||
refresh_token?: string
|
||||
}
|
||||
|
||||
// 用户注册接口
|
||||
export interface RegisterRequest {
|
||||
username: string
|
||||
email: string
|
||||
password: string
|
||||
}
|
||||
|
||||
export interface RegisterResponse {
|
||||
success: boolean
|
||||
message?: string
|
||||
data?: {
|
||||
user: {
|
||||
id: string
|
||||
username: string
|
||||
email: string
|
||||
}
|
||||
tenant: {
|
||||
id: string
|
||||
name: string
|
||||
api_key: string
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 用户信息接口
|
||||
export interface UserInfo {
|
||||
id: string
|
||||
username: string
|
||||
email: string
|
||||
avatar?: string
|
||||
tenant_id: string
|
||||
created_at: string
|
||||
updated_at: string
|
||||
}
|
||||
|
||||
// 租户信息接口
|
||||
export interface TenantInfo {
|
||||
id: string
|
||||
name: string
|
||||
description?: string
|
||||
api_key: string
|
||||
status?: string
|
||||
business?: string
|
||||
owner_id: string
|
||||
storage_quota?: number
|
||||
storage_used?: number
|
||||
created_at: string
|
||||
updated_at: string
|
||||
knowledge_bases?: KnowledgeBaseInfo[]
|
||||
}
|
||||
|
||||
// 知识库信息接口
|
||||
export interface KnowledgeBaseInfo {
|
||||
id: string
|
||||
name: string
|
||||
description: string
|
||||
tenant_id: string
|
||||
created_at: string
|
||||
updated_at: string
|
||||
document_count?: number
|
||||
chunk_count?: number
|
||||
}
|
||||
|
||||
// 模型信息接口
|
||||
export interface ModelInfo {
|
||||
id: string
|
||||
name: string
|
||||
type: string
|
||||
source: string
|
||||
description?: string
|
||||
is_default?: boolean
|
||||
created_at: string
|
||||
updated_at: string
|
||||
}
|
||||
|
||||
/**
|
||||
* 用户登录
|
||||
*/
|
||||
export async function login(data: LoginRequest): Promise<LoginResponse> {
|
||||
try {
|
||||
const response = await post('/api/v1/auth/login', data)
|
||||
return response as unknown as LoginResponse
|
||||
} catch (error: any) {
|
||||
return {
|
||||
success: false,
|
||||
message: error.message || '登录失败'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 用户注册
|
||||
*/
|
||||
export async function register(data: RegisterRequest): Promise<RegisterResponse> {
|
||||
try {
|
||||
const response = await post('/api/v1/auth/register', data)
|
||||
return response as unknown as RegisterResponse
|
||||
} catch (error: any) {
|
||||
return {
|
||||
success: false,
|
||||
message: error.message || '注册失败'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取当前用户信息
|
||||
*/
|
||||
export async function getCurrentUser(): Promise<{ success: boolean; data?: { user: UserInfo; tenant: TenantInfo }; message?: string }> {
|
||||
try {
|
||||
const response = await get('/api/v1/auth/me')
|
||||
return response as unknown as { success: boolean; data?: { user: UserInfo; tenant: TenantInfo }; message?: string }
|
||||
} catch (error: any) {
|
||||
return {
|
||||
success: false,
|
||||
message: error.message || '获取用户信息失败'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取当前租户信息
|
||||
*/
|
||||
export async function getCurrentTenant(): Promise<{ success: boolean; data?: TenantInfo; message?: string }> {
|
||||
try {
|
||||
const response = await get('/api/v1/auth/tenant')
|
||||
return response as unknown as { success: boolean; data?: TenantInfo; message?: string }
|
||||
} catch (error: any) {
|
||||
return {
|
||||
success: false,
|
||||
message: error.message || '获取租户信息失败'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 刷新Token
|
||||
*/
|
||||
export async function refreshToken(refreshToken: string): Promise<{ success: boolean; data?: { token: string; refreshToken: string }; message?: string }> {
|
||||
try {
|
||||
const response: any = await post('/api/v1/auth/refresh', { refreshToken })
|
||||
if (response && response.success) {
|
||||
if (response.access_token || response.refresh_token) {
|
||||
return {
|
||||
success: true,
|
||||
data: {
|
||||
token: response.access_token,
|
||||
refreshToken: response.refresh_token,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 其他情况直接返回原始消息
|
||||
return {
|
||||
success: false,
|
||||
message: response?.message || '刷新Token失败'
|
||||
}
|
||||
} catch (error: any) {
|
||||
return {
|
||||
success: false,
|
||||
message: error.message || '刷新Token失败'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 用户登出
|
||||
*/
|
||||
export async function logout(): Promise<{ success: boolean; message?: string }> {
|
||||
try {
|
||||
await post('/api/v1/auth/logout', {})
|
||||
return {
|
||||
success: true
|
||||
}
|
||||
} catch (error: any) {
|
||||
return {
|
||||
success: false,
|
||||
message: error.message || '登出失败'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证Token有效性
|
||||
*/
|
||||
export async function validateToken(): Promise<{ success: boolean; valid?: boolean; message?: string }> {
|
||||
try {
|
||||
const response = await get('/api/v1/auth/validate')
|
||||
return response as unknown as { success: boolean; valid?: boolean; message?: string }
|
||||
} catch (error: any) {
|
||||
return {
|
||||
success: false,
|
||||
valid: false,
|
||||
message: error.message || 'Token验证失败'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,54 +1,24 @@
|
||||
import { get, post, put, del, postChat } from "../../utils/request";
|
||||
import { loadTestData } from "../test-data";
|
||||
|
||||
// 从localStorage获取设置
|
||||
function getSettings() {
|
||||
const settingsStr = localStorage.getItem("WeKnora_settings");
|
||||
if (settingsStr) {
|
||||
try {
|
||||
const settings = JSON.parse(settingsStr);
|
||||
if (settings.apiKey && settings.endpoint) {
|
||||
return settings;
|
||||
}
|
||||
} catch (e) {
|
||||
console.error("解析设置失败:", e);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// 根据是否有设置决定是否需要加载测试数据
|
||||
async function ensureConfigured() {
|
||||
const settings = getSettings();
|
||||
// 如果没有设置APIKey和Endpoint,则加载测试数据
|
||||
if (!settings) {
|
||||
await loadTestData();
|
||||
}
|
||||
}
|
||||
|
||||
export async function createSessions(data = {}) {
|
||||
await ensureConfigured();
|
||||
return post("/api/v1/sessions", data);
|
||||
}
|
||||
|
||||
export async function getSessionsList(page: number, page_size: number) {
|
||||
await ensureConfigured();
|
||||
return get(`/api/v1/sessions?page=${page}&page_size=${page_size}`);
|
||||
}
|
||||
|
||||
export async function generateSessionsTitle(session_id: string, data: any) {
|
||||
await ensureConfigured();
|
||||
return post(`/api/v1/sessions/${session_id}/generate_title`, data);
|
||||
}
|
||||
|
||||
export async function knowledgeChat(data: { session_id: string; query: string; }) {
|
||||
await ensureConfigured();
|
||||
return postChat(`/api/v1/knowledge-chat/${data.session_id}`, { query: data.query });
|
||||
}
|
||||
|
||||
export async function getMessageList(data: { session_id: string; limit: number, created_at: string }) {
|
||||
await ensureConfigured();
|
||||
|
||||
if (data.created_at) {
|
||||
return get(`/api/v1/messages/${data.session_id}/load?before_time=${encodeURIComponent(data.created_at)}&limit=${data.limit}`);
|
||||
} else {
|
||||
@@ -57,6 +27,5 @@ export async function getMessageList(data: { session_id: string; limit: number,
|
||||
}
|
||||
|
||||
export async function delSession(session_id: string) {
|
||||
await ensureConfigured();
|
||||
return del(`/api/v1/sessions/${session_id}`);
|
||||
}
|
||||
@@ -1,22 +1,8 @@
|
||||
import { fetchEventSource } from '@microsoft/fetch-event-source'
|
||||
import { ref, type Ref, onUnmounted, nextTick } from 'vue'
|
||||
import { generateRandomString } from '@/utils/index';
|
||||
import { getTestData } from '@/utils/request';
|
||||
import { loadTestData } from '@/api/test-data';
|
||||
|
||||
// 从localStorage获取设置
|
||||
function getSettings() {
|
||||
const settingsStr = localStorage.getItem("WeKnora_settings");
|
||||
if (settingsStr) {
|
||||
try {
|
||||
const settings = JSON.parse(settingsStr);
|
||||
return settings;
|
||||
} catch (e) {
|
||||
console.error("解析设置失败:", e);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
interface StreamOptions {
|
||||
// 请求方法 (默认POST)
|
||||
@@ -49,26 +35,15 @@ export function useStream() {
|
||||
isStreaming.value = true;
|
||||
isLoading.value = true;
|
||||
|
||||
// 获取设置信息
|
||||
const settings = getSettings();
|
||||
let apiUrl = '';
|
||||
let apiKey = '';
|
||||
|
||||
// 如果有设置信息,优先使用设置信息
|
||||
if (settings && settings.endpoint && settings.apiKey) {
|
||||
apiUrl = settings.endpoint;
|
||||
apiKey = settings.apiKey;
|
||||
} else {
|
||||
// 否则加载测试数据
|
||||
await loadTestData();
|
||||
const testData = getTestData();
|
||||
if (!testData) {
|
||||
error.value = "测试数据未初始化,无法进行聊天";
|
||||
stopStream();
|
||||
return;
|
||||
}
|
||||
apiUrl = import.meta.env.VITE_IS_DOCKER ? "" : "http://localhost:8080";
|
||||
apiKey = testData.tenant.api_key;
|
||||
// 获取API配置
|
||||
const apiUrl = import.meta.env.VITE_IS_DOCKER ? "" : "http://localhost:8080";
|
||||
|
||||
// 获取JWT Token
|
||||
const token = localStorage.getItem('weknora_token');
|
||||
if (!token) {
|
||||
error.value = "未找到登录令牌,请重新登录";
|
||||
stopStream();
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
@@ -80,7 +55,7 @@ export function useStream() {
|
||||
method: params.method,
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
"X-API-Key": apiKey,
|
||||
"Authorization": `Bearer ${token}`,
|
||||
"X-Request-ID": `${generateRandomString(12)}`,
|
||||
},
|
||||
body:
|
||||
|
||||
@@ -19,6 +19,7 @@ export interface InitializationConfig {
|
||||
modelName: string;
|
||||
baseUrl: string;
|
||||
apiKey?: string;
|
||||
enabled: boolean;
|
||||
};
|
||||
multimodal: {
|
||||
enabled: boolean;
|
||||
@@ -49,6 +50,13 @@ export interface InitializationConfig {
|
||||
};
|
||||
// Frontend-only hint for storage selection UI
|
||||
storageType?: 'cos' | 'minio';
|
||||
nodeExtract: {
|
||||
enabled: boolean,
|
||||
text: string,
|
||||
tags: string[],
|
||||
nodes: Node[],
|
||||
relations: Relation[]
|
||||
}
|
||||
}
|
||||
|
||||
// 下载任务状态类型
|
||||
@@ -62,34 +70,18 @@ export interface DownloadTask {
|
||||
endTime?: string;
|
||||
}
|
||||
|
||||
// 系统初始化状态检查
|
||||
export function checkInitializationStatus(): Promise<{ initialized: boolean }> {
|
||||
// 根据知识库ID执行配置更新
|
||||
export function initializeSystemByKB(kbId: string, config: InitializationConfig): Promise<any> {
|
||||
return new Promise((resolve, reject) => {
|
||||
get('/api/v1/initialization/status')
|
||||
console.log('开始知识库配置更新...', kbId, config);
|
||||
post(`/api/v1/initialization/initialize/${kbId}`, config)
|
||||
.then((response: any) => {
|
||||
resolve(response.data || { initialized: false });
|
||||
})
|
||||
.catch((error: any) => {
|
||||
console.warn('检查初始化状态失败,假设需要初始化:', error);
|
||||
resolve({ initialized: false });
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// 执行系统初始化
|
||||
export function initializeSystem(config: InitializationConfig): Promise<any> {
|
||||
return new Promise((resolve, reject) => {
|
||||
console.log('开始系统初始化...', config);
|
||||
post('/api/v1/initialization/initialize', config)
|
||||
.then((response: any) => {
|
||||
console.log('系统初始化完成', response);
|
||||
// 设置本地初始化状态标记
|
||||
localStorage.setItem('system_initialized', 'true');
|
||||
console.log('知识库配置更新完成', response);
|
||||
resolve(response);
|
||||
})
|
||||
.catch((error: any) => {
|
||||
console.error('系统初始化失败:', error);
|
||||
reject(error);
|
||||
console.error('知识库配置更新失败:', error);
|
||||
reject(error.error || error);
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -178,15 +170,15 @@ export function listDownloadTasks(): Promise<DownloadTask[]> {
|
||||
});
|
||||
}
|
||||
|
||||
// 获取当前系统配置
|
||||
export function getCurrentConfig(): Promise<InitializationConfig & { hasFiles: boolean }> {
|
||||
|
||||
export function getCurrentConfigByKB(kbId: string): Promise<InitializationConfig & { hasFiles: boolean }> {
|
||||
return new Promise((resolve, reject) => {
|
||||
get('/api/v1/initialization/config')
|
||||
get(`/api/v1/initialization/config/${kbId}`)
|
||||
.then((response: any) => {
|
||||
resolve(response.data || {});
|
||||
})
|
||||
.catch((error: any) => {
|
||||
console.error('获取当前配置失败:', error);
|
||||
console.error('获取知识库配置失败:', error);
|
||||
reject(error);
|
||||
});
|
||||
});
|
||||
@@ -311,9 +303,17 @@ export function testMultimodalFunction(testData: {
|
||||
formData.append('chunk_overlap', testData.chunk_overlap.toString());
|
||||
formData.append('separators', JSON.stringify(testData.separators));
|
||||
|
||||
// 获取鉴权Token
|
||||
const token = localStorage.getItem('weknora_token');
|
||||
const headers: Record<string, string> = {};
|
||||
if (token) {
|
||||
headers['Authorization'] = `Bearer ${token}`;
|
||||
}
|
||||
|
||||
// 使用原生fetch因为需要发送FormData
|
||||
fetch('/api/v1/initialization/multimodal/test', {
|
||||
method: 'POST',
|
||||
headers,
|
||||
body: formData
|
||||
})
|
||||
.then(response => response.json())
|
||||
@@ -329,4 +329,93 @@ export function testMultimodalFunction(testData: {
|
||||
reject(error);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// 文本内容关系提取接口
|
||||
export interface TextRelationExtractionRequest {
|
||||
text: string;
|
||||
tags: string[];
|
||||
llmConfig: LLMConfig;
|
||||
}
|
||||
|
||||
export interface Node {
|
||||
name: string;
|
||||
attributes: string[];
|
||||
}
|
||||
|
||||
export interface Relation {
|
||||
node1: string;
|
||||
node2: string;
|
||||
type: string;
|
||||
}
|
||||
|
||||
export interface LLMConfig {
|
||||
source: 'local' | 'remote';
|
||||
modelName: string;
|
||||
baseUrl: string;
|
||||
apiKey: string;
|
||||
}
|
||||
|
||||
export interface TextRelationExtractionResponse {
|
||||
nodes: Node[];
|
||||
relations: Relation[];
|
||||
}
|
||||
|
||||
// 文本内容关系提取
|
||||
export function extractTextRelations(request: TextRelationExtractionRequest): Promise<TextRelationExtractionResponse> {
|
||||
return new Promise((resolve, reject) => {
|
||||
post('/api/v1/initialization/extract/text-relation', request)
|
||||
.then((response: any) => {
|
||||
resolve(response.data || { nodes: [], relations: [] });
|
||||
})
|
||||
.catch((error: any) => {
|
||||
console.error('文本内容关系提取失败:', error);
|
||||
reject(error);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
export interface FabriTextRequest {
|
||||
tags: string[];
|
||||
llmConfig: LLMConfig;
|
||||
}
|
||||
|
||||
export interface FabriTextResponse {
|
||||
text: string;
|
||||
}
|
||||
|
||||
// 文本内容生成
|
||||
export function fabriText(request: FabriTextRequest): Promise<FabriTextResponse> {
|
||||
return new Promise((resolve, reject) => {
|
||||
post('/api/v1/initialization/extract/fabri-text', request)
|
||||
.then((response: any) => {
|
||||
resolve(response.data || { text: '' });
|
||||
})
|
||||
.catch((error: any) => {
|
||||
console.error('文本内容生成失败:', error);
|
||||
reject(error);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
export interface FabriTagRequest {
|
||||
llmConfig: LLMConfig;
|
||||
}
|
||||
|
||||
export interface FabriTagResponse {
|
||||
tags: string[];
|
||||
}
|
||||
|
||||
// 标签生成
|
||||
export function fabriTag(request: FabriTagRequest): Promise<FabriTagResponse> {
|
||||
return new Promise((resolve, reject) => {
|
||||
post('/api/v1/initialization/extract/fabri-tag', request)
|
||||
.then((response: any) => {
|
||||
resolve(response.data || { tags: [] as string[] });
|
||||
})
|
||||
.catch((error: any) => {
|
||||
console.error('标签生成失败:', error);
|
||||
reject(error);
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -1,62 +1,55 @@
|
||||
import { get, post, put, del, postUpload, getDown, getTestData } from "../../utils/request";
|
||||
import { loadTestData } from "../test-data";
|
||||
import { get, post, put, del, postUpload, getDown } from "../../utils/request";
|
||||
|
||||
// 获取知识库ID(优先从设置中获取)
|
||||
async function getKnowledgeBaseID() {
|
||||
// 从localStorage获取设置中的知识库ID
|
||||
const settingsStr = localStorage.getItem("WeKnora_settings");
|
||||
let knowledgeBaseId = "";
|
||||
|
||||
if (settingsStr) {
|
||||
try {
|
||||
const settings = JSON.parse(settingsStr);
|
||||
if (settings.knowledgeBaseId) {
|
||||
return settings.knowledgeBaseId;
|
||||
}
|
||||
} catch (e) {
|
||||
console.error("解析设置失败:", e);
|
||||
}
|
||||
}
|
||||
|
||||
// 如果设置中没有知识库ID,则使用测试数据
|
||||
await loadTestData();
|
||||
|
||||
const testData = getTestData();
|
||||
if (!testData || testData.knowledge_bases.length === 0) {
|
||||
console.error("测试数据未初始化或不包含知识库");
|
||||
throw new Error("测试数据未初始化或不包含知识库");
|
||||
}
|
||||
return testData.knowledge_bases[0].id;
|
||||
// 知识库管理 API(列表、创建、获取、更新、删除、复制)
|
||||
export function listKnowledgeBases() {
|
||||
return get(`/api/v1/knowledge-bases`);
|
||||
}
|
||||
|
||||
export async function uploadKnowledgeBase(data = {}) {
|
||||
const kbId = await getKnowledgeBaseID();
|
||||
export function createKnowledgeBase(data: { name: string; description?: string; chunking_config?: any }) {
|
||||
return post(`/api/v1/knowledge-bases`, data);
|
||||
}
|
||||
|
||||
export function getKnowledgeBaseById(id: string) {
|
||||
return get(`/api/v1/knowledge-bases/${id}`);
|
||||
}
|
||||
|
||||
export function updateKnowledgeBase(id: string, data: { name: string; description?: string; config: any }) {
|
||||
return put(`/api/v1/knowledge-bases/${id}` , data);
|
||||
}
|
||||
|
||||
export function deleteKnowledgeBase(id: string) {
|
||||
return del(`/api/v1/knowledge-bases/${id}`);
|
||||
}
|
||||
|
||||
export function copyKnowledgeBase(data: { source_id: string; target_id?: string }) {
|
||||
return post(`/api/v1/knowledge-bases/copy`, data);
|
||||
}
|
||||
|
||||
// 知识文件 API(基于具体知识库)
|
||||
export function uploadKnowledgeFile(kbId: string, data = {}) {
|
||||
return postUpload(`/api/v1/knowledge-bases/${kbId}/knowledge/file`, data);
|
||||
}
|
||||
|
||||
export async function getKnowledgeBase({page, page_size}) {
|
||||
const kbId = await getKnowledgeBaseID();
|
||||
return get(
|
||||
`/api/v1/knowledge-bases/${kbId}/knowledge?page=${page}&page_size=${page_size}`
|
||||
);
|
||||
export function listKnowledgeFiles(kbId: string, { page, page_size }: { page: number; page_size: number }) {
|
||||
return get(`/api/v1/knowledge-bases/${kbId}/knowledge?page=${page}&page_size=${page_size}`);
|
||||
}
|
||||
|
||||
export function getKnowledgeDetails(id: any) {
|
||||
export function getKnowledgeDetails(id: string) {
|
||||
return get(`/api/v1/knowledge/${id}`);
|
||||
}
|
||||
|
||||
export function delKnowledgeDetails(id: any) {
|
||||
export function delKnowledgeDetails(id: string) {
|
||||
return del(`/api/v1/knowledge/${id}`);
|
||||
}
|
||||
|
||||
export function downKnowledgeDetails(id: any) {
|
||||
export function downKnowledgeDetails(id: string) {
|
||||
return getDown(`/api/v1/knowledge/${id}/download`);
|
||||
}
|
||||
|
||||
export function batchQueryKnowledge(ids: any) {
|
||||
return get(`/api/v1/knowledge/batch?${ids}`);
|
||||
export function batchQueryKnowledge(idsQueryString: string) {
|
||||
return get(`/api/v1/knowledge/batch?${idsQueryString}`);
|
||||
}
|
||||
|
||||
export function getKnowledgeDetailsCon(id: any, page) {
|
||||
export function getKnowledgeDetailsCon(id: string, page: number) {
|
||||
return get(`/api/v1/chunks/${id}?page=${page}&page_size=25`);
|
||||
}
|
||||
12
frontend/src/api/system/index.ts
Normal file
@@ -0,0 +1,12 @@
|
||||
import { get } from '@/utils/request'
|
||||
|
||||
export interface SystemInfo {
|
||||
version: string
|
||||
commit_id?: string
|
||||
build_time?: string
|
||||
go_version?: string
|
||||
}
|
||||
|
||||
export function getSystemInfo(): Promise<{ data: SystemInfo }> {
|
||||
return get('/api/v1/system/info')
|
||||
}
|
||||
@@ -1,55 +0,0 @@
import { get, setTestData } from '../../utils/request';

export interface TestDataResponse {
success: boolean;
data: {
tenant: {
id: number;
name: string;
api_key: string;
};
knowledge_bases: Array<{
id: string;
name: string;
description: string;
}>;
}
}

// Whether the test data has been loaded
let isTestDataLoaded = false;

/**
* Load test data
* Call this before API requests to make sure the test data is loaded
* @returns Promise<boolean> whether loading succeeded
*/
export async function loadTestData(): Promise<boolean> {
// Return immediately if already loaded
if (isTestDataLoaded) {
return true;
}

try {
console.log('开始加载测试数据...');
const response = await get('/api/v1/test-data');
console.log('测试数据', response);

if (response && response.data) {
// Set the test data
setTestData({
tenant: response.data.tenant,
knowledge_bases: response.data.knowledge_bases
});
isTestDataLoaded = true;
console.log('测试数据加载成功');
return true;
} else {
console.warn('测试数据响应为空');
return false;
}
} catch (error) {
console.error('加载测试数据失败:', error);
return false;
}
}
6
frontend/src/assets/img/logout.svg
Normal file
@@ -0,0 +1,6 @@
<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none">
<path d="M10 3H6a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h4" stroke="#000" stroke-opacity="0.6" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M17 16l4-4-4-4" stroke="#000" stroke-opacity="0.6" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M21 12H10" stroke="#000" stroke-opacity="0.6" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
</svg>
After Width: | Height: | Size: 509 B |
4
frontend/src/assets/img/user-green.svg
Normal file
@@ -0,0 +1,4 @@
<svg width="20" height="20" viewBox="0 0 20 20" fill="none" xmlns="http://www.w3.org/2000/svg">
<circle cx="10" cy="6" r="3" stroke="#07C05F" stroke-width="1.5" fill="none"/>
<path d="M4 16c0-3.314 2.686-6 6-6s6 2.686 6 6" stroke="#07C05F" stroke-width="1.5" fill="none"/>
</svg>
After Width: | Height: | Size: 284 B |
4
frontend/src/assets/img/user.svg
Normal file
@@ -0,0 +1,4 @@
<svg width="20" height="20" viewBox="0 0 20 20" fill="none" xmlns="http://www.w3.org/2000/svg">
<circle cx="10" cy="6" r="3" stroke="currentColor" stroke-width="1.5" fill="none"/>
<path d="M4 16c0-3.314 2.686-6 6-6s6 2.686 6 6" stroke="currentColor" stroke-width="1.5" fill="none"/>
</svg>
After Width: | Height: | Size: 294 B |
@@ -1,8 +1,11 @@
<script setup lang="ts">
import { ref, defineEmits, onMounted, defineProps, defineExpose } from "vue";
import { useI18n } from 'vue-i18n';
import useKnowledgeBase from '@/hooks/useKnowledgeBase';
import { onBeforeRouteUpdate } from 'vue-router';
import { MessagePlugin } from "tdesign-vue-next";

const { t } = useI18n();
let { cardList, total, getKnowled } = useKnowledgeBase()
let query = ref("");
const props = defineProps({
@@ -17,15 +20,15 @@ onMounted(() => {
const emit = defineEmits(['send-msg']);
const createSession = (val: string) => {
if (!val.trim()) {
MessagePlugin.info("请先输入内容!");
MessagePlugin.info(t('chat.pleaseEnterContent'));
return
}
if (!query.value && cardList.value.length == 0) {
MessagePlugin.info("请先上传知识库!");
MessagePlugin.info(t('chat.pleaseUploadKnowledgeBase'));
return;
}
if (props.isReplying) {
return MessagePlugin.error("正在回复中,请稍后再试!");
return MessagePlugin.error(t('chat.replyingPleaseWait'));
}
emit('send-msg', val);
clearvalue();
@@ -50,9 +53,9 @@ onBeforeRouteUpdate((to, from, next) => {
</script>
<template>
<div class="answers-input">
<t-textarea v-model="query" placeholder="基于知识库提问" name="description" :autosize="true" @keydown="onKeydown" />
<t-textarea v-model="query" :placeholder="t('chat.askKnowledgeBase')" name="description" :autosize="true" @keydown="onKeydown" />
<div class="answers-input-source">
<span>{{ total }}个来源</span>
<span>{{ t('chat.sourcesCount', { count: total }) }}</span>
</div>
<div @click="createSession(query)" class="answers-input-send"
:class="[query.length && total ? '' : 'grey-out']">

67
frontend/src/components/LanguageSwitcher.vue
Normal file
@@ -0,0 +1,67 @@
<template>
<div class="language-switcher">
<t-select
v-model="selectedLanguage"
:options="languageOptions"
@change="handleLanguageChange"
:popup-props="{ overlayClassName: 'language-select-popup' }"
size="small"
>
<template #prefixIcon>
<t-icon name="translate" />
</template>
</t-select>
</div>
</template>

<script setup lang="ts">
import { ref, watch } from 'vue'
import { useI18n } from 'vue-i18n'

const { locale } = useI18n()

const languageOptions = [
{ label: '中文', value: 'zh-CN' },
{ label: 'English', value: 'en-US' },
{ label: 'Русский', value: 'ru-RU' }
]

const selectedLanguage = ref(localStorage.getItem('locale') || 'zh-CN')

const handleLanguageChange = (value: string) => {
console.log('Язык изменен на:', value)
if (value && ['ru-RU', 'en-US', 'zh-CN'].includes(value)) {
locale.value = value
localStorage.setItem('locale', value)
// Reload the page to apply the new language
setTimeout(() => {
window.location.reload()
}, 100)
}
}

// Sync with i18n on initialization
watch(() => locale.value, (newLocale) => {
if (selectedLanguage.value !== newLocale) {
selectedLanguage.value = newLocale
}
}, { immediate: true })
</script>

<style lang="less" scoped>
.language-switcher {
.t-button {
color: #666;
font-size: 14px;

&:hover {
color: #333;
background-color: rgba(0, 0, 0, 0.04);
}
}

.t-icon {
margin-right: 4px;
}
}
</style>
@@ -4,6 +4,8 @@ import { onMounted, ref, nextTick, onUnmounted, onUpdated, watch } from "vue";
|
||||
import { downKnowledgeDetails } from "@/api/knowledge-base/index";
|
||||
import { MessagePlugin } from "tdesign-vue-next";
|
||||
import picturePreview from '@/components/picture-preview.vue';
|
||||
import { sanitizeHTML, safeMarkdownToHTML, createSafeImage, isValidImageURL } from '@/utils/security';
|
||||
|
||||
marked.use({
|
||||
mangle: false,
|
||||
headerIds: false,
|
||||
@@ -37,10 +39,16 @@ const checkImage = (url) => {
|
||||
});
|
||||
};
|
||||
renderer.image = function (href, title, text) {
|
||||
// 自定义HTML结构,图片展示带标题
|
||||
// 安全地处理图片链接
|
||||
if (!isValidImageURL(href)) {
|
||||
return `<p>无效的图片链接</p>`;
|
||||
}
|
||||
|
||||
// 使用安全的图片创建函数
|
||||
const safeImage = createSafeImage(href, text || '', title || '');
|
||||
return `<figure>
|
||||
<img class="markdown-image" src="${href}" alt="${title}" title="${text}">
|
||||
<figcaption style="text-align: left;">${text}</figcaption>
|
||||
${safeImage}
|
||||
<figcaption style="text-align: left;">${text || ''}</figcaption>
|
||||
</figure>`;
|
||||
};
|
||||
const props = defineProps(["visible", "details"]);
|
||||
@@ -66,14 +74,23 @@ watch(() => props.details.md, (newVal) => {
|
||||
deep: true
|
||||
})
|
||||
|
||||
// 处理 Markdown 中的图片
|
||||
// 安全地处理 Markdown 内容
|
||||
const processMarkdown = (markdownText) => {
|
||||
// 自定义渲染器处理图片
|
||||
if (!markdownText || typeof markdownText !== 'string') {
|
||||
return '';
|
||||
}
|
||||
|
||||
// 首先对 Markdown 内容进行安全处理
|
||||
const safeMarkdown = safeMarkdownToHTML(markdownText);
|
||||
|
||||
// 使用安全的渲染器
|
||||
marked.use({ renderer });
|
||||
let html = marked.parse(markdownText);
|
||||
const parser = new DOMParser();
|
||||
const doc = parser.parseFromString(html, 'text/html');
|
||||
return doc.body.innerHTML;
|
||||
let html = marked.parse(safeMarkdown);
|
||||
|
||||
// 使用 DOMPurify 进行最终的安全清理
|
||||
const sanitizedHTML = sanitizeHTML(html);
|
||||
|
||||
return sanitizedHTML;
|
||||
};
|
||||
const closePreImg = () => {
|
||||
reviewImg.value = false
|
||||
@@ -87,15 +104,19 @@ const downloadFile = () => {
|
||||
downKnowledgeDetails(props.details.id)
|
||||
.then((result) => {
|
||||
if (result) {
|
||||
if (url.value) {
|
||||
URL.revokeObjectURL(url.value);
|
||||
}
|
||||
url.value = URL.createObjectURL(result);
|
||||
down.value.click();
|
||||
// const link = document.createElement("a");
|
||||
// link.style.display = "none";
|
||||
// link.setAttribute("href", url);
|
||||
// link.setAttribute("download", props.details.title);
|
||||
// link.click();
|
||||
// document.body.removeChild(link);
|
||||
window.URL.revokeObjectURL(url);
|
||||
const link = document.createElement("a");
|
||||
link.style.display = "none";
|
||||
link.setAttribute("href", url.value);
|
||||
link.setAttribute("download", props.details.title);
|
||||
link.click();
|
||||
nextTick(() => {
|
||||
document.body.removeChild(link);
|
||||
URL.revokeObjectURL(url.value);
|
||||
})
|
||||
}
|
||||
})
|
||||
.catch((err) => {
|
||||
|
||||
@@ -1,11 +1,14 @@
<script setup lang="ts">
import { useI18n } from 'vue-i18n'

const { t } = useI18n()
</script>
<template>
<div class="empty">
<img class="empty-img" src="@/assets/img/upload.svg" alt="">
<span class="empty-txt">知识为空,拖放上传</span>
<span class="empty-type-txt">pdf、doc 格式文件,不超过10M</span>
<span class="empty-type-txt">text、markdown格式文件,不超过200K</span>
<span class="empty-txt">{{ t('knowledgeBase.emptyKnowledgeDragDrop') }}</span>
<span class="empty-type-txt">{{ t('knowledgeBase.pdfDocFormat') }}</span>
<span class="empty-type-txt">{{ t('knowledgeBase.textMarkdownFormat') }}</span>
</div>
</template>
<style scoped lang="less">

@@ -1,68 +1,134 @@
|
||||
<template>
|
||||
<div class="aside_box">
|
||||
<div class="logo_box">
|
||||
<div class="logo_box" @click="router.push('/platform/knowledge-bases')" style="cursor: pointer;">
|
||||
<img class="logo" src="@/assets/img/weknora.png" alt="">
|
||||
</div>
|
||||
<div class="menu_box" v-for="(item, index) in menuArr" :key="index">
|
||||
<div @click="gotopage(item.path)"
|
||||
@mouseenter="mouseenteMenu(item.path)" @mouseleave="mouseleaveMenu(item.path)"
|
||||
:class="['menu_item', item.childrenPath && item.childrenPath == currentpath ? 'menu_item_c_active' : item.path == currentpath ? 'menu_item_active' : '']">
|
||||
<div class="menu_item-box">
|
||||
<div class="menu_icon">
|
||||
<img class="icon" :src="getImgSrc(item.icon == 'zhishiku' ? knowledgeIcon : item.icon == 'setting' ? settingIcon : prefixIcon)" alt="">
|
||||
|
||||
<!-- 上半部分:知识库和对话 -->
|
||||
<div class="menu_top">
|
||||
<div class="menu_box" :class="{ 'has-submenu': item.children }" v-for="(item, index) in topMenuItems" :key="index">
|
||||
<div @click="handleMenuClick(item.path)"
|
||||
@mouseenter="mouseenteMenu(item.path)" @mouseleave="mouseleaveMenu(item.path)"
|
||||
:class="['menu_item', item.childrenPath && item.childrenPath == currentpath ? 'menu_item_c_active' : isMenuItemActive(item.path) ? 'menu_item_active' : '']">
|
||||
<div class="menu_item-box">
|
||||
<div class="menu_icon">
|
||||
<img class="icon" :src="getImgSrc(item.icon == 'zhishiku' ? knowledgeIcon : item.icon == 'logout' ? logoutIcon : item.icon == 'tenant' ? tenantIcon : prefixIcon)" alt="">
|
||||
</div>
|
||||
<span class="menu_title" :title="item.path === 'knowledge-bases' && kbMenuItem?.title ? kbMenuItem.title : t(item.titleKey)">{{ item.path === 'knowledge-bases' && kbMenuItem?.title ? kbMenuItem.title : t(item.titleKey) }}</span>
|
||||
<!-- 知识库切换下拉箭头 -->
|
||||
<div v-if="item.path === 'knowledge-bases' && isInKnowledgeBase"
|
||||
class="kb-dropdown-icon"
|
||||
:class="{
|
||||
'rotate-180': showKbDropdown,
|
||||
'active': isMenuItemActive(item.path)
|
||||
}"
|
||||
@click.stop="toggleKbDropdown">
|
||||
<svg width="12" height="12" viewBox="0 0 12 12" fill="currentColor">
|
||||
<path d="M2.5 4.5L6 8L9.5 4.5H2.5Z"/>
|
||||
</svg>
|
||||
</div>
|
||||
</div>
|
||||
<span class="menu_title">{{ item.title }}</span>
|
||||
<!-- 知识库切换下拉菜单 -->
|
||||
<div v-if="item.path === 'knowledge-bases' && showKbDropdown && isInKnowledgeBase"
|
||||
class="kb-dropdown-menu">
|
||||
<div v-for="kb in initializedKnowledgeBases"
|
||||
:key="kb.id"
|
||||
class="kb-dropdown-item"
|
||||
:class="{ 'active': kb.name === currentKbName }"
|
||||
@click.stop="switchKnowledgeBase(kb.id)">
|
||||
{{ kb.name }}
|
||||
</div>
|
||||
</div>
|
||||
<t-popup overlayInnerClassName="upload-popup" class="placement top center" :content="t('menu.uploadKnowledge')"
|
||||
placement="top" show-arrow destroy-on-close>
|
||||
<div class="upload-file-wrap" @click.stop="uploadFile" variant="outline"
|
||||
v-if="item.path === 'knowledge-bases' && $route.name === 'knowledgeBaseDetail'">
|
||||
<img class="upload-file-icon" :class="[item.path == currentpath ? 'active-upload' : '']"
|
||||
:src="getImgSrc(fileAddIcon)" alt="">
|
||||
</div>
|
||||
</t-popup>
|
||||
</div>
|
||||
<t-popup overlayInnerClassName="upload-popup" class="placement top center" content="上传知识"
|
||||
placement="top" show-arrow destroy-on-close>
|
||||
<div class="upload-file-wrap" @click="uploadFile" variant="outline"
|
||||
v-if="item.path == 'knowledgeBase'">
|
||||
<img class="upload-file-icon" :class="[item.path == currentpath ? 'active-upload' : '']"
|
||||
:src="getImgSrc(fileAddIcon)" alt="">
|
||||
</div>
|
||||
</t-popup>
|
||||
</div>
|
||||
<div ref="submenuscrollContainer" @scroll="handleScroll" class="submenu" v-if="item.children">
|
||||
<div class="submenu_item_p" v-for="(subitem, subindex) in item.children" :key="subindex"
|
||||
@click="gotopage(subitem.path)">
|
||||
<div :class="['submenu_item', currentSecondpath == subitem.path ? 'submenu_item_active' : '']"
|
||||
@mouseenter="mouseenteBotDownr(subindex)" @mouseleave="mouseleaveBotDown">
|
||||
<i v-if="currentSecondpath == subitem.path" class="dot"></i>
|
||||
<span class="submenu_title"
|
||||
:style="currentSecondpath == subitem.path ? 'margin-left:14px;max-width:160px;' : 'margin-left:18px;max-width:173px;'">
|
||||
{{ subitem.title }}
|
||||
</span>
|
||||
<t-popup v-model:visible="subitem.isMore" @overlay-click="delCard(subindex, subitem)"
|
||||
@visible-change="onVisibleChange" overlayClassName="del-menu-popup" trigger="click"
|
||||
destroy-on-close placement="top-left">
|
||||
<div v-if="(activeSubmenu == subindex) || (currentSecondpath == subitem.path) || subitem.isMore"
|
||||
@click.stop="openMore(subindex)" variant="outline" class="menu-more-wrap">
|
||||
<t-icon name="ellipsis" class="menu-more" />
|
||||
</div>
|
||||
<template #content>
|
||||
<span class="del_submenu">删除记录</span>
|
||||
</template>
|
||||
</t-popup>
|
||||
<div ref="submenuscrollContainer" @scroll="handleScroll" class="submenu" v-if="item.children">
|
||||
<div class="submenu_item_p" v-for="(subitem, subindex) in item.children" :key="subindex"
|
||||
@click="gotopage(subitem.path)">
|
||||
<div :class="['submenu_item', currentSecondpath == subitem.path ? 'submenu_item_active' : '']"
|
||||
@mouseenter="mouseenteBotDownr(subindex)" @mouseleave="mouseleaveBotDown">
|
||||
<i v-if="currentSecondpath == subitem.path" class="dot"></i>
|
||||
<span class="submenu_title"
|
||||
:style="currentSecondpath == subitem.path ? 'margin-left:14px;max-width:160px;' : 'margin-left:18px;max-width:173px;'">
|
||||
{{ subitem.title }}
|
||||
</span>
|
||||
<t-popup v-model:visible="subitem.isMore" @overlay-click="delCard(subindex, subitem)"
|
||||
@visible-change="onVisibleChange" overlayClassName="del-menu-popup" trigger="click"
|
||||
destroy-on-close placement="top-left">
|
||||
<div v-if="(activeSubmenu == subindex) || (currentSecondpath == subitem.path) || subitem.isMore"
|
||||
@click.stop="openMore(subindex)" variant="outline" class="menu-more-wrap">
|
||||
<t-icon name="ellipsis" class="menu-more" />
|
||||
</div>
|
||||
<template #content>
|
||||
<span class="del_submenu">{{ t('menu.deleteRecord') }}</span>
|
||||
</template>
|
||||
</t-popup>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- 下半部分:账户信息、系统设置、退出登录 -->
|
||||
<div class="menu_bottom">
|
||||
<div class="menu_box" v-for="(item, index) in bottomMenuItems" :key="'bottom-' + index">
|
||||
<div v-if="item.path === 'logout'">
|
||||
<t-popconfirm
|
||||
:content="t('menu.confirmLogout')"
|
||||
@confirm="handleLogout"
|
||||
placement="top"
|
||||
:show-arrow="true"
|
||||
>
|
||||
<div @mouseenter="mouseenteMenu(item.path)" @mouseleave="mouseleaveMenu(item.path)"
|
||||
:class="['menu_item', 'logout-item']">
|
||||
<div class="menu_item-box">
|
||||
<div class="menu_icon">
|
||||
<img class="icon" :src="getImgSrc(logoutIcon)" alt="">
|
||||
</div>
|
||||
<span class="menu_title">{{ t(item.titleKey) }}</span>
|
||||
</div>
|
||||
</div>
|
||||
</t-popconfirm>
|
||||
</div>
|
||||
<div v-else @click="handleMenuClick(item.path)"
|
||||
@mouseenter="mouseenteMenu(item.path)" @mouseleave="mouseleaveMenu(item.path)"
|
||||
:class="['menu_item', item.childrenPath && item.childrenPath == currentpath ? 'menu_item_c_active' : (item.path == currentpath) ? 'menu_item_active' : '']">
|
||||
<div class="menu_item-box">
|
||||
<div class="menu_icon">
|
||||
<img class="icon" :src="getImgSrc(item.icon == 'zhishiku' ? knowledgeIcon : item.icon == 'tenant' ? tenantIcon : prefixIcon)" alt="">
|
||||
</div>
|
||||
<span class="menu_title">{{ item.path === 'knowledge-bases' && kbMenuItem?.title ? kbMenuItem.title : t(item.titleKey) }}</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<input type="file" @change="upload" style="display: none" ref="uploadInput"
|
||||
accept=".pdf,.docx,.doc,.txt,.md,.jpg,.jpeg,.png" />
|
||||
accept=".pdf,.docx,.doc,.txt,.md,.jpg,.jpeg,.png,.csv,.xls,.xlsx" />
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup>
|
||||
<script setup lang="ts">
|
||||
import { storeToRefs } from 'pinia';
|
||||
import { onMounted, watch, computed, ref, reactive } from 'vue';
|
||||
import { onMounted, watch, computed, ref, reactive, nextTick } from 'vue';
|
||||
import { useRoute, useRouter } from 'vue-router';
|
||||
import { useI18n } from 'vue-i18n';
|
||||
import { getSessionsList, delSession } from "@/api/chat/index";
|
||||
import { getKnowledgeBaseById, listKnowledgeBases, uploadKnowledgeFile } from '@/api/knowledge-base';
|
||||
import { kbFileTypeVerification } from '@/utils/index';
|
||||
import { useMenuStore } from '@/stores/menu';
|
||||
import useKnowledgeBase from '@/hooks/useKnowledgeBase';
|
||||
import { useAuthStore } from '@/stores/auth';
|
||||
import { MessagePlugin } from "tdesign-vue-next";
|
||||
let { requestMethod } = useKnowledgeBase()
|
||||
const { t } = useI18n();
|
||||
let uploadInput = ref();
|
||||
const usemenuStore = useMenuStore();
|
||||
const authStore = useAuthStore();
|
||||
const route = useRoute();
|
||||
const router = useRouter();
|
||||
const currentpath = ref('');
|
||||
@@ -74,39 +140,206 @@ const submenuscrollContainer = ref(null);
|
||||
// 计算总页数
|
||||
const totalPages = computed(() => Math.ceil(total.value / page_size.value));
|
||||
const hasMore = computed(() => currentPage.value < totalPages.value);
|
||||
type MenuItem = { title: string; icon: string; path: string; childrenPath?: string; children?: any[] };
|
||||
const { menuArr } = storeToRefs(usemenuStore);
|
||||
let activeSubmenu = ref(-1);
|
||||
let activeSubmenu = ref<number>(-1);
|
||||
|
||||
// 是否处于知识库详情页
|
||||
const isInKnowledgeBase = computed<boolean>(() => {
|
||||
return route.name === 'knowledgeBaseDetail' ||
|
||||
route.name === 'kbCreatChat' ||
|
||||
route.name === 'chat' ||
|
||||
route.name === 'knowledgeBaseSettings';
|
||||
});
|
||||
|
||||
// 统一的菜单项激活状态判断
|
||||
const isMenuItemActive = (itemPath: string): boolean => {
|
||||
const currentRoute = route.name;
|
||||
|
||||
switch (itemPath) {
|
||||
case 'knowledge-bases':
|
||||
return currentRoute === 'knowledgeBaseList' ||
|
||||
currentRoute === 'knowledgeBaseDetail' ||
|
||||
currentRoute === 'knowledgeBaseSettings';
|
||||
case 'creatChat':
|
||||
return currentRoute === 'kbCreatChat';
|
||||
case 'tenant':
|
||||
return currentRoute === 'tenant';
|
||||
default:
|
||||
return itemPath === currentpath.value;
|
||||
}
|
||||
};
|
||||
|
||||
// 统一的图标激活状态判断
|
||||
const getIconActiveState = (itemPath: string) => {
|
||||
const currentRoute = route.name;
|
||||
|
||||
return {
|
||||
isKbActive: itemPath === 'knowledge-bases' && (
|
||||
currentRoute === 'knowledgeBaseList' ||
|
||||
currentRoute === 'knowledgeBaseDetail' ||
|
||||
currentRoute === 'knowledgeBaseSettings'
|
||||
),
|
||||
isCreatChatActive: itemPath === 'creatChat' && currentRoute === 'kbCreatChat',
|
||||
isTenantActive: itemPath === 'tenant' && currentRoute === 'tenant',
|
||||
isChatActive: itemPath === 'chat' && currentRoute === 'chat'
|
||||
};
|
||||
};
|
||||
|
||||
// 分离上下两部分菜单
|
||||
const topMenuItems = computed<MenuItem[]>(() => {
|
||||
return (menuArr.value as unknown as MenuItem[]).filter((item: MenuItem) =>
|
||||
item.path === 'knowledge-bases' || (isInKnowledgeBase.value && item.path === 'creatChat')
|
||||
);
|
||||
});
|
||||
|
||||
const bottomMenuItems = computed<MenuItem[]>(() => {
|
||||
return (menuArr.value as unknown as MenuItem[]).filter((item: MenuItem) => {
|
||||
if (item.path === 'knowledge-bases' || item.path === 'creatChat') {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
});
|
||||
});
|
||||
|
||||
// 当前知识库名称和列表
|
||||
const currentKbName = ref<string>('')
|
||||
const allKnowledgeBases = ref<Array<{ id: string; name: string; embedding_model_id?: string; summary_model_id?: string }>>([])
|
||||
const showKbDropdown = ref<boolean>(false)
|
||||
|
||||
// 过滤已初始化的知识库
|
||||
const initializedKnowledgeBases = computed(() => {
|
||||
return allKnowledgeBases.value.filter(kb =>
|
||||
kb.embedding_model_id && kb.embedding_model_id !== '' &&
|
||||
kb.summary_model_id && kb.summary_model_id !== ''
|
||||
)
|
||||
})
|
||||
|
||||
// 动态更新知识库菜单项标题
|
||||
const kbMenuItem = computed(() => {
|
||||
const kbItem = topMenuItems.value.find(item => item.path === 'knowledge-bases')
|
||||
if (kbItem && isInKnowledgeBase.value && currentKbName.value) {
|
||||
return { ...kbItem, title: currentKbName.value }
|
||||
}
|
||||
return kbItem
|
||||
})
|
||||
|
||||
const loading = ref(false)
|
||||
const uploadFile = () => {
|
||||
const uploadFile = async () => {
|
||||
// 获取当前知识库ID
|
||||
const currentKbId = await getCurrentKbId();
|
||||
|
||||
// 检查当前知识库的初始化状态
|
||||
if (currentKbId) {
|
||||
try {
|
||||
const kbResponse = await getKnowledgeBaseById(currentKbId);
|
||||
const kb = kbResponse.data;
|
||||
|
||||
// 检查知识库是否已初始化(有 EmbeddingModelID 和 SummaryModelID)
|
||||
if (!kb.embedding_model_id || kb.embedding_model_id === '' ||
|
||||
!kb.summary_model_id || kb.summary_model_id === '') {
|
||||
MessagePlugin.warning(t('knowledgeBase.notInitialized'));
|
||||
return;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('获取知识库信息失败:', error);
|
||||
MessagePlugin.error(t('knowledgeBase.getInfoFailed'));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
uploadInput.value.click()
|
||||
}
|
||||
const upload = (e) => {
|
||||
requestMethod(e.target.files[0], uploadInput)
|
||||
const upload = async (e: any) => {
|
||||
const file = e.target.files[0];
|
||||
if (!file) return;
|
||||
|
||||
// 文件类型验证
|
||||
if (kbFileTypeVerification(file)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// 获取当前知识库ID
|
||||
const currentKbId = (route.params as any)?.kbId as string;
|
||||
if (!currentKbId) {
|
||||
MessagePlugin.error(t('knowledgeBase.missingId'));
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const result = await uploadKnowledgeFile(currentKbId, { file });
|
||||
const responseData = result as any;
|
||||
console.log('上传API返回结果:', responseData);
|
||||
|
||||
// 如果没有抛出异常,就认为上传成功,先触发刷新事件
|
||||
console.log('文件上传完成,发送事件通知页面刷新,知识库ID:', currentKbId);
|
||||
window.dispatchEvent(new CustomEvent('knowledgeFileUploaded', {
|
||||
detail: { kbId: currentKbId }
|
||||
}));
|
||||
|
||||
// 然后处理UI消息
|
||||
// 判断上传是否成功 - 检查多种可能的成功标识
|
||||
const isSuccess = responseData.success || responseData.code === 200 || responseData.status === 'success' || (!responseData.error && responseData);
|
||||
|
||||
if (isSuccess) {
|
||||
MessagePlugin.info(t('file.uploadSuccess'));
|
||||
} else {
|
||||
// 改进错误信息提取逻辑
|
||||
let errorMessage = t('file.uploadFailed');
|
||||
if (responseData.error && responseData.error.message) {
|
||||
errorMessage = responseData.error.message;
|
||||
} else if (responseData.message) {
|
||||
errorMessage = responseData.message;
|
||||
}
|
||||
if (responseData.code === 'duplicate_file' || (responseData.error && responseData.error.code === 'duplicate_file')) {
|
||||
errorMessage = t('file.fileExists');
|
||||
}
|
||||
MessagePlugin.error(errorMessage);
|
||||
}
|
||||
} catch (err: any) {
|
||||
let errorMessage = t('file.uploadFailed');
|
||||
if (err.code === 'duplicate_file') {
|
||||
errorMessage = t('file.fileExists');
|
||||
} else if (err.error && err.error.message) {
|
||||
errorMessage = err.error.message;
|
||||
} else if (err.message) {
|
||||
errorMessage = err.message;
|
||||
}
|
||||
MessagePlugin.error(errorMessage);
|
||||
} finally {
|
||||
uploadInput.value.value = "";
|
||||
}
|
||||
}
|
||||
const mouseenteBotDownr = (val) => {
|
||||
const mouseenteBotDownr = (val: number) => {
|
||||
activeSubmenu.value = val;
|
||||
}
|
||||
const mouseleaveBotDown = () => {
|
||||
activeSubmenu.value = -1;
|
||||
}
|
||||
const onVisibleChange = (e) => {
|
||||
const onVisibleChange = (_e: any) => {
|
||||
}
|
||||
|
||||
const delCard = (index, item) => {
|
||||
delSession(item.id).then(res => {
|
||||
if (res && res.success) {
|
||||
menuArr.value[1].children.splice(index, 1);
|
||||
const delCard = (index: number, item: any) => {
|
||||
delSession(item.id).then((res: any) => {
|
||||
if (res && (res as any).success) {
|
||||
(menuArr.value as any[])[1]?.children?.splice(index, 1);
|
||||
if (item.id == route.params.chatid) {
|
||||
router.push('/platform/creatChat');
|
||||
// 删除当前会话后,跳转到当前知识库的创建聊天页面
|
||||
const kbId = route.params.kbId;
|
||||
if (kbId) {
|
||||
router.push(`/platform/knowledge-bases/${kbId}/creatChat`);
|
||||
} else {
|
||||
router.push('/platform/knowledge-bases');
|
||||
}
|
||||
}
|
||||
} else {
|
||||
MessagePlugin.error("删除失败,请稍后再试!");
|
||||
MessagePlugin.error(t('knowledgeBase.deleteFailed'));
|
||||
}
|
||||
})
|
||||
}
|
||||
const debounce = (fn, delay) => {
|
||||
let timer
|
||||
return (...args) => {
|
||||
const debounce = (fn: (...args: any[]) => void, delay: number) => {
|
||||
let timer: ReturnType<typeof setTimeout>
|
||||
return (...args: any[]) => {
|
||||
clearTimeout(timer)
|
||||
timer = setTimeout(() => fn(...args), delay)
|
||||
}
|
||||
@@ -124,80 +357,221 @@ const checkScrollBottom = () => {
|
||||
}
|
||||
}
|
||||
const handleScroll = debounce(checkScrollBottom, 200)
|
||||
const getMessageList = () => {
|
||||
const getMessageList = async () => {
|
||||
// 仅在知识库内部显示对话列表
|
||||
if (!isInKnowledgeBase.value) {
|
||||
usemenuStore.clearMenuArr();
|
||||
currentKbName.value = '';
|
||||
return;
|
||||
}
|
||||
let kbId = (route.params as any)?.kbId as string
|
||||
// 新的路由格式:/platform/chat/:kbId/:chatid,直接从路由参数获取知识库ID
|
||||
if (!kbId) {
|
||||
usemenuStore.clearMenuArr();
|
||||
currentKbName.value = '';
|
||||
return;
|
||||
}
|
||||
|
||||
// 获取知识库名称和所有知识库列表
|
||||
try {
|
||||
const [kbRes, allKbRes]: any[] = await Promise.all([
|
||||
getKnowledgeBaseById(kbId),
|
||||
listKnowledgeBases()
|
||||
])
|
||||
if (kbRes?.data?.name) {
|
||||
currentKbName.value = kbRes.data.name
|
||||
}
|
||||
if (allKbRes?.data) {
|
||||
allKnowledgeBases.value = allKbRes.data
|
||||
}
|
||||
} catch {}
|
||||
|
||||
if (loading.value) return;
|
||||
loading.value = true;
|
||||
usemenuStore.clearMenuArr();
|
||||
getSessionsList(currentPage.value, page_size.value).then(res => {
|
||||
getSessionsList(currentPage.value, page_size.value).then((res: any) => {
|
||||
if (res.data && res.data.length) {
|
||||
res.data.forEach(item => {
|
||||
let obj = { title: item.title ? item.title : "新会话", path: `chat/${item.id}`, id: item.id, isMore: false, isNoTitle: item.title ? false : true }
|
||||
// 过滤出当前知识库的会话
|
||||
const filtered = res.data.filter((s: any) => s.knowledge_base_id === kbId)
|
||||
filtered.forEach((item: any) => {
|
||||
let obj = { title: item.title ? item.title : t('menu.newSession'), path: `chat/${kbId}/${item.id}`, id: item.id, isMore: false, isNoTitle: item.title ? false : true }
|
||||
usemenuStore.updatemenuArr(obj)
|
||||
});
|
||||
loading.value = false;
|
||||
}
|
||||
if (res.total) {
|
||||
total.value = res.total;
|
||||
if ((res as any).total) {
|
||||
total.value = (res as any).total;
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
const openMore = (e) => { }
|
||||
const openMore = (_e: any) => { }
|
||||
onMounted(() => {
|
||||
currentpath.value = route.name;
|
||||
if (route.params.chatid) {
|
||||
currentSecondpath.value = `${route.name}/${route.params.chatid}`;
|
||||
const routeName = typeof route.name === 'string' ? route.name : (route.name ? String(route.name) : '')
|
||||
currentpath.value = routeName;
|
||||
if (route.params.chatid && route.params.kbId) {
|
||||
currentSecondpath.value = `chat/${route.params.kbId}/${route.params.chatid}`;
|
||||
}
|
||||
getMessageList();
|
||||
});
|
||||
|
||||
watch([() => route.name, () => route.params], (newvalue) => {
|
||||
currentpath.value = newvalue[0];
|
||||
if (newvalue[1].chatid) {
|
||||
currentSecondpath.value = `${newvalue[0]}/${newvalue[1].chatid}`;
|
||||
const nameStr = typeof newvalue[0] === 'string' ? (newvalue[0] as string) : (newvalue[0] ? String(newvalue[0]) : '')
|
||||
currentpath.value = nameStr;
|
||||
if (newvalue[1].chatid && newvalue[1].kbId) {
|
||||
currentSecondpath.value = `chat/${newvalue[1].kbId}/${newvalue[1].chatid}`;
|
||||
} else {
|
||||
currentSecondpath.value = "";
|
||||
}
|
||||
|
||||
// 路由变化时刷新对话列表(仅在知识库内部)
|
||||
getMessageList();
|
||||
// 路由变化时更新图标状态
|
||||
getIcon(nameStr);
|
||||
});
|
||||
let fileAddIcon = ref('file-add-green.svg');
|
||||
let knowledgeIcon = ref('zhishiku-green.svg');
|
||||
let prefixIcon = ref('prefixIcon.svg');
|
||||
let settingIcon = ref('setting.svg');
|
||||
let logoutIcon = ref('logout.svg');
|
||||
let tenantIcon = ref('user.svg'); // 使用专门的用户图标
|
||||
let pathPrefix = ref(route.name)
|
||||
const getIcon = (path) => {
|
||||
fileAddIcon.value = path == 'knowledgeBase' ? 'file-add-green.svg' : 'file-add.svg';
|
||||
knowledgeIcon.value = path == 'knowledgeBase' ? 'zhishiku-green.svg' : 'zhishiku.svg';
|
||||
prefixIcon.value = path == 'creatChat' ? 'prefixIcon-green.svg' : path == 'knowledgeBase' ? 'prefixIcon-grey.svg' : 'prefixIcon.svg';
|
||||
settingIcon.value = path == 'settings' ? 'setting-green.svg' : 'setting.svg';
|
||||
const getIcon = (path: string) => {
|
||||
// 根据当前路由状态更新所有图标
|
||||
const kbActiveState = getIconActiveState('knowledge-bases');
|
||||
const creatChatActiveState = getIconActiveState('creatChat');
|
||||
const tenantActiveState = getIconActiveState('tenant');
|
||||
|
||||
// 上传图标:只在知识库相关页面显示绿色
|
||||
fileAddIcon.value = kbActiveState.isKbActive ? 'file-add-green.svg' : 'file-add.svg';
|
||||
|
||||
// 知识库图标:只在知识库页面显示绿色
|
||||
knowledgeIcon.value = kbActiveState.isKbActive ? 'zhishiku-green.svg' : 'zhishiku.svg';
|
||||
|
||||
// 对话图标:只在对话创建页面显示绿色,在知识库页面显示灰色,其他情况显示默认
|
||||
prefixIcon.value = creatChatActiveState.isCreatChatActive ? 'prefixIcon-green.svg' :
|
||||
kbActiveState.isKbActive ? 'prefixIcon-grey.svg' :
|
||||
'prefixIcon.svg';
|
||||
|
||||
// 租户图标:只在租户页面显示绿色
|
||||
tenantIcon.value = tenantActiveState.isTenantActive ? 'user-green.svg' : 'user.svg';
|
||||
|
||||
// 退出图标:始终显示默认
|
||||
logoutIcon.value = 'logout.svg';
|
||||
}
|
||||
getIcon(route.name)
|
||||
const gotopage = (path) => {
|
||||
pathPrefix.value = path;
|
||||
// 如果是系统设置,跳转到初始化配置页面
|
||||
if (path === 'settings') {
|
||||
router.push('/initialization');
|
||||
getIcon(typeof route.name === 'string' ? route.name as string : (route.name ? String(route.name) : ''))
|
||||
const handleMenuClick = async (path: string) => {
|
||||
if (path === 'knowledge-bases') {
|
||||
// 知识库菜单项:如果在知识库内部,跳转到当前知识库文件页;否则跳转到知识库列表
|
||||
const kbId = await getCurrentKbId()
|
||||
if (kbId) {
|
||||
router.push(`/platform/knowledge-bases/${kbId}`)
|
||||
} else {
|
||||
router.push('/platform/knowledge-bases')
|
||||
}
|
||||
} else {
|
||||
router.push(`/platform/${path}`);
|
||||
gotopage(path)
|
||||
}
|
||||
}
|
||||
|
||||
// 处理退出登录确认
|
||||
const handleLogout = () => {
|
||||
gotopage('logout')
|
||||
}
|
||||
|
||||
const getCurrentKbId = async (): Promise<string | null> => {
|
||||
let kbId = (route.params as any)?.kbId as string
|
||||
// 新的路由格式:/platform/chat/:kbId/:chatid,直接从路由参数获取
|
||||
if (!kbId && route.name === 'chat' && (route.params as any)?.kbId) {
|
||||
kbId = (route.params as any).kbId
|
||||
}
|
||||
return kbId || null
|
||||
}
|
||||
|
||||
const gotopage = async (path: string) => {
|
||||
pathPrefix.value = path;
|
||||
// 处理退出登录
|
||||
if (path === 'logout') {
|
||||
authStore.logout();
|
||||
router.push('/login');
|
||||
return;
|
||||
} else {
|
||||
if (path === 'creatChat') {
|
||||
const kbId = await getCurrentKbId()
|
||||
if (kbId) {
|
||||
router.push(`/platform/knowledge-bases/${kbId}/creatChat`)
|
||||
} else {
|
||||
router.push(`/platform/knowledge-bases`)
|
||||
}
|
||||
} else {
|
||||
router.push(`/platform/${path}`);
|
||||
}
|
||||
}
|
||||
getIcon(path)
|
||||
}
|
||||
|
||||
const getImgSrc = (url) => {
|
||||
const getImgSrc = (url: string) => {
|
||||
return new URL(`/src/assets/img/${url}`, import.meta.url).href;
|
||||
}
|
||||
|
||||
const mouseenteMenu = (path) => {
|
||||
if (pathPrefix.value != 'knowledgeBase' && pathPrefix.value != 'creatChat' && path != 'knowledgeBase') {
|
||||
const mouseenteMenu = (path: string) => {
|
||||
if (pathPrefix.value != 'knowledge-bases' && pathPrefix.value != 'creatChat' && path != 'knowledge-bases') {
|
||||
prefixIcon.value = 'prefixIcon-grey.svg';
|
||||
}
|
||||
}
|
||||
const mouseleaveMenu = (path) => {
|
||||
if (pathPrefix.value != 'knowledgeBase' && pathPrefix.value != 'creatChat' && path != 'knowledgeBase') {
|
||||
getIcon(route.name)
|
||||
const mouseleaveMenu = (path: string) => {
|
||||
if (pathPrefix.value != 'knowledge-bases' && pathPrefix.value != 'creatChat' && path != 'knowledge-bases') {
|
||||
const nameStr = typeof route.name === 'string' ? route.name as string : (route.name ? String(route.name) : '')
|
||||
getIcon(nameStr)
|
||||
}
|
||||
}
|
||||
|
||||
// 知识库下拉相关方法
|
||||
const toggleKbDropdown = (event?: Event) => {
|
||||
if (event) {
|
||||
event.stopPropagation()
|
||||
}
|
||||
showKbDropdown.value = !showKbDropdown.value
|
||||
}
|
||||
|
||||
const switchKnowledgeBase = (kbId: string, event?: Event) => {
|
||||
if (event) {
|
||||
event.stopPropagation()
|
||||
}
|
||||
showKbDropdown.value = false
|
||||
const currentRoute = route.name
|
||||
|
||||
// 路由跳转
|
||||
if (currentRoute === 'knowledgeBaseDetail') {
|
||||
router.push(`/platform/knowledge-bases/${kbId}`)
|
||||
} else if (currentRoute === 'kbCreatChat') {
|
||||
router.push(`/platform/knowledge-bases/${kbId}/creatChat`)
|
||||
} else if (currentRoute === 'knowledgeBaseSettings') {
|
||||
router.push(`/platform/knowledge-bases/${kbId}/settings`)
|
||||
} else {
|
||||
router.push(`/platform/knowledge-bases/${kbId}`)
|
||||
}
|
||||
|
||||
// 刷新右侧内容 - 通过触发页面重新加载或发送事件
|
||||
nextTick(() => {
|
||||
// 发送全局事件通知页面刷新知识库内容
|
||||
window.dispatchEvent(new CustomEvent('knowledgeBaseChanged', {
|
||||
detail: { kbId }
|
||||
}))
|
||||
})
|
||||
}
|
||||
|
||||
// 点击外部关闭下拉菜单
|
||||
const handleClickOutside = () => {
|
||||
showKbDropdown.value = false
|
||||
}
|
||||
|
||||
onMounted(() => {
|
||||
document.addEventListener('click', handleClickOutside)
|
||||
})
|
||||
|
||||
watch(() => route.params.kbId, () => {
|
||||
showKbDropdown.value = false
|
||||
})
|
||||
|
||||
</script>
|
||||
<style lang="less" scoped>
|
||||
.del_submenu {
|
||||
@@ -210,6 +584,10 @@ const mouseleaveMenu = (path) => {
|
||||
padding: 8px;
|
||||
background: #fff;
|
||||
box-sizing: border-box;
|
||||
height: 100vh;
|
||||
overflow: hidden;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
|
||||
.logo_box {
|
||||
height: 80px;
|
||||
@@ -239,9 +617,28 @@ const mouseleaveMenu = (path) => {
|
||||
line-height: 21.7px;
|
||||
}
|
||||
|
||||
.menu_top {
|
||||
flex: 1;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
overflow: hidden;
|
||||
min-height: 0;
|
||||
}
|
||||
|
||||
.menu_bottom {
|
||||
flex-shrink: 0;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.menu_box {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
|
||||
&.has-submenu {
|
||||
flex: 1;
|
||||
min-height: 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -341,18 +738,21 @@ const mouseleaveMenu = (path) => {
|
||||
font-style: normal;
|
||||
font-weight: 600;
|
||||
line-height: 22px;
|
||||
overflow: hidden;
|
||||
white-space: nowrap;
|
||||
max-width: 120px;
|
||||
flex: 1;
|
||||
}
|
||||
|
||||
.submenu {
|
||||
font-family: "PingFang SC";
|
||||
font-size: 14px;
|
||||
font-style: normal;
|
||||
font-family: "PingFang SC";
|
||||
font-size: 14px;
|
||||
font-style: normal;
|
||||
overflow-y: scroll;
|
||||
overflow-y: auto;
|
||||
scrollbar-width: none;
|
||||
height: calc(98vh - 276px);
|
||||
flex: 1;
|
||||
min-height: 0;
|
||||
margin-left: 4px;
|
||||
}
|
||||
|
||||
.submenu_item_p {
|
||||
@@ -427,6 +827,92 @@ const mouseleaveMenu = (path) => {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* 知识库下拉菜单样式 */
|
||||
.kb-dropdown-icon {
|
||||
margin-left: auto;
|
||||
color: #666;
|
||||
transition: transform 0.3s ease, color 0.2s ease;
|
||||
cursor: pointer;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
width: 16px;
|
||||
height: 16px;
|
||||
|
||||
&.rotate-180 {
|
||||
transform: rotate(180deg);
|
||||
}
|
||||
|
||||
&:hover {
|
||||
color: #07c05f;
|
||||
}
|
||||
|
||||
&.active {
|
||||
color: #07c05f;
|
||||
}
|
||||
|
||||
&.active:hover {
|
||||
color: #05a04f;
|
||||
}
|
||||
|
||||
svg {
|
||||
width: 12px;
|
||||
height: 12px;
|
||||
transition: inherit;
|
||||
}
|
||||
}
|
||||
|
||||
.kb-dropdown-menu {
|
||||
position: absolute;
|
||||
top: 100%;
|
||||
left: 0;
|
||||
right: 0;
|
||||
background: #fff;
|
||||
border: 1px solid #e5e7eb;
|
||||
border-radius: 6px;
|
||||
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
|
||||
z-index: 1000;
|
||||
max-height: 200px;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
.kb-dropdown-item {
|
||||
padding: 8px 16px;
|
||||
cursor: pointer;
|
||||
transition: background-color 0.2s ease;
|
||||
font-size: 14px;
|
||||
color: #333;
|
||||
|
||||
&:hover {
|
||||
background-color: #f5f5f5;
|
||||
}
|
||||
|
||||
&.active {
|
||||
background-color: #07c05f1a;
|
||||
color: #07c05f;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
&:first-child {
|
||||
border-radius: 6px 6px 0 0;
|
||||
}
|
||||
|
||||
&:last-child {
|
||||
border-radius: 0 0 6px 6px;
|
||||
}
|
||||
}
|
||||
|
||||
.menu_item-box {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
width: 100%;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.menu_box {
|
||||
position: relative;
|
||||
}
|
||||
</style>
|
||||
<style lang="less">
|
||||
.upload-popup {
|
||||
@@ -456,4 +942,48 @@ const mouseleaveMenu = (path) => {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// 退出登录确认框样式
|
||||
:deep(.t-popconfirm) {
|
||||
.t-popconfirm__content {
|
||||
background: #fff;
|
||||
border: 1px solid #e7e7e7;
|
||||
border-radius: 6px;
|
||||
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
|
||||
padding: 12px 16px;
|
||||
font-size: 14px;
|
||||
color: #333;
|
||||
max-width: 200px;
|
||||
}
|
||||
|
||||
.t-popconfirm__arrow {
|
||||
border-bottom-color: #e7e7e7;
|
||||
}
|
||||
|
||||
.t-popconfirm__arrow::after {
|
||||
border-bottom-color: #fff;
|
||||
}
|
||||
|
||||
.t-popconfirm__buttons {
|
||||
margin-top: 8px;
|
||||
display: flex;
|
||||
justify-content: flex-end;
|
||||
gap: 8px;
|
||||
}
|
||||
|
||||
.t-button--variant-outline {
|
||||
border-color: #d9d9d9;
|
||||
color: #666;
|
||||
}
|
||||
|
||||
.t-button--theme-danger {
|
||||
background-color: #ff4d4f;
|
||||
border-color: #ff4d4f;
|
||||
}
|
||||
|
||||
.t-button--theme-danger:hover {
|
||||
background-color: #ff7875;
|
||||
border-color: #ff7875;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
@@ -1,52 +1,54 @@
|
||||
import { ref, reactive, onMounted } from "vue";
|
||||
import { ref, reactive } from "vue";
|
||||
import { storeToRefs } from "pinia";
|
||||
import { formatStringDate, kbFileTypeVerification } from "../utils/index";
|
||||
import { MessagePlugin } from "tdesign-vue-next";
|
||||
import {
|
||||
uploadKnowledgeBase,
|
||||
getKnowledgeBase,
|
||||
uploadKnowledgeFile,
|
||||
listKnowledgeFiles,
|
||||
getKnowledgeDetails,
|
||||
delKnowledgeDetails,
|
||||
getKnowledgeDetailsCon,
|
||||
} from "@/api/knowledge-base/index";
|
||||
import { knowledgeStore } from "@/stores/knowledge";
|
||||
import { useRoute } from 'vue-router';
|
||||
|
||||
const usemenuStore = knowledgeStore();
|
||||
export default function () {
|
||||
export default function (knowledgeBaseId?: string) {
|
||||
const route = useRoute();
|
||||
const { cardList, total } = storeToRefs(usemenuStore);
|
||||
let moreIndex = ref(-1);
|
||||
const details = reactive({
|
||||
title: "",
|
||||
time: "",
|
||||
md: [],
|
||||
md: [] as any[],
|
||||
id: "",
|
||||
total: 0
|
||||
});
|
||||
const getKnowled = (query = { page: 1, page_size: 35 }) => {
|
||||
getKnowledgeBase(query)
|
||||
const getKnowled = (query = { page: 1, page_size: 35 }, kbId?: string) => {
|
||||
const targetKbId = kbId || knowledgeBaseId;
|
||||
if (!targetKbId) return;
|
||||
|
||||
listKnowledgeFiles(targetKbId, query)
|
||||
.then((result: any) => {
|
||||
let { data, total: totalResult } = result;
|
||||
let cardList_ = data.map((item) => {
|
||||
item["file_name"] = item.file_name.substring(
|
||||
0,
|
||||
item.file_name.lastIndexOf(".")
|
||||
);
|
||||
return {
|
||||
...item,
|
||||
updated_at: formatStringDate(new Date(item.updated_at)),
|
||||
isMore: false,
|
||||
file_type: item.file_type.toLocaleUpperCase(),
|
||||
};
|
||||
});
|
||||
if (query.page == 1) {
|
||||
const { data, total: totalResult } = result;
|
||||
const cardList_ = data.map((item: any) => ({
|
||||
...item,
|
||||
file_name: item.file_name.substring(0, item.file_name.lastIndexOf(".")),
|
||||
updated_at: formatStringDate(new Date(item.updated_at)),
|
||||
isMore: false,
|
||||
file_type: item.file_type.toLocaleUpperCase(),
|
||||
}));
|
||||
|
||||
if (query.page === 1) {
|
||||
cardList.value = cardList_;
|
||||
} else {
|
||||
cardList.value.push(...cardList_);
|
||||
}
|
||||
total.value = totalResult;
|
||||
})
|
||||
.catch((err) => {});
|
||||
.catch(() => {});
|
||||
};
|
||||
const delKnowledge = (index: number, item) => {
|
||||
const delKnowledge = (index: number, item: any) => {
|
||||
cardList.value[index].isMore = false;
|
||||
moreIndex.value = -1;
|
||||
delKnowledgeDetails(item.id)
|
||||
@@ -58,7 +60,7 @@ export default function () {
|
||||
MessagePlugin.error("知识删除失败!");
|
||||
}
|
||||
})
|
||||
.catch((err) => {
|
||||
.catch(() => {
|
||||
MessagePlugin.error("知识删除失败!");
|
||||
});
|
||||
};
|
||||
@@ -70,56 +72,48 @@ export default function () {
|
||||
moreIndex.value = -1;
|
||||
}
|
||||
};
|
||||
const requestMethod = (file: any, uploadInput) => {
|
||||
if (file instanceof File && uploadInput) {
|
||||
if (kbFileTypeVerification(file)) {
|
||||
return;
|
||||
}
|
||||
uploadKnowledgeBase({ file })
|
||||
.then((result: any) => {
|
||||
if (result.success) {
|
||||
MessagePlugin.info("上传成功!");
|
||||
getKnowled();
|
||||
} else {
|
||||
// 改进错误信息提取逻辑
|
||||
let errorMessage = "上传失败!";
|
||||
|
||||
// 优先从 error 对象中获取错误信息
|
||||
if (result.error && result.error.message) {
|
||||
errorMessage = result.error.message;
|
||||
} else if (result.message) {
|
||||
errorMessage = result.message;
|
||||
}
|
||||
|
||||
// 检查错误码,如果是重复文件则显示特定提示
|
||||
if (result.code === 'duplicate_file' || (result.error && result.error.code === 'duplicate_file')) {
|
||||
errorMessage = "文件已存在";
|
||||
}
|
||||
|
||||
MessagePlugin.error(errorMessage);
|
||||
}
|
||||
uploadInput.value.value = "";
|
||||
})
|
||||
.catch((err: any) => {
|
||||
// 改进 catch 中的错误处理
|
||||
let errorMessage = "上传失败!";
|
||||
|
||||
if (err.code === 'duplicate_file') {
|
||||
errorMessage = "文件已存在";
|
||||
} else if (err.error && err.error.message) {
|
||||
errorMessage = err.error.message;
|
||||
} else if (err.message) {
|
||||
errorMessage = err.message;
|
||||
}
|
||||
|
||||
MessagePlugin.error(errorMessage);
|
||||
uploadInput.value.value = "";
|
||||
});
|
||||
} else {
|
||||
MessagePlugin.error("file文件类型错误!");
|
||||
const requestMethod = (file: any, uploadInput: any) => {
|
||||
if (!(file instanceof File) || !uploadInput) {
|
||||
MessagePlugin.error("文件类型错误!");
|
||||
return;
|
||||
}
|
||||
|
||||
if (kbFileTypeVerification(file)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// 获取当前知识库ID
|
||||
let currentKbId: string | undefined = (route.params as any)?.kbId as string;
|
||||
if (!currentKbId && typeof window !== 'undefined') {
|
||||
const match = window.location.pathname.match(/knowledge-bases\/([^/]+)/);
|
||||
if (match?.[1]) currentKbId = match[1];
|
||||
}
|
||||
if (!currentKbId) {
|
||||
currentKbId = knowledgeBaseId;
|
||||
}
|
||||
if (!currentKbId) {
|
||||
MessagePlugin.error("缺少知识库ID");
|
||||
return;
|
||||
}
|
||||
|
||||
uploadKnowledgeFile(currentKbId, { file })
|
||||
.then((result: any) => {
|
||||
if (result.success) {
|
||||
MessagePlugin.info("上传成功!");
|
||||
getKnowled({ page: 1, page_size: 35 }, currentKbId);
|
||||
} else {
|
||||
const errorMessage = result.error?.message || result.message || "上传失败!";
|
||||
MessagePlugin.error(result.code === 'duplicate_file' ? "文件已存在" : errorMessage);
|
||||
}
|
||||
uploadInput.value.value = "";
|
||||
})
|
||||
.catch((err: any) => {
|
||||
const errorMessage = err.error?.message || err.message || "上传失败!";
|
||||
MessagePlugin.error(err.code === 'duplicate_file' ? "文件已存在" : errorMessage);
|
||||
uploadInput.value.value = "";
|
||||
});
|
||||
};
|
||||
const getCardDetails = (item) => {
|
||||
const getCardDetails = (item: any) => {
|
||||
Object.assign(details, {
|
||||
title: "",
|
||||
time: "",
|
||||
@@ -129,7 +123,7 @@ export default function () {
|
||||
getKnowledgeDetails(item.id)
|
||||
.then((result: any) => {
|
||||
if (result.success && result.data) {
|
||||
let { data } = result;
|
||||
const { data } = result;
|
||||
Object.assign(details, {
|
||||
title: data.file_name,
|
||||
time: formatStringDate(new Date(data.updated_at)),
|
||||
@@ -137,15 +131,16 @@ export default function () {
|
||||
});
|
||||
}
|
||||
})
|
||||
.catch((err) => {});
|
||||
getfDetails(item.id, 1);
|
||||
.catch(() => {});
|
||||
getfDetails(item.id, 1);
|
||||
};
|
||||
const getfDetails = (id, page) => {
|
||||
|
||||
const getfDetails = (id: string, page: number) => {
|
||||
getKnowledgeDetailsCon(id, page)
|
||||
.then((result: any) => {
|
||||
if (result.success && result.data) {
|
||||
let { data, total: totalResult } = result;
|
||||
if (page == 1) {
|
||||
const { data, total: totalResult } = result;
|
||||
if (page === 1) {
|
||||
details.md = data;
|
||||
} else {
|
||||
details.md.push(...data);
|
||||
@@ -153,7 +148,7 @@ export default function () {
|
||||
details.total = totalResult;
|
||||
}
|
||||
})
|
||||
.catch((err) => {});
|
||||
.catch(() => {});
|
||||
};
|
||||
return {
|
||||
cardList,
|
||||
|
||||
24
frontend/src/i18n/index.ts
Normal file
@@ -0,0 +1,24 @@
import { createI18n } from 'vue-i18n'
import zhCN from './locales/zh-CN.ts'
import ruRU from './locales/ru-RU.ts'
import enUS from './locales/en-US.ts'

const messages = {
'zh-CN': zhCN,
'en-US': enUS,
'ru-RU': ruRU
}

// Read the saved language from localStorage, defaulting to Chinese
const savedLocale = localStorage.getItem('locale') || 'zh-CN'
console.log('i18n инициализация с языком:', savedLocale)

const i18n = createI18n({
legacy: false,
locale: savedLocale,
fallbackLocale: 'zh-CN',
globalInjection: true,
messages
})

export default i18n
553
frontend/src/i18n/locales/en-US.ts
Normal file
@@ -0,0 +1,553 @@
|
||||
export default {
|
||||
menu: {
|
||||
knowledgeBase: 'Knowledge Base',
|
||||
chat: 'Chat',
|
||||
createChat: 'Create Chat',
|
||||
tenant: 'Account Info',
|
||||
settings: 'System Settings',
|
||||
logout: 'Logout',
|
||||
uploadKnowledge: 'Upload Knowledge',
|
||||
deleteRecord: 'Delete Record',
|
||||
newSession: 'New Chat',
|
||||
confirmLogout: 'Are you sure you want to logout?',
|
||||
systemInfo: 'System Information'
|
||||
},
|
||||
knowledgeBase: {
|
||||
title: 'Knowledge Base',
|
||||
list: 'Knowledge Base List',
|
||||
detail: 'Knowledge Base Details',
|
||||
create: 'Create Knowledge Base',
|
||||
edit: 'Edit Knowledge Base',
|
||||
delete: 'Delete Knowledge Base',
|
||||
name: 'Name',
|
||||
description: 'Description',
|
||||
files: 'Files',
|
||||
settings: 'Settings',
|
||||
upload: 'Upload File',
|
||||
uploadSuccess: 'File uploaded successfully!',
|
||||
uploadFailed: 'File upload failed!',
|
||||
fileExists: 'File already exists',
|
||||
notInitialized: 'Knowledge base is not initialized. Please configure models in settings before uploading files',
|
||||
getInfoFailed: 'Failed to get knowledge base information, file upload is not possible',
|
||||
missingId: 'Knowledge base ID is missing',
|
||||
deleteFailed: 'Delete failed. Please try again later!',
|
||||
createKnowledgeBase: 'Create Knowledge Base',
|
||||
knowledgeBaseName: 'Knowledge Base Name',
|
||||
enterName: 'Enter knowledge base name',
|
||||
embeddingModel: 'Embedding Model',
|
||||
selectEmbeddingModel: 'Select embedding model',
|
||||
summaryModel: 'Summary Model',
|
||||
selectSummaryModel: 'Select summary model',
|
||||
rerankModel: 'Rerank Model',
|
||||
selectRerankModel: 'Select rerank model (optional)',
|
||||
createSuccess: 'Knowledge base created successfully',
|
||||
createFailed: 'Failed to create knowledge base',
|
||||
updateSuccess: 'Knowledge base updated successfully',
|
||||
updateFailed: 'Failed to update knowledge base',
|
||||
deleteSuccess: 'Knowledge base deleted successfully',
|
||||
deleteConfirm: 'Are you sure you want to delete this knowledge base?',
|
||||
fileName: 'File Name',
|
||||
fileSize: 'File Size',
|
||||
uploadTime: 'Upload Time',
|
||||
status: 'Status',
|
||||
actions: 'Actions',
|
||||
processing: 'Processing',
|
||||
completed: 'Completed',
|
||||
failed: 'Failed',
|
||||
noFiles: 'No files',
|
||||
dragFilesHere: 'Drag files here or',
|
||||
clickToUpload: 'click to upload',
|
||||
supportedFormats: 'Supported formats',
|
||||
maxFileSize: 'Max file size',
|
||||
viewDetails: 'View Details',
|
||||
downloadFile: 'Download File',
|
||||
deleteFile: 'Delete File',
|
||||
confirmDeleteFile: 'Are you sure you want to delete this file?',
|
||||
totalFiles: 'Total files',
|
||||
totalSize: 'Total size',
|
||||
// Additional translations for KnowledgeBase.vue
|
||||
newSession: 'New Chat',
|
||||
deleteDocument: 'Delete Document',
|
||||
parsingFailed: 'Parsing failed',
|
||||
parsingInProgress: 'Parsing...',
|
||||
deleteConfirmation: 'Delete Confirmation',
|
||||
confirmDeleteDocument: 'Confirm deletion of document "{fileName}", recovery will be impossible after deletion',
|
||||
cancel: 'Cancel',
|
||||
confirmDelete: 'Confirm Delete',
|
||||
selectKnowledgeBaseFirst: 'Please select a knowledge base first',
|
||||
sessionCreationFailed: 'Failed to create chat session',
|
||||
sessionCreationError: 'Chat session creation error',
|
||||
settingsParsingFailed: 'Failed to parse settings',
|
||||
fileUploadEventReceived: 'File upload event received, uploaded knowledge base ID: {uploadedKbId}, current knowledge base ID: {currentKbId}',
|
||||
matchingKnowledgeBase: 'Matching knowledge base, starting file list update',
|
||||
routeParamChange: 'Route parameter change, re-fetching knowledge base content',
|
||||
fileUploadEventListening: 'Listening for file upload events',
|
||||
apiCallKnowledgeFiles: 'Direct API call to get knowledge base file list',
|
||||
responseInterceptorData: 'Since the response interceptor has already returned data, result is part of the response data',
|
||||
hookProcessing: 'Processing according to useKnowledgeBase hook method',
|
||||
errorHandling: 'Error handling',
|
||||
priorityCurrentPageKbId: 'Priority to use knowledge base ID of current page',
|
||||
fallbackLocalStorageKbId: 'If current page has no knowledge base ID, attempt to get knowledge base ID from settings in localStorage',
|
||||
// Additional translations for KnowledgeBaseList.vue
|
||||
createNewKnowledgeBase: 'Create Knowledge Base',
|
||||
uninitializedWarning: 'Some knowledge bases are not initialized, you need to configure model information in settings first to add knowledge documents',
|
||||
initializedStatus: 'Initialized',
|
||||
notInitializedStatus: 'Not Initialized',
|
||||
needSettingsFirst: 'You need to configure model information in settings first to add knowledge',
|
||||
documents: 'Documents',
|
||||
configureModelsFirst: 'Please configure model information in settings first',
|
||||
confirmDeleteKnowledgeBase: 'Confirm deletion of this knowledge base?',
|
||||
createKnowledgeBaseDialog: 'Create Knowledge Base',
|
||||
enterNameKb: 'Enter name',
|
||||
enterDescriptionKb: 'Enter description',
|
||||
createKb: 'Create',
|
||||
deleted: 'Deleted',
|
||||
deleteFailedKb: 'Delete failed',
|
||||
noDescription: 'No description',
|
||||
emptyKnowledgeDragDrop: 'Knowledge is empty, drag and drop to upload',
|
||||
pdfDocFormat: 'pdf, doc format files, max 10M',
|
||||
textMarkdownFormat: 'text, markdown format files, max 200K',
|
||||
dragFileNotText: 'Please drag files instead of text or links'
|
||||
},
|
||||
chat: {
|
||||
title: 'Chat',
|
||||
newChat: 'New Chat',
|
||||
inputPlaceholder: 'Enter your message...',
|
||||
send: 'Send',
|
||||
thinking: 'Thinking...',
|
||||
regenerate: 'Regenerate',
|
||||
copy: 'Copy',
|
||||
delete: 'Delete',
|
||||
reference: 'Reference',
|
||||
noMessages: 'No messages',
|
||||
// Additional translations for chat components
|
||||
waitingForAnswer: 'Waiting for answer...',
|
||||
cannotAnswer: 'Sorry, I cannot answer this question.',
|
||||
summarizingAnswer: 'Summarizing answer...',
|
||||
loading: 'Loading...',
|
||||
enterDescription: 'Enter description',
|
||||
referencedContent: '{count} related materials used',
|
||||
deepThinking: 'Deep thinking completed',
|
||||
knowledgeBaseQandA: 'Knowledge Base Q&A',
|
||||
askKnowledgeBase: 'Ask the knowledge base',
|
||||
sourcesCount: '{count} sources',
|
||||
pleaseEnterContent: 'Please enter content!',
|
||||
pleaseUploadKnowledgeBase: 'Please upload knowledge base first!',
|
||||
replyingPleaseWait: 'Replying, please try again later!',
|
||||
createSessionFailed: 'Failed to create session',
|
||||
createSessionError: 'Session creation error',
|
||||
unableToGetKnowledgeBaseId: 'Unable to get knowledge base ID'
|
||||
},
|
||||
settings: {
|
||||
title: 'Settings',
|
||||
system: 'System Settings',
|
||||
systemConfig: 'System Configuration',
|
||||
knowledgeBaseSettings: 'Knowledge Base Settings',
|
||||
configureKbModels: 'Configure models and document splitting parameters for this knowledge base',
|
||||
manageSystemModels: 'Manage and update system models and service configurations',
|
||||
basicInfo: 'Basic Information',
|
||||
documentSplitting: 'Document Splitting',
|
||||
apiEndpoint: 'API Endpoint',
|
||||
enterApiEndpoint: 'Enter API endpoint, e.g.: http://localhost',
|
||||
enterApiKey: 'Enter API key',
|
||||
enterKnowledgeBaseId: 'Enter knowledge base ID',
|
||||
saveConfig: 'Save Configuration',
|
||||
reset: 'Reset',
|
||||
configSaved: 'Configuration saved successfully',
|
||||
enterApiEndpointRequired: 'Enter API endpoint',
|
||||
enterApiKeyRequired: 'Enter API key',
|
||||
enterKnowledgeBaseIdRequired: 'Enter knowledge base ID',
|
||||
name: 'Name',
|
||||
enterName: 'Enter name',
|
||||
description: 'Description',
|
||||
chunkSize: 'Chunk Size',
|
||||
chunkOverlap: 'Chunk Overlap',
|
||||
save: 'Save',
|
||||
saving: 'Saving...',
|
||||
saveSuccess: 'Saved successfully',
|
||||
saveFailed: 'Failed to save',
|
||||
model: 'Model',
|
||||
llmModel: 'LLM Model',
|
||||
embeddingModel: 'Embedding Model',
|
||||
rerankModel: 'Rerank Model',
|
||||
vlmModel: 'Multimodal Model',
|
||||
modelName: 'Model Name',
|
||||
modelUrl: 'Model URL',
|
||||
apiKey: 'API Key',
|
||||
cancel: 'Cancel',
|
||||
saveFailedSettings: 'Failed to save settings',
|
||||
enterNameRequired: 'Enter name'
|
||||
},
|
||||
initialization: {
|
||||
title: 'Initialization',
|
||||
welcome: 'Welcome to WeKnora',
|
||||
description: 'Please configure the system before starting',
|
||||
step1: 'Step 1: Configure LLM Model',
|
||||
step2: 'Step 2: Configure Embedding Model',
|
||||
step3: 'Step 3: Configure Additional Models',
|
||||
complete: 'Complete Initialization',
|
||||
skip: 'Skip',
|
||||
next: 'Next',
|
||||
previous: 'Previous',
|
||||
// Ollama service
|
||||
ollamaServiceStatus: 'Ollama Service Status',
|
||||
refreshStatus: 'Refresh Status',
|
||||
ollamaServiceAddress: 'Ollama Service Address',
|
||||
notConfigured: 'Not Configured',
|
||||
notRunning: 'Not Running',
|
||||
normal: 'Normal',
|
||||
installedModels: 'Installed Models',
|
||||
none: 'None temporarily',
|
||||
// Knowledge base
|
||||
knowledgeBaseInfo: 'Knowledge Base Information',
|
||||
knowledgeBaseName: 'Knowledge Base Name',
|
||||
knowledgeBaseNamePlaceholder: 'Enter knowledge base name',
|
||||
knowledgeBaseDescription: 'Knowledge Base Description',
|
||||
knowledgeBaseDescriptionPlaceholder: 'Enter knowledge base description',
|
||||
// LLM model
|
||||
llmModelConfig: 'LLM Large Language Model Configuration',
|
||||
modelSource: 'Model Source',
|
||||
local: 'Ollama (Local)',
|
||||
remote: 'Remote API (Remote)',
|
||||
modelName: 'Model Name',
|
||||
modelNamePlaceholder: 'E.g.: qwen3:0.6b',
|
||||
baseUrl: 'Base URL',
|
||||
baseUrlPlaceholder: 'E.g.: https://api.openai.com/v1, remove /chat/completions from the end of URL',
|
||||
apiKey: 'API Key (Optional)',
|
||||
apiKeyPlaceholder: 'Enter API Key (Optional)',
|
||||
downloadModel: 'Download Model',
|
||||
installed: 'Installed',
|
||||
notInstalled: 'Not Installed',
|
||||
notChecked: 'Not Checked',
|
||||
checkConnection: 'Check Connection',
|
||||
connectionNormal: 'Connection Normal',
|
||||
connectionFailed: 'Connection Failed',
|
||||
checkingConnection: 'Checking Connection',
|
||||
// Embedding model
|
||||
embeddingModelConfig: 'Embedding Model Configuration',
|
||||
embeddingWarning: 'Knowledge base already has files, cannot change embedding model configuration',
|
||||
dimension: 'Dimension',
|
||||
dimensionPlaceholder: 'Enter vector dimension',
|
||||
detectDimension: 'Detect Dimension',
|
||||
// Rerank model
|
||||
rerankModelConfig: 'Rerank Model Configuration',
|
||||
enableRerank: 'Enable Rerank Model',
|
||||
// Multimodal settings
|
||||
multimodalConfig: 'Multimodal Configuration',
|
||||
enableMultimodal: 'Enable image information extraction',
|
||||
visualLanguageModelConfig: 'Visual Language Model Configuration',
|
||||
interfaceType: 'Interface Type',
|
||||
openaiCompatible: 'OpenAI Compatible Interface',
|
||||
// Storage settings
|
||||
storageServiceConfig: 'Storage Service Configuration',
|
||||
storageType: 'Storage Type',
|
||||
bucketName: 'Bucket Name',
|
||||
bucketNamePlaceholder: 'Enter Bucket name',
|
||||
pathPrefix: 'Path Prefix',
|
||||
pathPrefixPlaceholder: 'E.g.: images',
|
||||
secretId: 'Secret ID',
|
||||
secretIdPlaceholder: 'Enter COS Secret ID',
|
||||
secretKey: 'Secret Key',
|
||||
secretKeyPlaceholder: 'Enter COS Secret Key',
|
||||
region: 'Region',
|
||||
regionPlaceholder: 'E.g.: ap-beijing',
|
||||
appId: 'App ID',
|
||||
appIdPlaceholder: 'Enter App ID',
|
||||
// Multimodal function testing
|
||||
functionTest: 'Function Test',
|
||||
testDescription: 'Upload an image to test the model\'s image description and text recognition functions',
|
||||
selectImage: 'Select Image',
|
||||
startTest: 'Start Test',
|
||||
testResult: 'Test Result',
|
||||
imageDescription: 'Image Description:',
|
||||
textRecognition: 'Text Recognition:',
|
||||
processingTime: 'Processing Time:',
|
||||
testFailed: 'Test Failed',
|
||||
multimodalProcessingFailed: 'Multimodal processing failed',
|
||||
// Document splitting
|
||||
documentSplittingConfig: 'Document Splitting Configuration',
|
||||
splittingStrategy: 'Splitting Strategy',
|
||||
balancedMode: 'Balanced Mode',
|
||||
balancedModeDesc: 'Chunk size: 1000 / Overlap: 200',
|
||||
precisionMode: 'Precision Mode',
|
||||
precisionModeDesc: 'Chunk size: 512 / Overlap: 100',
|
||||
contextMode: 'Context Mode',
|
||||
contextModeDesc: 'Chunk size: 2048 / Overlap: 400',
|
||||
custom: 'Custom',
|
||||
customDesc: 'Configure parameters manually',
|
||||
chunkSize: 'Chunk Size',
|
||||
chunkOverlap: 'Chunk Overlap',
|
||||
separatorSettings: 'Separator Settings',
|
||||
selectOrCustomSeparators: 'Select or customize separators',
|
||||
characters: 'characters',
|
||||
separatorParagraph: 'Paragraph separator (\\n\\n)',
|
||||
separatorNewline: 'Newline (\\n)',
|
||||
separatorPeriod: 'Period (。)',
|
||||
separatorExclamation: 'Exclamation mark (!)',
|
||||
separatorQuestion: 'Question mark (?)',
|
||||
separatorSemicolon: 'Semicolon (;)',
|
||||
separatorChineseSemicolon: 'Chinese semicolon (;)',
|
||||
separatorComma: 'Comma (,)',
|
||||
separatorChineseComma: 'Chinese comma (,)',
|
||||
// Entity and relation extraction
|
||||
entityRelationExtraction: 'Entity and Relation Extraction',
|
||||
enableEntityRelationExtraction: 'Enable entity and relation extraction',
|
||||
relationTypeConfig: 'Relation Type Configuration',
|
||||
relationType: 'Relation Type',
|
||||
generateRandomTags: 'Generate Random Tags',
|
||||
completeModelConfig: 'Please complete model configuration',
|
||||
systemWillExtract: 'The system will extract corresponding entities and relations from the text according to the selected relation types',
|
||||
extractionExample: 'Extraction Example',
|
||||
sampleText: 'Sample Text',
|
||||
sampleTextPlaceholder: 'Enter text for analysis, e.g.: "Red Mansion", also known as "Dream of the Red Chamber", is one of the four great classical novels of Chinese literature, written by Cao Xueqin during the Qing Dynasty...',
|
||||
generateRandomText: 'Generate Random Text',
|
||||
entityList: 'Entity List',
|
||||
nodeName: 'Node Name',
|
||||
nodeNamePlaceholder: 'Node name',
|
||||
addAttribute: 'Add Attribute',
|
||||
attributeValue: 'Attribute Value',
|
||||
attributeValuePlaceholder: 'Attribute value',
|
||||
addEntity: 'Add Entity',
|
||||
completeEntityInfo: 'Please complete entity information',
|
||||
relationConnection: 'Relation Connection',
|
||||
selectEntity: 'Select Entity',
|
||||
addRelation: 'Add Relation',
|
||||
completeRelationInfo: 'Please complete relation information',
|
||||
startExtraction: 'Start Extraction',
|
||||
extracting: 'Extracting...',
|
||||
defaultExample: 'Default Example',
|
||||
clearExample: 'Clear Example',
|
||||
// Buttons and messages
|
||||
updateKnowledgeBaseSettings: 'Update Knowledge Base Settings',
|
||||
updateConfigInfo: 'Update Configuration Information',
|
||||
completeConfig: 'Complete Configuration',
|
||||
waitForDownloads: 'Please wait for all Ollama models to finish downloading before updating configuration',
|
||||
completeModelConfigInfo: 'Please complete model configuration information',
|
||||
knowledgeBaseIdMissing: 'Knowledge base ID is missing',
|
||||
knowledgeBaseSettingsUpdateSuccess: 'Knowledge base settings updated successfully',
|
||||
configUpdateSuccess: 'Configuration updated successfully',
|
||||
systemInitComplete: 'System initialization completed',
|
||||
operationFailed: 'Operation failed',
|
||||
updateKnowledgeBaseInfoFailed: 'Failed to update knowledge base basic information',
|
||||
knowledgeBaseIdMissingCannotSave: 'Knowledge base ID is missing, cannot save configuration',
|
||||
operationFailedCheckNetwork: 'Operation failed, please check network connection',
|
||||
imageUploadSuccess: 'Image uploaded successfully, testing can begin',
|
||||
multimodalConfigIncomplete: 'Multimodal configuration incomplete, please complete multimodal configuration before uploading images',
|
||||
pleaseSelectImage: 'Please select an image',
|
||||
multimodalTestSuccess: 'Multimodal test successful',
|
||||
multimodalTestFailed: 'Multimodal test failed',
|
||||
pleaseEnterSampleText: 'Please enter sample text',
|
||||
pleaseEnterRelationType: 'Please enter relation type',
|
||||
pleaseEnterLLMModelConfig: 'Please enter LLM large language model configuration',
|
||||
noValidNodesExtracted: 'No valid nodes extracted',
|
||||
noValidRelationsExtracted: 'No valid relations extracted',
|
||||
extractionFailedCheckNetwork: 'Extraction failed, please check network or text format',
|
||||
generateFailedRetry: 'Generation failed, please try again',
|
||||
pleaseCheckForm: 'Please check form correctness',
|
||||
detectionSuccessful: 'Detection successful, dimension automatically filled as',
|
||||
detectionFailed: 'Detection failed',
|
||||
detectionFailedCheckConfig: 'Detection failed, please check configuration',
|
||||
modelDownloadSuccess: 'Model downloaded successfully',
|
||||
modelDownloadFailed: 'Model download failed',
|
||||
downloadStartFailed: 'Download start failed',
|
||||
queryProgressFailed: 'Progress query failed',
|
||||
checkOllamaStatusFailed: 'Ollama status check failed',
|
||||
getKnowledgeBaseInfoFailed: 'Failed to get knowledge base information',
|
||||
textRelationExtractionFailed: 'Text relation extraction failed',
|
||||
// Validation
|
||||
pleaseEnterKnowledgeBaseName: 'Please enter knowledge base name',
|
||||
knowledgeBaseNameLength: 'Knowledge base name length must be 1-50 characters',
|
||||
knowledgeBaseDescriptionLength: 'Knowledge base description cannot exceed 200 characters',
|
||||
pleaseEnterLLMModelName: 'Please enter LLM model name',
|
||||
pleaseEnterBaseURL: 'Please enter BaseURL',
|
||||
pleaseEnterEmbeddingModelName: 'Please enter embedding model name',
|
||||
pleaseEnterEmbeddingDimension: 'Please enter embedding dimension',
|
||||
dimensionMustBeInteger: 'Dimension must be a valid integer, usually 768, 1024, 1536, 3584, etc.',
|
||||
pleaseEnterTextContent: 'Please enter text content',
|
||||
textContentMinLength: 'Text content must contain at least 10 characters',
|
||||
pleaseEnterValidTag: 'Please enter a valid tag',
|
||||
tagAlreadyExists: 'This tag already exists',
|
||||
// Additional translations for InitializationContent.vue
|
||||
checkFailed: 'Check failed',
|
||||
startingDownload: 'Starting download...',
|
||||
downloadStarted: 'Download started',
|
||||
model: 'Model',
|
||||
startModelDownloadFailed: 'Failed to start model download',
|
||||
downloadCompleted: 'Download completed',
|
||||
downloadFailed: 'Download failed',
|
||||
knowledgeBaseSettingsModeMissingId: 'Knowledge base settings mode missing ID',
|
||||
completeEmbeddingConfig: 'Please complete embedding configuration first',
|
||||
detectionSuccess: 'Detection successful,',
|
||||
dimensionAutoFilled: 'dimension automatically filled:',
|
||||
checkFormCorrectness: 'Please check form correctness',
|
||||
systemInitializationCompleted: 'System initialization completed',
|
||||
generationFailedRetry: 'Generation failed, please try again',
|
||||
chunkSizeDesc: 'Size of each text chunk. Larger chunks preserve more context but may reduce search accuracy.',
|
||||
chunkOverlapDesc: 'Number of characters overlapping between adjacent chunks. Helps maintain context at chunk boundaries.',
|
||||
selectRelationType: 'Select relation type'
|
||||
},
|
||||
auth: {
|
||||
login: 'Login',
|
||||
logout: 'Logout',
|
||||
username: 'Username',
|
||||
email: 'Email',
|
||||
password: 'Password',
|
||||
confirmPassword: 'Confirm Password',
|
||||
rememberMe: 'Remember Me',
|
||||
forgotPassword: 'Forgot Password?',
|
||||
loginSuccess: 'Login successful!',
|
||||
loginFailed: 'Login failed',
|
||||
loggingIn: 'Logging in...',
|
||||
register: 'Register',
|
||||
registering: 'Registering...',
|
||||
createAccount: 'Create Account',
|
||||
haveAccount: 'Already have an account?',
|
||||
noAccount: 'Don\'t have an account?',
|
||||
backToLogin: 'Back to Login',
|
||||
registerNow: 'Register Now',
|
||||
registerSuccess: 'Registration successful! The system has created an exclusive tenant for you, please login',
|
||||
registerFailed: 'Registration failed',
|
||||
subtitle: 'Document understanding and semantic search framework based on large models',
|
||||
registerSubtitle: 'The system will create an exclusive tenant for you after registration',
|
||||
emailPlaceholder: 'Enter email address',
|
||||
passwordPlaceholder: 'Enter password (8-32 characters, including letters and numbers)',
|
||||
confirmPasswordPlaceholder: 'Enter password again',
|
||||
usernamePlaceholder: 'Enter username',
|
||||
emailRequired: 'Enter email address',
|
||||
emailInvalid: 'Enter correct email format',
|
||||
passwordRequired: 'Enter password',
|
||||
passwordMinLength: 'Password must be at least 8 characters',
|
||||
passwordMaxLength: 'Password cannot exceed 32 characters',
|
||||
passwordMustContainLetter: 'Password must contain letters',
|
||||
passwordMustContainNumber: 'Password must contain numbers',
|
||||
usernameRequired: 'Enter username',
|
||||
usernameMinLength: 'Username must be at least 2 characters',
|
||||
usernameMaxLength: 'Username cannot exceed 20 characters',
|
||||
usernameInvalid: 'Username can only contain letters, numbers, underscores and Chinese characters',
|
||||
confirmPasswordRequired: 'Confirm password',
|
||||
passwordMismatch: 'Entered passwords do not match',
|
||||
loginError: 'Login error, please check email or password',
|
||||
loginErrorRetry: 'Login error, please try again later',
|
||||
registerError: 'Registration error, please try again later',
|
||||
forgotPasswordNotAvailable: 'Password recovery function is temporarily unavailable, please contact administrator'
|
||||
},
|
||||
common: {
|
||||
confirm: 'Confirm',
|
||||
cancel: 'Cancel',
|
||||
save: 'Save',
|
||||
delete: 'Delete',
|
||||
edit: 'Edit',
|
||||
create: 'Create',
|
||||
search: 'Search',
|
||||
filter: 'Filter',
|
||||
export: 'Export',
|
||||
import: 'Import',
|
||||
upload: 'Upload',
|
||||
download: 'Download',
|
||||
refresh: 'Refresh',
|
||||
loading: 'Loading...',
|
||||
noData: 'No data',
|
||||
error: 'Error',
|
||||
success: 'Success',
|
||||
warning: 'Warning',
|
||||
info: 'Information',
|
||||
yes: 'Yes',
|
||||
no: 'No',
|
||||
ok: 'OK',
|
||||
close: 'Close',
|
||||
back: 'Back',
|
||||
next: 'Next',
|
||||
finish: 'Finish',
|
||||
all: 'All',
|
||||
reset: 'Reset',
|
||||
clear: 'Clear'
|
||||
},
|
||||
file: {
|
||||
upload: 'Upload File',
|
||||
uploadSuccess: 'File uploaded successfully',
|
||||
uploadFailed: 'File upload failed',
|
||||
delete: 'Delete File',
|
||||
deleteSuccess: 'File deleted successfully',
|
||||
deleteFailed: 'File deletion failed',
|
||||
download: 'Download File',
|
||||
preview: 'Preview',
|
||||
unsupportedFormat: 'Unsupported file format',
|
||||
maxSizeExceeded: 'Maximum file size exceeded',
|
||||
selectFile: 'Select File'
|
||||
},
|
||||
tenant: {
|
||||
title: 'Tenant Information',
|
||||
name: 'Tenant Name',
|
||||
id: 'Tenant ID',
|
||||
createdAt: 'Created At',
|
||||
updatedAt: 'Updated At',
|
||||
status: 'Status',
|
||||
active: 'Active',
|
||||
inactive: 'Inactive',
|
||||
// Additional translations for TenantInfo.vue
|
||||
systemInfo: 'System Information',
|
||||
viewSystemInfo: 'View system version and user account configuration information',
|
||||
version: 'Version',
|
||||
buildTime: 'Build Time',
|
||||
goVersion: 'Go Version',
|
||||
userInfo: 'User Information',
|
||||
userId: 'User ID',
|
||||
username: 'Username',
|
||||
email: 'Email',
|
||||
tenantInfo: 'Tenant Information',
|
||||
tenantId: 'Tenant ID',
|
||||
tenantName: 'Tenant Name',
|
||||
description: 'Description',
|
||||
business: 'Business',
|
||||
noDescription: 'No description',
|
||||
noBusiness: 'None',
|
||||
statusActive: 'Active',
|
||||
statusInactive: 'Not activated',
|
||||
statusSuspended: 'Suspended',
|
||||
statusUnknown: 'Unknown',
|
||||
apiKey: 'API Key',
|
||||
keepApiKeySafe: 'Please keep your API Key safe, do not disclose it in public places or code repositories',
|
||||
storageInfo: 'Storage Information',
|
||||
storageQuota: 'Storage Quota',
|
||||
used: 'Used',
|
||||
usage: 'Usage',
|
||||
apiDevDocs: 'API Developer Documentation',
|
||||
useApiKey: 'Use your API Key to start development, view complete API documentation and code examples.',
|
||||
viewApiDoc: 'View API Documentation',
|
||||
loadingAccountInfo: 'Loading account information...',
|
||||
loadFailed: 'Load failed',
|
||||
retry: 'Retry',
|
||||
apiKeyCopied: 'API Key copied to clipboard',
|
||||
unknown: 'Unknown',
|
||||
formatError: 'Format error'
|
||||
},
|
||||
error: {
|
||||
network: 'Network error',
|
||||
server: 'Server error',
|
||||
notFound: 'Not found',
|
||||
unauthorized: 'Unauthorized',
|
||||
forbidden: 'Access forbidden',
|
||||
unknown: 'Unknown error',
|
||||
tryAgain: 'Please try again'
|
||||
},
|
||||
model: {
|
||||
llmModel: 'LLM Model',
|
||||
embeddingModel: 'Embedding Model',
|
||||
rerankModel: 'Rerank Model',
|
||||
vlmModel: 'Multimodal Model',
|
||||
modelName: 'Model Name',
|
||||
modelProvider: 'Model Provider',
|
||||
modelUrl: 'Model URL',
|
||||
apiKey: 'API Key',
|
||||
testConnection: 'Test Connection',
|
||||
connectionSuccess: 'Connection successful',
|
||||
connectionFailed: 'Connection failed',
|
||||
dimension: 'Dimension',
|
||||
maxTokens: 'Max Tokens',
|
||||
temperature: 'Temperature',
|
||||
topP: 'Top P',
|
||||
selectModel: 'Select Model',
|
||||
customModel: 'Custom Model',
|
||||
builtinModel: 'Built-in Model'
|
||||
}
|
||||
}
|
||||
553
frontend/src/i18n/locales/ru-RU.ts
Normal file
@@ -0,0 +1,553 @@
|
||||
export default {
|
||||
menu: {
|
||||
knowledgeBase: 'База знаний',
|
||||
chat: 'Диалог',
|
||||
createChat: 'Создать диалог',
|
||||
tenant: 'Информация об аккаунте',
|
||||
settings: 'Настройки системы',
|
||||
logout: 'Выход',
|
||||
uploadKnowledge: 'Загрузить знания',
|
||||
deleteRecord: 'Удалить запись',
|
||||
newSession: 'Новый диалог',
|
||||
confirmLogout: 'Вы уверены, что хотите выйти?',
|
||||
systemInfo: 'Информация о системе'
|
||||
},
|
||||
knowledgeBase: {
|
||||
title: 'База знаний',
|
||||
list: 'Список баз знаний',
|
||||
detail: 'Детали базы знаний',
|
||||
create: 'Создать базу знаний',
|
||||
edit: 'Редактировать базу знаний',
|
||||
delete: 'Удалить базу знаний',
|
||||
name: 'Название',
|
||||
description: 'Описание',
|
||||
files: 'Файлы',
|
||||
settings: 'Настройки',
|
||||
upload: 'Загрузить файл',
|
||||
uploadSuccess: 'Файл успешно загружен!',
|
||||
uploadFailed: 'Ошибка загрузки файла!',
|
||||
fileExists: 'Файл уже существует',
|
||||
notInitialized: 'База знаний не инициализирована. Пожалуйста, настройте модели в разделе настроек перед загрузкой файлов',
|
||||
getInfoFailed: 'Не удалось получить информацию о базе знаний, загрузка файла невозможна',
|
||||
missingId: 'Отсутствует ID базы знаний',
|
||||
deleteFailed: 'Не удалось удалить. Пожалуйста, попробуйте позже!',
|
||||
createKnowledgeBase: 'Создать базу знаний',
|
||||
knowledgeBaseName: 'Название базы знаний',
|
||||
enterName: 'Введите название базы знаний',
|
||||
embeddingModel: 'Модель встраивания',
|
||||
selectEmbeddingModel: 'Выберите модель встраивания',
|
||||
summaryModel: 'Модель суммаризации',
|
||||
selectSummaryModel: 'Выберите модель суммаризации',
|
||||
rerankModel: 'Модель ранжирования',
|
||||
selectRerankModel: 'Выберите модель ранжирования (опционально)',
|
||||
createSuccess: 'База знаний успешно создана',
|
||||
createFailed: 'Не удалось создать базу знаний',
|
||||
updateSuccess: 'База знаний успешно обновлена',
|
||||
updateFailed: 'Не удалось обновить базу знаний',
|
||||
deleteSuccess: 'База знаний успешно удалена',
|
||||
deleteConfirm: 'Вы уверены, что хотите удалить эту базу знаний?',
|
||||
fileName: 'Имя файла',
|
||||
fileSize: 'Размер файла',
|
||||
uploadTime: 'Время загрузки',
|
||||
status: 'Статус',
|
||||
actions: 'Действия',
|
||||
processing: 'Обработка',
|
||||
completed: 'Завершено',
|
||||
failed: 'Ошибка',
|
||||
noFiles: 'Нет файлов',
|
||||
dragFilesHere: 'Перетащите файлы сюда или',
|
||||
clickToUpload: 'нажмите для загрузки',
|
||||
supportedFormats: 'Поддерживаемые форматы',
|
||||
maxFileSize: 'Макс. размер файла',
|
||||
viewDetails: 'Просмотр деталей',
|
||||
downloadFile: 'Скачать файл',
|
||||
deleteFile: 'Удалить файл',
|
||||
confirmDeleteFile: 'Вы уверены, что хотите удалить этот файл?',
|
||||
totalFiles: 'Всего файлов',
|
||||
totalSize: 'Общий размер',
|
||||
// Дополнительные переводы для KnowledgeBase.vue
|
||||
newSession: 'Новый диалог',
|
||||
deleteDocument: 'Удалить документ',
|
||||
parsingFailed: 'Парсинг не удался',
|
||||
parsingInProgress: 'Парсинг...',
|
||||
deleteConfirmation: 'Подтверждение удаления',
|
||||
confirmDeleteDocument: 'Подтвердить удаление документа "{fileName}", после удаления восстановление невозможно',
|
||||
cancel: 'Отмена',
|
||||
confirmDelete: 'Подтвердить удаление',
|
||||
selectKnowledgeBaseFirst: 'Пожалуйста, сначала выберите базу знаний',
|
||||
sessionCreationFailed: 'Не удалось создать диалог',
|
||||
sessionCreationError: 'Ошибка создания диалога',
|
||||
settingsParsingFailed: 'Не удалось разобрать настройки',
|
||||
fileUploadEventReceived: 'Получено событие загрузки файла, загруженный ID базы знаний: {uploadedKbId}, текущий ID базы знаний: {currentKbId}',
|
||||
matchingKnowledgeBase: 'Совпадающая база знаний, начинаем обновление списка файлов',
|
||||
routeParamChange: 'Изменение параметров маршрута, повторное получение содержимого базы знаний',
|
||||
fileUploadEventListening: 'Прослушивание события загрузки файла',
|
||||
apiCallKnowledgeFiles: 'Прямой вызов API для получения списка файлов базы знаний',
|
||||
responseInterceptorData: 'Поскольку перехватчик ответа уже вернул data, result является частью данных ответа',
|
||||
hookProcessing: 'Обработка в соответствии со способом useKnowledgeBase hook',
|
||||
errorHandling: 'Обработка ошибок',
|
||||
priorityCurrentPageKbId: 'Приоритет использования ID базы знаний текущей страницы',
|
||||
fallbackLocalStorageKbId: 'Если на текущей странице нет ID базы знаний, попытка получить ID базы знаний из настроек в localStorage',
|
||||
// Дополнительные переводы для KnowledgeBaseList.vue
|
||||
createNewKnowledgeBase: 'Создать базу знаний',
|
||||
uninitializedWarning: 'Некоторые базы знаний не инициализированы, необходимо сначала настроить информацию о моделях в настройках, чтобы добавить документы знаний',
|
||||
initializedStatus: 'Инициализирована',
|
||||
notInitializedStatus: 'Не инициализирована',
|
||||
needSettingsFirst: 'Необходимо сначала настроить информацию о моделях в настройках, чтобы добавить знания',
|
||||
documents: 'Документы',
|
||||
configureModelsFirst: 'Пожалуйста, сначала настройте информацию о моделях в настройках',
|
||||
confirmDeleteKnowledgeBase: 'Подтвердить удаление этой базы знаний?',
|
||||
createKnowledgeBaseDialog: 'Создать базу знаний',
|
||||
enterNameKb: 'Введите название',
|
||||
enterDescriptionKb: 'Введите описание',
|
||||
createKb: 'Создать',
|
||||
deleted: 'Удалено',
|
||||
deleteFailedKb: 'Не удалось удалить',
|
||||
noDescription: 'Нет описания',
|
||||
emptyKnowledgeDragDrop: 'База знаний пуста, перетащите файлы для загрузки',
|
||||
pdfDocFormat: 'Файлы pdf, doc формата, не более 10 МБ',
|
||||
textMarkdownFormat: 'Файлы text, markdown формата, не более 200 КБ',
|
||||
dragFileNotText: 'Пожалуйста, перетащите файлы, а не текст или ссылки'
|
||||
},
|
||||
chat: {
|
||||
title: 'Диалог',
|
||||
newChat: 'Новый чат',
|
||||
inputPlaceholder: 'Введите ваше сообщение...',
|
||||
send: 'Отправить',
|
||||
thinking: 'Думаю...',
|
||||
regenerate: 'Сгенерировать заново',
|
||||
copy: 'Копировать',
|
||||
delete: 'Удалить',
|
||||
reference: 'Ссылка',
|
||||
noMessages: 'Нет сообщений',
|
||||
// Дополнительные переводы для компонентов чата
|
||||
waitingForAnswer: 'Ожидание ответа...',
|
||||
cannotAnswer: 'Извините, я не могу ответить на этот вопрос.',
|
||||
summarizingAnswer: 'Подведение итогов ответа...',
|
||||
loading: 'Загрузка...',
|
||||
enterDescription: 'Введите описание',
|
||||
referencedContent: 'Использовано {count} связанных материалов',
|
||||
deepThinking: 'Глубокое мышление завершено',
|
||||
knowledgeBaseQandA: 'Вопросы и ответы на основе базы знаний',
|
||||
askKnowledgeBase: 'Задайте вопрос базе знаний',
|
||||
sourcesCount: '{count} источников',
|
||||
pleaseEnterContent: 'Пожалуйста, введите содержимое!',
|
||||
pleaseUploadKnowledgeBase: 'Пожалуйста, сначала загрузите базу знаний!',
|
||||
replyingPleaseWait: 'Идёт ответ, пожалуйста, попробуйте позже!',
|
||||
createSessionFailed: 'Не удалось создать сеанс',
|
||||
createSessionError: 'Ошибка создания сеанса',
|
||||
unableToGetKnowledgeBaseId: 'Невозможно получить ID базы знаний'
|
||||
},
|
||||
settings: {
|
||||
title: 'Настройки',
|
||||
system: 'Настройки системы',
|
||||
systemConfig: 'Системная конфигурация',
|
||||
knowledgeBaseSettings: 'Настройки базы знаний',
|
||||
configureKbModels: 'Настройка моделей и параметров разделения документов для этой базы знаний',
|
||||
manageSystemModels: 'Управление и обновление системных моделей и конфигураций сервисов',
|
||||
basicInfo: 'Основная информация',
|
||||
documentSplitting: 'Разделение документов',
|
||||
apiEndpoint: 'API конечная точка',
|
||||
enterApiEndpoint: 'Введите API конечную точку, например: http://localhost',
|
||||
enterApiKey: 'Введите API ключ',
|
||||
enterKnowledgeBaseId: 'Введите ID базы знаний',
|
||||
saveConfig: 'Сохранить конфигурацию',
|
||||
reset: 'Сбросить',
|
||||
configSaved: 'Конфигурация сохранена успешно',
|
||||
enterApiEndpointRequired: 'Введите API конечную точку',
|
||||
enterApiKeyRequired: 'Введите API ключ',
|
||||
enterKnowledgeBaseIdRequired: 'Введите ID базы знаний',
|
||||
name: 'Название',
|
||||
enterName: 'Введите название',
|
||||
description: 'Описание',
|
||||
chunkSize: 'Размер блока',
|
||||
chunkOverlap: 'Перекрытие блоков',
|
||||
save: 'Сохранить',
|
||||
saving: 'Сохранение...',
|
||||
saveSuccess: 'Сохранено успешно',
|
||||
saveFailed: 'Не удалось сохранить',
|
||||
model: 'Модель',
|
||||
llmModel: 'LLM модель',
|
||||
embeddingModel: 'Модель встраивания',
|
||||
rerankModel: 'Модель ранжирования',
|
||||
vlmModel: 'Мультимодальная модель',
|
||||
modelName: 'Название модели',
|
||||
modelUrl: 'URL модели',
|
||||
apiKey: 'API ключ',
|
||||
cancel: 'Отмена',
|
||||
saveFailedSettings: 'Не удалось сохранить настройки',
|
||||
enterNameRequired: 'Введите название'
|
||||
},
|
||||
initialization: {
|
||||
title: 'Инициализация',
|
||||
welcome: 'Добро пожаловать в WeKnora',
|
||||
description: 'Пожалуйста, настройте систему перед началом работы',
|
||||
step1: 'Шаг 1: Настройка LLM модели',
|
||||
step2: 'Шаг 2: Настройка модели встраивания',
|
||||
step3: 'Шаг 3: Настройка дополнительных моделей',
|
||||
complete: 'Завершить инициализацию',
|
||||
skip: 'Пропустить',
|
||||
next: 'Далее',
|
||||
previous: 'Назад',
|
||||
// Ollama сервис
|
||||
ollamaServiceStatus: 'Статус службы Ollama',
|
||||
refreshStatus: 'Обновить статус',
|
||||
ollamaServiceAddress: 'Адрес службы Ollama',
|
||||
notConfigured: 'Не настроено',
|
||||
notRunning: 'Не запущено',
|
||||
normal: 'Нормально',
|
||||
installedModels: 'Установленные модели',
|
||||
none: 'Временно отсутствует',
|
||||
// База знаний
|
||||
knowledgeBaseInfo: 'Информация о базе знаний',
|
||||
knowledgeBaseName: 'Название базы знаний',
|
||||
knowledgeBaseNamePlaceholder: 'Введите название базы знаний',
|
||||
knowledgeBaseDescription: 'Описание базы знаний',
|
||||
knowledgeBaseDescriptionPlaceholder: 'Введите описание базы знаний',
|
||||
// LLM модель
|
||||
llmModelConfig: 'Конфигурация LLM большой языковой модели',
|
||||
modelSource: 'Источник модели',
|
||||
local: 'Ollama (локальный)',
|
||||
remote: 'Remote API (удаленный)',
|
||||
modelName: 'Название модели',
|
||||
modelNamePlaceholder: 'Например: qwen3:0.6b',
|
||||
baseUrl: 'Base URL',
|
||||
baseUrlPlaceholder: 'Например: https://api.openai.com/v1, удалите часть /chat/completions в конце URL',
|
||||
apiKey: 'API Key (необязательно)',
|
||||
apiKeyPlaceholder: 'Введите API Key (необязательно)',
|
||||
downloadModel: 'Скачать модель',
|
||||
installed: 'Установлено',
|
||||
notInstalled: 'Не установлено',
|
||||
notChecked: 'Не проверено',
|
||||
checkConnection: 'Проверить соединение',
|
||||
connectionNormal: 'Соединение в норме',
|
||||
connectionFailed: 'Ошибка соединения',
|
||||
checkingConnection: 'Проверка соединения',
|
||||
// Embedding модель
|
||||
embeddingModelConfig: 'Конфигурация модели встраивания',
|
||||
embeddingWarning: 'В базе знаний уже есть файлы, невозможно изменить конфигурацию модели встраивания',
|
||||
dimension: 'Размерность',
|
||||
dimensionPlaceholder: 'Введите размерность вектора',
|
||||
detectDimension: 'Определить размерность',
|
||||
// Rerank модель
|
||||
rerankModelConfig: 'Конфигурация модели ранжирования',
|
||||
enableRerank: 'Включить модель ранжирования',
|
||||
// Мультимодальные настройки
|
||||
multimodalConfig: 'Мультимодальная конфигурация',
|
||||
enableMultimodal: 'Включить извлечение информации из изображений',
|
||||
visualLanguageModelConfig: 'Конфигурация визуально-языковой модели',
|
||||
interfaceType: 'Тип интерфейса',
|
||||
openaiCompatible: 'Совместимый с OpenAI интерфейс',
|
||||
// Настройки хранилища
|
||||
storageServiceConfig: 'Конфигурация службы хранения',
|
||||
storageType: 'Тип хранения',
|
||||
bucketName: 'Bucket Name',
|
||||
bucketNamePlaceholder: 'Введите имя Bucket',
|
||||
pathPrefix: 'Path Prefix',
|
||||
pathPrefixPlaceholder: 'Например: images',
|
||||
secretId: 'Secret ID',
|
||||
secretIdPlaceholder: 'Введите COS Secret ID',
|
||||
secretKey: 'Secret Key',
|
||||
secretKeyPlaceholder: 'Введите COS Secret Key',
|
||||
region: 'Region',
|
||||
regionPlaceholder: 'Например: ap-beijing',
|
||||
appId: 'App ID',
|
||||
appIdPlaceholder: 'Введите App ID',
|
||||
// Тестирование мультимодальных функций
|
||||
functionTest: 'Тест функции',
|
||||
testDescription: 'Загрузите изображение для тестирования функций описания изображений и распознавания текста модели VLM',
|
||||
selectImage: 'Выбрать изображение',
|
||||
startTest: 'Начать тест',
|
||||
testResult: 'Результат теста',
|
||||
imageDescription: 'Описание изображения:',
|
||||
textRecognition: 'Распознавание текста:',
|
||||
processingTime: 'Время обработки:',
|
||||
testFailed: 'Тест не удался',
|
||||
multimodalProcessingFailed: 'Ошибка мультимодальной обработки',
|
||||
// Разделение документов
|
||||
documentSplittingConfig: 'Конфигурация разделения документов',
|
||||
splittingStrategy: 'Стратегия разделения',
|
||||
balancedMode: 'Сбалансированный режим',
|
||||
balancedModeDesc: 'Размер блока: 1000 / Перекрытие: 200',
|
||||
precisionMode: 'Точный режим',
|
||||
precisionModeDesc: 'Размер блока: 512 / Перекрытие: 100',
|
||||
contextMode: 'Контекстный режим',
|
||||
contextModeDesc: 'Размер блока: 2048 / Перекрытие: 400',
|
||||
custom: 'Пользовательский',
|
||||
customDesc: 'Настроить параметры вручную',
|
||||
chunkSize: 'Размер блока',
|
||||
chunkOverlap: 'Перекрытие блоков',
|
||||
separatorSettings: 'Настройки разделителей',
|
||||
selectOrCustomSeparators: 'Выберите или настройте разделители',
|
||||
characters: 'символов',
|
||||
separatorParagraph: 'Разделитель абзацев (\\n\\n)',
|
||||
separatorNewline: 'Перевод строки (\\n)',
|
||||
separatorPeriod: 'Точка (。)',
|
||||
separatorExclamation: 'Восклицательный знак (!)',
|
||||
separatorQuestion: 'Вопросительный знак (?)',
|
||||
separatorSemicolon: 'Точка с запятой (;)',
|
||||
separatorChineseSemicolon: 'Китайская точка с запятой (;)',
|
||||
separatorComma: 'Запятая (,)',
|
||||
separatorChineseComma: 'Китайская запятая (,)',
|
||||
// Извлечение сущностей и отношений
|
||||
entityRelationExtraction: 'Извлечение сущностей и отношений',
|
||||
enableEntityRelationExtraction: 'Включить извлечение сущностей и отношений',
|
||||
relationTypeConfig: 'Конфигурация типов отношений',
|
||||
relationType: 'Тип отношения',
|
||||
generateRandomTags: 'Сгенерировать случайные теги',
|
||||
completeModelConfig: 'Пожалуйста, завершите конфигурацию модели',
|
||||
systemWillExtract: 'Система будет извлекать соответствующие сущности и отношения из текста в соответствии с выбранными типами отношений',
|
||||
extractionExample: 'Пример извлечения',
|
||||
sampleText: 'Пример текста',
|
||||
sampleTextPlaceholder: 'Введите текст для анализа, например: "Красный особняк", также известный как "Сон в красном тереме", является одним из четырех великих классических произведений китайской литературы, написанным Цинь Сюэцином в династии Цин...',
|
||||
generateRandomText: 'Сгенерировать случайный текст',
|
||||
entityList: 'Список сущностей',
|
||||
nodeName: 'Имя узла',
|
||||
nodeNamePlaceholder: 'Имя узла',
|
||||
addAttribute: 'Добавить атрибут',
|
||||
attributeValue: 'Значение атрибута',
|
||||
attributeValuePlaceholder: 'Значение атрибута',
|
||||
addEntity: 'Добавить сущность',
|
||||
completeEntityInfo: 'Пожалуйста, завершите информацию о сущности',
|
||||
relationConnection: 'Соединение отношений',
|
||||
selectEntity: 'Выберите сущность',
|
||||
addRelation: 'Добавить отношение',
|
||||
completeRelationInfo: 'Пожалуйста, завершите информацию об отношении',
|
||||
startExtraction: 'Начать извлечение',
|
||||
extracting: 'Извлечение...',
|
||||
defaultExample: 'Пример по умолчанию',
|
||||
clearExample: 'Очистить пример',
|
||||
// Кнопки и сообщения
|
||||
updateKnowledgeBaseSettings: 'Обновить настройки базы знаний',
|
||||
updateConfigInfo: 'Обновить информацию о конфигурации',
|
||||
completeConfig: 'Завершить конфигурацию',
|
||||
waitForDownloads: 'Пожалуйста, дождитесь завершения загрузки всех моделей Ollama перед обновлением конфигурации',
|
||||
completeModelConfigInfo: 'Пожалуйста, завершите информацию о конфигурации модели',
|
||||
knowledgeBaseIdMissing: 'Отсутствует ID базы знаний',
|
||||
knowledgeBaseSettingsUpdateSuccess: 'Настройки базы знаний успешно обновлены',
|
||||
configUpdateSuccess: 'Конфигурация успешно обновлена',
|
||||
systemInitComplete: 'Инициализация системы завершена',
|
||||
operationFailed: 'Операция не удалась',
|
||||
updateKnowledgeBaseInfoFailed: 'Не удалось обновить базовую информацию о базе знаний',
|
||||
knowledgeBaseIdMissingCannotSave: 'Отсутствует ID базы знаний, невозможно сохранить конфигурацию',
|
||||
operationFailedCheckNetwork: 'Операция не удалась, проверьте сетевое соединение',
|
||||
imageUploadSuccess: 'Изображение успешно загружено, можно начать тестирование',
|
||||
multimodalConfigIncomplete: 'Мультимодальная конфигурация неполная, пожалуйста, завершите мультимодальную конфигурацию перед загрузкой изображения',
|
||||
pleaseSelectImage: 'Пожалуйста, выберите изображение',
|
||||
multimodalTestSuccess: 'Мультимодальный тест успешен',
|
||||
multimodalTestFailed: 'Мультимодальный тест не удался',
|
||||
pleaseEnterSampleText: 'Пожалуйста, введите текст примера',
|
||||
pleaseEnterRelationType: 'Пожалуйста, введите тип отношения',
|
||||
pleaseEnterLLMModelConfig: 'Пожалуйста, введите конфигурацию LLM большой языковой модели',
|
||||
noValidNodesExtracted: 'Не извлечено допустимых узлов',
|
||||
noValidRelationsExtracted: 'Не извлечено допустимых отношений',
|
||||
extractionFailedCheckNetwork: 'Извлечение не удалось, проверьте сетевое соединение или формат текста',
|
||||
generateFailedRetry: 'Генерация не удалась, попробуйте еще раз',
|
||||
pleaseCheckForm: 'Пожалуйста, проверьте правильность заполнения формы',
|
||||
detectionSuccessful: 'Обнаружение успешно, размерность автоматически заполнена как',
|
||||
detectionFailed: 'Обнаружение не удалось',
|
||||
detectionFailedCheckConfig: 'Обнаружение не удалось, проверьте конфигурацию',
|
||||
modelDownloadSuccess: 'Модель успешно загружена',
|
||||
modelDownloadFailed: 'Не удалось загрузить модель',
|
||||
downloadStartFailed: 'Не удалось начать загрузку',
|
||||
queryProgressFailed: 'Не удалось запросить прогресс',
|
||||
checkOllamaStatusFailed: 'Не удалось проверить статус Ollama',
|
||||
getKnowledgeBaseInfoFailed: 'Не удалось получить информацию о базе знаний',
|
||||
textRelationExtractionFailed: 'Не удалось извлечь текстовые отношения',
|
||||
// Валидация
|
||||
pleaseEnterKnowledgeBaseName: 'Пожалуйста, введите название базы знаний',
|
||||
knowledgeBaseNameLength: 'Длина названия базы знаний должна быть от 1 до 50 символов',
|
||||
knowledgeBaseDescriptionLength: 'Длина описания базы знаний не может превышать 200 символов',
|
||||
pleaseEnterLLMModelName: 'Пожалуйста, введите название LLM модели',
|
||||
pleaseEnterBaseURL: 'Пожалуйста, введите BaseURL',
|
||||
pleaseEnterEmbeddingModelName: 'Пожалуйста, введите название модели встраивания',
|
||||
pleaseEnterEmbeddingDimension: 'Пожалуйста, введите размерность встраивания',
|
||||
dimensionMustBeInteger: 'Размерность должна быть допустимым целым числом, обычно 768, 1024, 1536, 3584 и т.д.',
|
||||
pleaseEnterTextContent: 'Пожалуйста, введите текстовое содержание',
|
||||
textContentMinLength: 'Текстовое содержание должно содержать не менее 10 символов',
|
||||
pleaseEnterValidTag: 'Пожалуйста, введите действительный тег',
|
||||
tagAlreadyExists: 'Этот тег уже существует',
|
||||
// Дополнительные переводы для InitializationContent.vue
|
||||
checkFailed: 'Проверка не удалась',
|
||||
startingDownload: 'Запуск загрузки...',
|
||||
downloadStarted: 'Загрузка началась',
|
||||
model: 'Модель',
|
||||
startModelDownloadFailed: 'Не удалось запустить загрузку модели',
|
||||
downloadCompleted: 'Загрузка завершена',
|
||||
downloadFailed: 'Загрузка не удалась',
|
||||
knowledgeBaseSettingsModeMissingId: 'В режиме настроек базы знаний отсутствует ID базы знаний',
|
||||
completeEmbeddingConfig: 'Пожалуйста, сначала полностью заполните конфигурацию встраивания',
|
||||
detectionSuccess: 'Обнаружение успешно,',
|
||||
dimensionAutoFilled: 'размерность автоматически заполнена:',
|
||||
checkFormCorrectness: 'Пожалуйста, проверьте правильность заполнения формы',
|
||||
systemInitializationCompleted: 'Инициализация системы завершена',
|
||||
generationFailedRetry: 'Генерация не удалась, пожалуйста, попробуйте еще раз',
|
||||
chunkSizeDesc: 'Размер каждого текстового блока. Большие блоки сохраняют больше контекста, но могут снизить точность поиска.',
|
||||
chunkOverlapDesc: 'Количество символов, перекрывающихся между соседними блоками. Помогает сохранить контекст на границах блоков.',
|
||||
selectRelationType: 'Выберите тип отношения'
|
||||
},
|
||||
auth: {
|
||||
login: 'Вход',
|
||||
logout: 'Выход',
|
||||
username: 'Имя пользователя',
|
||||
email: 'Почта Email',
|
||||
password: 'Пароль',
|
||||
confirmPassword: 'Подтвердите пароль',
|
||||
rememberMe: 'Запомнить меня',
|
||||
forgotPassword: 'Забыли пароль?',
|
||||
loginSuccess: 'Вход выполнен успешно!',
|
||||
loginFailed: 'Ошибка входа',
|
||||
loggingIn: 'Вход...',
|
||||
register: 'Регистрация',
|
||||
registering: 'Регистрация...',
|
||||
createAccount: 'Создать аккаунт',
|
||||
haveAccount: 'Уже есть аккаунт?',
|
||||
noAccount: 'Ещё нет аккаунта?',
|
||||
backToLogin: 'Вернуться ко входу',
|
||||
registerNow: 'Зарегистрироваться',
|
||||
registerSuccess: 'Регистрация успешна! Система создала для вас эксклюзивного арендатора, пожалуйста, войдите',
|
||||
registerFailed: 'Ошибка регистрации',
|
||||
subtitle: 'Фреймворк понимания документов и семантического поиска на основе больших моделей',
|
||||
registerSubtitle: 'После регистрации система создаст для вас эксклюзивного арендатора',
|
||||
emailPlaceholder: 'Введите адрес электронной почты',
|
||||
passwordPlaceholder: 'Введите пароль (8-32 символа, включая буквы и цифры)',
|
||||
confirmPasswordPlaceholder: 'Введите пароль ещё раз',
|
||||
usernamePlaceholder: 'Введите имя пользователя',
|
||||
emailRequired: 'Введите адрес электронной почты',
|
||||
emailInvalid: 'Введите правильный формат электронной почты',
|
||||
passwordRequired: 'Введите пароль',
|
||||
passwordMinLength: 'Пароль должен быть не менее 8 символов',
|
||||
passwordMaxLength: 'Пароль не может превышать 32 символа',
|
||||
passwordMustContainLetter: 'Пароль должен содержать буквы',
|
||||
passwordMustContainNumber: 'Пароль должен содержать цифры',
|
||||
usernameRequired: 'Введите имя пользователя',
|
||||
usernameMinLength: 'Имя пользователя должно быть не менее 2 символов',
|
||||
usernameMaxLength: 'Имя пользователя не может превышать 20 символов',
|
||||
usernameInvalid: 'Имя пользователя может содержать только буквы, цифры, подчёркивания и китайские иероглифы',
|
||||
confirmPasswordRequired: 'Подтвердите пароль',
|
||||
passwordMismatch: 'Введённые пароли не совпадают',
|
||||
loginError: 'Ошибка входа, пожалуйста, проверьте электронную почту или пароль',
|
||||
loginErrorRetry: 'Ошибка входа, пожалуйста, повторите попытку позже',
|
||||
registerError: 'Ошибка регистрации, пожалуйста, повторите попытку позже',
|
||||
forgotPasswordNotAvailable: 'Функция восстановления пароля временно недоступна, пожалуйста, свяжитесь с администратором'
|
||||
},
|
||||
common: {
|
||||
confirm: 'Подтвердить',
|
||||
cancel: 'Отмена',
|
||||
save: 'Сохранить',
|
||||
delete: 'Удалить',
|
||||
edit: 'Редактировать',
|
||||
create: 'Создать',
|
||||
search: 'Поиск',
|
||||
filter: 'Фильтр',
|
||||
export: 'Экспорт',
|
||||
import: 'Импорт',
|
||||
upload: 'Загрузить',
|
||||
download: 'Скачать',
|
||||
refresh: 'Обновить',
|
||||
loading: 'Загрузка...',
|
||||
noData: 'Нет данных',
|
||||
error: 'Ошибка',
|
||||
success: 'Успешно',
|
||||
warning: 'Предупреждение',
|
||||
info: 'Информация',
|
||||
yes: 'Да',
|
||||
no: 'Нет',
|
||||
ok: 'OK',
|
||||
close: 'Закрыть',
|
||||
back: 'Назад',
|
||||
next: 'Далее',
|
||||
finish: 'Завершить',
|
||||
all: 'Все',
|
||||
reset: 'Сбросить',
|
||||
clear: 'Очистить'
|
||||
},
|
||||
file: {
|
||||
upload: 'Загрузить файл',
|
||||
uploadSuccess: 'Файл успешно загружен',
|
||||
uploadFailed: 'Ошибка загрузки файла',
|
||||
delete: 'Удалить файл',
|
||||
deleteSuccess: 'Файл успешно удален',
|
||||
deleteFailed: 'Ошибка удаления файла',
|
||||
download: 'Скачать файл',
|
||||
preview: 'Предпросмотр',
|
||||
unsupportedFormat: 'Неподдерживаемый формат файла',
|
||||
maxSizeExceeded: 'Превышен максимальный размер файла',
|
||||
selectFile: 'Выберите файл'
|
||||
},
|
||||
tenant: {
|
||||
title: 'Информация об арендаторе',
|
||||
name: 'Имя арендатора',
|
||||
id: 'ID арендатора',
|
||||
createdAt: 'Дата создания',
|
||||
updatedAt: 'Дата обновления',
|
||||
status: 'Статус',
|
||||
active: 'Активен',
|
||||
inactive: 'Неактивен',
|
||||
// Дополнительные переводы для TenantInfo.vue
|
||||
systemInfo: 'Системная информация',
|
||||
viewSystemInfo: 'Просмотр информации о версии системы и конфигурации учётной записи пользователя',
|
||||
version: 'Версия',
|
||||
buildTime: 'Время сборки',
|
||||
goVersion: 'Версия Go',
|
||||
userInfo: 'Информация о пользователе',
|
||||
userId: 'ID пользователя',
|
||||
username: 'Имя пользователя',
|
||||
email: 'Электронная почта',
|
||||
tenantInfo: 'Информация об арендаторе',
|
||||
tenantId: 'ID арендатора',
|
||||
tenantName: 'Название арендатора',
|
||||
description: 'Описание',
|
||||
business: 'Бизнес',
|
||||
noDescription: 'Нет описания',
|
||||
noBusiness: 'Нет',
|
||||
statusActive: 'Активен',
|
||||
statusInactive: 'Не активирован',
|
||||
statusSuspended: 'Приостановлен',
|
||||
statusUnknown: 'Неизвестен',
|
||||
apiKey: 'API Key',
|
||||
keepApiKeySafe: 'Пожалуйста, храните ваш API Key в безопасности, не раскрывайте его в общественных местах или репозиториях кода',
|
||||
storageInfo: 'Информация о хранилище',
|
||||
storageQuota: 'Квота хранилища',
|
||||
used: 'Использовано',
|
||||
usage: 'Использование',
|
||||
apiDevDocs: 'Документация для разработчиков API',
|
||||
useApiKey: 'Используйте ваш API Key для начала разработки, просмотрите полную документацию API и примеры кода.',
|
||||
viewApiDoc: 'Просмотреть документацию API',
|
||||
loadingAccountInfo: 'Загрузка информации об учётной записи...',
|
||||
loadFailed: 'Загрузка не удалась',
|
||||
retry: 'Повторить',
|
||||
apiKeyCopied: 'API Key скопирован в буфер обмена',
|
||||
unknown: 'Неизвестно',
|
||||
formatError: 'Ошибка формата'
|
||||
},
|
||||
error: {
|
||||
network: 'Ошибка сети',
|
||||
server: 'Ошибка сервера',
|
||||
notFound: 'Не найдено',
|
||||
unauthorized: 'Не авторизован',
|
||||
forbidden: 'Доступ запрещен',
|
||||
unknown: 'Неизвестная ошибка',
|
||||
tryAgain: 'Пожалуйста, попробуйте еще раз'
|
||||
},
|
||||
model: {
|
||||
llmModel: 'LLM модель',
|
||||
embeddingModel: 'Модель встраивания',
|
||||
rerankModel: 'Модель ранжирования',
|
||||
vlmModel: 'Мультимодальная модель',
|
||||
modelName: 'Название модели',
|
||||
modelProvider: 'Поставщик модели',
|
||||
modelUrl: 'URL модели',
|
||||
apiKey: 'API ключ',
|
||||
testConnection: 'Проверить соединение',
|
||||
connectionSuccess: 'Соединение успешно',
|
||||
connectionFailed: 'Ошибка соединения',
|
||||
dimension: 'Размерность',
|
||||
maxTokens: 'Макс. токенов',
|
||||
temperature: 'Температура',
|
||||
topP: 'Top P',
|
||||
selectModel: 'Выберите модель',
|
||||
customModel: 'Пользовательская модель',
|
||||
builtinModel: 'Встроенная модель'
|
||||
}
|
||||
}
|
||||
536
frontend/src/i18n/locales/zh-CN.ts
Normal file
@@ -0,0 +1,536 @@
|
||||
export default {
|
||||
menu: {
|
||||
knowledgeBase: '知识库',
|
||||
chat: '对话',
|
||||
createChat: '创建对话',
|
||||
tenant: '账户信息',
|
||||
settings: '系统设置',
|
||||
logout: '退出登录',
|
||||
uploadKnowledge: '上传知识',
|
||||
deleteRecord: '删除记录',
|
||||
newSession: '新会话',
|
||||
confirmLogout: '确定要退出登录吗?',
|
||||
systemInfo: '系统信息'
|
||||
},
|
||||
knowledgeBase: {
|
||||
title: '知识库',
|
||||
list: '知识库列表',
|
||||
detail: '知识库详情',
|
||||
create: '创建知识库',
|
||||
edit: '编辑知识库',
|
||||
delete: '删除知识库',
|
||||
name: '名称',
|
||||
description: '描述',
|
||||
files: '文件',
|
||||
settings: '设置',
|
||||
upload: '上传文件',
|
||||
uploadSuccess: '文件上传成功!',
|
||||
uploadFailed: '文件上传失败!',
|
||||
fileExists: '文件已存在',
|
||||
notInitialized: '该知识库尚未完成初始化配置,请先前往设置页面配置模型信息后再上传文件',
|
||||
getInfoFailed: '获取知识库信息失败,无法上传文件',
|
||||
missingId: '缺少知识库ID',
|
||||
deleteFailed: '删除失败,请稍后再试!',
|
||||
createKnowledgeBase: '创建知识库',
|
||||
knowledgeBaseName: '知识库名称',
|
||||
enterName: '输入知识库名称',
|
||||
embeddingModel: '嵌入模型',
|
||||
selectEmbeddingModel: '选择嵌入模型',
|
||||
summaryModel: '摘要模型',
|
||||
selectSummaryModel: '选择摘要模型',
|
||||
rerankModel: '重排序模型',
|
||||
selectRerankModel: '选择重排序模型(可选)',
|
||||
createSuccess: '知识库创建成功',
|
||||
createFailed: '知识库创建失败',
|
||||
updateSuccess: '知识库更新成功',
|
||||
updateFailed: '知识库更新失败',
|
||||
deleteSuccess: '知识库删除成功',
|
||||
deleteConfirm: '确定要删除此知识库吗?',
|
||||
fileName: '文件名',
|
||||
fileSize: '文件大小',
|
||||
uploadTime: '上传时间',
|
||||
status: '状态',
|
||||
actions: '操作',
|
||||
processing: '处理中',
|
||||
completed: '已完成',
|
||||
failed: '失败',
|
||||
noFiles: '暂无文件',
|
||||
dragFilesHere: '拖拽文件至此或',
|
||||
clickToUpload: '点击上传',
|
||||
supportedFormats: '支持格式',
|
||||
maxFileSize: '最大文件大小',
|
||||
viewDetails: '查看详情',
|
||||
downloadFile: '下载文件',
|
||||
deleteFile: '删除文件',
|
||||
confirmDeleteFile: '确定要删除此文件吗?',
|
||||
totalFiles: '文件总数',
|
||||
totalSize: '总大小',
|
||||
newSession: '新会话',
|
||||
deleteDocument: '删除文档',
|
||||
parsingFailed: '解析失败',
|
||||
parsingInProgress: '解析中...',
|
||||
deleteConfirmation: '删除确认',
|
||||
confirmDeleteDocument: '确认删除文档"{fileName}",删除后将无法恢复',
|
||||
cancel: '取消',
|
||||
confirmDelete: '确认删除',
|
||||
selectKnowledgeBaseFirst: '请先选择知识库',
|
||||
sessionCreationFailed: '创建会话失败',
|
||||
sessionCreationError: '会话创建错误',
|
||||
settingsParsingFailed: '设置解析失败',
|
||||
fileUploadEventReceived: '收到文件上传事件,上传的知识库ID:{uploadedKbId},当前知识库ID:{currentKbId}',
|
||||
matchingKnowledgeBase: '知识库匹配,开始更新文件列表',
|
||||
routeParamChange: '路由参数变化,重新获取知识库内容',
|
||||
fileUploadEventListening: '监听文件上传事件',
|
||||
apiCallKnowledgeFiles: '直接调用API获取知识库文件列表',
|
||||
responseInterceptorData: '由于响应拦截器已返回data,result是响应数据的一部分',
|
||||
hookProcessing: '按照useKnowledgeBase hook方法处理',
|
||||
errorHandling: '错误处理',
|
||||
priorityCurrentPageKbId: '优先使用当前页面的知识库ID',
|
||||
fallbackLocalStorageKbId: '如果当前页面没有知识库ID,尝试从localStorage的设置中获取知识库ID',
|
||||
createNewKnowledgeBase: '创建知识库',
|
||||
uninitializedWarning: '部分知识库未初始化,需要先在设置中配置模型信息才能添加知识文档',
|
||||
initializedStatus: '已初始化',
|
||||
notInitializedStatus: '未初始化',
|
||||
needSettingsFirst: '需要先在设置中配置模型信息才能添加知识',
|
||||
documents: '文档',
|
||||
configureModelsFirst: '请先在设置中配置模型信息',
|
||||
confirmDeleteKnowledgeBase: '确认删除此知识库?',
|
||||
createKnowledgeBaseDialog: '创建知识库',
|
||||
enterNameKb: '输入名称',
|
||||
enterDescriptionKb: '输入描述',
|
||||
createKb: '创建',
|
||||
deleted: '已删除',
|
||||
deleteFailedKb: '删除失败',
|
||||
noDescription: '无描述',
|
||||
emptyKnowledgeDragDrop: '知识为空,拖放上传',
|
||||
pdfDocFormat: 'pdf、doc 格式文件,不超过10M',
|
||||
textMarkdownFormat: 'text、markdown格式文件,不超过200K',
|
||||
dragFileNotText: '请拖拽文件而不是文本或链接'
|
||||
},
|
||||
chat: {
|
||||
title: '对话',
|
||||
newChat: '新对话',
|
||||
inputPlaceholder: '请输入您的消息...',
|
||||
send: '发送',
|
||||
thinking: '思考中...',
|
||||
regenerate: '重新生成',
|
||||
copy: '复制',
|
||||
delete: '删除',
|
||||
reference: '引用',
|
||||
noMessages: '暂无消息',
|
||||
waitingForAnswer: '等待回答...',
|
||||
cannotAnswer: '抱歉,我无法回答这个问题。',
|
||||
summarizingAnswer: '总结答案中...',
|
||||
loading: '加载中...',
|
||||
enterDescription: '输入描述',
|
||||
referencedContent: '引用了 {count} 个相关资料',
|
||||
deepThinking: '深度思考完成',
|
||||
knowledgeBaseQandA: '知识库问答',
|
||||
askKnowledgeBase: '向知识库提问',
|
||||
sourcesCount: '{count} 个来源',
|
||||
pleaseEnterContent: '请输入内容!',
|
||||
pleaseUploadKnowledgeBase: '请先上传知识库!',
|
||||
replyingPleaseWait: '正在回复,请稍后再试!',
|
||||
createSessionFailed: '创建会话失败',
|
||||
createSessionError: '创建会话出错',
|
||||
unableToGetKnowledgeBaseId: '无法获取知识库ID'
|
||||
},
|
||||
settings: {
|
||||
title: '设置',
|
||||
system: '系统设置',
|
||||
systemConfig: '系统配置',
|
||||
knowledgeBaseSettings: '知识库设置',
|
||||
configureKbModels: '为此知识库配置模型和文档分割参数',
|
||||
manageSystemModels: '管理和更新系统模型及服务配置',
|
||||
basicInfo: '基本信息',
|
||||
documentSplitting: '文档分割',
|
||||
apiEndpoint: 'API端点',
|
||||
enterApiEndpoint: '输入API端点,例如:http://localhost',
|
||||
enterApiKey: '输入API密钥',
|
||||
enterKnowledgeBaseId: '输入知识库ID',
|
||||
saveConfig: '保存配置',
|
||||
reset: '重置',
|
||||
configSaved: '配置保存成功',
|
||||
enterApiEndpointRequired: '请输入API端点',
|
||||
enterApiKeyRequired: '请输入API密钥',
|
||||
enterKnowledgeBaseIdRequired: '请输入知识库ID',
|
||||
name: '名称',
|
||||
enterName: '输入名称',
|
||||
description: '描述',
|
||||
chunkSize: '分块大小',
|
||||
chunkOverlap: '分块重叠',
|
||||
save: '保存',
|
||||
saving: '保存中...',
|
||||
saveSuccess: '保存成功',
|
||||
saveFailed: '保存失败',
|
||||
model: '模型',
|
||||
llmModel: 'LLM模型',
|
||||
embeddingModel: '嵌入模型',
|
||||
rerankModel: '重排序模型',
|
||||
vlmModel: '多模态模型',
|
||||
modelName: '模型名称',
|
||||
modelUrl: '模型地址',
|
||||
apiKey: 'API密钥',
|
||||
cancel: '取消',
|
||||
saveFailedSettings: '设置保存失败',
|
||||
enterNameRequired: '请输入名称'
|
||||
},
|
||||
initialization: {
|
||||
title: '初始化',
|
||||
welcome: '欢迎使用WeKnora',
|
||||
description: '请先配置系统以开始使用',
|
||||
step1: '步骤1:配置LLM模型',
|
||||
step2: '步骤2:配置嵌入模型',
|
||||
step3: '步骤3:配置其他模型',
|
||||
complete: '完成初始化',
|
||||
skip: '跳过',
|
||||
next: '下一步',
|
||||
previous: '上一步',
|
||||
ollamaServiceStatus: 'Ollama服务状态',
|
||||
refreshStatus: '刷新状态',
|
||||
ollamaServiceAddress: 'Ollama服务地址',
|
||||
notConfigured: '未配置',
|
||||
notRunning: '未运行',
|
||||
normal: '正常',
|
||||
installedModels: '已安装模型',
|
||||
none: '暂无',
|
||||
knowledgeBaseInfo: '知识库信息',
|
||||
knowledgeBaseName: '知识库名称',
|
||||
knowledgeBaseNamePlaceholder: '输入知识库名称',
|
||||
knowledgeBaseDescription: '知识库描述',
|
||||
knowledgeBaseDescriptionPlaceholder: '输入知识库描述',
|
||||
llmModelConfig: 'LLM大语言模型配置',
|
||||
modelSource: '模型来源',
|
||||
local: 'Ollama(本地)',
|
||||
remote: 'Remote API(远程)',
|
||||
modelName: '模型名称',
|
||||
modelNamePlaceholder: '例如:qwen3:0.6b',
|
||||
baseUrl: 'Base URL',
|
||||
baseUrlPlaceholder: '例如:https://api.openai.com/v1,去掉URL末尾的/chat/completions部分',
|
||||
apiKey: 'API Key(可选)',
|
||||
apiKeyPlaceholder: '输入API Key(可选)',
|
||||
downloadModel: '下载模型',
|
||||
installed: '已安装',
|
||||
notInstalled: '未安装',
|
||||
notChecked: '未检查',
|
||||
checkConnection: '检查连接',
|
||||
connectionNormal: '连接正常',
|
||||
connectionFailed: '连接失败',
|
||||
checkingConnection: '正在检查连接',
|
||||
embeddingModelConfig: '嵌入模型配置',
embeddingWarning: '知识库已有文件,无法更改嵌入模型配置',
dimension: '维度',
dimensionPlaceholder: '输入向量维度',
detectDimension: '检测维度',
rerankModelConfig: '重排序模型配置',
enableRerank: '启用重排序模型',
multimodalConfig: '多模态配置',
enableMultimodal: '启用图像信息提取',
visualLanguageModelConfig: '视觉语言模型配置',
interfaceType: '接口类型',
openaiCompatible: 'OpenAI兼容接口',
storageServiceConfig: '存储服务配置',
storageType: '存储类型',
bucketName: 'Bucket名称',
bucketNamePlaceholder: '输入Bucket名称',
pathPrefix: '路径前缀',
pathPrefixPlaceholder: '例如:images',
secretId: 'Secret ID',
secretIdPlaceholder: '输入COS Secret ID',
secretKey: 'Secret Key',
secretKeyPlaceholder: '输入COS Secret Key',
region: 'Region',
regionPlaceholder: '例如:ap-beijing',
appId: 'App ID',
appIdPlaceholder: '输入App ID',
functionTest: '功能测试',
testDescription: '上传图片测试VLM模型的图像描述和文字识别功能',
selectImage: '选择图片',
startTest: '开始测试',
testResult: '测试结果',
imageDescription: '图像描述:',
textRecognition: '文字识别:',
processingTime: '处理时间:',
testFailed: '测试失败',
multimodalProcessingFailed: '多模态处理失败',
documentSplittingConfig: '文档分割配置',
splittingStrategy: '分割策略',
balancedMode: '平衡模式',
balancedModeDesc: '分块大小:1000 / 重叠:200',
precisionMode: '精确模式',
precisionModeDesc: '分块大小:512 / 重叠:100',
contextMode: '上下文模式',
contextModeDesc: '分块大小:2048 / 重叠:400',
custom: '自定义',
customDesc: '手动配置参数',
chunkSize: '分块大小',
chunkOverlap: '分块重叠',
separatorSettings: '分隔符设置',
selectOrCustomSeparators: '选择或自定义分隔符',
characters: '个字符',
separatorParagraph: '段落分隔符 (\\n\\n)',
separatorNewline: '换行符 (\\n)',
separatorPeriod: '句号 (。)',
separatorExclamation: '感叹号 (!)',
separatorQuestion: '问号 (?)',
separatorSemicolon: '分号 (;)',
separatorChineseSemicolon: '中文分号 (;)',
separatorComma: '逗号 (,)',
separatorChineseComma: '中文逗号 (,)',
entityRelationExtraction: '实体和关系提取',
enableEntityRelationExtraction: '启用实体和关系提取',
relationTypeConfig: '关系类型配置',
relationType: '关系类型',
generateRandomTags: '生成随机标签',
completeModelConfig: '请完成模型配置',
systemWillExtract: '系统将根据所选关系类型从文本中提取相应的实体和关系',
extractionExample: '提取示例',
sampleText: '示例文本',
sampleTextPlaceholder: '输入用于分析的文本,例如:"红楼梦",又名"石头记",是中国四大名著之一,清代曹雪芹所著...',
generateRandomText: '生成随机文本',
entityList: '实体列表',
nodeName: '节点名称',
nodeNamePlaceholder: '节点名称',
addAttribute: '添加属性',
attributeValue: '属性值',
attributeValuePlaceholder: '属性值',
addEntity: '添加实体',
completeEntityInfo: '请完成实体信息',
relationConnection: '关系连接',
selectEntity: '选择实体',
addRelation: '添加关系',
completeRelationInfo: '请完成关系信息',
startExtraction: '开始提取',
extracting: '提取中...',
defaultExample: '默认示例',
clearExample: '清除示例',
updateKnowledgeBaseSettings: '更新知识库设置',
updateConfigInfo: '更新配置信息',
completeConfig: '完成配置',
waitForDownloads: '请等待所有Ollama模型下载完成后再更新配置',
completeModelConfigInfo: '请完成模型配置信息',
knowledgeBaseIdMissing: '知识库ID缺失',
knowledgeBaseSettingsUpdateSuccess: '知识库设置更新成功',
configUpdateSuccess: '配置更新成功',
systemInitComplete: '系统初始化完成',
operationFailed: '操作失败',
updateKnowledgeBaseInfoFailed: '更新知识库基本信息失败',
knowledgeBaseIdMissingCannotSave: '知识库ID缺失,无法保存配置',
operationFailedCheckNetwork: '操作失败,请检查网络连接',
imageUploadSuccess: '图片上传成功,可以开始测试',
multimodalConfigIncomplete: '多模态配置不完整,请先完成多模态配置后再上传图片',
pleaseSelectImage: '请选择图片',
multimodalTestSuccess: '多模态测试成功',
multimodalTestFailed: '多模态测试失败',
pleaseEnterSampleText: '请输入示例文本',
pleaseEnterRelationType: '请输入关系类型',
pleaseEnterLLMModelConfig: '请输入LLM大语言模型配置',
noValidNodesExtracted: '未提取到有效节点',
noValidRelationsExtracted: '未提取到有效关系',
extractionFailedCheckNetwork: '提取失败,请检查网络或文本格式',
generateFailedRetry: '生成失败,请重试',
pleaseCheckForm: '请检查表单填写是否正确',
detectionSuccessful: '检测成功,维度自动填充为',
detectionFailed: '检测失败',
detectionFailedCheckConfig: '检测失败,请检查配置',
modelDownloadSuccess: '模型下载成功',
modelDownloadFailed: '模型下载失败',
downloadStartFailed: '下载启动失败',
queryProgressFailed: '进度查询失败',
checkOllamaStatusFailed: 'Ollama状态检查失败',
getKnowledgeBaseInfoFailed: '获取知识库信息失败',
textRelationExtractionFailed: '文本关系提取失败',
pleaseEnterKnowledgeBaseName: '请输入知识库名称',
knowledgeBaseNameLength: '知识库名称长度必须为1-50个字符',
knowledgeBaseDescriptionLength: '知识库描述不能超过200个字符',
pleaseEnterLLMModelName: '请输入LLM模型名称',
pleaseEnterBaseURL: '请输入BaseURL',
pleaseEnterEmbeddingModelName: '请输入嵌入模型名称',
pleaseEnterEmbeddingDimension: '请输入嵌入维度',
dimensionMustBeInteger: '维度必须是有效整数,通常为768、1024、1536、3584等',
pleaseEnterTextContent: '请输入文本内容',
textContentMinLength: '文本内容必须包含至少10个字符',
pleaseEnterValidTag: '请输入有效标签',
tagAlreadyExists: '此标签已存在',
checkFailed: '检查失败',
startingDownload: '正在启动下载...',
downloadStarted: '下载已开始',
model: '模型',
startModelDownloadFailed: '启动模型下载失败',
downloadCompleted: '下载完成',
downloadFailed: '下载失败',
knowledgeBaseSettingsModeMissingId: '知识库设置模式缺少知识库ID',
completeEmbeddingConfig: '请先完成嵌入配置',
detectionSuccess: '检测成功,',
dimensionAutoFilled: '维度已自动填充:',
checkFormCorrectness: '请检查表单填写是否正确',
systemInitializationCompleted: '系统初始化完成',
generationFailedRetry: '生成失败,请重试',
chunkSizeDesc: '每个文本块的大小。较大的块保留更多上下文,但可能降低搜索准确性。',
chunkOverlapDesc: '相邻块之间重叠的字符数。有助于保持块边界处的上下文。',
selectRelationType: '选择关系类型'
},
auth: {
login: '登录',
logout: '退出',
username: '用户名',
email: '邮箱',
password: '密码',
confirmPassword: '确认密码',
rememberMe: '记住我',
forgotPassword: '忘记密码?',
loginSuccess: '登录成功!',
loginFailed: '登录失败',
loggingIn: '登录中...',
register: '注册',
registering: '注册中...',
createAccount: '创建账户',
haveAccount: '已有账户?',
noAccount: '还没有账户?',
backToLogin: '返回登录',
registerNow: '立即注册',
registerSuccess: '注册成功!系统已为您创建专属租户,请登录',
registerFailed: '注册失败',
subtitle: '基于大模型的文档理解和语义搜索框架',
registerSubtitle: '注册后系统将为您创建专属租户',
emailPlaceholder: '输入邮箱地址',
passwordPlaceholder: '输入密码(8-32个字符,包含字母和数字)',
confirmPasswordPlaceholder: '再次输入密码',
usernamePlaceholder: '输入用户名',
emailRequired: '请输入邮箱地址',
emailInvalid: '请输入正确的邮箱格式',
passwordRequired: '请输入密码',
passwordMinLength: '密码至少8个字符',
passwordMaxLength: '密码不能超过32个字符',
passwordMustContainLetter: '密码必须包含字母',
passwordMustContainNumber: '密码必须包含数字',
usernameRequired: '请输入用户名',
usernameMinLength: '用户名至少2个字符',
usernameMaxLength: '用户名不能超过20个字符',
usernameInvalid: '用户名只能包含字母、数字、下划线和中文字符',
confirmPasswordRequired: '请确认密码',
passwordMismatch: '两次输入的密码不一致',
loginError: '登录错误,请检查邮箱或密码',
loginErrorRetry: '登录错误,请稍后重试',
registerError: '注册错误,请稍后重试',
forgotPasswordNotAvailable: '密码找回功能暂不可用,请联系管理员'
},
common: {
confirm: '确认',
cancel: '取消',
save: '保存',
delete: '删除',
edit: '编辑',
create: '创建',
search: '搜索',
filter: '筛选',
export: '导出',
import: '导入',
upload: '上传',
download: '下载',
refresh: '刷新',
loading: '加载中...',
noData: '暂无数据',
error: '错误',
success: '成功',
warning: '警告',
info: '信息',
yes: '是',
no: '否',
ok: '确定',
close: '关闭',
back: '返回',
next: '下一步',
finish: '完成',
all: '全部',
reset: '重置',
clear: '清空'
},
file: {
upload: '上传文件',
uploadSuccess: '文件上传成功',
uploadFailed: '文件上传失败',
delete: '删除文件',
deleteSuccess: '文件删除成功',
deleteFailed: '文件删除失败',
download: '下载文件',
preview: '预览',
unsupportedFormat: '不支持的文件格式',
maxSizeExceeded: '文件大小超过限制',
selectFile: '选择文件'
},
tenant: {
title: '租户信息',
name: '租户名称',
id: '租户ID',
createdAt: '创建时间',
updatedAt: '更新时间',
status: '状态',
active: '活跃',
inactive: '未活跃',
systemInfo: '系统信息',
viewSystemInfo: '查看系统版本和用户账户配置信息',
version: '版本',
buildTime: '构建时间',
goVersion: 'Go版本',
userInfo: '用户信息',
userId: '用户ID',
username: '用户名',
email: '邮箱',
tenantInfo: '租户信息',
tenantId: '租户ID',
tenantName: '租户名称',
description: '描述',
business: '业务',
noDescription: '无描述',
noBusiness: '无',
statusActive: '活跃',
statusInactive: '未激活',
statusSuspended: '已暂停',
statusUnknown: '未知',
apiKey: 'API密钥',
keepApiKeySafe: '请妥善保管您的API密钥,不要在公共场所或代码仓库中泄露',
storageInfo: '存储信息',
storageQuota: '存储配额',
used: '已使用',
usage: '使用率',
apiDevDocs: 'API开发文档',
useApiKey: '使用您的API密钥开始开发,查看完整的API文档和代码示例。',
viewApiDoc: '查看API文档',
loadingAccountInfo: '加载账户信息中...',
loadFailed: '加载失败',
retry: '重试',
apiKeyCopied: 'API密钥已复制到剪贴板',
unknown: '未知',
formatError: '格式错误'
},
error: {
network: '网络错误',
server: '服务器错误',
notFound: '未找到',
unauthorized: '未授权',
forbidden: '禁止访问',
unknown: '未知错误',
tryAgain: '请重试'
},
model: {
llmModel: 'LLM模型',
embeddingModel: '嵌入模型',
rerankModel: '重排序模型',
vlmModel: '多模态模型',
modelName: '模型名称',
modelProvider: '模型提供商',
modelUrl: '模型地址',
apiKey: 'API密钥',
testConnection: '测试连接',
connectionSuccess: '连接成功',
connectionFailed: '连接失败',
dimension: '维度',
maxTokens: '最大令牌数',
temperature: '温度',
topP: 'Top P',
selectModel: '选择模型',
customModel: '自定义模型',
builtinModel: '内置模型'
}
}
@@ -2,6 +2,7 @@ import { createApp } from "vue";
import { createPinia } from "pinia";
import App from "./App.vue";
import router from "./router";
import i18n from "./i18n";
import "./assets/fonts.css";
import TDesign from "tdesign-vue-next";
// Import the component library's small set of global style variables
@@ -12,5 +13,6 @@ const app = createApp(App);
app.use(TDesign);
app.use(createPinia());
app.use(router);
app.use(i18n);

app.mount("#app");
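Note: the hunk above registers the i18n plugin on the app. As a rough sketch of how the locale keys defined earlier are consumed, assuming the project wires up vue-i18n's composition API in ./i18n (that file is not part of this diff), a component could read labels like this:

// Sketch only: the key paths come from the locale object above; the i18n setup itself is assumed.
import { useI18n } from 'vue-i18n'

export function useAuthLabels() {
  const { t } = useI18n()
  return {
    login: t('auth.login'),               // '登录'
    loginSuccess: t('auth.loginSuccess'), // '登录成功!'
    cancel: t('common.cancel'),           // '取消'
  }
}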
@@ -1,89 +1,117 @@
import { createRouter, createWebHistory } from 'vue-router'
import { checkInitializationStatus } from '@/api/initialization'
import { listKnowledgeBases } from '@/api/knowledge-base'
import { useAuthStore } from '@/stores/auth'
import { validateToken } from '@/api/auth'

const router = createRouter({
history: createWebHistory(import.meta.env.BASE_URL),
routes: [
{
path: "/",
redirect: "/platform",
redirect: "/platform/knowledge-bases",
},
{
path: "/initialization",
name: "initialization",
component: () => import("../views/initialization/InitializationConfig.vue"),
meta: { requiresInit: false } // The initialization page does not require the initialization check
path: "/login",
name: "login",
component: () => import("../views/auth/Login.vue"),
meta: { requiresAuth: false, requiresInit: false }
},
{
path: "/knowledgeBase",
name: "home",
component: () => import("../views/knowledge/KnowledgeBase.vue"),
meta: { requiresInit: true }
meta: { requiresInit: true, requiresAuth: true }
},
{
path: "/platform",
name: "Platform",
redirect: "/platform/knowledgeBase",
redirect: "/platform/knowledge-bases",
component: () => import("../views/platform/index.vue"),
meta: { requiresInit: true },
meta: { requiresInit: true, requiresAuth: true },
children: [
{
path: "knowledgeBase",
name: "knowledgeBase",
path: "tenant",
name: "tenant",
component: () => import("../views/tenant/TenantInfo.vue"),
meta: { requiresInit: true, requiresAuth: true }
},
{
path: "knowledge-bases",
name: "knowledgeBaseList",
component: () => import("../views/knowledge/KnowledgeBaseList.vue"),
meta: { requiresInit: true, requiresAuth: true }
},
{
path: "knowledge-bases/:kbId",
name: "knowledgeBaseDetail",
component: () => import("../views/knowledge/KnowledgeBase.vue"),
meta: { requiresInit: true }
meta: { requiresInit: true, requiresAuth: true }
},
{
path: "creatChat",
name: "creatChat",
path: "knowledge-bases/:kbId/creatChat",
name: "kbCreatChat",
component: () => import("../views/creatChat/creatChat.vue"),
meta: { requiresInit: true }
meta: { requiresInit: true, requiresAuth: true }
},
{
path: "chat/:chatid",
path: "knowledge-bases/:kbId/settings",
name: "knowledgeBaseSettings",
component: () => import("../views/initialization/InitializationContent.vue"),
props: { isKbSettings: true },
meta: { requiresInit: true, requiresAuth: true }
},
{
path: "chat/:kbId/:chatid",
name: "chat",
component: () => import("../views/chat/index.vue"),
meta: { requiresInit: true }
},
{
path: "settings",
name: "settings",
component: () => import("../views/settings/Settings.vue"),
meta: { requiresInit: true }
meta: { requiresInit: true, requiresAuth: true }
},
],
},
],
});

// Route guard: check system initialization status
// Route guard: check authentication status and system initialization status
router.beforeEach(async (to, from, next) => {
// If the target is the initialization page, let it through
if (to.meta.requiresInit === false) {
next();
return;
}

try {
// Check whether the system has been initialized
const { initialized } = await checkInitializationStatus();

if (initialized) {
// System already initialized: record it in local storage and continue
localStorage.setItem('system_initialized', 'true');
next();
} else {
// System not initialized: redirect to the initialization page
console.log('系统未初始化,跳转到初始化页面');
next('/initialization');
const authStore = useAuthStore()

// If the target is the login page or the initialization page, let it through
if (to.meta.requiresAuth === false || to.meta.requiresInit === false) {
// If a logged-in user visits the login page, redirect to the knowledge base list
if (to.path === '/login' && authStore.isLoggedIn) {
next('/platform/knowledge-bases')
return
}
} catch (error) {
console.error('检查初始化状态失败:', error);
// If the check fails, assume initialization is still required
next('/initialization');
next()
return
}

// Check the user's authentication status
if (to.meta.requiresAuth !== false) {
if (!authStore.isLoggedIn) {
// Not logged in: redirect to the login page
next('/login')
return
}

// Validate the token
// try {
// const { valid } = await validateToken()
// if (!valid) {
// // Token invalid: clear auth state and redirect to the login page
// authStore.logout()
// next('/login')
// return
// }
// } catch (error) {
// console.error('Token验证失败:', error)
// authStore.logout()
// next('/login')
// return
// }
}

next()
});

export default router
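Note: the token-validation step inside the guard above is left commented out in this change. If it were enabled, that part of the guard would look roughly like the sketch below, which only rearranges the commented lines and assumes validateToken() resolves to an object with a boolean valid field:

// Sketch based on the commented-out block; not enabled in this commit.
try {
  const { valid } = await validateToken()
  if (!valid) {
    // Token invalid: clear auth state and return to the login page
    authStore.logout()
    next('/login')
    return
  }
} catch (error) {
  console.error('Token validation failed:', error)
  authStore.logout()
  next('/login')
  return
}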
169
frontend/src/stores/auth.ts
Normal file
@@ -0,0 +1,169 @@
import { defineStore } from 'pinia'
import { ref, computed } from 'vue'
import type { UserInfo, TenantInfo, KnowledgeBaseInfo } from '@/api/auth'

export const useAuthStore = defineStore('auth', () => {
// State
const user = ref<UserInfo | null>(null)
const tenant = ref<TenantInfo | null>(null)
const token = ref<string>('')
const refreshToken = ref<string>('')
const knowledgeBases = ref<KnowledgeBaseInfo[]>([])
const currentKnowledgeBase = ref<KnowledgeBaseInfo | null>(null)

// Computed properties
const isLoggedIn = computed(() => {
return !!token.value && !!user.value
})

const hasValidTenant = computed(() => {
return !!tenant.value && !!tenant.value.api_key
})

const currentTenantId = computed(() => {
return tenant.value?.id || ''
})

const currentUserId = computed(() => {
return user.value?.id || ''
})

// Actions
const setUser = (userData: UserInfo) => {
user.value = userData
// Persist to localStorage
localStorage.setItem('weknora_user', JSON.stringify(userData))
}

const setTenant = (tenantData: TenantInfo) => {
tenant.value = tenantData
// Persist to localStorage
localStorage.setItem('weknora_tenant', JSON.stringify(tenantData))
}

const setToken = (tokenValue: string) => {
token.value = tokenValue
localStorage.setItem('weknora_token', tokenValue)
}

const setRefreshToken = (refreshTokenValue: string) => {
refreshToken.value = refreshTokenValue
localStorage.setItem('weknora_refresh_token', refreshTokenValue)
}

const setKnowledgeBases = (kbList: KnowledgeBaseInfo[]) => {
// Ensure the input is an array
knowledgeBases.value = Array.isArray(kbList) ? kbList : []
localStorage.setItem('weknora_knowledge_bases', JSON.stringify(knowledgeBases.value))
}

const setCurrentKnowledgeBase = (kb: KnowledgeBaseInfo | null) => {
currentKnowledgeBase.value = kb
if (kb) {
localStorage.setItem('weknora_current_kb', JSON.stringify(kb))
} else {
localStorage.removeItem('weknora_current_kb')
}
}

const logout = () => {
// Clear state
user.value = null
tenant.value = null
token.value = ''
refreshToken.value = ''
knowledgeBases.value = []
currentKnowledgeBase.value = null

// Clear localStorage
localStorage.removeItem('weknora_user')
localStorage.removeItem('weknora_tenant')
localStorage.removeItem('weknora_token')
localStorage.removeItem('weknora_refresh_token')
localStorage.removeItem('weknora_knowledge_bases')
localStorage.removeItem('weknora_current_kb')
}

const initFromStorage = () => {
// Restore state from localStorage
const storedUser = localStorage.getItem('weknora_user')
const storedTenant = localStorage.getItem('weknora_tenant')
const storedToken = localStorage.getItem('weknora_token')
const storedRefreshToken = localStorage.getItem('weknora_refresh_token')
const storedKnowledgeBases = localStorage.getItem('weknora_knowledge_bases')
const storedCurrentKb = localStorage.getItem('weknora_current_kb')

if (storedUser) {
try {
user.value = JSON.parse(storedUser)
} catch (e) {
console.error('解析用户信息失败:', e)
}
}

if (storedTenant) {
try {
tenant.value = JSON.parse(storedTenant)
} catch (e) {
console.error('解析租户信息失败:', e)
}
}

if (storedToken) {
token.value = storedToken
}

if (storedRefreshToken) {
refreshToken.value = storedRefreshToken
}

if (storedKnowledgeBases) {
try {
const parsed = JSON.parse(storedKnowledgeBases)
knowledgeBases.value = Array.isArray(parsed) ? parsed : []
} catch (e) {
console.error('解析知识库列表失败:', e)
knowledgeBases.value = []
}
}

if (storedCurrentKb) {
try {
currentKnowledgeBase.value = JSON.parse(storedCurrentKb)
} catch (e) {
console.error('解析当前知识库失败:', e)
}
}
}

// Restore state from localStorage when the store is created
initFromStorage()

return {
// State
user,
tenant,
token,
refreshToken,
knowledgeBases,
currentKnowledgeBase,

// Computed properties
isLoggedIn,
hasValidTenant,
currentTenantId,
currentUserId,

// Methods
setUser,
setTenant,
setToken,
setRefreshToken,
setKnowledgeBases,
setCurrentKnowledgeBase,
logout,
initFromStorage
}
})
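Note: a brief usage sketch of the store above, for example inside a component or the router guard. Only useAuthStore and its exposed members come from this file; the surrounding context is illustrative:

// Sketch only: reading and clearing auth state via the Pinia store defined above.
import { useAuthStore } from '@/stores/auth'

const auth = useAuthStore() // state is restored from localStorage when the store is first created

if (auth.isLoggedIn) {
  console.log('tenant:', auth.currentTenantId, 'user:', auth.currentUserId)
} else {
  // e.g. send the user to /login, as the router guard in this change does
}

auth.logout() // clears the refs and removes all weknora_* keys from localStorage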