18 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| wizardchen | 6f6ca84dae | feat(docreader): add health check | 2025-09-10 20:22:14 +08:00 |
| wizardchen | 8a2b38da6f | fix: rewrite query is empty | 2025-09-10 19:35:26 +08:00 |
| begoniezhao | 79b9315758 | feat: Added knowledge base column value update method and optimized logging | 2025-09-10 19:04:37 +08:00 |
| Liwx | b3c43b2180 | Update WeKnora.md | 2025-09-10 17:25:07 +08:00 |
| Liwx1014 | bfea6775ee | doc: add process parsing md | 2025-09-10 17:25:07 +08:00 |
| Liwx1014 | 2241127a41 | doc: add process parsing document | 2025-09-10 17:25:07 +08:00 |
| wizardchen | 7cfae7e0d3 | fix: pre fetch ocr models in docker container | 2025-09-10 17:24:26 +08:00 |
| wizardchen | 19d2493afc | fix: make file docker build not work | 2025-09-10 15:13:12 +08:00 |
| wizardchen | 0e1d7edca3 | fix: image parser concurrency error | 2025-09-10 13:19:39 +08:00 |
| wizardchen | fd6c50059e | feat: support modify listening port | 2025-09-10 10:14:43 +08:00 |
| wizardchen | 7775559a9b | feat: use paddle ocr v4 instead | 2025-09-10 01:22:25 +08:00 |
| wizardchen | 2b6cbee1b6 | feat: add aliyun rerank | 2025-09-10 01:22:25 +08:00 |
| wizardchen | 4214e6782b | feat: add aliyun rerank model | 2025-09-10 01:22:25 +08:00 |
| begoniezhao | 3f8a1d20c1 | fix(docreader): update paddle version | 2025-09-09 19:25:02 +08:00 |
| begoniezhao | 7efa173812 | chore: Added VERSION file to optimize docker image build configuration | 2025-09-09 14:57:09 +08:00 |
| wizardchen | 44e0e9ecb8 | docs: update readme | 2025-09-09 11:33:32 +08:00 |
| KaiFan Yu | 820aeacbba | fix: fix app start fail when config STORAGE_TYPE=cos (https://github.com/Tencent/WeKnora/issues/223) | 2025-09-08 23:26:37 +08:00 |
| wizardchen | daa5e8853a | docs: add CHANGELOG for 0.1.0 and version badges | 2025-09-08 23:14:31 +08:00 |
30 changed files with 1490 additions and 834 deletions

.env.example

@@ -27,6 +27,15 @@ STREAM_MANAGER_TYPE=redis
# 数据库端口默认为5432
DB_PORT=5432
# 应用服务端口默认为8080
APP_PORT=8080
# 前端服务端口默认为80
FRONTEND_PORT=80
# 文档解析模块端口默认为50051
DOCREADER_PORT=50051
# 数据库用户名
DB_USER=postgres

GitHub Actions workflow (Docker image build)

@@ -42,6 +42,9 @@ jobs:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
- name: Read VERSION file
run: echo "VERSION=$(cat VERSION)" >> $GITHUB_ENV
- name: Build ${{ matrix.service_name }} Image
uses: docker/build-push-action@v3
with:
@@ -49,4 +52,8 @@ jobs:
platforms: ${{ matrix.platform }}
file: ${{ matrix.file }}
context: ${{ matrix.context }}
tags: ${{ secrets.DOCKERHUB_USERNAME }}/weknora-${{ matrix.service_name }}:latest
tags: |
${{ secrets.DOCKERHUB_USERNAME }}/weknora-${{ matrix.service_name }}:latest
${{ secrets.DOCKERHUB_USERNAME }}/weknora-${{ matrix.service_name }}:${{ env.VERSION }}
cache-from: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/weknora-${{ matrix.service_name }}:cache
cache-to: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/weknora-${{ matrix.service_name }}:cache,mode=max

CHANGELOG.md (new file, 17 lines)

@@ -0,0 +1,17 @@
# Changelog
All notable changes to this project will be documented in this file.
## [0.1.0] - 2025-09-08
- Initial public release of WeKnora.
- Web UI for knowledge upload, chat, configuration, and settings.
- RAG pipeline with chunking, embedding, retrieval, reranking, and generation.
- Initialization wizard for configuring models (LLM, embedding, rerank, retriever).
- Support for local Ollama and remote API models.
- Vector backends: PostgreSQL (pgvector), Elasticsearch; GraphRAG support.
- End-to-end evaluation utilities and metrics.
- Docker Compose for quick startup and service orchestration.
- MCP server support for integrating with MCP-compatible clients.
[0.1.0]: https://github.com/Tencent/WeKnora/tree/v0.1.0

Makefile

@@ -1,4 +1,4 @@
.PHONY: help build run test clean docker-build docker-run migrate-up migrate-down docker-restart docker-stop start-all stop-all start-ollama stop-ollama build-images build-images-app build-images-docreader build-images-frontend clean-images
.PHONY: help build run test clean docker-build-app docker-build-docreader docker-build-frontend docker-build-all docker-run migrate-up migrate-down docker-restart docker-stop start-all stop-all start-ollama stop-ollama build-images build-images-app build-images-docreader build-images-frontend clean-images check-env list-containers pull-images show-platform
# Show help
help:
@@ -11,10 +11,13 @@ help:
@echo " clean 清理构建文件"
@echo ""
@echo "Docker 命令:"
@echo " docker-build 构建 Docker 镜像"
@echo " docker-run 运行 Docker 容器"
@echo " docker-stop 停止 Docker 容器"
@echo " docker-restart 重启 Docker 容器"
@echo " docker-build-app 构建应用 Docker 镜像 (wechatopenai/weknora-app)"
@echo " docker-build-docreader 构建文档读取器镜像 (wechatopenai/weknora-docreader)"
@echo " docker-build-frontend 构建前端镜像 (wechatopenai/weknora-ui)"
@echo " docker-build-all 构建所有 Docker 镜像"
@echo " docker-run 运行 Docker 容器"
@echo " docker-stop 停止 Docker 容器"
@echo " docker-restart 重启 Docker 容器"
@echo ""
@echo "服务管理:"
@echo " start-all 启动所有服务"
@@ -37,15 +40,32 @@ help:
@echo " lint 代码检查"
@echo " deps 安装依赖"
@echo " docs 生成 API 文档"
@echo ""
@echo "环境检查:"
@echo " check-env 检查环境配置"
@echo " list-containers 列出运行中的容器"
@echo " pull-images 拉取最新镜像"
@echo " show-platform 显示当前构建平台"
# Go related variables
BINARY_NAME=WeKnora
MAIN_PATH=./cmd/server
# Docker related variables
DOCKER_IMAGE=WeKnora
DOCKER_IMAGE=wechatopenai/weknora-app
DOCKER_TAG=latest
# Platform detection
ifeq ($(shell uname -m),x86_64)
PLATFORM=linux/amd64
else ifeq ($(shell uname -m),aarch64)
PLATFORM=linux/arm64
else ifeq ($(shell uname -m),arm64)
PLATFORM=linux/arm64
else
PLATFORM=linux/amd64
endif
# Build the application
build:
go build -o $(BINARY_NAME) $(MAIN_PATH)
@@ -64,8 +84,19 @@ clean:
rm -f $(BINARY_NAME)
# Build Docker image
docker-build:
docker build -t $(DOCKER_IMAGE):$(DOCKER_TAG) .
docker-build-app:
docker build --platform $(PLATFORM) -f docker/Dockerfile.app -t $(DOCKER_IMAGE):$(DOCKER_TAG) .
# Build docreader Docker image
docker-build-docreader:
docker build --platform $(PLATFORM) -f docker/Dockerfile.docreader -t wechatopenai/weknora-docreader:latest .
# Build frontend Docker image
docker-build-frontend:
docker build --platform $(PLATFORM) -f frontend/Dockerfile -t wechatopenai/weknora-ui:latest frontend/
# Build all Docker images
docker-build-all: docker-build-app docker-build-docreader docker-build-frontend
# Run Docker container (传统方式)
docker-run:
@@ -107,10 +138,10 @@ build-images-frontend:
clean-images:
./scripts/build_images.sh --clean
# Restart Docker container (stop, rebuild, start)
# Restart Docker container (stop, start)
docker-restart:
docker-compose stop -t 60
docker-compose up --build
docker-compose up
# Database migrations
migrate-up:
@@ -151,4 +182,21 @@ clean-db:
docker volume rm weknora_redis_data; \
fi
# Environment check
check-env:
./scripts/start_all.sh --check
# List containers
list-containers:
./scripts/start_all.sh --list
# Pull latest images
pull-images:
./scripts/start_all.sh --pull
# Show current platform
show-platform:
@echo "当前系统架构: $(shell uname -m)"
@echo "Docker构建平台: $(PLATFORM)"

README.md

@@ -14,148 +14,154 @@
<a href="https://github.com/Tencent/WeKnora/blob/main/LICENSE">
<img src="https://img.shields.io/badge/License-MIT-ffffff?labelColor=d4eaf7&color=2e6cc4" alt="License">
</a>
<a href="./CHANGELOG.md">
<img alt="Version" src="https://img.shields.io/badge/version-0.1.0-2e6cc4?labelColor=d4eaf7">
</a>
</p>
<p align="center">
| <a href="./README_EN.md"><b>English</b></a> | <b>简体中文</b> | <a href="./README_JA.md"><b>日本語</b></a> |
| <b>English</b> | <a href="./README_CN.md"><b>简体中文</b></a> | <a href="./README_JA.md"><b>日本語</b></a> |
</p>
<p align="center">
<h4 align="center">
[项目介绍](#-项目介绍) • [架构设计](#-架构设计) • [核心特性](#-核心特性) • [快速开始](#-快速开始) • [文档](#-文档) • [开发指南](#-开发指南)
[Overview](#-overview) • [Architecture](#-architecture) • [Key Features](#-key-features) • [Getting Started](#-getting-started) • [API Reference](#-api-reference) • [Developer Guide](#-developer-guide)
</h4>
</p>
# 💡 WeKnora - 基于大模型的文档理解检索框架
# 💡 WeKnora - LLM-Powered Document Understanding & Retrieval Framework
## 📌 项目介绍
## 📌 Overview
[**WeKnora(维娜拉)**](https://weknora.weixin.qq.com) 是一款基于大语言模型LLM的文档理解与语义检索框架专为结构复杂、内容异构的文档场景而打造。
[**WeKnora**](https://weknora.weixin.qq.com) is an LLM-powered framework designed for deep document understanding and semantic retrieval, especially for handling complex, heterogeneous documents.
框架采用模块化架构,融合多模态预处理、语义向量索引、智能召回与大模型生成推理,构建起高效、可控的文档问答流程。核心检索流程基于 **RAGRetrieval-Augmented Generation** 机制,将上下文相关片段与语言模型结合,实现更高质量的语义回答。
It adopts a modular architecture that combines multimodal preprocessing, semantic vector indexing, intelligent retrieval, and large language model inference. At its core, WeKnora follows the **RAG (Retrieval-Augmented Generation)** paradigm, enabling high-quality, context-aware answers by combining relevant document chunks with model reasoning.
**官网:** https://weknora.weixin.qq.com
**Website:** https://weknora.weixin.qq.com
## 🏗️ 架构设计
## 🏗️ Architecture
![weknora-pipelone.png](./docs/images/pipeline.jpg)
![weknora-pipeline.png](./docs/images/pipeline.jpg)
WeKnora 采用现代化模块化设计,构建了一条完整的文档理解与检索流水线。系统主要包括文档解析、向量化处理、检索引擎和大模型推理等核心模块,每个组件均可灵活配置与扩展。
WeKnora employs a modern modular design to build a complete document understanding and retrieval pipeline. The system primarily includes document parsing, vector processing, retrieval engine, and large model inference as core modules, with each component being flexibly configurable and extendable.
## 🎯 核心特性
## 🎯 Key Features
- **🔍 精准理解**:支持 PDFWord、图片等文档的结构化内容提取,统一构建语义视图
- **🧠 智能推理**:借助大语言模型理解文档上下文与用户意图,支持精准问答与多轮对话
- **🔧 灵活扩展**:从解析、嵌入、召回到生成全流程解耦,便于灵活集成与定制扩展
- **⚡ 高效检索**:混合多种检索策略:关键词、向量、知识图谱
- **🎯 简单易用**直观的Web界面与标准API零技术门槛快速上手
- **🔒 安全可控**:支持本地化与私有云部署,数据完全自主可控
- **🔍 Precise Understanding**: Structured content extraction from PDFs, Word documents, images and more into unified semantic views
- **🧠 Intelligent Reasoning**: Leverages LLMs to understand document context and user intent for accurate Q&A and multi-turn conversations
- **🔧 Flexible Extension**: All components from parsing and embedding to retrieval and generation are decoupled for easy customization
- **⚡ Efficient Retrieval**: Hybrid retrieval strategies combining keywords, vectors, and knowledge graphs
- **🎯 User-Friendly**: Intuitive web interface and standardized APIs for zero technical barriers
- **🔒 Secure & Controlled**: Support for local deployment and private cloud, ensuring complete data sovereignty
## 📊 适用场景
## 📊 Application Scenarios
| 应用场景 | 具体应用 | 核心价值 |
| Scenario | Applications | Core Value |
|---------|----------|----------|
| **企业知识管理** | 内部文档检索、规章制度问答、操作手册查询 | 提升知识查找效率,降低培训成本 |
| **科研文献分析** | 论文检索、研究报告分析、学术资料整理 | 加速文献调研,辅助研究决策 |
| **产品技术支持** | 产品手册问答、技术文档检索、故障排查 | 提升客户服务质量,减少技术支持负担 |
| **法律合规审查** | 合同条款检索、法规政策查询、案例分析 | 提高合规效率,降低法律风险 |
| **医疗知识辅助** | 医学文献检索、诊疗指南查询、病例分析 | 辅助临床决策,提升诊疗质量 |
| **Enterprise Knowledge Management** | Internal document retrieval, policy Q&A, operation manual search | Improve knowledge discovery efficiency, reduce training costs |
| **Academic Research Analysis** | Paper retrieval, research report analysis, scholarly material organization | Accelerate literature review, assist research decisions |
| **Product Technical Support** | Product manual Q&A, technical documentation search, troubleshooting | Enhance customer service quality, reduce support burden |
| **Legal & Compliance Review** | Contract clause retrieval, regulatory policy search, case analysis | Improve compliance efficiency, reduce legal risks |
| **Medical Knowledge Assistance** | Medical literature retrieval, treatment guideline search, case analysis | Support clinical decisions, improve diagnosis quality |
## 🧩 功能模块能力
## 🧩 Feature Matrix
| 功能模块 | 支持情况 | 说明 |
| Module | Support | Description |
|---------|---------|------|
| 文档格式支持 | ✅ PDF / Word / Txt / Markdown / 图片(含 OCR / Caption | 支持多种结构化与非结构化文档内容解析,支持图文混排与图像文字提取 |
| 嵌入模型支持 | ✅ 本地模型、BGE / GTE API 等 | 支持自定义 embedding 模型,兼容本地部署与云端向量生成接口 |
| 向量数据库接入 | ✅ PostgreSQLpgvector)、Elasticsearch | 支持主流向量索引后端,可灵活切换与扩展,适配不同检索场景 |
| 检索机制 | ✅ BM25 / Dense Retrieve / GraphRAG | 支持稠密/稀疏召回、知识图谱增强检索等多种策略,可自由组合召回-重排-生成流程 |
| 大模型集成 | ✅ 支持 QwenDeepSeek 等,思考/非思考模式切换 | 可接入本地大模型(如 Ollama 启动)或调用外部 API 服务,支持推理模式灵活配置 |
| 问答能力 | ✅ 上下文感知、多轮对话、提示词模板 | 支持复杂语义建模、指令控制与链式问答,可配置提示词与上下文窗口 |
| 端到端测试支持 | ✅ 检索+生成过程可视化与指标评估 | 提供一体化链路测试工具,支持评估召回命中率、回答覆盖度、BLEU / ROUGE 等主流指标 |
| 部署模式 | ✅ 支持本地部署 / Docker 镜像 | 满足私有化、离线部署与灵活运维的需求 |
| 用户界面 | ✅ Web UI + RESTful API | 提供交互式界面与标准 API 接口,适配开发者与业务用户使用习惯 |
| Document Formats | ✅ PDF / Word / Txt / Markdown / Images (with OCR / Caption) | Support for structured and unstructured documents with text extraction from images |
| Embedding Models | ✅ Local models, BGE / GTE APIs, etc. | Customizable embedding models, compatible with local deployment and cloud vector generation APIs |
| Vector DB Integration | ✅ PostgreSQL (pgvector), Elasticsearch | Support for mainstream vector index backends, flexible switching for different retrieval scenarios |
| Retrieval Strategies | ✅ BM25 / Dense Retrieval / GraphRAG | Support for sparse/dense recall and knowledge graph-enhanced retrieval with customizable retrieve-rerank-generate pipelines |
| LLM Integration | ✅ Support for Qwen, DeepSeek, etc., with thinking/non-thinking mode switching | Compatible with local models (e.g., via Ollama) or external API services with flexible inference configuration |
| QA Capabilities | ✅ Context-aware, multi-turn dialogue, prompt templates | Support for complex semantic modeling, instruction control and chain-of-thought Q&A with configurable prompts and context windows |
| E2E Testing | ✅ Retrieval+generation process visualization and metric evaluation | End-to-end testing tools for evaluating recall hit rates, answer coverage, BLEU/ROUGE and other metrics |
| Deployment Modes | ✅ Support for local deployment / Docker images | Meets private, offline deployment and flexible operation requirements |
| User Interfaces | ✅ Web UI + RESTful API | Interactive interface and standard API endpoints, suitable for both developers and business users |
## 🚀 快速开始
## 🚀 Getting Started
### 🛠 环境要求
### 🛠 Prerequisites
确保本地已安装以下工具:
Make sure the following tools are installed on your system:
* [Docker](https://www.docker.com/)
* [Docker Compose](https://docs.docker.com/compose/)
* [Git](https://git-scm.com/)
### 📦 安装步骤
### 📦 Installation
#### ① 克隆代码仓库
#### ① Clone the repository
```bash
# 克隆主仓库
# Clone the main repository
git clone https://github.com/Tencent/WeKnora.git
cd WeKnora
```
#### ② 配置环境变量
#### ② Configure environment variables
```bash
# 复制示例配置文件
# Copy example env file
cp .env.example .env
# 编辑 .env,填入对应配置信息
# 所有变量说明详见 .env.example 注释
# Edit .env and set required values
# All variables are documented in the .env.example comments
```
#### ③ 启动服务
#### ③ Start the services
```bash
# 启动全部服务(含 Ollama 与后端容器)
# Start all services (Ollama + backend containers)
./scripts/start_all.sh
#
# Or
make start-all
```
#### ③ 启动服务备选
#### ③ Start the services (backup)
```bash
# 启动 ollama 服务 (可选)
# Start ollama services (Optional)
ollama serve > /dev/null 2>&1 &
# 启动服务
# Start the service
docker compose up -d
```
#### ④ 停止服务
#### ④ Stop the services
```bash
./scripts/start_all.sh --stop
#
# Or
make stop-all
```
### 🌐 服务访问地址
### 🌐 Access Services
启动成功后,可访问以下地址:
Once started, services will be available at:
* Web UI`http://localhost`
* 后端 API`http://localhost:8080`
* 链路追踪(Jaeger`http://localhost:16686`
* Web UI: `http://localhost`
* Backend API: `http://localhost:8080`
* Jaeger Tracing: `http://localhost:16686`
### 🔌 使用微信对话开放平台
### 🔌 Using WeChat Dialog Open Platform
WeKnora 作为[微信对话开放平台](https://chatbot.weixin.qq.com)的核心技术框架,提供更简便的使用方式:
WeKnora serves as the core technology framework for the [WeChat Dialog Open Platform](https://chatbot.weixin.qq.com), providing a more convenient usage approach:
- **零代码部署**:只需上传知识,即可在微信生态中快速部署智能问答服务,实现"即问即答"的体验
- **高效问题管理**:支持高频问题的独立分类管理,提供丰富的数据工具,确保回答精准可靠且易于维护
- **微信生态覆盖**通过微信对话开放平台WeKnora 的智能问答能力可无缝集成到公众号、小程序等微信场景中,提升用户交互体验
### 🔗MCP服务器访问已经部署好的WEKnora
#### 1⃣克隆储存库
- **Zero-code Deployment**: Simply upload knowledge to quickly deploy intelligent Q&A services within the WeChat ecosystem, achieving an "ask and answer" experience
- **Efficient Question Management**: Support for categorized management of high-frequency questions, with rich data tools to ensure accurate, reliable, and easily maintainable answers
- **WeChat Ecosystem Integration**: Through the WeChat Dialog Open Platform, WeKnora's intelligent Q&A capabilities can be seamlessly integrated into WeChat Official Accounts, Mini Programs, and other WeChat scenarios, enhancing user interaction experiences
### 🔗 Access WeKnora via MCP Server
#### 1⃣ Clone the repository
```
git clone https://github.com/Tencent/WeKnora
```
#### 2⃣配置MCP服务器
mcp客户端配置服务器
#### 2⃣ Configure MCP Server
Configure the MCP client to connect to the server:
```json
{
"mcpServers": {
@@ -165,150 +171,150 @@ mcp客户端配置服务器
],
"command": "python",
"env":{
"WEKNORA_API_KEY":"进入你的weknora实例打开开发者工具查看请求头x-api-key以sk开头",
"WEKNORA_BASE_URL":"http(s)://你的weknora地址/api/v1"
"WEKNORA_API_KEY":"Enter your WeKnora instance, open developer tools, check the request header x-api-key starting with sk",
"WEKNORA_BASE_URL":"http(s)://your-weknora-address/api/v1"
}
}
}
}
```
使用stdio命令直接运行
Run directly using stdio command:
```
pip install weknora-mcp-server
python -m weknora-mcp-server
```
## 🔧 初始化配置引导
## 🔧 Initialization Configuration Guide
为了方便用户快速配置各类模型降低试错成本我们改进了原来的配置文件初始化方式增加了Web UI界面进行各种模型的配置。在使用之前请确保代码更新到最新版本。具体使用步骤如下
如果是第一次使用本项目,可跳过①②步骤,直接进入③④步骤。
To help users quickly configure various models and reduce trial-and-error costs, we've improved the original configuration file initialization method by adding a Web UI interface for model configuration. Before using, please ensure the code is updated to the latest version. The specific steps are as follows:
If this is your first time using this project, you can skip steps ①② and go directly to steps ③④.
### ① 关闭服务
### ① Stop the services
```bash
./scripts/start_all.sh --stop
```
### ② 清空原有数据表(建议在没有重要数据的情况下使用)
### ② Clear existing data tables (recommended when no important data exists)
```bash
make clean-db
```
### ③ 编译并启动服务
### ③ Compile and start services
```bash
./scripts/start_all.sh
```
### ④ 访问Web UI
### ④ Access Web UI
http://localhost
首次访问会自动跳转到初始化配置页面,配置完成后会自动跳转到知识库页面。请按照页面提示信息完成模型的配置。
On first access, it will automatically redirect to the initialization configuration page. After configuration is complete, it will automatically redirect to the knowledge base page. Please follow the page instructions to complete model configuration.
![配置页面](./docs/images/config.png)
![Configuration Page](./docs/images/config.png)
## 📱 Interface Showcase
## 📱 功能展示
### Web UI 界面
### Web UI Interface
<table>
<tr>
<td><b>知识上传</b><br/><img src="./docs/images/knowledges.png" alt="知识上传界面"></td>
<td><b>知识问答入口</b><br/><img src="./docs/images/qa.png" alt="知识问答入口"></td>
<td><b>Knowledge Upload</b><br/><img src="./docs/images/knowledges.png" alt="Knowledge Upload Interface"></td>
<td><b>Q&A Entry</b><br/><img src="./docs/images/qa.png" alt="Q&A Entry Interface"></td>
</tr>
<tr>
<td colspan="2"><b>图文结果回答</b><br/><img src="./docs/images/answer.png" alt="图文结果回答"></td>
<td colspan="2"><b>Rich Text & Image Responses</b><br/><img src="./docs/images/answer.png" alt="Rich Answer Interface"></td>
</tr>
</table>
**知识库管理:** 支持拖拽上传各类文档,自动识别文档结构并提取核心知识,建立索引。系统清晰展示处理进度和文档状态,实现高效的知识库管理。
**Knowledge Base Management:** Support for dragging and dropping various documents, automatically identifying document structures and extracting core knowledge to establish indexes. The system clearly displays processing progress and document status, achieving efficient knowledge base management.
### 文档知识图谱
### Document Knowledge Graph
<table>
<tr>
<td><img src="./docs/images/graph2.png" alt="知识图谱展示1"></td>
<td><img src="./docs/images/graph1.png" alt="知识图谱展示2"></td>
<td><img src="./docs/images/graph2.png" alt="Knowledge Graph View 1"></td>
<td><img src="./docs/images/graph1.png" alt="Knowledge Graph View 2"></td>
</tr>
</table>
WeKnora 支持将文档转化为知识图谱,展示文档中不同段落之间的关联关系。开启知识图谱功能后,系统会分析并构建文档内部的语义关联网络,不仅帮助用户理解文档内容,还为索引和检索提供结构化支撑,提升检索结果的相关性和广度。
### 配套MCP服务器调用效果
<img width="950" height="2063" alt="118d078426f42f3d4983c13386085d7f" src="https://github.com/user-attachments/assets/09111ec8-0489-415c-969d-aa3835778e14" />
WeKnora supports transforming documents into knowledge graphs, displaying the relationships between different sections of the documents. Once the knowledge graph feature is enabled, the system analyzes and constructs an internal semantic association network that not only helps users understand document content but also provides structured support for indexing and retrieval, enhancing the relevance and breadth of search results.
### MCP Server Integration Effects
<img width="950" height="2063" alt="MCP Server Integration Demo" src="https://github.com/user-attachments/assets/09111ec8-0489-415c-969d-aa3835778e14" />
## 📘 文档
## 📘 API Reference
常见问题排查:[常见问题排查](./docs/QA.md)
Troubleshooting FAQ: [Troubleshooting FAQ](./docs/QA.md)
详细接口说明请参考:[API 文档](./docs/API.md)
Detailed API documentation is available at: [API Docs](./docs/API.md)
## 🧭 开发指南
## 🧭 Developer Guide
### 📁 项目目录结构
### 📁 Directory Structure
```
WeKnora/
├── cmd/ # 应用入口
├── internal/ # 核心业务逻辑
├── config/ # 配置文件
├── migrations/ # 数据库迁移脚本
├── scripts/ # 启动与工具脚本
├── services/ # 各子服务实现
├── frontend/ # 前端项目
└── docs/ # 项目文档
├── cmd/ # Main entry point
├── internal/ # Core business logic
├── config/ # Configuration files
├── migrations/ # DB migration scripts
├── scripts/ # Shell scripts
├── services/ # Microservice logic
├── frontend/ # Frontend app
└── docs/ # Project documentation
```
### 🔧 常用命令
### 🔧 Common Commands
```bash
# 清空数据库(慎用!)
# Wipe all data from DB (use with caution)
make clean-db
```
## 🤝 贡献指南
## 🤝 Contributing
我们欢迎社区用户参与贡献如有建议、Bug 或新功能需求,请通过 [Issue](https://github.com/Tencent/WeKnora/issues) 提出,或直接提交 Pull Request
We welcome community contributions! For suggestions, bugs, or feature requests, please submit an [Issue](https://github.com/Tencent/WeKnora/issues) or directly create a Pull Request.
### 🎯 贡献方式
### 🎯 How to Contribute
- 🐛 **Bug修复**: 发现并修复系统缺陷
-**新功能**: 提出并实现新特性
- 📚 **文档改进**: 完善项目文档
- 🧪 **测试用例**: 编写单元测试和集成测试
- 🎨 **UI/UX优化**: 改进用户界面和体验
- 🐛 **Bug Fixes**: Discover and fix system defects
-**New Features**: Propose and implement new capabilities
- 📚 **Documentation**: Improve project documentation
- 🧪 **Test Cases**: Write unit and integration tests
- 🎨 **UI/UX Enhancements**: Improve user interface and experience
### 📋 贡献流程
### 📋 Contribution Process
1. **Fork项目** 到你的GitHub账户
2. **创建特性分支** `git checkout -b feature/amazing-feature`
3. **提交更改** `git commit -m 'Add amazing feature'`
4. **推送分支** `git push origin feature/amazing-feature`
5. **创建Pull Request** 并详细描述变更内容
1. **Fork the project** to your GitHub account
2. **Create a feature branch** `git checkout -b feature/amazing-feature`
3. **Commit changes** `git commit -m 'Add amazing feature'`
4. **Push branch** `git push origin feature/amazing-feature`
5. **Create a Pull Request** with detailed description of changes
### 🎨 代码规范
### 🎨 Code Standards
- 遵循 [Go Code Review Comments](https://github.com/golang/go/wiki/CodeReviewComments)
- 使用 `gofmt` 格式化代码
- 添加必要的单元测试
- 更新相关文档
- Follow [Go Code Review Comments](https://github.com/golang/go/wiki/CodeReviewComments)
- Format code using `gofmt`
- Add necessary unit tests
- Update relevant documentation
### 📝 提交规范
### 📝 Commit Guidelines
使用 [Conventional Commits](https://www.conventionalcommits.org/) 规范:
Use [Conventional Commits](https://www.conventionalcommits.org/) standard:
```
feat: 添加文档批量上传功能
fix: 修复向量检索精度问题
docs: 更新API文档
test: 添加检索引擎测试用例
refactor: 重构文档解析模块
feat: Add document batch upload functionality
fix: Resolve vector retrieval precision issue
docs: Update API documentation
test: Add retrieval engine test cases
refactor: Restructure document parsing module
```
## 📄 许可证
## 📄 License
本项目基于 [MIT](./LICENSE) 协议发布。
你可以自由使用、修改和分发本项目代码,但需保留原始版权声明。
This project is licensed under the [MIT License](./LICENSE).
You are free to use, modify, and distribute the code with proper attribution.

README_CN.md (new file, 317 lines)

@@ -0,0 +1,317 @@
<p align="center">
<picture>
<img src="./docs/images/logo.png" alt="WeKnora Logo" height="120"/>
</picture>
</p>
<p align="center">
<a href="https://weknora.weixin.qq.com" target="_blank">
<img alt="官方网站" src="https://img.shields.io/badge/官方网站-WeKnora-4e6b99">
</a>
<a href="https://chatbot.weixin.qq.com" target="_blank">
<img alt="微信对话开放平台" src="https://img.shields.io/badge/微信对话开放平台-5ac725">
</a>
<a href="https://github.com/Tencent/WeKnora/blob/main/LICENSE">
<img src="https://img.shields.io/badge/License-MIT-ffffff?labelColor=d4eaf7&color=2e6cc4" alt="License">
</a>
<a href="./CHANGELOG.md">
<img alt="版本" src="https://img.shields.io/badge/version-0.1.0-2e6cc4?labelColor=d4eaf7">
</a>
</p>
<p align="center">
| <a href="./README.md"><b>English</b></a> | <b>简体中文</b> | <a href="./README_JA.md"><b>日本語</b></a> |
</p>
<p align="center">
<h4 align="center">
[项目介绍](#-项目介绍) • [架构设计](#-架构设计) • [核心特性](#-核心特性) • [快速开始](#-快速开始) • [文档](#-文档) • [开发指南](#-开发指南)
</h4>
</p>
# 💡 WeKnora - 基于大模型的文档理解检索框架
## 📌 项目介绍
[**WeKnora维娜拉**](https://weknora.weixin.qq.com) 是一款基于大语言模型LLM的文档理解与语义检索框架专为结构复杂、内容异构的文档场景而打造。
框架采用模块化架构,融合多模态预处理、语义向量索引、智能召回与大模型生成推理,构建起高效、可控的文档问答流程。核心检索流程基于 **RAGRetrieval-Augmented Generation** 机制,将上下文相关片段与语言模型结合,实现更高质量的语义回答。
**官网:** https://weknora.weixin.qq.com
## 🏗️ 架构设计
![weknora-pipelone.png](./docs/images/pipeline.jpg)
WeKnora 采用现代化模块化设计,构建了一条完整的文档理解与检索流水线。系统主要包括文档解析、向量化处理、检索引擎和大模型推理等核心模块,每个组件均可灵活配置与扩展。
## 🎯 核心特性
- **🔍 精准理解**:支持 PDF、Word、图片等文档的结构化内容提取统一构建语义视图
- **🧠 智能推理**:借助大语言模型理解文档上下文与用户意图,支持精准问答与多轮对话
- **🔧 灵活扩展**:从解析、嵌入、召回到生成全流程解耦,便于灵活集成与定制扩展
- **⚡ 高效检索**:混合多种检索策略:关键词、向量、知识图谱
- **🎯 简单易用**直观的Web界面与标准API零技术门槛快速上手
- **🔒 安全可控**:支持本地化与私有云部署,数据完全自主可控
## 📊 适用场景
| 应用场景 | 具体应用 | 核心价值 |
|---------|----------|----------|
| **企业知识管理** | 内部文档检索、规章制度问答、操作手册查询 | 提升知识查找效率,降低培训成本 |
| **科研文献分析** | 论文检索、研究报告分析、学术资料整理 | 加速文献调研,辅助研究决策 |
| **产品技术支持** | 产品手册问答、技术文档检索、故障排查 | 提升客户服务质量,减少技术支持负担 |
| **法律合规审查** | 合同条款检索、法规政策查询、案例分析 | 提高合规效率,降低法律风险 |
| **医疗知识辅助** | 医学文献检索、诊疗指南查询、病例分析 | 辅助临床决策,提升诊疗质量 |
## 🧩 功能模块能力
| 功能模块 | 支持情况 | 说明 |
|---------|---------|------|
| 文档格式支持 | ✅ PDF / Word / Txt / Markdown / 图片(含 OCR / Caption | 支持多种结构化与非结构化文档内容解析,支持图文混排与图像文字提取 |
| 嵌入模型支持 | ✅ 本地模型、BGE / GTE API 等 | 支持自定义 embedding 模型,兼容本地部署与云端向量生成接口 |
| 向量数据库接入 | ✅ PostgreSQLpgvector、Elasticsearch | 支持主流向量索引后端,可灵活切换与扩展,适配不同检索场景 |
| 检索机制 | ✅ BM25 / Dense Retrieve / GraphRAG | 支持稠密/稀疏召回、知识图谱增强检索等多种策略,可自由组合召回-重排-生成流程 |
| 大模型集成 | ✅ 支持 Qwen、DeepSeek 等,思考/非思考模式切换 | 可接入本地大模型(如 Ollama 启动)或调用外部 API 服务,支持推理模式灵活配置 |
| 问答能力 | ✅ 上下文感知、多轮对话、提示词模板 | 支持复杂语义建模、指令控制与链式问答,可配置提示词与上下文窗口 |
| 端到端测试支持 | ✅ 检索+生成过程可视化与指标评估 | 提供一体化链路测试工具支持评估召回命中率、回答覆盖度、BLEU / ROUGE 等主流指标 |
| 部署模式 | ✅ 支持本地部署 / Docker 镜像 | 满足私有化、离线部署与灵活运维的需求 |
| 用户界面 | ✅ Web UI + RESTful API | 提供交互式界面与标准 API 接口,适配开发者与业务用户使用习惯 |
## 🚀 快速开始
### 🛠 环境要求
确保本地已安装以下工具:
* [Docker](https://www.docker.com/)
* [Docker Compose](https://docs.docker.com/compose/)
* [Git](https://git-scm.com/)
### 📦 安装步骤
#### ① 克隆代码仓库
```bash
# 克隆主仓库
git clone https://github.com/Tencent/WeKnora.git
cd WeKnora
```
#### ② 配置环境变量
```bash
# 复制示例配置文件
cp .env.example .env
# 编辑 .env填入对应配置信息
# 所有变量说明详见 .env.example 注释
```
#### ③ 启动服务
```bash
# 启动全部服务(含 Ollama 与后端容器)
./scripts/start_all.sh
# 或
make start-all
```
#### ③ 启动服务备选
```bash
# 启动 ollama 服务 (可选)
ollama serve > /dev/null 2>&1 &
# 启动服务
docker compose up -d
```
#### ④ 停止服务
```bash
./scripts/start_all.sh --stop
# 或
make stop-all
```
### 🌐 服务访问地址
启动成功后,可访问以下地址:
* Web UI`http://localhost`
* 后端 API`http://localhost:8080`
* 链路追踪Jaeger`http://localhost:16686`
### 🔌 使用微信对话开放平台
WeKnora 作为[微信对话开放平台](https://chatbot.weixin.qq.com)的核心技术框架,提供更简便的使用方式:
- **零代码部署**:只需上传知识,即可在微信生态中快速部署智能问答服务,实现"即问即答"的体验
- **高效问题管理**:支持高频问题的独立分类管理,提供丰富的数据工具,确保回答精准可靠且易于维护
- **微信生态覆盖**通过微信对话开放平台WeKnora 的智能问答能力可无缝集成到公众号、小程序等微信场景中,提升用户交互体验
### 🔗MCP服务器访问已经部署好的WEKnora
#### 1⃣克隆储存库
```
git clone https://github.com/Tencent/WeKnora
```
#### 2⃣配置MCP服务器
mcp客户端配置服务器
```json
{
"mcpServers": {
"weknora": {
"args": [
"path/to/WeKnora/mcp-server/run_server.py"
],
"command": "python",
"env":{
"WEKNORA_API_KEY":"进入你的weknora实例打开开发者工具查看请求头x-api-key以sk开头",
"WEKNORA_BASE_URL":"http(s)://你的weknora地址/api/v1"
}
}
}
}
```
使用stdio命令直接运行
```
pip install weknora-mcp-server
python -m weknora-mcp-server
```
## 🔧 初始化配置引导
为了方便用户快速配置各类模型降低试错成本我们改进了原来的配置文件初始化方式增加了Web UI界面进行各种模型的配置。在使用之前请确保代码更新到最新版本。具体使用步骤如下
如果是第一次使用本项目,可跳过①②步骤,直接进入③④步骤。
### ① 关闭服务
```bash
./scripts/start_all.sh --stop
```
### ② 清空原有数据表(建议在没有重要数据的情况下使用)
```bash
make clean-db
```
### ③ 编译并启动服务
```bash
./scripts/start_all.sh
```
### ④ 访问Web UI
http://localhost
首次访问会自动跳转到初始化配置页面,配置完成后会自动跳转到知识库页面。请按照页面提示信息完成模型的配置。
![配置页面](./docs/images/config.png)
## 📱 功能展示
### Web UI 界面
<table>
<tr>
<td><b>知识上传</b><br/><img src="./docs/images/knowledges.png" alt="知识上传界面"></td>
<td><b>知识问答入口</b><br/><img src="./docs/images/qa.png" alt="知识问答入口"></td>
</tr>
<tr>
<td colspan="2"><b>图文结果回答</b><br/><img src="./docs/images/answer.png" alt="图文结果回答"></td>
</tr>
</table>
**知识库管理:** 支持拖拽上传各类文档,自动识别文档结构并提取核心知识,建立索引。系统清晰展示处理进度和文档状态,实现高效的知识库管理。
### 文档知识图谱
<table>
<tr>
<td><img src="./docs/images/graph2.png" alt="知识图谱展示1"></td>
<td><img src="./docs/images/graph1.png" alt="知识图谱展示2"></td>
</tr>
</table>
WeKnora 支持将文档转化为知识图谱,展示文档中不同段落之间的关联关系。开启知识图谱功能后,系统会分析并构建文档内部的语义关联网络,不仅帮助用户理解文档内容,还为索引和检索提供结构化支撑,提升检索结果的相关性和广度。
### 配套MCP服务器调用效果
<img width="950" height="2063" alt="118d078426f42f3d4983c13386085d7f" src="https://github.com/user-attachments/assets/09111ec8-0489-415c-969d-aa3835778e14" />
## 📘 文档
常见问题排查:[常见问题排查](./docs/QA.md)
详细接口说明请参考:[API 文档](./docs/API.md)
## 🧭 开发指南
### 📁 项目目录结构
```
WeKnora/
├── cmd/ # 应用入口
├── internal/ # 核心业务逻辑
├── config/ # 配置文件
├── migrations/ # 数据库迁移脚本
├── scripts/ # 启动与工具脚本
├── services/ # 各子服务实现
├── frontend/ # 前端项目
└── docs/ # 项目文档
```
### 🔧 常用命令
```bash
# 清空数据库(慎用!)
make clean-db
```
## 🤝 贡献指南
我们欢迎社区用户参与贡献如有建议、Bug 或新功能需求,请通过 [Issue](https://github.com/Tencent/WeKnora/issues) 提出,或直接提交 Pull Request。
### 🎯 贡献方式
- 🐛 **Bug修复**: 发现并修复系统缺陷
-**新功能**: 提出并实现新特性
- 📚 **文档改进**: 完善项目文档
- 🧪 **测试用例**: 编写单元测试和集成测试
- 🎨 **UI/UX优化**: 改进用户界面和体验
### 📋 贡献流程
1. **Fork项目** 到你的GitHub账户
2. **创建特性分支** `git checkout -b feature/amazing-feature`
3. **提交更改** `git commit -m 'Add amazing feature'`
4. **推送分支** `git push origin feature/amazing-feature`
5. **创建Pull Request** 并详细描述变更内容
### 🎨 代码规范
- 遵循 [Go Code Review Comments](https://github.com/golang/go/wiki/CodeReviewComments)
- 使用 `gofmt` 格式化代码
- 添加必要的单元测试
- 更新相关文档
### 📝 提交规范
使用 [Conventional Commits](https://www.conventionalcommits.org/) 规范:
```
feat: 添加文档批量上传功能
fix: 修复向量检索精度问题
docs: 更新API文档
test: 添加检索引擎测试用例
refactor: 重构文档解析模块
```
## 📄 许可证
本项目基于 [MIT](./LICENSE) 协议发布。
你可以自由使用、修改和分发本项目代码,但需保留原始版权声明。

README_EN.md (deleted, 249 lines)

@@ -1,249 +0,0 @@
<p align="center">
<picture>
<img src="./docs/images/logo.png" alt="WeKnora Logo" height="120"/>
</picture>
</p>
<p align="center">
<a href="https://weknora.weixin.qq.com" target="_blank">
<img alt="官方网站" src="https://img.shields.io/badge/官方网站-WeKnora-4e6b99">
</a>
<a href="https://chatbot.weixin.qq.com" target="_blank">
<img alt="微信对话开放平台" src="https://img.shields.io/badge/微信对话开放平台-5ac725">
</a>
<a href="https://github.com/Tencent/WeKnora/blob/main/LICENSE">
<img src="https://img.shields.io/badge/License-MIT-ffffff?labelColor=d4eaf7&color=2e6cc4" alt="License">
</a>
</p>
<p align="center">
| <b>English</b> | <a href="./README.md"><b>简体中文</b></a> | <a href="./README_JA.md"><b>日本語</b></a> |
</p>
<p align="center">
<h4 align="center">
[Overview](#-overview) • [Architecture](#-architecture) • [Key Features](#-key-features) • [Getting Started](#-getting-started) • [API Reference](#-api-reference) • [Developer Guide](#-developer-guide)
</h4>
</p>
# 💡 WeKnora - LLM-Powered Document Understanding & Retrieval Framework
## 📌 Overview
[**WeKnora**](https://weknora.weixin.qq.com) is an LLM-powered framework designed for deep document understanding and semantic retrieval, especially for handling complex, heterogeneous documents.
It adopts a modular architecture that combines multimodal preprocessing, semantic vector indexing, intelligent retrieval, and large language model inference. At its core, WeKnora follows the **RAG (Retrieval-Augmented Generation)** paradigm, enabling high-quality, context-aware answers by combining relevant document chunks with model reasoning.
**Website:** https://weknora.weixin.qq.com
## 🏗️ Architecture
![weknora-pipeline.png](./docs/images/pipeline.jpg)
WeKnora employs a modern modular design to build a complete document understanding and retrieval pipeline. The system primarily includes document parsing, vector processing, retrieval engine, and large model inference as core modules, with each component being flexibly configurable and extendable.
## 🎯 Key Features
- **🔍 Precise Understanding**: Structured content extraction from PDFs, Word documents, images and more into unified semantic views
- **🧠 Intelligent Reasoning**: Leverages LLMs to understand document context and user intent for accurate Q&A and multi-turn conversations
- **🔧 Flexible Extension**: All components from parsing and embedding to retrieval and generation are decoupled for easy customization
- **⚡ Efficient Retrieval**: Hybrid retrieval strategies combining keywords, vectors, and knowledge graphs
- **🎯 User-Friendly**: Intuitive web interface and standardized APIs for zero technical barriers
- **🔒 Secure & Controlled**: Support for local deployment and private cloud, ensuring complete data sovereignty
## 📊 Application Scenarios
| Scenario | Applications | Core Value |
|---------|----------|----------|
| **Enterprise Knowledge Management** | Internal document retrieval, policy Q&A, operation manual search | Improve knowledge discovery efficiency, reduce training costs |
| **Academic Research Analysis** | Paper retrieval, research report analysis, scholarly material organization | Accelerate literature review, assist research decisions |
| **Product Technical Support** | Product manual Q&A, technical documentation search, troubleshooting | Enhance customer service quality, reduce support burden |
| **Legal & Compliance Review** | Contract clause retrieval, regulatory policy search, case analysis | Improve compliance efficiency, reduce legal risks |
| **Medical Knowledge Assistance** | Medical literature retrieval, treatment guideline search, case analysis | Support clinical decisions, improve diagnosis quality |
## 🧩 Feature Matrix
| Module | Support | Description |
|---------|---------|------|
| Document Formats | ✅ PDF / Word / Txt / Markdown / Images (with OCR / Caption) | Support for structured and unstructured documents with text extraction from images |
| Embedding Models | ✅ Local models, BGE / GTE APIs, etc. | Customizable embedding models, compatible with local deployment and cloud vector generation APIs |
| Vector DB Integration | ✅ PostgreSQL (pgvector), Elasticsearch | Support for mainstream vector index backends, flexible switching for different retrieval scenarios |
| Retrieval Strategies | ✅ BM25 / Dense Retrieval / GraphRAG | Support for sparse/dense recall and knowledge graph-enhanced retrieval with customizable retrieve-rerank-generate pipelines |
| LLM Integration | ✅ Support for Qwen, DeepSeek, etc., with thinking/non-thinking mode switching | Compatible with local models (e.g., via Ollama) or external API services with flexible inference configuration |
| QA Capabilities | ✅ Context-aware, multi-turn dialogue, prompt templates | Support for complex semantic modeling, instruction control and chain-of-thought Q&A with configurable prompts and context windows |
| E2E Testing | ✅ Retrieval+generation process visualization and metric evaluation | End-to-end testing tools for evaluating recall hit rates, answer coverage, BLEU/ROUGE and other metrics |
| Deployment Modes | ✅ Support for local deployment / Docker images | Meets private, offline deployment and flexible operation requirements |
| User Interfaces | ✅ Web UI + RESTful API | Interactive interface and standard API endpoints, suitable for both developers and business users |
## 🚀 Getting Started
### 🛠 Prerequisites
Make sure the following tools are installed on your system:
* [Docker](https://www.docker.com/)
* [Docker Compose](https://docs.docker.com/compose/)
* [Git](https://git-scm.com/)
### 📦 Installation
#### ① Clone the repository
```bash
# Clone the main repository
git clone https://github.com/Tencent/WeKnora.git
cd WeKnora
```
#### ② Configure environment variables
```bash
# Copy example env file
cp .env.example .env
# Edit .env and set required values
# All variables are documented in the .env.example comments
```
#### ③ Start the services
```bash
# Start all services (Ollama + backend containers)
./scripts/start_all.sh
# Or
make start-all
```
#### ③ Start the services (backup)
```bash
# Start ollama services (Optional)
ollama serve > /dev/null 2>&1 &
# Start the service
docker compose up -d
```
#### ④ Stop the services
```bash
./scripts/start_all.sh --stop
# Or
make stop-all
```
### 🌐 Access Services
Once started, services will be available at:
* Web UI: `http://localhost`
* Backend API: `http://localhost:8080`
* Jaeger Tracing: `http://localhost:16686`
### 🔌 Using WeChat Dialog Open Platform
WeKnora serves as the core technology framework for the [WeChat Dialog Open Platform](https://chatbot.weixin.qq.com), providing a more convenient usage approach:
- **Zero-code Deployment**: Simply upload knowledge to quickly deploy intelligent Q&A services within the WeChat ecosystem, achieving an "ask and answer" experience
- **Efficient Question Management**: Support for categorized management of high-frequency questions, with rich data tools to ensure accurate, reliable, and easily maintainable answers
- **WeChat Ecosystem Integration**: Through the WeChat Dialog Open Platform, WeKnora's intelligent Q&A capabilities can be seamlessly integrated into WeChat Official Accounts, Mini Programs, and other WeChat scenarios, enhancing user interaction experiences
## 📱 Interface Showcase
### Web UI Interface
<table>
<tr>
<td><b>Knowledge Upload</b><br/><img src="./docs/images/knowledges.png" alt="Knowledge Upload Interface"></td>
<td><b>Q&A Entry</b><br/><img src="./docs/images/qa.png" alt="Q&A Entry Interface"></td>
</tr>
<tr>
<td colspan="2"><b>Rich Text & Image Responses</b><br/><img src="./docs/images/answer.png" alt="Rich Answer Interface"></td>
</tr>
</table>
**Knowledge Base Management:** Support for dragging and dropping various documents, automatically identifying document structures and extracting core knowledge to establish indexes. The system clearly displays processing progress and document status, achieving efficient knowledge base management.
### Document Knowledge Graph
<table>
<tr>
<td><img src="./docs/images/graph2.png" alt="Knowledge Graph View 1"></td>
<td><img src="./docs/images/graph1.png" alt="Knowledge Graph View 2"></td>
</tr>
</table>
WeKnora supports transforming documents into knowledge graphs, displaying the relationships between different sections of the documents. Once the knowledge graph feature is enabled, the system analyzes and constructs an internal semantic association network that not only helps users understand document content but also provides structured support for indexing and retrieval, enhancing the relevance and breadth of search results.
## 📘 API Reference
Detailed API documentation is available at: [API Docs](./docs/API.md)
## 🧭 Developer Guide
### 📁 Directory Structure
```
WeKnora/
├── cmd/ # Main entry point
├── internal/ # Core business logic
├── config/ # Configuration files
├── migrations/ # DB migration scripts
├── scripts/ # Shell scripts
├── services/ # Microservice logic
├── frontend/ # Frontend app
└── docs/ # Project documentation
```
### 🔧 Common Commands
```bash
# Wipe all data from DB (use with caution)
make clean-db
```
## 🤝 Contributing
We welcome community contributions! For suggestions, bugs, or feature requests, please submit an [Issue](https://github.com/Tencent/WeKnora/issues) or directly create a Pull Request.
### 🎯 How to Contribute
- 🐛 **Bug Fixes**: Discover and fix system defects
-**New Features**: Propose and implement new capabilities
- 📚 **Documentation**: Improve project documentation
- 🧪 **Test Cases**: Write unit and integration tests
- 🎨 **UI/UX Enhancements**: Improve user interface and experience
### 📋 Contribution Process
1. **Fork the project** to your GitHub account
2. **Create a feature branch** `git checkout -b feature/amazing-feature`
3. **Commit changes** `git commit -m 'Add amazing feature'`
4. **Push branch** `git push origin feature/amazing-feature`
5. **Create a Pull Request** with detailed description of changes
### 🎨 Code Standards
- Follow [Go Code Review Comments](https://github.com/golang/go/wiki/CodeReviewComments)
- Format code using `gofmt`
- Add necessary unit tests
- Update relevant documentation
### 📝 Commit Guidelines
Use [Conventional Commits](https://www.conventionalcommits.org/) standard:
```
feat: Add document batch upload functionality
fix: Resolve vector retrieval precision issue
docs: Update API documentation
test: Add retrieval engine test cases
refactor: Restructure document parsing module
```
## 📄 License
This project is licensed under the [MIT License](./LICENSE).
You are free to use, modify, and distribute the code with proper attribution.

README_JA.md

@@ -14,10 +14,13 @@
<a href="https://github.com/Tencent/WeKnora/blob/main/LICENSE">
<img src="https://img.shields.io/badge/License-MIT-ffffff?labelColor=d4eaf7&color=2e6cc4" alt="License">
</a>
<a href="./CHANGELOG.md">
<img alt="バージョン" src="https://img.shields.io/badge/version-0.1.0-2e6cc4?labelColor=d4eaf7">
</a>
</p>
<p align="center">
| <a href="./README_EN.md"><b>English</b></a> | <a href="./README.md"><b>简体中文</b></a> | <b>日本語</b> |
| <a href="./README.md"><b>English</b></a> | <a href="./README_CN.md"><b>简体中文</b></a> | <b>日本語</b> |
</p>
<p align="center">

VERSION (new file, 1 line)

@@ -0,0 +1 @@
0.1.0

docker-compose.yml

@@ -3,11 +3,18 @@ services:
image: wechatopenai/weknora-app:latest
container_name: WeKnora-app
ports:
- "8080:8080"
- "${APP_PORT:-8080}:8080"
volumes:
- data-files:/data/files
- ./config/config.yaml:/app/config/config.yaml
environment:
- COS_SECRET_ID=${COS_SECRET_ID}
- COS_SECRET_KEY=${COS_SECRET_KEY}
- COS_REGION=${COS_REGION}
- COS_BUCKET_NAME=${COS_BUCKET_NAME}
- COS_APP_ID=${COS_APP_ID}
- COS_PATH_PREFIX=${COS_PATH_PREFIX}
- COS_ENABLE_OLD_DOMAIN=${COS_ENABLE_OLD_DOMAIN}
- GIN_MODE=${GIN_MODE}
- DB_DRIVER=postgres
- DB_HOST=postgres
@@ -61,6 +68,8 @@ services:
condition: service_healthy
minio:
condition: service_started
docreader:
condition: service_healthy
networks:
- WeKnora-network
restart: unless-stopped
@@ -91,7 +100,7 @@ services:
image: wechatopenai/weknora-ui:latest
container_name: WeKnora-frontend
ports:
- "80:80"
- "${FRONTEND_PORT:-80}:80"
depends_on:
- app
networks:
@@ -102,7 +111,7 @@ services:
image: wechatopenai/weknora-docreader:latest
container_name: WeKnora-docreader
ports:
- "50051:50051"
- "${DOCREADER_PORT:-50051}:50051"
environment:
- COS_SECRET_ID=${COS_SECRET_ID}
- COS_SECRET_KEY=${COS_SECRET_KEY}
@@ -122,6 +131,12 @@ services:
- MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME}
- MINIO_USE_SSL=${MINIO_USE_SSL}
- WEB_PROXY=${WEB_PROXY}
healthcheck:
test: ["CMD", "grpc_health_probe", "-addr=:50051"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
networks:
- WeKnora-network
restart: unless-stopped
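
The healthcheck above runs `grpc_health_probe`, which queries the standard gRPC health-checking protocol, so the docreader server must register a health service. A minimal Python sketch of that registration follows; the actual wiring in `services/docreader/src/server.py` may differ.

```python
# Minimal sketch: register the standard gRPC health service so that
# "grpc_health_probe -addr=:50051" (the compose healthcheck) succeeds.
# The DocReader servicer registration itself is elided.
from concurrent import futures

import grpc
from grpc_health.v1 import health, health_pb2, health_pb2_grpc


def serve() -> None:
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=8))
    # ... register the DocReader servicer here ...
    health_servicer = health.HealthServicer()
    health_pb2_grpc.add_HealthServicer_to_server(health_servicer, server)
    # The empty service name reports overall server health, which is
    # what grpc_health_probe checks by default.
    health_servicer.set("", health_pb2.HealthCheckResponse.SERVING)
    server.add_insecure_port("[::]:50051")
    server.start()
    server.wait_for_termination()


if __name__ == "__main__":
    serve()
```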

docker/Dockerfile.docreader

@@ -50,22 +50,33 @@ COPY services/docreader/requirements.txt .
# 安装依赖
RUN pip cache purge && pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
# 预下载 PP-OCRv5 模型
RUN mkdir -p /root/.paddlex/official_models && \
wget https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0.0/PP-OCRv5_server_det_infer.tar \
-O /root/.paddlex/official_models/PP-OCRv5_server_det_infer.tar && \
wget https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0.0/PP-OCRv5_server_rec_infer.tar \
-O /root/.paddlex/official_models/PP-OCRv5_server_rec_infer.tar && \
tar -xf /root/.paddlex/official_models/PP-OCRv5_server_det_infer.tar -C /root/.paddlex/official_models/ && \
tar -xf /root/.paddlex/official_models/PP-OCRv5_server_rec_infer.tar -C /root/.paddlex/official_models/ && \
rm -rf /root/.paddlex/official_models/PP-OCRv5_server_det_infer.tar /root/.paddlex/official_models/PP-OCRv5_server_rec_infer.tar
# 预下载 PP-OCRv4 模型
RUN mkdir -p /root/.paddleocr/whl/det/ch && \
mkdir -p /root/.paddleocr/whl/rec/ch && \
mkdir -p /root/.paddleocr/whl/cls/ch && \
# 下载检测模型
wget https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_infer.tar \
-O /root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer.tar && \
tar -xf /root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer.tar -C /root/.paddleocr/whl/det/ch/ && \
# 下载识别模型
wget https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar \
-O /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar && \
tar -xf /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar -C /root/.paddleocr/whl/rec/ch/ && \
# 下载文本方向分类模型(用于判断文本是否需要旋转)
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar \
-O /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar && \
tar -xf /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar -C /root/.paddleocr/whl/cls/ && \
# 清理压缩包
rm -f /root/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer.tar && \
rm -f /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar && \
rm -f /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar
# 复制源代码和生成脚本
COPY services/docreader/src/ /app/src/
COPY services/docreader/scripts/ /app/scripts/
# 确保模型目录存在
RUN ls -la /root/.paddlex/official_models
RUN ls -la /root/.paddleocr/whl/
# 生成 protobuf 代码
RUN chmod +x /app/scripts/generate_proto.sh && bash /app/scripts/generate_proto.sh
@@ -103,30 +114,18 @@ RUN apt-get update && apt-get install -y \
libglu1-mesa \
libsm6 \
libreoffice \
curl \
&& rm -rf /var/lib/apt/lists/*
# # 下载并安装 LibreOffice区分架构
# RUN mkdir -p /tmp/libreoffice && cd /tmp/libreoffice && \
# if [ "$(uname -m)" = "x86_64" ]; then \
# wget https://mirrors.tuna.tsinghua.edu.cn/libreoffice/libreoffice/stable/25.2.5/deb/x86_64/LibreOffice_25.2.5_Linux_x86-64_deb.tar.gz && \
# tar -xzf LibreOffice_25.2.5_Linux_x86-64_deb.tar.gz && \
# cd LibreOffice_25.2.5.2_Linux_x86-64_deb/DEBS && dpkg -i *.deb; \
# elif [ "$(uname -m)" = "aarch64" ] || [ "$(uname -m)" = "arm64" ]; then \
# wget https://mirrors.aliyun.com/libreoffice/testing/25.8.0/deb/aarch64/LibreOffice_25.8.0.3_Linux_aarch64_deb.tar.gz && \
# tar -xzf LibreOffice_25.8.0.3_Linux_aarch64_deb.tar.gz && \
# cd LibreOffice_25.8.0.3_Linux_aarch64_deb/DEBS && dpkg -i *.deb; \
# else \
# echo "Unsupported architecture: $(uname -m)" && exit 1; \
# fi && \
# cd / && rm -rf /tmp/libreoffice
# 设置 LibreOffice 环境变量
# RUN echo 'export LIBREOFFICE_PATH=/opt/libreoffice25.2/program/soffice' >> /etc/environment;
# 安装 grpc_health_probe
RUN GRPC_HEALTH_PROBE_VERSION=v0.4.24 && \
wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-amd64 && \
chmod +x /bin/grpc_health_probe
# 从构建阶段复制已安装的依赖和生成的代码
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin
COPY --from=builder /root/.paddlex/official_models /root/.paddlex/official_models
COPY --from=builder /root/.paddleocr /root/.paddleocr
COPY --from=builder /app/src /app/src
# 安装 Playwright 浏览器

docs/WeKnora.md (new file, 319 lines)

@@ -0,0 +1,319 @@
## Introduction
WeKnora is a production-ready, enterprise-grade RAG framework for intelligent document understanding and retrieval. The system uses a modular design that separates document understanding, vector storage, and inference into independent components.
![arc](./images/arc.png)
---
## Pipeline
WeKnora processes a document in several stages: ingestion → knowledge extraction → indexing → retrieval → generation. The whole flow supports multiple retrieval methods.
![](./images/pipeline2.jpeg)
Take a hotel-stay statement PDF uploaded by a user as an example; its data flow in detail:
### 1. Request Reception and Initialization
+ **Request identification**: The system receives a request and assigns it a unique `request_id=Lkq0OGLYu2fV` to trace the entire processing flow.
+ **Tenant and session validation**:
    - The system first validates the tenant information (ID: 1, Name: Default Tenant).
    - It then begins handling a knowledge QA request belonging to session `1f241340-ae75-40a5-8731-9a3a82e34fdd`.
+ **User question**: The user's original question is: "**What room type did I stay in?**" (入住的房型是什么).
+ **Message creation**: The system creates message records for the question and the forthcoming answer (IDs `703ddf09-...` and `6f057649-...`).
### 2. Knowledge QA Pipeline Launch
The system invokes the knowledge QA service and defines the complete pipeline of nine events to execute in order:
`[rewrite_query, preprocess_query, chunk_search, chunk_rerank, chunk_merge, filter_top_k, into_chat_message, chat_completion_stream, stream_filter]`
---
### 3. Event Execution Details
#### Event 1: `rewrite_query` - Query Rewriting
+ **Purpose**: To make retrieval more precise, the system interprets the user's real intent from the conversation context.
+ **Steps**:
    1. The system retrieves up to the 20 most recent messages of the current session (8 were actually found) as context.
    2. It calls a local large language model named `deepseek-r1:7b`.
    3. From the chat history the model infers that the asker is "Liwx" and rewrites the original question to be more specific.
+ **Result**: The question is successfully rewritten as: "**What room type did Liwx book for this stay?**" (Liwx本次入住的房型是什么).
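
The log does not show the rewrite prompt itself. A hypothetical sketch of a history-aware rewrite call, with all wording assumed:

```python
# Hypothetical sketch of a history-aware query rewrite; WeKnora's actual
# prompt template is not shown in the log, so the wording here is assumed.
def build_rewrite_prompt(history: list[str], question: str) -> str:
    recent = "\n".join(history[-20:])  # up to the 20 most recent messages
    return (
        "Rewrite the user's question so it is fully self-contained, using "
        "the conversation below to resolve pronouns and omitted details.\n\n"
        f"Conversation:\n{recent}\n\n"
        f"Question: {question}\nRewritten question:"
    )

# rewritten = llm.complete(build_rewrite_prompt(history, "入住的房型是什么"))
```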
#### Event 2: `preprocess_query` - Query Preprocessing
+ **Purpose**: Tokenize the rewritten question into a keyword sequence suited to the search engine.
+ **Steps**: The rewritten question is segmented into words.
+ **Result**: A keyword string is produced: "`需要 改写 用户 问题 入住 房型 根据 提供 信息 入住 人 Liwx 选择 房型 双床 房 因此 改写 后 完整 问题 为 Liwx 本次 入住 房型`".
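
The keyword string above is ordinary Chinese word segmentation. A rough Python equivalent using `jieba`, assumed purely for illustration (the Go backend uses its own tokenizer, so the exact token list will differ):

```python
# Rough illustration of the preprocessing step with jieba; the Go backend
# uses its own tokenizer, so the exact token list will differ.
import jieba

rewritten = "Liwx本次入住的房型是什么"
keywords = " ".join(tok for tok in jieba.cut_for_search(rewritten) if tok.strip())
print(keywords)  # e.g. "Liwx 本次 入住 的 房型 是 什么"
```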
#### Event 3: `chunk_search` - Chunk Retrieval
This is the core **retrieval** step. The system runs two hybrid searches:
+ **First search (using the full rewritten question)**:
    - **Vector retrieval**:
        1. The embedding model `bge-m3:latest` is loaded to convert the question into a 1024-dimensional vector.
        2. A vector similarity search in PostgreSQL finds 2 relevant chunks (IDs `e3bf6599-...` and `3989c6ce-...`).
    - **Keyword retrieval**:
        1. In parallel, the system also runs a keyword search.
        2. It finds the same 2 chunks.
    - **Result merging**: The 4 hits from the two methods (actually 2 duplicates) are deduplicated, leaving 2 unique chunks.
+ **Second search (using the preprocessed keyword sequence)**:
    - The system repeats the **vector retrieval** and **keyword retrieval** above with the tokenized keywords.
    - It again yields the same 2 chunks.
+ **Final result**: After two searches and result merging, the system locks in the 2 most relevant chunks and extracts their content for answer generation.
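
The backend is Go, but the dedup-and-merge logic is language-neutral. A sketch in Python, where `SearchHit` and the retriever callables are hypothetical stand-ins for the real implementation:

```python
# Language-neutral sketch of hybrid-search merging; SearchHit and the
# retriever callables are hypothetical stand-ins for the Go implementation.
from dataclasses import dataclass
from typing import Callable, Iterable


@dataclass
class SearchHit:
    chunk_id: str
    content: str
    score: float


Retriever = Callable[[str], list[SearchHit]]


def hybrid_search(query: str, retrievers: Iterable[Retriever]) -> list[SearchHit]:
    """Run each retriever on the same query and union the hits by chunk ID."""
    best: dict[str, SearchHit] = {}
    for retrieve in retrievers:
        for hit in retrieve(query):
            # Keep the highest-scoring occurrence of each chunk.
            if hit.chunk_id not in best or hit.score > best[hit.chunk_id].score:
                best[hit.chunk_id] = hit
    return sorted(best.values(), key=lambda h: h.score, reverse=True)

# Two passes, as in the log: once with the rewritten sentence, once with
# the keyword sequence; the two result sets are then merged the same way.
```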
#### Event 4: `chunk_rerank` - Result Reranking
+ **Purpose**: Use a stronger model to re-order the initial hits more precisely and raise the quality of the final answer.
+ **Steps**: The log shows `Rerank model ID is empty, skipping reranking`: a rerank step is configured, but no rerank model is specified, so **this step is skipped**.
#### Event 5: `chunk_merge` - Chunk Merging
+ **Purpose**: Merge adjacent or related chunks into a more complete context.
+ **Steps**: The system analyzes the 2 retrieved chunks and attempts to merge them. Per the log, they remain 2 independent chunks after processing, now sorted by relevance score.
#### Event 6: `filter_top_k` - Top-K Filtering
+ **Purpose**: Keep only the K most relevant results so excess irrelevant content does not distract the language model.
+ **Steps**: The system is configured to keep the 5 most relevant chunks (Top-K = 5). Since only 2 chunks remain, both pass the filter.
#### Events 7 & 8: `into_chat_message` & `chat_completion_stream` - Answer Generation
This is the **generation** step.
+ **Purpose**: Generate a natural, fluent answer based on the retrieved information.
+ **Steps**:
    1. The system assembles the contents of the 2 retrieved chunks, the user's original question, and the chat history into one complete prompt.
    2. It calls the `deepseek-r1:7b` model again and requests the answer as a **stream**. Streaming enables a typewriter effect and improves the user experience.
#### Event 9: `stream_filter` - Stream Output Filtering
+ **Purpose**: Post-process the model's real-time token stream, filtering out unwanted special markers.
+ **Steps**:
    - The system installs a filter that removes internal markers the model may emit while thinking, such as `<think>` and `</think>`.
    - The log shows the model's first token chunk was `<think> 根据`; the filter intercepted and removed the `<think>` marker, passing only "根据" ("Based on") and everything after it downstream.
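
A minimal sketch of such a marker filter, written in Python for illustration (the real filter lives in the Go backend). The tricky part is a marker that arrives split across two stream chunks:

```python
# Minimal sketch of a stream filter that removes the literal <think> and
# </think> markers from an incremental token stream, handling markers that
# arrive split across chunks. The real filter lives in the Go backend.
from typing import Iterator

MARKERS = ("<think>", "</think>")


def strip_markers(tokens: Iterator[str]) -> Iterator[str]:
    tail = ""  # held-back text that may still grow into a marker
    for tok in tokens:
        buf = tail + tok
        for m in MARKERS:
            buf = buf.replace(m, "")
        # Hold back the longest suffix that could be a marker prefix.
        hold = 0
        for m in MARKERS:
            for k in range(1, len(m)):
                if buf.endswith(m[:k]):
                    hold = max(hold, k)
        tail = buf[len(buf) - hold:] if hold else ""
        if buf[: len(buf) - hold]:
            yield buf[: len(buf) - hold]
    if tail:
        yield tail

# Example from the log: the marker is dropped even when split across chunks.
# "".join(strip_markers(iter(["<think> 根", "据文档</th", "ink>……"])))
# -> " 根据文档……"
```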
### 4. Completion and Response
+ **Sending references**: While generating the answer, the system sends the 2 supporting chunks to the frontend as "reference content" so the user can verify the sources.
+ **Updating the message**: Once generation finishes, the complete answer is written into the previously created message record (ID: `6f057649-...`).
+ **Request completion**: The server returns a `200` status code, marking the end of the full question-to-answer flow.
### Summary
This log records a typical RAG flow end to end: the system uses **query rewriting** and **preprocessing** to pin down user intent, then **hybrid vector and keyword retrieval** to find relevant information in the knowledge base; although **reranking** is skipped, **merging** and **filtering** still run; finally, the retrieved knowledge serves as context for the large language model to **generate** a fluent, accurate answer, with **stream filtering** keeping the output clean.
## Document Parsing and Chunking
The code implements a standalone microservice, communicating over gRPC, dedicated to deep document parsing, chunking, and multimodal information extraction. It is the core executor of the "asynchronous processing" stage.
### **Overall Architecture**
This is a Python-based gRPC service whose core duty is to accept a file or URL and parse it into structured text chunks ready for downstream processing (such as vectorization):
+ `server.py`: The service entry point and network layer. It starts a multi-process, multi-threaded gRPC server, receives requests from the Go backend, and returns parse results.
+ `parser.py`: An application of the **Facade pattern**. It exposes a unified `Parser` class that hides the complexity of the concrete parsers (PDF, DOCX, Markdown, etc.); external callers (`server.py`) interact only with this `Parser` class.
+ `base_parser.py`: The parser base class, defining the core logic and abstract methods shared by all concrete parsers. It is the "brain" of the parsing flow, containing the most complex functionality: text chunking, image handling, OCR, and image caption generation.
---
### **Detailed Workflow**
When the Go backend launches an asynchronous task, it makes a gRPC call to this Python service carrying the file content and configuration. The complete processing flow:
#### **Step 1: Request Reception and Dispatch (`server.py` & `parser.py`)**
1. **gRPC service entry (`server.py: serve`)**:
    - The service starts via the `serve()` function. Based on environment variables (`GRPC_WORKER_PROCESSES`, `GRPC_MAX_WORKERS`) it launches a **multi-process, multi-threaded** server to fully utilize CPU resources and raise concurrency.
    - Each worker process listens on the configured port (e.g., 50051), ready to accept requests.
2. **Request handling (`server.py: ReadFromFile`)**:
    - When the Go backend issues a `ReadFromFile` request, one of the worker processes receives it.
    - The method first parses the request parameters, including:
        * `file_name`, `file_type`, `file_content`: the file's basic information and binary content.
        * `read_config`: a complex object holding all parsing configuration, such as `chunk_size`, `chunk_overlap`, `enable_multimodal`, `storage_config` (object storage), `vlm_config` (vision-language model), and more.
    - It consolidates these settings into a `ChunkingConfig` data object.
    - The crucial step is calling `self.parser.parse_file(...)`, handing the parse task to the `Parser` facade.
3. **Parser selection (`parser.py: Parser.parse_file`)**:
    - On receiving the task, the `Parser` class first calls `get_parser(file_type)`.
    - That method looks up the concrete parser class (e.g., `PDFParser`) for the file type (e.g., `'pdf'`) in the `self.parsers` dictionary.
    - Once found, it **instantiates** that `PDFParser` class, passing `ChunkingConfig` and all other configuration to the constructor.
#### **Step 2: Core Parsing and Chunking (`base_parser.py`)**
This touches the heart of the whole flow: **how contextual completeness and original ordering are preserved**.
Per the `base_parser.py` code, **the text, tables, and images in the final chunks are stored in the order they appear in the original document**.
This ordering is guaranteed by several carefully designed methods in `BaseParser` working together. Let's trace the flow in detail.
The ordering guarantee unfolds in stages (a combined sketch of stages 2 and 3 follows this list):
1. **Stage 1: Unified text stream creation (`pdf_parser.py`)**:
    - In the `parse_into_text` method, the code processes the PDF **page by page**.
    - Within each page, it concatenates all content **into one long string** (`page_content_parts`), following a fixed logic: extract non-table text first, then append tables, and finally append image placeholders.
    - **Key point**: Although the interleaving of text, tables, and image placeholders at this stage may not be 100% character-accurate, it guarantees that **content from the same page stays together**, roughly in top-to-bottom reading order.
    - Finally, all pages are joined with `"\n\n--- Page Break ---\n\n"` into a **single, ordered text stream (`final_text`) containing all information (text, Markdown tables, image placeholders)**.
2. **Stage 2: Atomization and protection (`_split_into_units`)**:
    - This single `final_text` is passed to the `_split_into_units` method.
    - This method is **key to structural integrity**. Using regular expressions, it recognizes **entire Markdown tables** and **entire Markdown image placeholders** as **indivisible atomic units**.
    - It splits these atomic units (tables, images) and the ordinary text between them into a list (`units`), in the **original order** they appear in `final_text`.
    - **Result**: We now have a list such as `['some text', '![...](...)', 'more text', '|...|...|\n|---|---|\n...', 'further text']`, whose element order is **exactly the order in the original document**.
3. **Stage 3: Sequential chunking (`chunk_text`)**:
    - The `chunk_text` method receives this **ordered `units` list**.
    - Its mechanism is simple and direct: it iterates over the units **in order**.
    - It appends units **one by one** to a temporary `current_chunk` list until the chunk's length approaches the `chunk_size` limit.
    - When a chunk fills up, it is saved and a new chunk begins (possibly carrying an overlap from the previous chunk).
    - **Key point**: Because `chunk_text` **processes the `units` list strictly in order**, it can never shuffle the relative order of tables, text, and images. A table that appears earlier in the document necessarily lands in an earlier-numbered chunk.
4. **Stage 4: Image information attachment (`process_chunks_images`)**:
    - After the text chunks are cut, the `process_chunks_images` method is invoked.
    - It processes **every** generated chunk.
    - Inside each chunk, it locates the image placeholders and runs AI processing on them.
    - Finally, it attaches the processed image information (permanent URL, OCR text, image caption, etc.) to **that chunk's own** `.images` attribute.
    - **Key point**: This process **changes neither the chunk order nor the chunk's `.content`**. It only attaches extra information to chunks that already exist in the correct order.
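
A combined sketch of stages 2 and 3, with simplified regexes assumed for illustration. The real patterns in `base_parser.py` may differ, and chunk overlap is omitted for brevity:

```python
# Combined sketch of stages 2 and 3: atomic splitting, then in-order packing.
# The regexes are illustrative approximations, not the exact implementation.
import re

# An entire Markdown image or table is one indivisible "atomic unit".
ATOMIC = re.compile(
    r"(!\[[^\]]*\]\([^)]*\)"        # ![alt](url) image placeholder
    r"|(?:^\|.*\|[ \t]*\n?)+)",     # consecutive |...| table rows
    re.MULTILINE,
)


def split_into_units(text: str) -> list[str]:
    """Split text into ordered units; tables and images are never cut apart."""
    units, last = [], 0
    for m in ATOMIC.finditer(text):
        if m.start() > last:
            units.append(text[last:m.start()])
        units.append(m.group(0))
        last = m.end()
    if last < len(text):
        units.append(text[last:])
    return units


def chunk_text(units: list[str], chunk_size: int) -> list[str]:
    """Pack units into chunks strictly in order; document order is preserved.
    An oversize unit (e.g., a huge table) is kept whole in its own chunk."""
    chunks, current, length = [], [], 0
    for unit in units:
        if current and length + len(unit) > chunk_size:
            chunks.append("".join(current))
            current, length = [], 0
        current.append(unit)
        length += len(unit)
    if current:
        chunks.append("".join(current))
    return chunks
```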
#### **Step 3: Multimodal Processing, if Enabled (`base_parser.py`)**
If `enable_multimodal` is `True`, the most complex stage, multimodal processing, begins after text chunking completes.
1. **Concurrent task launch (`BaseParser.process_chunks_images`)**:
    - The method uses `asyncio` (Python's async I/O framework) to **process the images of all chunks concurrently**, greatly improving throughput.
    - It creates one async task (`process_chunk_images_async`) per `Chunk`.
2. **Processing the images of one chunk (`BaseParser.process_chunk_images_async`)**:
    - **Extract image references**: First, `extract_images_from_chunk` uses a regular expression to find all image references in the chunk's text (e.g., `![alt text](image.png)`).
    - **Image persistence**: For each image found, it concurrently calls `download_and_upload_image`, which:
        * Fetches the image data from its original location (inside the PDF, a local path, or a remote URL).
        * **Uploads the image to the configured object store (COS/MinIO)**. This step is crucial: it turns a temporary, unstable image reference into a persistent URL that is publicly accessible.
        * Returns the persistent URL and the image object (PIL Image).
    - **Concurrent AI processing**: All successfully uploaded images are collected and passed to `process_multiple_images`:
        * Internally it uses an `asyncio.Semaphore` to cap concurrency (e.g., at most 5 images at once), preventing memory spikes and model API rate-limit violations; see the sketch after this list.
        * For each image it calls `process_image_async`.
3. **Processing a single image (`BaseParser.process_image_async`)**:
    - **OCR**: Calls `perform_ocr`, which uses an OCR engine (`PaddleOCR`) to recognize all text in the image.
    - **Image caption**: Calls `get_image_caption`, which Base64-encodes the image and sends it to the configured vision-language model (VLM) to generate a natural-language description of the image content.
    - Returns `(ocr_text, caption, persistent URL)`.
4. **Result aggregation**:
    - Once all images are processed, the structured information (persistent URL, OCR text, and caption) is attached to the corresponding `Chunk` object's `.images` field.
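
A sketch of the bounded-concurrency pattern from step 2.3, with stub coroutines standing in for the real `perform_ocr` / `get_image_caption` calls:

```python
# Bounded-concurrency sketch: process all images with asyncio, but keep at
# most 5 in flight. The two stubs stand in for the real OCR / VLM calls.
import asyncio


async def perform_ocr(url: str) -> str:          # stub for the PaddleOCR call
    await asyncio.sleep(0.1)
    return f"ocr({url})"


async def get_image_caption(url: str) -> str:    # stub for the VLM call
    await asyncio.sleep(0.1)
    return f"caption({url})"


async def process_image_async(url: str, sem: asyncio.Semaphore):
    async with sem:                              # at most 5 concurrent images
        return await perform_ocr(url), await get_image_caption(url), url


async def process_multiple_images(urls: list[str]):
    sem = asyncio.Semaphore(5)
    return await asyncio.gather(*(process_image_async(u, sem) for u in urls))

# results = asyncio.run(process_multiple_images(uploaded_urls))
```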
#### **Step 4: Returning the Result (`server.py`)**
1. **Data conversion (`server.py: _convert_chunk_to_proto`)**:
    - When `parser.parse_file` finishes, it returns a `ParseResult` containing all processed `Chunk` objects.
    - `ReadFromFile` receives this result and calls `_convert_chunk_to_proto` to convert the Python `Chunk` objects (including their image information) into the gRPC-defined Protobuf message format.
2. **Response return**:
    - Finally, the gRPC server sends this `ReadResponse` message, carrying all chunks and multimodal information, back to the caller: the Go backend service.
At this point the Go backend holds structured, information-rich document data and can proceed to vectorization and index storage.
## Deployment
Supports local deployment via Docker images, with the API served over an exposed port.
## Performance and Monitoring
WeKnora ships with rich monitoring and testing components:
+ Distributed tracing: Jaeger is integrated to trace a request's complete execution path across the service architecture. In essence, Jaeger is a technology that lets you "see" a request's full lifecycle through a distributed system.
+ Health monitoring: checks that services stay in a healthy state.
+ Scalability: with containerized deployment, multiple service instances can absorb large-scale concurrent requests.
## Q&A
### Question 1: Why does retrieval run hybrid search twice, and how do the two searches differ?
This is a sharp observation. Running hybrid search twice **maximizes retrieval accuracy and recall**; it is essentially a combination of **query expansion** and **multi-strategy retrieval**.
#### Purpose
By searching with two different forms of the query (the rewritten sentence vs. the tokenized keyword sequence), the system combines the strengths of both:
+ **Depth of semantic retrieval**: Searching with the complete sentence better exploits the vector model's (e.g., `bge-m3`) understanding of whole-sentence meaning, finding the semantically closest chunks.
+ **Breadth of keyword retrieval**: Searching with tokenized keywords ensures that even a chunk phrased differently from the question can still be hit as long as it contains the core keywords. This is especially effective for traditional keyword-matching algorithms such as BM25.
Put simply, the system **asks the same question in two different "phrasings"**, then unions the results, ensuring the most relevant knowledge is not missed.
#### How the Two Searches Differ
The core difference is the **input query text**:
1. **First hybrid search**
    - **Input**: the **grammatically complete natural-language question** produced by the `rewrite_query` event.
    - **Log evidence**:
```plain
INFO [2025-08-29 09:46:36.896] [request_id=Lkq0OGLYu2fV] knowledgebase.go:266[HybridSearch] | Hybrid search parameters, knowledge base ID: kb-00000001, query text: 需要改写的用户问题是“入住的房型是什么”。根据提供的信息入住人Liwx选择的房型是双床房。因此改写后的完整问题为 “Liwx本次入住的房型是什么”
```
2. **Second hybrid search**
    - **Input**: the **space-separated keyword sequence** produced by the `preprocess_query` event.
    - **Log evidence**:
```plain
INFO [2025-08-29 09:46:37.257] [request_id=Lkq0OGLYu2fV] knowledgebase.go:266[HybridSearch] | Hybrid search parameters, knowledge base ID: kb-00000001, query text: 需要 改写 用户 问题 入住 房型 根据 提供 信息 入住 人 Liwx 选择 房型 双床 房 因此 改写 后 完整 问题 为 Liwx 本次 入住 房型
```
最终系统将这两次搜索的结果进行去重和合并日志中显示每次都找到2个结果去重后总共还是2个从而得到一个更可靠的知识集合用于后续的答案生成。
### 问题2重排序模型分析
Reranker重排器是目前RAG领域中非常先进的技术它们在工作原理和适用场景上有着显著的区别。
简单来说,它们代表了从“**专门的判别模型**”到“**利用大语言模型LLM进行判别**”再到“**深度挖掘LLM内部信息进行判别**”的演进。
以下是它们的详细区别:
#### 1. Normal Reranker (conventional reranker / cross-encoder)
This is the classic and still most mainstream reranking approach.
+ **Model type**: a **sequence classification model**, in essence a **cross-encoder**, usually built on a bidirectional encoder architecture such as BERT or RoBERTa. `BAAI/bge-reranker-base/large/v2-m3` all belong to this family.
+ **How it works**:
1. It concatenates the **query** and the **candidate passage** into a single input sequence, e.g.: `[CLS] what is panda? [SEP] The giant panda is a bear species endemic to China. [SEP]`
2. This concatenated sequence is fed through the model as a whole. The model's internal self-attention can analyze every token of the query and the passage at the same time and compute their **fine-grained interactions**.
3. The model finally outputs a **single score (logit)** that directly represents the query-passage relevance: the higher the score, the stronger the relevance.
+ **Key characteristics**:
- **Pros**: because the query and passage interact fully and deeply inside the model, its **accuracy is typically very high**; it is the gold standard for reranker quality (a usage sketch follows below).
- **Cons**: **relatively slow**, because it must run one complete, expensive forward pass for **every query-passage pair**: if first-stage retrieval returns 100 documents, it has to run 100 times.
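A minimal usage sketch of such a cross-encoder via the `sentence-transformers` `CrossEncoder` wrapper; the model name follows the document, while the sample data is made up.

```python
from sentence_transformers import CrossEncoder

model = CrossEncoder("BAAI/bge-reranker-v2-m3", max_length=512)

query = "what is panda?"
passages = [
    "The giant panda is a bear species endemic to China.",
    "pandas is a Python library for data analysis.",
]
# One forward pass per (query, passage) pair -- the cost noted above.
scores = model.predict([(query, p) for p in passages])
ranked = sorted(zip(passages, scores), key=lambda x: x[1], reverse=True)
print(ranked[0])  # the most relevant passage and its score
```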
#### 2. LLM-based Reranker
This approach creatively exploits the capabilities of a general-purpose large language model (LLM) for reranking.
+ **Model type**: a **causal language model**, i.e. the GPT/Llama/Gemma family of text-generation LLMs. `BAAI/bge-reranker-v2-gemma` is a typical example.
+ **How it works**:
1. Instead of **outputting a score directly**, it **recasts the reranking task as a question-answering or text-generation task**.
2. It organizes the input with a carefully designed **prompt**, for example: `"Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'. A: {query} B: {passage}"`
3. It feeds this complete prompt to the LLM and then **observes the probability that the LLM generates the word "Yes" at the end**.
4. That **probability of generating "Yes" (or its logit) is taken as the relevance score**. If the model is highly confident the answer is "Yes", it believes passage B contains the answer to query A, i.e. the relevance is high.
+ **Key characteristics**:
- **Pros**: it can harness the LLM's strong **semantic understanding, reasoning, and world knowledge**, so for complex queries whose relevance requires deep understanding and reasoning it may work better.
- **Cons**: the compute cost can be very large (it depends on the LLM's size), and performance is **highly sensitive to prompt design** (a scoring sketch follows below).
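A hedged sketch of "Yes"-logit scoring with Hugging Face `transformers`; the prompt mirrors the document, the model name is illustrative, and the real `bge-reranker-v2-gemma` usage (tokenization of "Yes", chat template) differs in detail.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("BAAI/bge-reranker-v2-gemma")
model = AutoModelForCausalLM.from_pretrained("BAAI/bge-reranker-v2-gemma")
yes_id = tok.convert_tokens_to_ids("Yes")

def relevance(query: str, passage: str) -> float:
    prompt = (
        "Given a query A and a passage B, determine whether the passage "
        "contains an answer to the query by providing a prediction of "
        f"either 'Yes' or 'No'. A: {query} B: {passage}"
    )
    inputs = tok(prompt, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1]  # next-token distribution
    return logits[yes_id].item()                # "Yes" logit as the score
```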
#### 3. LLM-based Layerwise Reranker
This is the souped-up version of the second approach: a more cutting-edge, more complex, exploratory technique.
+ **Model type**: likewise a **causal language model**, for example `BAAI/bge-reranker-v2-minicpm-layerwise`.
+ **How it works**:
1. The input is exactly the same as in the second approach, using the "Yes/No" prompt.
2. The core difference lies in **how the score is extracted**. It no longer relies solely on the output of the LLM's **final layer** (the final prediction).
3. The premise is that as the LLM processes information layer by layer, network layers at different depths may capture relevance signals at different levels. So it extracts the prediction logit for the word "Yes" from **multiple intermediate layers** of the model.
4. The `cutoff_layers=[28]` parameter in the code tells the model: "give me the output of layer 28." In the end you obtain one or more scores from different network layers, which can be averaged or combined in other ways into a more robust final relevance judgment.
+ **Key characteristics**:
- **Pros**: in theory it captures a **richer, more complete relevance signal** and may reach higher accuracy than looking at the last layer alone; it is one current way of probing the performance ceiling (a sketch follows below).
- **Cons**: **highest complexity**: extracting intermediate-layer information requires model-specific modifications (the `trust_remote_code=True` in the code is one telltale sign), and the compute cost is also large.
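A hedged sketch of the layerwise idea: apply the LM head to an intermediate hidden state and read off the "Yes" logit. The real `bge-reranker-v2-minicpm-layerwise` API (`cutoff_layers=[...]`) wraps this idea internally; the layer index, `lm_head` attribute, and model name here are assumptions for illustration.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "BAAI/bge-reranker-v2-minicpm-layerwise"
tok = AutoTokenizer.from_pretrained(name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(name, trust_remote_code=True)
yes_id = tok.convert_tokens_to_ids("Yes")

def layerwise_score(prompt: str, layer: int = 28) -> float:
    inputs = tok(prompt, return_tensors="pt")
    with torch.no_grad():
        out = model(**inputs, output_hidden_states=True)
    hidden = out.hidden_states[layer][0, -1]   # last-token state at that layer
    logits = model.lm_head(hidden)             # project to the vocabulary
    return logits[yes_id].item()
```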
#### Summary comparison
| Aspect | 1. Normal Reranker (conventional) | 2. LLM-based Reranker | 3. LLM-based Layerwise Reranker |
| :--- | :--- | :--- | :--- |
| **Underlying model** | Cross-encoder (e.g. BERT) | Causal LM (e.g. Gemma) | Causal LM (e.g. MiniCPM) |
| **How it works** | Computes deep query-passage interaction, directly outputs a relevance score | Recasts ranking as a "Yes/No" prediction, uses the probability of "Yes" as the score | Like 2, but extracts the "Yes" probability from multiple intermediate layers |
| **Output** | A single relevance score | A single relevance score (from the last layer) | Multiple relevance scores (from different layers) |
| **Pros** | **Best balance of speed and accuracy**, mature and stable | Leverages LLM reasoning for complex queries | Theoretically highest accuracy, richer signal |
| **Cons** | Slower than vector retrieval | Heavy compute, sensitive to prompt design | **Highest complexity**, heaviest compute |
| **Recommended scenario** | **First choice for most production systems**: effective, easy to deploy | When answer quality is paramount and compute is abundant | Academic research, or chasing SOTA (state-of-the-art) performance |
#### Usage recommendations
1. **Getting started**: we strongly recommend **starting with a Normal Reranker**, e.g. `BAAI/bge-reranker-v2-m3`. It is one of the best all-round models available, delivers a significant boost to RAG quality, and is comparatively easy to integrate and deploy.
2. **Going further**: if you find the conventional reranker underperforms on very subtle queries that demand complex reasoning, and you have ample GPU resources, try an `LLM-based Reranker`.
3. **Frontier research**: the `Layerwise Reranker` better suits researchers or experts trying to squeeze out the last bit of performance on a specific task.
### Question 3: after coarse/fine filtering and reranking, how is the knowledge assembled and sent to the LLM?
This part is mostly prompt design. The instructions follow a typical pattern whose core task is answering the user's question from the supplied context. When assembling the context, specify (a sketch follows this list):
+ Key constraint: answer strictly from the supplied documents; answering from your own knowledge is forbidden.
+ Unknown-case handling: if the documents lack enough information to answer, reply with "Based on the available material, this question cannot be answered."
+ Citation requirement: when the answer draws on a document, append the document number at the end of the sentence.
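A hedged sketch of that context assembly; the wording of the instructions follows the document, while the template layout itself is illustrative rather than the actual prompt used by WeKnora.

```python
def build_prompt(question: str, chunks: list[str]) -> str:
    # Number each retrieved chunk so the model can cite it as [1], [2], ...
    context = "\n".join(f"[{i + 1}] {c}" for i, c in enumerate(chunks))
    return (
        "Answer the user's question strictly based on the documents below; "
        "do not use your own knowledge.\n"
        "If the documents are insufficient, reply: 'Based on the available "
        "material, this question cannot be answered.'\n"
        "When a sentence draws on a document, append its number, e.g. [1], "
        "to the end of the sentence.\n\n"
        f"Documents:\n{context}\n\nQuestion: {question}\nAnswer:"
    )
```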

BIN
docs/WeKnora.pdf Normal file

Binary file not shown.

BIN
docs/images/arc.png Normal file

Binary file not shown.


BIN
docs/images/pipeline2.jpeg Normal file

Binary file not shown.


View File

@@ -197,3 +197,8 @@ func (r *knowledgeRepository) AminusB(
}
return knowledgeIDs, err
}
func (r *knowledgeRepository) UpdateKnowledgeColumn(ctx context.Context, id string, column string, value interface{}) error {
err := r.db.WithContext(ctx).Model(&types.Knowledge{}).Where("id = ?", id).Update(column, value).Error
return err
}

View File

@@ -166,8 +166,10 @@ func (p *PluginRewrite) OnEvent(ctx context.Context,
return next()
}
// Update rewritten query
chatManage.RewriteQuery = response.Content
if response.Content != "" {
// Update rewritten query
chatManage.RewriteQuery = response.Content
}
logger.GetLogger(ctx).Infof("Rewritten query, session_id: %s, rewrite_query: %s",
chatManage.SessionID, chatManage.RewriteQuery)
return next()

View File

@@ -166,9 +166,7 @@ func (s *knowledgeService) CreateKnowledgeFromFile(ctx context.Context,
if exists {
logger.Infof(ctx, "File already exists: %s", file.Filename)
// Update creation time for existing knowledge
existingKnowledge.CreatedAt = time.Now()
existingKnowledge.UpdatedAt = time.Now()
if err := s.repo.UpdateKnowledge(ctx, existingKnowledge); err != nil {
if err := s.repo.UpdateKnowledgeColumn(ctx, existingKnowledge.ID, "created_at", time.Now()); err != nil {
logger.Errorf(ctx, "Failed to update existing knowledge: %v", err)
return nil, err
}
@@ -627,6 +625,8 @@ func (s *knowledgeService) processDocument(ctx context.Context,
attribute.String("embedding_model", knowledge.EmbeddingModelID),
attribute.Bool("enable_multimodal", enableMultimodel),
)
logger.GetLogger(ctx).Infof("processDocument trace id: %s", span.SpanContext().TraceID().String())
if !enableMultimodel && IsImageType(knowledge.FileType) {
logger.GetLogger(ctx).WithField("knowledge_id", knowledge.ID).
WithField("error", ErrImageNotParse).Errorf("processDocument image without enable multimodel")
@@ -1137,6 +1137,7 @@ func (s *knowledgeService) processChunks(ctx context.Context,
span.RecordError(err)
return
}
logger.GetLogger(ctx).Infof("processChunks batch index successfully, with %d index", len(indexInfoList))
// Update knowledge status to completed
knowledge.ParseStatus = "completed"
@@ -1154,6 +1155,7 @@ func (s *knowledgeService) processChunks(ctx context.Context,
if err := s.tenantRepo.AdjustStorageUsed(ctx, tenantInfo.ID, totalStorageSize); err != nil {
logger.GetLogger(ctx).WithField("error", err).Errorf("processChunks update tenant storage used failed")
}
logger.GetLogger(ctx).Infof("processChunks successfully")
}
// GetSummary generates a summary for knowledge content using an AI model

View File

@@ -18,6 +18,7 @@ import (
"github.com/Tencent/WeKnora/internal/errors"
"github.com/Tencent/WeKnora/internal/logger"
"github.com/Tencent/WeKnora/internal/models/embedding"
"github.com/Tencent/WeKnora/internal/models/rerank"
"github.com/Tencent/WeKnora/internal/models/utils/ollama"
"github.com/Tencent/WeKnora/internal/types"
"github.com/Tencent/WeKnora/internal/types/interfaces"
@@ -1410,93 +1411,38 @@ func (h *InitializationHandler) checkRemoteModelConnection(ctx context.Context,
// checkRerankModelConnection 检查Rerank模型连接和功能的内部方法
func (h *InitializationHandler) checkRerankModelConnection(ctx context.Context,
modelName, baseURL, apiKey string) (bool, string) {
client := &http.Client{
Timeout: 15 * time.Second,
// 创建Reranker配置
config := &rerank.RerankerConfig{
APIKey: apiKey,
BaseURL: baseURL,
ModelName: modelName,
Source: types.ModelSourceRemote, // 默认值实际会根据URL判断
}
// 构造重排API端点
rerankEndpoint := baseURL + "/rerank"
// Mock测试数据
testQuery := "什么是人工智能?"
testPassages := []string{
"机器学习是人工智能的一个子领域,专注于算法和统计模型,使计算机系统能够通过经验自动改进。",
"深度学习是机器学习的一个子集,使用人工神经网络来模拟人脑的工作方式。",
}
// 构造重排请求
rerankRequest := map[string]interface{}{
"model": modelName,
"query": testQuery,
"documents": testPassages,
"truncate_prompt_tokens": 512,
}
jsonData, err := json.Marshal(rerankRequest)
// 创建Reranker实例
reranker, err := rerank.NewReranker(config)
if err != nil {
return false, fmt.Sprintf("构造请求失败: %v", err)
return false, fmt.Sprintf("创建Reranker失败: %v", err)
}
logger.Infof(ctx, "Rerank request: %s, modelName=%s, baseURL=%s, apiKey=%s",
string(jsonData), modelName, baseURL, apiKey)
// 简化的测试数据
testQuery := "ping"
testDocuments := []string{
"pong",
}
req, err := http.NewRequestWithContext(
ctx, "POST", rerankEndpoint, strings.NewReader(string(jsonData)),
)
// 使用Reranker进行测试
results, err := reranker.Rerank(ctx, testQuery, testDocuments)
if err != nil {
return false, fmt.Sprintf("创建请求失败: %v", err)
return false, fmt.Sprintf("重排测试失败: %v", err)
}
// 添加认证头
if apiKey != "" {
req.Header.Set("Authorization", "Bearer "+apiKey)
}
req.Header.Set("Content-Type", "application/json")
resp, err := client.Do(req)
if err != nil {
return false, fmt.Sprintf("连接失败: %v", err)
}
defer resp.Body.Close()
// 读取响应
body, err := io.ReadAll(resp.Body)
if err != nil {
return false, fmt.Sprintf("读取响应失败: %v", err)
}
// 检查响应状态
if resp.StatusCode >= 200 && resp.StatusCode < 300 {
// 尝试解析重排响应
var rerankResp struct {
Results []struct {
Index int `json:"index"`
Document string `json:"document"`
RelevanceScore float64 `json:"relevance_score"`
} `json:"results"`
}
if err := json.Unmarshal(body, &rerankResp); err != nil {
// 如果无法解析标准重排响应,检查是否有其他格式
return true, "连接正常,但响应格式非标准"
}
// 检查是否返回了重排结果
if len(rerankResp.Results) > 0 {
return true, fmt.Sprintf("重排功能正常,返回%d个结果", len(rerankResp.Results))
} else {
return false, "重排接口连接成功,但未返回重排结果"
}
} else if resp.StatusCode == 401 {
return false, "认证失败请检查API Key"
} else if resp.StatusCode == 403 {
return false, "权限不足请检查API Key权限"
} else if resp.StatusCode == 404 {
return false, "重排API端点不存在请检查Base URL"
} else if resp.StatusCode == 422 {
return false, fmt.Sprintf("请求参数错误: %s", string(body))
// 检查结果
if len(results) > 0 {
return true, fmt.Sprintf("重排功能正常,返回%d个结果", len(results))
} else {
return false, fmt.Sprintf("API返回错误状态: %d, 响应: %s", resp.StatusCode, string(body))
return false, "重排接口连接成功,但未返回重排结果"
}
}

View File

@@ -0,0 +1,164 @@
package rerank
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"github.com/Tencent/WeKnora/internal/logger"
)
// AliyunReranker implements a reranking system based on Aliyun DashScope models
type AliyunReranker struct {
modelName string // Name of the model used for reranking
modelID string // Unique identifier of the model
apiKey string // API key for authentication
baseURL string // Base URL for API requests
client *http.Client // HTTP client for making API requests
}
// AliyunRerankRequest represents a request to rerank documents using Aliyun DashScope API
type AliyunRerankRequest struct {
Model string `json:"model"` // Model to use for reranking
Input AliyunRerankInput `json:"input"` // Input containing query and documents
Parameters AliyunRerankParameters `json:"parameters"` // Parameters for the reranking
}
// AliyunRerankInput contains the query and documents for reranking
type AliyunRerankInput struct {
Query string `json:"query"` // Query text to compare documents against
Documents []string `json:"documents"` // List of document texts to rerank
}
// AliyunRerankParameters contains parameters for the reranking request
type AliyunRerankParameters struct {
ReturnDocuments bool `json:"return_documents"` // Whether to return documents in response
TopN int `json:"top_n"` // Number of top results to return
}
// AliyunRerankResponse represents the response from Aliyun DashScope reranking request
type AliyunRerankResponse struct {
Output AliyunOutput `json:"output"` // Output containing results
Usage AliyunUsage `json:"usage"` // Token usage information
}
// AliyunOutput contains the reranking results
type AliyunOutput struct {
Results []AliyunRankResult `json:"results"` // Ranked results with relevance scores
}
// AliyunRankResult represents a single reranking result from Aliyun
type AliyunRankResult struct {
Document AliyunDocument `json:"document"` // Document information
Index int `json:"index"` // Original index of the document
RelevanceScore float64 `json:"relevance_score"` // Relevance score
}
// AliyunDocument represents document information in Aliyun response
type AliyunDocument struct {
Text string `json:"text"` // Document text
}
// AliyunUsage contains information about token usage in the Aliyun API request
type AliyunUsage struct {
TotalTokens int `json:"total_tokens"` // Total tokens consumed
}
// NewAliyunReranker creates a new instance of Aliyun reranker with the provided configuration
func NewAliyunReranker(config *RerankerConfig) (*AliyunReranker, error) {
apiKey := config.APIKey
baseURL := "https://dashscope.aliyuncs.com/api/v1/services/rerank/text-rerank/text-rerank"
if url := config.BaseURL; url != "" {
baseURL = url
}
return &AliyunReranker{
modelName: config.ModelName,
modelID: config.ModelID,
apiKey: apiKey,
baseURL: baseURL,
client: &http.Client{},
}, nil
}
// Rerank performs document reranking based on relevance to the query using Aliyun DashScope API
func (r *AliyunReranker) Rerank(ctx context.Context, query string, documents []string) ([]RankResult, error) {
// Build the request body
requestBody := &AliyunRerankRequest{
Model: r.modelName,
Input: AliyunRerankInput{
Query: query,
Documents: documents,
},
Parameters: AliyunRerankParameters{
ReturnDocuments: true,
TopN: len(documents), // Return all documents
},
}
jsonData, err := json.Marshal(requestBody)
if err != nil {
return nil, fmt.Errorf("marshal request body: %w", err)
}
// Send the request
req, err := http.NewRequestWithContext(ctx, "POST", r.baseURL, bytes.NewBuffer(jsonData))
if err != nil {
return nil, fmt.Errorf("create request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", r.apiKey))
// Log the curl equivalent for debugging
logger.Debugf(ctx, "curl -X POST %s -H \"Content-Type: application/json\" -H \"Authorization: Bearer %s\" -d '%s'",
r.baseURL, r.apiKey, string(jsonData),
)
resp, err := r.client.Do(req)
if err != nil {
return nil, fmt.Errorf("do request: %w", err)
}
defer resp.Body.Close()
// Read the response
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("read response body: %w", err)
}
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("aliyun rerank API error: Http Status: %s, Body: %s", resp.Status, string(body))
}
var response AliyunRerankResponse
if err := json.Unmarshal(body, &response); err != nil {
return nil, fmt.Errorf("unmarshal response: %w", err)
}
// Convert Aliyun results to standard RankResult format
results := make([]RankResult, len(response.Output.Results))
for i, aliyunResult := range response.Output.Results {
results[i] = RankResult{
Index: aliyunResult.Index,
Document: DocumentInfo{
Text: aliyunResult.Document.Text,
},
RelevanceScore: aliyunResult.RelevanceScore,
}
}
return results, nil
}
// GetModelName returns the name of the reranking model
func (r *AliyunReranker) GetModelName() string {
return r.modelName
}
// GetModelID returns the unique identifier of the reranking model
func (r *AliyunReranker) GetModelID() string {
return r.modelID
}

View File

@@ -88,10 +88,10 @@ type RerankerConfig struct {
// NewReranker creates a reranker
func NewReranker(config *RerankerConfig) (Reranker, error) {
switch strings.ToLower(string(config.Source)) {
case string(types.ModelSourceRemote):
// 根据URL判断模型来源而不是依赖Source字段
if strings.Contains(config.BaseURL, "https://dashscope.aliyuncs.com/api/v1/services/rerank/text-rerank/text-rerank") {
return NewAliyunReranker(config)
} else {
return NewOpenAIReranker(config)
default:
return nil, fmt.Errorf("unsupported rerank model source: %s", config.Source)
}
}

View File

@@ -70,4 +70,5 @@ type KnowledgeRepository interface {
) (bool, *types.Knowledge, error)
// AminusB returns the difference set of A and B.
AminusB(ctx context.Context, Atenant uint, A string, Btenant uint, B string) ([]string, error)
UpdateKnowledgeColumn(ctx context.Context, id string, column string, value interface{}) error
}

View File

@@ -33,6 +33,7 @@ type ModelSource string
const (
ModelSourceLocal ModelSource = "local" // Local model
ModelSourceRemote ModelSource = "remote" // Remote model
ModelSourceAliyun ModelSource = "aliyun" // Aliyun DashScope model
)
type EmbeddingParameters struct {

View File

@@ -703,8 +703,8 @@ else
if [ "$START_OLLAMA" = true ] && [ "$START_DOCKER" = true ]; then
if [ $OLLAMA_RESULT -eq 0 ] && [ $DOCKER_RESULT -eq 0 ]; then
log_success "所有服务启动完成,可通过以下地址访问:"
echo -e "${GREEN} - 前端界面: http://localhost${NC}"
echo -e "${GREEN} - API接口: http://localhost:8080${NC}"
echo -e "${GREEN} - 前端界面: http://localhost:${FRONTEND_PORT:-80}${NC}"
echo -e "${GREEN} - API接口: http://localhost:${APP_PORT:-8080}${NC}"
echo -e "${GREEN} - Jaeger链路追踪: http://localhost:16686${NC}"
else
log_error "部分服务启动失败,请检查日志并修复问题"
@@ -714,8 +714,8 @@ else
echo -e "${GREEN} - Ollama API: http://localhost:$OLLAMA_PORT${NC}"
elif [ "$START_DOCKER" = true ] && [ $DOCKER_RESULT -eq 0 ]; then
log_success "Docker容器启动完成可通过以下地址访问:"
echo -e "${GREEN} - 前端界面: http://localhost${NC}"
echo -e "${GREEN} - API接口: http://localhost:8080${NC}"
echo -e "${GREEN} - 前端界面: http://localhost:${FRONTEND_PORT:-80}${NC}"
echo -e "${GREEN} - API接口: http://localhost:${APP_PORT:-8080}${NC}"
echo -e "${GREEN} - Jaeger链路追踪: http://localhost:16686${NC}"
fi
fi

View File

@@ -1,5 +1,6 @@
grpcio
grpcio-tools
grpcio-health-checking
protobuf
python-docx
PyPDF2
@@ -13,7 +14,7 @@ urllib3
markdownify
mistletoe
goose3[all]
paddleocr==3.0.0
paddleocr>=2.10.0,<3.0.0
markdown
pypdf
cos-python-sdk-v5
@@ -25,7 +26,7 @@ ollama
pdfplumber
--extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cpu/
paddlepaddle==3.0.0
paddlepaddle>=3.0.0,<4.0.0
# --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
# paddlepaddle-gpu==3.0.0

View File

@@ -24,4 +24,47 @@ logger = logging.getLogger(__name__)
def init_ocr_model():
PaddleOCR()
"""Initialize PaddleOCR model to pre-download and cache models"""
try:
logger.info("Initializing PaddleOCR model for pre-download...")
# 使用与代码中相同的配置
ocr_config = {
"use_gpu": False,
"text_det_limit_type": "max",
"text_det_limit_side_len": 960,
"use_doc_orientation_classify": True, # 启用文档方向分类
"use_doc_unwarping": False,
"use_textline_orientation": True, # 启用文本行方向检测
"text_recognition_model_name": "PP-OCRv4_server_rec",
"text_detection_model_name": "PP-OCRv4_server_det",
"text_det_thresh": 0.3,
"text_det_box_thresh": 0.6,
"text_det_unclip_ratio": 1.5,
"text_rec_score_thresh": 0.0,
"ocr_version": "PP-OCRv4",
"lang": "ch",
"show_log": False,
"use_dilation": True,
"det_db_score_mode": "slow",
}
# 初始化PaddleOCR这会触发模型下载和缓存
ocr = PaddleOCR(**ocr_config)
logger.info("PaddleOCR model initialization completed successfully")
# 测试OCR功能以确保模型正常工作
import numpy as np
from PIL import Image
# 创建一个简单的测试图像
test_image = np.ones((100, 300, 3), dtype=np.uint8) * 255
test_pil = Image.fromarray(test_image)
# 执行一次OCR测试
result = ocr.ocr(np.array(test_pil), cls=False)
logger.info("PaddleOCR test completed successfully")
except Exception as e:
logger.error(f"Failed to initialize PaddleOCR model: {str(e)}")
raise

View File

@@ -33,7 +33,9 @@ except ImportError:
except ImportError:
# If both imports fail, set to None
Caption = None
logging.warning("Failed to import Caption, image captioning will be unavailable")
logging.warning(
"Failed to import Caption, image captioning will be unavailable"
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
@@ -78,7 +80,9 @@ class BaseParser(ABC):
"""
if cls._ocr_engine is None and not cls._ocr_engine_failed:
try:
cls._ocr_engine = OCREngine.get_instance(backend_type=backend_type, **kwargs)
cls._ocr_engine = OCREngine.get_instance(
backend_type=backend_type, **kwargs
)
if cls._ocr_engine is None:
cls._ocr_engine_failed = True
logger.error(f"Failed to initialize OCR engine ({backend_type})")
@@ -89,7 +93,6 @@ class BaseParser(ABC):
logger.error(f"Failed to initialize OCR engine: {str(e)}")
return None
return cls._ocr_engine
def __init__(
self,
@@ -135,7 +138,7 @@ class BaseParser(ABC):
self.max_concurrent_tasks = max_concurrent_tasks
self.max_chunks = max_chunks
self.chunking_config = chunking_config
logger.info(
f"Initializing {self.__class__.__name__} for file: {file_name}, type: {self.file_type}"
)
@@ -174,10 +177,14 @@ class BaseParser(ABC):
resized_image = self._resize_image_if_needed(image)
# Get OCR engine
ocr_engine = self.get_ocr_engine(backend_type=self.ocr_backend, **self.ocr_config)
ocr_engine = self.get_ocr_engine(
backend_type=self.ocr_backend, **self.ocr_config
)
if ocr_engine is None:
logger.error(f"OCR engine ({self.ocr_backend}) initialization failed or unavailable, "
"skipping OCR recognition")
logger.error(
f"OCR engine ({self.ocr_backend}) initialization failed or unavailable, "
"skipping OCR recognition"
)
return ""
# Execute OCR prediction
@@ -199,11 +206,13 @@ class BaseParser(ABC):
return ocr_result
except Exception as e:
process_time = time.time() - start_time
logger.error(f"OCR recognition error: {str(e)}, time: {process_time:.2f} seconds")
logger.error(
f"OCR recognition error: {str(e)}, time: {process_time:.2f} seconds"
)
return ""
finally:
# Release image resources
if resized_image is not image and hasattr(resized_image, 'close'):
if resized_image is not image and hasattr(resized_image, "close"):
# Only close the new image we created, not the original image
resized_image.close()
@@ -218,25 +227,33 @@ class BaseParser(ABC):
"""
try:
# If it's a PIL Image
if hasattr(image, 'size'):
if hasattr(image, "size"):
width, height = image.size
if width > self.max_image_size or height > self.max_image_size:
logger.info(f"Resizing PIL image, original size: {width}x{height}")
scale = min(self.max_image_size / width, self.max_image_size / height)
scale = min(
self.max_image_size / width, self.max_image_size / height
)
new_width = int(width * scale)
new_height = int(height * scale)
resized_image = image.resize((new_width, new_height))
logger.info(f"Resized to: {new_width}x{new_height}")
return resized_image
else:
logger.info(f"PIL image size {width}x{height} is within limits, no resizing needed")
logger.info(
f"PIL image size {width}x{height} is within limits, no resizing needed"
)
return image
# If it's a numpy array
elif hasattr(image, 'shape'):
elif hasattr(image, "shape"):
height, width = image.shape[:2]
if width > self.max_image_size or height > self.max_image_size:
logger.info(f"Resizing numpy image, original size: {width}x{height}")
scale = min(self.max_image_size / width, self.max_image_size / height)
logger.info(
f"Resizing numpy image, original size: {width}x{height}"
)
scale = min(
self.max_image_size / width, self.max_image_size / height
)
new_width = int(width * scale)
new_height = int(height * scale)
# Use PIL for resizing numpy arrays
@@ -246,7 +263,9 @@ class BaseParser(ABC):
logger.info(f"Resized to: {new_width}x{new_height}")
return resized_image
else:
logger.info(f"Numpy image size {width}x{height} is within limits, no resizing needed")
logger.info(
f"Numpy image size {width}x{height} is within limits, no resizing needed"
)
return image
else:
logger.warning(f"Unknown image type: {type(image)}, cannot resize")
@@ -278,7 +297,9 @@ class BaseParser(ABC):
caption = ""
if self.caption_parser:
logger.info(f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption")
logger.info(
f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption"
)
# Convert image to base64 for caption generation
img_base64 = image_to_base64(image)
if img_base64:
@@ -295,7 +316,7 @@ class BaseParser(ABC):
# Release image resources
del image
return ocr_text, caption, image_url
async def process_image_async(self, image, image_url=None):
@@ -325,13 +346,17 @@ class BaseParser(ABC):
ocr_task = loop.run_in_executor(None, self.perform_ocr, resized_image)
ocr_text = await asyncio.wait_for(ocr_task, timeout=30.0)
except asyncio.TimeoutError:
logger.error("OCR processing timed out (30 seconds), skipping this image")
logger.error(
"OCR processing timed out (30 seconds), skipping this image"
)
ocr_text = ""
except Exception as e:
logger.error(f"OCR processing error: {str(e)}")
ocr_text = ""
logger.info(f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption")
logger.info(
f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption"
)
caption = ""
if self.caption_parser:
try:
@@ -340,9 +365,13 @@ class BaseParser(ABC):
if img_base64:
# Add timeout to avoid blocking caption retrieval (30 seconds timeout)
caption_task = self.get_image_caption_async(img_base64)
image_data, caption = await asyncio.wait_for(caption_task, timeout=30.0)
image_data, caption = await asyncio.wait_for(
caption_task, timeout=30.0
)
if caption:
logger.info(f"Successfully obtained image caption: {caption}")
logger.info(
f"Successfully obtained image caption: {caption}"
)
else:
logger.warning("Failed to get caption")
else:
@@ -353,27 +382,20 @@ class BaseParser(ABC):
except Exception as e:
logger.error(f"Failed to get caption: {str(e)}")
else:
logger.info("Caption service not initialized, skipping caption retrieval")
logger.info(
"Caption service not initialized, skipping caption retrieval"
)
return ocr_text, caption, image_url
finally:
# Release image resources
if resized_image is not image and hasattr(resized_image, 'close'):
if resized_image is not image and hasattr(resized_image, "close"):
# Only close the new image we created, not the original image
resized_image.close()
async def process_with_limit(self, idx, image, url, semaphore, current_request_id=None):
async def process_with_limit(self, idx, image, url, semaphore):
"""Function to process a single image using a semaphore"""
try:
# Set request ID in the asynchronous task
if current_request_id:
try:
from utils.request import set_request_id
set_request_id(current_request_id)
logger.info(f"Asynchronous task {idx+1} setting request ID: {current_request_id}")
except Exception as e:
logger.warning(f"Failed to set request ID in asynchronous task: {str(e)}")
logger.info(f"Waiting to process image {idx+1}")
async with semaphore: # Use semaphore to control concurrency
logger.info(f"Starting to process image {idx+1}")
@@ -385,7 +407,7 @@ class BaseParser(ABC):
return ("", "", url) # Return empty result to avoid overall failure
finally:
# Manually release image resources
if hasattr(image, 'close'):
if hasattr(image, "close"):
image.close()
async def process_multiple_images(self, images_data):
@@ -404,26 +426,19 @@ class BaseParser(ABC):
return []
# Set max concurrency, reduce concurrency to avoid resource contention
max_concurrency = min(self.max_concurrent_tasks, 5) # Reduce concurrency to prevent excessive memory usage
max_concurrency = min(
self.max_concurrent_tasks, 1
) # Reduce concurrency to prevent excessive memory usage
# Use semaphore to limit concurrency
semaphore = asyncio.Semaphore(max_concurrency)
# Store results to avoid overall failure due to task failure
results = []
# Get current request ID to set in each asynchronous task
current_request_id = None
try:
from utils.request import get_request_id
current_request_id = get_request_id()
logger.info(f"Capturing current request ID before async processing: {current_request_id}")
except Exception as e:
logger.warning(f"Failed to get current request ID: {str(e)}")
# Create all tasks, but use semaphore to limit actual concurrency
tasks = [
self.process_with_limit(i, img, url, semaphore, current_request_id)
self.process_with_limit(i, img, url, semaphore)
for i, (img, url) in enumerate(images_data)
]
@@ -434,7 +449,9 @@ class BaseParser(ABC):
# Handle possible exception results
for i, result in enumerate(completed_results):
if isinstance(result, Exception):
logger.error(f"Image {i+1} processing returned an exception: {str(result)}")
logger.error(
f"Image {i+1} processing returned an exception: {str(result)}"
)
# For exceptions, add empty results
if i < len(images_data):
results.append(("", "", images_data[i][1]))
@@ -449,7 +466,9 @@ class BaseParser(ABC):
images_data.clear()
logger.info("Image processing resource cleanup complete")
logger.info(f"Completed concurrent processing of {len(results)}/{len(images_data)} images")
logger.info(
f"Completed concurrent processing of {len(results)}/{len(images_data)} images"
)
return results
def decode_bytes(self, content: bytes) -> str:
@@ -529,9 +548,13 @@ class BaseParser(ABC):
def __init_storage(self):
"""Initialize storage client based on configuration"""
if self._storage is None:
storage_config = self.chunking_config.storage_config if self.chunking_config else None
storage_config = (
self.chunking_config.storage_config if self.chunking_config else None
)
self._storage = create_storage(storage_config)
logger.info(f"Initialized storage client: {self._storage.__class__.__name__}")
logger.info(
f"Initialized storage client: {self._storage.__class__.__name__}"
)
return self._storage
def upload_file(self, file_path: str) -> str:
@@ -605,40 +628,50 @@ class BaseParser(ABC):
logger.info(f"Beginning chunking process for text")
chunks = self.chunk_text(text)
logger.info(f"Created {len(chunks)} chunks from document")
# Limit the number of returned chunks
if len(chunks) > self.max_chunks:
logger.warning(f"Limiting chunks from {len(chunks)} to maximum {self.max_chunks}")
chunks = chunks[:self.max_chunks]
logger.warning(
f"Limiting chunks from {len(chunks)} to maximum {self.max_chunks}"
)
chunks = chunks[: self.max_chunks]
# If multimodal is enabled and file type is supported, process images in each chunk
if self.enable_multimodal:
# Get file extension and convert to lowercase
file_ext = (
os.path.splitext(self.file_name)[1].lower()
if self.file_name
else (
self.file_type.lower()
if self.file_type
else ""
)
else (self.file_type.lower() if self.file_type else "")
)
# Define allowed file types for image processing
allowed_types = [
'.pdf', # PDF files
'.md', '.markdown', # Markdown files
'.doc', '.docx', # Word documents
".pdf", # PDF files
".md",
".markdown", # Markdown files
".doc",
".docx", # Word documents
# Image files
'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'
".jpg",
".jpeg",
".png",
".gif",
".bmp",
".tiff",
".webp",
]
if file_ext in allowed_types:
logger.info(f"Processing images in each chunk for file type: {file_ext}")
logger.info(
f"Processing images in each chunk for file type: {file_ext}"
)
chunks = self.process_chunks_images(chunks, image_map)
else:
logger.info(f"Skipping image processing for unsupported file type: {file_ext}")
logger.info(
f"Skipping image processing for unsupported file type: {file_ext}"
)
return ParseResult(text=text, chunks=chunks)
def _split_into_units(self, text: str) -> List[str]:
@@ -649,11 +682,13 @@ class BaseParser(ABC):
Returns:
基本单元的列表
"""
logger.info(f"Splitting text into basic units with robust structure protection, text length: {len(text)}")
logger.info(
f"Splitting text into basic units with robust structure protection, text length: {len(text)}"
)
# 定义所有需要作为整体保护的结构模式 ---
table_pattern = r"(?m)(^\|.*\|[ \t]*\r?\n(?:[ \t]*\r?\n)?^\|\s*:?--+.*\r?\n(?:^\|.*\|\r?\n?)*)"
# 其他需要保护的结构(代码块、公式块、行内元素)
code_block_pattern = r"```[\s\S]*?```"
math_block_pattern = r"\$\$[\s\S]*?\$\$"
@@ -661,7 +696,12 @@ class BaseParser(ABC):
# 查找所有受保护结构的位置 ---
protected_ranges = []
for pattern in [table_pattern, code_block_pattern, math_block_pattern, inline_pattern]:
for pattern in [
table_pattern,
code_block_pattern,
math_block_pattern,
inline_pattern,
]:
for match in re.finditer(pattern, text):
# 确保匹配到的不是空字符串,避免无效范围
if match.group(0).strip():
@@ -669,8 +709,10 @@ class BaseParser(ABC):
# 按起始位置排序
protected_ranges.sort(key=lambda x: x[0])
logger.info(f"Found {len(protected_ranges)} protected structures (tables, code, formulas, images, links).")
logger.info(
f"Found {len(protected_ranges)} protected structures (tables, code, formulas, images, links)."
)
# 合并可能重叠的保护范围 ---
# 确保我们有一组不相交的、需要保护的文本块
if protected_ranges:
@@ -685,15 +727,17 @@ class BaseParser(ABC):
# 如果不重叠,则完成当前范围并开始一个新的范围
merged_ranges.append((current_start, current_end))
current_start, current_end = next_start, next_end
merged_ranges.append((current_start, current_end))
protected_ranges = merged_ranges
logger.info(f"After merging overlaps, {len(protected_ranges)} protected ranges remain.")
logger.info(
f"After merging overlaps, {len(protected_ranges)} protected ranges remain."
)
# 根据保护范围和分隔符来分割文本 ---
units = []
last_end = 0
# 定义分隔符的正则表达式,通过加括号来保留分隔符本身
separator_pattern = f"({'|'.join(re.escape(s) for s in self.separators)})"
@@ -703,7 +747,7 @@ class BaseParser(ABC):
pre_text = text[last_end:start]
# 对这部分非保护文本进行分割,并保留分隔符
segments = re.split(separator_pattern, pre_text)
units.extend([s for s in segments if s]) # 添加所有非空部分
units.extend([s for s in segments if s]) # 添加所有非空部分
# b. 将整个受保护的块(例如,一个完整的表格)作为一个单独的、不可分割的单元添加
protected_text = text[start:end]
@@ -715,10 +759,11 @@ class BaseParser(ABC):
if last_end < len(text):
post_text = text[last_end:]
segments = re.split(separator_pattern, post_text)
units.extend([s for s in segments if s]) # 添加所有非空部分
units.extend([s for s in segments if s]) # 添加所有非空部分
logger.info(f"Text splitting complete, created {len(units)} final basic units.")
return units
def _find_complete_units(self, units: List[str], target_size: int) -> List[str]:
"""Find a list of complete units that do not exceed the target size
@@ -884,71 +929,58 @@ class BaseParser(ABC):
"""
logger.info(f"Extracting image information from Chunk #{chunk.seq}")
text = chunk.content
# Regex to extract image information from text, supporting Markdown images and HTML images
img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)|<img [^>]*src="([^"]+)" [^>]*>'
# Extract image information
img_matches = list(re.finditer(img_pattern, text))
logger.info(f"Chunk #{chunk.seq} found {len(img_matches)} images")
images_info = []
for match_idx, match in enumerate(img_matches):
# Process image URL
img_url = match.group(2) if match.group(2) else match.group(3)
alt_text = match.group(1) if match.group(1) else ""
# Record image information
image_info = {
"original_url": img_url,
"start": match.start(),
"end": match.end(),
"alt_text": alt_text,
"match_text": text[match.start():match.end()]
"match_text": text[match.start() : match.end()],
}
images_info.append(image_info)
logger.info(
f"Image in Chunk #{chunk.seq} {match_idx+1}: "
f"URL={img_url[:50]}..."
f"Image in Chunk #{chunk.seq} {match_idx+1}: " f"URL={img_url[:50]}..."
if len(img_url) > 50
else f"Image in Chunk #{chunk.seq} {match_idx+1}: URL={img_url}"
)
return images_info
async def download_and_upload_image(self, img_url: str, current_request_id=None, image_map=None):
async def download_and_upload_image(self, img_url: str):
"""Download image and upload to object storage, if it's already an object storage path or local path, use directly
Args:
img_url: Image URL or local path
current_request_id: Current request ID
image_map: Optional dictionary mapping image URLs to Image objects
Returns:
tuple: (original URL, storage URL, image object), if failed returns (original URL, None, None)
"""
# Set request ID context in the asynchronous task
try:
if current_request_id:
from utils.request import set_request_id
set_request_id(current_request_id)
logger.info(f"Asynchronous task setting request ID: {current_request_id}")
except Exception as e:
logger.warning(f"Failed to set request ID in asynchronous task: {str(e)}")
try:
import requests
from PIL import Image
import io
# Check if image is already in the image_map
if image_map and img_url in image_map:
logger.info(f"Image already in image_map: {img_url}, using cached object")
return img_url, img_url, image_map[img_url]
# Check if it's already a storage URL (COS or MinIO)
is_storage_url = any(pattern in img_url for pattern in ["cos", "myqcloud.com", "minio", ".s3."])
is_storage_url = any(
pattern in img_url
for pattern in ["cos", "myqcloud.com", "minio", ".s3."]
)
if is_storage_url:
logger.info(f"Image already on COS: {img_url}, no need to re-upload")
try:
@@ -961,7 +993,7 @@ class BaseParser(ABC):
proxies["http"] = http_proxy
if https_proxy:
proxies["https"] = https_proxy
response = requests.get(img_url, timeout=5, proxies=proxies)
if response.status_code == 200:
image = Image.open(io.BytesIO(response.content))
@@ -972,12 +1004,14 @@ class BaseParser(ABC):
# Image will be closed by the caller
pass
else:
logger.warning(f"Failed to get storage image: {response.status_code}")
logger.warning(
f"Failed to get storage image: {response.status_code}"
)
return img_url, img_url, None
except Exception as e:
logger.error(f"Error getting storage image: {str(e)}")
return img_url, img_url, None
# Check if it's a local file path
elif os.path.exists(img_url) and os.path.isfile(img_url):
logger.info(f"Using local image file: {img_url}")
@@ -986,17 +1020,19 @@ class BaseParser(ABC):
# Read local image
image = Image.open(img_url)
# Upload to storage
with open(img_url, 'rb') as f:
with open(img_url, "rb") as f:
content = f.read()
storage_url = self.upload_bytes(content)
logger.info(f"Successfully uploaded local image to storage: {storage_url}")
logger.info(
f"Successfully uploaded local image to storage: {storage_url}"
)
return img_url, storage_url, image
except Exception as e:
logger.error(f"Error processing local image: {str(e)}")
if image and hasattr(image, 'close'):
if image and hasattr(image, "close"):
image.close()
return img_url, None, None
# Normal remote URL download handling
else:
# Get proxy settings from environment variables
@@ -1007,17 +1043,21 @@ class BaseParser(ABC):
proxies["http"] = http_proxy
if https_proxy:
proxies["https"] = https_proxy
logger.info(f"Downloading image {img_url}, using proxy: {proxies if proxies else 'None'}")
logger.info(
f"Downloading image {img_url}, using proxy: {proxies if proxies else 'None'}"
)
response = requests.get(img_url, timeout=5, proxies=proxies)
if response.status_code == 200:
# Download successful, create image object
image = Image.open(io.BytesIO(response.content))
try:
# Upload to storage using the method in BaseParser
storage_url = self.upload_bytes(response.content)
logger.info(f"Successfully uploaded image to storage: {storage_url}")
logger.info(
f"Successfully uploaded image to storage: {storage_url}"
)
return img_url, storage_url, image
finally:
# Image will be closed by the caller
@@ -1025,74 +1065,79 @@ class BaseParser(ABC):
else:
logger.warning(f"Failed to download image: {response.status_code}")
return img_url, None, None
except Exception as e:
logger.error(f"Error downloading or processing image: {str(e)}")
return img_url, None, None
async def process_chunk_images_async(self, chunk, chunk_idx, total_chunks, current_request_id=None, image_map=None):
async def process_chunk_images_async(
self, chunk, chunk_idx, total_chunks, image_map=None
):
"""Asynchronously process images in a single Chunk
Args:
chunk: Chunk object to process
chunk_idx: Chunk index
total_chunks: Total number of chunks
current_request_id: Current request ID
image_map: Optional dictionary mapping image URLs to Image objects
Returns:
Processed Chunk object
"""
# Set request ID context in the asynchronous task
try:
if current_request_id:
from utils.request import set_request_id
set_request_id(current_request_id)
logger.info(f"Chunk processing task #{chunk_idx+1} setting request ID: {current_request_id}")
except Exception as e:
logger.warning(f"Failed to set request ID in Chunk processing task: {str(e)}")
logger.info(f"Starting to process images in Chunk #{chunk_idx+1}/{total_chunks}")
logger.info(
f"Starting to process images in Chunk #{chunk_idx+1}/{total_chunks}"
)
# Extract image information from the Chunk
images_info = self.extract_images_from_chunk(chunk)
if not images_info:
logger.info(f"Chunk #{chunk_idx+1} found no images")
return chunk
# Prepare images that need to be downloaded and processed
images_to_process = []
url_to_info_map = {} # Map URL to image information
# Record all image URLs that need to be processed
for img_info in images_info:
url = img_info["original_url"]
url_to_info_map[url] = img_info
# Create an asynchronous event loop (current loop)
loop = asyncio.get_event_loop()
# Concurrent download and upload of images
tasks = [self.download_and_upload_image(url, current_request_id, image_map) for url in url_to_info_map.keys()]
results = await asyncio.gather(*tasks)
results = []
download_tasks = []
for img_url in url_to_info_map.keys(): # Check if image is already in the image_map
if image_map and img_url in image_map:
logger.info(f"Image already in image_map: {img_url}, using cached object")
results.append((img_url, img_url, image_map[img_url]))
else:
download_task = self.download_and_upload_image(img_url)
download_tasks.append(download_task)
# Concurrent download and upload of images, ignore images that are already in the image_map
results.extend(await asyncio.gather(*download_tasks))
# Process download results, prepare for OCR processing
for orig_url, cos_url, image in results:
if cos_url and image:
img_info = url_to_info_map[orig_url]
img_info["cos_url"] = cos_url
images_to_process.append((image, cos_url))
# If no images were successfully downloaded and uploaded, return the original Chunk
if not images_to_process:
logger.info(f"Chunk #{chunk_idx+1} found no successfully downloaded and uploaded images")
logger.info(
f"Chunk #{chunk_idx+1} found no successfully downloaded and uploaded images"
)
return chunk
# Concurrent processing of all images (OCR + caption)
logger.info(f"Processing {len(images_to_process)} images in Chunk #{chunk_idx+1}")
logger.info(
f"Processing {len(images_to_process)} images in Chunk #{chunk_idx+1}"
)
# Concurrent processing of all images
processed_results = await self.process_multiple_images(images_to_process)
# Process OCR and Caption results
for ocr_text, caption, img_url in processed_results:
# Find the corresponding original URL
@@ -1100,22 +1145,24 @@ class BaseParser(ABC):
if info.get("cos_url") == img_url:
info["ocr_text"] = ocr_text if ocr_text else ""
info["caption"] = caption if caption else ""
if ocr_text:
logger.info(f"Image OCR extracted {len(ocr_text)} characters: {img_url}")
logger.info(
f"Image OCR extracted {len(ocr_text)} characters: {img_url}"
)
if caption:
logger.info(f"Obtained image description: '{caption}'")
break
# Add processed image information to the Chunk
processed_images = []
for img_info in images_info:
if "cos_url" in img_info:
processed_images.append(img_info)
# Update image information in the Chunk
chunk.images = processed_images
logger.info(f"Completed image processing in Chunk #{chunk_idx+1}")
return chunk
@@ -1128,42 +1175,37 @@ class BaseParser(ABC):
Returns:
List of processed document chunks
"""
logger.info(f"Starting concurrent processing of images in all {len(chunks)} chunks")
logger.info(
f"Starting concurrent processing of images in all {len(chunks)} chunks"
)
if not chunks:
logger.warning("No chunks to process")
return chunks
# Get current request ID to pass to asynchronous tasks
current_request_id = None
try:
from utils.request import get_request_id
current_request_id = get_request_id()
logger.info(f"Capturing current request ID before async processing: {current_request_id}")
except Exception as e:
logger.warning(f"Failed to get current request ID: {str(e)}")
# Create and run all Chunk concurrent processing tasks
async def process_all_chunks():
# Set max concurrency, reduce concurrency to avoid resource contention
max_concurrency = min(self.max_concurrent_tasks, 5) # Reduce concurrency
max_concurrency = min(self.max_concurrent_tasks, 1) # Reduce concurrency
# Use semaphore to limit concurrency
semaphore = asyncio.Semaphore(max_concurrency)
async def process_with_limit(chunk, idx, total):
"""Use semaphore to control concurrent processing of Chunks"""
async with semaphore:
return await self.process_chunk_images_async(chunk, idx, total, current_request_id, image_map)
return await self.process_chunk_images_async(
chunk, idx, total, image_map
)
# Create tasks for all Chunks
tasks = [
process_with_limit(chunk, idx, len(chunks))
for idx, chunk in enumerate(chunks)
]
# Execute all tasks concurrently
results = await asyncio.gather(*tasks, return_exceptions=True)
# Handle possible exceptions
processed_chunks = []
for i, result in enumerate(results):
@@ -1174,9 +1216,9 @@ class BaseParser(ABC):
processed_chunks.append(chunks[i])
else:
processed_chunks.append(result)
return processed_chunks
# Create event loop and run all tasks
try:
# Check if event loop already exists
@@ -1189,11 +1231,13 @@ class BaseParser(ABC):
# If no event loop, create a new one
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# Execute processing for all Chunks
processed_chunks = loop.run_until_complete(process_all_chunks())
logger.info(f"Successfully completed concurrent processing of {len(processed_chunks)}/{len(chunks)} chunks")
logger.info(
f"Successfully completed concurrent processing of {len(processed_chunks)}/{len(chunks)} chunks"
)
return processed_chunks
except Exception as e:
logger.error(f"Error during concurrent chunk processing: {str(e)}")

View File

@@ -54,6 +54,7 @@ class DocParser(BaseParser):
enable_multimodal=self.enable_multimodal,
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
chunking_config=self.chunking_config,
separators=self.separators,
)
text = docx_parser.parse_into_text(docx_content)

View File

@@ -33,30 +33,40 @@ class PaddleOCRBackend(OCRBackend):
"""Initialize PaddleOCR backend"""
self.ocr = None
try:
import os
import paddle
# Set PaddlePaddle to use CPU and disable GPU
os.environ['CUDA_VISIBLE_DEVICES'] = ''
paddle.set_device('cpu')
from paddleocr import PaddleOCR
# Default OCR configuration
# OCR configuration with text orientation classification enabled
ocr_config = {
"text_det_limit_type": "max", # Change from 'min' to 'max'
"text_det_limit_side_len": 960, # A standard and safe limit for the longest side
"use_doc_orientation_classify": False, # Do not use document image orientation classification
"use_doc_unwarping": False, # Do not use document unwarping
"use_textline_orientation": False, # Do not use textline orientation classification
"text_recognition_model_name": "PP-OCRv5_server_rec",
"text_detection_model_name": "PP-OCRv5_server_det",
"text_recognition_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_rec_infer",
"text_detection_model_dir": "/root/.paddlex/official_models/PP-OCRv5_server_det_infer",
"text_det_thresh": 0.3, # Text detection pixel threshold
"text_det_box_thresh": 0.6, # Text detection box threshold
"text_det_unclip_ratio": 1.5, # Text detection expansion ratio
"text_rec_score_thresh": 0.0, # Text recognition confidence threshold
"ocr_version": "PP-OCRv5", # Switch to PP-OCRv4 here to compare
"use_gpu": False,
"text_det_limit_type": "max",
"text_det_limit_side_len": 960,
"use_doc_orientation_classify": True, # 启用文档方向分类
"use_doc_unwarping": False,
"use_textline_orientation": True, # 启用文本行方向检测
"text_recognition_model_name": "PP-OCRv4_server_rec",
"text_detection_model_name": "PP-OCRv4_server_det",
"text_det_thresh": 0.3,
"text_det_box_thresh": 0.6,
"text_det_unclip_ratio": 1.5,
"text_rec_score_thresh": 0.0,
"ocr_version": "PP-OCRv4",
"lang": "ch",
"show_log": False,
"use_dilation": True, # improves accuracy
"det_db_score_mode": "slow", # improves accuracy
}
self.ocr = PaddleOCR(**ocr_config)
logger.info("PaddleOCR engine initialized successfully")
except ImportError:
logger.error("Failed to import paddleocr. Please install it with 'pip install paddleocr'")
except ImportError as e:
logger.error(f"Failed to import paddleocr: {str(e)}. Please install it with 'pip install paddleocr'")
except Exception as e:
logger.error(f"Failed to initialize PaddleOCR: {str(e)}")
@@ -71,50 +81,39 @@ class PaddleOCRBackend(OCRBackend):
"""
try:
# Ensure image is in RGB format
if hasattr(image, "convert") and image.mode != "RGB":
image = image.convert("RGB")
# Convert to numpy array if needed
if hasattr(image, "convert"):
if image.mode == "RGBA":
img_for_ocr = image.convert("RGB") # 尝试转换为 RGB
logger.info(f"Converted image from RGBA to RGB format for OCR.")
elif image.mode != "RGB": # 如果不是 RGBA 也不是 RGB也尝试转 RGB
img_for_ocr = image.convert("RGB")
logger.info(f"Converted image from {image.mode} to RGB format for OCR.")
else:
img_for_ocr = image
logger.info(f"Image already in RGB format.")
image_array = np.array(image)
else:
img_for_ocr = image
logger.info(f"Image is not a PIL.Image object, assuming it's already suitable for OCR.")
image_array = image
# Convert to numpy array if not already
if hasattr(img_for_ocr, "convert"):
image_array = np.array(img_for_ocr)
else:
image_array = img_for_ocr
ocr_result = self.ocr.predict(image_array)
# Perform OCR
ocr_result = self.ocr.ocr(image_array, cls=False)
# Extract text
if ocr_result and any(ocr_result):
ocr_text = ""
for image_result in ocr_result:
ocr_text = ocr_text + " ".join(image_result["rec_texts"])
text_length = len(ocr_text)
if text_length > 0:
logger.info(f"OCR extracted {text_length} characters")
logger.info(
f"OCR text sample: {ocr_text[:100]}..."
if text_length > 100
else f"OCR text: {ocr_text}"
)
return ocr_text
else:
logger.warning("OCR returned empty result")
ocr_text = ""
if ocr_result and ocr_result[0]:
for line in ocr_result[0]:
if line and len(line) >= 2:
text = line[1][0] if line[1] else ""
if text:
ocr_text += text + " "
text_length = len(ocr_text.strip())
if text_length > 0:
logger.info(f"OCR extracted {text_length} characters")
return ocr_text.strip()
else:
logger.warning("OCR did not return any result")
return ""
logger.warning("OCR returned empty result")
return ""
except Exception as e:
logger.error(f"OCR recognition error: {str(e)}")
return ""
class NanonetsOCRBackend(OCRBackend):
"""Nanonets OCR backend implementation using OpenAI API format"""

View File

@@ -5,6 +5,9 @@ from concurrent import futures
import traceback
import grpc
import uuid
import atexit
from grpc_health.v1 import health_pb2_grpc
from grpc_health.v1.health import HealthServicer
# Add parent directory to Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -324,99 +327,51 @@ def init_ocr_engine(ocr_backend, ocr_config):
logger.error(f"Error initializing OCR engine: {str(e)}")
return False
def serve():
init_ocr_engine(os.getenv("OCR_BACKEND", "paddle"), {
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
})
# Set max number of worker threads and processes
# Set max number of worker threads
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
worker_processes = int(os.environ.get("GRPC_WORKER_PROCESSES", str(os.cpu_count() or 1)))
logger.info(f"Starting DocReader service, max worker threads per process: {max_workers}, "
f"processes: {worker_processes}")
logger.info(f"Starting DocReader service with {max_workers} worker threads")
# Get port number
port = os.environ.get("GRPC_PORT", "50051")
# Multi-process mode
if worker_processes > 1:
import multiprocessing
processes = []
def run_server():
# Create server
server = grpc.server(
futures.ThreadPoolExecutor(max_workers=max_workers),
options=[
('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
],
)
# Register service
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
# Set listen address
server.add_insecure_port(f"[::]:{port}")
# Start service
server.start()
logger.info(f"Worker process {os.getpid()} started on port {port}")
try:
# Wait for service termination
server.wait_for_termination()
except KeyboardInterrupt:
logger.info(f"Worker process {os.getpid()} received termination signal")
server.stop(0)
# Start specified number of worker processes
for i in range(worker_processes):
process = multiprocessing.Process(target=run_server)
processes.append(process)
process.start()
logger.info(f"Started worker process {process.pid} ({i+1}/{worker_processes})")
# Wait for all processes to complete
try:
for process in processes:
process.join()
except KeyboardInterrupt:
logger.info("Master process received termination signal")
for process in processes:
if process.is_alive():
logger.info(f"Terminating worker process {process.pid}")
process.terminate()
# Create server
server = grpc.server(
futures.ThreadPoolExecutor(max_workers=max_workers),
options=[
('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
],
)
# Single-process mode
else:
# Create server
server = grpc.server(
futures.ThreadPoolExecutor(max_workers=max_workers),
options=[
('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
],
)
# Register service
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
# Set listen address
server.add_insecure_port(f"[::]:{port}")
# Start service
server.start()
logger.info(f"Server started on port {port} (single process mode)")
logger.info("Server is ready to accept connections")
try:
# Wait for service termination
server.wait_for_termination()
except KeyboardInterrupt:
logger.info("Received termination signal, shutting down server")
server.stop(0)
# Register services
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
# Register health check service
health_servicer = HealthServicer()
health_pb2_grpc.add_HealthServicer_to_server(health_servicer, server)
# Set listen address
server.add_insecure_port(f"[::]:{port}")
# Start service
server.start()
logger.info(f"Server started on port {port}")
logger.info("Server is ready to accept connections")
try:
# Wait for service termination
server.wait_for_termination()
except KeyboardInterrupt:
logger.info("Received termination signal, shutting down server")
server.stop(0)
if __name__ == "__main__":
serve()