mirror of
https://github.com/Tencent/WeKnora.git
synced 2025-11-25 03:15:00 +08:00
fix: ignore non-utf8 words when saving to db
This commit is contained in:
56
Makefile
56
Makefile
@@ -1,4 +1,42 @@
|
||||
.PHONY: build run test clean docker-build docker-run migrate-up migrate-down docker-restart docker-stop start-all stop-all start-ollama stop-ollama
|
||||
.PHONY: help build run test clean docker-build docker-run migrate-up migrate-down docker-restart docker-stop start-all stop-all start-ollama stop-ollama build-images build-images-app build-images-docreader build-images-frontend clean-images
|
||||
|
||||
# Show help: print the available make targets, grouped by category.
# A single printf emits one line per argument; empty arguments produce the blank separator lines.
help:
	@printf '%s\n' \
		"WeKnora Makefile 帮助" \
		"" \
		"基础命令:" \
		" build 构建应用" \
		" run 运行应用" \
		" test 运行测试" \
		" clean 清理构建文件" \
		"" \
		"Docker 命令:" \
		" docker-build 构建 Docker 镜像" \
		" docker-run 运行 Docker 容器" \
		" docker-stop 停止 Docker 容器" \
		" docker-restart 重启 Docker 容器" \
		"" \
		"服务管理:" \
		" start-all 启动所有服务" \
		" stop-all 停止所有服务" \
		" start-ollama 仅启动 Ollama 服务" \
		"" \
		"镜像构建:" \
		" build-images 从源码构建所有镜像" \
		" build-images-app 从源码构建应用镜像" \
		" build-images-docreader 从源码构建文档读取器镜像" \
		" build-images-frontend 从源码构建前端镜像" \
		" clean-images 清理本地镜像" \
		"" \
		"数据库:" \
		" migrate-up 执行数据库迁移" \
		" migrate-down 回滚数据库迁移" \
		"" \
		"开发工具:" \
		" fmt 格式化代码" \
		" lint 代码检查" \
		" deps 安装依赖" \
		" docs 生成 API 文档"
|
||||
|
||||
# Go related variables
|
||||
BINARY_NAME=WeKnora
|
||||
@@ -53,6 +91,22 @@ stop-all:
|
||||
docker-stop:
|
||||
docker-compose down
|
||||
|
||||
# Targets for building the images from source.
# Each target runs the same build script and differs only in the flag it passes.
BUILD_IMAGES_SCRIPT := ./scripts/build_images.sh

# Build every image
build-images:
	$(BUILD_IMAGES_SCRIPT)

# Build only the application image
build-images-app:
	$(BUILD_IMAGES_SCRIPT) --app

# Build only the document reader image
build-images-docreader:
	$(BUILD_IMAGES_SCRIPT) --docreader

# Build only the frontend image
build-images-frontend:
	$(BUILD_IMAGES_SCRIPT) --frontend

# Remove the locally built images
clean-images:
	$(BUILD_IMAGES_SCRIPT) --clean
|
||||
|
||||
# Restart Docker container (stop, rebuild, start)
|
||||
docker-restart:
|
||||
docker-compose stop -t 60
|
||||
|
||||
8
frontend/.dockerignore
Normal file
8
frontend/.dockerignore
Normal file
@@ -0,0 +1,8 @@
|
||||
node_modules
|
||||
dist
|
||||
.git
|
||||
.gitignore
|
||||
README.md
|
||||
.vscode
|
||||
*.log
|
||||
.DS_Store
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
|
||||
"github.com/Tencent/WeKnora/internal/common"
|
||||
"github.com/Tencent/WeKnora/internal/types"
|
||||
"github.com/Tencent/WeKnora/internal/types/interfaces"
|
||||
"gorm.io/gorm"
|
||||
@@ -21,6 +22,9 @@ func NewChunkRepository(db *gorm.DB) interfaces.ChunkRepository {
|
||||
|
||||
// CreateChunks creates multiple chunks in batches
|
||||
func (r *chunkRepository) CreateChunks(ctx context.Context, chunks []*types.Chunk) error {
|
||||
for _, chunk := range chunks {
|
||||
chunk.Content = common.CleanInvalidUTF8(chunk.Content)
|
||||
}
|
||||
return r.db.WithContext(ctx).CreateInBatches(chunks, 100).Error
|
||||
}
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/Tencent/WeKnora/internal/common"
|
||||
"github.com/Tencent/WeKnora/internal/types"
|
||||
"github.com/pgvector/pgvector-go"
|
||||
)
|
||||
@@ -59,7 +60,7 @@ func toDBVectorEmbedding(indexInfo *types.IndexInfo, additionalParams map[string
|
||||
ChunkID: indexInfo.ChunkID,
|
||||
KnowledgeID: indexInfo.KnowledgeID,
|
||||
KnowledgeBaseID: indexInfo.KnowledgeBaseID,
|
||||
Content: indexInfo.Content,
|
||||
Content: common.CleanInvalidUTF8(indexInfo.Content),
|
||||
}
|
||||
// Add embedding data if available in additionalParams
|
||||
if additionalParams != nil && slices.Contains(slices.Collect(maps.Keys(additionalParams)), "embedding") {
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"regexp"
|
||||
"slices"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// ToInterfaceSlice converts a slice of strings to a slice of empty interfaces.
|
||||
@@ -73,3 +74,27 @@ func ParseLLMJsonResponse(content string, target interface{}) error {
|
||||
// If no code block found, return the original error
|
||||
return err
|
||||
}
|
||||
|
||||
// CleanInvalidUTF8 returns s with every byte that is not part of a valid
// UTF-8 sequence removed, and with every NUL character (\x00) removed.
//
// Valid runes are kept unchanged, including a correctly encoded U+FFFD.
// When s is already valid UTF-8 and contains no NUL byte it is returned as is,
// without allocating a copy.
func CleanInvalidUTF8(s string) string {
	// Fast path: nothing to strip, so avoid copying the string.
	if utf8.ValidString(s) && strings.IndexByte(s, 0) < 0 {
		return s
	}

	// Drop invalid bytes first. Removing NUL bytes first could join the bytes
	// on either side of a NUL into a sequence that then decodes as valid.
	valid := strings.ToValidUTF8(s, "")
	return strings.ReplaceAll(valid, "\x00", "")
}
|
||||
|
||||
309
scripts/build_images.sh
Executable file
309
scripts/build_images.sh
Executable file
@@ -0,0 +1,309 @@
|
||||
#!/bin/bash
|
||||
# 该脚本用于从源码构建WeKnora的所有Docker镜像
|
||||
|
||||
# Terminal color escape sequences (expanded later by `echo -e`)
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
BLUE='\033[0;34m'
NC='\033[0m' # no color / reset

# Project root: the parent of the directory that contains this script
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Version information
VERSION="1.0.0"
SCRIPT_NAME="$(basename "$0")"
|
||||
|
||||
# Print the usage text (title, invocation line, option list) and exit the script with status 0.
show_help() {
    echo -e "${GREEN}WeKnora 镜像构建脚本 v${VERSION}${NC}"
    echo -e "${GREEN}用法:${NC} $0 [选项]"
    # The option list contains no escapes, so it is printed verbatim, one argument per line.
    printf '%s\n' \
        "选项:" \
        " -h, --help 显示帮助信息" \
        " -a, --all 构建所有镜像(默认)" \
        " -p, --app 仅构建应用镜像" \
        " -d, --docreader 仅构建文档读取器镜像" \
        " -f, --frontend 仅构建前端镜像" \
        " -c, --clean 清理所有本地镜像" \
        " -v, --version 显示版本信息"
    exit 0
}
|
||||
|
||||
# Print the script title with its version and exit the script with status 0.
show_version() {
    local banner="${GREEN}WeKnora 镜像构建脚本 v${VERSION}${NC}"
    echo -e "${banner}"
    exit 0
}
|
||||
|
||||
# Logging helpers
# log_info MESSAGE: print MESSAGE to stdout behind a blue [INFO] tag.
log_info() {
    local message="$1"
    echo -e "${BLUE}[INFO]${NC} ${message}"
}
|
||||
|
||||
# log_warning MESSAGE: print MESSAGE to stdout behind a yellow [WARNING] tag.
log_warning() {
    local message="$1"
    echo -e "${YELLOW}[WARNING]${NC} ${message}"
}
|
||||
|
||||
# log_error MESSAGE: print MESSAGE to stdout behind a red [ERROR] tag.
log_error() {
    local message="$1"
    echo -e "${RED}[ERROR]${NC} ${message}"
}
|
||||
|
||||
# log_success MESSAGE: print MESSAGE to stdout behind a green [SUCCESS] tag.
log_success() {
    local message="$1"
    echo -e "${GREEN}[SUCCESS]${NC} ${message}"
}
|
||||
|
||||
# Check that Docker can be used.
# Returns 0 when the docker command is found and `docker info` succeeds, 1 otherwise.
check_docker() {
    log_info "检查Docker环境..."

    # The docker command must be resolvable.
    command -v docker >/dev/null 2>&1 || {
        log_error "未安装Docker,请先安装Docker"
        return 1
    }

    # Check whether the Docker service is running.
    docker info >/dev/null 2>&1 || {
        log_error "Docker服务未运行,请启动Docker服务"
        return 1
    }

    log_success "Docker环境检查通过"
    return 0
}
|
||||
|
||||
# Detect the platform.
# Exports PLATFORM and ARCH according to the machine name reported by `uname -m`;
# unrecognized machine names fall back to linux/amd64 with a warning.
check_platform() {
    log_info "检测系统平台信息..."

    local machine
    machine="$(uname -m)"

    case "$machine" in
        x86_64)
            export PLATFORM="linux/amd64"
            export ARCH="amd64"
            ;;
        aarch64 | arm64)
            export PLATFORM="linux/arm64"
            export ARCH="arm64"
            ;;
        *)
            log_warning "未识别的平台类型:${machine},将使用默认平台 linux/amd64"
            export PLATFORM="linux/amd64"
            export ARCH="amd64"
            ;;
    esac

    log_info "当前平台:$PLATFORM"
}
|
||||
|
||||
# Build the application image, tagged wechatopenai/weknora-app:${ARCH}-latest.
# Reads PROJECT_ROOT, PLATFORM and ARCH; GOPRIVATE, GOPROXY and GOSUMDB are
# forwarded as build args, with defaults applied when they are unset or empty.
# Returns 0 when the build succeeds, 1 otherwise.
build_app_image() {
    log_info "构建应用镜像 (weknora-app)..."

    # The Dockerfile path and the "." build context are relative to the project
    # root, so do not attempt the build if the directory cannot be entered.
    if ! cd "$PROJECT_ROOT"; then
        log_error "应用镜像构建失败"
        return 1
    fi

    # Expansions are quoted so that each value reaches docker as a single argument.
    if docker build \
        --platform "$PLATFORM" \
        --build-arg "GOPRIVATE_ARG=${GOPRIVATE:-}" \
        --build-arg "GOPROXY_ARG=${GOPROXY:-https://goproxy.cn,direct}" \
        --build-arg "GOSUMDB_ARG=${GOSUMDB:-off}" \
        -f docker/Dockerfile.app \
        -t "wechatopenai/weknora-app:${ARCH}-latest" \
        .; then
        log_success "应用镜像构建成功"
        return 0
    fi

    log_error "应用镜像构建失败"
    return 1
}
|
||||
|
||||
# Build the document reader image, tagged wechatopenai/weknora-docreader:${ARCH}-latest.
# Reads PROJECT_ROOT, PLATFORM and ARCH; PLATFORM is also forwarded as a build arg.
# Returns 0 when the build succeeds, 1 otherwise.
build_docreader_image() {
    log_info "构建文档读取器镜像 (weknora-docreader)..."

    # The Dockerfile path and the "." build context are relative to the project
    # root, so do not attempt the build if the directory cannot be entered.
    if ! cd "$PROJECT_ROOT"; then
        log_error "文档读取器镜像构建失败"
        return 1
    fi

    # Expansions are quoted so that each value reaches docker as a single argument.
    if docker build \
        --platform "$PLATFORM" \
        --build-arg "PLATFORM=$PLATFORM" \
        -f docker/Dockerfile.docreader \
        -t "wechatopenai/weknora-docreader:${ARCH}-latest" \
        .; then
        log_success "文档读取器镜像构建成功"
        return 0
    fi

    log_error "文档读取器镜像构建失败"
    return 1
}
|
||||
|
||||
# Build the frontend image, tagged wechatopenai/weknora-ui:${ARCH}-latest.
# Reads PROJECT_ROOT, PLATFORM and ARCH; the build context is the frontend/ directory.
# Returns 0 when the build succeeds, 1 otherwise.
build_frontend_image() {
    log_info "构建前端镜像 (weknora-ui)..."

    # The Dockerfile path and the frontend/ context are relative to the project
    # root, so do not attempt the build if the directory cannot be entered.
    if ! cd "$PROJECT_ROOT"; then
        log_error "前端镜像构建失败"
        return 1
    fi

    # Expansions are quoted so that each value reaches docker as a single argument.
    if docker build \
        --platform "$PLATFORM" \
        -f frontend/Dockerfile \
        -t "wechatopenai/weknora-ui:${ARCH}-latest" \
        frontend/; then
        log_success "前端镜像构建成功"
        return 0
    fi

    log_error "前端镜像构建失败"
    return 1
}
|
||||
|
||||
# Build all images.
# Runs the three image builds one after another (a failed build does not stop
# the following ones), prints a per-image summary, and returns 0 only when
# every build succeeded.
build_all_images() {
    log_info "开始构建所有镜像..."

    local app_result docreader_result frontend_result

    # Application image
    build_app_image
    app_result=$?

    # Document reader image
    build_docreader_image
    docreader_result=$?

    # Frontend image
    build_frontend_image
    frontend_result=$?

    # Build summary: one line per image, in build order
    echo ""
    log_info "=== 构建结果 ==="

    local failed=0 entry label code
    for entry in \
        "应用镜像:${app_result}" \
        "文档读取器镜像:${docreader_result}" \
        "前端镜像:${frontend_result}"; do
        label="${entry%%:*}"
        code="${entry##*:}"
        if [ "$code" -eq 0 ]; then
            log_success "✓ ${label}构建成功"
        else
            log_error "✗ ${label}构建失败"
            failed=1
        fi
    done

    if [ "$failed" -ne 0 ]; then
        log_error "部分镜像构建失败"
        return 1
    fi

    log_success "所有镜像构建完成!"
    return 0
}
|
||||
|
||||
# Clean up the local images.
# Stops and removes the containers whose ancestor is one of the three
# ${ARCH}-latest images, removes those images, then prunes dangling images.
# Failures of the individual stop/rm/rmi commands are ignored. Always returns 0.
clean_images() {
    log_info "清理本地WeKnora镜像..."

    local image
    local images=(
        "wechatopenai/weknora-app:${ARCH}-latest"
        "wechatopenai/weknora-docreader:${ARCH}-latest"
        "wechatopenai/weknora-ui:${ARCH}-latest"
    )

    # Stop the related containers
    log_info "停止相关容器..."
    for image in "${images[@]}"; do
        # The command substitution is left unquoted on purpose: every container ID is its own argument.
        docker stop $(docker ps -q --filter "ancestor=${image}" 2>/dev/null) 2>/dev/null || true
    done

    # Remove the related containers
    log_info "删除相关容器..."
    for image in "${images[@]}"; do
        docker rm $(docker ps -aq --filter "ancestor=${image}" 2>/dev/null) 2>/dev/null || true
    done

    # Remove the images
    log_info "删除本地镜像..."
    for image in "${images[@]}"; do
        docker rmi "${image}" 2>/dev/null || true
    done

    docker image prune -f

    log_success "镜像清理完成"
    return 0
}
|
||||
|
||||
# Parse the command line arguments.
# Each flag only records what was requested; the work is dispatched after parsing.
BUILD_ALL=false
BUILD_APP=false
BUILD_DOCREADER=false
BUILD_FRONTEND=false
CLEAN_IMAGES=false

# With no arguments, build all images by default.
if [ $# -eq 0 ]; then
    BUILD_ALL=true
fi

# Loop on the argument count rather than on "$1" being non-empty, so that an
# empty argument is reported as an unknown option instead of silently ending
# the parsing.
while [ $# -gt 0 ]; do
    case "$1" in
        -h | --help)
            show_help
            ;;
        -a | --all)
            BUILD_ALL=true
            ;;
        -p | --app)
            BUILD_APP=true
            ;;
        -d | --docreader)
            BUILD_DOCREADER=true
            ;;
        -f | --frontend)
            BUILD_FRONTEND=true
            ;;
        -c | --clean)
            CLEAN_IMAGES=true
            ;;
        -v | --version)
            show_version
            ;;
        *)
            log_error "未知选项: $1"
            # show_help ends with `exit 0`; run it in a subshell so that an
            # invalid invocation still terminates with a non-zero status.
            (show_help)
            exit 1
            ;;
    esac
    shift
done
|
||||
|
||||
# Check the Docker environment; nothing below can work without it.
check_docker || exit 1

# Detect the platform (sets PLATFORM and ARCH).
check_platform

# Cleanup takes precedence over any build request.
if [ "$CLEAN_IMAGES" = true ]; then
    clean_images
    exit $?
fi

# Build everything.
if [ "$BUILD_ALL" = true ]; then
    build_all_images
    exit $?
fi

# Build the individually requested images. Every requested image is built, even
# when several flags are combined or an earlier build fails; the exit status is
# 1 if any of them failed.
build_status=0

if [ "$BUILD_APP" = true ]; then
    build_app_image || build_status=1
fi

if [ "$BUILD_DOCREADER" = true ]; then
    build_docreader_image || build_status=1
fi

if [ "$BUILD_FRONTEND" = true ]; then
    build_frontend_image || build_status=1
fi

exit "$build_status"
|
||||
@@ -308,7 +308,7 @@ check_platform() {
|
||||
export PLATFORM="linux/amd64"
|
||||
export ARCH="amd64"
|
||||
fi
|
||||
log_info "当前平台:$PLATFORM"
|
||||
log_info "当前平台:$PLATFORM"
|
||||
}
|
||||
|
||||
# 启动Docker容器
|
||||
|
||||
Reference in New Issue
Block a user