fix: ignore non-utf8 words when saving to db

This commit is contained in:
wizardchen
2025-08-21 19:47:12 +08:00
committed by lyingbug
parent 53d8b13aad
commit 1eed11fc4f
7 changed files with 404 additions and 3 deletions

View File

@@ -1,4 +1,42 @@
.PHONY: build run test clean docker-build docker-run migrate-up migrate-down docker-restart docker-stop start-all stop-all start-ollama stop-ollama
.PHONY: help build run test clean docker-build docker-run migrate-up migrate-down docker-restart docker-stop start-all stop-all start-ollama stop-ollama build-images build-images-app build-images-docreader build-images-frontend clean-images
# help: print an overview of the make targets offered by this Makefile,
# grouped by category (basic, Docker, service management, image builds,
# database, developer tools). It only echoes text; nothing is built or run.
help:
@echo "WeKnora Makefile 帮助"
@echo ""
@echo "基础命令:"
@echo " build 构建应用"
@echo " run 运行应用"
@echo " test 运行测试"
@echo " clean 清理构建文件"
@echo ""
@echo "Docker 命令:"
@echo " docker-build 构建 Docker 镜像"
@echo " docker-run 运行 Docker 容器"
@echo " docker-stop 停止 Docker 容器"
@echo " docker-restart 重启 Docker 容器"
@echo ""
@echo "服务管理:"
@echo " start-all 启动所有服务"
@echo " stop-all 停止所有服务"
@echo " start-ollama 仅启动 Ollama 服务"
@echo ""
@echo "镜像构建:"
@echo " build-images 从源码构建所有镜像"
@echo " build-images-app 从源码构建应用镜像"
@echo " build-images-docreader 从源码构建文档读取器镜像"
@echo " build-images-frontend 从源码构建前端镜像"
@echo " clean-images 清理本地镜像"
@echo ""
@echo "数据库:"
@echo " migrate-up 执行数据库迁移"
@echo " migrate-down 回滚数据库迁移"
@echo ""
@echo "开发工具:"
@echo " fmt 格式化代码"
@echo " lint 代码检查"
@echo " deps 安装依赖"
@echo " docs 生成 API 文档"
# Go related variables
BINARY_NAME=WeKnora
@@ -53,6 +91,22 @@ stop-all:
docker-stop:
docker-compose down
# Targets for building Docker images from source. Each one delegates to
# scripts/build_images.sh: no flag builds every image, --app / --docreader /
# --frontend build a single image, and --clean removes the local images.
build-images:
./scripts/build_images.sh
build-images-app:
./scripts/build_images.sh --app
build-images-docreader:
./scripts/build_images.sh --docreader
build-images-frontend:
./scripts/build_images.sh --frontend
clean-images:
./scripts/build_images.sh --clean
# Restart Docker container (stop, rebuild, start)
docker-restart:
docker-compose stop -t 60

8
frontend/.dockerignore Normal file
View File

@@ -0,0 +1,8 @@
node_modules
dist
.git
.gitignore
README.md
.vscode
*.log
.DS_Store

View File

@@ -4,6 +4,7 @@ import (
"context"
"errors"
"github.com/Tencent/WeKnora/internal/common"
"github.com/Tencent/WeKnora/internal/types"
"github.com/Tencent/WeKnora/internal/types/interfaces"
"gorm.io/gorm"
@@ -21,6 +22,9 @@ func NewChunkRepository(db *gorm.DB) interfaces.ChunkRepository {
// CreateChunks sanitizes the Content of every chunk with
// common.CleanInvalidUTF8 (the chunks are modified in place) and then inserts
// them through gorm in batches of 100. It returns the error reported by the
// batch insert, if any.
func (r *chunkRepository) CreateChunks(ctx context.Context, chunks []*types.Chunk) error {
	for i := range chunks {
		chunks[i].Content = common.CleanInvalidUTF8(chunks[i].Content)
	}
	result := r.db.WithContext(ctx).CreateInBatches(chunks, 100)
	return result.Error
}

View File

@@ -6,6 +6,7 @@ import (
"strconv"
"time"
"github.com/Tencent/WeKnora/internal/common"
"github.com/Tencent/WeKnora/internal/types"
"github.com/pgvector/pgvector-go"
)
@@ -59,7 +60,7 @@ func toDBVectorEmbedding(indexInfo *types.IndexInfo, additionalParams map[string
ChunkID: indexInfo.ChunkID,
KnowledgeID: indexInfo.KnowledgeID,
KnowledgeBaseID: indexInfo.KnowledgeBaseID,
Content: indexInfo.Content,
Content: common.CleanInvalidUTF8(indexInfo.Content),
}
// Add embedding data if available in additionalParams
if additionalParams != nil && slices.Contains(slices.Collect(maps.Keys(additionalParams)), "embedding") {

View File

@@ -6,6 +6,7 @@ import (
"regexp"
"slices"
"strings"
"unicode/utf8"
)
// ToInterfaceSlice converts a slice of strings to a slice of empty interfaces.
@@ -73,3 +74,27 @@ func ParseLLMJsonResponse(content string, target interface{}) error {
// If no code block found, return the original error
return err
}
// CleanInvalidUTF8 returns s with every byte that is not part of a valid
// UTF-8 sequence removed, and with every NUL character (\x00) removed.
//
// A correctly encoded U+FFFD replacement character is valid UTF-8 and is
// kept; only undecodable bytes are dropped. When s is already valid and
// contains no NUL, s itself is returned without allocating.
func CleanInvalidUTF8(s string) string {
	// Fast path: most inputs are already clean, so avoid building a copy.
	if utf8.ValidString(s) && strings.IndexByte(s, 0) < 0 {
		return s
	}

	var b strings.Builder
	b.Grow(len(s))
	for i := 0; i < len(s); {
		r, size := utf8.DecodeRuneInString(s[i:])
		// size is 1 for an undecodable byte, so advancing by size covers
		// both the valid and the invalid case.
		i += size
		switch {
		case r == utf8.RuneError && size == 1:
			// Invalid UTF-8 byte: drop it. (A real U+FFFD has size 3.)
		case r == 0:
			// NUL character: drop it.
		default:
			b.WriteRune(r)
		}
	}
	return b.String()
}

309
scripts/build_images.sh Executable file
View File

@@ -0,0 +1,309 @@
#!/bin/bash
# Builds all WeKnora Docker images from source.
# ANSI color escape sequences used by the echo -e based output helpers.
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
BLUE='\033[0;34m'
NC='\033[0m' # no color (reset)
# Resolve the project root: the parent of the directory containing this script.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
PROJECT_ROOT="$( cd "$SCRIPT_DIR/.." && pwd )"
# Version information shown by the help and version output.
VERSION="1.0.0"
# NOTE(review): SCRIPT_NAME does not appear to be referenced elsewhere in this script.
SCRIPT_NAME=$(basename "$0")
# Print the usage banner and the option list, then terminate the script
# with exit status 0.
show_help() {
    # The banner lines contain color escapes, so they need escape processing.
    printf '%b\n' \
        "${GREEN}WeKnora 镜像构建脚本 v${VERSION}${NC}" \
        "${GREEN}用法:${NC} $0 [选项]"
    # The option list is printed verbatim.
    printf '%s\n' \
        "选项:" \
        " -h, --help 显示帮助信息" \
        " -a, --all 构建所有镜像(默认)" \
        " -p, --app 仅构建应用镜像" \
        " -d, --docreader 仅构建文档读取器镜像" \
        " -f, --frontend 仅构建前端镜像" \
        " -c, --clean 清理所有本地镜像" \
        " -v, --version 显示版本信息"
    exit 0
}
# Print the colored version banner, then terminate the script with status 0.
show_version() {
    local banner="${GREEN}WeKnora 镜像构建脚本 v${VERSION}${NC}"
    printf '%b\n' "$banner"
    exit 0
}
# Logging helpers. Each prints "[LEVEL] message" to stdout with the level tag
# colored; backslash escapes in the message are interpreted.

# Shared formatter: $1 = color escape, $2 = level tag, $3 = message.
_log_line() {
    printf '%b\n' "${1}[${2}]${NC} ${3}"
}

log_info() {
    _log_line "$BLUE" "INFO" "$1"
}

log_warning() {
    _log_line "$YELLOW" "WARNING" "$1"
}

log_error() {
    _log_line "$RED" "ERROR" "$1"
}

log_success() {
    _log_line "$GREEN" "SUCCESS" "$1"
}
# Verify that the docker CLI is on PATH and that `docker info` succeeds.
# Returns 0 when both checks pass, 1 otherwise (after logging the reason).
check_docker() {
    log_info "检查Docker环境..."

    # Is the docker binary available at all?
    command -v docker >/dev/null 2>&1 || {
        log_error "未安装Docker请先安装Docker"
        return 1
    }

    # Does `docker info` succeed?
    docker info >/dev/null 2>&1 || {
        log_error "Docker服务未运行请启动Docker服务"
        return 1
    }

    log_success "Docker环境检查通过"
    return 0
}
# Map the machine architecture reported by `uname -m` to the exported
# PLATFORM / ARCH variables used by the build and clean functions.
# Unrecognized architectures fall back to linux/amd64 with a warning.
check_platform() {
    log_info "检测系统平台信息..."

    local machine
    machine="$(uname -m)"

    case "$machine" in
        x86_64)
            export PLATFORM="linux/amd64"
            export ARCH="amd64"
            ;;
        aarch64 | arm64)
            export PLATFORM="linux/arm64"
            export ARCH="arm64"
            ;;
        *)
            log_warning "未识别的平台类型:${machine},将使用默认平台 linux/amd64"
            export PLATFORM="linux/amd64"
            export ARCH="amd64"
            ;;
    esac

    log_info "当前平台:$PLATFORM"
}
# Build the application image (wechatopenai/weknora-app:${ARCH}-latest) from
# docker/Dockerfile.app, using the project root as build context.
# GOPRIVATE / GOPROXY / GOSUMDB from the environment are forwarded as build
# args, with defaults when unset or empty.
# Returns 0 on success, 1 on failure.
build_app_image() {
    log_info "构建应用镜像 (weknora-app)..."

    # Refuse to build from the wrong directory if the project root is unreachable.
    cd "$PROJECT_ROOT" || {
        log_error "无法进入项目根目录: $PROJECT_ROOT"
        return 1
    }

    # Every expansion is quoted so values with spaces or globs stay one argument.
    if docker build \
        --platform "$PLATFORM" \
        --build-arg "GOPRIVATE_ARG=${GOPRIVATE:-}" \
        --build-arg "GOPROXY_ARG=${GOPROXY:-https://goproxy.cn,direct}" \
        --build-arg "GOSUMDB_ARG=${GOSUMDB:-off}" \
        -f docker/Dockerfile.app \
        -t "wechatopenai/weknora-app:${ARCH}-latest" \
        .; then
        log_success "应用镜像构建成功"
        return 0
    fi

    log_error "应用镜像构建失败"
    return 1
}
# Build the document reader image
# (wechatopenai/weknora-docreader:${ARCH}-latest) from
# docker/Dockerfile.docreader, using the project root as build context.
# PLATFORM is also forwarded to the Dockerfile as a build arg.
# Returns 0 on success, 1 on failure.
build_docreader_image() {
    log_info "构建文档读取器镜像 (weknora-docreader)..."

    # Refuse to build from the wrong directory if the project root is unreachable.
    cd "$PROJECT_ROOT" || {
        log_error "无法进入项目根目录: $PROJECT_ROOT"
        return 1
    }

    # Every expansion is quoted so it is always passed as a single argument.
    if docker build \
        --platform "$PLATFORM" \
        --build-arg "PLATFORM=$PLATFORM" \
        -f docker/Dockerfile.docreader \
        -t "wechatopenai/weknora-docreader:${ARCH}-latest" \
        .; then
        log_success "文档读取器镜像构建成功"
        return 0
    fi

    log_error "文档读取器镜像构建失败"
    return 1
}
# Build the frontend image (wechatopenai/weknora-ui:${ARCH}-latest) from
# frontend/Dockerfile, using the frontend/ directory as build context.
# Returns 0 on success, 1 on failure.
build_frontend_image() {
    log_info "构建前端镜像 (weknora-ui)..."

    # Refuse to build from the wrong directory if the project root is unreachable.
    cd "$PROJECT_ROOT" || {
        log_error "无法进入项目根目录: $PROJECT_ROOT"
        return 1
    }

    # Every expansion is quoted so it is always passed as a single argument.
    if docker build \
        --platform "$PLATFORM" \
        -f frontend/Dockerfile \
        -t "wechatopenai/weknora-ui:${ARCH}-latest" \
        frontend/; then
        log_success "前端镜像构建成功"
        return 0
    fi

    log_error "前端镜像构建失败"
    return 1
}
# Build the app, docreader and frontend images one after another. A failing
# build does not stop the remaining ones. A per-image summary is printed at
# the end. Returns 0 only when all three builds succeeded, 1 otherwise.
build_all_images() {
    log_info "开始构建所有镜像..."

    local rc_app rc_docreader rc_frontend

    # Run every build and remember each exit status.
    build_app_image
    rc_app=$?
    build_docreader_image
    rc_docreader=$?
    build_frontend_image
    rc_frontend=$?

    # Summary
    echo ""
    log_info "=== 构建结果 ==="

    if [ "$rc_app" -ne 0 ]; then
        log_error "✗ 应用镜像构建失败"
    else
        log_success "✓ 应用镜像构建成功"
    fi

    if [ "$rc_docreader" -ne 0 ]; then
        log_error "✗ 文档读取器镜像构建失败"
    else
        log_success "✓ 文档读取器镜像构建成功"
    fi

    if [ "$rc_frontend" -ne 0 ]; then
        log_error "✗ 前端镜像构建失败"
    else
        log_success "✓ 前端镜像构建成功"
    fi

    # Exit statuses are never negative, so the sum is zero only if all are zero.
    if [ $((rc_app + rc_docreader + rc_frontend)) -ne 0 ]; then
        log_error "部分镜像构建失败"
        return 1
    fi

    log_success "所有镜像构建完成!"
    return 0
}
# Remove the locally built WeKnora images for the current ARCH: stop
# containers created from them, delete those containers, delete the images,
# and finally prune dangling images. Errors from the individual stop / rm /
# rmi commands are ignored. Always returns 0.
clean_images() {
    log_info "清理本地WeKnora镜像..."

    local image
    local -a images=(
        "wechatopenai/weknora-app:${ARCH}-latest"
        "wechatopenai/weknora-docreader:${ARCH}-latest"
        "wechatopenai/weknora-ui:${ARCH}-latest"
    )

    # Stop running containers created from each image.
    # The command substitution is intentionally unquoted: one word per container ID.
    log_info "停止相关容器..."
    for image in "${images[@]}"; do
        docker stop $(docker ps -q --filter "ancestor=${image}" 2>/dev/null) 2>/dev/null || true
    done

    # Delete the containers (running or not) created from each image.
    log_info "删除相关容器..."
    for image in "${images[@]}"; do
        docker rm $(docker ps -aq --filter "ancestor=${image}" 2>/dev/null) 2>/dev/null || true
    done

    # Delete the images themselves, then prune dangling images.
    log_info "删除本地镜像..."
    for image in "${images[@]}"; do
        docker rmi ${image} 2>/dev/null || true
    done
    docker image prune -f

    log_success "镜像清理完成"
    return 0
}
# Parse command-line arguments into the BUILD_* / CLEAN_IMAGES flags.
BUILD_ALL=false
BUILD_APP=false
BUILD_DOCREADER=false
BUILD_FRONTEND=false
CLEAN_IMAGES=false

# With no arguments at all, default to building every image.
if [ $# -eq 0 ]; then
    BUILD_ALL=true
fi

# Loop on the argument count rather than on "$1" being non-empty, so an
# empty-string argument is reported as an unknown option instead of silently
# ending the parse.
while [ $# -gt 0 ]; do
    case "$1" in
        -h | --help)
            show_help
            ;;
        -a | --all)
            BUILD_ALL=true
            ;;
        -p | --app)
            BUILD_APP=true
            ;;
        -d | --docreader)
            BUILD_DOCREADER=true
            ;;
        -f | --frontend)
            BUILD_FRONTEND=true
            ;;
        -c | --clean)
            CLEAN_IMAGES=true
            ;;
        -v | --version)
            show_version
            ;;
        *)
            log_error "未知选项: $1"
            # show_help ends with `exit 0`; run it in a subshell so only the
            # subshell exits, then report the usage error with a non-zero status.
            (show_help)
            exit 1
            ;;
    esac
    shift
done
# Main flow: verify the Docker environment, detect the platform, then run
# the requested action.

# Abort when Docker is missing or its daemon is not reachable.
check_docker || exit 1

# Detect the platform (exports PLATFORM and ARCH).
check_platform

# Cleaning takes precedence over any build flag.
if [ "$CLEAN_IMAGES" = true ]; then
    clean_images
    exit $?
fi

# --all (or no arguments) builds everything and reports a combined status.
if [ "$BUILD_ALL" = true ]; then
    build_all_images
    exit $?
fi

# Selective builds: honor every flag that was given (e.g. --app --frontend)
# instead of exiting after the first one. The exit status is 1 if any
# selected build failed, 0 otherwise.
status=0
if [ "$BUILD_APP" = true ]; then
    build_app_image || status=1
fi
if [ "$BUILD_DOCREADER" = true ]; then
    build_docreader_image || status=1
fi
if [ "$BUILD_FRONTEND" = true ]; then
    build_frontend_image || status=1
fi

exit "$status"

View File

@@ -308,7 +308,7 @@ check_platform() {
export PLATFORM="linux/amd64"
export ARCH="amd64"
fi
log_info "当前平台:$PLATFORM"
log_info "当前平台:$PLATFORM"
}
# 启动Docker容器