diff --git a/Makefile b/Makefile index 609f8d6..ab06690 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,42 @@ -.PHONY: build run test clean docker-build docker-run migrate-up migrate-down docker-restart docker-stop start-all stop-all start-ollama stop-ollama +.PHONY: help build run test clean docker-build docker-run migrate-up migrate-down docker-restart docker-stop start-all stop-all start-ollama stop-ollama build-images build-images-app build-images-docreader build-images-frontend clean-images + +# Show help +help: + @echo "WeKnora Makefile 帮助" + @echo "" + @echo "基础命令:" + @echo " build 构建应用" + @echo " run 运行应用" + @echo " test 运行测试" + @echo " clean 清理构建文件" + @echo "" + @echo "Docker 命令:" + @echo " docker-build 构建 Docker 镜像" + @echo " docker-run 运行 Docker 容器" + @echo " docker-stop 停止 Docker 容器" + @echo " docker-restart 重启 Docker 容器" + @echo "" + @echo "服务管理:" + @echo " start-all 启动所有服务" + @echo " stop-all 停止所有服务" + @echo " start-ollama 仅启动 Ollama 服务" + @echo "" + @echo "镜像构建:" + @echo " build-images 从源码构建所有镜像" + @echo " build-images-app 从源码构建应用镜像" + @echo " build-images-docreader 从源码构建文档读取器镜像" + @echo " build-images-frontend 从源码构建前端镜像" + @echo " clean-images 清理本地镜像" + @echo "" + @echo "数据库:" + @echo " migrate-up 执行数据库迁移" + @echo " migrate-down 回滚数据库迁移" + @echo "" + @echo "开发工具:" + @echo " fmt 格式化代码" + @echo " lint 代码检查" + @echo " deps 安装依赖" + @echo " docs 生成 API 文档" # Go related variables BINARY_NAME=WeKnora @@ -53,6 +91,22 @@ stop-all: docker-stop: docker-compose down +# 从源码构建镜像相关命令 +build-images: + ./scripts/build_images.sh + +build-images-app: + ./scripts/build_images.sh --app + +build-images-docreader: + ./scripts/build_images.sh --docreader + +build-images-frontend: + ./scripts/build_images.sh --frontend + +clean-images: + ./scripts/build_images.sh --clean + # Restart Docker container (stop, rebuild, start) docker-restart: docker-compose stop -t 60 diff --git a/frontend/.dockerignore b/frontend/.dockerignore new file mode 100644 index 0000000..9e31322 --- 
/dev/null +++ b/frontend/.dockerignore @@ -0,0 +1,8 @@ +node_modules +dist +.git +.gitignore +README.md +.vscode +*.log +.DS_Store \ No newline at end of file diff --git a/internal/application/repository/chunk.go b/internal/application/repository/chunk.go index 053cfc5..2b01251 100644 --- a/internal/application/repository/chunk.go +++ b/internal/application/repository/chunk.go @@ -4,6 +4,7 @@ import ( "context" "errors" + "github.com/Tencent/WeKnora/internal/common" "github.com/Tencent/WeKnora/internal/types" "github.com/Tencent/WeKnora/internal/types/interfaces" "gorm.io/gorm" @@ -21,6 +22,9 @@ func NewChunkRepository(db *gorm.DB) interfaces.ChunkRepository { // CreateChunks creates multiple chunks in batches func (r *chunkRepository) CreateChunks(ctx context.Context, chunks []*types.Chunk) error { + for _, chunk := range chunks { + chunk.Content = common.CleanInvalidUTF8(chunk.Content) + } return r.db.WithContext(ctx).CreateInBatches(chunks, 100).Error } diff --git a/internal/application/repository/retriever/postgres/structs.go b/internal/application/repository/retriever/postgres/structs.go index 1f5eafd..45fbc0e 100644 --- a/internal/application/repository/retriever/postgres/structs.go +++ b/internal/application/repository/retriever/postgres/structs.go @@ -6,6 +6,7 @@ import ( "strconv" "time" + "github.com/Tencent/WeKnora/internal/common" "github.com/Tencent/WeKnora/internal/types" "github.com/pgvector/pgvector-go" ) @@ -59,7 +60,7 @@ func toDBVectorEmbedding(indexInfo *types.IndexInfo, additionalParams map[string ChunkID: indexInfo.ChunkID, KnowledgeID: indexInfo.KnowledgeID, KnowledgeBaseID: indexInfo.KnowledgeBaseID, - Content: indexInfo.Content, + Content: common.CleanInvalidUTF8(indexInfo.Content), } // Add embedding data if available in additionalParams if additionalParams != nil && slices.Contains(slices.Collect(maps.Keys(additionalParams)), "embedding") { diff --git a/internal/common/tools.go b/internal/common/tools.go index 3e79632..7811827 100644 --- 
// CleanInvalidUTF8 returns s with every byte that is not part of a valid
// UTF-8 sequence removed, along with all NUL (\x00) characters.
//
// A validly encoded U+FFFD replacement character is kept; only bytes that
// fail to decode are dropped. Strings that are already clean are returned
// unchanged without allocating.
func CleanInvalidUTF8(s string) string {
	// Fast path: nothing to strip, so avoid building a copy.
	if utf8.ValidString(s) && strings.IndexByte(s, 0) < 0 {
		return s
	}
	// Order matters: invalid bytes must be dropped before NULs. Removing the
	// NULs first could splice the bytes on either side of a NUL into a
	// sequence that decodes as a valid rune, keeping bytes that were invalid
	// in the original input.
	return strings.ReplaceAll(strings.ToValidUTF8(s, ""), "\x00", "")
}
#!/bin/bash
# Builds the WeKnora Docker images (app, docreader, frontend) from source.
#
# With no options every image is built. Image options may be combined: each
# selected image is built and the script exits non-zero if any build fails.
# Run with --help for the option list.

# Terminal colors
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
BLUE='\033[0;34m'
NC='\033[0m' # no color

# Project root is the parent of the directory that contains this script
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
PROJECT_ROOT="$( cd "$SCRIPT_DIR/.." && pwd )"

# Version info
VERSION="1.0.0"
SCRIPT_NAME=$(basename "$0")

# Print usage and exit.
#   $1 - exit status to use (default 0, so plain --help still succeeds)
show_help() {
    echo -e "${GREEN}WeKnora 镜像构建脚本 v${VERSION}${NC}"
    echo -e "${GREEN}用法:${NC} $0 [选项]"
    echo "选项:"
    echo "  -h, --help     显示帮助信息"
    echo "  -a, --all      构建所有镜像(默认)"
    echo "  -p, --app      仅构建应用镜像"
    echo "  -d, --docreader 仅构建文档读取器镜像"
    echo "  -f, --frontend 仅构建前端镜像"
    echo "  -c, --clean    清理所有本地镜像"
    echo "  -v, --version  显示版本信息"
    exit "${1:-0}"
}

# Print the script version and exit successfully.
show_version() {
    echo -e "${GREEN}WeKnora 镜像构建脚本 v${VERSION}${NC}"
    exit 0
}

# Logging helpers: each prints $1 behind a colored level tag.
log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

log_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

# Verify that the docker CLI exists and that `docker info` succeeds.
# Returns 0 when both checks pass, 1 otherwise.
check_docker() {
    log_info "检查Docker环境..."

    if ! command -v docker &> /dev/null; then
        log_error "未安装Docker,请先安装Docker"
        return 1
    fi

    # `docker info` fails when the daemon cannot be reached
    if ! docker info &> /dev/null; then
        log_error "Docker服务未运行,请启动Docker服务"
        return 1
    fi

    log_success "Docker环境检查通过"
    return 0
}

# Export PLATFORM and ARCH according to `uname -m`.
# Unrecognized machine types fall back to linux/amd64 with a warning.
check_platform() {
    log_info "检测系统平台信息..."
    local machine
    machine="$(uname -m)"
    case "$machine" in
        x86_64 )
            export PLATFORM="linux/amd64"
            export ARCH="amd64"
            ;;
        aarch64 | arm64 )
            export PLATFORM="linux/arm64"
            export ARCH="arm64"
            ;;
        * )
            log_warning "未识别的平台类型:${machine},将使用默认平台 linux/amd64"
            export PLATFORM="linux/amd64"
            export ARCH="amd64"
            ;;
    esac
    log_info "当前平台:$PLATFORM"
}

# Change into the project root so relative Dockerfile paths resolve.
# Returns 1 (after logging) when the directory cannot be entered, so a build
# is never started from the wrong working directory.
enter_project_root() {
    if ! cd "$PROJECT_ROOT"; then
        log_error "无法进入项目根目录: $PROJECT_ROOT"
        return 1
    fi
}

# Build the application image, tagged wechatopenai/weknora-app:${ARCH}-latest.
# GOPRIVATE / GOPROXY / GOSUMDB from the environment are forwarded as build
# args, with the defaults shown below when they are unset or empty.
build_app_image() {
    log_info "构建应用镜像 (weknora-app)..."

    enter_project_root || return 1

    if docker build \
        --platform "$PLATFORM" \
        --build-arg "GOPRIVATE_ARG=${GOPRIVATE:-}" \
        --build-arg "GOPROXY_ARG=${GOPROXY:-https://goproxy.cn,direct}" \
        --build-arg "GOSUMDB_ARG=${GOSUMDB:-off}" \
        -f docker/Dockerfile.app \
        -t "wechatopenai/weknora-app:${ARCH}-latest" \
        .; then
        log_success "应用镜像构建成功"
        return 0
    fi

    log_error "应用镜像构建失败"
    return 1
}

# Build the document reader image, tagged
# wechatopenai/weknora-docreader:${ARCH}-latest.
build_docreader_image() {
    log_info "构建文档读取器镜像 (weknora-docreader)..."

    enter_project_root || return 1

    if docker build \
        --platform "$PLATFORM" \
        --build-arg "PLATFORM=$PLATFORM" \
        -f docker/Dockerfile.docreader \
        -t "wechatopenai/weknora-docreader:${ARCH}-latest" \
        .; then
        log_success "文档读取器镜像构建成功"
        return 0
    fi

    log_error "文档读取器镜像构建失败"
    return 1
}

# Build the frontend image, tagged wechatopenai/weknora-ui:${ARCH}-latest,
# using frontend/ as the build context.
build_frontend_image() {
    log_info "构建前端镜像 (weknora-ui)..."

    enter_project_root || return 1

    if docker build \
        --platform "$PLATFORM" \
        -f frontend/Dockerfile \
        -t "wechatopenai/weknora-ui:${ARCH}-latest" \
        frontend/; then
        log_success "前端镜像构建成功"
        return 0
    fi

    log_error "前端镜像构建失败"
    return 1
}

# Build all three images, continuing past failures, then print a summary.
# Returns 0 only when every build succeeded.
build_all_images() {
    log_info "开始构建所有镜像..."

    local app_result=0
    local docreader_result=0
    local frontend_result=0

    build_app_image || app_result=1
    build_docreader_image || docreader_result=1
    build_frontend_image || frontend_result=1

    # Per-image summary
    echo ""
    log_info "=== 构建结果 ==="
    if [ "$app_result" -eq 0 ]; then
        log_success "✓ 应用镜像构建成功"
    else
        log_error "✗ 应用镜像构建失败"
    fi

    if [ "$docreader_result" -eq 0 ]; then
        log_success "✓ 文档读取器镜像构建成功"
    else
        log_error "✗ 文档读取器镜像构建失败"
    fi

    if [ "$frontend_result" -eq 0 ]; then
        log_success "✓ 前端镜像构建成功"
    else
        log_error "✗ 前端镜像构建失败"
    fi

    if [ "$app_result" -eq 0 ] && [ "$docreader_result" -eq 0 ] && [ "$frontend_result" -eq 0 ]; then
        log_success "所有镜像构建完成!"
        return 0
    fi

    log_error "部分镜像构建失败"
    return 1
}

# Stop and remove containers created from the locally built images, remove
# the images, then prune dangling images. Best effort: failures of individual
# docker commands are ignored and the function always returns 0.
clean_images() {
    log_info "清理本地WeKnora镜像..."

    local images=(
        "wechatopenai/weknora-app:${ARCH}-latest"
        "wechatopenai/weknora-docreader:${ARCH}-latest"
        "wechatopenai/weknora-ui:${ARCH}-latest"
    )
    local image ids

    # Stop running containers
    log_info "停止相关容器..."
    for image in "${images[@]}"; do
        ids=$(docker ps -q --filter "ancestor=${image}" 2>/dev/null)
        if [ -n "$ids" ]; then
            # $ids is intentionally unquoted: one container ID per word
            docker stop $ids 2>/dev/null || true
        fi
    done

    # Remove containers, including stopped ones
    log_info "删除相关容器..."
    for image in "${images[@]}"; do
        ids=$(docker ps -aq --filter "ancestor=${image}" 2>/dev/null)
        if [ -n "$ids" ]; then
            docker rm $ids 2>/dev/null || true
        fi
    done

    # Remove the images themselves
    log_info "删除本地镜像..."
    for image in "${images[@]}"; do
        docker rmi "$image" 2>/dev/null || true
    done

    docker image prune -f

    log_success "镜像清理完成"
    return 0
}

# ---- Command-line parsing ----
BUILD_ALL=false
BUILD_APP=false
BUILD_DOCREADER=false
BUILD_FRONTEND=false
CLEAN_IMAGES=false

# No arguments: build everything
if [ $# -eq 0 ]; then
    BUILD_ALL=true
fi

# Loop on the argument count rather than on "$1" being non-empty, so an empty
# argument is reported as an unknown option instead of ending parsing early.
while [ $# -gt 0 ]; do
    case "$1" in
        -h | --help )       show_help
                            ;;
        -a | --all )        BUILD_ALL=true
                            ;;
        -p | --app )        BUILD_APP=true
                            ;;
        -d | --docreader )  BUILD_DOCREADER=true
                            ;;
        -f | --frontend )   BUILD_FRONTEND=true
                            ;;
        -c | --clean )      CLEAN_IMAGES=true
                            ;;
        -v | --version )    show_version
                            ;;
        * )                 log_error "未知选项: $1"
                            # Invalid usage must not look like success
                            show_help 1
                            ;;
    esac
    shift
done

# ---- Main ----
check_docker || exit 1

check_platform

# Cleaning takes precedence over any build option
if [ "$CLEAN_IMAGES" = true ]; then
    clean_images
    exit $?
fi

if [ "$BUILD_ALL" = true ]; then
    build_all_images
    exit $?
fi

# Build every individually selected image; remember any failure so that
# combined flags (e.g. --app --frontend) are all honored.
status=0

if [ "$BUILD_APP" = true ]; then
    build_app_image || status=1
fi

if [ "$BUILD_DOCREADER" = true ]; then
    build_docreader_image || status=1
fi

if [ "$BUILD_FRONTEND" = true ]; then
    build_frontend_image || status=1
fi

exit "$status"