|
|
|
|
@@ -29,6 +29,7 @@ import (
|
|
|
|
|
"github.com/Tencent/WeKnora/services/docreader/src/proto"
|
|
|
|
|
"github.com/google/uuid"
|
|
|
|
|
"go.opentelemetry.io/otel/attribute"
|
|
|
|
|
"golang.org/x/sync/errgroup"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// Error definitions for knowledge service operations
|
|
|
|
|
@@ -43,6 +44,8 @@ var (
|
|
|
|
|
ErrDuplicateFile = errors.New("file already exists")
|
|
|
|
|
// ErrDuplicateURL is returned when trying to add a URL that already exists
|
|
|
|
|
ErrDuplicateURL = errors.New("URL already exists")
|
|
|
|
|
// ErrImageNotParse is returned when trying to update image information without enabling multimodel
|
|
|
|
|
ErrImageNotParse = errors.New("image not parse without enable multimodel")
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// knowledgeService implements the knowledge service interface
|
|
|
|
|
@@ -86,7 +89,7 @@ func NewKnowledgeService(
|
|
|
|
|
|
|
|
|
|
// CreateKnowledgeFromFile creates a knowledge entry from an uploaded file
|
|
|
|
|
func (s *knowledgeService) CreateKnowledgeFromFile(ctx context.Context,
|
|
|
|
|
kbID string, file *multipart.FileHeader, metadata map[string]string,
|
|
|
|
|
kbID string, file *multipart.FileHeader, metadata map[string]string, enableMultimodel *bool,
|
|
|
|
|
) (*types.Knowledge, error) {
|
|
|
|
|
logger.Info(ctx, "Start creating knowledge from file")
|
|
|
|
|
logger.Infof(ctx, "Knowledge base ID: %s, file: %s", kbID, file.Filename)
|
|
|
|
|
@@ -203,7 +206,10 @@ func (s *knowledgeService) CreateKnowledgeFromFile(ctx context.Context,
|
|
|
|
|
// Process document asynchronously
|
|
|
|
|
logger.Info(ctx, "Starting asynchronous document processing")
|
|
|
|
|
newCtx := logger.CloneContext(ctx)
|
|
|
|
|
go s.processDocument(newCtx, kb, knowledge, file)
|
|
|
|
|
if enableMultimodel == nil {
|
|
|
|
|
enableMultimodel = &kb.ChunkingConfig.EnableMultimodal
|
|
|
|
|
}
|
|
|
|
|
go s.processDocument(newCtx, kb, knowledge, file, *enableMultimodel)
|
|
|
|
|
|
|
|
|
|
logger.Infof(ctx, "Knowledge from file created successfully, ID: %s", knowledge.ID)
|
|
|
|
|
return knowledge, nil
|
|
|
|
|
@@ -211,7 +217,7 @@ func (s *knowledgeService) CreateKnowledgeFromFile(ctx context.Context,
|
|
|
|
|
|
|
|
|
|
// CreateKnowledgeFromURL creates a knowledge entry from a URL source
|
|
|
|
|
func (s *knowledgeService) CreateKnowledgeFromURL(ctx context.Context,
|
|
|
|
|
kbID string, url string,
|
|
|
|
|
kbID string, url string, enableMultimodel *bool,
|
|
|
|
|
) (*types.Knowledge, error) {
|
|
|
|
|
logger.Info(ctx, "Start creating knowledge from URL")
|
|
|
|
|
logger.Infof(ctx, "Knowledge base ID: %s, URL: %s", kbID, url)
|
|
|
|
|
@@ -285,7 +291,10 @@ func (s *knowledgeService) CreateKnowledgeFromURL(ctx context.Context,
|
|
|
|
|
|
|
|
|
|
// Process URL asynchronously
|
|
|
|
|
logger.Info(ctx, "Starting asynchronous URL processing")
|
|
|
|
|
go s.processDocumentFromURL(ctx, kb, knowledge, url)
|
|
|
|
|
if enableMultimodel == nil {
|
|
|
|
|
enableMultimodel = &kb.ChunkingConfig.EnableMultimodal
|
|
|
|
|
}
|
|
|
|
|
go s.processDocumentFromURL(ctx, kb, knowledge, url, *enableMultimodel)
|
|
|
|
|
|
|
|
|
|
logger.Infof(ctx, "Knowledge from URL created successfully, ID: %s", knowledge.ID)
|
|
|
|
|
return knowledge, nil
|
|
|
|
|
@@ -383,33 +392,53 @@ func (s *knowledgeService) DeleteKnowledge(ctx context.Context, id string) error
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
wg := errgroup.Group{}
|
|
|
|
|
// Delete knowledge embeddings from vector store
|
|
|
|
|
tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
|
|
|
|
|
retrieveEngine, err := retriever.NewCompositeRetrieveEngine(tenantInfo.RetrieverEngines.Engines)
|
|
|
|
|
if err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete knowledge embedding failed")
|
|
|
|
|
}
|
|
|
|
|
embeddingModel, err := s.modelService.GetEmbeddingModel(ctx, knowledge.EmbeddingModelID)
|
|
|
|
|
if err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete knowledge embedding failed")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if err := retrieveEngine.DeleteByKnowledgeIDList(ctx, []string{id}, embeddingModel.GetDimensions()); err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete knowledge embedding failed")
|
|
|
|
|
}
|
|
|
|
|
// Delete all chunks associated with this knowledge
|
|
|
|
|
if err := s.chunkService.DeleteChunksByKnowledgeID(ctx, id); err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete chunks failed")
|
|
|
|
|
}
|
|
|
|
|
// Delete the physical file if it exists
|
|
|
|
|
if knowledge.FilePath != "" {
|
|
|
|
|
if err := s.fileSvc.DeleteFile(ctx, knowledge.FilePath); err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete file failed")
|
|
|
|
|
wg.Go(func() error {
|
|
|
|
|
tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
|
|
|
|
|
retrieveEngine, err := retriever.NewCompositeRetrieveEngine(tenantInfo.RetrieverEngines.Engines)
|
|
|
|
|
if err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete knowledge embedding failed")
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
tenantInfo.StorageUsed -= knowledge.StorageSize
|
|
|
|
|
if err := s.tenantRepo.AdjustStorageUsed(ctx, tenantInfo.ID, -knowledge.StorageSize); err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge update tenant storage used failed")
|
|
|
|
|
embeddingModel, err := s.modelService.GetEmbeddingModel(ctx, knowledge.EmbeddingModelID)
|
|
|
|
|
if err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete knowledge embedding failed")
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
if err := retrieveEngine.DeleteByKnowledgeIDList(ctx, []string{knowledge.ID}, embeddingModel.GetDimensions()); err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete knowledge embedding failed")
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
// Delete all chunks associated with this knowledge
|
|
|
|
|
wg.Go(func() error {
|
|
|
|
|
if err := s.chunkService.DeleteChunksByKnowledgeID(ctx, knowledge.ID); err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete chunks failed")
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
// Delete the physical file if it exists
|
|
|
|
|
wg.Go(func() error {
|
|
|
|
|
if knowledge.FilePath != "" {
|
|
|
|
|
if err := s.fileSvc.DeleteFile(ctx, knowledge.FilePath); err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete file failed")
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
|
|
|
|
|
tenantInfo.StorageUsed -= knowledge.StorageSize
|
|
|
|
|
if err := s.tenantRepo.AdjustStorageUsed(ctx, tenantInfo.ID, -knowledge.StorageSize); err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge update tenant storage used failed")
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
if err = wg.Wait(); err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
// Delete the knowledge entry itself from the database
|
|
|
|
|
return s.repo.DeleteKnowledge(ctx, ctx.Value(types.TenantIDContextKey).(uint), id)
|
|
|
|
|
@@ -420,55 +449,70 @@ func (s *knowledgeService) DeleteKnowledgeList(ctx context.Context, ids []string
|
|
|
|
|
if len(ids) == 0 {
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
|
|
|
|
|
// 1. Get the knowledge entry
|
|
|
|
|
tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
|
|
|
|
|
knowledgeList, err := s.repo.GetKnowledgeBatch(ctx, tenantInfo.ID, ids)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
wg := errgroup.Group{}
|
|
|
|
|
// 2. Delete knowledge embeddings from vector store
|
|
|
|
|
retrieveEngine, err := retriever.NewCompositeRetrieveEngine(tenantInfo.RetrieverEngines.Engines)
|
|
|
|
|
if err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete knowledge embedding failed")
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
group := map[string][]string{}
|
|
|
|
|
for _, knowledge := range knowledgeList {
|
|
|
|
|
group[knowledge.EmbeddingModelID] = append(group[knowledge.EmbeddingModelID], knowledge.ID)
|
|
|
|
|
}
|
|
|
|
|
for embeddingModelID, knowledgeList := range group {
|
|
|
|
|
embeddingModel, err := s.modelService.GetEmbeddingModel(ctx, embeddingModelID)
|
|
|
|
|
wg.Go(func() error {
|
|
|
|
|
tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
|
|
|
|
|
retrieveEngine, err := retriever.NewCompositeRetrieveEngine(tenantInfo.RetrieverEngines.Engines)
|
|
|
|
|
if err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge get embedding model failed")
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if err := retrieveEngine.DeleteByKnowledgeIDList(ctx, knowledgeList, embeddingModel.GetDimensions()); err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete knowledge embedding failed")
|
|
|
|
|
continue
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 3. Delete all chunks associated with this knowledge
|
|
|
|
|
if err := s.chunkService.DeleteByKnowledgeList(ctx, ids); err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete chunks failed")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 4. Delete the physical file if it exists
|
|
|
|
|
storageAdjust := int64(0)
|
|
|
|
|
for _, knowledge := range knowledgeList {
|
|
|
|
|
if knowledge.FilePath != "" {
|
|
|
|
|
if err := s.fileSvc.DeleteFile(ctx, knowledge.FilePath); err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete file failed")
|
|
|
|
|
group := map[string][]string{}
|
|
|
|
|
for _, knowledge := range knowledgeList {
|
|
|
|
|
group[knowledge.EmbeddingModelID] = append(group[knowledge.EmbeddingModelID], knowledge.ID)
|
|
|
|
|
}
|
|
|
|
|
for embeddingModelID, knowledgeList := range group {
|
|
|
|
|
embeddingModel, err := s.modelService.GetEmbeddingModel(ctx, embeddingModelID)
|
|
|
|
|
if err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge get embedding model failed")
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
if err := retrieveEngine.DeleteByKnowledgeIDList(ctx, knowledgeList, embeddingModel.GetDimensions()); err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete knowledge embedding failed")
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
storageAdjust -= knowledge.StorageSize
|
|
|
|
|
}
|
|
|
|
|
tenantInfo.StorageUsed += storageAdjust
|
|
|
|
|
if err := s.tenantRepo.AdjustStorageUsed(ctx, tenantInfo.ID, storageAdjust); err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge update tenant storage used failed")
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
// 3. Delete all chunks associated with this knowledge
|
|
|
|
|
wg.Go(func() error {
|
|
|
|
|
if err := s.chunkService.DeleteByKnowledgeList(ctx, ids); err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete chunks failed")
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
// 4. Delete the physical file if it exists
|
|
|
|
|
wg.Go(func() error {
|
|
|
|
|
storageAdjust := int64(0)
|
|
|
|
|
for _, knowledge := range knowledgeList {
|
|
|
|
|
if knowledge.FilePath != "" {
|
|
|
|
|
if err := s.fileSvc.DeleteFile(ctx, knowledge.FilePath); err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete file failed")
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
storageAdjust -= knowledge.StorageSize
|
|
|
|
|
}
|
|
|
|
|
tenantInfo.StorageUsed += storageAdjust
|
|
|
|
|
if err := s.tenantRepo.AdjustStorageUsed(ctx, tenantInfo.ID, storageAdjust); err != nil {
|
|
|
|
|
logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge update tenant storage used failed")
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
if err = wg.Wait(); err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
// 5. Delete the knowledge entry itself from the database
|
|
|
|
|
return s.repo.DeleteKnowledgeList(ctx, tenantInfo.ID, ids)
|
|
|
|
|
}
|
|
|
|
|
@@ -531,8 +575,10 @@ func (s *knowledgeService) cloneKnowledge(ctx context.Context, src *types.Knowle
|
|
|
|
|
|
|
|
|
|
// processDocument handles asynchronous processing of document files
|
|
|
|
|
func (s *knowledgeService) processDocument(ctx context.Context,
|
|
|
|
|
kb *types.KnowledgeBase, knowledge *types.Knowledge, file *multipart.FileHeader,
|
|
|
|
|
kb *types.KnowledgeBase, knowledge *types.Knowledge, file *multipart.FileHeader, enableMultimodel bool,
|
|
|
|
|
) {
|
|
|
|
|
logger.GetLogger(ctx).Infof("processDocument enableMultimodel: %v", enableMultimodel)
|
|
|
|
|
|
|
|
|
|
ctx, span := tracing.ContextWithSpan(ctx, "knowledgeService.processDocument")
|
|
|
|
|
defer span.End()
|
|
|
|
|
span.SetAttributes(
|
|
|
|
|
@@ -545,7 +591,17 @@ func (s *knowledgeService) processDocument(ctx context.Context,
|
|
|
|
|
attribute.String("file_path", knowledge.FilePath),
|
|
|
|
|
attribute.Int64("file_size", knowledge.FileSize),
|
|
|
|
|
attribute.String("embedding_model", knowledge.EmbeddingModelID),
|
|
|
|
|
attribute.Bool("enable_multimodal", enableMultimodel),
|
|
|
|
|
)
|
|
|
|
|
if !enableMultimodel && IsImageType(knowledge.FileType) {
|
|
|
|
|
logger.GetLogger(ctx).WithField("knowledge_id", knowledge.ID).
|
|
|
|
|
WithField("error", ErrImageNotParse).Errorf("processDocument image without enable multimodel")
|
|
|
|
|
knowledge.ParseStatus = "failed"
|
|
|
|
|
knowledge.ErrorMessage = ErrImageNotParse.Error()
|
|
|
|
|
s.repo.UpdateKnowledge(ctx, knowledge)
|
|
|
|
|
span.RecordError(ErrImageNotParse)
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Update status to processing
|
|
|
|
|
knowledge.ParseStatus = "processing"
|
|
|
|
|
@@ -590,7 +646,7 @@ func (s *knowledgeService) processDocument(ctx context.Context,
|
|
|
|
|
ChunkSize: int32(kb.ChunkingConfig.ChunkSize),
|
|
|
|
|
ChunkOverlap: int32(kb.ChunkingConfig.ChunkOverlap),
|
|
|
|
|
Separators: kb.ChunkingConfig.Separators,
|
|
|
|
|
EnableMultimodal: kb.ChunkingConfig.EnableMultimodal,
|
|
|
|
|
EnableMultimodal: enableMultimodel,
|
|
|
|
|
},
|
|
|
|
|
RequestId: ctx.Value(types.RequestIDContextKey).(string),
|
|
|
|
|
})
|
|
|
|
|
@@ -612,7 +668,7 @@ func (s *knowledgeService) processDocument(ctx context.Context,
|
|
|
|
|
|
|
|
|
|
// processDocumentFromURL handles asynchronous processing of URL content
|
|
|
|
|
func (s *knowledgeService) processDocumentFromURL(ctx context.Context,
|
|
|
|
|
kb *types.KnowledgeBase, knowledge *types.Knowledge, url string,
|
|
|
|
|
kb *types.KnowledgeBase, knowledge *types.Knowledge, url string, enableMultimodel bool,
|
|
|
|
|
) {
|
|
|
|
|
// Update status to processing
|
|
|
|
|
knowledge.ParseStatus = "processing"
|
|
|
|
|
@@ -620,6 +676,7 @@ func (s *knowledgeService) processDocumentFromURL(ctx context.Context,
|
|
|
|
|
if err := s.repo.UpdateKnowledge(ctx, knowledge); err != nil {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
logger.GetLogger(ctx).Infof("processDocumentFromURL enableMultimodel: %v", enableMultimodel)
|
|
|
|
|
|
|
|
|
|
// Fetch and chunk content from URL
|
|
|
|
|
resp, err := s.docReaderClient.ReadFromURL(ctx, &proto.ReadFromURLRequest{
|
|
|
|
|
@@ -629,7 +686,7 @@ func (s *knowledgeService) processDocumentFromURL(ctx context.Context,
|
|
|
|
|
ChunkSize: int32(kb.ChunkingConfig.ChunkSize),
|
|
|
|
|
ChunkOverlap: int32(kb.ChunkingConfig.ChunkOverlap),
|
|
|
|
|
Separators: kb.ChunkingConfig.Separators,
|
|
|
|
|
EnableMultimodal: kb.ChunkingConfig.EnableMultimodal,
|
|
|
|
|
EnableMultimodal: enableMultimodel,
|
|
|
|
|
},
|
|
|
|
|
RequestId: ctx.Value(types.RequestIDContextKey).(string),
|
|
|
|
|
})
|
|
|
|
|
@@ -780,7 +837,7 @@ func (s *knowledgeService) processChunks(ctx context.Context,
|
|
|
|
|
TenantID: knowledge.TenantID,
|
|
|
|
|
KnowledgeID: knowledge.ID,
|
|
|
|
|
KnowledgeBaseID: knowledge.KnowledgeBaseID,
|
|
|
|
|
Content: fmt.Sprintf("图片OCR文本: %s", img.OcrText),
|
|
|
|
|
Content: img.OcrText,
|
|
|
|
|
ChunkIndex: maxSeq + i*100 + 1, // 使用不冲突的索引方式
|
|
|
|
|
IsEnabled: true,
|
|
|
|
|
CreatedAt: time.Now(),
|
|
|
|
|
@@ -802,7 +859,7 @@ func (s *knowledgeService) processChunks(ctx context.Context,
|
|
|
|
|
TenantID: knowledge.TenantID,
|
|
|
|
|
KnowledgeID: knowledge.ID,
|
|
|
|
|
KnowledgeBaseID: knowledge.KnowledgeBaseID,
|
|
|
|
|
Content: fmt.Sprintf("图片描述: %s", img.Caption),
|
|
|
|
|
Content: img.Caption,
|
|
|
|
|
ChunkIndex: maxSeq + i*100 + 2, // 使用不冲突的索引方式
|
|
|
|
|
IsEnabled: true,
|
|
|
|
|
CreatedAt: time.Now(),
|
|
|
|
|
@@ -860,8 +917,7 @@ func (s *knowledgeService) processChunks(ctx context.Context,
|
|
|
|
|
} else {
|
|
|
|
|
for _, chunk := range textChunks {
|
|
|
|
|
chunk.RelationChunks, _ = json.Marshal(graphBuilder.GetRelationChunks(chunk.ID, relationChunkSize))
|
|
|
|
|
chunk.IndirectRelationChunks, _ =
|
|
|
|
|
json.Marshal(graphBuilder.GetIndirectRelationChunks(chunk.ID, indirectRelationChunkSize))
|
|
|
|
|
chunk.IndirectRelationChunks, _ = json.Marshal(graphBuilder.GetIndirectRelationChunks(chunk.ID, indirectRelationChunkSize))
|
|
|
|
|
}
|
|
|
|
|
for i, entity := range graphBuilder.GetAllEntities() {
|
|
|
|
|
relationChunks, _ := json.Marshal(entity.ChunkIDs)
|
|
|
|
|
@@ -1386,6 +1442,12 @@ func (s *knowledgeService) UpdateImageInfo(ctx context.Context, knowledgeID stri
|
|
|
|
|
|
|
|
|
|
// Iterate through each chunk and update its content based on the image information
|
|
|
|
|
updateChunk := []*types.Chunk{chunk}
|
|
|
|
|
var addChunk []*types.Chunk
|
|
|
|
|
|
|
|
|
|
// Track whether we've found OCR and caption child chunks for this image
|
|
|
|
|
hasOCRChunk := false
|
|
|
|
|
hasCaptionChunk := false
|
|
|
|
|
|
|
|
|
|
for i, child := range chunkChildren {
|
|
|
|
|
// Skip chunks that are not image types
|
|
|
|
|
var cImageInfo []*types.ImageInfo
|
|
|
|
|
@@ -1403,19 +1465,69 @@ func (s *knowledgeService) UpdateImageInfo(ctx context.Context, knowledgeID stri
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Update the chunk content based on the image type
|
|
|
|
|
if child.ChunkType == types.ChunkTypeImageCaption && image.Caption != cImageInfo[0].Caption {
|
|
|
|
|
child.Content = fmt.Sprintf("图片描述: %s", image.Caption)
|
|
|
|
|
child.ImageInfo = imageInfo
|
|
|
|
|
updateChunk = append(updateChunk, chunkChildren[i])
|
|
|
|
|
} else if child.ChunkType == types.ChunkTypeImageOCR && image.OCRText != cImageInfo[0].OCRText {
|
|
|
|
|
child.Content = fmt.Sprintf("图片OCR文本: %s", image.OCRText)
|
|
|
|
|
child.ImageInfo = imageInfo
|
|
|
|
|
updateChunk = append(updateChunk, chunkChildren[i])
|
|
|
|
|
// Mark that we've found chunks for this image
|
|
|
|
|
if child.ChunkType == types.ChunkTypeImageCaption {
|
|
|
|
|
hasCaptionChunk = true
|
|
|
|
|
// Update caption if it has changed
|
|
|
|
|
if image.Caption != cImageInfo[0].Caption {
|
|
|
|
|
child.Content = image.Caption
|
|
|
|
|
child.ImageInfo = imageInfo
|
|
|
|
|
updateChunk = append(updateChunk, chunkChildren[i])
|
|
|
|
|
}
|
|
|
|
|
} else if child.ChunkType == types.ChunkTypeImageOCR {
|
|
|
|
|
hasOCRChunk = true
|
|
|
|
|
// Update OCR if it has changed
|
|
|
|
|
if image.OCRText != cImageInfo[0].OCRText {
|
|
|
|
|
child.Content = image.OCRText
|
|
|
|
|
child.ImageInfo = imageInfo
|
|
|
|
|
updateChunk = append(updateChunk, chunkChildren[i])
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Create a new caption chunk if it doesn't exist and we have caption data
|
|
|
|
|
if !hasCaptionChunk && image.Caption != "" {
|
|
|
|
|
captionChunk := &types.Chunk{
|
|
|
|
|
ID: uuid.New().String(),
|
|
|
|
|
TenantID: tenantID,
|
|
|
|
|
KnowledgeID: chunk.KnowledgeID,
|
|
|
|
|
KnowledgeBaseID: chunk.KnowledgeBaseID,
|
|
|
|
|
Content: image.Caption,
|
|
|
|
|
ChunkType: types.ChunkTypeImageCaption,
|
|
|
|
|
ParentChunkID: chunk.ID,
|
|
|
|
|
ImageInfo: imageInfo,
|
|
|
|
|
}
|
|
|
|
|
addChunk = append(addChunk, captionChunk)
|
|
|
|
|
logger.Infof(ctx, "Created new caption chunk ID: %s for image URL: %s", captionChunk.ID, image.OriginalURL)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Create a new OCR chunk if it doesn't exist and we have OCR data
|
|
|
|
|
if !hasOCRChunk && image.OCRText != "" {
|
|
|
|
|
ocrChunk := &types.Chunk{
|
|
|
|
|
ID: uuid.New().String(),
|
|
|
|
|
TenantID: tenantID,
|
|
|
|
|
KnowledgeID: chunk.KnowledgeID,
|
|
|
|
|
KnowledgeBaseID: chunk.KnowledgeBaseID,
|
|
|
|
|
Content: image.OCRText,
|
|
|
|
|
ChunkType: types.ChunkTypeImageOCR,
|
|
|
|
|
ParentChunkID: chunk.ID,
|
|
|
|
|
ImageInfo: imageInfo,
|
|
|
|
|
}
|
|
|
|
|
addChunk = append(addChunk, ocrChunk)
|
|
|
|
|
logger.Infof(ctx, "Created new OCR chunk ID: %s for image URL: %s", ocrChunk.ID, image.OriginalURL)
|
|
|
|
|
}
|
|
|
|
|
logger.Infof(ctx, "Updated %d chunks out of %d total chunks", len(updateChunk), len(chunkChildren)+1)
|
|
|
|
|
|
|
|
|
|
if len(addChunk) > 0 {
|
|
|
|
|
err := s.chunkService.CreateChunks(ctx, addChunk)
|
|
|
|
|
if err != nil {
|
|
|
|
|
logger.ErrorWithFields(ctx, err, map[string]interface{}{
|
|
|
|
|
"add_chunk_size": len(addChunk),
|
|
|
|
|
})
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Update the chunks
|
|
|
|
|
for _, c := range updateChunk {
|
|
|
|
|
err := s.chunkService.UpdateChunk(ctx, c)
|
|
|
|
|
@@ -1429,7 +1541,7 @@ func (s *knowledgeService) UpdateImageInfo(ctx context.Context, knowledgeID stri
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Update the chunk vector
|
|
|
|
|
err = s.updateChunkVector(ctx, chunk.KnowledgeBaseID, updateChunk)
|
|
|
|
|
err = s.updateChunkVector(ctx, chunk.KnowledgeBaseID, append(updateChunk, addChunk...))
|
|
|
|
|
if err != nil {
|
|
|
|
|
logger.ErrorWithFields(ctx, err, map[string]interface{}{
|
|
|
|
|
"chunk_id": chunk.ID,
|
|
|
|
|
@@ -1559,3 +1671,12 @@ func (s *knowledgeService) CloneChunk(ctx context.Context, src, dst *types.Knowl
|
|
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// IsImageType reports whether fileType (a file extension without the
// leading dot, e.g. "png") is one of the supported image formats.
// The comparison is case-sensitive.
func IsImageType(fileType string) bool {
	imageTypes := []string{"jpg", "jpeg", "png", "gif", "webp", "bmp", "svg", "tiff"}
	for _, t := range imageTypes {
		if fileType == t {
			return true
		}
	}
	return false
}
|
|
|
|
|
|