Mirror of https://github.com/Tencent/WeKnora.git (synced 2025-11-25 03:15:00 +08:00)
fix: Makefile docker build not working
Changed file: Makefile (51 lines)
@@ -1,4 +1,4 @@
- .PHONY: help build run test clean docker-build docker-run migrate-up migrate-down docker-restart docker-stop start-all stop-all start-ollama stop-ollama build-images build-images-app build-images-docreader build-images-frontend clean-images
+ .PHONY: help build run test clean docker-build docker-build-docreader docker-build-frontend docker-build-all docker-run migrate-up migrate-down docker-restart docker-stop start-all stop-all start-ollama stop-ollama build-images build-images-app build-images-docreader build-images-frontend clean-images check-env list-containers pull-images

# Show help
help:
@@ -11,10 +11,13 @@ help:
@echo " clean 清理构建文件"
@echo ""
@echo "Docker 命令:"
- @echo " docker-build 构建 Docker 镜像"
- @echo " docker-run 运行 Docker 容器"
- @echo " docker-stop 停止 Docker 容器"
- @echo " docker-restart 重启 Docker 容器"
+ @echo " docker-build-app 构建应用 Docker 镜像 (wechatopenai/weknora-app)"
+ @echo " docker-build-docreader 构建文档读取器镜像 (wechatopenai/weknora-docreader)"
+ @echo " docker-build-frontend 构建前端镜像 (wechatopenai/weknora-ui)"
+ @echo " docker-build-all 构建所有 Docker 镜像"
+ @echo " docker-run 运行 Docker 容器"
+ @echo " docker-stop 停止 Docker 容器"
+ @echo " docker-restart 重启 Docker 容器"
@echo ""
@echo "服务管理:"
@echo " start-all 启动所有服务"
@@ -37,13 +40,18 @@ help:
@echo " lint 代码检查"
@echo " deps 安装依赖"
@echo " docs 生成 API 文档"
@echo ""
+ @echo "环境检查:"
+ @echo " check-env 检查环境配置"
+ @echo " list-containers 列出运行中的容器"
+ @echo " pull-images 拉取最新镜像"

# Go related variables
BINARY_NAME=WeKnora
MAIN_PATH=./cmd/server

# Docker related variables
- DOCKER_IMAGE=WeKnora
+ DOCKER_IMAGE=wechatopenai/weknora-app
DOCKER_TAG=latest

# Build the application
@@ -64,8 +72,19 @@ clean:
rm -f $(BINARY_NAME)

# Build Docker image
- docker-build:
- docker build -t $(DOCKER_IMAGE):$(DOCKER_TAG) .
+ docker-build-app:
+ docker build -f docker/Dockerfile.app -t $(DOCKER_IMAGE):$(DOCKER_TAG) .
+
+ # Build docreader Docker image
+ docker-build-docreader:
+ docker build -f docker/Dockerfile.docreader -t wechatopenai/weknora-docreader:latest .
+
+ # Build frontend Docker image
+ docker-build-frontend:
+ docker build -f frontend/Dockerfile -t wechatopenai/weknora-ui:latest frontend/
+
+ # Build all Docker images
+ docker-build-all: docker-build-app docker-build-docreader docker-build-frontend

# Run Docker container (传统方式)
docker-run:
@@ -107,10 +126,10 @@ build-images-frontend:
clean-images:
./scripts/build_images.sh --clean

- # Restart Docker container (stop, rebuild, start)
+ # Restart Docker container (stop, start)
docker-restart:
docker-compose stop -t 60
- docker-compose up --build
+ docker-compose up

# Database migrations
migrate-up:
@@ -151,4 +170,16 @@ clean-db:
docker volume rm weknora_redis_data; \
fi

+ # Environment check
+ check-env:
+ ./scripts/start_all.sh --check
+
+ # List containers
+ list-containers:
+ ./scripts/start_all.sh --list
+
+ # Pull latest images
+ pull-images:
+ ./scripts/start_all.sh --pull
@@ -33,7 +33,9 @@ except ImportError:
except ImportError:
# If both imports fail, set to None
Caption = None
- logging.warning("Failed to import Caption, image captioning will be unavailable")
+ logging.warning(
+ "Failed to import Caption, image captioning will be unavailable"
+ )

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
@@ -78,7 +80,9 @@ class BaseParser(ABC):
"""
if cls._ocr_engine is None and not cls._ocr_engine_failed:
try:
- cls._ocr_engine = OCREngine.get_instance(backend_type=backend_type, **kwargs)
+ cls._ocr_engine = OCREngine.get_instance(
+ backend_type=backend_type, **kwargs
+ )
if cls._ocr_engine is None:
cls._ocr_engine_failed = True
logger.error(f"Failed to initialize OCR engine ({backend_type})")
@@ -89,7 +93,6 @@ class BaseParser(ABC):
logger.error(f"Failed to initialize OCR engine: {str(e)}")
return None
return cls._ocr_engine

def __init__(
self,
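The hunks above only rewrap the OCR-engine initialization, but the underlying pattern is worth seeing in isolation: a class-level cache plus a failure latch, so a backend that failed to initialize once is never retried for every image. A minimal sketch of that pattern (the engine factory below is a stand-in, not the project's actual OCREngine class):

    import logging

    logger = logging.getLogger(__name__)

    class OcrHolder:
        _ocr_engine = None
        _ocr_engine_failed = False  # latch: remember a failed init and stop retrying

        @classmethod
        def get_ocr_engine(cls, engine_factory, backend_type="paddle", **kwargs):
            if cls._ocr_engine is None and not cls._ocr_engine_failed:
                try:
                    cls._ocr_engine = engine_factory(backend_type=backend_type, **kwargs)
                    if cls._ocr_engine is None:
                        cls._ocr_engine_failed = True
                        logger.error(f"Failed to initialize OCR engine ({backend_type})")
                except Exception as e:
                    cls._ocr_engine_failed = True
                    logger.error(f"Failed to initialize OCR engine: {e}")
                    return None
            return cls._ocr_engine

    # OcrHolder.get_ocr_engine(lambda **kw: object())  # same instance on every later call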
@@ -135,7 +138,7 @@ class BaseParser(ABC):
self.max_concurrent_tasks = max_concurrent_tasks
self.max_chunks = max_chunks
self.chunking_config = chunking_config

logger.info(
f"Initializing {self.__class__.__name__} for file: {file_name}, type: {self.file_type}"
)
@@ -174,10 +177,14 @@ class BaseParser(ABC):
resized_image = self._resize_image_if_needed(image)

# Get OCR engine
- ocr_engine = self.get_ocr_engine(backend_type=self.ocr_backend, **self.ocr_config)
+ ocr_engine = self.get_ocr_engine(
+ backend_type=self.ocr_backend, **self.ocr_config
+ )
if ocr_engine is None:
- logger.error(f"OCR engine ({self.ocr_backend}) initialization failed or unavailable, "
- "skipping OCR recognition")
+ logger.error(
+ f"OCR engine ({self.ocr_backend}) initialization failed or unavailable, "
+ "skipping OCR recognition"
+ )
return ""

# Execute OCR prediction
@@ -199,11 +206,13 @@ class BaseParser(ABC):
return ocr_result
except Exception as e:
process_time = time.time() - start_time
- logger.error(f"OCR recognition error: {str(e)}, time: {process_time:.2f} seconds")
+ logger.error(
+ f"OCR recognition error: {str(e)}, time: {process_time:.2f} seconds"
+ )
return ""
finally:
# Release image resources
- if resized_image is not image and hasattr(resized_image, 'close'):
+ if resized_image is not image and hasattr(resized_image, "close"):
# Only close the new image we created, not the original image
resized_image.close()

@@ -218,25 +227,33 @@ class BaseParser(ABC):
"""
try:
# If it's a PIL Image
- if hasattr(image, 'size'):
+ if hasattr(image, "size"):
width, height = image.size
if width > self.max_image_size or height > self.max_image_size:
logger.info(f"Resizing PIL image, original size: {width}x{height}")
- scale = min(self.max_image_size / width, self.max_image_size / height)
+ scale = min(
+ self.max_image_size / width, self.max_image_size / height
+ )
new_width = int(width * scale)
new_height = int(height * scale)
resized_image = image.resize((new_width, new_height))
logger.info(f"Resized to: {new_width}x{new_height}")
return resized_image
else:
- logger.info(f"PIL image size {width}x{height} is within limits, no resizing needed")
+ logger.info(
+ f"PIL image size {width}x{height} is within limits, no resizing needed"
+ )
return image
# If it's a numpy array
- elif hasattr(image, 'shape'):
+ elif hasattr(image, "shape"):
height, width = image.shape[:2]
if width > self.max_image_size or height > self.max_image_size:
- logger.info(f"Resizing numpy image, original size: {width}x{height}")
- scale = min(self.max_image_size / width, self.max_image_size / height)
+ logger.info(
+ f"Resizing numpy image, original size: {width}x{height}"
+ )
+ scale = min(
+ self.max_image_size / width, self.max_image_size / height
+ )
new_width = int(width * scale)
new_height = int(height * scale)
# Use PIL for resizing numpy arrays
@@ -246,7 +263,9 @@ class BaseParser(ABC):
logger.info(f"Resized to: {new_width}x{new_height}")
return resized_image
else:
- logger.info(f"Numpy image size {width}x{height} is within limits, no resizing needed")
+ logger.info(
+ f"Numpy image size {width}x{height} is within limits, no resizing needed"
+ )
return image
else:
logger.warning(f"Unknown image type: {type(image)}, cannot resize")
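The resizing hunks above are formatting-only; the logic keeps any image below max_image_size while preserving its aspect ratio. A rough standalone equivalent of the PIL branch, assuming Pillow is installed (MAX_IMAGE_SIZE is an illustrative constant, not the project's configured value):

    from PIL import Image

    MAX_IMAGE_SIZE = 1920  # illustrative limit only

    def resize_if_needed(image: Image.Image) -> Image.Image:
        width, height = image.size
        if width <= MAX_IMAGE_SIZE and height <= MAX_IMAGE_SIZE:
            return image  # already within limits, hand back the original object
        # Scale both dimensions by the same factor so the aspect ratio is preserved.
        scale = min(MAX_IMAGE_SIZE / width, MAX_IMAGE_SIZE / height)
        return image.resize((int(width * scale), int(height * scale)))

The caller then closes the resized copy only when it is a different object from the original, which is what the `resized_image is not image` checks in the diff are guarding.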
@@ -278,7 +297,9 @@ class BaseParser(ABC):
caption = ""

if self.caption_parser:
- logger.info(f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption")
+ logger.info(
+ f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption"
+ )
# Convert image to base64 for caption generation
img_base64 = image_to_base64(image)
if img_base64:
@@ -295,7 +316,7 @@ class BaseParser(ABC):

# Release image resources
del image

return ocr_text, caption, image_url

async def process_image_async(self, image, image_url=None):
@@ -325,13 +346,17 @@ class BaseParser(ABC):
ocr_task = loop.run_in_executor(None, self.perform_ocr, resized_image)
ocr_text = await asyncio.wait_for(ocr_task, timeout=30.0)
except asyncio.TimeoutError:
- logger.error("OCR processing timed out (30 seconds), skipping this image")
+ logger.error(
+ "OCR processing timed out (30 seconds), skipping this image"
+ )
ocr_text = ""
except Exception as e:
logger.error(f"OCR processing error: {str(e)}")
ocr_text = ""

- logger.info(f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption")
+ logger.info(
+ f"OCR successfully extracted {len(ocr_text)} characters, continuing to get caption"
+ )
caption = ""
if self.caption_parser:
try:
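Several hunks in this region rewrap the same idea: blocking OCR is pushed onto an executor and bounded with asyncio.wait_for so one slow image cannot stall the async pipeline. A self-contained sketch of that pattern (slow_ocr stands in for the parser's perform_ocr method):

    import asyncio

    def slow_ocr(image):
        return "recognized text"  # stand-in for a blocking OCR call

    async def ocr_with_timeout(image, timeout: float = 30.0) -> str:
        loop = asyncio.get_running_loop()
        task = loop.run_in_executor(None, slow_ocr, image)  # keep the event loop free
        try:
            return await asyncio.wait_for(task, timeout=timeout)
        except asyncio.TimeoutError:
            return ""  # skip this image instead of blocking the whole batch

    # asyncio.run(ocr_with_timeout(object()))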
@@ -340,9 +365,13 @@ class BaseParser(ABC):
if img_base64:
# Add timeout to avoid blocking caption retrieval (30 seconds timeout)
caption_task = self.get_image_caption_async(img_base64)
- image_data, caption = await asyncio.wait_for(caption_task, timeout=30.0)
+ image_data, caption = await asyncio.wait_for(
+ caption_task, timeout=30.0
+ )
if caption:
- logger.info(f"Successfully obtained image caption: {caption}")
+ logger.info(
+ f"Successfully obtained image caption: {caption}"
+ )
else:
logger.warning("Failed to get caption")
else:
@@ -353,27 +382,20 @@ class BaseParser(ABC):
except Exception as e:
logger.error(f"Failed to get caption: {str(e)}")
else:
- logger.info("Caption service not initialized, skipping caption retrieval")
+ logger.info(
+ "Caption service not initialized, skipping caption retrieval"
+ )

return ocr_text, caption, image_url
finally:
# Release image resources
- if resized_image is not image and hasattr(resized_image, 'close'):
+ if resized_image is not image and hasattr(resized_image, "close"):
# Only close the new image we created, not the original image
resized_image.close()

- async def process_with_limit(self, idx, image, url, semaphore, current_request_id=None):
+ async def process_with_limit(self, idx, image, url, semaphore):
"""Function to process a single image using a semaphore"""
try:
- # Set request ID in the asynchronous task
- if current_request_id:
- try:
- from utils.request import set_request_id
- set_request_id(current_request_id)
- logger.info(f"Asynchronous task {idx+1} setting request ID: {current_request_id}")
- except Exception as e:
- logger.warning(f"Failed to set request ID in asynchronous task: {str(e)}")
-
logger.info(f"Waiting to process image {idx+1}")
async with semaphore: # Use semaphore to control concurrency
logger.info(f"Starting to process image {idx+1}")
@@ -385,7 +407,7 @@ class BaseParser(ABC):
return ("", "", url) # Return empty result to avoid overall failure
finally:
# Manually release image resources
- if hasattr(image, 'close'):
+ if hasattr(image, "close"):
image.close()

async def process_multiple_images(self, images_data):
@@ -404,26 +426,19 @@ class BaseParser(ABC):
return []

# Set max concurrency, reduce concurrency to avoid resource contention
- max_concurrency = min(self.max_concurrent_tasks, 5) # Reduce concurrency to prevent excessive memory usage
+ max_concurrency = min(
+ self.max_concurrent_tasks, 1
+ ) # Reduce concurrency to prevent excessive memory usage

# Use semaphore to limit concurrency
semaphore = asyncio.Semaphore(max_concurrency)

# Store results to avoid overall failure due to task failure
results = []

- # Get current request ID to set in each asynchronous task
- current_request_id = None
- try:
- from utils.request import get_request_id
- current_request_id = get_request_id()
- logger.info(f"Capturing current request ID before async processing: {current_request_id}")
- except Exception as e:
- logger.warning(f"Failed to get current request ID: {str(e)}")
-
# Create all tasks, but use semaphore to limit actual concurrency
tasks = [
- self.process_with_limit(i, img, url, semaphore, current_request_id)
+ self.process_with_limit(i, img, url, semaphore)
for i, (img, url) in enumerate(images_data)
]
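The hunk above drops the request-ID plumbing and lowers the concurrency cap, but the surrounding structure is the usual semaphore-bounded fan-out over asyncio.gather. A small sketch of that structure, with a dummy coroutine in place of the per-image OCR and caption work:

    import asyncio

    async def process_one(idx: int, item: str) -> str:
        await asyncio.sleep(0.01)  # stands in for OCR + caption work on one image
        return f"{idx}:{item}"

    async def process_all(items, max_concurrency: int = 1):
        semaphore = asyncio.Semaphore(max_concurrency)  # cap the number of in-flight tasks

        async def with_limit(idx, item):
            async with semaphore:
                return await process_one(idx, item)

        tasks = [with_limit(i, it) for i, it in enumerate(items)]
        # return_exceptions=True keeps one failed item from failing the whole batch
        return await asyncio.gather(*tasks, return_exceptions=True)

    # asyncio.run(process_all(["a.png", "b.png", "c.png"]))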
@@ -434,7 +449,9 @@ class BaseParser(ABC):
# Handle possible exception results
for i, result in enumerate(completed_results):
if isinstance(result, Exception):
- logger.error(f"Image {i+1} processing returned an exception: {str(result)}")
+ logger.error(
+ f"Image {i+1} processing returned an exception: {str(result)}"
+ )
# For exceptions, add empty results
if i < len(images_data):
results.append(("", "", images_data[i][1]))
@@ -449,7 +466,9 @@ class BaseParser(ABC):
images_data.clear()
logger.info("Image processing resource cleanup complete")

- logger.info(f"Completed concurrent processing of {len(results)}/{len(images_data)} images")
+ logger.info(
+ f"Completed concurrent processing of {len(results)}/{len(images_data)} images"
+ )
return results

def decode_bytes(self, content: bytes) -> str:
@@ -529,9 +548,13 @@ class BaseParser(ABC):
def __init_storage(self):
"""Initialize storage client based on configuration"""
if self._storage is None:
- storage_config = self.chunking_config.storage_config if self.chunking_config else None
+ storage_config = (
+ self.chunking_config.storage_config if self.chunking_config else None
+ )
self._storage = create_storage(storage_config)
- logger.info(f"Initialized storage client: {self._storage.__class__.__name__}")
+ logger.info(
+ f"Initialized storage client: {self._storage.__class__.__name__}"
+ )
return self._storage

def upload_file(self, file_path: str) -> str:
@@ -605,40 +628,50 @@ class BaseParser(ABC):
logger.info(f"Beginning chunking process for text")
chunks = self.chunk_text(text)
logger.info(f"Created {len(chunks)} chunks from document")

# Limit the number of returned chunks
if len(chunks) > self.max_chunks:
- logger.warning(f"Limiting chunks from {len(chunks)} to maximum {self.max_chunks}")
- chunks = chunks[:self.max_chunks]
+ logger.warning(
+ f"Limiting chunks from {len(chunks)} to maximum {self.max_chunks}"
+ )
+ chunks = chunks[: self.max_chunks]

# If multimodal is enabled and file type is supported, process images in each chunk
if self.enable_multimodal:
# Get file extension and convert to lowercase
file_ext = (
os.path.splitext(self.file_name)[1].lower()
if self.file_name
- else (
- self.file_type.lower()
- if self.file_type
- else ""
- )
+ else (self.file_type.lower() if self.file_type else "")
)

# Define allowed file types for image processing
allowed_types = [
- '.pdf', # PDF files
- '.md', '.markdown', # Markdown files
- '.doc', '.docx', # Word documents
+ ".pdf", # PDF files
+ ".md",
+ ".markdown", # Markdown files
+ ".doc",
+ ".docx", # Word documents
# Image files
- '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'
+ ".jpg",
+ ".jpeg",
+ ".png",
+ ".gif",
+ ".bmp",
+ ".tiff",
+ ".webp",
]

if file_ext in allowed_types:
- logger.info(f"Processing images in each chunk for file type: {file_ext}")
+ logger.info(
+ f"Processing images in each chunk for file type: {file_ext}"
+ )
chunks = self.process_chunks_images(chunks, image_map)
else:
- logger.info(f"Skipping image processing for unsupported file type: {file_ext}")
+ logger.info(
+ f"Skipping image processing for unsupported file type: {file_ext}"
+ )

return ParseResult(text=text, chunks=chunks)

def _split_into_units(self, text: str) -> List[str]:
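The chunking hunk above caps the number of returned chunks and gates the per-chunk image pass on the file extension. Reduced to its essentials, the control flow looks roughly like this; the extension set is copied from the diff, while MAX_CHUNKS is a placeholder for the value the parser takes from its configuration:

    import os

    MAX_CHUNKS = 1000  # placeholder; the real limit comes from parser configuration
    ALLOWED_TYPES = {".pdf", ".md", ".markdown", ".doc", ".docx",
                     ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"}

    def postprocess(chunks, file_name=None, file_type=None):
        # Never hand more than MAX_CHUNKS chunks to the downstream steps.
        if len(chunks) > MAX_CHUNKS:
            chunks = chunks[:MAX_CHUNKS]
        # Prefer the extension from the file name, fall back to the declared type.
        ext = os.path.splitext(file_name)[1].lower() if file_name else (file_type or "").lower()
        run_image_pass = ext in ALLOWED_TYPES  # only these types get the image pass
        return chunks, run_image_pass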
@@ -649,11 +682,13 @@ class BaseParser(ABC):
Returns:
基本单元的列表
"""
- logger.info(f"Splitting text into basic units with robust structure protection, text length: {len(text)}")
+ logger.info(
+ f"Splitting text into basic units with robust structure protection, text length: {len(text)}"
+ )

# 定义所有需要作为整体保护的结构模式 ---
table_pattern = r"(?m)(^\|.*\|[ \t]*\r?\n(?:[ \t]*\r?\n)?^\|\s*:?--+.*\r?\n(?:^\|.*\|\r?\n?)*)"

# 其他需要保护的结构(代码块、公式块、行内元素)
code_block_pattern = r"```[\s\S]*?```"
math_block_pattern = r"\$\$[\s\S]*?\$\$"
@@ -661,7 +696,12 @@ class BaseParser(ABC):

# 查找所有受保护结构的位置 ---
protected_ranges = []
- for pattern in [table_pattern, code_block_pattern, math_block_pattern, inline_pattern]:
+ for pattern in [
+ table_pattern,
+ code_block_pattern,
+ math_block_pattern,
+ inline_pattern,
+ ]:
for match in re.finditer(pattern, text):
# 确保匹配到的不是空字符串,避免无效范围
if match.group(0).strip():
@@ -669,8 +709,10 @@ class BaseParser(ABC):

# 按起始位置排序
protected_ranges.sort(key=lambda x: x[0])
- logger.info(f"Found {len(protected_ranges)} protected structures (tables, code, formulas, images, links).")
+ logger.info(
+ f"Found {len(protected_ranges)} protected structures (tables, code, formulas, images, links)."
+ )

# 合并可能重叠的保护范围 ---
# 确保我们有一组不相交的、需要保护的文本块
if protected_ranges:
@@ -685,15 +727,17 @@ class BaseParser(ABC):
# 如果不重叠,则完成当前范围并开始一个新的范围
merged_ranges.append((current_start, current_end))
current_start, current_end = next_start, next_end

merged_ranges.append((current_start, current_end))
protected_ranges = merged_ranges
- logger.info(f"After merging overlaps, {len(protected_ranges)} protected ranges remain.")
+ logger.info(
+ f"After merging overlaps, {len(protected_ranges)} protected ranges remain."
+ )

# 根据保护范围和分隔符来分割文本 ---
units = []
last_end = 0

# 定义分隔符的正则表达式,通过加括号来保留分隔符本身
separator_pattern = f"({'|'.join(re.escape(s) for s in self.separators)})"

@@ -703,7 +747,7 @@ class BaseParser(ABC):
pre_text = text[last_end:start]
# 对这部分非保护文本进行分割,并保留分隔符
segments = re.split(separator_pattern, pre_text)
- units.extend([s for s in segments if s]) # 添加所有非空部分
+ units.extend([s for s in segments if s])  # 添加所有非空部分

# b. 将整个受保护的块(例如,一个完整的表格)作为一个单独的、不可分割的单元添加
protected_text = text[start:end]
@@ -715,10 +759,11 @@ class BaseParser(ABC):
if last_end < len(text):
post_text = text[last_end:]
segments = re.split(separator_pattern, post_text)
- units.extend([s for s in segments if s]) # 添加所有非空部分
+ units.extend([s for s in segments if s])  # 添加所有非空部分

logger.info(f"Text splitting complete, created {len(units)} final basic units.")
return units

def _find_complete_units(self, units: List[str], target_size: int) -> List[str]:
"""Find a list of complete units that do not exceed the target size
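The splitting hunks above reformat _split_into_units, whose core idea is: find spans that must never be split (Markdown tables, fenced code, math blocks), merge overlapping spans, and split only the text outside them on the configured separators. A compact sketch of that approach with two protected patterns and fixed separators; both are simplified assumptions, not the parser's full pattern set:

    import re

    SEPARATORS = ["\n\n", "\n"]

    def split_with_protection(text: str):
        # Spans matching these patterns are kept whole (fenced code, $$ math blocks).
        patterns = [r"```[\s\S]*?```", r"\$\$[\s\S]*?\$\$"]
        ranges = sorted(
            (m.start(), m.end())
            for p in patterns
            for m in re.finditer(p, text)
            if m.group(0).strip()
        )
        # Merge overlapping protected ranges into disjoint spans.
        merged = []
        for start, end in ranges:
            if merged and start <= merged[-1][1]:
                merged[-1] = (merged[-1][0], max(merged[-1][1], end))
            else:
                merged.append((start, end))
        sep = f"({'|'.join(re.escape(s) for s in SEPARATORS)})"
        units, last = [], 0
        for start, end in merged:
            units += [s for s in re.split(sep, text[last:start]) if s]  # split the unprotected prefix
            units.append(text[start:end])                               # keep the protected span whole
            last = end
        units += [s for s in re.split(sep, text[last:]) if s]
        return units

Keeping each protected span as a single unit is what lets the later chunking respect table and code-block boundaries.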
@@ -884,71 +929,58 @@ class BaseParser(ABC):
"""
logger.info(f"Extracting image information from Chunk #{chunk.seq}")
text = chunk.content

# Regex to extract image information from text, supporting Markdown images and HTML images
img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)|<img [^>]*src="([^"]+)" [^>]*>'

# Extract image information
img_matches = list(re.finditer(img_pattern, text))
logger.info(f"Chunk #{chunk.seq} found {len(img_matches)} images")

images_info = []
for match_idx, match in enumerate(img_matches):
# Process image URL
img_url = match.group(2) if match.group(2) else match.group(3)
alt_text = match.group(1) if match.group(1) else ""

# Record image information
image_info = {
"original_url": img_url,
"start": match.start(),
"end": match.end(),
"alt_text": alt_text,
- "match_text": text[match.start():match.end()]
+ "match_text": text[match.start() : match.end()],
}
images_info.append(image_info)

logger.info(
- f"Image in Chunk #{chunk.seq} {match_idx+1}: "
- f"URL={img_url[:50]}..."
+ f"Image in Chunk #{chunk.seq} {match_idx+1}: " f"URL={img_url[:50]}..."
if len(img_url) > 50
else f"Image in Chunk #{chunk.seq} {match_idx+1}: URL={img_url}"
)

return images_info

- async def download_and_upload_image(self, img_url: str, current_request_id=None, image_map=None):
+ async def download_and_upload_image(self, img_url: str):
"""Download image and upload to object storage, if it's already an object storage path or local path, use directly

Args:
img_url: Image URL or local path
- current_request_id: Current request ID
- image_map: Optional dictionary mapping image URLs to Image objects

Returns:
tuple: (original URL, storage URL, image object), if failed returns (original URL, None, None)
"""
- # Set request ID context in the asynchronous task
- try:
- if current_request_id:
- from utils.request import set_request_id
- set_request_id(current_request_id)
- logger.info(f"Asynchronous task setting request ID: {current_request_id}")
- except Exception as e:
- logger.warning(f"Failed to set request ID in asynchronous task: {str(e)}")
-
try:
import requests
from PIL import Image
import io

- # Check if image is already in the image_map
- if image_map and img_url in image_map:
- logger.info(f"Image already in image_map: {img_url}, using cached object")
- return img_url, img_url, image_map[img_url]
-
# Check if it's already a storage URL (COS or MinIO)
- is_storage_url = any(pattern in img_url for pattern in ["cos", "myqcloud.com", "minio", ".s3."])
+ is_storage_url = any(
+ pattern in img_url
+ for pattern in ["cos", "myqcloud.com", "minio", ".s3."]
+ )
if is_storage_url:
logger.info(f"Image already on COS: {img_url}, no need to re-upload")
try:
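The extract_images_from_chunk hunk above is mostly rewrapping; the actual work is done by one regex that matches both Markdown images and HTML <img> tags. A standalone sketch using the same pattern as the diff:

    import re

    # Matches Markdown ![alt](url) or an HTML <img ... src="..." ...> tag.
    IMG_PATTERN = r'!\[([^\]]*)\]\(([^)]+)\)|<img [^>]*src="([^"]+)" [^>]*>'

    def extract_images(text: str):
        images = []
        for match in re.finditer(IMG_PATTERN, text):
            url = match.group(2) if match.group(2) else match.group(3)
            images.append({
                "original_url": url,
                "alt_text": match.group(1) or "",
                "start": match.start(),
                "end": match.end(),
                "match_text": text[match.start():match.end()],
            })
        return images

    print(extract_images("intro ![logo](https://example.com/a.png) outro"))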
@@ -961,7 +993,7 @@ class BaseParser(ABC):
proxies["http"] = http_proxy
if https_proxy:
proxies["https"] = https_proxy

response = requests.get(img_url, timeout=5, proxies=proxies)
if response.status_code == 200:
image = Image.open(io.BytesIO(response.content))
@@ -972,12 +1004,14 @@ class BaseParser(ABC):
# Image will be closed by the caller
pass
else:
- logger.warning(f"Failed to get storage image: {response.status_code}")
+ logger.warning(
+ f"Failed to get storage image: {response.status_code}"
+ )
return img_url, img_url, None
except Exception as e:
logger.error(f"Error getting storage image: {str(e)}")
return img_url, img_url, None

# Check if it's a local file path
elif os.path.exists(img_url) and os.path.isfile(img_url):
logger.info(f"Using local image file: {img_url}")
@@ -986,17 +1020,19 @@ class BaseParser(ABC):
# Read local image
image = Image.open(img_url)
# Upload to storage
- with open(img_url, 'rb') as f:
+ with open(img_url, "rb") as f:
content = f.read()
storage_url = self.upload_bytes(content)
- logger.info(f"Successfully uploaded local image to storage: {storage_url}")
+ logger.info(
+ f"Successfully uploaded local image to storage: {storage_url}"
+ )
return img_url, storage_url, image
except Exception as e:
logger.error(f"Error processing local image: {str(e)}")
- if image and hasattr(image, 'close'):
+ if image and hasattr(image, "close"):
image.close()
return img_url, None, None

# Normal remote URL download handling
else:
# Get proxy settings from environment variables
@@ -1007,17 +1043,21 @@ class BaseParser(ABC):
proxies["http"] = http_proxy
if https_proxy:
proxies["https"] = https_proxy

- logger.info(f"Downloading image {img_url}, using proxy: {proxies if proxies else 'None'}")
+ logger.info(
+ f"Downloading image {img_url}, using proxy: {proxies if proxies else 'None'}"
+ )
response = requests.get(img_url, timeout=5, proxies=proxies)

if response.status_code == 200:
# Download successful, create image object
image = Image.open(io.BytesIO(response.content))
try:
# Upload to storage using the method in BaseParser
storage_url = self.upload_bytes(response.content)
- logger.info(f"Successfully uploaded image to storage: {storage_url}")
+ logger.info(
+ f"Successfully uploaded image to storage: {storage_url}"
+ )
return img_url, storage_url, image
finally:
# Image will be closed by the caller
@@ -1025,66 +1065,79 @@ class BaseParser(ABC):
else:
logger.warning(f"Failed to download image: {response.status_code}")
return img_url, None, None

except Exception as e:
logger.error(f"Error downloading or processing image: {str(e)}")
return img_url, None, None

- async def process_chunk_images_async(self, chunk, chunk_idx, total_chunks, current_request_id=None, image_map=None):
+ async def process_chunk_images_async(
+ self, chunk, chunk_idx, total_chunks, image_map=None
+ ):
"""Asynchronously process images in a single Chunk

Args:
chunk: Chunk object to process
chunk_idx: Chunk index
total_chunks: Total number of chunks
- current_request_id: Current request ID
image_map: Optional dictionary mapping image URLs to Image objects

Returns:
Processed Chunk object
"""
- logger.info(f"Starting to process images in Chunk #{chunk_idx+1}/{total_chunks}")
+ logger.info(
+ f"Starting to process images in Chunk #{chunk_idx+1}/{total_chunks}"
+ )

# Extract image information from the Chunk
images_info = self.extract_images_from_chunk(chunk)
if not images_info:
logger.info(f"Chunk #{chunk_idx+1} found no images")
return chunk

# Prepare images that need to be downloaded and processed
images_to_process = []
url_to_info_map = {} # Map URL to image information

# Record all image URLs that need to be processed
for img_info in images_info:
url = img_info["original_url"]
url_to_info_map[url] = img_info

- # Create an asynchronous event loop (current loop)
- loop = asyncio.get_event_loop()
-
- # Concurrent download and upload of images
- tasks = [self.download_and_upload_image(url, current_request_id, image_map) for url in url_to_info_map.keys()]
- results = await asyncio.gather(*tasks)
+ results = []
+ download_tasks = []
+ for img_url in url_to_info_map.keys(): # Check if image is already in the image_map
+ if image_map and img_url in image_map:
+ logger.info(f"Image already in image_map: {img_url}, using cached object")
+ results.append((img_url, img_url, image_map[img_url]))
+ else:
+ download_task = self.download_and_upload_image(img_url)
+ download_tasks.append(download_task)
+ # Concurrent download and upload of images, ignore images that are already in the image_map
+ results.extend(await asyncio.gather(*download_tasks))

# Process download results, prepare for OCR processing
for orig_url, cos_url, image in results:
if cos_url and image:
img_info = url_to_info_map[orig_url]
img_info["cos_url"] = cos_url
images_to_process.append((image, cos_url))

# If no images were successfully downloaded and uploaded, return the original Chunk
if not images_to_process:
- logger.info(f"Chunk #{chunk_idx+1} found no successfully downloaded and uploaded images")
+ logger.info(
+ f"Chunk #{chunk_idx+1} found no successfully downloaded and uploaded images"
+ )
return chunk

# Concurrent processing of all images (OCR + caption)
- logger.info(f"Processing {len(images_to_process)} images in Chunk #{chunk_idx+1}")
+ logger.info(
+ f"Processing {len(images_to_process)} images in Chunk #{chunk_idx+1}"
+ )

# Concurrent processing of all images
processed_results = await self.process_multiple_images(images_to_process)

# Process OCR and Caption results
for ocr_text, caption, img_url in processed_results:
# Find the corresponding original URL
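The largest change in the hunk above is that process_chunk_images_async now consults an image_map cache before downloading: cached URLs are reused directly and only the remaining ones are fetched concurrently. A simplified sketch of that cache-then-gather flow (fake_download stands in for the real download_and_upload_image coroutine):

    import asyncio

    async def fake_download(url: str):
        await asyncio.sleep(0)          # stands in for download + upload to object storage
        return url, url, object()       # (original URL, storage URL, image object)

    async def resolve_images(urls, image_map=None):
        results, tasks = [], []
        for url in urls:
            if image_map and url in image_map:
                # Cache hit: reuse the already-loaded image object and skip the network.
                results.append((url, url, image_map[url]))
            else:
                tasks.append(fake_download(url))
        # Only the uncached URLs are fetched, concurrently.
        results.extend(await asyncio.gather(*tasks))
        return results

    # asyncio.run(resolve_images(["a.png", "b.png"], image_map={"a.png": object()}))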
@@ -1092,22 +1145,24 @@ class BaseParser(ABC):
if info.get("cos_url") == img_url:
info["ocr_text"] = ocr_text if ocr_text else ""
info["caption"] = caption if caption else ""

if ocr_text:
- logger.info(f"Image OCR extracted {len(ocr_text)} characters: {img_url}")
+ logger.info(
+ f"Image OCR extracted {len(ocr_text)} characters: {img_url}"
+ )
if caption:
logger.info(f"Obtained image description: '{caption}'")
break

# Add processed image information to the Chunk
processed_images = []
for img_info in images_info:
if "cos_url" in img_info:
processed_images.append(img_info)

# Update image information in the Chunk
chunk.images = processed_images

logger.info(f"Completed image processing in Chunk #{chunk_idx+1}")
return chunk

@@ -1120,42 +1175,37 @@ class BaseParser(ABC):
Returns:
List of processed document chunks
"""
- logger.info(f"Starting concurrent processing of images in all {len(chunks)} chunks")
+ logger.info(
+ f"Starting concurrent processing of images in all {len(chunks)} chunks"
+ )

if not chunks:
logger.warning("No chunks to process")
return chunks

- # Get current request ID to pass to asynchronous tasks
- current_request_id = None
- try:
- from utils.request import get_request_id
- current_request_id = get_request_id()
- logger.info(f"Capturing current request ID before async processing: {current_request_id}")
- except Exception as e:
- logger.warning(f"Failed to get current request ID: {str(e)}")
-
# Create and run all Chunk concurrent processing tasks
async def process_all_chunks():
# Set max concurrency, reduce concurrency to avoid resource contention
max_concurrency = min(self.max_concurrent_tasks, 1) # Reduce concurrency
# Use semaphore to limit concurrency
semaphore = asyncio.Semaphore(max_concurrency)

async def process_with_limit(chunk, idx, total):
"""Use semaphore to control concurrent processing of Chunks"""
async with semaphore:
- return await self.process_chunk_images_async(chunk, idx, total, current_request_id, image_map)
+ return await self.process_chunk_images_async(
+ chunk, idx, total, image_map
+ )

# Create tasks for all Chunks
tasks = [
process_with_limit(chunk, idx, len(chunks))
for idx, chunk in enumerate(chunks)
]

# Execute all tasks concurrently
results = await asyncio.gather(*tasks, return_exceptions=True)

# Handle possible exceptions
processed_chunks = []
for i, result in enumerate(results):
@@ -1166,9 +1216,9 @@ class BaseParser(ABC):
processed_chunks.append(chunks[i])
else:
processed_chunks.append(result)

return processed_chunks

# Create event loop and run all tasks
try:
# Check if event loop already exists
@@ -1181,11 +1231,13 @@ class BaseParser(ABC):
# If no event loop, create a new one
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

# Execute processing for all Chunks
processed_chunks = loop.run_until_complete(process_all_chunks())
- logger.info(f"Successfully completed concurrent processing of {len(processed_chunks)}/{len(chunks)} chunks")
+ logger.info(
+ f"Successfully completed concurrent processing of {len(processed_chunks)}/{len(chunks)} chunks"
+ )

return processed_chunks
except Exception as e:
logger.error(f"Error during concurrent chunk processing: {str(e)}")
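The final hunks keep the existing bridge between the synchronous parser API and the async chunk pipeline: reuse an event loop if a usable one exists, otherwise create a new one, run the coroutine to completion, and fall back to the unprocessed chunks on error. A rough sketch of that bridge, under the assumption that it is called from synchronous code with no loop already running:

    import asyncio

    async def process_all_chunks(chunks):
        await asyncio.sleep(0)  # stands in for the per-chunk image processing tasks
        return chunks

    def process_chunks_images(chunks):
        # Reuse the existing loop when possible; otherwise create and install a fresh one.
        try:
            loop = asyncio.get_event_loop()
            if loop.is_closed():
                raise RuntimeError("event loop is closed")
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(process_all_chunks(chunks))
        except Exception:
            return chunks  # fall back to the unprocessed chunks on failure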