2 Commits

Author SHA1 Message Date
begoniezhao
b9f264b883 docs: 移除知识库混合搜索API文档说明 2025-11-20 17:59:46 +08:00
begoniezhao
154025f723 refactor: 优化解析器日志与API检查逻辑,简化异常处理 2025-11-20 15:05:53 +08:00
3 changed files with 12 additions and 57 deletions

View File

@@ -6,6 +6,7 @@ from docreader.parser.base_parser import BaseParser
from docreader.utils import endecode
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class FirstParser(BaseParser):
@@ -16,16 +17,15 @@ class FirstParser(BaseParser):
self._parsers: List[BaseParser] = []
for parser_cls in self._parser_cls:
try:
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
except Exception as e:
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
def parse_into_text(self, content: bytes) -> Document:
for p in self._parsers:
logger.info(f"FirstParser: using parser {p.__class__.__name__}")
document = p.parse_into_text(content)
if document.is_valid():
logger.info(f"FirstParser: parser {p.__class__.__name__} succeeded")
return document
return Document()
@@ -43,16 +43,14 @@ class PipelineParser(BaseParser):
self._parsers: List[BaseParser] = []
for parser_cls in self._parser_cls:
try:
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
except Exception as e:
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
parser = parser_cls(*args, **kwargs)
self._parsers.append(parser)
def parse_into_text(self, content: bytes) -> Document:
images: Dict[str, str] = {}
document = Document()
for p in self._parsers:
logger.info(f"PipelineParser: using parser {p.__class__.__name__}")
document = p.parse_into_text(content)
content = endecode.encode_bytes(document.content)
images.update(document.images)

View File

@@ -28,7 +28,6 @@ class StdMinerUParser(BaseParser):
self.image_helper = MarkdownImageUtil()
self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
self.enable = self.ping()
assert self.ping(), "MinerU API is not reachable"
def ping(self, timeout: int = 5) -> bool:
try:
@@ -41,6 +40,10 @@ class StdMinerUParser(BaseParser):
return False
def parse_into_text(self, content: bytes) -> Document:
if not self.enable:
logger.debug("MinerU API is not enabled")
return Document()
logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)")
md_content: str = ""
images_b64: Dict[str, str] = {}

View File

@@ -334,7 +334,6 @@ curl --location 'http://localhost:8080/api/v1/tenants' \
| GET | `/knowledge-bases/:id` | 获取知识库详情 |
| PUT | `/knowledge-bases/:id` | 更新知识库 |
| DELETE | `/knowledge-bases/:id` | 删除知识库 |
| GET | `/knowledge-bases/:id/hybrid-search` | 混合搜索知识库内容 |
| POST | `/knowledge-bases/copy` | 拷贝知识库 |
#### POST `/knowledge-bases` - 创建知识库
@@ -656,51 +655,6 @@ curl --location --request DELETE 'http://localhost:8080/api/v1/knowledge-bases/b
}
```
#### GET `/knowledge-bases/:id/hybrid-search` - 混合搜索知识库内容
**请求**:
```curl
curl --location --request GET 'http://localhost:8080/api/v1/knowledge-bases/kb-00000001/hybrid-search' \
--header 'Content-Type: application/json' \
--header 'X-API-Key: sk-vQHV2NZI_LK5W7wHQvH3yGYExX8YnhaHwZipUYbiZKCYJbBQ' \
--data '{
"query_text": "彗星",
"vector_threshold": 0.1,
"keyword_threshold": 0.1,
"match_count": 1
}'
```
**响应**:
```json
{
"data": [
{
"id": "7d955251-3f79-4fd5-a6aa-02f81e044091",
"content": "有几位后来xxxxx",
"knowledge_id": "a6790b93-4700-4676-bd48-0d4804e1456b",
"chunk_index": 3,
"knowledge_title": "彗星.txt",
"start_at": 2287,
"end_at": 2760,
"seq": 3,
"score": 0.7402352891601821,
"match_type": 2,
"sub_chunk_id": null,
"metadata": {},
"chunk_type": "text",
"parent_chunk_id": "",
"image_info": "",
"knowledge_filename": "彗星.txt",
"knowledge_source": ""
}
],
"success": true
}
```
<div align="right"><a href="#weknora-api-文档">返回顶部 ↑</a></div>
### 知识管理API