mirror of
https://github.com/Tencent/WeKnora.git
synced 2025-11-25 03:15:00 +08:00
Compare commits
2 Commits
587d1b2bd3
...
b9f264b883
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b9f264b883 | ||
|
|
154025f723 |
@@ -6,6 +6,7 @@ from docreader.parser.base_parser import BaseParser
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
class FirstParser(BaseParser):
|
||||
@@ -16,16 +17,15 @@ class FirstParser(BaseParser):
|
||||
|
||||
self._parsers: List[BaseParser] = []
|
||||
for parser_cls in self._parser_cls:
|
||||
try:
|
||||
parser = parser_cls(*args, **kwargs)
|
||||
self._parsers.append(parser)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
|
||||
parser = parser_cls(*args, **kwargs)
|
||||
self._parsers.append(parser)
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
for p in self._parsers:
|
||||
logger.info(f"FirstParser: using parser {p.__class__.__name__}")
|
||||
document = p.parse_into_text(content)
|
||||
if document.is_valid():
|
||||
logger.info(f"FirstParser: parser {p.__class__.__name__} succeeded")
|
||||
return document
|
||||
return Document()
|
||||
|
||||
@@ -43,16 +43,14 @@ class PipelineParser(BaseParser):
|
||||
|
||||
self._parsers: List[BaseParser] = []
|
||||
for parser_cls in self._parser_cls:
|
||||
try:
|
||||
parser = parser_cls(*args, **kwargs)
|
||||
self._parsers.append(parser)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create parser {parser_cls.__name__}: {e}")
|
||||
parser = parser_cls(*args, **kwargs)
|
||||
self._parsers.append(parser)
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
images: Dict[str, str] = {}
|
||||
document = Document()
|
||||
for p in self._parsers:
|
||||
logger.info(f"PipelineParser: using parser {p.__class__.__name__}")
|
||||
document = p.parse_into_text(content)
|
||||
content = endecode.encode_bytes(document.content)
|
||||
images.update(document.images)
|
||||
|
||||
@@ -28,7 +28,6 @@ class StdMinerUParser(BaseParser):
|
||||
self.image_helper = MarkdownImageUtil()
|
||||
self.base64_pattern = re.compile(r"data:image/(\w+);base64,(.*)")
|
||||
self.enable = self.ping()
|
||||
assert self.ping(), "MinerU API is not reachable"
|
||||
|
||||
def ping(self, timeout: int = 5) -> bool:
|
||||
try:
|
||||
@@ -41,6 +40,10 @@ class StdMinerUParser(BaseParser):
|
||||
return False
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
if not self.enable:
|
||||
logger.debug("MinerU API is not enabled")
|
||||
return Document()
|
||||
|
||||
logger.info(f"Parsing scanned PDF via MinerU API (size: {len(content)} bytes)")
|
||||
md_content: str = ""
|
||||
images_b64: Dict[str, str] = {}
|
||||
|
||||
46
docs/API.md
46
docs/API.md
@@ -334,7 +334,6 @@ curl --location 'http://localhost:8080/api/v1/tenants' \
|
||||
| GET | `/knowledge-bases/:id` | 获取知识库详情 |
|
||||
| PUT | `/knowledge-bases/:id` | 更新知识库 |
|
||||
| DELETE | `/knowledge-bases/:id` | 删除知识库 |
|
||||
| GET | `/knowledge-bases/:id/hybrid-search` | 混合搜索知识库内容 |
|
||||
| POST | `/knowledge-bases/copy` | 拷贝知识库 |
|
||||
|
||||
#### POST `/knowledge-bases` - 创建知识库
|
||||
@@ -656,51 +655,6 @@ curl --location --request DELETE 'http://localhost:8080/api/v1/knowledge-bases/b
|
||||
}
|
||||
```
|
||||
|
||||
#### GET `/knowledge-bases/:id/hybrid-search` - 混合搜索知识库内容
|
||||
|
||||
**请求**:
|
||||
|
||||
```curl
|
||||
curl --location --request GET 'http://localhost:8080/api/v1/knowledge-bases/kb-00000001/hybrid-search' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'X-API-Key: sk-vQHV2NZI_LK5W7wHQvH3yGYExX8YnhaHwZipUYbiZKCYJbBQ' \
|
||||
--data '{
|
||||
"query_text": "彗星",
|
||||
"vector_threshold": 0.1,
|
||||
"keyword_threshold": 0.1,
|
||||
"match_count": 1
|
||||
}'
|
||||
```
|
||||
|
||||
**响应**:
|
||||
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"id": "7d955251-3f79-4fd5-a6aa-02f81e044091",
|
||||
"content": "有几位后来xxxxx",
|
||||
"knowledge_id": "a6790b93-4700-4676-bd48-0d4804e1456b",
|
||||
"chunk_index": 3,
|
||||
"knowledge_title": "彗星.txt",
|
||||
"start_at": 2287,
|
||||
"end_at": 2760,
|
||||
"seq": 3,
|
||||
"score": 0.7402352891601821,
|
||||
"match_type": 2,
|
||||
"sub_chunk_id": null,
|
||||
"metadata": {},
|
||||
"chunk_type": "text",
|
||||
"parent_chunk_id": "",
|
||||
"image_info": "",
|
||||
"knowledge_filename": "彗星.txt",
|
||||
"knowledge_source": ""
|
||||
}
|
||||
],
|
||||
"success": true
|
||||
}
|
||||
```
|
||||
|
||||
<div align="right"><a href="#weknora-api-文档">返回顶部 ↑</a></div>
|
||||
|
||||
### 知识管理API
|
||||
|
||||
Reference in New Issue
Block a user