Mirror of https://github.com/Tencent/WeKnora.git, synced 2025-11-25 03:15:00 +08:00
feat: Add parsing support for CSV, XLSX, and XLS file types
@@ -13,8 +13,10 @@ The parsers extract content from documents and can split them into
 meaningful chunks for further processing and indexing.
 """
 
+from .csv_parser import CSVParser
 from .doc_parser import DocParser
 from .docx2_parser import Docx2Parser
+from .excel_parser import ExcelParser
 from .image_parser import ImageParser
 from .markdown_parser import MarkdownParser
 from .parser import Parser
@@ -32,4 +34,6 @@ __all__ = [
     "ImageParser",  # Parser for images with text content
     "WebParser",  # Parser for web pages
     "Parser",  # Main parser factory that selects the appropriate parser
+    "CSVParser",  # Parser for CSV files
+    "ExcelParser",  # Parser for Excel files
 ]
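
With the two classes exported above, callers can import them from the package directly. A minimal sketch, assuming this hunk is the parser package's __init__ (the relative imports suggest docreader/parser/__init__.py) and that the docreader package is importable:

    # Hedged sketch: relies only on the exports added in this hunk.
    from docreader.parser import CSVParser, ExcelParser

    print(CSVParser, ExcelParser)  # both resolve through the package __init__
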
@@ -339,6 +339,9 @@ class BaseParser(ABC):
         logger.info(
             f"Extracted {len(document.content)} characters from {self.file_name}"
         )
+        if document.chunks:
+            return document
+
         splitter = TextSplitter(
             chunk_size=self.chunk_size,
             chunk_overlap=self.chunk_overlap,
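
The three added lines short-circuit the splitting step: a parser that already returns row-level chunks (as the new CSVParser and ExcelParser do) keeps them, while parsers that return plain text still fall through to the TextSplitter. A self-contained sketch of that control flow, using stand-in types rather than the real docreader models:

    # Stand-in types for illustration only; not the real Document or TextSplitter.
    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class Doc:
        content: str
        chunks: List[str] = field(default_factory=list)

    def finalize(doc: Doc, chunk_size: int = 10) -> Doc:
        if doc.chunks:      # parser already produced chunks (e.g. one per row)
            return doc      # keep them; skip the generic splitting step
        # generic path, analogous to the TextSplitter call that follows in BaseParser
        doc.chunks = [doc.content[i:i + chunk_size]
                      for i in range(0, len(doc.content), chunk_size)]
        return doc

    print(finalize(Doc("plain text that still needs splitting")).chunks)
    print(finalize(Doc("row text", chunks=["row text"])).chunks)  # returned as-is
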
docreader/parser/csv_parser.py (new file, 50 lines)
@@ -0,0 +1,50 @@
+import logging
+from io import BytesIO
+from typing import List
+
+import pandas as pd
+
+from docreader.models.document import Chunk, Document
+from docreader.parser.base_parser import BaseParser
+
+logger = logging.getLogger(__name__)
+
+
+class CSVParser(BaseParser):
+    def parse_into_text(self, content: bytes) -> Document:
+        chunks: List[Chunk] = []
+        text: List[str] = []
+        start, end = 0, 0
+
+        df = pd.read_csv(BytesIO(content), on_bad_lines="skip")
+
+        for i, (idx, row) in enumerate(df.iterrows()):
+            content_row = (
+                ",".join(
+                    f"{col.strip()}: {str(row[col]).strip()}" for col in df.columns
+                )
+                + "\n"
+            )
+            end += len(content_row)
+            text.append(content_row)
+            chunks.append(Chunk(content=content_row, seq=i, start=start, end=end))
+            start = end
+
+        return Document(
+            content="".join(text),
+            chunks=chunks,
+        )
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+
+    your_file = "/path/to/your/file.csv"
+    parser = CSVParser()
+    with open(your_file, "rb") as f:
+        content = f.read()
+        document = parser.parse_into_text(content)
+        logger.error(document.content)
+
+        for chunk in document.chunks:
+            logger.error(chunk.content)
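
A short usage sketch (assuming the docreader package is importable and pandas is installed): each CSV row is flattened into comma-joined "column: value" pairs and becomes one chunk, with start and end recording its character offsets inside the concatenated document content.

    from docreader.parser.csv_parser import CSVParser

    csv_bytes = b"name,age\nAlice,30\nBob,25\n"
    doc = CSVParser().parse_into_text(csv_bytes)

    print(doc.content)
    # name: Alice,age: 30
    # name: Bob,age: 25
    print(len(doc.chunks))                         # 2 chunks, one per row
    print(doc.chunks[1].start, doc.chunks[1].end)  # offsets of the second row
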
docreader/parser/excel_parser.py (new file, 54 lines)
@@ -0,0 +1,54 @@
+import logging
+from io import BytesIO
+from typing import List
+
+import pandas as pd
+
+from docreader.models.document import Chunk, Document
+from docreader.parser.base_parser import BaseParser
+
+logger = logging.getLogger(__name__)
+
+
+class ExcelParser(BaseParser):
+    def parse_into_text(self, content: bytes) -> Document:
+        chunks: List[Chunk] = []
+        text: List[str] = []
+        start, end = 0, 0
+
+        excel_file = pd.ExcelFile(BytesIO(content))
+        for excel_sheet_name in excel_file.sheet_names:
+            df = excel_file.parse(sheet_name=excel_sheet_name)
+            df.dropna(how="all", inplace=True)
+
+            for _, row in df.iterrows():
+                page_content = []
+                for k, v in row.items():
+                    if pd.notna(v):
+                        page_content.append(f"{k}: {v}")
+                if not page_content:
+                    continue
+                content_row = ",".join(page_content) + "\n"
+                end += len(content_row)
+                text.append(content_row)
+                chunks.append(
+                    Chunk(content=content_row, seq=len(chunks), start=start, end=end)
+                )
+                start = end
+
+        return Document(content="".join(text), chunks=chunks)
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+
+    your_file = "/path/to/your/file.xlsx"
+    parser = ExcelParser()
+    with open(your_file, "rb") as f:
+        content = f.read()
+        document = parser.parse_into_text(content)
+        logger.error(document.content)
+
+        for chunk in document.chunks:
+            logger.error(chunk.content)
+            break
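
A comparable sketch for the Excel parser (assuming pandas plus an Excel engine such as openpyxl is available): every sheet is read, rows that are entirely empty are dropped, NaN cells are skipped, and each remaining row becomes one chunk.

    from io import BytesIO

    import pandas as pd

    from docreader.parser.excel_parser import ExcelParser

    # Build a tiny in-memory workbook just for the demonstration.
    buf = BytesIO()
    pd.DataFrame({"name": ["Alice", "Bob"], "age": [30, 25]}).to_excel(
        buf, index=False, engine="openpyxl"
    )

    doc = ExcelParser().parse_into_text(buf.getvalue())
    print(doc.content)
    # name: Alice,age: 30
    # name: Bob,age: 25
    print(len(doc.chunks))  # 2
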
@@ -4,8 +4,10 @@ from typing import Dict, Type
 from docreader.models.document import Document
 from docreader.models.read_config import ChunkingConfig
 from docreader.parser.base_parser import BaseParser
+from docreader.parser.csv_parser import CSVParser
 from docreader.parser.doc_parser import DocParser
 from docreader.parser.docx2_parser import Docx2Parser
+from docreader.parser.excel_parser import ExcelParser
 from docreader.parser.image_parser import ImageParser
 from docreader.parser.markdown_parser import MarkdownParser
 from docreader.parser.pdf_parser import PDFParser
@@ -37,6 +39,9 @@ class Parser:
             "tiff": ImageParser,
             "webp": ImageParser,
             "markdown": MarkdownParser,
+            "csv": CSVParser,
+            "xlsx": ExcelParser,
+            "xls": ExcelParser,
         }
         logger.info(
             "Parser initialized with %d parsers: %s",
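
The mapping above ties the new extensions to their parser classes. The selection logic itself is not part of this diff; the following is only a hypothetical sketch of how such an extension lookup could behave (the helper name and behavior are illustrative, not the project's actual API):

    # Hypothetical helper, not taken from the repository.
    from docreader.parser.csv_parser import CSVParser
    from docreader.parser.excel_parser import ExcelParser

    file_parsers = {"csv": CSVParser, "xlsx": ExcelParser, "xls": ExcelParser}

    def pick_parser(filename: str):
        ext = filename.rsplit(".", 1)[-1].lower()
        return file_parsers.get(ext)

    print(pick_parser("sales.xlsx"))  # ExcelParser
    print(pick_parser("notes.txt"))   # None (not handled by this sketch)
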
@@ -23,7 +23,7 @@ export function formatStringDate(date: any) {
   );
 }
 export function kbFileTypeVerification(file: any) {
-  let validTypes = ["pdf", "txt", "md", "docx", "doc", "jpg", "jpeg", "png"];
+  let validTypes = ["pdf", "txt", "md", "docx", "doc", "jpg", "jpeg", "png", "csv", "xlsx", "xls"];
   let type = file.name.substring(file.name.lastIndexOf(".") + 1);
   if (!validTypes.includes(type)) {
     MessagePlugin.error("文件类型错误!");
@@ -1353,7 +1353,7 @@ func (s *knowledgeService) UpdateKnowledge(ctx context.Context, knowledge *types
 // isValidFileType checks if a file type is supported
 func isValidFileType(filename string) bool {
     switch strings.ToLower(getFileType(filename)) {
-    case "pdf", "txt", "docx", "doc", "md", "markdown", "png", "jpg", "jpeg", "gif":
+    case "pdf", "txt", "docx", "doc", "md", "markdown", "png", "jpg", "jpeg", "gif", "csv", "xlsx", "xls":
         return true
     default:
         return false