feat: 新增 CSV、XLSX、XLS 文件类型解析支持

This commit is contained in:
begoniezhao
2025-11-19 17:27:17 +08:00
parent 3a2c86df5b
commit 587d1b2bd3
7 changed files with 118 additions and 2 deletions

View File

@@ -13,8 +13,10 @@ The parsers extract content from documents and can split them into
meaningful chunks for further processing and indexing. meaningful chunks for further processing and indexing.
""" """
from .csv_parser import CSVParser
from .doc_parser import DocParser from .doc_parser import DocParser
from .docx2_parser import Docx2Parser from .docx2_parser import Docx2Parser
from .excel_parser import ExcelParser
from .image_parser import ImageParser from .image_parser import ImageParser
from .markdown_parser import MarkdownParser from .markdown_parser import MarkdownParser
from .parser import Parser from .parser import Parser
@@ -32,4 +34,6 @@ __all__ = [
"ImageParser", # Parser for images with text content "ImageParser", # Parser for images with text content
"WebParser", # Parser for web pages "WebParser", # Parser for web pages
"Parser", # Main parser factory that selects the appropriate parser "Parser", # Main parser factory that selects the appropriate parser
"CSVParser", # Parser for CSV files
"ExcelParser", # Parser for Excel files
] ]

View File

@@ -339,6 +339,9 @@ class BaseParser(ABC):
logger.info( logger.info(
f"Extracted {len(document.content)} characters from {self.file_name}" f"Extracted {len(document.content)} characters from {self.file_name}"
) )
if document.chunks:
return document
splitter = TextSplitter( splitter = TextSplitter(
chunk_size=self.chunk_size, chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap, chunk_overlap=self.chunk_overlap,

View File

@@ -0,0 +1,50 @@
import logging
from io import BytesIO
from typing import List
import pandas as pd
from docreader.models.document import Chunk, Document
from docreader.parser.base_parser import BaseParser
logger = logging.getLogger(__name__)
class CSVParser(BaseParser):
    """Parser that turns every CSV data row into its own chunk.

    Each row is rendered as ``"<column>: <value>"`` pairs joined by commas and
    terminated by a newline. The document content is the concatenation of all
    rendered rows.
    """

    def parse_into_text(self, content: bytes) -> Document:
        """Parse raw CSV bytes into a document with one chunk per row.

        Args:
            content: Raw bytes of the CSV file.

        Returns:
            A Document whose ``content`` is the concatenated row text and
            whose ``chunks`` hold one Chunk per row. ``start``/``end`` are
            character offsets of the row inside ``content``. Input that
            contains no parsable columns (e.g. a zero-byte file) yields an
            empty document.
        """
        try:
            # Malformed lines are skipped rather than aborting the whole file.
            df = pd.read_csv(BytesIO(content), on_bad_lines="skip")
        except pd.errors.EmptyDataError:
            # No header could be read at all. Return the same empty result a
            # header-only file produces instead of propagating the error.
            return Document(content="", chunks=[])

        # Column labels do not change between rows: strip them once up front.
        labels = [(col, str(col).strip()) for col in df.columns]

        chunks: List[Chunk] = []
        text: List[str] = []
        start = 0
        for seq, (_, row) in enumerate(df.iterrows()):
            content_row = (
                ",".join(
                    f"{label}: {str(row[col]).strip()}" for col, label in labels
                )
                + "\n"
            )
            end = start + len(content_row)
            text.append(content_row)
            chunks.append(Chunk(content=content_row, seq=seq, start=start, end=end))
            # The next row begins where this one ended.
            start = end

        return Document(
            content="".join(text),
            chunks=chunks,
        )
if __name__ == "__main__":
    # Manual smoke run: parse the CSV at the placeholder path below and dump
    # the full document text followed by every chunk to the log.
    logging.basicConfig(level=logging.DEBUG)

    your_file = "/path/to/your/file.csv"
    csv_parser = CSVParser()

    with open(your_file, "rb") as handle:
        raw_bytes = handle.read()

    parsed = csv_parser.parse_into_text(raw_bytes)
    logger.error(parsed.content)
    for piece in parsed.chunks:
        logger.error(piece.content)

View File

@@ -0,0 +1,54 @@
import logging
from io import BytesIO
from typing import List
import pandas as pd
from docreader.models.document import Chunk, Document
from docreader.parser.base_parser import BaseParser
logger = logging.getLogger(__name__)
class ExcelParser(BaseParser):
    """Parser that turns every non-empty Excel row into its own chunk.

    All sheets of the workbook are processed in order. Each row is rendered
    as ``"<column>: <value>"`` pairs (empty cells omitted) joined by commas
    and terminated by a newline.
    """

    def parse_into_text(self, content: bytes) -> Document:
        """Parse raw Excel bytes into a document with one chunk per row.

        Args:
            content: Raw bytes of the workbook.

        Returns:
            A Document whose ``content`` is the concatenated row text of all
            sheets and whose ``chunks`` hold one Chunk per rendered row.
            ``seq`` numbers run continuously across sheets and
            ``start``/``end`` are character offsets inside ``content``.
        """
        chunks: List[Chunk] = []
        text: List[str] = []
        start = 0

        # ExcelFile keeps the underlying workbook open; the context manager
        # guarantees it is closed even if parsing a sheet raises.
        with pd.ExcelFile(BytesIO(content)) as excel_file:
            for excel_sheet_name in excel_file.sheet_names:
                df = excel_file.parse(sheet_name=excel_sheet_name)
                # Drop rows in which every cell is empty.
                df.dropna(how="all", inplace=True)

                for _, row in df.iterrows():
                    # Only cells that actually hold a value are rendered.
                    page_content = [
                        f"{k}: {v}" for k, v in row.items() if pd.notna(v)
                    ]
                    if not page_content:
                        continue

                    content_row = ",".join(page_content) + "\n"
                    end = start + len(content_row)
                    text.append(content_row)
                    chunks.append(
                        Chunk(
                            content=content_row,
                            seq=len(chunks),
                            start=start,
                            end=end,
                        )
                    )
                    # The next row begins where this one ended.
                    start = end

        return Document(content="".join(text), chunks=chunks)
if __name__ == "__main__":
    # Manual smoke run: parse the workbook at the placeholder path below and
    # log the full document text plus the first chunk only.
    logging.basicConfig(level=logging.DEBUG)

    your_file = "/path/to/your/file.xlsx"
    excel_parser = ExcelParser()

    with open(your_file, "rb") as handle:
        raw_bytes = handle.read()

    parsed = excel_parser.parse_into_text(raw_bytes)
    logger.error(parsed.content)

    # Only the first chunk is printed, if there is one.
    first_piece = next(iter(parsed.chunks), None)
    if first_piece is not None:
        logger.error(first_piece.content)

View File

@@ -4,8 +4,10 @@ from typing import Dict, Type
from docreader.models.document import Document from docreader.models.document import Document
from docreader.models.read_config import ChunkingConfig from docreader.models.read_config import ChunkingConfig
from docreader.parser.base_parser import BaseParser from docreader.parser.base_parser import BaseParser
from docreader.parser.csv_parser import CSVParser
from docreader.parser.doc_parser import DocParser from docreader.parser.doc_parser import DocParser
from docreader.parser.docx2_parser import Docx2Parser from docreader.parser.docx2_parser import Docx2Parser
from docreader.parser.excel_parser import ExcelParser
from docreader.parser.image_parser import ImageParser from docreader.parser.image_parser import ImageParser
from docreader.parser.markdown_parser import MarkdownParser from docreader.parser.markdown_parser import MarkdownParser
from docreader.parser.pdf_parser import PDFParser from docreader.parser.pdf_parser import PDFParser
@@ -37,6 +39,9 @@ class Parser:
"tiff": ImageParser, "tiff": ImageParser,
"webp": ImageParser, "webp": ImageParser,
"markdown": MarkdownParser, "markdown": MarkdownParser,
"csv": CSVParser,
"xlsx": ExcelParser,
"xls": ExcelParser,
} }
logger.info( logger.info(
"Parser initialized with %d parsers: %s", "Parser initialized with %d parsers: %s",

View File

@@ -23,7 +23,7 @@ export function formatStringDate(date: any) {
); );
} }
export function kbFileTypeVerification(file: any) { export function kbFileTypeVerification(file: any) {
let validTypes = ["pdf", "txt", "md", "docx", "doc", "jpg", "jpeg", "png"]; let validTypes = ["pdf", "txt", "md", "docx", "doc", "jpg", "jpeg", "png", "csv", "xlsx", "xls"];
let type = file.name.substring(file.name.lastIndexOf(".") + 1); let type = file.name.substring(file.name.lastIndexOf(".") + 1);
if (!validTypes.includes(type)) { if (!validTypes.includes(type)) {
MessagePlugin.error("文件类型错误!"); MessagePlugin.error("文件类型错误!");

View File

@@ -1353,7 +1353,7 @@ func (s *knowledgeService) UpdateKnowledge(ctx context.Context, knowledge *types
// isValidFileType checks if a file type is supported // isValidFileType checks if a file type is supported
func isValidFileType(filename string) bool { func isValidFileType(filename string) bool {
switch strings.ToLower(getFileType(filename)) { switch strings.ToLower(getFileType(filename)) {
case "pdf", "txt", "docx", "doc", "md", "markdown", "png", "jpg", "jpeg", "gif": case "pdf", "txt", "docx", "doc", "md", "markdown", "png", "jpg", "jpeg", "gif", "csv", "xlsx", "xls":
return true return true
default: default:
return false return false