From 587d1b2bd3f4e9397e3e7cf0ebad2d54db1471ec Mon Sep 17 00:00:00 2001 From: begoniezhao Date: Wed, 19 Nov 2025 17:27:17 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=96=B0=E5=A2=9E=20CSV=E3=80=81XLSX?= =?UTF-8?q?=E3=80=81XLS=20=E6=96=87=E4=BB=B6=E7=B1=BB=E5=9E=8B=E8=A7=A3?= =?UTF-8?q?=E6=9E=90=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docreader/parser/__init__.py | 4 ++ docreader/parser/base_parser.py | 3 ++ docreader/parser/csv_parser.py | 50 +++++++++++++++++++++ docreader/parser/excel_parser.py | 54 +++++++++++++++++++++++ docreader/parser/parser.py | 5 +++ frontend/src/utils/index.ts | 2 +- internal/application/service/knowledge.go | 2 +- 7 files changed, 118 insertions(+), 2 deletions(-) create mode 100644 docreader/parser/csv_parser.py create mode 100644 docreader/parser/excel_parser.py diff --git a/docreader/parser/__init__.py b/docreader/parser/__init__.py index 085b09f..497e1f0 100644 --- a/docreader/parser/__init__.py +++ b/docreader/parser/__init__.py @@ -13,8 +13,10 @@ The parsers extract content from documents and can split them into meaningful chunks for further processing and indexing. 
""" +from .csv_parser import CSVParser from .doc_parser import DocParser from .docx2_parser import Docx2Parser +from .excel_parser import ExcelParser from .image_parser import ImageParser from .markdown_parser import MarkdownParser from .parser import Parser @@ -32,4 +34,6 @@ __all__ = [ "ImageParser", # Parser for images with text content "WebParser", # Parser for web pages "Parser", # Main parser factory that selects the appropriate parser + "CSVParser", # Parser for CSV files + "ExcelParser", # Parser for Excel files ] diff --git a/docreader/parser/base_parser.py b/docreader/parser/base_parser.py index 418cbe9..e3a1b0c 100644 --- a/docreader/parser/base_parser.py +++ b/docreader/parser/base_parser.py @@ -339,6 +339,9 @@ class BaseParser(ABC): logger.info( f"Extracted {len(document.content)} characters from {self.file_name}" ) + if document.chunks: + return document + splitter = TextSplitter( chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap, diff --git a/docreader/parser/csv_parser.py b/docreader/parser/csv_parser.py new file mode 100644 index 0000000..f2a713d --- /dev/null +++ b/docreader/parser/csv_parser.py @@ -0,0 +1,50 @@ +import logging +from io import BytesIO +from typing import List + +import pandas as pd + +from docreader.models.document import Chunk, Document +from docreader.parser.base_parser import BaseParser + +logger = logging.getLogger(__name__) + + +class CSVParser(BaseParser): + def parse_into_text(self, content: bytes) -> Document: + chunks: List[Chunk] = [] + text: List[str] = [] + start, end = 0, 0 + + df = pd.read_csv(BytesIO(content), on_bad_lines="skip") + + for i, (idx, row) in enumerate(df.iterrows()): + content_row = ( + ",".join( + f"{col.strip()}: {str(row[col]).strip()}" for col in df.columns + ) + + "\n" + ) + end += len(content_row) + text.append(content_row) + chunks.append(Chunk(content=content_row, seq=i, start=start, end=end)) + start = end + + return Document( + content="".join(text), + chunks=chunks, + ) + + +if 
class ExcelParser(BaseParser):
    """Parser for Excel workbooks that emits one chunk per data row.

    All sheets are read in workbook order. Every row with at least one
    non-missing cell is rendered as a single
    ``"column: value,column: value\n"`` line; the document content is the
    concatenation of those lines, and each chunk records the character
    offsets of its line within that content.
    """

    def parse_into_text(self, content: bytes) -> Document:
        """Convert raw workbook bytes into a document with row-level chunks.

        Args:
            content: Raw bytes of the Excel file.

        Returns:
            A ``Document`` whose ``content`` is the rendered text of all
            non-empty rows across all sheets and whose ``chunks`` hold one
            entry per rendered row, numbered consecutively across sheets.
        """
        chunks: List[Chunk] = []
        lines: List[str] = []
        offset = 0

        # The context manager closes the underlying workbook reader even if
        # parsing a sheet raises.
        with pd.ExcelFile(BytesIO(content)) as workbook:
            for sheet_name in workbook.sheet_names:
                df = workbook.parse(sheet_name=sheet_name)
                headers = list(df.columns)

                # itertuples() keeps each column's dtype. iterrows() would
                # upcast a row to one dtype, rendering ints as "1.0".
                for values in df.itertuples(index=False, name=None):
                    fields = [
                        f"{header}: {value}"
                        for header, value in zip(headers, values)
                        if pd.notna(value)
                    ]
                    # Rows with no values at all carry no information.
                    if not fields:
                        continue

                    line = ",".join(fields) + "\n"
                    end = offset + len(line)
                    chunks.append(
                        Chunk(
                            content=line,
                            seq=len(chunks),
                            start=offset,
                            end=end,
                        )
                    )
                    lines.append(line)
                    offset = end

        return Document(content="".join(lines), chunks=chunks)
a/docreader/parser/parser.py +++ b/docreader/parser/parser.py @@ -4,8 +4,10 @@ from typing import Dict, Type from docreader.models.document import Document from docreader.models.read_config import ChunkingConfig from docreader.parser.base_parser import BaseParser +from docreader.parser.csv_parser import CSVParser from docreader.parser.doc_parser import DocParser from docreader.parser.docx2_parser import Docx2Parser +from docreader.parser.excel_parser import ExcelParser from docreader.parser.image_parser import ImageParser from docreader.parser.markdown_parser import MarkdownParser from docreader.parser.pdf_parser import PDFParser @@ -37,6 +39,9 @@ class Parser: "tiff": ImageParser, "webp": ImageParser, "markdown": MarkdownParser, + "csv": CSVParser, + "xlsx": ExcelParser, + "xls": ExcelParser, } logger.info( "Parser initialized with %d parsers: %s", diff --git a/frontend/src/utils/index.ts b/frontend/src/utils/index.ts index 04f8c71..631e22e 100644 --- a/frontend/src/utils/index.ts +++ b/frontend/src/utils/index.ts @@ -23,7 +23,7 @@ export function formatStringDate(date: any) { ); } export function kbFileTypeVerification(file: any) { - let validTypes = ["pdf", "txt", "md", "docx", "doc", "jpg", "jpeg", "png"]; + let validTypes = ["pdf", "txt", "md", "docx", "doc", "jpg", "jpeg", "png", "csv", "xlsx", "xls"]; let type = file.name.substring(file.name.lastIndexOf(".") + 1); if (!validTypes.includes(type)) { MessagePlugin.error("文件类型错误!"); diff --git a/internal/application/service/knowledge.go b/internal/application/service/knowledge.go index 911ce96..3215533 100644 --- a/internal/application/service/knowledge.go +++ b/internal/application/service/knowledge.go @@ -1353,7 +1353,7 @@ func (s *knowledgeService) UpdateKnowledge(ctx context.Context, knowledge *types // isValidFileType checks if a file type is supported func isValidFileType(filename string) bool { switch strings.ToLower(getFileType(filename)) { - case "pdf", "txt", "docx", "doc", "md", "markdown", "png", 
"jpg", "jpeg", "gif": + case "pdf", "txt", "docx", "doc", "md", "markdown", "png", "jpg", "jpeg", "gif", "csv", "xlsx", "xls": return true default: return false