From 8c802c3d019a39bf62a3f3a78efb95b7dd65433c Mon Sep 17 00:00:00 2001
From: CaptainB
Date: Tue, 2 Dec 2025 18:17:22 +0800
Subject: [PATCH] feat: add Markdown parsing support for QA handling

---
Review note (commentary only, not part of the commit):
  * Line structure and Python indentation were restored from a
    whitespace-collapsed copy of this patch; hunk line counts and the
    diffstat below were used as a cross-check.
  * The '<br>' tokens in the two cell.replace() calls and in the
    "padded row" comment were missing from that copy (replaced by raw
    line breaks) and have been reinstated -- confirm against the commit.
  * The From: line presumably lost its e-mail address the same way; it
    is left as found. TODO: restore it from the original commit.

 .../impl/table/csv_parse_table_handle.py      | 39 ++++++++++++++++---
 .../impl/table/xls_parse_table_handle.py      |  2 +-
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/apps/common/handle/impl/table/csv_parse_table_handle.py b/apps/common/handle/impl/table/csv_parse_table_handle.py
index 27853f559..05600ff0e 100644
--- a/apps/common/handle/impl/table/csv_parse_table_handle.py
+++ b/apps/common/handle/impl/table/csv_parse_table_handle.py
@@ -1,9 +1,10 @@
 # coding=utf-8
-import logging
+import csv
+import io
 import traceback
 
 from charset_normalizer import detect
-
+from common.handle.base_parse_qa_handle import get_title_row_index_dict, get_row_value
 from common.handle.base_parse_table_handle import BaseParseTableHandle
 from common.utils.logger import maxkb_logger
 
@@ -38,7 +39,33 @@ class CsvParseTableHandle(BaseParseTableHandle):
     def get_content(self, file, save_image):
         buffer = file.read()
         try:
-            return buffer.decode(detect(buffer)['encoding'])
-        except BaseException as e:
-            maxkb_logger.error(f'csv split handle error: {e}')
-            return f'error: {e}'
+            reader = csv.reader(io.TextIOWrapper(io.BytesIO(buffer), encoding=detect(buffer)['encoding']))
+            rows = list(reader)
+
+            if not rows:
+                return ""
+
+            # 构建 Markdown 表格
+            md_lines = []
+
+            # 添加表头
+            header = [cell.replace('\n', '<br>').replace('\r', '') for cell in rows[0]]
+            md_lines.append('| ' + ' | '.join(header) + ' |')
+
+            # 添加分隔线
+            md_lines.append('| ' + ' | '.join(['---'] * len(header)) + ' |')
+
+            # 添加数据行
+            for row in rows[1:]:
+                if row:  # 跳过空行
+                    # 确保行长度与表头一致,并将换行符转换为<br>
+                    padded_row = [
+                        cell.replace('\n', '<br>').replace('\r', '') for cell in row
+                    ] + [''] * (len(header) - len(row))
+                    md_lines.append('| ' + ' | '.join(padded_row[:len(header)]) + ' |')
+
+            return '\n'.join(md_lines)
+
+        except Exception as e:
+            maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
+            return ""
diff --git a/apps/common/handle/impl/table/xls_parse_table_handle.py b/apps/common/handle/impl/table/xls_parse_table_handle.py
index 74a5a2df6..5ba66bc64 100644
--- a/apps/common/handle/impl/table/xls_parse_table_handle.py
+++ b/apps/common/handle/impl/table/xls_parse_table_handle.py
@@ -89,5 +89,5 @@ class XlsParseTableHandle(BaseParseTableHandle):
             return md_tables
 
         except Exception as e:
-            max_kb.error(f'excel split handle error: {e}')
+            maxkb_logger.error(f'excel split handle error: {e}')
             return f'error: {e}'