feat: add Markdown parsing support for QA handling

This commit is contained in:
CaptainB 2025-12-02 18:17:22 +08:00
parent 582fb99b9e
commit 8c802c3d01
2 changed files with 34 additions and 7 deletions

View File

@@ -1,9 +1,10 @@
# coding=utf-8
import logging
import csv
import io
import traceback
from charset_normalizer import detect
from common.handle.base_parse_qa_handle import get_title_row_index_dict, get_row_value
from common.handle.base_parse_table_handle import BaseParseTableHandle
from common.utils.logger import maxkb_logger
@@ -38,7 +39,33 @@ class CsvParseTableHandle(BaseParseTableHandle):
def get_content(self, file, save_image):
    """Render an uploaded CSV file as a single Markdown table.

    The first non-empty CSV row becomes the table header; every later
    non-empty row becomes a body row, padded with empty cells or truncated
    so that it has exactly as many columns as the header.

    :param file: binary file-like object whose ``read()`` returns the raw
        CSV bytes. ``file.name`` is used only in the error log message.
    :param save_image: not used by this method.
    :return: the Markdown table as a string, or ``""`` when the file
        contains no rows or cannot be decoded/parsed (the error is logged).
    """
    buffer = file.read()

    def to_cell(value):
        # A Markdown table row must stay on one physical line and a raw "|"
        # would start a new column, so line breaks are folded into <br>
        # and pipes are backslash-escaped.
        return value.replace('\n', '<br>').replace('\r', '').replace('|', '\\|')

    def to_line(cells):
        return '| ' + ' | '.join(cells) + ' |'

    try:
        # The detector may report no encoding at all (e.g. for empty input);
        # fall back to UTF-8 instead of silently depending on the platform
        # locale, which is what TextIOWrapper does for encoding=None.
        encoding = detect(buffer)['encoding'] or 'utf-8'
        reader = csv.reader(io.TextIOWrapper(io.BytesIO(buffer), encoding=encoding))
        # csv.reader yields [] for blank lines. Dropping them up front keeps
        # a leading blank line from becoming a zero-column header that
        # would truncate every data row to nothing.
        rows = [row for row in reader if row]
        if not rows:
            return ""

        header = [to_cell(cell) for cell in rows[0]]
        width = len(header)
        # Header line followed by the mandatory Markdown delimiter row.
        md_lines = [to_line(header), to_line(['---'] * width)]
        for row in rows[1:]:
            # Force every body row to the header width: drop surplus cells,
            # then pad short rows with empty cells.
            cells = [to_cell(cell) for cell in row[:width]]
            cells += [''] * (width - len(cells))
            md_lines.append(to_line(cells))
        return '\n'.join(md_lines)
    except Exception as e:
        # Not every file-like object has a ``name``; never let the error
        # handler itself raise.
        file_name = getattr(file, 'name', '<unknown>')
        maxkb_logger.error(f"Error processing CSV file {file_name}: {e}, {traceback.format_exc()}")
        return ""

View File

@@ -89,5 +89,5 @@ class XlsParseTableHandle(BaseParseTableHandle):
return md_tables
except Exception as e:
max_kb.error(f'excel split handle error: {e}')
maxkb_logger.error(f'excel split handle error: {e}')
return f'error: {e}'