MaxKB/apps/common/handle/impl/xls_split_handle.py
CaptainB 8d503c8bf8 fix: update post_cell function to handle different newline characters in cell values
--bug=1054683 --user=刘瑞斌 【github#2831】知识库上传excel、应用编排文档内容提取节点中上传excel,单元格中有换行,导入后没有在一个单元格里显示 https://www.tapd.cn/57709429/s/1690232
2025-04-24 16:05:09 +08:00

81 lines
2.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
"""
@project: maxkb
@Author
@file: xls_split_handle.py
@date: 2024/5/21 14:59
@desc: Split legacy Excel (.xls) workbooks into Markdown-table paragraphs.
"""
from typing import List
import xlrd
from common.handle.base_split_handle import BaseSplitHandle
def post_cell(cell_value):
    """Escape a cell's text so it fits inside a single Markdown table cell.

    Every line break (CRLF, LF, or a bare CR) is replaced by ``<br>`` and
    every pipe character by the HTML entity ``&#124;``, so the value can
    neither end the table row early nor introduce an extra column.

    :param cell_value: the cell content, already converted to ``str``
    :return: the escaped text
    """
    return (
        cell_value
        # CRLF must be handled first so it yields one <br>, not two.
        .replace('\r\n', '<br>')
        .replace('\n', '<br>')
        # A lone carriage return is also a line ending in Markdown.
        .replace('\r', '<br>')
        .replace('|', '&#124;')
    )
def row_to_md(row):
    """Render one row of cell values as a single Markdown table line.

    Each value that is not ``None`` is converted with ``str`` and escaped by
    ``post_cell``; a ``None`` value becomes an empty cell.

    :param row: iterable of cell values
    :return: the row as ``| a | b |`` followed by a newline
    """
    rendered = []
    for value in row:
        if value is None:
            rendered.append('')
        else:
            rendered.append(post_cell(str(value)))
    return '| {} |\n'.format(' | '.join(rendered))
def handle_sheet(file_name, sheet, limit: int):
    """Split one worksheet into Markdown-table paragraphs.

    The first row is used as the table header. The remaining rows are packed
    into paragraphs; every paragraph starts with the header line and the
    ``---`` separator line. The current paragraph is closed as soon as
    appending the next row would make its length reach or exceed ``limit``
    characters. A paragraph always holds at least one data row, even when
    that row alone is longer than ``limit``.

    :param file_name: value stored under the ``name`` key of the result
    :param sheet: worksheet object exposing ``nrows`` and ``row_values(i)``
    :param limit: length threshold (in characters) for one paragraph
    :return: ``{'name': file_name, 'content': [...]}`` where every content
        item is ``{'content': <markdown>, 'title': ''}``; the list is empty
        when the sheet has no rows, an empty first row, or only a header row
    """
    all_rows = [sheet.row_values(index) for index in range(sheet.nrows)]
    row_iter = iter(all_rows)
    chunks = []
    outcome = {'name': file_name, 'content': chunks}

    try:
        header_cells = next(row_iter)
        header_md = row_to_md(header_cells)
        separator_cells = ['---' if cell is not None else '' for cell in header_cells]
        header_md += '| ' + ' | '.join(separator_cells) + ' |\n'
    except Exception:
        # An empty sheet raises StopIteration here; any failure while
        # building the header yields a result with no paragraphs.
        return outcome

    if not header_cells:
        return outcome

    current = ''
    for cells in row_iter:
        row_md = row_to_md(cells)
        if not current:
            # First data row: open the first paragraph with the header.
            current = header_md + row_md
        elif len(current) + len(row_md) < limit:
            current += row_md
        else:
            # The paragraph is full: store it and start a new one that
            # repeats the header.
            chunks.append({'content': current, 'title': ''})
            current = header_md + row_md

    if current:
        chunks.append({'content': current, 'title': ''})
    return outcome
class XlsSplitHandle(BaseSplitHandle):
    """Split handler that turns ``.xls`` workbooks into Markdown-table paragraphs."""

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """Parse every worksheet of ``file`` into paragraphs.

        ``pattern_list``, ``with_filter`` and ``save_image`` are accepted but
        not used by this implementation.

        :param file: uploaded file object; its ``name`` attribute is read
        :param limit: paragraph length threshold passed on to ``handle_sheet``
        :param get_buffer: callable returning the raw content of ``file``
        :return: one ``handle_sheet`` result per worksheet; if the workbook
            cannot be processed, a single entry named after the file with an
            empty content list
        """
        content = get_buffer(file)
        try:
            book = xlrd.open_workbook(file_contents=content)
            sheets = book.sheets()
            sheet_count = len(sheets)
            parsed = []
            for sheet in sheets:
                # A workbook holding only the default 'Sheet1' is labelled
                # with the file name; otherwise the sheet name is used.
                if sheet_count == 1 and sheet.name == 'Sheet1':
                    label = file.name
                else:
                    label = sheet.name
                parsed.append(handle_sheet(label, sheet, limit))
            return [item for item in parsed if item is not None]
        except Exception:
            return [{'name': file.name, 'content': []}]

    def get_content(self, file, save_image):
        """Not implemented for this handler; always returns ``None``."""
        return None

    def support(self, file, get_buffer):
        """Tell whether this handler accepts ``file``.

        :param file: uploaded file object; its ``name`` attribute is read
        :param get_buffer: callable returning the raw content of ``file``
        :return: ``True`` when the lower-cased name ends with ``.xls`` and
            ``xlrd.inspect_format`` returns a truthy value for the content,
            otherwise ``False``
        """
        lowered_name: str = file.name.lower()
        content = get_buffer(file)
        if not lowered_name.endswith(".xls"):
            return False
        return bool(xlrd.inspect_format(content=content))