chore: 文档内容无法提取的时候输出错误信息

This commit is contained in:
CaptainB 2024-11-22 17:54:06 +08:00 committed by 刘瑞斌
parent 98db08d263
commit 64e8f4dc9f
3 changed files with 46 additions and 37 deletions

View File

@@ -41,4 +41,4 @@ class CsvSplitHandle(BaseParseTableHandle):
return buffer.decode(detect(buffer)['encoding'])
except BaseException as e:
max_kb.error(f'csv split handle error: {e}')
return [{'name': file.name, 'paragraphs': []}]
return f'error: {e}'

View File

@@ -63,21 +63,26 @@ class XlsSplitHandle(BaseParseTableHandle):
def get_content(self, file):
# 打开 .xls 文件
workbook = xlrd.open_workbook(file_contents=file.read(), formatting_info=True)
sheets = workbook.sheets()
md_tables = ''
for sheet in sheets:
try:
workbook = xlrd.open_workbook(file_contents=file.read(), formatting_info=True)
sheets = workbook.sheets()
md_tables = ''
for sheet in sheets:
# 获取表头和内容
headers = sheet.row_values(0)
data = [sheet.row_values(row_idx) for row_idx in range(1, sheet.nrows)]
# 获取表头和内容
headers = sheet.row_values(0)
data = [sheet.row_values(row_idx) for row_idx in range(1, sheet.nrows)]
# 构建 Markdown 表格
md_table = '| ' + ' | '.join(headers) + ' |\n'
md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
for row in data:
# 将每个单元格中的内容替换换行符为 <br> 以保留原始格式
md_table += '| ' + ' | '.join([str(cell).replace('\n', '<br>') if cell else '' for cell in row]) + ' |\n'
md_tables += md_table + '\n\n'
# 构建 Markdown 表格
md_table = '| ' + ' | '.join(headers) + ' |\n'
md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
for row in data:
# 将每个单元格中的内容替换换行符为 <br> 以保留原始格式
md_table += '| ' + ' | '.join(
[str(cell).replace('\n', '<br>') if cell else '' for cell in row]) + ' |\n'
md_tables += md_table + '\n\n'
return md_tables
return md_tables
except Exception as e:
max_kb.error(f'excel split handle error: {e}')
return f'error: {e}'

View File

@@ -75,28 +75,32 @@ class XlsxSplitHandle(BaseParseTableHandle):
def get_content(self, file):
# 加载 Excel 文件
workbook = load_workbook(file)
md_tables = ''
# 如果未指定 sheet_name则使用第一个工作表
for sheetname in workbook.sheetnames:
sheet = workbook[sheetname] if sheetname else workbook.active
try:
# 加载 Excel 文件
workbook = load_workbook(file)
md_tables = ''
# 如果未指定 sheet_name则使用第一个工作表
for sheetname in workbook.sheetnames:
sheet = workbook[sheetname] if sheetname else workbook.active
# 获取工作表的所有行
rows = list(sheet.iter_rows(values_only=True))
if not rows:
continue
# 获取工作表的所有行
rows = list(sheet.iter_rows(values_only=True))
if not rows:
continue
# 提取表头和内容
headers = rows[0]
data = rows[1:]
# 提取表头和内容
headers = rows[0]
data = rows[1:]
# 构建 Markdown 表格
md_table = '| ' + ' | '.join(headers) + ' |\n'
md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
for row in data:
md_table += '| ' + ' | '.join(
[str(cell).replace('\n', '<br>') if cell is not None else '' for cell in row]) + ' |\n'
# 构建 Markdown 表格
md_table = '| ' + ' | '.join(headers) + ' |\n'
md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
for row in data:
md_table += '| ' + ' | '.join(
[str(cell).replace('\n', '<br>') if cell is not None else '' for cell in row]) + ' |\n'
md_tables += md_table + '\n\n'
return md_tables
md_tables += md_table + '\n\n'
return md_tables
except Exception as e:
max_kb.error(f'excel split handle error: {e}')
return f'error: {e}'