refactor: 处理纵向合并的单元格

This commit is contained in:
CaptainB 2024-09-18 11:35:14 +08:00 committed by 刘瑞斌
parent 45bf3477d1
commit 3e3b77e34d
2 changed files with 67 additions and 35 deletions

View File

@ -19,26 +19,41 @@ class XlsSplitHandle(BaseParseTableHandle):
def handle(self, file, get_buffer, save_image):
buffer = get_buffer(file)
try:
wb = xlrd.open_workbook(file_contents=buffer)
wb = xlrd.open_workbook(file_contents=buffer, formatting_info=True)
result = []
sheets = wb.sheets()
for sheet in sheets:
# 获取合并单元格的范围信息
merged_cells = sheet.merged_cells
print(merged_cells)
data = []
paragraphs = []
rows = iter([sheet.row_values(i) for i in range(sheet.nrows)])
if not rows: continue
ti = next(rows)
for r in rows:
l = []
for i, c in enumerate(r):
if not c:
continue
t = str(ti[i]) if i < len(ti) else ""
t += (": " if t else "") + str(c)
l.append(t)
l = "; ".join(l)
if sheet.name.lower().find("sheet") < 0:
l += " ——" + sheet.name
paragraphs.append({'title': '', 'content': l})
# 获取第一行作为标题行
headers = [sheet.cell_value(0, col_idx) for col_idx in range(sheet.ncols)]
# 从第二行开始遍历每一行(跳过标题行)
for row_idx in range(1, sheet.nrows):
row_data = {}
for col_idx in range(sheet.ncols):
cell_value = sheet.cell_value(row_idx, col_idx)
# 检查是否为空单元格,如果为空检查是否在合并区域中
if cell_value == "":
# 检查当前单元格是否在合并区域
for (rlo, rhi, clo, chi) in merged_cells:
if rlo <= row_idx < rhi and clo <= col_idx < chi:
# 使用合并区域的左上角单元格的值
cell_value = sheet.cell_value(rlo, clo)
break
# 将标题作为键,单元格的值作为值存入字典
row_data[headers[col_idx]] = cell_value
data.append(row_data)
for row in data:
row_output = "; ".join([f"{key}: {value}" for key, value in row.items()])
# print(row_output)
paragraphs.append({'title': '', 'content': row_output})
result.append({'name': sheet.name, 'paragraphs': paragraphs})
except BaseException as e:

View File

@ -17,6 +17,35 @@ class XlsxSplitHandle(BaseParseTableHandle):
return True
return False
def fill_merged_cells(self, sheet, image_dict):
    """Turn a worksheet into one dict per data row, filling merged cells.

    Row 1 supplies the dict keys. Every later row becomes a dict mapping
    each header value to the value of the cell in that column. A cell whose
    value is ``None`` and that lies inside a merged range takes the value of
    the range's top-left cell. A value that is a key of ``image_dict`` is
    replaced by a Markdown image link built from the matched object's ``id``.

    Args:
        sheet: Worksheet-like object providing ``merged_cells.ranges``,
            ``sheet[1]``, ``iter_rows`` and ``cell`` (presumably an openpyxl
            worksheet -- confirm against the caller).
        image_dict: Mapping from a cell value to an object with an ``id``
            attribute.

    Returns:
        A list with one dict per row from row 2 onwards. Columns that share
        the same header value overwrite each other (the last one wins).
    """
    # Read the merged-range bounds once instead of re-parsing each empty
    # cell's coordinate string against every range.
    merged_bounds = [
        (rng.min_row, rng.max_row, rng.min_col, rng.max_col)
        for rng in sheet.merged_cells.ranges
    ]
    # The first row is the header row.
    headers = [cell.value for cell in sheet[1]]

    data = []
    # Data rows start at the second row.
    for row in sheet.iter_rows(min_row=2, values_only=False):
        row_data = {}
        for col_idx, cell in enumerate(row):
            cell_value = cell.value
            # An empty cell inside a merged range inherits the value of the
            # range's top-left cell.
            if cell_value is None:
                for min_row, max_row, min_col, max_col in merged_bounds:
                    if (min_row <= cell.row <= max_row
                            and min_col <= cell.column <= max_col):
                        # Fetch just the anchor cell rather than building
                        # the whole row to index into it.
                        cell_value = sheet.cell(row=min_row, column=min_col).value
                        break
            image = image_dict.get(cell_value)
            if image is not None:
                cell_value = f'![](/api/image/{image.id})'
            # Header value is the key, cell value is the value.
            row_data[headers[col_idx]] = cell_value
        data.append(row_data)
    return data
def handle(self, file, get_buffer, save_image):
buffer = get_buffer(file)
try:
@ -30,25 +59,13 @@ class XlsxSplitHandle(BaseParseTableHandle):
for sheetname in wb.sheetnames:
paragraphs = []
ws = wb[sheetname]
rows = list(ws.rows)
if not rows: continue
ti = list(rows[0])
for r in list(rows[1:]):
l = []
for i, c in enumerate(r):
if not c.value:
continue
t = str(ti[i].value) if i < len(ti) else ""
content = str(c.value)
image = image_dict.get(content, None)
if image is not None:
content = f'![](/api/image/{image.id})'
t += (": " if t else "") + content
l.append(t)
l = "; ".join(l)
if sheetname.lower().find("sheet") < 0:
l += " ——" + sheetname
paragraphs.append({'title': '', 'content': l})
data = self.fill_merged_cells(ws, image_dict)
for row in data:
row_output = "; ".join([f"{key}: {value}" for key, value in row.items()])
# print(row_output)
paragraphs.append({'title': '', 'content': row_output})
result.append({'name': sheetname, 'paragraphs': paragraphs})
except BaseException as e: