diff --git a/apps/common/handle/impl/table/xls_parse_table_handle.py b/apps/common/handle/impl/table/xls_parse_table_handle.py index 75e59ede1..6c30d49de 100644 --- a/apps/common/handle/impl/table/xls_parse_table_handle.py +++ b/apps/common/handle/impl/table/xls_parse_table_handle.py @@ -19,26 +19,41 @@ class XlsSplitHandle(BaseParseTableHandle): def handle(self, file, get_buffer, save_image): buffer = get_buffer(file) try: - wb = xlrd.open_workbook(file_contents=buffer) + wb = xlrd.open_workbook(file_contents=buffer, formatting_info=True) result = [] sheets = wb.sheets() for sheet in sheets: + # 获取合并单元格的范围信息 + merged_cells = sheet.merged_cells + print(merged_cells) + data = [] paragraphs = [] - rows = iter([sheet.row_values(i) for i in range(sheet.nrows)]) - if not rows: continue - ti = next(rows) - for r in rows: - l = [] - for i, c in enumerate(r): - if not c: - continue - t = str(ti[i]) if i < len(ti) else "" - t += (": " if t else "") + str(c) - l.append(t) - l = "; ".join(l) - if sheet.name.lower().find("sheet") < 0: - l += " ——" + sheet.name - paragraphs.append({'title': '', 'content': l}) + # 获取第一行作为标题行 + headers = [sheet.cell_value(0, col_idx) for col_idx in range(sheet.ncols)] + # 从第二行开始遍历每一行(跳过标题行) + for row_idx in range(1, sheet.nrows): + row_data = {} + for col_idx in range(sheet.ncols): + cell_value = sheet.cell_value(row_idx, col_idx) + + # 检查是否为空单元格,如果为空检查是否在合并区域中 + if cell_value == "": + # 检查当前单元格是否在合并区域 + for (rlo, rhi, clo, chi) in merged_cells: + if rlo <= row_idx < rhi and clo <= col_idx < chi: + # 使用合并区域的左上角单元格的值 + cell_value = sheet.cell_value(rlo, clo) + break + + # 将标题作为键,单元格的值作为值存入字典 + row_data[headers[col_idx]] = cell_value + data.append(row_data) + + for row in data: + row_output = "; ".join([f"{key}: {value}" for key, value in row.items()]) + # print(row_output) + paragraphs.append({'title': '', 'content': row_output}) + result.append({'name': sheet.name, 'paragraphs': paragraphs}) except BaseException as e: diff --git 
def fill_merged_cells(self, sheet, image_dict):
    """Convert a worksheet into one dict per data row, keyed by header.

    The first worksheet row supplies the keys. Every later row becomes a
    dict mapping each header to that row's cell value, with two
    substitutions:

    * an empty cell (``None``) lying inside a merged region takes the value
      of that region's top-left cell;
    * a value that is a key of ``image_dict`` is replaced by a Markdown
      image link built from the mapped object's ``id``.

    Header labels are used as dict keys, so a repeated label would silently
    overwrite the earlier column. The second and later occurrences of a
    label therefore get a numeric suffix (``tag``, ``tag_2``, ``tag_3``).

    :param sheet: openpyxl-style worksheet supporting ``sheet[row_number]``,
        ``iter_rows(min_row=..., values_only=False)`` and
        ``merged_cells.ranges``.
    :param image_dict: mapping from a cell value to an object with an ``id``
        attribute.
    :return: list of dicts, one per row after the header row, in sheet
        order; each dict keeps the column order of the sheet.
    """
    # Use the first row as the header row, making every key unique so that
    # no column is dropped when labels repeat or are empty.
    headers = []
    used_keys = set()
    for header_cell in sheet[1]:
        label = header_cell.value
        key = label
        occurrence = 1
        while key in used_keys:
            occurrence += 1
            key = f"{label}_{occurrence}"
        used_keys.add(key)
        headers.append(key)

    # Resolve every merged region once: numeric bounds plus the value of its
    # top-left cell. This avoids re-parsing coordinates and rebuilding a row
    # tuple for each empty cell encountered below.
    merged_regions = [
        (
            region.min_row,
            region.max_row,
            region.min_col,
            region.max_col,
            sheet[region.min_row][region.min_col - 1].value,
        )
        for region in sheet.merged_cells.ranges
    ]

    data = []
    # Rows are yielded starting at worksheet row 2, column 1, so the
    # enumerate counters give each cell's 1-based position.
    rows = sheet.iter_rows(min_row=2, values_only=False)
    for row_number, row in enumerate(rows, start=2):
        row_data = {}
        for col_idx, cell in enumerate(row):
            cell_value = cell.value

            # An empty cell inside a merged region inherits the value of the
            # first region that contains it.
            if cell_value is None:
                col_number = col_idx + 1
                for min_row, max_row, min_col, max_col, anchor in merged_regions:
                    if (min_row <= row_number <= max_row
                            and min_col <= col_number <= max_col):
                        cell_value = anchor
                        break

            # Swap image placeholders for a Markdown image link.
            image = image_dict.get(cell_value, None)
            if image is not None:
                cell_value = f'![](/api/image/{image.id})'

            row_data[headers[col_idx]] = cell_value
        data.append(row_data)

    return data
print(row_output) + paragraphs.append({'title': '', 'content': row_output}) + result.append({'name': sheetname, 'paragraphs': paragraphs}) except BaseException as e: