From 746f587698cc3f3ad25d8145a770068ad2df4c0d Mon Sep 17 00:00:00 2001
From: CaptainB
Date: Thu, 12 Sep 2024 10:47:20 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E8=A1=A8=E6=A0=BC=E6=95=B0=E6=8D=AE?=
 =?UTF-8?q?=E5=8C=BA=E5=88=86xls=E5=92=8Cxlsx?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (ignored by git am / git apply):
  * xls_parse_table_handle.py differs from the upstream commit: an empty
    sheet is now skipped via sheet.nrows instead of testing an iterator
    (always truthy), which made next() raise and dropped every sheet of the
    workbook; support() checks the extension before reading the buffer;
    docstrings were added. The blob id on its "index" line is therefore stale.
  * The hunks for the two other files are unchanged.

 .../impl/table/xls_parse_table_handle.py      | 59 +++++++++++++++++++
 ...e_handle.py => xlsx_parse_table_handle.py} |  6 +--
 .../serializers/document_serializers.py       |  5 +-
 3 files changed, 64 insertions(+), 6 deletions(-)
 create mode 100644 apps/common/handle/impl/table/xls_parse_table_handle.py
 rename apps/common/handle/impl/table/{excel_parse_table_handle.py => xlsx_parse_table_handle.py} (91%)

diff --git a/apps/common/handle/impl/table/xls_parse_table_handle.py b/apps/common/handle/impl/table/xls_parse_table_handle.py
new file mode 100644
index 000000000..75e59ede1
--- /dev/null
+++ b/apps/common/handle/impl/table/xls_parse_table_handle.py
@@ -0,0 +1,59 @@
+# coding=utf-8
+import logging
+
+import xlrd
+
+from common.handle.base_parse_table_handle import BaseParseTableHandle
+
+max_kb = logging.getLogger("max_kb")
+
+
+class XlsSplitHandle(BaseParseTableHandle):
+    """Split legacy ``.xls`` workbooks into per-sheet paragraph lists."""
+
+    def support(self, file, get_buffer):
+        """Return True when ``file`` is named ``*.xls`` and xlrd detects a format."""
+        file_name: str = file.name.lower()
+        # Check the extension first so other uploads are not read needlessly.
+        if not file_name.endswith(".xls"):
+            return False
+        return bool(xlrd.inspect_format(content=get_buffer(file)))
+
+    def handle(self, file, get_buffer, save_image):
+        """Turn every data row into one ``header: value; ...`` paragraph.
+
+        The first row of each sheet supplies the headers. ``save_image`` is
+        unused here. On any parsing error a single entry named after the
+        file with no paragraphs is returned.
+        """
+        buffer = get_buffer(file)
+        try:
+            wb = xlrd.open_workbook(file_contents=buffer)
+            result = []
+            for sheet in wb.sheets():
+                # An iterator is always truthy, so test the row count itself;
+                # otherwise next() on an empty sheet aborts the whole file.
+                if sheet.nrows == 0:
+                    continue
+                paragraphs = []
+                rows = (sheet.row_values(i) for i in range(sheet.nrows))
+                ti = next(rows)
+                for r in rows:
+                    cells = []
+                    for i, c in enumerate(r):
+                        # Falsy cells (0 included) are skipped, like the xlsx handler.
+                        if not c:
+                            continue
+                        t = str(ti[i]) if i < len(ti) else ""
+                        t += (": " if t else "") + str(c)
+                        cells.append(t)
+                    content = "; ".join(cells)
+                    if sheet.name.lower().find("sheet") < 0:
+                        content += " ——" + sheet.name
+                    paragraphs.append({'title': '', 'content': content})
+                result.append({'name': sheet.name, 'paragraphs': paragraphs})
+
+        except BaseException as e:
+            max_kb.error(f'excel split handle error: {e}')
+            return [{'name': file.name, 'paragraphs': []}]
+        return result
diff --git a/apps/common/handle/impl/table/excel_parse_table_handle.py b/apps/common/handle/impl/table/xlsx_parse_table_handle.py
similarity index 91%
rename from apps/common/handle/impl/table/excel_parse_table_handle.py
rename to apps/common/handle/impl/table/xlsx_parse_table_handle.py
index 5c0802c12..c83ef253d 100644
--- a/apps/common/handle/impl/table/excel_parse_table_handle.py
+++ b/apps/common/handle/impl/table/xlsx_parse_table_handle.py
@@ -10,10 +10,10 @@ from common.handle.impl.tools import xlsx_embed_cells_images
 max_kb = logging.getLogger("max_kb")
 
 
-class ExcelSplitHandle(BaseParseTableHandle):
+class XlsxSplitHandle(BaseParseTableHandle):
     def support(self, file, get_buffer):
         file_name: str = file.name.lower()
-        if file_name.endswith('.xls') or file_name.endswith('.xlsx'):
+        if file_name.endswith('.xlsx'):
             return True
         return False
 
@@ -34,13 +34,11 @@ class ExcelSplitHandle(BaseParseTableHandle):
                 if not rows: continue
                 ti = list(rows[0])
                 for r in list(rows[1:]):
-                    title = []
                     l = []
                     for i, c in enumerate(r):
                         if not c.value:
                             continue
                         t = str(ti[i].value) if i < len(ti) else ""
-                        title.append(t)
                         content = str(c.value)
                         image = image_dict.get(content, None)
                         if image is not None:
diff --git a/apps/dataset/serializers/document_serializers.py b/apps/dataset/serializers/document_serializers.py
index c8b5a35f3..edc7b6c22 100644
--- a/apps/dataset/serializers/document_serializers.py
+++ b/apps/dataset/serializers/document_serializers.py
@@ -34,7 +34,8 @@ from common.handle.impl.qa.csv_parse_qa_handle import CsvParseQAHandle
 from common.handle.impl.qa.xls_parse_qa_handle import XlsParseQAHandle
 from common.handle.impl.qa.xlsx_parse_qa_handle import XlsxParseQAHandle
 from common.handle.impl.table.csv_parse_table_handle import CsvSplitHandle
-from common.handle.impl.table.excel_parse_table_handle import ExcelSplitHandle
+from common.handle.impl.table.xlsx_parse_table_handle import XlsxSplitHandle
+from common.handle.impl.table.xls_parse_table_handle import XlsSplitHandle
 from common.handle.impl.text_split_handle import TextSplitHandle
 from common.mixins.api_mixin import ApiMixin
 from common.util.common import post, flat_map
@@ -53,7 +54,7 @@ from embedding.task.embedding import embedding_by_document, delete_embedding_by_
 from smartdoc.conf import PROJECT_DIR
 
 parse_qa_handle_list = [XlsParseQAHandle(), CsvParseQAHandle(), XlsxParseQAHandle()]
-parse_table_handle_list = [CsvSplitHandle(), XlsSplitHandle(), XlsxSplitHandle()]
+parse_table_handle_list = [CsvSplitHandle(), XlsSplitHandle(), XlsxSplitHandle()]
 
 
 class FileBufferHandle: