From 746f587698cc3f3ad25d8145a770068ad2df4c0d Mon Sep 17 00:00:00 2001
From: CaptainB <bin@fit2cloud.com>
Date: Thu, 12 Sep 2024 10:47:20 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E8=A1=A8=E6=A0=BC=E6=95=B0=E6=8D=AE?=
 =?UTF-8?q?=E5=8C=BA=E5=88=86xls=E5=92=8Cxlsx?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../impl/table/xls_parse_table_handle.py      | 47 +++++++++++++++++++
 ...e_handle.py => xlsx_parse_table_handle.py} |  6 +--
 .../serializers/document_serializers.py       |  5 +-
 3 files changed, 52 insertions(+), 6 deletions(-)
 create mode 100644 apps/common/handle/impl/table/xls_parse_table_handle.py
 rename apps/common/handle/impl/table/{excel_parse_table_handle.py => xlsx_parse_table_handle.py} (91%)

diff --git a/apps/common/handle/impl/table/xls_parse_table_handle.py b/apps/common/handle/impl/table/xls_parse_table_handle.py
new file mode 100644
index 000000000..75e59ede1
--- /dev/null
+++ b/apps/common/handle/impl/table/xls_parse_table_handle.py
@@ -0,0 +1,47 @@
+# coding=utf-8
+import logging
+
+import xlrd
+
+from common.handle.base_parse_table_handle import BaseParseTableHandle
+
+max_kb = logging.getLogger("max_kb")
+
+
+class XlsSplitHandle(BaseParseTableHandle):
+    """Parse legacy ``.xls`` workbooks into per-sheet paragraph lists using xlrd."""
+    def support(self, file, get_buffer):
+        """Return True for a ``.xls``-named file whose content xlrd can identify."""
+        # Extension first: the content is only read when the name already matches.
+        is_xls_name = file.name.lower().endswith(".xls")
+        return is_xls_name and bool(xlrd.inspect_format(content=get_buffer(file)))
+
+    def handle(self, file, get_buffer, save_image):
+        """Convert each non-empty sheet into "header: value; ..." paragraphs, one per data row.
+
+        Returns [{'name': ..., 'paragraphs': [...]}] per sheet, or one entry named after the file with no paragraphs when parsing fails.
+        """
+        buffer = get_buffer(file)
+        try:
+            wb = xlrd.open_workbook(file_contents=buffer)
+            result = []
+            for sheet in wb.sheets():
+                if sheet.nrows == 0: continue  # no header row: skip this sheet only, keep the others
+                ti = sheet.row_values(0)
+                paragraphs = []
+                for row_index in range(1, sheet.nrows):
+                    parts = []
+                    for i, c in enumerate(sheet.row_values(row_index)):
+                        if not c:
+                            continue
+                        t = str(ti[i]) if i < len(ti) else ""
+                        parts.append(t + (": " if t else "") + str(c))
+                    content = "; ".join(parts)
+                    if "sheet" not in sheet.name.lower():  # names without "sheet" are appended as context
+                        content += " ——" + sheet.name
+                    paragraphs.append({'title': '', 'content': content})
+                result.append({'name': sheet.name, 'paragraphs': paragraphs})
+        except Exception as e:  # best-effort: log and return the file with no paragraphs
+            max_kb.error(f'excel split handle error: {e}')
+            return [{'name': file.name, 'paragraphs': []}]
+        return result
diff --git a/apps/common/handle/impl/table/excel_parse_table_handle.py b/apps/common/handle/impl/table/xlsx_parse_table_handle.py
similarity index 91%
rename from apps/common/handle/impl/table/excel_parse_table_handle.py
rename to apps/common/handle/impl/table/xlsx_parse_table_handle.py
index 5c0802c12..c83ef253d 100644
--- a/apps/common/handle/impl/table/excel_parse_table_handle.py
+++ b/apps/common/handle/impl/table/xlsx_parse_table_handle.py
@@ -10,10 +10,10 @@ from common.handle.impl.tools import xlsx_embed_cells_images
 max_kb = logging.getLogger("max_kb")
 
 
-class ExcelSplitHandle(BaseParseTableHandle):
+class XlsxSplitHandle(BaseParseTableHandle):
     def support(self, file, get_buffer):
         file_name: str = file.name.lower()
-        if file_name.endswith('.xls') or file_name.endswith('.xlsx'):
+        if file_name.endswith('.xlsx'):
             return True
         return False
 
@@ -34,13 +34,11 @@ class ExcelSplitHandle(BaseParseTableHandle):
                 if not rows: continue
                 ti = list(rows[0])
                 for r in list(rows[1:]):
-                    title = []
                     l = []
                     for i, c in enumerate(r):
                         if not c.value:
                             continue
                         t = str(ti[i].value) if i < len(ti) else ""
-                        title.append(t)
                         content = str(c.value)
                         image = image_dict.get(content, None)
                         if image is not None:
diff --git a/apps/dataset/serializers/document_serializers.py b/apps/dataset/serializers/document_serializers.py
index c8b5a35f3..edc7b6c22 100644
--- a/apps/dataset/serializers/document_serializers.py
+++ b/apps/dataset/serializers/document_serializers.py
@@ -34,7 +34,8 @@ from common.handle.impl.qa.csv_parse_qa_handle import CsvParseQAHandle
 from common.handle.impl.qa.xls_parse_qa_handle import XlsParseQAHandle
 from common.handle.impl.qa.xlsx_parse_qa_handle import XlsxParseQAHandle
 from common.handle.impl.table.csv_parse_table_handle import CsvSplitHandle
-from common.handle.impl.table.excel_parse_table_handle import ExcelSplitHandle
+from common.handle.impl.table.xlsx_parse_table_handle import XlsxSplitHandle
+from common.handle.impl.table.xls_parse_table_handle import XlsSplitHandle
 from common.handle.impl.text_split_handle import TextSplitHandle
 from common.mixins.api_mixin import ApiMixin
 from common.util.common import post, flat_map
@@ -53,7 +54,7 @@ from embedding.task.embedding import embedding_by_document, delete_embedding_by_
 from smartdoc.conf import PROJECT_DIR
 
 parse_qa_handle_list = [XlsParseQAHandle(), CsvParseQAHandle(), XlsxParseQAHandle()]
-parse_table_handle_list = [CsvSplitHandle(), ExcelSplitHandle()]
+parse_table_handle_list = [CsvSplitHandle(), XlsSplitHandle(), XlsxSplitHandle()]
 
 
 class FileBufferHandle: