mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 10:02:46 +00:00
fix: 表格数据区分xls和xlsx
This commit is contained in:
parent
c58635e7cc
commit
746f587698
|
|
@ -0,0 +1,47 @@
|
|||
# coding=utf-8
|
||||
import logging
|
||||
|
||||
import xlrd
|
||||
|
||||
from common.handle.base_parse_table_handle import BaseParseTableHandle
|
||||
|
||||
max_kb = logging.getLogger("max_kb")
|
||||
|
||||
|
||||
class XlsSplitHandle(BaseParseTableHandle):
    """Table handler for legacy binary Excel workbooks (``.xls``), parsed with xlrd."""

    def support(self, file, get_buffer):
        """Tell whether this handler should parse ``file``.

        :param file: uploaded file object exposing a ``name`` attribute.
        :param get_buffer: callable that returns the raw bytes of ``file``.
        :return: ``True`` when the name ends with ``.xls`` and xlrd recognises
            the content as a workbook format, otherwise ``False``.
        """
        file_name: str = file.name.lower()
        # Cheap extension test first, so the buffer is only read for candidates.
        if not file_name.endswith(".xls"):
            return False
        buffer = get_buffer(file)
        return bool(xlrd.inspect_format(content=buffer))

    def handle(self, file, get_buffer, save_image):
        """Convert every sheet of the workbook into a list of paragraphs.

        The first row of each sheet is used as the header row.  Every following
        row becomes one paragraph whose content is ``"header: value"`` pairs
        joined by ``"; "``; falsy cells are skipped.  When the sheet name does
        not contain "sheet" (case-insensitive) it is appended after `` ——``.

        :param file: uploaded file object exposing a ``name`` attribute.
        :param get_buffer: callable that returns the raw bytes of ``file``.
        :param save_image: unused here; kept for interface compatibility with
            the other table handlers.
        :return: list of ``{'name': sheet_name, 'paragraphs': [...]}`` dicts, or
            a single entry named after the file with no paragraphs when the
            workbook cannot be parsed.
        """
        buffer = get_buffer(file)
        try:
            wb = xlrd.open_workbook(file_contents=buffer)
            result = []
            for sheet in wb.sheets():
                # Skip sheets without any row.  Testing the truthiness of an
                # iterator never detects this, and calling next() on it would
                # raise StopIteration and abort the whole workbook.
                if sheet.nrows == 0:
                    continue
                header = sheet.row_values(0)
                # Hoisted: the suffix decision is the same for every row.
                append_sheet_name = "sheet" not in sheet.name.lower()
                paragraphs = []
                for row_index in range(1, sheet.nrows):
                    parts = []
                    for i, c in enumerate(sheet.row_values(row_index)):
                        if not c:
                            continue
                        t = str(header[i]) if i < len(header) else ""
                        t += (": " if t else "") + str(c)
                        parts.append(t)
                    content = "; ".join(parts)
                    if append_sheet_name:
                        content += " ——" + sheet.name
                    paragraphs.append({'title': '', 'content': content})
                result.append({'name': sheet.name, 'paragraphs': paragraphs})
        except Exception as e:
            # Best effort: a broken workbook yields an empty document instead of
            # failing the upload.  KeyboardInterrupt/SystemExit still propagate.
            max_kb.error(f'excel split handle error: {e}')
            return [{'name': file.name, 'paragraphs': []}]
        return result
|
||||
|
|
@ -10,10 +10,10 @@ from common.handle.impl.tools import xlsx_embed_cells_images
|
|||
max_kb = logging.getLogger("max_kb")
|
||||
|
||||
|
||||
class ExcelSplitHandle(BaseParseTableHandle):
|
||||
class XlsxSplitHandle(BaseParseTableHandle):
|
||||
def support(self, file, get_buffer):
|
||||
file_name: str = file.name.lower()
|
||||
if file_name.endswith('.xls') or file_name.endswith('.xlsx'):
|
||||
if file_name.endswith('.xlsx'):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
|
@ -34,13 +34,11 @@ class ExcelSplitHandle(BaseParseTableHandle):
|
|||
if not rows: continue
|
||||
ti = list(rows[0])
|
||||
for r in list(rows[1:]):
|
||||
title = []
|
||||
l = []
|
||||
for i, c in enumerate(r):
|
||||
if not c.value:
|
||||
continue
|
||||
t = str(ti[i].value) if i < len(ti) else ""
|
||||
title.append(t)
|
||||
content = str(c.value)
|
||||
image = image_dict.get(content, None)
|
||||
if image is not None:
|
||||
|
|
@ -34,7 +34,8 @@ from common.handle.impl.qa.csv_parse_qa_handle import CsvParseQAHandle
|
|||
from common.handle.impl.qa.xls_parse_qa_handle import XlsParseQAHandle
|
||||
from common.handle.impl.qa.xlsx_parse_qa_handle import XlsxParseQAHandle
|
||||
from common.handle.impl.table.csv_parse_table_handle import CsvSplitHandle
|
||||
from common.handle.impl.table.excel_parse_table_handle import ExcelSplitHandle
|
||||
from common.handle.impl.table.xlsx_parse_table_handle import XlsxSplitHandle
|
||||
from common.handle.impl.table.xls_parse_table_handle import XlsSplitHandle
|
||||
from common.handle.impl.text_split_handle import TextSplitHandle
|
||||
from common.mixins.api_mixin import ApiMixin
|
||||
from common.util.common import post, flat_map
|
||||
|
|
@ -53,7 +54,7 @@ from embedding.task.embedding import embedding_by_document, delete_embedding_by_
|
|||
from smartdoc.conf import PROJECT_DIR
|
||||
|
||||
parse_qa_handle_list = [XlsParseQAHandle(), CsvParseQAHandle(), XlsxParseQAHandle()]
|
||||
parse_table_handle_list = [CsvSplitHandle(), ExcelSplitHandle()]
|
||||
parse_table_handle_list = [CsvSplitHandle(), XlsSplitHandle(), XlsxSplitHandle()]
|
||||
|
||||
|
||||
class FileBufferHandle:
|
||||
|
|
|
|||
Loading…
Reference in New Issue