diff --git a/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py b/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py
index 176230d2d..dc62d667e 100644
--- a/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py
+++ b/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py
@@ -1,23 +1,36 @@
# coding=utf-8
+import io
+
from django.db.models import QuerySet
from application.flow.i_step_node import NodeResult
from application.flow.step_node.document_extract_node.i_document_extract_node import IDocumentExtractNode
from dataset.models import File
+from dataset.serializers.document_serializers import split_handles, parse_table_handle_list, FileBufferHandle
class BaseDocumentExtractNode(IDocumentExtractNode):
+ # Workflow step that builds a single text blob from the documents handed to execute().
def execute(self, document, **kwargs):
+ # Records `document` in the node context, then for each document loads its bytes,
+ # asks every handler in parse_table_handle_list + split_handles whether it supports
+ # the buffer, and appends a separator, a '## <name>' heading and the handler's
+ # get_content() text to `content`. The result is returned as NodeResult({'content': ...}, {}).
+ get_buffer = FileBufferHandle().get_buffer
+
self.context['document_list'] = document
content = ''
spliter = '\n-----------------------------------\n'
- if len(document) > 0:
- for doc in document:
- file = QuerySet(File).filter(id=doc['file_id']).first()
- file_type = doc['name'].split('.')[-1]
- if file_type.lower() in ['txt', 'md', 'csv', 'html']:
- content += spliter + doc['name'] + '\n' + file.get_byte().tobytes().decode('utf-8')
+ # No document list supplied: return the (empty) content unchanged.
+ if document is None:
+ return NodeResult({'content': content}, {})
+ for doc in document:
+ # NOTE(review): .first() yields None when no File row matches; file.get_byte() would then raise - confirm ids are always valid.
+ file = QuerySet(File).filter(id=doc['file_id']).first()
+ buffer = io.BytesIO(file.get_byte().tobytes())
+ # BytesIO has no name of its own; presumably the handlers' support() checks inspect it (e.g. the extension) - confirm.
+ buffer.name = doc['name'] # this is the important line
+
+ for split_handle in (parse_table_handle_list + split_handles):
+ if split_handle.support(buffer, get_buffer):
+ # Rewind to the start of the file
+ buffer.seek(0)
+ file_content = split_handle.get_content(buffer)
+ content += spliter + '## ' + doc['name'] + '\n' + file_content
+ # NOTE(review): indentation is lost in this view. If this return sits inside the loops, only the
+ # first supported document is ever extracted; if it sits at function level, the return below is
+ # unreachable. Confirm whether `break` (move on to the next document) was intended.
+ return NodeResult({'content': content}, {})
return NodeResult({'content': content}, {})
diff --git a/apps/common/handle/base_parse_table_handle.py b/apps/common/handle/base_parse_table_handle.py
index 487290378..b84690859 100644
--- a/apps/common/handle/base_parse_table_handle.py
+++ b/apps/common/handle/base_parse_table_handle.py
@@ -17,3 +17,7 @@ class BaseParseTableHandle(ABC):
@abstractmethod
def handle(self, file, get_buffer,save_image):
+ # Abstract hook with no default behaviour; every concrete table handler must override it.
pass
+
+ @abstractmethod
+ def get_content(self, file):
+ # Abstract hook with no default behaviour; every concrete table handler must override it.
+ # Takes only the file object (no get_buffer / save_image, unlike handle()).
+ pass
\ No newline at end of file
diff --git a/apps/common/handle/base_split_handle.py b/apps/common/handle/base_split_handle.py
index f9b573f0f..ea48e6868 100644
--- a/apps/common/handle/base_split_handle.py
+++ b/apps/common/handle/base_split_handle.py
@@ -18,3 +18,7 @@ class BaseSplitHandle(ABC):
@abstractmethod
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
+ # Abstract hook with no default behaviour; every concrete split handler must override it.
pass
+
+ @abstractmethod
+ def get_content(self, file):
+ # Abstract hook with no default behaviour; every concrete split handler must override it.
+ # Takes only the file object (none of handle()'s splitting parameters).
+ pass
diff --git a/apps/common/handle/impl/doc_split_handle.py b/apps/common/handle/impl/doc_split_handle.py
index c31c53ec1..350a3921a 100644
--- a/apps/common/handle/impl/doc_split_handle.py
+++ b/apps/common/handle/impl/doc_split_handle.py
@@ -189,3 +189,13 @@ class DocSplitHandle(BaseSplitHandle):
".DOC") or file_name.endswith(".DOCX"):
return True
return False
+
+ def get_content(self, file):
+ try:
+ image_list = []
+ buffer = file.read()
+ doc = Document(io.BytesIO(buffer))
+ return self.to_md(doc, image_list, get_image_id_func())
+ except BaseException as e:
+ traceback.print_exception(e)
+ return ''
\ No newline at end of file
diff --git a/apps/common/handle/impl/html_split_handle.py b/apps/common/handle/impl/html_split_handle.py
index 878d9edda..688904567 100644
--- a/apps/common/handle/impl/html_split_handle.py
+++ b/apps/common/handle/impl/html_split_handle.py
@@ -7,6 +7,7 @@
@desc:
"""
import re
+import traceback
from typing import List
from bs4 import BeautifulSoup
@@ -59,3 +60,14 @@ class HTMLSplitHandle(BaseSplitHandle):
return {'name': file.name,
'content': split_model.parse(content)
}
+
+ def get_content(self, file):
+ buffer = file.read()
+
+ try:
+ encoding = get_encoding(buffer)
+ content = buffer.decode(encoding)
+ return html2text(content)
+ except BaseException as e:
+ traceback.print_exception(e)
+ return ''
\ No newline at end of file
diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py
index 52a33b0de..828196b7b 100644
--- a/apps/common/handle/impl/pdf_split_handle.py
+++ b/apps/common/handle/impl/pdf_split_handle.py
@@ -11,6 +11,7 @@ import os
import re
import tempfile
import time
+import traceback
from typing import List
import fitz
@@ -297,3 +298,17 @@ class PdfSplitHandle(BaseSplitHandle):
if file_name.endswith(".pdf") or file_name.endswith(".PDF"):
return True
return False
+
+ def get_content(self, file):
+ with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+ # 将上传的文件保存到临时文件中
+ temp_file.write(file.read())
+ # 获取临时文件的路径
+ temp_file_path = temp_file.name
+
+ pdf_document = fitz.open(temp_file_path)
+ try:
+ return self.handle_pdf_content(file, pdf_document)
+ except BaseException as e:
+ traceback.print_exception(e)
+ return ''
\ No newline at end of file
diff --git a/apps/common/handle/impl/table/csv_parse_table_handle.py b/apps/common/handle/impl/table/csv_parse_table_handle.py
index 71152f38e..c3a85db86 100644
--- a/apps/common/handle/impl/table/csv_parse_table_handle.py
+++ b/apps/common/handle/impl/table/csv_parse_table_handle.py
@@ -34,3 +34,11 @@ class CsvSplitHandle(BaseParseTableHandle):
paragraphs.append({'title': '', 'content': line})
return [{'name': file.name, 'paragraphs': paragraphs}]
+
+ def get_content(self, file):
+ buffer = file.read()
+ try:
+ return buffer.decode(detect(buffer)['encoding'])
+ except BaseException as e:
+ max_kb.error(f'csv split handle error: {e}')
+ return [{'name': file.name, 'paragraphs': []}]
\ No newline at end of file
diff --git a/apps/common/handle/impl/table/xls_parse_table_handle.py b/apps/common/handle/impl/table/xls_parse_table_handle.py
index 6c30d49de..9259a3550 100644
--- a/apps/common/handle/impl/table/xls_parse_table_handle.py
+++ b/apps/common/handle/impl/table/xls_parse_table_handle.py
@@ -60,3 +60,24 @@ class XlsSplitHandle(BaseParseTableHandle):
max_kb.error(f'excel split handle error: {e}')
return [{'name': file.name, 'paragraphs': []}]
return result
+
+ def get_content(self, file):
+ # 打开 .xls 文件
+ workbook = xlrd.open_workbook(file_contents=file.read(), formatting_info=True)
+ sheets = workbook.sheets()
+ md_tables = ''
+ for sheet in sheets:
+
+ # 获取表头和内容
+ headers = sheet.row_values(0)
+ data = [sheet.row_values(row_idx) for row_idx in range(1, sheet.nrows)]
+
+ # 构建 Markdown 表格
+ md_table = '| ' + ' | '.join(headers) + ' |\n'
+ md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
+ for row in data:
+ # 将每个单元格中的内容替换换行符为
以保留原始格式
+ md_table += '| ' + ' | '.join([str(cell).replace('\n', '
') if cell else '' for cell in row]) + ' |\n'
+ md_tables += md_table + '\n\n'
+
+ return md_tables
diff --git a/apps/common/handle/impl/table/xlsx_parse_table_handle.py b/apps/common/handle/impl/table/xlsx_parse_table_handle.py
index 35ef2f14b..e92d3c11a 100644
--- a/apps/common/handle/impl/table/xlsx_parse_table_handle.py
+++ b/apps/common/handle/impl/table/xlsx_parse_table_handle.py
@@ -72,3 +72,31 @@ class XlsxSplitHandle(BaseParseTableHandle):
max_kb.error(f'excel split handle error: {e}')
return [{'name': file.name, 'paragraphs': []}]
return result
+
+
+ def get_content(self, file):
+ # 加载 Excel 文件
+ workbook = load_workbook(file)
+ md_tables = ''
+ # 如果未指定 sheet_name,则使用第一个工作表
+ for sheetname in workbook.sheetnames:
+ sheet = workbook[sheetname] if sheetname else workbook.active
+
+ # 获取工作表的所有行
+ rows = list(sheet.iter_rows(values_only=True))
+ if not rows:
+ continue
+
+ # 提取表头和内容
+ headers = rows[0]
+ data = rows[1:]
+
+ # 构建 Markdown 表格
+ md_table = '| ' + ' | '.join(headers) + ' |\n'
+ md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
+ for row in data:
+ md_table += '| ' + ' | '.join(
+ [str(cell).replace('\n', '
') if cell is not None else '' for cell in row]) + ' |\n'
+
+ md_tables += md_table + '\n\n'
+ return md_tables
\ No newline at end of file
diff --git a/apps/common/handle/impl/text_split_handle.py b/apps/common/handle/impl/text_split_handle.py
index 467607ff5..984c4e1e9 100644
--- a/apps/common/handle/impl/text_split_handle.py
+++ b/apps/common/handle/impl/text_split_handle.py
@@ -7,6 +7,7 @@
@desc:
"""
import re
+import traceback
from typing import List
from charset_normalizer import detect
@@ -49,3 +50,11 @@ class TextSplitHandle(BaseSplitHandle):
return {'name': file.name,
'content': split_model.parse(content)
}
+
+ def get_content(self, file):
+ buffer = file.read()
+ try:
+ return buffer.decode(detect(buffer)['encoding'])
+ except BaseException as e:
+ traceback.print_exception(e)
+ return ''
\ No newline at end of file
diff --git a/ui/src/components/ai-chat/ExecutionDetailDialog.vue b/ui/src/components/ai-chat/ExecutionDetailDialog.vue
index f63c7392d..bc3f2e160 100644
--- a/ui/src/components/ai-chat/ExecutionDetailDialog.vue
+++ b/ui/src/components/ai-chat/ExecutionDetailDialog.vue
@@ -182,6 +182,25 @@
+
+
+
+
+