diff --git a/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py b/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py index 176230d2d..dc62d667e 100644 --- a/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py +++ b/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py @@ -1,23 +1,36 @@ # coding=utf-8 +import io + from django.db.models import QuerySet from application.flow.i_step_node import NodeResult from application.flow.step_node.document_extract_node.i_document_extract_node import IDocumentExtractNode from dataset.models import File +from dataset.serializers.document_serializers import split_handles, parse_table_handle_list, FileBufferHandle class BaseDocumentExtractNode(IDocumentExtractNode): def execute(self, document, **kwargs): + get_buffer = FileBufferHandle().get_buffer + self.context['document_list'] = document content = '' spliter = '\n-----------------------------------\n' - if len(document) > 0: - for doc in document: - file = QuerySet(File).filter(id=doc['file_id']).first() - file_type = doc['name'].split('.')[-1] - if file_type.lower() in ['txt', 'md', 'csv', 'html']: - content += spliter + doc['name'] + '\n' + file.get_byte().tobytes().decode('utf-8') + if document is None: + return NodeResult({'content': content}, {}) + for doc in document: + file = QuerySet(File).filter(id=doc['file_id']).first() + buffer = io.BytesIO(file.get_byte().tobytes()) + buffer.name = doc['name'] # this is the important line + + for split_handle in (parse_table_handle_list + split_handles): + if split_handle.support(buffer, get_buffer): + # 回到文件头 + buffer.seek(0) + file_content = split_handle.get_content(buffer) + content += spliter + '## ' + doc['name'] + '\n' + file_content + return NodeResult({'content': content}, {}) return NodeResult({'content': content}, {}) diff --git a/apps/common/handle/base_parse_table_handle.py b/apps/common/handle/base_parse_table_handle.py index 487290378..b84690859 100644 --- a/apps/common/handle/base_parse_table_handle.py +++ b/apps/common/handle/base_parse_table_handle.py @@ -17,3 +17,7 @@ class BaseParseTableHandle(ABC): @abstractmethod def handle(self, file, get_buffer,save_image): pass + + @abstractmethod + def get_content(self, file): + pass \ No newline at end of file diff --git a/apps/common/handle/base_split_handle.py b/apps/common/handle/base_split_handle.py index f9b573f0f..ea48e6868 100644 --- a/apps/common/handle/base_split_handle.py +++ b/apps/common/handle/base_split_handle.py @@ -18,3 +18,7 @@ class BaseSplitHandle(ABC): @abstractmethod def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image): pass + + @abstractmethod + def get_content(self, file): + pass diff --git a/apps/common/handle/impl/doc_split_handle.py b/apps/common/handle/impl/doc_split_handle.py index c31c53ec1..350a3921a 100644 --- a/apps/common/handle/impl/doc_split_handle.py +++ b/apps/common/handle/impl/doc_split_handle.py @@ -189,3 +189,13 @@ class DocSplitHandle(BaseSplitHandle): ".DOC") or file_name.endswith(".DOCX"): return True return False + + def get_content(self, file): + try: + image_list = [] + buffer = file.read() + doc = Document(io.BytesIO(buffer)) + return self.to_md(doc, image_list, get_image_id_func()) + except BaseException as e: + traceback.print_exception(e) + return '' \ No newline at end of file diff --git a/apps/common/handle/impl/html_split_handle.py b/apps/common/handle/impl/html_split_handle.py index 878d9edda..688904567 100644 --- a/apps/common/handle/impl/html_split_handle.py +++ b/apps/common/handle/impl/html_split_handle.py @@ -7,6 +7,7 @@ @desc: """ import re +import traceback from typing import List from bs4 import BeautifulSoup @@ -59,3 +60,14 @@ class HTMLSplitHandle(BaseSplitHandle): return {'name': file.name, 'content': split_model.parse(content) } + + def get_content(self, file): + buffer = file.read() + + try: + encoding = get_encoding(buffer) + content = buffer.decode(encoding) + return html2text(content) + except BaseException as e: + traceback.print_exception(e) + return '' \ No newline at end of file diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py index 52a33b0de..828196b7b 100644 --- a/apps/common/handle/impl/pdf_split_handle.py +++ b/apps/common/handle/impl/pdf_split_handle.py @@ -11,6 +11,7 @@ import os import re import tempfile import time +import traceback from typing import List import fitz @@ -297,3 +298,17 @@ class PdfSplitHandle(BaseSplitHandle): if file_name.endswith(".pdf") or file_name.endswith(".PDF"): return True return False + + def get_content(self, file): + with tempfile.NamedTemporaryFile(delete=False) as temp_file: + # 将上传的文件保存到临时文件中 + temp_file.write(file.read()) + # 获取临时文件的路径 + temp_file_path = temp_file.name + + pdf_document = fitz.open(temp_file_path) + try: + return self.handle_pdf_content(file, pdf_document) + except BaseException as e: + traceback.print_exception(e) + return '' \ No newline at end of file diff --git a/apps/common/handle/impl/table/csv_parse_table_handle.py b/apps/common/handle/impl/table/csv_parse_table_handle.py index 71152f38e..c3a85db86 100644 --- a/apps/common/handle/impl/table/csv_parse_table_handle.py +++ b/apps/common/handle/impl/table/csv_parse_table_handle.py @@ -34,3 +34,11 @@ class CsvSplitHandle(BaseParseTableHandle): paragraphs.append({'title': '', 'content': line}) return [{'name': file.name, 'paragraphs': paragraphs}] + + def get_content(self, file): + buffer = file.read() + try: + return buffer.decode(detect(buffer)['encoding']) + except BaseException as e: + max_kb.error(f'csv split handle error: {e}') + return [{'name': file.name, 'paragraphs': []}] \ No newline at end of file diff --git a/apps/common/handle/impl/table/xls_parse_table_handle.py b/apps/common/handle/impl/table/xls_parse_table_handle.py index 6c30d49de..9259a3550 100644 --- a/apps/common/handle/impl/table/xls_parse_table_handle.py +++ b/apps/common/handle/impl/table/xls_parse_table_handle.py @@ -60,3 +60,24 @@ class XlsSplitHandle(BaseParseTableHandle): max_kb.error(f'excel split handle error: {e}') return [{'name': file.name, 'paragraphs': []}] return result + + def get_content(self, file): + # 打开 .xls 文件 + workbook = xlrd.open_workbook(file_contents=file.read(), formatting_info=True) + sheets = workbook.sheets() + md_tables = '' + for sheet in sheets: + + # 获取表头和内容 + headers = sheet.row_values(0) + data = [sheet.row_values(row_idx) for row_idx in range(1, sheet.nrows)] + + # 构建 Markdown 表格 + md_table = '| ' + ' | '.join(headers) + ' |\n' + md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n' + for row in data: + # 将每个单元格中的内容替换换行符为
以保留原始格式 + md_table += '| ' + ' | '.join([str(cell).replace('\n', '
') if cell else '' for cell in row]) + ' |\n' + md_tables += md_table + '\n\n' + + return md_tables diff --git a/apps/common/handle/impl/table/xlsx_parse_table_handle.py b/apps/common/handle/impl/table/xlsx_parse_table_handle.py index 35ef2f14b..e92d3c11a 100644 --- a/apps/common/handle/impl/table/xlsx_parse_table_handle.py +++ b/apps/common/handle/impl/table/xlsx_parse_table_handle.py @@ -72,3 +72,31 @@ class XlsxSplitHandle(BaseParseTableHandle): max_kb.error(f'excel split handle error: {e}') return [{'name': file.name, 'paragraphs': []}] return result + + + def get_content(self, file): + # 加载 Excel 文件 + workbook = load_workbook(file) + md_tables = '' + # 如果未指定 sheet_name,则使用第一个工作表 + for sheetname in workbook.sheetnames: + sheet = workbook[sheetname] if sheetname else workbook.active + + # 获取工作表的所有行 + rows = list(sheet.iter_rows(values_only=True)) + if not rows: + continue + + # 提取表头和内容 + headers = rows[0] + data = rows[1:] + + # 构建 Markdown 表格 + md_table = '| ' + ' | '.join(headers) + ' |\n' + md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n' + for row in data: + md_table += '| ' + ' | '.join( + [str(cell).replace('\n', '
') if cell is not None else '' for cell in row]) + ' |\n' + + md_tables += md_table + '\n\n' + return md_tables \ No newline at end of file diff --git a/apps/common/handle/impl/text_split_handle.py b/apps/common/handle/impl/text_split_handle.py index 467607ff5..984c4e1e9 100644 --- a/apps/common/handle/impl/text_split_handle.py +++ b/apps/common/handle/impl/text_split_handle.py @@ -7,6 +7,7 @@ @desc: """ import re +import traceback from typing import List from charset_normalizer import detect @@ -49,3 +50,11 @@ class TextSplitHandle(BaseSplitHandle): return {'name': file.name, 'content': split_model.parse(content) } + + def get_content(self, file): + buffer = file.read() + try: + return buffer.decode(detect(buffer)['encoding']) + except BaseException as e: + traceback.print_exception(e) + return '' \ No newline at end of file diff --git a/ui/src/components/ai-chat/ExecutionDetailDialog.vue b/ui/src/components/ai-chat/ExecutionDetailDialog.vue index f63c7392d..bc3f2e160 100644 --- a/ui/src/components/ai-chat/ExecutionDetailDialog.vue +++ b/ui/src/components/ai-chat/ExecutionDetailDialog.vue @@ -182,6 +182,25 @@ + + +