From a0cfcb73a935a6d1c7b968acb74a08f69452068c Mon Sep 17 00:00:00 2001 From: shaohuzhang1 Date: Thu, 14 Nov 2024 11:41:49 +0800 Subject: [PATCH 1/7] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=AF=B9=E8=AF=9D?= =?UTF-8?q?=E6=97=B6,=E6=A8=A1=E5=9E=8B=E5=8F=82=E6=95=B0=E5=88=A0?= =?UTF-8?q?=E9=99=A4=E5=90=8E=E4=BE=9D=E7=84=B6=E8=BF=9B=E8=A1=8C=E6=A0=A1?= =?UTF-8?q?=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/application/flow/workflow_manage.py | 1 - apps/application/serializers/application_serializers.py | 2 -- apps/application/serializers/chat_serializers.py | 2 -- 3 files changed, 5 deletions(-) diff --git a/apps/application/flow/workflow_manage.py b/apps/application/flow/workflow_manage.py index 0cd92e8e1..d36a58b1f 100644 --- a/apps/application/flow/workflow_manage.py +++ b/apps/application/flow/workflow_manage.py @@ -143,7 +143,6 @@ class Flow: if model_params_setting is None: model_params_setting = model_params_setting_form.get_default_form_data() node.properties.get('node_data', {})['model_params_setting'] = model_params_setting - model_params_setting_form.valid_form(model_params_setting) if node.properties.get('status', 200) != 200: raise ValidationError(ErrorDetail(f'节点{node.properties.get("stepName")} 不可用')) node_list = [node for node in self.nodes if (node.type == 'function-lib-node')] diff --git a/apps/application/serializers/application_serializers.py b/apps/application/serializers/application_serializers.py index 7e5cc8deb..23b058040 100644 --- a/apps/application/serializers/application_serializers.py +++ b/apps/application/serializers/application_serializers.py @@ -836,8 +836,6 @@ class ApplicationSerializer(serializers.Serializer): ApplicationSerializer.Edit(data=instance).is_valid( raise_exception=True) application_id = self.data.get("application_id") - valid_model_params_setting(instance.get('model_id'), - instance.get('model_params_setting')) application = QuerySet(Application).get(id=application_id) if instance.get('model_id') is None or len(instance.get('model_id')) == 0: diff --git a/apps/application/serializers/chat_serializers.py b/apps/application/serializers/chat_serializers.py index 45e18a1ed..f3710fe00 100644 --- a/apps/application/serializers/chat_serializers.py +++ b/apps/application/serializers/chat_serializers.py @@ -294,7 +294,6 @@ class ChatSerializers(serializers.Serializer): return chat_id def open_simple(self, application): - valid_model_params_setting(application.model_id, application.model_params_setting) application_id = self.data.get('application_id') dataset_id_list = [str(row.dataset_id) for row in QuerySet(ApplicationDatasetMapping).filter( @@ -376,7 +375,6 @@ class ChatSerializers(serializers.Serializer): model_id = self.data.get('model_id') dataset_id_list = self.data.get('dataset_id_list') dialogue_number = 3 if self.data.get('multiple_rounds_dialogue', False) else 0 - valid_model_params_setting(model_id, self.data.get('model_params_setting')) application = Application(id=None, dialogue_number=dialogue_number, model_id=model_id, dataset_setting=self.data.get('dataset_setting'), model_setting=self.data.get('model_setting'), From b57a619bdb8d1de9db514ddc108f295818a1ca17 Mon Sep 17 00:00:00 2001 From: CaptainB Date: Thu, 14 Nov 2024 11:11:53 +0800 Subject: [PATCH 2/7] =?UTF-8?q?feat:=20=E9=AB=98=E7=BA=A7=E7=BC=96?= =?UTF-8?q?=E6=8E=92=E6=94=AF=E6=8C=81=E6=96=87=E4=BB=B6=E4=B8=8A=E4=BC=A0?= =?UTF-8?q?(WIP)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../impl/base_document_extract_node.py | 25 +++++++++++++---- apps/common/handle/base_parse_table_handle.py | 4 +++ apps/common/handle/base_split_handle.py | 4 +++ apps/common/handle/impl/doc_split_handle.py | 10 +++++++ apps/common/handle/impl/html_split_handle.py | 12 ++++++++ apps/common/handle/impl/pdf_split_handle.py | 15 ++++++++++ .../impl/table/csv_parse_table_handle.py | 8 ++++++ .../impl/table/xls_parse_table_handle.py | 21 ++++++++++++++ .../impl/table/xlsx_parse_table_handle.py | 28 +++++++++++++++++++ apps/common/handle/impl/text_split_handle.py | 9 ++++++ .../ai-chat/ExecutionDetailDialog.vue | 19 +++++++++++++ 11 files changed, 149 insertions(+), 6 deletions(-) diff --git a/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py b/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py index 176230d2d..dc62d667e 100644 --- a/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py +++ b/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py @@ -1,23 +1,36 @@ # coding=utf-8 +import io + from django.db.models import QuerySet from application.flow.i_step_node import NodeResult from application.flow.step_node.document_extract_node.i_document_extract_node import IDocumentExtractNode from dataset.models import File +from dataset.serializers.document_serializers import split_handles, parse_table_handle_list, FileBufferHandle class BaseDocumentExtractNode(IDocumentExtractNode): def execute(self, document, **kwargs): + get_buffer = FileBufferHandle().get_buffer + self.context['document_list'] = document content = '' spliter = '\n-----------------------------------\n' - if len(document) > 0: - for doc in document: - file = QuerySet(File).filter(id=doc['file_id']).first() - file_type = doc['name'].split('.')[-1] - if file_type.lower() in ['txt', 'md', 'csv', 'html']: - content += spliter + doc['name'] + '\n' + file.get_byte().tobytes().decode('utf-8') + if document is None: + return NodeResult({'content': content}, {}) + for doc in document: + file = QuerySet(File).filter(id=doc['file_id']).first() + buffer = io.BytesIO(file.get_byte().tobytes()) + buffer.name = doc['name'] # this is the important line + + for split_handle in (parse_table_handle_list + split_handles): + if split_handle.support(buffer, get_buffer): + # 回到文件头 + buffer.seek(0) + file_content = split_handle.get_content(buffer) + content += spliter + '## ' + doc['name'] + '\n' + file_content + return NodeResult({'content': content}, {}) return NodeResult({'content': content}, {}) diff --git a/apps/common/handle/base_parse_table_handle.py b/apps/common/handle/base_parse_table_handle.py index 487290378..b84690859 100644 --- a/apps/common/handle/base_parse_table_handle.py +++ b/apps/common/handle/base_parse_table_handle.py @@ -17,3 +17,7 @@ class BaseParseTableHandle(ABC): @abstractmethod def handle(self, file, get_buffer,save_image): pass + + @abstractmethod + def get_content(self, file): + pass \ No newline at end of file diff --git a/apps/common/handle/base_split_handle.py b/apps/common/handle/base_split_handle.py index f9b573f0f..ea48e6868 100644 --- a/apps/common/handle/base_split_handle.py +++ b/apps/common/handle/base_split_handle.py @@ -18,3 +18,7 @@ class BaseSplitHandle(ABC): @abstractmethod def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image): pass + + @abstractmethod + def get_content(self, file): + pass diff --git a/apps/common/handle/impl/doc_split_handle.py b/apps/common/handle/impl/doc_split_handle.py index c31c53ec1..350a3921a 100644 --- a/apps/common/handle/impl/doc_split_handle.py +++ b/apps/common/handle/impl/doc_split_handle.py @@ -189,3 +189,13 @@ class DocSplitHandle(BaseSplitHandle): ".DOC") or file_name.endswith(".DOCX"): return True return False + + def get_content(self, file): + try: + image_list = [] + buffer = file.read() + doc = Document(io.BytesIO(buffer)) + return self.to_md(doc, image_list, get_image_id_func()) + except BaseException as e: + traceback.print_exception(e) + return '' \ No newline at end of file diff --git a/apps/common/handle/impl/html_split_handle.py b/apps/common/handle/impl/html_split_handle.py index 878d9edda..688904567 100644 --- a/apps/common/handle/impl/html_split_handle.py +++ b/apps/common/handle/impl/html_split_handle.py @@ -7,6 +7,7 @@ @desc: """ import re +import traceback from typing import List from bs4 import BeautifulSoup @@ -59,3 +60,14 @@ class HTMLSplitHandle(BaseSplitHandle): return {'name': file.name, 'content': split_model.parse(content) } + + def get_content(self, file): + buffer = file.read() + + try: + encoding = get_encoding(buffer) + content = buffer.decode(encoding) + return html2text(content) + except BaseException as e: + traceback.print_exception(e) + return '' \ No newline at end of file diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py index 52a33b0de..828196b7b 100644 --- a/apps/common/handle/impl/pdf_split_handle.py +++ b/apps/common/handle/impl/pdf_split_handle.py @@ -11,6 +11,7 @@ import os import re import tempfile import time +import traceback from typing import List import fitz @@ -297,3 +298,17 @@ class PdfSplitHandle(BaseSplitHandle): if file_name.endswith(".pdf") or file_name.endswith(".PDF"): return True return False + + def get_content(self, file): + with tempfile.NamedTemporaryFile(delete=False) as temp_file: + # 将上传的文件保存到临时文件中 + temp_file.write(file.read()) + # 获取临时文件的路径 + temp_file_path = temp_file.name + + pdf_document = fitz.open(temp_file_path) + try: + return self.handle_pdf_content(file, pdf_document) + except BaseException as e: + traceback.print_exception(e) + return '' \ No newline at end of file diff --git a/apps/common/handle/impl/table/csv_parse_table_handle.py b/apps/common/handle/impl/table/csv_parse_table_handle.py index 71152f38e..c3a85db86 100644 --- a/apps/common/handle/impl/table/csv_parse_table_handle.py +++ b/apps/common/handle/impl/table/csv_parse_table_handle.py @@ -34,3 +34,11 @@ class CsvSplitHandle(BaseParseTableHandle): paragraphs.append({'title': '', 'content': line}) return [{'name': file.name, 'paragraphs': paragraphs}] + + def get_content(self, file): + buffer = file.read() + try: + return buffer.decode(detect(buffer)['encoding']) + except BaseException as e: + max_kb.error(f'csv split handle error: {e}') + return [{'name': file.name, 'paragraphs': []}] \ No newline at end of file diff --git a/apps/common/handle/impl/table/xls_parse_table_handle.py b/apps/common/handle/impl/table/xls_parse_table_handle.py index 6c30d49de..9259a3550 100644 --- a/apps/common/handle/impl/table/xls_parse_table_handle.py +++ b/apps/common/handle/impl/table/xls_parse_table_handle.py @@ -60,3 +60,24 @@ class XlsSplitHandle(BaseParseTableHandle): max_kb.error(f'excel split handle error: {e}') return [{'name': file.name, 'paragraphs': []}] return result + + def get_content(self, file): + # 打开 .xls 文件 + workbook = xlrd.open_workbook(file_contents=file.read(), formatting_info=True) + sheets = workbook.sheets() + md_tables = '' + for sheet in sheets: + + # 获取表头和内容 + headers = sheet.row_values(0) + data = [sheet.row_values(row_idx) for row_idx in range(1, sheet.nrows)] + + # 构建 Markdown 表格 + md_table = '| ' + ' | '.join(headers) + ' |\n' + md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n' + for row in data: + # 将每个单元格中的内容替换换行符为
以保留原始格式 + md_table += '| ' + ' | '.join([str(cell).replace('\n', '
') if cell else '' for cell in row]) + ' |\n' + md_tables += md_table + '\n\n' + + return md_tables diff --git a/apps/common/handle/impl/table/xlsx_parse_table_handle.py b/apps/common/handle/impl/table/xlsx_parse_table_handle.py index 35ef2f14b..e92d3c11a 100644 --- a/apps/common/handle/impl/table/xlsx_parse_table_handle.py +++ b/apps/common/handle/impl/table/xlsx_parse_table_handle.py @@ -72,3 +72,31 @@ class XlsxSplitHandle(BaseParseTableHandle): max_kb.error(f'excel split handle error: {e}') return [{'name': file.name, 'paragraphs': []}] return result + + + def get_content(self, file): + # 加载 Excel 文件 + workbook = load_workbook(file) + md_tables = '' + # 如果未指定 sheet_name,则使用第一个工作表 + for sheetname in workbook.sheetnames: + sheet = workbook[sheetname] if sheetname else workbook.active + + # 获取工作表的所有行 + rows = list(sheet.iter_rows(values_only=True)) + if not rows: + continue + + # 提取表头和内容 + headers = rows[0] + data = rows[1:] + + # 构建 Markdown 表格 + md_table = '| ' + ' | '.join(headers) + ' |\n' + md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n' + for row in data: + md_table += '| ' + ' | '.join( + [str(cell).replace('\n', '
') if cell is not None else '' for cell in row]) + ' |\n' + + md_tables += md_table + '\n\n' + return md_tables \ No newline at end of file diff --git a/apps/common/handle/impl/text_split_handle.py b/apps/common/handle/impl/text_split_handle.py index 467607ff5..984c4e1e9 100644 --- a/apps/common/handle/impl/text_split_handle.py +++ b/apps/common/handle/impl/text_split_handle.py @@ -7,6 +7,7 @@ @desc: """ import re +import traceback from typing import List from charset_normalizer import detect @@ -49,3 +50,11 @@ class TextSplitHandle(BaseSplitHandle): return {'name': file.name, 'content': split_model.parse(content) } + + def get_content(self, file): + buffer = file.read() + try: + return buffer.decode(detect(buffer)['encoding']) + except BaseException as e: + traceback.print_exception(e) + return '' \ No newline at end of file diff --git a/ui/src/components/ai-chat/ExecutionDetailDialog.vue b/ui/src/components/ai-chat/ExecutionDetailDialog.vue index f63c7392d..bc3f2e160 100644 --- a/ui/src/components/ai-chat/ExecutionDetailDialog.vue +++ b/ui/src/components/ai-chat/ExecutionDetailDialog.vue @@ -182,6 +182,25 @@ + + +