feat: 高级编排支持文件上传(WIP)

This commit is contained in:
CaptainB 2024-11-14 11:11:53 +08:00 committed by 刘瑞斌
parent a0cfcb73a9
commit b57a619bdb
11 changed files with 149 additions and 6 deletions

View File

@ -1,23 +1,36 @@
# coding=utf-8
import io
from django.db.models import QuerySet
from application.flow.i_step_node import NodeResult
from application.flow.step_node.document_extract_node.i_document_extract_node import IDocumentExtractNode
from dataset.models import File
from dataset.serializers.document_serializers import split_handles, parse_table_handle_list, FileBufferHandle
class BaseDocumentExtractNode(IDocumentExtractNode):
    """Workflow node that turns uploaded documents into one text blob."""

    def execute(self, document, **kwargs):
        """Extract the content of every uploaded document.

        :param document: iterable of dicts carrying at least ``file_id`` and
            ``name``; ``None`` (or empty) means nothing was uploaded.
        :return: ``NodeResult`` whose ``content`` is the concatenation of one
            section per document, each introduced by a separator line and a
            ``## <name>`` heading.
        """
        get_buffer = FileBufferHandle().get_buffer

        self.context['document_list'] = document
        spliter = '\n-----------------------------------\n'
        sections = []

        # `document or []` covers both "no upload" (None) and an empty list.
        for doc in document or []:
            file = QuerySet(File).filter(id=doc['file_id']).first()
            if file is None:
                # The referenced file row no longer exists; skip it instead of
                # failing the whole node with an AttributeError.
                continue

            buffer = io.BytesIO(file.get_byte().tobytes())
            # The handlers read the file name from the buffer (they use
            # `file.name`), so a bare BytesIO has to be given one.
            buffer.name = doc['name']

            for split_handle in (parse_table_handle_list + split_handles):
                if split_handle.support(buffer, get_buffer):
                    # Rewind: support() may have consumed part of the buffer.
                    buffer.seek(0)
                    file_content = split_handle.get_content(buffer)
                    sections.append(spliter + '## ' + doc['name'] + '\n' + file_content)
                    # First matching handler wins for this document; move on
                    # to the next document rather than returning early, which
                    # would drop every document after the first one.
                    break

        return NodeResult({'content': ''.join(sections)}, {})

View File

@ -17,3 +17,7 @@ class BaseParseTableHandle(ABC):
@abstractmethod
def handle(self, file, get_buffer, save_image):
    """Parse ``file`` into paragraphs; concrete table handlers must override this."""
    return None
@abstractmethod
def get_content(self, file):
    """Return the content of ``file``; concrete table handlers must override this."""
    return None

View File

@ -18,3 +18,7 @@ class BaseSplitHandle(ABC):
@abstractmethod
def handle(
        self,
        file,
        pattern_list: List,
        with_filter: bool,
        limit: int,
        get_buffer,
        save_image,
):
    """Split ``file`` into paragraphs; concrete split handlers must override this."""
    return None
@abstractmethod
def get_content(self, file):
    """Return the content of ``file``; concrete split handlers must override this."""
    return None

View File

@ -189,3 +189,13 @@ class DocSplitHandle(BaseSplitHandle):
".DOC") or file_name.endswith(".DOCX"):
return True
return False
def get_content(self, file):
    """Return the document rendered through ``self.to_md``.

    Best effort: any failure while reading or converting is printed to
    stderr and an empty string is returned.

    :param file: binary file-like object holding the Word document.
    :return: the converted text, or ``''`` on failure.
    """
    try:
        image_list = []
        buffer = file.read()
        doc = Document(io.BytesIO(buffer))
        return self.to_md(doc, image_list, get_image_id_func())
    except Exception:
        # Catch Exception rather than BaseException so KeyboardInterrupt and
        # SystemExit still propagate. print_exc() also works on interpreters
        # older than 3.10, unlike single-argument print_exception().
        traceback.print_exc()
        return ''

View File

@ -7,6 +7,7 @@
@desc:
"""
import re
import traceback
from typing import List
from bs4 import BeautifulSoup
@ -59,3 +60,14 @@ class HTMLSplitHandle(BaseSplitHandle):
return {'name': file.name,
'content': split_model.parse(content)
}
def get_content(self, file):
    """Decode an HTML file and pass it through ``html2text``.

    Decoding/conversion failures are printed to stderr and yield ``''``.

    :param file: binary file-like object holding the HTML document.
    :return: the converted text, or ``''`` on failure.
    """
    buffer = file.read()
    try:
        encoding = get_encoding(buffer)
        content = buffer.decode(encoding)
        return html2text(content)
    except Exception:
        # Catch Exception rather than BaseException so KeyboardInterrupt and
        # SystemExit still propagate.
        traceback.print_exc()
        return ''

View File

@ -11,6 +11,7 @@ import os
import re
import tempfile
import time
import traceback
from typing import List
import fitz
@ -297,3 +298,17 @@ class PdfSplitHandle(BaseSplitHandle):
if file_name.endswith(".pdf") or file_name.endswith(".PDF"):
return True
return False
def get_content(self, file):
    """Extract the content of a PDF via ``self.handle_pdf_content``.

    The upload is spooled to a temporary file so it can be opened by path.
    The document handle and the temporary file are always released, and
    any failure is printed to stderr and yields ``''``.

    :param file: binary file-like object holding the PDF.
    :return: the extracted content, or ``''`` on failure.
    """
    # delete=False: the file must outlive the `with` block so it can be
    # reopened by path; it is removed explicitly in `finally` below.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(file.read())
        temp_file_path = temp_file.name

    pdf_document = None
    try:
        pdf_document = fitz.open(temp_file_path)
        return self.handle_pdf_content(file, pdf_document)
    except Exception:
        # Catch Exception rather than BaseException so KeyboardInterrupt and
        # SystemExit still propagate.
        traceback.print_exc()
        return ''
    finally:
        if pdf_document is not None:
            pdf_document.close()
        try:
            os.remove(temp_file_path)
        except OSError:
            # Cleanup is best effort; never mask the real result or error.
            pass

View File

@ -34,3 +34,11 @@ class CsvSplitHandle(BaseParseTableHandle):
paragraphs.append({'title': '', 'content': line})
return [{'name': file.name, 'paragraphs': paragraphs}]
def get_content(self, file):
    """Return the CSV file decoded to text.

    :param file: binary file-like object holding the CSV data.
    :return: the decoded text, or ``''`` when decoding fails. The return
        value is always a string, so callers can concatenate it safely.
    """
    buffer = file.read()
    try:
        # detect() may report no encoding at all; fall back to UTF-8.
        encoding = detect(buffer)['encoding'] or 'utf-8'
        return buffer.decode(encoding)
    except Exception as e:
        max_kb.error(f'csv split handle error: {e}')
        # The paragraph-list shape belongs to handle(); get_content() must
        # return text.
        return ''

View File

@ -60,3 +60,24 @@ class XlsSplitHandle(BaseParseTableHandle):
max_kb.error(f'excel split handle error: {e}')
return [{'name': file.name, 'paragraphs': []}]
return result
def get_content(self, file):
    """Render every sheet of an ``.xls`` workbook as a Markdown table.

    The first row of each sheet becomes the table header. Sheets without
    any rows are skipped.

    :param file: binary file-like object holding the workbook.
    :return: the Markdown tables, each followed by two newlines.
    """

    def to_cell(value):
        # Stringify every value (headers may be numeric) and keep falsy
        # values such as 0; line breaks become <br> so a cell stays on one
        # table row.
        return '' if value is None else str(value).replace('\n', '<br>')

    def to_row(values):
        return '| ' + ' | '.join(values) + ' |\n'

    # Open the .xls workbook from the in-memory bytes.
    workbook = xlrd.open_workbook(file_contents=file.read(), formatting_info=True)
    md_tables = []
    for sheet in workbook.sheets():
        if sheet.nrows == 0:
            # An empty sheet has no header row to read.
            continue
        headers = [to_cell(value) for value in sheet.row_values(0)]
        md_table = [to_row(headers), to_row(['---'] * len(headers))]
        for row_idx in range(1, sheet.nrows):
            md_table.append(to_row([to_cell(value) for value in sheet.row_values(row_idx)]))
        md_tables.append(''.join(md_table) + '\n\n')
    return ''.join(md_tables)

View File

@ -72,3 +72,31 @@ class XlsxSplitHandle(BaseParseTableHandle):
max_kb.error(f'excel split handle error: {e}')
return [{'name': file.name, 'paragraphs': []}]
return result
def get_content(self, file):
    """Render every sheet of an ``.xlsx`` workbook as a Markdown table.

    The first row of each sheet becomes the table header. Sheets without
    any rows are skipped.

    :param file: file-like object (or path) accepted by ``load_workbook``.
    :return: the Markdown tables, each followed by two newlines.
    """

    def to_cell(value):
        # Header and data cells may be None or non-string values; stringify
        # them so join() cannot fail, and turn line breaks into <br> so a
        # cell stays on one table row.
        return '' if value is None else str(value).replace('\n', '<br>')

    def to_row(values):
        return '| ' + ' | '.join(values) + ' |\n'

    # Load the Excel workbook.
    workbook = load_workbook(file)
    md_tables = []
    for sheetname in workbook.sheetnames:
        sheet = workbook[sheetname] if sheetname else workbook.active
        # Materialize all rows of the worksheet as plain values.
        rows = list(sheet.iter_rows(values_only=True))
        if not rows:
            continue
        headers = [to_cell(value) for value in rows[0]]
        md_table = [to_row(headers), to_row(['---'] * len(headers))]
        for row in rows[1:]:
            md_table.append(to_row([to_cell(value) for value in row]))
        md_tables.append(''.join(md_table) + '\n\n')
    return ''.join(md_tables)

View File

@ -7,6 +7,7 @@
@desc:
"""
import re
import traceback
from typing import List
from charset_normalizer import detect
@ -49,3 +50,11 @@ class TextSplitHandle(BaseSplitHandle):
return {'name': file.name,
'content': split_model.parse(content)
}
def get_content(self, file):
    """Return the text file decoded with its detected encoding.

    :param file: binary file-like object holding the text.
    :return: the decoded text, or ``''`` when decoding fails.
    """
    buffer = file.read()
    try:
        # detect() may report no encoding at all; fall back to UTF-8.
        encoding = detect(buffer)['encoding'] or 'utf-8'
        return buffer.decode(encoding)
    except Exception:
        # Catch Exception rather than BaseException so KeyboardInterrupt and
        # SystemExit still propagate.
        traceback.print_exc()
        return ''

View File

@ -182,6 +182,25 @@
</div>
</template>
<!-- Document content extraction node: previews item.content with MdPreview inside a
fixed-height scroll area, or shows "-" when there is no content.
NOTE(review): editorId and ref are static; confirm they do not collide if this
template is rendered for several items. -->
<template v-if="item.type === WorkflowType.DocumentExtractNode">
<div class="card-never border-r-4">
<h5 class="p-8-12">参数输出</h5>
<div class="p-8-12 border-t-dashed lighter">
<el-scrollbar height="150">
<MdPreview
v-if="item.content"
ref="editorRef"
editorId="preview-only"
:modelValue="item.content"
style="background: none"
/>
<template v-else> - </template>
</el-scrollbar>
</div>
</div>
</template>
<!-- 函数库 -->
<template
v-if="