mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
feat: 高级编排支持文件上传(WIP)
This commit is contained in:
parent
a0cfcb73a9
commit
b57a619bdb
|
|
@ -1,23 +1,36 @@
|
|||
# coding=utf-8
|
||||
import io
|
||||
|
||||
from django.db.models import QuerySet
|
||||
|
||||
from application.flow.i_step_node import NodeResult
|
||||
from application.flow.step_node.document_extract_node.i_document_extract_node import IDocumentExtractNode
|
||||
from dataset.models import File
|
||||
from dataset.serializers.document_serializers import split_handles, parse_table_handle_list, FileBufferHandle
|
||||
|
||||
|
||||
class BaseDocumentExtractNode(IDocumentExtractNode):
    """Workflow node that turns the uploaded documents into one text blob."""

    def execute(self, document, **kwargs):
        """Extract the textual content of every uploaded document.

        :param document: iterable of dicts; each entry is read through its
            ``'file_id'`` and ``'name'`` keys. May be ``None`` or empty.
        :param kwargs: ignored by this implementation.
        :return: ``NodeResult`` whose first mapping holds the concatenated
            text under ``'content'`` (an empty string when nothing was
            extracted).
        """
        get_buffer = FileBufferHandle().get_buffer

        # Stored before validation so the raw input is always in the context.
        self.context['document_list'] = document

        # The None/empty check must come before anything that iterates or
        # measures `document`; calling len() on None raises TypeError.
        if not document:
            return NodeResult({'content': ''}, {})

        spliter = '\n-----------------------------------\n'
        handlers = parse_table_handle_list + split_handles
        sections = []

        for doc in document:
            file = QuerySet(File).filter(id=doc['file_id']).first()
            if file is None:
                # The referenced file row no longer exists; nothing to read.
                continue

            buffer = io.BytesIO(file.get_byte().tobytes())
            # In-memory buffers have no name; the handlers are given the
            # buffer and inspect it in support(), so attach the file name.
            buffer.name = doc['name']

            for split_handle in handlers:
                if split_handle.support(buffer, get_buffer):
                    # Rewind to the start of the file before reading content.
                    buffer.seek(0)
                    file_content = split_handle.get_content(buffer)
                    sections.append(spliter + '## ' + doc['name'] + '\n' + file_content)
                    # One handler per document; move on to the next document
                    # instead of returning, so later documents are not lost.
                    break

        return NodeResult({'content': ''.join(sections)}, {})
|
||||
|
||||
|
|
|
|||
|
|
@ -17,3 +17,7 @@ class BaseParseTableHandle(ABC):
|
|||
@abstractmethod
def handle(self, file, get_buffer, save_image):
    """Abstract hook: parse ``file`` into paragraphs.

    No behavior is defined here; concrete table handlers override it.
    NOTE(review): the implementations visible nearby return a list of
    ``{'name': ..., 'paragraphs': [...]}`` dicts - confirm against all
    subclasses before relying on that shape.
    """
|
||||
|
||||
@abstractmethod
def get_content(self, file):
    """Abstract hook: return the content of ``file``.

    No behavior is defined here; concrete table handlers override it.
    NOTE(review): callers concatenate the result to a string, so
    implementations are presumably expected to return ``str`` - confirm.
    """
|
||||
|
|
@ -18,3 +18,7 @@ class BaseSplitHandle(ABC):
|
|||
@abstractmethod
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
    """Abstract hook: split ``file`` into segments.

    No behavior is defined here; concrete split handlers override it.
    NOTE(review): the implementations visible nearby return a dict with
    ``'name'`` and ``'content'`` keys - confirm against all subclasses.
    """
|
||||
|
||||
@abstractmethod
def get_content(self, file):
    """Abstract hook: return the content of ``file``.

    No behavior is defined here; concrete split handlers override it.
    NOTE(review): the implementations visible nearby return a string
    (empty on failure) - confirm against all subclasses.
    """
|
||||
|
|
|
|||
|
|
@ -189,3 +189,13 @@ class DocSplitHandle(BaseSplitHandle):
|
|||
".DOC") or file_name.endswith(".DOCX"):
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_content(self, file):
|
||||
try:
|
||||
image_list = []
|
||||
buffer = file.read()
|
||||
doc = Document(io.BytesIO(buffer))
|
||||
return self.to_md(doc, image_list, get_image_id_func())
|
||||
except BaseException as e:
|
||||
traceback.print_exception(e)
|
||||
return ''
|
||||
|
|
@ -7,6 +7,7 @@
|
|||
@desc:
|
||||
"""
|
||||
import re
|
||||
import traceback
|
||||
from typing import List
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
|
@ -59,3 +60,14 @@ class HTMLSplitHandle(BaseSplitHandle):
|
|||
return {'name': file.name,
|
||||
'content': split_model.parse(content)
|
||||
}
|
||||
|
||||
def get_content(self, file):
    """Decode an HTML file and convert it with ``html2text``.

    :param file: readable binary file-like object.
    :return: the ``html2text`` result, or ``''`` when decoding or
        conversion fails (the traceback is printed, not raised).
        Errors raised by ``file.read()`` itself are not caught.
    """
    raw = file.read()

    try:
        charset = get_encoding(raw)
        markup = raw.decode(charset)
        return html2text(markup)
    except BaseException as error:
        # Best-effort extraction: report the failure and yield no content.
        traceback.print_exception(error)
        return ''
|
||||
|
|
@ -11,6 +11,7 @@ import os
|
|||
import re
|
||||
import tempfile
|
||||
import time
|
||||
import traceback
|
||||
from typing import List
|
||||
|
||||
import fitz
|
||||
|
|
@ -297,3 +298,17 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
if file_name.endswith(".pdf") or file_name.endswith(".PDF"):
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_content(self, file):
    """Extract the content of a PDF through ``self.handle_pdf_content``.

    The upload is first copied to a temporary file, which is then opened
    with ``fitz.open``.

    :param file: readable binary file-like object.
    :return: the result of ``self.handle_pdf_content``, or ``''`` when
        that call fails (the traceback is printed, not raised).
    :raises Exception: whatever ``fitz.open`` raises when the file
        cannot be opened.
    """
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        # Save the uploaded bytes into the temporary file.
        temp_file.write(file.read())
        # Remember the path; the file is reopened by name below.
        temp_file_path = temp_file.name

    try:
        pdf_document = fitz.open(temp_file_path)
        try:
            return self.handle_pdf_content(file, pdf_document)
        except Exception as e:
            # Best-effort extraction: report the failure, yield no content.
            traceback.print_exception(e)
            return ''
        finally:
            # Release the document before the backing file is deleted.
            pdf_document.close()
    finally:
        # delete=False means nobody else removes the file; do it here so
        # every extraction does not leave a temp file behind.
        os.remove(temp_file_path)
|
||||
|
|
@ -34,3 +34,11 @@ class CsvSplitHandle(BaseParseTableHandle):
|
|||
paragraphs.append({'title': '', 'content': line})
|
||||
|
||||
return [{'name': file.name, 'paragraphs': paragraphs}]
|
||||
|
||||
def get_content(self, file):
    """Return the decoded text of a CSV file.

    :param file: readable binary file-like object.
    :return: the decoded text, or ``''`` when decoding fails (the error
        is logged through ``max_kb``). Always a string, because the
        caller concatenates the result with other strings.
    """
    buffer = file.read()
    try:
        # Detection may yield no encoding; fall back to UTF-8 in that case.
        encoding = detect(buffer)['encoding'] or 'utf-8'
        return buffer.decode(encoding)
    except Exception as e:
        max_kb.error(f'csv split handle error: {e}')
        # Previously this returned the paragraph-list shape used by
        # handle(); string concatenation in the caller would then fail.
        return ''
|
||||
|
|
@ -60,3 +60,24 @@ class XlsSplitHandle(BaseParseTableHandle):
|
|||
max_kb.error(f'excel split handle error: {e}')
|
||||
return [{'name': file.name, 'paragraphs': []}]
|
||||
return result
|
||||
|
||||
def get_content(self, file):
    """Render every sheet of an .xls workbook as a Markdown table.

    The first row of each sheet is used as the table header. Sheets
    without rows are skipped.

    :param file: readable binary file-like object.
    :return: the Markdown tables, each followed by a blank separator;
        ``''`` when no sheet has rows.
    """

    def to_cell(value):
        # Empty cells render as blanks; other values (including 0) are
        # kept, with line breaks turned into <br> so the row stays on
        # one line.
        if value is None or value == '':
            return ''
        return str(value).replace('\n', '<br>')

    # Open the .xls workbook from the in-memory bytes.
    workbook = xlrd.open_workbook(file_contents=file.read(), formatting_info=True)
    md_tables = ''
    for sheet in workbook.sheets():
        # A sheet without rows has no header row to read.
        if sheet.nrows == 0:
            continue

        # Header row.
        headers = sheet.row_values(0)

        # Build the Markdown table; headers go through to_cell because
        # they are not guaranteed to be strings.
        md_table = '| ' + ' | '.join(to_cell(header) for header in headers) + ' |\n'
        md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
        for row_idx in range(1, sheet.nrows):
            row = sheet.row_values(row_idx)
            md_table += '| ' + ' | '.join(to_cell(cell) for cell in row) + ' |\n'
        md_tables += md_table + '\n\n'

    return md_tables
|
||||
|
|
|
|||
|
|
@ -72,3 +72,31 @@ class XlsxSplitHandle(BaseParseTableHandle):
|
|||
max_kb.error(f'excel split handle error: {e}')
|
||||
return [{'name': file.name, 'paragraphs': []}]
|
||||
return result
|
||||
|
||||
|
||||
def get_content(self, file):
    """Render every sheet of an .xlsx workbook as a Markdown table.

    The first row of each sheet is used as the table header. Sheets
    that yield no rows are skipped.

    :param file: file path or readable binary file-like object accepted
        by ``load_workbook``.
    :return: the Markdown tables, each followed by a blank separator;
        ``''`` when no sheet yields rows.
    """

    def to_cell(value):
        # None marks an empty cell; everything else is stringified with
        # line breaks turned into <br> so the row stays on one line.
        if value is None:
            return ''
        return str(value).replace('\n', '<br>')

    # Load the Excel workbook.
    workbook = load_workbook(file)
    md_tables = ''
    for sheetname in workbook.sheetnames:
        sheet = workbook[sheetname] if sheetname else workbook.active

        # All rows of the sheet, as tuples of plain values.
        rows = list(sheet.iter_rows(values_only=True))
        if not rows:
            continue

        # Split header and data rows.
        headers = rows[0]
        data = rows[1:]

        # Build the Markdown table; header cells may be None or numbers,
        # so they are converted before joining.
        md_table = '| ' + ' | '.join(to_cell(header) for header in headers) + ' |\n'
        md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
        for row in data:
            md_table += '| ' + ' | '.join(to_cell(cell) for cell in row) + ' |\n'

        md_tables += md_table + '\n\n'
    return md_tables
|
||||
|
|
@ -7,6 +7,7 @@
|
|||
@desc:
|
||||
"""
|
||||
import re
|
||||
import traceback
|
||||
from typing import List
|
||||
|
||||
from charset_normalizer import detect
|
||||
|
|
@ -49,3 +50,11 @@ class TextSplitHandle(BaseSplitHandle):
|
|||
return {'name': file.name,
|
||||
'content': split_model.parse(content)
|
||||
}
|
||||
|
||||
def get_content(self, file):
    """Return the decoded text of a plain-text file.

    :param file: readable binary file-like object.
    :return: the text decoded with the encoding reported by ``detect``,
        or ``''`` when detection or decoding fails (the traceback is
        printed, not raised). Errors raised by ``file.read()`` itself
        are not caught.
    """
    raw = file.read()
    try:
        detected = detect(raw)
        charset = detected['encoding']
        return raw.decode(charset)
    except BaseException as error:
        # Best-effort extraction: report the failure and yield no content.
        traceback.print_exception(error)
        return ''
|
||||
|
|
@ -182,6 +182,25 @@
|
|||
</div>
|
||||
</template>
|
||||
|
||||
<!-- Document content extraction -->
|
||||
<template v-if="item.type === WorkflowType.DocumentExtractNode">
|
||||
<div class="card-never border-r-4">
|
||||
<h5 class="p-8-12">参数输出</h5>
|
||||
<div class="p-8-12 border-t-dashed lighter">
|
||||
<el-scrollbar height="150">
|
||||
<MdPreview
|
||||
v-if="item.content"
|
||||
ref="editorRef"
|
||||
editorId="preview-only"
|
||||
:modelValue="item.content"
|
||||
style="background: none"
|
||||
/>
|
||||
<template v-else> - </template>
|
||||
</el-scrollbar>
|
||||
</div>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<!-- Function library -->
|
||||
<template
|
||||
v-if="
|
||||
|
|
|
|||
Loading…
Reference in New Issue