From 86f500208f297dd39ff52d1afb7dc08e12432568 Mon Sep 17 00:00:00 2001 From: shaohuzhang1 <80892890+shaohuzhang1@users.noreply.github.com> Date: Thu, 23 May 2024 14:19:18 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=94=AF=E6=8C=81=E4=B8=8A=E4=BC=A0htm?= =?UTF-8?q?l=E6=A0=BC=E5=BC=8F=E7=9A=84=E6=96=87=E6=A1=A3=20#364=20(#518)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/common/handle/impl/html_split_handle.py | 66 +++++++++++++++++++ .../serializers/document_serializers.py | 3 +- ui/src/utils/utils.ts | 2 +- .../dataset/component/UploadComponent.vue | 6 +- 4 files changed, 73 insertions(+), 4 deletions(-) create mode 100644 apps/common/handle/impl/html_split_handle.py diff --git a/apps/common/handle/impl/html_split_handle.py b/apps/common/handle/impl/html_split_handle.py new file mode 100644 index 000000000..564f6db49 --- /dev/null +++ b/apps/common/handle/impl/html_split_handle.py @@ -0,0 +1,66 @@ +# coding=utf-8 +""" + @project: maxkb + @Author:虎 + @file: html_split_handle.py + @date:2024/5/23 10:58 + @desc: +""" +import re +from typing import List + +from bs4 import BeautifulSoup +from charset_normalizer import detect +from html2text import html2text + +from common.handle.base_split_handle import BaseSplitHandle +from common.util.split_model import SplitModel + +default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), + re.compile('(?<=\\n)(? 0: + charset = charset_list[0] + return charset + return detect(buffer)['encoding'] + + +class HTMLSplitHandle(BaseSplitHandle): + def support(self, file, get_buffer): + buffer = get_buffer(file) + file_name: str = file.name.lower() + if file_name.endswith(".html"): + return True + result = detect(buffer) + if result['encoding'] is not None and result['confidence'] is not None and result['encoding'] != 'ascii' and \ + result['confidence'] > 0.5: + return True + return False + + def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image): + buffer = get_buffer(file) + + if pattern_list is not None and len(pattern_list) > 0: + split_model = SplitModel(pattern_list, with_filter, limit) + else: + split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit) + try: + encoding = get_encoding(buffer) + content = buffer.decode(encoding) + content = html2text(content) + except BaseException as e: + return {'name': file.name, + 'content': []} + return {'name': file.name, + 'content': split_model.parse(content) + } diff --git a/apps/dataset/serializers/document_serializers.py b/apps/dataset/serializers/document_serializers.py index b659ea978..c5f88e336 100644 --- a/apps/dataset/serializers/document_serializers.py +++ b/apps/dataset/serializers/document_serializers.py @@ -25,6 +25,7 @@ from common.event.common import work_thread_pool from common.event.listener_manage import ListenerManagement, SyncWebDocumentArgs, UpdateEmbeddingDatasetIdArgs from common.exception.app_exception import AppApiException from common.handle.impl.doc_split_handle import DocSplitHandle +from common.handle.impl.html_split_handle import HTMLSplitHandle from common.handle.impl.pdf_split_handle import PdfSplitHandle from common.handle.impl.text_split_handle import TextSplitHandle from common.mixins.api_mixin import ApiMixin @@ -772,7 +773,7 @@ class FileBufferHandle: default_split_handle = TextSplitHandle() -split_handles = [DocSplitHandle(), PdfSplitHandle(), default_split_handle] +split_handles = [HTMLSplitHandle(), DocSplitHandle(), PdfSplitHandle(), default_split_handle] def save_image(image_list): diff --git a/ui/src/utils/utils.ts b/ui/src/utils/utils.ts index 027b1a67b..581a4ec5c 100644 --- a/ui/src/utils/utils.ts +++ b/ui/src/utils/utils.ts @@ -43,7 +43,7 @@ export function getImgUrl(name: string) { } // 是否是白名单后缀 export function isRightType(name: string) { - const typeList = ['txt', 'pdf', 'docx', 'csv', 'md'] + const typeList = ['txt', 'pdf', 'docx', 'csv', 'md', 'html'] return typeList.includes(fileType(name)) } diff --git a/ui/src/views/dataset/component/UploadComponent.vue b/ui/src/views/dataset/component/UploadComponent.vue index 98a785a93..54181156e 100644 --- a/ui/src/views/dataset/component/UploadComponent.vue +++ b/ui/src/views/dataset/component/UploadComponent.vue @@ -17,7 +17,7 @@ action="#" :auto-upload="false" :show-file-list="false" - accept=".txt, .md, .csv, .log, .docx, .pdf" + accept=".txt, .md, .csv, .log, .docx, .pdf, .html" :limit="50" :on-exceed="onExceed" :on-change="fileHandleChange" @@ -31,7 +31,9 @@ 选择文件夹

-

支持格式:TXT、Markdown、PDF、DOCX,每次最多上传50个文件,每个文件不超过 100MB

+

+ 支持格式:TXT、Markdown、PDF、DOCX、HTML 每次最多上传50个文件,每个文件不超过 100MB +

若使用【高级分段】建议上传前规范文件的分段标识