From cc62c35995e14897aaca88d0d14e9ea61fc30556 Mon Sep 17 00:00:00 2001
From: shaohuzhang1
Date: Thu, 29 Feb 2024 18:48:10 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E5=88=86=E6=AE=B5=E6=97=B6=20title?=
 =?UTF-8?q?=E8=B6=85=E8=BF=87256=E5=AD=97=E7=AC=A6=E5=B0=86=E8=B6=85?=
 =?UTF-8?q?=E5=87=BA=E9=83=A8=E5=88=86=E6=8B=BC=E6=8E=A5=E7=BB=99content?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer notes (this area is ignored by `git am` and is not part of the
commit message):

  * Subject, decoded and translated: "fix: when splitting, if the title
    exceeds 256 characters, append the excess part to content".
  * What the diff does: SplitModel now post-processes every paragraph via
    post_reset_paragraph(), which first strips "#" from the title
    (filter_title_special_characters) and then, in sub_title(), keeps the
    first 255 characters of an over-long title and prepends the remainder
    to the paragraph's content.
  * NOTE(review): the subject says 256 characters, but sub_title() checks
    `len(title) > 255` and slices at 255 -- confirm which limit is
    intended.

 apps/common/util/split_model.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/apps/common/util/split_model.py b/apps/common/util/split_model.py
index 4fccc3291..104ce2cec 100644
--- a/apps/common/util/split_model.py
+++ b/apps/common/util/split_model.py
@@ -8,7 +8,7 @@
 """
 import re
 from functools import reduce
-from typing import List
+from typing import List, Dict
 
 import jieba
 
@@ -334,7 +334,24 @@ class SplitModel:
         result = result_tree_to_paragraph(result_tree, [], [])
         # 过滤段落内容不为空字符串的数据
         result = [item for item in result if 'content' in item and len(item.get('content').strip()) > 0]
-        return [{**item, 'title': item.get('title').replace("#", '') if 'title' in item else ''} for item in result]
+        return [self.post_reset_paragraph(item) for item in result]
+
+    def post_reset_paragraph(self, paragraph: Dict):
+        result = self.filter_title_special_characters(paragraph)
+        result = self.sub_title(result)
+        return result
+
+    @staticmethod
+    def sub_title(paragraph: Dict):
+        if 'title' in paragraph:
+            title = paragraph.get('title')
+            if len(title) > 255:
+                return {**paragraph, 'title': title[0:255], 'content': title[255:len(title)] + paragraph.get('content')}
+        return paragraph
+
+    @staticmethod
+    def filter_title_special_characters(paragraph: Dict):
+        return {**paragraph, 'title': paragraph.get('title').replace("#", '') if 'title' in paragraph else ''}
 
 
 default_split_pattern = {