From 0316afa299b641be06709f6c0b4def6afeb15797 Mon Sep 17 00:00:00 2001
From: wxg0103 <727495428@qq.com>
Date: Wed, 9 Jul 2025 20:34:34 +0800
Subject: [PATCH] refactor: enhance HTML tag removal in text processing to exclude audio, video, and image tags

---
 apps/common/utils/common.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/apps/common/utils/common.py b/apps/common/utils/common.py
index 0b9732b5c..6d12e803a 100644
--- a/apps/common/utils/common.py
+++ b/apps/common/utils/common.py
@@ -116,6 +116,9 @@ def markdown_to_plain_text(md: str) -> str:
     text = re.sub(r'\n{2,}', '\n', text)
     # 使用正则表达式去除所有 HTML 标签
     text = re.sub(r'<[^>]+>', '', text)
+    # 先移除特定媒体标签(优先级高于通用HTML标签移除)
+    text = re.sub(r'<(audio|video)[^>]*>.*?</\1>', '', text, flags=re.DOTALL)  # 匹配音频/视频标签
+    text = re.sub(r'<img[^>]*>', '', text)  # 匹配图片标签
     # 去除多余的空白字符(包括换行符、制表符等)
     text = re.sub(r'\s+', ' ', text)
     # 去除表单渲染