fix: Filter special characters

This commit is contained in:
zhangzhanwei 2025-12-11 16:22:03 +08:00 committed by CaptainB
parent 638f90e69f
commit 9347696676
2 changed files with 25 additions and 12 deletions

View File

@ -17,7 +17,7 @@ from django.utils.translation import gettext_lazy as _
from application.flow.i_step_node import NodeResult
from application.flow.step_node.knowledge_write_node.i_knowledge_write_node import IKnowledgeWriteNode
from common.chunk import text_to_chunk
from common.utils.common import bulk_create_in_batches
from common.utils.common import bulk_create_in_batches, filter_special_character
from knowledge.models import Document, KnowledgeType, Paragraph, File, FileSourceType, Problem, ProblemParagraphMapping, \
Tag, DocumentTag
from knowledge.serializers.common import ProblemParagraphObject, ProblemParagraphManage
@ -83,10 +83,11 @@ def get_paragraph_problem_model(knowledge_id: str, document_id: str, instance: D
paragraph = Paragraph(
id=uuid.uuid7(),
document_id=document_id,
content=instance.get("content"),
content=filter_special_character(instance.get("content")),
knowledge_id=knowledge_id,
title=instance.get("title") if 'title' in instance else '',
chunks=instance.get('chunks') if 'chunks' in instance else text_to_chunk(instance.get("content")),
chunks=[filter_special_character(c) for c in (instance.get('chunks') if 'chunks' in instance else text_to_chunk(
instance.get("content")))],
)
problem_paragraph_object_list = [ProblemParagraphObject(
@ -145,11 +146,11 @@ def get_document_paragraph_model(knowledge_id: str, instance: Dict):
instance.get('paragraphs') if 'paragraphs' in instance else []
)
def save_knowledge_tags(knowledge_id: str, tags: List[Dict[str,Any]]):
def save_knowledge_tags(knowledge_id: str, tags: List[Dict[str, Any]]):
existed_tags_dict = {
(key, value): str(tag_id)
for key,value,tag_id in QuerySet(Tag).filter(knowledge_id=knowledge_id).values_list("key", "value", "id")
for key, value, tag_id in QuerySet(Tag).filter(knowledge_id=knowledge_id).values_list("key", "value", "id")
}
tag_model_list = []
@ -158,7 +159,7 @@ def save_knowledge_tags(knowledge_id: str, tags: List[Dict[str,Any]]):
key = tag.get("key")
value = tag.get("value")
if (key,value) not in existed_tags_dict:
if (key, value) not in existed_tags_dict:
tag_model = Tag(
id=uuid.uuid7(),
knowledge_id=knowledge_id,
@ -166,15 +167,16 @@ def save_knowledge_tags(knowledge_id: str, tags: List[Dict[str,Any]]):
value=value
)
tag_model_list.append(tag_model)
new_tag_dict[(key,value)] = str(tag_model.id)
new_tag_dict[(key, value)] = str(tag_model.id)
if tag_model_list:
Tag.objects.bulk_create(tag_model_list)
all_tag_dict={**existed_tags_dict,**new_tag_dict}
all_tag_dict = {**existed_tags_dict, **new_tag_dict}
return all_tag_dict, new_tag_dict
def batch_add_document_tag(document_tag_map: Dict[str, List[str]]):
"""
批量添加文档-标签关联
@ -199,12 +201,13 @@ def batch_add_document_tag(document_tag_map: Dict[str, List[str]]):
)
for doc_id, tag_ids in document_tag_map.items()
for tag_id in tag_ids
if (doc_id,tag_id) not in existed_relations
if (doc_id, tag_id) not in existed_relations
]
if new_relations:
QuerySet(DocumentTag).bulk_create(new_relations)
class BaseKnowledgeWriteNode(IKnowledgeWriteNode):
def save_context(self, details, workflow_manage):
@ -241,7 +244,7 @@ class BaseKnowledgeWriteNode(IKnowledgeWriteNode):
for tag in single_document_tag_list:
tag_key = (tag['key'], tag['value'])
if tag_key not in knowledge_tag_dict:
knowledge_tag_dict[tag_key]= tag
knowledge_tag_dict[tag_key] = tag
if single_document_tag_list:
document_tags_map[str(document_instance.id)] = single_document_tag_list
@ -259,9 +262,9 @@ class BaseKnowledgeWriteNode(IKnowledgeWriteNode):
# 为每个文档添加其对应的标签
for doc_id, doc_tags in document_tags_map.items():
doc_tag_ids = [
all_tag_dict[(tag.get("key"),tag.get("value"))]
all_tag_dict[(tag.get("key"), tag.get("value"))]
for tag in doc_tags
if (tag.get("key"),tag.get("value")) in all_tag_dict
if (tag.get("key"), tag.get("value")) in all_tag_dict
]
if doc_tag_ids:
document_tag_id_map[doc_id] = doc_tag_ids

View File

@ -340,3 +340,13 @@ def generate_uuid(tag: str):
def filter_workspace(query_list):
    """
    Return the entries of ``query_list`` whose ``name`` attribute is not "workspace_id".

    :param query_list: iterable of objects exposing a ``name`` attribute
    :return: a new list with the same objects, in their original order
    """
    kept = []
    for query in query_list:
        if query.name == "workspace_id":
            # Drop the workspace filter entry; everything else is kept as-is
            continue
        kept.append(query)
    return kept
def filter_special_character(_str):
    """
    Strip special characters from a string.

    Two spellings of the NUL character are removed: the real character
    (U+0000) and its escaped six-character text form (a backslash followed
    by "u0000").

    :param _str: text to clean; any non-string value (e.g. ``None``) is returned unchanged
    :return: the cleaned string, or the input itself when it is not a string
    """
    if not isinstance(_str, str):
        # Values looked up with dict.get() may be None; pass them through
        # instead of raising AttributeError on .replace().
        return _str
    # "\x00" is the actual NUL character, "\\u0000" is its escaped text form.
    # NUL goes first so that removing it cannot leave a fresh escaped sequence behind.
    # NOTE(review): presumably the goal is to keep NUL out of the database — confirm.
    for token in ("\x00", "\\u0000"):
        _str = _str.replace(token, '')
    return _str