diff --git a/apps/application/flow/step_node/knowledge_write_node/impl/base_knowledge_write_node.py b/apps/application/flow/step_node/knowledge_write_node/impl/base_knowledge_write_node.py index 6425b3b02..3d2fa5a7a 100644 --- a/apps/application/flow/step_node/knowledge_write_node/impl/base_knowledge_write_node.py +++ b/apps/application/flow/step_node/knowledge_write_node/impl/base_knowledge_write_node.py @@ -16,6 +16,7 @@ from rest_framework import serializers from django.utils.translation import gettext_lazy as _ from application.flow.i_step_node import NodeResult from application.flow.step_node.knowledge_write_node.i_knowledge_write_node import IKnowledgeWriteNode +from common.chunk import text_to_chunk from common.utils.common import bulk_create_in_batches from knowledge.models import Document, KnowledgeType, Paragraph, File, FileSourceType, Problem, ProblemParagraphMapping from knowledge.serializers.common import ProblemParagraphObject, ProblemParagraphManage @@ -67,14 +68,14 @@ def link_file(source_file_id, document_id): # 保存文件内容和元数据 new_file.save(file_content) - def get_paragraph_problem_model(knowledge_id: str, document_id: str, instance: Dict): paragraph = Paragraph( id=uuid.uuid7(), document_id=document_id, content=instance.get("content"), knowledge_id=knowledge_id, - title=instance.get("title") if 'title' in instance else '' + title=instance.get("title") if 'title' in instance else '', + chunks = instance.get('chunks') if 'chunks' in instance else text_to_chunk(instance.get("content")), ) problem_paragraph_object_list = [ProblemParagraphObject( diff --git a/apps/knowledge/migrations/0006_paragraph_chunks.py b/apps/knowledge/migrations/0006_paragraph_chunks.py new file mode 100644 index 000000000..10131e488 --- /dev/null +++ b/apps/knowledge/migrations/0006_paragraph_chunks.py @@ -0,0 +1,19 @@ +# Generated by Django 5.2.8 on 2025-11-24 07:09 + +import django.contrib.postgres.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('knowledge', '0005_knowledgeaction'), + ] + + operations = [ + migrations.AddField( + model_name='paragraph', + name='chunks', + field=django.contrib.postgres.fields.ArrayField(base_field=models.CharField(), default=list, size=None, verbose_name='块'), + ), + ] diff --git a/apps/knowledge/models/knowledge.py b/apps/knowledge/models/knowledge.py index 95c695eb7..ef72f4647 100644 --- a/apps/knowledge/models/knowledge.py +++ b/apps/knowledge/models/knowledge.py @@ -3,6 +3,7 @@ import zipfile from enum import Enum import uuid_utils.compat as uuid +from django.contrib.postgres.fields import ArrayField from django.contrib.postgres.search import SearchVectorField from django.db import models from django.db.models import QuerySet @@ -242,6 +243,7 @@ class Paragraph(AppModelMixin): hit_num = models.IntegerField(verbose_name="命中次数", default=0) is_active = models.BooleanField(default=True, db_index=True) position = models.IntegerField(verbose_name="段落顺序", default=0, db_index=True) + chunks = ArrayField(verbose_name="块", base_field=models.CharField(), default=list) class Meta: db_table = "paragraph" diff --git a/apps/knowledge/vector/base_vector.py b/apps/knowledge/vector/base_vector.py index 2a7731fce..57171189c 100644 --- a/apps/knowledge/vector/base_vector.py +++ b/apps/knowledge/vector/base_vector.py @@ -23,7 +23,7 @@ lock = threading.Lock() def chunk_data(data: Dict): if str(data.get('source_type')) == str(SourceType.PARAGRAPH.value): text = data.get('text') - chunk_list = text_to_chunk(text) + chunk_list = data.get('chunks') if data.get('chunks') else text_to_chunk(text) return [{**data, 'text': chunk} for chunk in chunk_list] return [data] @@ -63,7 +63,8 @@ class BaseVectorStore(ABC): BaseVectorStore.vector_exists = True return True - def save(self, text, source_type: SourceType, knowledge_id: str, document_id: str, paragraph_id: str, source_id: str, + def save(self, text, source_type: SourceType, knowledge_id: str, document_id: str, paragraph_id: str, + source_id: str, is_active: bool, embedding: Embeddings): """ @@ -104,7 +105,8 @@ class BaseVectorStore(ABC): break @abstractmethod - def _save(self, text, source_type: SourceType, knowledge_id: str, document_id: str, paragraph_id: str, source_id: str, + def _save(self, text, source_type: SourceType, knowledge_id: str, document_id: str, paragraph_id: str, + source_id: str, is_active: bool, embedding: Embeddings): pass