feat: Chunks stored

2025-12-26 01:33:05 +00:00 · 2025-11-24 15:48:36 +08:00 · 2025-11-24 15:48:36 +08:00 · e5a2c576dc
parent 1d60741b4f
commit e5a2c576dc
4 changed files with 29 additions and 5 deletions
--- a/apps/application/flow/step_node/knowledge_write_node/impl/base_knowledge_write_node.py
+++ b/apps/application/flow/step_node/knowledge_write_node/impl/base_knowledge_write_node.py
@@ -16,6 +16,7 @@ from rest_framework import serializers
 from django.utils.translation import gettext_lazy as _
 from application.flow.i_step_node import NodeResult
 from application.flow.step_node.knowledge_write_node.i_knowledge_write_node import IKnowledgeWriteNode
+from common.chunk import text_to_chunk
 from common.utils.common import bulk_create_in_batches
 from knowledge.models import Document, KnowledgeType, Paragraph, File, FileSourceType, Problem, ProblemParagraphMapping
 from knowledge.serializers.common import ProblemParagraphObject, ProblemParagraphManage
@@ -67,14 +68,14 @@ def link_file(source_file_id, document_id):
        # 保存文件内容和元数据
        new_file.save(file_content)

-
 def get_paragraph_problem_model(knowledge_id: str, document_id: str, instance: Dict):
    paragraph = Paragraph(
        id=uuid.uuid7(),
        document_id=document_id,
        content=instance.get("content"),
        knowledge_id=knowledge_id,
-        title=instance.get("title") if 'title' in instance else ''
+        title=instance.get("title") if 'title' in instance else '',
+        chunks = instance.get('chunks') if 'chunks' in instance else text_to_chunk(instance.get("content")),
    )

    problem_paragraph_object_list = [ProblemParagraphObject(
--- a/apps/knowledge/migrations/0006_paragraph_chunks.py
+++ b/apps/knowledge/migrations/0006_paragraph_chunks.py
@@ -0,0 +1,19 @@
+# Generated by Django 5.2.8 on 2025-11-24 07:09
+
+import django.contrib.postgres.fields
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('knowledge', '0005_knowledgeaction'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='paragraph',
+            name='chunks',
+            field=django.contrib.postgres.fields.ArrayField(base_field=models.CharField(), default=list, size=None, verbose_name='块'),
+        ),
+    ]
--- a/apps/knowledge/models/knowledge.py
+++ b/apps/knowledge/models/knowledge.py
@@ -3,6 +3,7 @@ import zipfile
 from enum import Enum

 import uuid_utils.compat as uuid
+from django.contrib.postgres.fields import ArrayField
 from django.contrib.postgres.search import SearchVectorField
 from django.db import models
 from django.db.models import QuerySet
@@ -242,6 +243,7 @@ class Paragraph(AppModelMixin):
    hit_num = models.IntegerField(verbose_name="命中次数", default=0)
    is_active = models.BooleanField(default=True, db_index=True)
    position = models.IntegerField(verbose_name="段落顺序", default=0, db_index=True)
+    chunks = ArrayField(verbose_name="块", base_field=models.CharField(), default=list)

    class Meta:
        db_table = "paragraph"
--- a/apps/knowledge/vector/base_vector.py
+++ b/apps/knowledge/vector/base_vector.py
@@ -23,7 +23,7 @@ lock = threading.Lock()
 def chunk_data(data: Dict):
    if str(data.get('source_type')) == str(SourceType.PARAGRAPH.value):
        text = data.get('text')
-        chunk_list = text_to_chunk(text)
+        chunk_list = data.get('chunks') if data.get('chunks') else text_to_chunk(text)
        return [{**data, 'text': chunk} for chunk in chunk_list]
    return [data]

@@ -63,7 +63,8 @@ class BaseVectorStore(ABC):
                BaseVectorStore.vector_exists = True
        return True

-    def save(self, text, source_type: SourceType, knowledge_id: str, document_id: str, paragraph_id: str, source_id: str,
+    def save(self, text, source_type: SourceType, knowledge_id: str, document_id: str, paragraph_id: str,
+             source_id: str,
             is_active: bool,
             embedding: Embeddings):
        """
@@ -104,7 +105,8 @@ class BaseVectorStore(ABC):
                break

    @abstractmethod
-    def _save(self, text, source_type: SourceType, knowledge_id: str, document_id: str, paragraph_id: str, source_id: str,
+    def _save(self, text, source_type: SourceType, knowledge_id: str, document_id: str, paragraph_id: str,
+              source_id: str,
              is_active: bool,
              embedding: Embeddings):
        pass