feat: Chunks stored

This commit is contained in:
zhangzhanwei 2025-11-24 15:48:36 +08:00 committed by zhanweizhang7
parent 1d60741b4f
commit e5a2c576dc
4 changed files with 29 additions and 5 deletions

View File

@ -16,6 +16,7 @@ from rest_framework import serializers
from django.utils.translation import gettext_lazy as _
from application.flow.i_step_node import NodeResult
from application.flow.step_node.knowledge_write_node.i_knowledge_write_node import IKnowledgeWriteNode
from common.chunk import text_to_chunk
from common.utils.common import bulk_create_in_batches
from knowledge.models import Document, KnowledgeType, Paragraph, File, FileSourceType, Problem, ProblemParagraphMapping
from knowledge.serializers.common import ProblemParagraphObject, ProblemParagraphManage
@ -67,14 +68,14 @@ def link_file(source_file_id, document_id):
# Save the file content and metadata
new_file.save(file_content)
def get_paragraph_problem_model(knowledge_id: str, document_id: str, instance: Dict):
paragraph = Paragraph(
id=uuid.uuid7(),
document_id=document_id,
content=instance.get("content"),
knowledge_id=knowledge_id,
title=instance.get("title") if 'title' in instance else ''
title=instance.get("title") if 'title' in instance else '',
chunks = instance.get('chunks') if 'chunks' in instance else text_to_chunk(instance.get("content")),
)
problem_paragraph_object_list = [ProblemParagraphObject(

View File

@ -0,0 +1,19 @@
# Generated by Django 5.2.8 on 2025-11-24 07:09
import django.contrib.postgres.fields
from django.db import migrations, models
class Migration(migrations.Migration):
    """Schema migration that adds the ``chunks`` array field to ``paragraph``."""

    # This migration is ordered after migration 0005 of the knowledge app.
    dependencies = [
        ('knowledge', '0005_knowledgeaction'),
    ]

    operations = [
        migrations.AddField(
            model_name='paragraph',
            name='chunks',
            # Array of CharField values with no size limit; rows without a
            # value get a fresh empty list.
            field=django.contrib.postgres.fields.ArrayField(
                base_field=models.CharField(),
                default=list,
                size=None,
                verbose_name='',
            ),
        ),
    ]

View File

@ -3,6 +3,7 @@ import zipfile
from enum import Enum
import uuid_utils.compat as uuid
from django.contrib.postgres.fields import ArrayField
from django.contrib.postgres.search import SearchVectorField
from django.db import models
from django.db.models import QuerySet
@ -242,6 +243,7 @@ class Paragraph(AppModelMixin):
# Integer counter labelled "hit count" (verbose_name); starts at 0.
hit_num = models.IntegerField(verbose_name="命中次数", default=0)
# Indexed boolean flag; True by default.
is_active = models.BooleanField(default=True, db_index=True)
# Indexed integer labelled "paragraph order" (verbose_name); starts at 0.
position = models.IntegerField(verbose_name="段落顺序", default=0, db_index=True)
# PostgreSQL array of CharField values; each new row defaults to a fresh empty list.
# NOTE(review): verbose_name is an empty string — consider giving it a descriptive label.
chunks = ArrayField(verbose_name="", base_field=models.CharField(), default=list)
class Meta:
db_table = "paragraph"

View File

@ -23,7 +23,7 @@ lock = threading.Lock()
def chunk_data(data: Dict):
    """Expand one record into one record per text chunk.

    A record whose ``source_type`` does not match ``SourceType.PARAGRAPH`` is
    returned untouched as a single-element list. For a paragraph record, the
    chunks found under ``data['chunks']`` are used when that value is present
    and non-empty; otherwise the chunks are computed from ``data['text']``
    with ``text_to_chunk``.

    :param data: record dict; the keys ``source_type``, ``text`` and ``chunks`` are read.
    :return: a list of dicts, each a shallow copy of ``data`` whose ``text`` is
             replaced by one chunk, or ``[data]`` for non-paragraph records.
    """
    if str(data.get('source_type')) != str(SourceType.PARAGRAPH.value):
        return [data]
    # Chunk the text only as a fallback, so records that already carry
    # chunks never pay for a second, discarded chunking pass.
    chunk_list = data.get('chunks') or text_to_chunk(data.get('text'))
    return [{**data, 'text': chunk} for chunk in chunk_list]
@ -63,7 +63,8 @@ class BaseVectorStore(ABC):
BaseVectorStore.vector_exists = True
return True
def save(self, text, source_type: SourceType, knowledge_id: str, document_id: str, paragraph_id: str, source_id: str,
def save(self, text, source_type: SourceType, knowledge_id: str, document_id: str, paragraph_id: str,
source_id: str,
is_active: bool,
embedding: Embeddings):
"""
@ -104,7 +105,8 @@ class BaseVectorStore(ABC):
break
@abstractmethod
def _save(self, text, source_type: SourceType, knowledge_id: str, document_id: str, paragraph_id: str,
          source_id: str,
          is_active: bool,
          embedding: Embeddings):
    """Abstract single-record save hook; concrete subclasses must override it.

    The parameter list is the same as that of the public ``save`` method.
    The base implementation does nothing.

    :param text: text of the record to save.
    :param source_type: ``SourceType`` value of the record.
    :param knowledge_id: id of the knowledge base.
    :param document_id: id of the document.
    :param paragraph_id: id of the paragraph.
    :param source_id: id of the source — presumably the paragraph or problem
                      the text came from; confirm against callers.
    :param is_active: active flag for the record.
    :param embedding: ``Embeddings`` object supplied by the caller.
    """
    pass