diff --git a/apps/common/utils/common.py b/apps/common/utils/common.py
index a3db845fe..4a741852f 100644
--- a/apps/common/utils/common.py
+++ b/apps/common/utils/common.py
@@ -23,6 +23,7 @@ from pydub import AudioSegment
 
 from ..exception.app_exception import AppApiException
 from ..models.db_model_manage import DBModelManage
+import hashlib
 
 
 def password_encrypt(row_password):
@@ -124,6 +125,7 @@ def get_file_content(path):
         content = file.read()
     return content
 
+
 def sub_array(array: List, item_num=10):
     result = []
     temp = []
@@ -270,3 +272,11 @@ def bulk_create_in_batches(model, data, batch_size=1000):
         batch = data[i:i + batch_size]
         model.objects.bulk_create(batch)
 
+
+def get_sha256_hash(_bytes):
+    """Return the hexadecimal SHA-256 digest of a bytes-like object.
+
+    :param _bytes: bytes-like content to hash
+    :return: the digest as a hex string
+    """
+    return hashlib.sha256(_bytes).hexdigest()
diff --git a/apps/knowledge/migrations/0005_remove_file_workspace_id_file_file_size_and_more.py b/apps/knowledge/migrations/0005_remove_file_workspace_id_file_file_size_and_more.py
new file mode 100644
index 000000000..4d4b3ee44
--- /dev/null
+++ b/apps/knowledge/migrations/0005_remove_file_workspace_id_file_file_size_and_more.py
@@ -0,0 +1,37 @@
+# Generated by Django 5.2 on 2025-05-07 03:40
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('knowledge', '0004_knowledge_file_size_limit_alter_document_status_and_more'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='file',
+            name='workspace_id',
+        ),
+        migrations.AddField(
+            model_name='file',
+            name='file_size',
+            field=models.IntegerField(default=0, verbose_name='文件大小'),
+        ),
+        migrations.AddField(
+            model_name='file',
+            name='sha256_hash',
+            field=models.CharField(default='', verbose_name='文件sha256_hash标识'),
+        ),
+        migrations.AddField(
+            model_name='file',
+            name='source_id',
+            field=models.CharField(default='TEMPORARY_100_MINUTE', verbose_name='资源id'),
+        ),
+        migrations.AddField(
+            model_name='file',
+            name='source_type',
+            field=models.CharField(choices=[('KNOWLEDGE', 'Knowledge'), ('APPLICATION', 'Application'), ('TEMPORARY_30_MINUTE', 'Temporary 30 Minute'), ('TEMPORARY_100_MINUTE', 'Temporary 120 Minute'), ('TEMPORARY_1_DAY', 'Temporary 1 Day')], default='TEMPORARY_100_MINUTE', verbose_name='资源类型'),
+        )
+    ]
diff --git a/apps/knowledge/models/knowledge.py b/apps/knowledge/models/knowledge.py
index ce087324c..aa1ef7f6b 100644
--- a/apps/knowledge/models/knowledge.py
+++ b/apps/knowledge/models/knowledge.py
@@ -3,6 +3,7 @@ from enum import Enum
 import uuid_utils.compat as uuid
 from django.contrib.postgres.search import SearchVectorField
 from django.db import models
+from django.db.models import QuerySet
 from django.db.models.signals import pre_delete
 from django.dispatch import receiver
 from mptt.fields import TreeForeignKey
@@ -10,6 +11,7 @@ from mptt.models import MPTTModel
 
 from common.db.sql_execute import select_one
 from common.mixins.app_model_mixin import AppModelMixin
+from common.utils.common import get_sha256_hash
 from models_provider.models import Model
 from users.models import User
 
@@ -221,6 +223,22 @@ class SearchMode(models.TextChoices):
     blend = 'blend'
 
 
+class FileSourceType(models.TextChoices):
+    """Category that decides who owns a stored file and when it is cleaned up."""
+    # Knowledge base: removed when the knowledge base is deleted; source_id is the knowledge base id
+    KNOWLEDGE = "KNOWLEDGE"
+    # Application: removed when the application is deleted; source_id is the application id
+    APPLICATION = "APPLICATION"
+    # Temporary, 30 minutes: data is cleaned up after 30 minutes; source_id is TEMPORARY_30_MINUTE
+    TEMPORARY_30_MINUTE = "TEMPORARY_30_MINUTE"
+    # Temporary, 120 minutes: data is cleaned up after 120 minutes; source_id is TEMPORARY_100_MINUTE
+    # NOTE(review): the member is named ..._120_... but its stored value is ..._100_...; the value is
+    # kept because migration 0005 already uses it as the column default -- confirm which is intended
+    TEMPORARY_120_MINUTE = "TEMPORARY_100_MINUTE"
+    # Temporary, 1 day: data is cleaned up after 1 day; source_id is TEMPORARY_1_DAY
+    TEMPORARY_1_DAY = "TEMPORARY_1_DAY"
+
+
 class VectorField(models.Field):
     def db_type(self, connection):
         return 'vector'
@@ -246,7 +264,15 @@ class Embedding(models.Model):
 class File(AppModelMixin):
     id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid7, editable=False, verbose_name="主键id")
     file_name = models.CharField(max_length=256, verbose_name="文件名称", default="")
-    workspace_id = models.CharField(max_length=64, verbose_name="工作空间id", default="default", db_index=True)
+    # Size of the stored content in bytes (filled in by save())
+    file_size = models.IntegerField(verbose_name="文件大小", default=0)
+    # SHA-256 hex digest of the content; save() uses it to reuse an existing large object
+    sha256_hash = models.CharField(verbose_name="文件sha256_hash标识", default="")
+    # Ownership/lifetime category, see FileSourceType
+    source_type = models.CharField(verbose_name="资源类型", choices=FileSourceType,
+                                   default=FileSourceType.TEMPORARY_120_MINUTE.value)
+    # Id of the owning resource, or the temporary bucket name for temporary files
+    source_id = models.CharField(verbose_name="资源id", default=FileSourceType.TEMPORARY_120_MINUTE.value)
     loid = models.IntegerField(verbose_name="loid")
     meta = models.JSONField(verbose_name="文件关联数据", default=dict)
 
@@ -254,8 +280,32 @@ class File(AppModelMixin):
         db_table = "file"
 
     def save(self, bytea=None, force_insert=False, force_update=False, using=None, update_fields=None):
-        result = select_one("SELECT lo_from_bytea(%s, %s::bytea) as loid", [0, bytea])
-        self.loid = result['loid']
-        super().save()
+        """Save the row, storing ``bytea`` as a PostgreSQL large object.
+
+        Content is de-duplicated by SHA-256: when another row already has the
+        same digest, its large object (``loid``) is reused instead of creating
+        a new one.
+
+        :param bytea: file content as bytes; when ``None`` the stored content is
+            left untouched and only the row itself is saved
+        :param force_insert: forwarded to ``Model.save``
+        :param force_update: forwarded to ``Model.save``
+        :param using: forwarded to ``Model.save``
+        :param update_fields: forwarded to ``Model.save``
+        """
+        if bytea is not None:
+            sha256_hash = get_sha256_hash(bytea)
+            # Reuse the large object of any row that already holds identical content
+            existing = QuerySet(File).filter(sha256_hash=sha256_hash).first()
+            if existing is not None:
+                self.loid = existing.loid
+            else:
+                result = select_one("SELECT lo_from_bytea(%s, %s::bytea) as loid", [0, bytea])
+                self.loid = result['loid']
+            # Persist digest and size; without the digest the lookup above can never match
+            self.sha256_hash = sha256_hash
+            self.file_size = len(bytea)
+        super().save(force_insert=force_insert, force_update=force_update, using=using,
+                     update_fields=update_fields)
 
     def get_bytes(self):
@@ -265,4 +315,9 @@ class File(AppModelMixin):
 
 @receiver(pre_delete, sender=File)
 def on_delete_file(sender, instance, **kwargs):
-    select_one(f'SELECT lo_unlink({instance.loid})', [])
+    """Unlink the file's large object unless another File row still references it."""
+    # De-duplicated files share one loid, so only the last remaining row may unlink it
+    shared = QuerySet(File).filter(loid=instance.loid).exclude(id=instance.id).exists()
+    if not shared:
+        # Pass loid as a bound parameter instead of formatting it into the SQL text
+        select_one('SELECT lo_unlink(%s)', [instance.loid])