feat: Support exporting knowledge base documents (#553)

shaohuzhang1 2024-05-27 15:48:24 +08:00 committed by GitHub
parent 768fd7e921
commit 399b2727e8
13 changed files with 263 additions and 8 deletions

View File

@@ -12,13 +12,15 @@ import re
import traceback
import uuid
from functools import reduce
from typing import Dict
from typing import Dict, List
from urllib.parse import urlparse
import xlwt
from django.contrib.postgres.fields import ArrayField
from django.core import validators
from django.db import transaction, models
from django.db.models import QuerySet
from django.http import HttpResponse
from drf_yasg import openapi
from rest_framework import serializers
@@ -668,6 +670,50 @@ class DataSetSerializers(serializers.ModelSerializer):
            if not QuerySet(DataSet).filter(id=self.data.get("id")).exists():
                raise AppApiException(300, "id不存在")

        def export_excel(self, with_valid=True):
            if with_valid:
                self.is_valid(raise_exception=True)
            document_list = QuerySet(Document).filter(dataset_id=self.data.get('id'))
            paragraph_list = native_search(QuerySet(Paragraph).filter(dataset_id=self.data.get("id")), get_file_content(
                os.path.join(PROJECT_DIR, "apps", "dataset", 'sql', 'list_paragraph_document_name.sql')))
            problem_mapping_list = native_search(
                QuerySet(ProblemParagraphMapping).filter(dataset_id=self.data.get("id")), get_file_content(
                    os.path.join(PROJECT_DIR, "apps", "dataset", 'sql', 'list_problem_mapping.sql')),
                with_table_name=True)
            data_dict, document_dict = DocumentSerializers.Operate.merge_problem(paragraph_list, problem_mapping_list,
                                                                                  document_list)
            workbook = DocumentSerializers.Operate.get_workbook(data_dict, document_dict)
            response = HttpResponse(content_type='application/vnd.ms-excel')
            response['Content-Disposition'] = 'attachment; filename="dataset.xls"'
            workbook.save(response)
            return response

        @staticmethod
        def merge_problem(paragraph_list: List[Dict], problem_mapping_list: List[Dict]):
            result = {}
            document_dict = {}
            for paragraph in paragraph_list:
                problem_list = [problem_mapping.get('content') for problem_mapping in problem_mapping_list if
                                problem_mapping.get('paragraph_id') == paragraph.get('id')]
                document_sheet = result.get(paragraph.get('document_id'))
                d = document_dict.get(paragraph.get('document_name'))
                if d is None:
                    document_dict[paragraph.get('document_name')] = {paragraph.get('document_id')}
                else:
                    d.add(paragraph.get('document_id'))
                if document_sheet is None:
                    result[paragraph.get('document_id')] = [[paragraph.get('title'), paragraph.get('content'),
                                                             '\n'.join(problem_list)]]
                else:
                    document_sheet.append([paragraph.get('title'), paragraph.get('content'), '\n'.join(problem_list)])
            result_document_dict = {}
            for d_name in document_dict:
                for index, d_id in enumerate(document_dict.get(d_name)):
                    result_document_dict[d_id] = d_name if index == 0 else d_name + str(index)
            return result, result_document_dict

        @transaction.atomic
        def delete(self):
            self.is_valid()
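
Read together with the two SQL files further down, the merge step above groups paragraph rows per document and derives one unique sheet name per document id. A minimal standalone sketch of that behaviour, using made-up rows shaped like the SQL results (field names are taken from the code; dict.setdefault is used here purely as a compact restatement of the same grouping logic):

# Toy rows shaped like the list_paragraph_document_name.sql /
# list_problem_mapping.sql results used above (values are made up).
paragraph_list = [
    {'id': 'p1', 'document_id': 'd1', 'document_name': 'faq', 'title': 'Login', 'content': 'How to log in'},
    {'id': 'p2', 'document_id': 'd1', 'document_name': 'faq', 'title': 'Reset', 'content': 'Reset a password'},
    {'id': 'p3', 'document_id': 'd2', 'document_name': 'faq', 'title': 'Quota', 'content': 'Usage limits'},
]
problem_mapping_list = [
    {'paragraph_id': 'p1', 'content': 'How do I sign in?'},
    {'paragraph_id': 'p1', 'content': 'Where is the login page?'},
]

result = {}          # document_id -> list of [title, content, joined problems] rows
document_dict = {}   # document_name -> set of document_ids sharing that name
for paragraph in paragraph_list:
    problems = [m.get('content') for m in problem_mapping_list
                if m.get('paragraph_id') == paragraph.get('id')]
    document_dict.setdefault(paragraph.get('document_name'), set()).add(paragraph.get('document_id'))
    result.setdefault(paragraph.get('document_id'), []).append(
        [paragraph.get('title'), paragraph.get('content'), '\n'.join(problems)])

# Duplicate document names get a numeric suffix so each sheet name stays unique.
sheet_names = {}
for name, ids in document_dict.items():
    for index, doc_id in enumerate(ids):
        sheet_names[doc_id] = name if index == 0 else name + str(index)

print(result['d1'])   # two rows for 'd1'; p1 carries both problems joined by '\n', p2 carries ''
print(sheet_names)    # e.g. {'d1': 'faq', 'd2': 'faq1'} (set order decides which id gets the suffix)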

View File

@@ -14,12 +14,14 @@ import uuid
from functools import reduce
from typing import List, Dict
import xlwt
from django.core import validators
from django.db import transaction
from django.db.models import QuerySet
from django.http import HttpResponse
from drf_yasg import openapi
from rest_framework import serializers
from xlwt import Utils
from common.db.search import native_search, native_page_search
from common.event.common import work_thread_pool
@@ -423,6 +425,85 @@ class DocumentSerializers(ApiMixin, serializers.Serializer):
            if not QuerySet(Document).filter(id=document_id).exists():
                raise AppApiException(500, "文档id不存在")

        def export(self, with_valid=True):
            if with_valid:
                self.is_valid(raise_exception=True)
            document = QuerySet(Document).filter(id=self.data.get("document_id")).first()
            paragraph_list = native_search(QuerySet(Paragraph).filter(document_id=self.data.get("document_id")),
                                           get_file_content(
                                               os.path.join(PROJECT_DIR, "apps", "dataset", 'sql',
                                                            'list_paragraph_document_name.sql')))
            problem_mapping_list = native_search(
                QuerySet(ProblemParagraphMapping).filter(document_id=self.data.get("document_id")), get_file_content(
                    os.path.join(PROJECT_DIR, "apps", "dataset", 'sql', 'list_problem_mapping.sql')),
                with_table_name=True)
            data_dict, document_dict = self.merge_problem(paragraph_list, problem_mapping_list, [document])
            workbook = self.get_workbook(data_dict, document_dict)
            response = HttpResponse(content_type='application/vnd.ms-excel')
            response['Content-Disposition'] = 'attachment; filename="data.xls"'
            workbook.save(response)
            return response

        @staticmethod
        def get_workbook(data_dict, document_dict):
            # Create the workbook object
            workbook = xlwt.Workbook(encoding='utf-8')
            for sheet_id in data_dict:
                # Add one worksheet per document
                worksheet = workbook.add_sheet(document_dict.get(sheet_id))
                data = [
                    ['分段标题(选填)', '分段内容(必填,问题答案,最长不超过4096个字符)', '问题(选填,单元格内一行一个)'],
                    *data_dict.get(sheet_id)
                ]
                # Write the rows into the worksheet
                for row_idx, row in enumerate(data):
                    for col_idx, col in enumerate(row):
                        worksheet.write(row_idx, col_idx, col)
            # Return the workbook; the caller wraps it in an HttpResponse to send the Excel file
            return workbook

        @staticmethod
        def merge_problem(paragraph_list: List[Dict], problem_mapping_list: List[Dict], document_list):
            result = {}
            document_dict = {}
            for paragraph in paragraph_list:
                problem_list = [problem_mapping.get('content') for problem_mapping in problem_mapping_list if
                                problem_mapping.get('paragraph_id') == paragraph.get('id')]
                document_sheet = result.get(paragraph.get('document_id'))
                document_name = DocumentSerializers.Operate.reset_document_name(paragraph.get('document_name'))
                d = document_dict.get(document_name)
                if d is None:
                    document_dict[document_name] = {paragraph.get('document_id')}
                else:
                    d.add(paragraph.get('document_id'))
                if document_sheet is None:
                    result[paragraph.get('document_id')] = [[paragraph.get('title'), paragraph.get('content'),
                                                             '\n'.join(problem_list)]]
                else:
                    document_sheet.append([paragraph.get('title'), paragraph.get('content'), '\n'.join(problem_list)])
            for document in document_list:
                if document.id not in result:
                    document_name = DocumentSerializers.Operate.reset_document_name(document.name)
                    result[document.id] = [[]]
                    d = document_dict.get(document_name)
                    if d is None:
                        document_dict[document_name] = {document.id}
                    else:
                        d.add(document.id)
            result_document_dict = {}
            for d_name in document_dict:
                for index, d_id in enumerate(document_dict.get(d_name)):
                    result_document_dict[d_id] = d_name if index == 0 else d_name + str(index)
            return result, result_document_dict

        @staticmethod
        def reset_document_name(document_name):
            if document_name is None or not Utils.valid_sheet_name(document_name):
                return "Sheet"
            return document_name.strip()

        def one(self, with_valid=False):
            if with_valid:
                self.is_valid(raise_exception=True)
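
The workbook assembly relies on only a handful of xlwt calls. A rough standalone sketch of the same steps (sheet-name validation, header row, cell writes), with a made-up document name and sample row, saving to a local file instead of an HttpResponse:

import xlwt
from xlwt import Utils

# A candidate sheet name taken from a document title; xlwt rejects names that
# are empty, longer than 31 characters, or contain characters such as / \ ? * [ ] :
document_name = 'faq/2024'
sheet_name = document_name if document_name and Utils.valid_sheet_name(document_name) else 'Sheet'

workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet(sheet_name)   # 'faq/2024' is invalid, so this becomes 'Sheet'

rows = [
    ['分段标题(选填)', '分段内容(必填,问题答案,最长不超过4096个字符)', '问题(选填,单元格内一行一个)'],
    ['Login', 'How to log in', 'How do I sign in?\nWhere is the login page?'],
]
for row_idx, row in enumerate(rows):
    for col_idx, value in enumerate(row):
        worksheet.write(row_idx, col_idx, value)

# In the view the workbook is saved straight into the HttpResponse; here it
# just goes to a local file for inspection.
workbook.save('dataset_export.xls')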

View File

@@ -0,0 +1,5 @@
SELECT
(SELECT "name" FROM "document" WHERE "id"=document_id) as document_name,
*
FROM
"paragraph"

View File

@@ -0,0 +1,2 @@
SELECT "problem"."content",problem_paragraph_mapping.paragraph_id FROM problem problem
LEFT JOIN problem_paragraph_mapping problem_paragraph_mapping ON problem_paragraph_mapping.problem_id=problem."id"

View File

@@ -8,6 +8,7 @@ urlpatterns = [
    path('dataset/web', views.Dataset.CreateWebDataset.as_view()),
    path('dataset/qa', views.Dataset.CreateQADataset.as_view()),
    path('dataset/<str:dataset_id>', views.Dataset.Operate.as_view(), name="dataset_key"),
    path('dataset/<str:dataset_id>/export', views.Dataset.Export.as_view(), name="export"),
    path('dataset/<str:dataset_id>/re_embedding', views.Dataset.Embedding.as_view(), name="dataset_key"),
    path('dataset/<str:dataset_id>/application', views.Dataset.Application.as_view()),
    path('dataset/<int:current_page>/<int:page_size>', views.Dataset.Page.as_view(), name="dataset"),
@@ -27,6 +28,8 @@ urlpatterns = [
    path('dataset/document/split_pattern', views.Document.SplitPattern.as_view(),
         name="document_operate"),
    path('dataset/<str:dataset_id>/document/migrate/<str:target_dataset_id>', views.Document.Migrate.as_view()),
    path('dataset/<str:dataset_id>/document/<str:document_id>/export', views.Document.Export.as_view(),
         name="document_export"),
    path('dataset/<str:dataset_id>/document/<str:document_id>/sync', views.Document.SyncWeb.as_view()),
    path('dataset/<str:dataset_id>/document/<str:document_id>/refresh', views.Document.Refresh.as_view()),
    path('dataset/<str:dataset_id>/document/<str:document_id>/paragraph', views.Paragraph.as_view()),
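
The new routes can also be exercised directly over HTTP. A sketch using the requests library; the /api prefix, host, port, and the way the token is passed are assumptions (ids and file names are placeholders), and both endpoints require a logged-in user's token per the views below:

import requests

BASE = 'http://localhost:8080/api'            # assumed host and URL prefix
HEADERS = {'Authorization': '<token>'}        # assumed header; validated by TokenAuth
dataset_id = '<dataset-id>'
document_id = '<document-id>'

# Whole-dataset export: one worksheet per document, served as dataset.xls
resp = requests.get(f'{BASE}/dataset/{dataset_id}/export', headers=HEADERS)
resp.raise_for_status()
with open('dataset.xls', 'wb') as f:
    f.write(resp.content)

# Single-document export
resp = requests.get(f'{BASE}/dataset/{dataset_id}/document/{document_id}/export', headers=HEADERS)
resp.raise_for_status()
with open('document.xls', 'wb') as f:
    f.write(resp.content)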

View File

@@ -152,6 +152,19 @@ class Dataset(APIView):
            return result.success(
                DataSetSerializers.Operate(data={'id': dataset_id, 'user_id': request.user.id}).re_embedding())

    class Export(APIView):
        authentication_classes = [TokenAuth]

        @action(methods="GET", detail=False)
        @swagger_auto_schema(operation_summary="导出知识库", operation_id="导出知识库",
                             manual_parameters=DataSetSerializers.Operate.get_request_params_api(),
                             tags=["知识库"]
                             )
        @has_permissions(lambda r, keywords: Permission(group=Group.DATASET, operate=Operate.MANAGE,
                                                        dynamic_tag=keywords.get('dataset_id')))
        def get(self, request: Request, dataset_id: str):
            return DataSetSerializers.Operate(data={'id': dataset_id, 'user_id': request.user.id}).export_excel()

    class Operate(APIView):
        authentication_classes = [TokenAuth]

View File

@@ -230,6 +230,20 @@ class Document(APIView):
            ))

    class Export(APIView):
        authentication_classes = [TokenAuth]

        @action(methods=['GET'], detail=False)
        @swagger_auto_schema(operation_summary="导出文档",
                             operation_id="导出文档",
                             manual_parameters=DocumentSerializers.Operate.get_request_params_api(),
                             tags=["知识库/文档"])
        @has_permissions(
            lambda r, k: Permission(group=Group.DATASET, operate=Operate.USE,
                                    dynamic_tag=k.get('dataset_id')))
        def get(self, request: Request, dataset_id: str, document_id: str):
            return DocumentSerializers.Operate(data={'document_id': document_id, 'dataset_id': dataset_id}).export()

    class Operate(APIView):
        authentication_classes = [TokenAuth]

View File

@@ -1,5 +1,5 @@
import { Result } from '@/request/Result'
import { get, post, del, put } from '@/request/index'
import { get, post, del, put, exportExcel } from '@/request/index'
import type { datasetData } from '@/api/type/dataset'
import type { pageRequest } from '@/api/type/common'
import type { ApplicationFormType } from '@/api/type/application'
@@ -187,6 +187,20 @@ const putReEmbeddingDataset: (
return put(`${prefix}/${dataset_id}/re_embedding`, undefined, undefined, loading)
}
/**
 * Export a dataset as an Excel file
 * @param dataset_name dataset name, used as the download file name
 * @param dataset_id dataset id
 * @returns
 */
const exportDataset: (
dataset_name: string,
dataset_id: string,
loading?: Ref<boolean>
) => Promise<any> = (dataset_name, dataset_id, loading) => {
return exportExcel(dataset_name + '.xls', `dataset/${dataset_id}/export`, undefined, loading)
}
export default {
getDataset,
getAllDataset,
@@ -199,5 +213,6 @@ export default {
postWebDataset,
putSyncWebDataset,
putReEmbeddingDataset,
postQADataset
postQADataset,
exportDataset
}

View File

@@ -256,6 +256,28 @@ const exportQATemplate: (fileName: string, type: string, loading?: Ref<boolean>)
return exportExcel(fileName, `${prefix}/document/template/export`, { type }, loading)
}
/**
 * Export a document as an Excel file
 * @param document_name document name, used as the download file name
 * @param dataset_id dataset id
 * @param document_id document id
 * @param loading loading state ref
 * @returns
 */
const exportDocument: (
document_name: string,
dataset_id: string,
document_id: string,
loading?: Ref<boolean>
) => Promise<any> = (document_name, dataset_id, document_id, loading) => {
return exportExcel(
document_name + '.xls',
`${prefix}/${dataset_id}/document/${document_id}/export`,
{},
loading
)
}
export default {
postSplitDocument,
getDocument,
@@ -273,5 +295,6 @@ export default {
putMigrateMulDocument,
batchEditHitHandling,
exportQATemplate,
postQADocument
postQADocument,
exportDocument
}

View File

@@ -910,5 +910,30 @@ export const iconMap: any = {
)
])
}
},
'app-export': {
iconReader: () => {
return h('i', [
h(
'svg',
{
style: { height: '100%', width: '100%' },
viewBox: '0 0 1024 1024',
version: '1.1',
xmlns: 'http://www.w3.org/2000/svg'
},
[
h('path', {
d: 'M142.859375 854.80357107c0 14.52455392 11.8125 26.33705392 26.39732108 26.33705393h263.67187499a26.33705392 26.33705392 0 1 1 1e-8 52.734375H116.52232107A26.33705392 26.33705392 0 0 1 90.125 907.47767893V116.52232107C90.125 101.9375 101.9375 90.125 116.52232107 90.125h790.95535786c14.58482107 0 26.39732108 11.8125 26.39732107 26.39732108v316.40624999a26.33705392 26.33705392 0 1 1-52.734375 1e-8V169.25669607a26.33705392 26.33705392 0 0 0-26.39732107-26.39732107H169.25669607a26.33705392 26.33705392 0 0 0-26.39732107 26.39732108v685.48660785z',
fill: 'currentColor'
}),
h('path', {
d: 'M797.97098203 650.85714298H274.72544608a26.33705392 26.33705392 0 1 0-1e-8 52.734375h523.90848285L638.20089298 863.96428595a26.33705392 26.33705392 0 0 0 37.30580309 37.24553513l205.09151799-205.09151715a26.39732108 26.39732108 0 0 0 0-37.24553596L675.50669608 453.78125a26.39732108 26.39732108 0 1 0-37.30580311 37.36607108l159.77008905 159.7098219z',
fill: 'currentColor'
})
]
)
])
}
}
}

View File

@@ -211,8 +211,13 @@ export const exportExcel: (
url: string,
params: any,
loading?: NProgress | Ref<boolean>
) => void = (fileName: string, url: string, params: any, loading?: NProgress | Ref<boolean>) => {
promise(request({ url: url, method: 'get', params, responseType: 'blob' }), loading)
) => Promise<any> = (
fileName: string,
url: string,
params: any,
loading?: NProgress | Ref<boolean>
) => {
return promise(request({ url: url, method: 'get', params, responseType: 'blob' }), loading)
.then((res: any) => {
if (res) {
const blob = new Blob([res], {
@@ -225,6 +230,7 @@
// Release the object URL to free memory
window.URL.revokeObjectURL(link.href)
}
return true
})
.catch((e) => {})
}

View File

@@ -85,7 +85,12 @@
<el-dropdown-item
icon="Setting"
@click.stop="router.push({ path: `/dataset/${item.id}/setting` })"
>设置</el-dropdown-item
>
设置</el-dropdown-item
>
<el-dropdown-item @click.stop="export_dataset(item)">
<AppIcon iconName="app-export" style="font-size: 16px"></AppIcon
>导出</el-dropdown-item
>
<el-dropdown-item icon="Delete" @click.stop="deleteDataset(item)"
>删除</el-dropdown-item
@@ -144,6 +149,11 @@ function searchHandle() {
datasetList.value = []
getList()
}
const export_dataset = (item: any) => {
datasetApi.exportDataset(item.name, item.id, loading).then((ok) => {
MsgSuccess('导出成功')
})
}
function deleteDataset(row: any) {
MsgConfirm(

View File

@@ -165,6 +165,10 @@
<AppIcon iconName="app-migrate"></AppIcon>
迁移
</el-dropdown-item>
<el-dropdown-item @click="exportDocument(row)">
<AppIcon iconName="app-export"></AppIcon>
导出
</el-dropdown-item>
<el-dropdown-item icon="Delete" @click.stop="deleteDocument(row)"
>删除</el-dropdown-item
>
@@ -200,6 +204,10 @@
<AppIcon iconName="app-migrate"></AppIcon>
迁移</el-dropdown-item
>
<el-dropdown-item @click="exportDocument(row)">
<AppIcon iconName="app-export"></AppIcon>
导出
</el-dropdown-item>
<el-dropdown-item icon="Delete" @click.stop="deleteDocument(row)"
>删除</el-dropdown-item
>
@@ -281,7 +289,11 @@ const multipleSelection = ref<any[]>([])
const title = ref('')
const SelectDatasetDialogRef = ref()
const exportDocument = (document: any) => {
documentApi.exportDocument(document.name, document.dataset_id, document.id, loading).then(() => {
MsgSuccess('导出成功')
})
}
function openDatasetDialog(row?: any) {
const arr: string[] = []
if (row) {