From 57b15a8a7f833fe04e4e445753639fdad245ce8a Mon Sep 17 00:00:00 2001 From: CaptainB Date: Thu, 22 Aug 2024 16:52:32 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E7=9F=A5=E8=AF=86=E5=BA=93=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E4=B8=8A=E4=BC=A0csv=E5=92=8Cexcel?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --story=1016154 --user=刘瑞斌 【知识库】-支持上传表格类型文档(Excel/CSV)按行分段 https://www.tapd.cn/57709429/s/1567910 --- apps/common/handle/base_parse_table_handle.py | 19 +++++++ .../impl/table/csv_parse_table_handle.py | 34 ++++++++++++ .../impl/table/excel_parse_table_handle.py | 49 ++++++++++++++++++ .../serializers/document_serializers.py | 43 +++++++++++++++ apps/dataset/template/MaxKB表格模板.csv | 13 +++++ apps/dataset/template/MaxKB表格模板.xlsx | Bin 0 -> 9864 bytes apps/dataset/urls.py | 2 + apps/dataset/views/document.py | 29 +++++++++++ ui/src/api/document.ts | 27 ++++++++++ ui/src/utils/utils.ts | 1 + .../views/dataset/UploadDocumentDataset.vue | 15 ++++++ .../dataset/component/UploadComponent.vue | 43 ++++++++++++++- 12 files changed, 274 insertions(+), 1 deletion(-) create mode 100644 apps/common/handle/base_parse_table_handle.py create mode 100644 apps/common/handle/impl/table/csv_parse_table_handle.py create mode 100644 apps/common/handle/impl/table/excel_parse_table_handle.py create mode 100644 apps/dataset/template/MaxKB表格模板.csv create mode 100644 apps/dataset/template/MaxKB表格模板.xlsx diff --git a/apps/common/handle/base_parse_table_handle.py b/apps/common/handle/base_parse_table_handle.py new file mode 100644 index 000000000..e5331e19f --- /dev/null +++ b/apps/common/handle/base_parse_table_handle.py @@ -0,0 +1,19 @@ +# coding=utf-8 +""" + @project: maxkb + @Author:虎 + @file: base_parse_qa_handle.py + @date:2024/5/21 14:56 + @desc: +""" +from abc import ABC, abstractmethod + + +class BaseParseTableHandle(ABC): + @abstractmethod + def support(self, file, get_buffer): + pass + + @abstractmethod + def handle(self, file, get_buffer): + pass diff --git a/apps/common/handle/impl/table/csv_parse_table_handle.py b/apps/common/handle/impl/table/csv_parse_table_handle.py new file mode 100644 index 000000000..1104dd899 --- /dev/null +++ b/apps/common/handle/impl/table/csv_parse_table_handle.py @@ -0,0 +1,34 @@ +# coding=utf-8 +import logging + +from charset_normalizer import detect + +from common.handle.base_parse_table_handle import BaseParseTableHandle + +max_kb = logging.getLogger("max_kb") + + +class CsvSplitHandle(BaseParseTableHandle): + def support(self, file, get_buffer): + file_name: str = file.name.lower() + if file_name.endswith(".csv"): + return True + return False + + def handle(self, file, get_buffer): + buffer = get_buffer(file) + try: + content = buffer.decode(detect(buffer)['encoding']) + except BaseException as e: + max_kb.error(f'csv split handle error: {e}') + return [{'name': file.name, 'paragraphs': []}] + + csv_model = content.split('\n') + paragraphs = [] + # 第一行为标题 + title = csv_model[0].split(',') + for row in csv_model[1:]: + line = '; '.join([f'{key}:{value}' for key, value in zip(title, row.split(','))]) + paragraphs.append({'title': '', 'content': line}) + + return [{'name': file.name, 'paragraphs': paragraphs}] diff --git a/apps/common/handle/impl/table/excel_parse_table_handle.py b/apps/common/handle/impl/table/excel_parse_table_handle.py new file mode 100644 index 000000000..665e70ebc --- /dev/null +++ b/apps/common/handle/impl/table/excel_parse_table_handle.py @@ -0,0 +1,49 @@ +# coding=utf-8 +import io +import logging + +from openpyxl import load_workbook + +from common.handle.base_parse_table_handle import BaseParseTableHandle + +max_kb = logging.getLogger("max_kb") + + +class ExcelSplitHandle(BaseParseTableHandle): + def support(self, file, get_buffer): + file_name: str = file.name.lower() + if file_name.endswith('.xls') or file_name.endswith('.xlsx'): + return True + return False + + def handle(self, file, get_buffer): + buffer = get_buffer(file) + try: + wb = load_workbook(io.BytesIO(buffer)) + result = [] + for sheetname in wb.sheetnames: + paragraphs = [] + ws = wb[sheetname] + rows = list(ws.rows) + if not rows: continue + ti = list(rows[0]) + for r in list(rows[1:]): + title = [] + l = [] + for i, c in enumerate(r): + if not c.value: + continue + t = str(ti[i].value) if i < len(ti) else "" + title.append(t) + t += (": " if t else "") + str(c.value) + l.append(t) + l = "; ".join(l) + if sheetname.lower().find("sheet") < 0: + l += " ——" + sheetname + paragraphs.append({'title': '', 'content': l}) + result.append({'name': sheetname, 'paragraphs': paragraphs}) + + except BaseException as e: + max_kb.error(f'excel split handle error: {e}') + return [{'name': file.name, 'paragraphs': []}] + return result diff --git a/apps/dataset/serializers/document_serializers.py b/apps/dataset/serializers/document_serializers.py index 43d401428..0c29d348a 100644 --- a/apps/dataset/serializers/document_serializers.py +++ b/apps/dataset/serializers/document_serializers.py @@ -33,6 +33,8 @@ from common.handle.impl.pdf_split_handle import PdfSplitHandle from common.handle.impl.qa.csv_parse_qa_handle import CsvParseQAHandle from common.handle.impl.qa.xls_parse_qa_handle import XlsParseQAHandle from common.handle.impl.qa.xlsx_parse_qa_handle import XlsxParseQAHandle +from common.handle.impl.table.csv_parse_table_handle import CsvSplitHandle +from common.handle.impl.table.excel_parse_table_handle import ExcelSplitHandle from common.handle.impl.text_split_handle import TextSplitHandle from common.mixins.api_mixin import ApiMixin from common.util.common import post, flat_map @@ -51,6 +53,7 @@ from embedding.task.embedding import embedding_by_document, delete_embedding_by_ from smartdoc.conf import PROJECT_DIR parse_qa_handle_list = [XlsParseQAHandle(), CsvParseQAHandle(), XlsxParseQAHandle()] +parse_table_handle_list = [CsvSplitHandle(), ExcelSplitHandle()] class FileBufferHandle: @@ -152,6 +155,13 @@ class DocumentInstanceQASerializer(ApiMixin, serializers.Serializer): error_messages=ErrMessage.file("文件"))) +class DocumentInstanceTableSerializer(ApiMixin, serializers.Serializer): + file_list = serializers.ListSerializer(required=True, + error_messages=ErrMessage.list("文件列表"), + child=serializers.FileField(required=True, + error_messages=ErrMessage.file("文件"))) + + class DocumentSerializers(ApiMixin, serializers.Serializer): class Export(ApiMixin, serializers.Serializer): type = serializers.CharField(required=True, validators=[ @@ -187,6 +197,23 @@ class DocumentSerializers(ApiMixin, serializers.Serializer): return HttpResponse(content, status=200, headers={'Content-Type': 'application/vnd.ms-excel', 'Content-Disposition': 'attachment; filename="excel_template.xlsx"'}) + def table_export(self, with_valid=True): + if with_valid: + self.is_valid(raise_exception=True) + + if self.data.get('type') == 'csv': + file = open(os.path.join(PROJECT_DIR, "apps", "dataset", 'template', 'MaxKB表格模板.csv'), "rb") + content = file.read() + file.close() + return HttpResponse(content, status=200, headers={'Content-Type': 'text/cxv', + 'Content-Disposition': 'attachment; filename="csv_template.csv"'}) + elif self.data.get('type') == 'excel': + file = open(os.path.join(PROJECT_DIR, "apps", "dataset", 'template', 'MaxKB表格模板.xlsx'), "rb") + content = file.read() + file.close() + return HttpResponse(content, status=200, headers={'Content-Type': 'application/vnd.ms-excel', + 'Content-Disposition': 'attachment; filename="excel_template.xlsx"'}) + class Migrate(ApiMixin, serializers.Serializer): dataset_id = serializers.UUIDField(required=True, error_messages=ErrMessage.char( @@ -633,6 +660,14 @@ class DocumentSerializers(ApiMixin, serializers.Serializer): return parse_qa_handle.handle(file, get_buffer) raise AppApiException(500, '不支持的文件格式') + @staticmethod + def parse_table_file(file): + get_buffer = FileBufferHandle().get_buffer + for parse_table_handle in parse_table_handle_list: + if parse_table_handle.support(file, get_buffer): + return parse_table_handle.handle(file, get_buffer) + raise AppApiException(500, '不支持的文件格式') + def save_qa(self, instance: Dict, with_valid=True): if with_valid: DocumentInstanceQASerializer(data=instance).is_valid(raise_exception=True) @@ -641,6 +676,14 @@ class DocumentSerializers(ApiMixin, serializers.Serializer): document_list = flat_map([self.parse_qa_file(file) for file in file_list]) return DocumentSerializers.Batch(data={'dataset_id': self.data.get('dataset_id')}).batch_save(document_list) + def save_table(self, instance: Dict, with_valid=True): + if with_valid: + DocumentInstanceTableSerializer(data=instance).is_valid(raise_exception=True) + self.is_valid(raise_exception=True) + file_list = instance.get('file_list') + document_list = flat_map([self.parse_table_file(file) for file in file_list]) + return DocumentSerializers.Batch(data={'dataset_id': self.data.get('dataset_id')}).batch_save(document_list) + @post(post_function=post_embedding) @transaction.atomic def save(self, instance: Dict, with_valid=False, **kwargs): diff --git a/apps/dataset/template/MaxKB表格模板.csv b/apps/dataset/template/MaxKB表格模板.csv new file mode 100644 index 000000000..7cf0f6306 --- /dev/null +++ b/apps/dataset/template/MaxKB表格模板.csv @@ -0,0 +1,13 @@ +职务,报销类型,一线城市报销标准(元),二线城市报销标准(元),三线城市报销标准(元) +普通员工,住宿费,500,400,300 +部门主管,住宿费,600,500,400 +部门总监,住宿费,700,600,500 +区域总经理,住宿费,800,700,600 +普通员工,伙食费,50,40,30 +部门主管,伙食费,50,40,30 +部门总监,伙食费,50,40,30 +区域总经理,伙食费,50,40,30 +普通员工,交通费,50,40,30 +部门主管,交通费,50,40,30 +部门总监,交通费,50,40,30 +区域总经理,交通费,50,40,30 diff --git a/apps/dataset/template/MaxKB表格模板.xlsx b/apps/dataset/template/MaxKB表格模板.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..2bc94a5b80dab10b685f8e411f5bebfdd66636cb GIT binary patch literal 9864 zcma)iby!tR_ck5U5+V)K-O}A9UDDlgXplU#gd7^Ay9A^~LK^8t@*GM+y88otpYnZv z@Ab_ev-h=U-D~ZA&&-aFjQ`&!eGE!D~h$v`?`ze@$u3a7Cq($W*8K|A9@vuU7iJv z^muE>kge6bkQxZTFzmNQeVJV#m}yv7*-8-kirYY_U`-cclfY$8uCn0E>_YeW=*}Gc z$;$2J;qECjGlcPKt0K8`Q+WrLZ3|ZA2SRgD7gn@>M?nukhnI#-tUuJN&~`!b4n>}~ zTu_A>;;nZpjaynTuP+@A;s435zvN42p$B$#9@xeDJG-Wi0P{zBqd{MkI$1F#_GNCw z*L+rZD#)~D>7lnH%xf#xG8Ua-q)guI)OJczBSY^JbSA&HAbUNL#R&y z%BO`HA|yZkAThFI{ywkIs}R175RHm2K>Y3YQ7$<2sHY>(9#xy=i)3Iv@p_?z&pulB z?4cm-1_(SVy1=c;Mw;m!hPJwIhsYy8!BF1L@kO9hkd^xdR>r%aSeawv9cVJ1Py!sQ zeO6hdyEE^queaY=X|anycSapbMU86pU}NmETN23}Fs2F_4)$%psMWEom?}tOvDG7A2Y&4N zwMb|rSse2WnF4C4gNI$x)*Ww0NpF3tOhq%K?&DL`0E)8QHhrbu^^G)Nt)#jgz;nETPtgP(8K&99q zmYzVJfqvMQ&Q|0Ji8dfv>g+tujIS^VBOzKd-H0o6a<#7T+ugYozT+E%fb?gr4sE`5GUgkRws+w3p`^;uJtCc+3B$kbDMV;12Aof z(!Zr6XTz(h_7+)vhMws)0K8@RIJRPDSs&XwwuSeN<_2l*i@PHFvS$}htW5AGwRsJ{bs8O#W-BYl~NykpPn#lDc5y_Dj1RZN%muz0XXGa*qZiH9pcKi{p(3&_rQL zt$Gg8vFLz=!u&$pW;ouAT@)!hVSVy#bP-I~nFZ`!Suhll`XxxQAZl4X}ubps7L6lj*7UHyQj5rs{Gr%`D0jkQ0(Pmj8S zf%r4u5nl&n3>D3ko;cL*KGAf0 zr#oGZea@i?|755FKwM7arkNUVW$}@Iyj3qf_FZG(O6J>-U}3YS7ntjgURSN>*M-M5 z-r20wNGz~l&u3z4wncbEo1^st_wB3Z*Wd?pwY@6~ikPmVNMm@!jNUQdX}s-fxUD^kEAg49>oFx`9!p1<51Efy_QPjtMdiFkSgVM^D|dNMp4bhl|7--;wnC zfP9ZzP66H&GoXE)OT)P10lbP^NgU>hQ%}jffvoa-Al`Qr#QMpwR&y-_Jg!?g$J*cv zrpWHb=M2V}joupMx+Gq>UK;Dg&`xWsJR}og?`Y#|UPi@+QPRI*3+`+o?Z6h+?WR+j zMThCqZuY5YESt%Ap{5Bf3D&)Eb{XL!8xvVa(qs9)Ey ziR5Cke zWJ1H~z|IMVDdwSs??cfq!mD5Dw_h^Bq+bUtdSU0G@Hk*k6QZabb3EM4`5z5;Y&p0i z47IfH31j6I&A<~izzKq@x>l1|r@Ajy1Nb?LV}RQNWA8X_rRH2zJPc%Cv3hQ@K=S5G zCo;Uj<#9z}mOT)wG9}W3^Pr9~ijyywUHGJ=;8~BjdK$XOiK4svkJrfmw|{xut33D@ zS1WUS^WXL=!B^L)7w}L}tq*SF(ZT;I_EY$$)7aG+iY4U2X?t-f9JR5uM>*8%K4n*T zxh>`~uMuCrJtq4;ig3i!jK6sZtxlSfQbJVkU8A#zNP*^x>Rnn01+D5meCBR{ml9Df zUU0UJ5=Z8;j~{8<`Ar;y7AYksPq!Xs5_iXVH!Z3YpVk_@4UkSA1PuSE0?i5 z+9^iT)9wED{6n_j^Hn%RvaCqKNhDLcnwAV=wqRpQ=8vdF2lj)WL6o15`iH~wI7ZV) zGXOYK?oTDgS!GmUZ1yuxI&MRT(c!yGaBPfZyk1#%F|?1><_o=TvQi;=PceridLEu} zdI8+K?~a63r4legve7#IQ;{{VCKZB~{; zcBU+vS5~?LaMbVmqU0DCvEmro(plV zv+v3wG}OdiM)6rVIZdivFsTL76XTN8spEYEL4ShJ-gj#^kecH#;Y?v~*SiD6ya6l$ ziLEZYH^!q5wc%bK=s3Nl0;i)#+47Pc%*1c@hVwk7MOY-6B7V-_PUs@x+eqyW^avBb zKY*?r+T*W3GZBjK;y6C#aSrH>8lzrag3{A+(ZSjq8ISM>2-P?@VDg;~EsB-I&kZvI zKghnJf6eATD3C*&XceT!2zwf>$ad!G5sOLlq1UW0)yVYu!V6JsEJv9uGccw#ovX~# zzJUcxvi!M-p~H;q{^F4xZXvfgWz2BzsD0Dz;6s8m*@O%AGZnGj08;Yj*HJ%)E=POZ zf`b)av6_oo28SHI~c3y-@b0kVZc8Pefwgx@SCG^0R+VY_jh_0^=| zbjmmnYvvWWcd!EU6x~Z?tH*Vjd{*}SOv;InugHEJM7BK;$N}_!;m-jp8ss_yj4=BQE}uc)wX_^-%L(tL#w!B>AHU_ z3(IRZSL?dxob&cAmr6H2`)oP2G~aPZky~!@s;;6wu@(oPh+;oOtY!?A*67Jz{G!N1 zZ$?cq!-*x@14o3>8C^Pw$S3a*)4SoRQ>v9`#TF<8>InA!6d_c?te!L&;gV-ynhS!K zM*$kWz-&Lu!AjbZB2hvNUx{O9ORl=D$}*fN@(e}EVooS?!nu!aEU>BuIi<~Q6E(;vYNpr6{j|FOQz1x)_$dV&2CXas zMc&~>7X(Lt=dZ4OlN^YOr%EMV1^FZ_3$*zJq+ms0G+>pioK>! z-9bb7d%aVwK|)B#XSWRvfdH4;(*44frP-B^Lv!ZRQ`I^d$t{4HfdLKf*;Ga(6i<q-+T@*$z=~AbNYf-X**ZH+bc(D zgHp%fe7$22J*%8wI@^TPVtYIczow~!BEH&v_NqqmzQX7pz8&E_XP_4H)lI5^l~@FL zb`ST@KY{JoCIdB8C@4DvD5!^B=Ho));_78*?((>=T-4i*otJrjAJcVjR1ky_K3VB? zxU`qF_(PYkL=2Y+30Zlaa`;0C_IIM92A|D*u}U0tTx49@_ev{`YILNVFo-LqtKR1$ zoLhS|ckr31)q)&{v(61(hXz_UM3$LeOMb@!$E{xL#rv`i?~T^3fEAJ6Iwdm0*{dFl zU%qIrH@_^msiBA#^BJuZ^atB%#AK+fe?8M``^ftwR`B~(&T8;ebvAr^4fIZ@M)HBY z;hc#E>EzoSjsQb%ztINq3j(6PJY(?eA{NEA&&NVqw;AUS85$5JKc3fe$gj`H-ggmyhnRy79*s}G`94f&s)AN5w%b) zeiXE#raw~dJ--oo0=)}1zp*enCss899F=_^Apxs~xw8gIi6Obcwsa~|(tq=SuG%b6 z@O*baU#vc;*m=qrH=nZJvx!Gk+d!XG&@Mb6CN`*{ELyNp_A<;E8=u=*Hh1A+o=N>GfvH3 zTmKn7ZuGpQ80Qv=MjE^PQAwcTI-rwts8H9DJH0Tctn9lX>|Nm!=MF)54kKZy^{t!H za`99e*VH7c+c$Eul{N}4+r-mPsgO3jmmD`UD;5h|I)1hvD!&b(-q$t$eEc%${c&#g zn08J2^E+E-j!u6jhn{$m0mmU$t+wff!2`p$KO2rCKS5QMS8M)K6ME0^={W|wvG)8K zkhwficI*?n?lyU4{Y$~3XB|B3P8jH8mYW!3#p4*_+MIJsKR%rW3RekwCYVNEV5@Y@ zJ!g!!*XR(}-iZ7jm?ygZA@oie=AFt|xU}dKMhv_nGH4`{0sVJDw7iDW$|7bNDe}u( zpH}^H-;767Q;WOn5xmV$#ka~g;Y2Mko~-4o_EI@D5tLxgkKd8h?^DKcAs1I34MFXu zaLvc=lyozrjoj+P#F>oC@zf!1zQa!n!xktkY?IU-E|A^fO_s>s(o4(zG;9qr5lVvJ zaq!F}5&PyfS5Q^pwCmeCtam8Dmg|J_rFqqJ$qmP%ZCRe8F;MJ6C00i$EE>(q8nv6T-ReNwt5)duY|nNX!cKM8t^DawWE z-Wp-JNZalTszlp<7hqIt21eMGk9?^(C%~J9iQeFK8)mLpXx4r?03Ub%tPxX+>i_|S zLQX%S9XGntBX@JZ>)f^rhqq)dF&|mC-P0`-MbXg>NFYx^z3TnP!yRYe-6m;^T;Xp$=2N@@<5?3(bm zJ2YwFXFFY#s+NH>L!uQI)_5h%L9+WFSvVH!&NLk%XS8tY=` zq~DOb&CBEJ+GpbY9&aK?WNf!~agZDMt2%6$K z#}6B^H<<2Eaj0av)TmL#b7m+gMARg@OeJtXF;N8NVqsxmXlyowBSlI{2ndmFMkw50 z0{a)?i0Lsf!Z)5%o(m~99)t%#hlQ`Ljj$+@P*Vy}XP>i>rTV%0Top;jU$><%v~b*l zYl4${L#SPG0^da9QPK4Dr&TGwy7zMQzHVdV7x2qvrVeAWgckNXVWb<8MpHzJ=%=BP z^&&w_!$h-e@ISlRiVqI%4WjPqSgU#U>fRZVRmkh;;78~AEB{Nj5GL9$siNU1ikE5X ziGVUlHH`1mz-Q5yQwJAgAr~bhm7%>jW^RF;Fm53ClEe0(iBCaK*egKlz-9{^)-~No zopv+b5Y5is7y9@E>=j~Vfiwt*pYUU>%P9j92irBsx-1JdthfX$+{i6B9-1L>Whim! z*0{Q8v3la+deEAnzS85A5dJ6w1cq7ylA5Yj1!cvb2!L^HL3Wx#8Rb;Kb8Ricj!oLC zCl|qMgszF$u-xL|-xSJ6Z9E4F#0D)M?Z248q$&1XBjpMNuZAVF&-8 zcGG(5F1ufPZN=7pK^j8i-I^;cDqM=zp!i=__O+0ihOM0^wm>tW#g4p;I`0ssYy2vh#Z|jzM7ZX|ahPB5Z2~A0+osrw&EH zqsQo{Qq8P=QSed)J&T;G#vAoJ7EO^Dp$L}@aS7D<%H6M9bpnI;s@#fy#p`IwhTvKc zOl1-tS$FnM92_@ zs1cyi)-)k1eQt#LEU|TC=}25}>Y*imTvs*C{P|GccXdGtw-uoD7{nPrw%x6h(Da!S z!Ck(^-tKg|ZkaBJLY~c!{ar1OZMm9KS|*n^3%aC|EyqsnAo`~LFTNk0Zdki!idwuj zeb6`1<=#P_kqBBO3I|^&W`Ozc(q)|mEQaVwu0|zIHKQ5{M9(brDqE|h2_ zX_i=??xg}GTIPwps3PxojKSy`!H6W|>*JU?7JV`^Y2($JF*j*wFvt_tev-Yph43Sg>c1`SQ~cGocK zOtx_ps=F*krRtM5Ot5cw`>eU%h-%5aTwX<@TIK#ov7oLL7v!61F5aQ65JAKC5h?5} z(6lLLrp2a#*fF81Q;o(YqSeyJw2@2Dfz5MHs!-kPAVq>L++$e9(+#dcw7rNET@Q)k zx*5&Iae2H{(Y!&2%e6?8%eL7EQ%$$o$0w=8N^Zvhc8b?9L4oD*}yhZC~8>h?$9$G+p;LTmZD8Cl!3({ zk%I1!p{q)md=QkDY?9QiEWojwRLG~bzh<<~#Buoj>kR`7-!b`QH~s-|ntolfXeF8F zSz6Hi>sR4YHC+TXkQdi!IEDA9vv#c)H^abuK}C|noSoj9j@##N8HN1_HUuqv2Awqr zNxM8U1kiyhM;kXBEGJ77RQFIRtvXk~1=t*Sg4t;wxL$mW`~2eiG4NpG=xF;hh7dRK z7+#Y+m$?CQ`Pk6-7ON}LlZ&_l>7mW-S7c46V*A`>FMX&Zo-pbRO-Orum`ShU`hhJz zxrF;p5suY0QEQI)iKmb6XC0WgvV2t09@_2Cq-GOlw04HWR>RJvrJ$anSgG1*z3(j* zWmxTM*tAl;HZsp;C-Ss|mBS_MGD8~)(RN~}1IAT7xi`M}G>yA-y=^vDfc12=PvXcl zZyss=(=@{Qs-!~wZcT0ewOzpT=M%tjWDRIFdf%|NXx=I(n|-CZ(Gz!i0C;^Qu31FV zf7ihDng6wKKi_V`{!*QsBJ`yy(vi~-!5ksZ2)iM?rcJLK3{%!1ZE-UP<;(H(__$Ty z1rX-fPC2!w_4I^D$(D(nQ_NLDsBde&31ndqCTG#LLI2;!u?P>O8UoDievjh#@VI}z ze`s5N8008_$}m4lR)9w2x>zxL9>?}ZtD*-k);CuciXVz*z5yu;_oLL*a~Al2f3*Pi8%JTx2hbx@zh=nk|}SQg7s2&>VRwBJ~Fuu z25tx>%rR5O>Az7OWe?-0i?<%zQdbtq;^&eq;uSW10pxCRP7gk8NOhu+?{($_TH|wBrhQ@ zLm9vkQ<=yA&E%X1<7a!`nSUya2w1*hLRAW=D+4pb25%HHiDR!P_DpEp0#tHBxMF2oMqA(!H5P%rOWc|Ubfxew4h2c z*hW%GOfF588=h)ID|r_zmRpdIj->NWr(3VvKdz6BEE}(eCKe!q40y4^~apMdKzC_@d6JK%c2~;bf1E>VS-^vB5RF8Sv5!c$gRLzTweczZabwp)1OU$JX*hp1bh~ zK;$c;6k6J*5i@%-QIaG=fFh5jK3ZQck{C!yPK_pJw9MC_F{_cVvF~5u&{)zcH zW&W43$2s|{JQ{l}e@m44r@h}(Wgdlp*`xV8>_3S!zf<&kD#%~xkcT1tr_nzWLw<+< zp4{*k{OG}D{009r)!~1A^kWSFS9$zs1-PI7=6?bH$x;~U-yGEMFZ#PDd$jY5r4N6H z{g+?+UC-~ksK4}h6aM?2>URylp924-;pQPn;U8zh|Nq}!j`S1yM|kp|Bjoo01Me|Gu61OTh-wPrHAo?SH-WV-m)%^4P=nzk2u){AWtWKi~ZKq>M-GuU/hit_test', views.Dataset.HitTest.as_view()), path('dataset//document', views.Document.as_view(), name='document'), path('dataset/document/template/export', views.Template.as_view()), + path('dataset/document/table_template/export', views.TableTemplate.as_view()), path('dataset//document/web', views.WebDocument.as_view()), path('dataset//document/qa', views.QaDocument.as_view()), + path('dataset//document/table', views.TableDocument.as_view()), path('dataset//document/_bach', views.Document.Batch.as_view()), path('dataset//document/batch_hit_handling', views.Document.BatchEditHitHandling.as_view()), path('dataset//document//', views.Document.Page.as_view()), diff --git a/apps/dataset/views/document.py b/apps/dataset/views/document.py index f522d01ce..1988ca75a 100644 --- a/apps/dataset/views/document.py +++ b/apps/dataset/views/document.py @@ -33,6 +33,17 @@ class Template(APIView): def get(self, request: Request): return DocumentSerializers.Export(data={'type': request.query_params.get('type')}).export(with_valid=True) +class TableTemplate(APIView): + authentication_classes = [TokenAuth] + + @action(methods=['GET'], detail=False) + @swagger_auto_schema(operation_summary="获取表格模版", + operation_id="获取表格模版", + manual_parameters=DocumentSerializers.Export.get_request_params_api(), + tags=["知识库/文档"]) + def get(self, request: Request): + return DocumentSerializers.Export(data={'type': request.query_params.get('type')}).table_export(with_valid=True) + class WebDocument(APIView): authentication_classes = [TokenAuth] @@ -71,6 +82,24 @@ class QaDocument(APIView): {'file_list': request.FILES.getlist('file')}, with_valid=True)) +class TableDocument(APIView): + authentication_classes = [TokenAuth] + parser_classes = [MultiPartParser] + + @action(methods=['POST'], detail=False) + @swagger_auto_schema(operation_summary="导入表格并创建文档", + operation_id="导入表格并创建文档", + manual_parameters=DocumentWebInstanceSerializer.get_request_params_api(), + responses=result.get_api_response(DocumentSerializers.Create.get_response_body_api()), + tags=["知识库/文档"]) + @has_permissions( + lambda r, k: Permission(group=Group.DATASET, operate=Operate.MANAGE, + dynamic_tag=k.get('dataset_id'))) + def post(self, request: Request, dataset_id: str): + return result.success( + DocumentSerializers.Create(data={'dataset_id': dataset_id}).save_table( + {'file_list': request.FILES.getlist('file')}, + with_valid=True)) class Document(APIView): authentication_classes = [TokenAuth] diff --git a/ui/src/api/document.ts b/ui/src/api/document.ts index 5bf294100..0653f2d40 100644 --- a/ui/src/api/document.ts +++ b/ui/src/api/document.ts @@ -211,6 +211,19 @@ const postQADocument: ( return post(`${prefix}/${dataset_id}/document/qa`, data, undefined, loading) } +/** + * 导入表格 + * @param 参数 + * file + */ +const postTableDocument: ( + dataset_id: string, + data: any, + loading?: Ref +) => Promise> = (dataset_id, data, loading) => { + return post(`${prefix}/${dataset_id}/document/table`, data, undefined, loading) +} + /** * 批量迁移文档 * @param 参数 dataset_id,target_dataset_id, @@ -256,6 +269,18 @@ const exportQATemplate: (fileName: string, type: string, loading?: Ref) return exportExcel(fileName, `${prefix}/document/template/export`, { type }, loading) } +/** + * 获得table模版 + * @param 参数 fileName,type, + */ +const exportTableTemplate: (fileName: string, type: string, loading?: Ref) => void = ( + fileName, + type, + loading +) => { + return exportExcel(fileName, `${prefix}/document/table_template/export`, { type }, loading) +} + /** * 导出文档 * @param document_name 文档名称 @@ -295,6 +320,8 @@ export default { putMigrateMulDocument, batchEditHitHandling, exportQATemplate, + exportTableTemplate, postQADocument, + postTableDocument, exportDocument } diff --git a/ui/src/utils/utils.ts b/ui/src/utils/utils.ts index 9b30135fb..b2d77d834 100644 --- a/ui/src/utils/utils.ts +++ b/ui/src/utils/utils.ts @@ -39,6 +39,7 @@ export function fileType(name: string) { */ const typeList: any = { txt: ['txt', 'pdf', 'docx', 'csv', 'md', 'html', 'PDF'], + table: ['xlsx', 'xls', 'csv'], QA: ['xlsx', 'csv', 'xls'] } diff --git a/ui/src/views/dataset/UploadDocumentDataset.vue b/ui/src/views/dataset/UploadDocumentDataset.vue index 370451a54..c434ea2fc 100644 --- a/ui/src/views/dataset/UploadDocumentDataset.vue +++ b/ui/src/views/dataset/UploadDocumentDataset.vue @@ -78,6 +78,21 @@ async function next() { router.push({ path: `/dataset/${id}/document` }) }) } + } else if (documentsType.value === 'table') { + let fd = new FormData() + documentsFiles.value.forEach((item: any) => { + if (item?.raw) { + fd.append('file', item?.raw) + } + }) + if (id) { + // table文档上传 + documentApi.postTableDocument(id as string, fd, loading).then((res) => { + MsgSuccess('提交成功') + clearStore() + router.push({ path: `/dataset/${id}/document` }) + }) + } } else { if (active.value++ > 2) active.value = 0 } diff --git a/ui/src/views/dataset/component/UploadComponent.vue b/ui/src/views/dataset/component/UploadComponent.vue index d51aff263..305ae1ca4 100644 --- a/ui/src/views/dataset/component/UploadComponent.vue +++ b/ui/src/views/dataset/component/UploadComponent.vue @@ -10,6 +10,7 @@ 文本文件 + 表格 QA 问答对 @@ -48,6 +49,42 @@ 下载 CSV 模板 + + + +
+

+ 拖拽文件至此上传或 + 选择文件 + 选择文件夹 +

+
+

当前支持 EXCEL和CSV 格式文件。

+

第一行必须是列标题,且列标题必须是有意义的术语,表中每条记录将作为一个分段。

+

每次最多上传50个文档,每个文档最大不能超过100MB。

+
+
+
+ + 下载 Excel 模板 + + + 下载 CSV 模板 +

- 支持格式:TXT、Markdown、PDF、DOCX、HTML 每次最多上传50个文件,每个文件不超过 100MB + 支持格式:TXT、Markdown、PDF、DOCX、HTML、Excel、CSV 每次最多上传50个文件,每个文件不超过 100MB

若使用【高级分段】建议上传前规范文件的分段标识

@@ -133,6 +170,10 @@ function downloadTemplate(type: string) { documentApi.exportQATemplate(`${type}模版.${type == 'csv' ? type : 'xlsx'}`, type) } +function downloadTableTemplate(type: string) { + documentApi.exportTableTemplate(`${type}模版.${type == 'csv' ? type : 'xlsx'}`, type) +} + function radioChange() { form.value.fileList = [] }