From c1d3a46dc7dbf6122e653de33b84079c4f11de18 Mon Sep 17 00:00:00 2001 From: archer <545436317@qq.com> Date: Mon, 10 Apr 2023 19:47:03 +0800 Subject: [PATCH] =?UTF-8?q?perf:=20csv=E6=96=87=E4=BB=B6=E9=80=89=E6=8B=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- public/docs/csvSelect.md | 5 + src/pages/api/model/data/exportModelData.ts | 14 +- .../model/detail/components/ModelDataCard.tsx | 52 +++---- ...SelectJsonModal.tsx => SelectCsvModal.tsx} | 63 ++++---- .../detail/components/SelectFileModal.tsx | 2 +- src/utils/file.ts | 136 ++++++++++++++++++ src/utils/tools.ts | 98 ++----------- 7 files changed, 222 insertions(+), 148 deletions(-) create mode 100644 public/docs/csvSelect.md rename src/pages/model/detail/components/{SelectJsonModal.tsx => SelectCsvModal.tsx} (68%) create mode 100644 src/utils/file.ts diff --git a/public/docs/csvSelect.md b/public/docs/csvSelect.md new file mode 100644 index 000000000..c82e5160e --- /dev/null +++ b/public/docs/csvSelect.md @@ -0,0 +1,5 @@ +接受一个csv文件,表格头包含 question 和 answer。question 代表问题,answer 代表答案。 +| question | answer | +| --- | --- | +| 什么是 laf | laf 是一个云函数开发平台…… | +| 什么是 sealos | Sealos 是以 kubernetes 为内核的云操作系统发行版,可以…… | diff --git a/src/pages/api/model/data/exportModelData.ts b/src/pages/api/model/data/exportModelData.ts index 68145e5d9..3b511eb6e 100644 --- a/src/pages/api/model/data/exportModelData.ts +++ b/src/pages/api/model/data/exportModelData.ts @@ -4,6 +4,7 @@ import { connectToDatabase } from '@/service/mongo'; import { authToken } from '@/service/utils/tools'; import { connectRedis } from '@/service/redis'; import { VecModelDataIdx } from '@/constants/redis'; +import { clearStrLineBreak } from '@/utils/tools'; export default async function handler(req: NextApiRequest, res: NextApiResponse) { try { @@ -40,13 +41,16 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< } ); - const data = searchRes.documents.map((item: any) => ({ - prompt: item.value.q, - completion: item.value.text - })); + let str = `question,answer\n`; + + searchRes.documents.forEach((item: any) => { + if (item.value.q && item.value.text) { + str += `"${clearStrLineBreak(item.value.q)}","${clearStrLineBreak(item.value.text)}"\n`; + } + }); jsonRes(res, { - data: JSON.stringify(data) + data: str.slice(0, str.length - 1) }); } catch (err) { jsonRes(res, { diff --git a/src/pages/model/detail/components/ModelDataCard.tsx b/src/pages/model/detail/components/ModelDataCard.tsx index d482d128b..2abe88a6f 100644 --- a/src/pages/model/detail/components/ModelDataCard.tsx +++ b/src/pages/model/detail/components/ModelDataCard.tsx @@ -28,8 +28,8 @@ import { getExportDataList } from '@/api/model'; import { DeleteIcon, RepeatIcon, EditIcon } from '@chakra-ui/icons'; -import { useToast } from '@/hooks/useToast'; import { useLoading } from '@/hooks/useLoading'; +import { fileDownload } from '@/utils/file'; import dynamic from 'next/dynamic'; import { useMutation, useQuery } from '@tanstack/react-query'; import type { FormData as InputDataType } from './InputDataModal'; @@ -37,10 +37,10 @@ import type { FormData as InputDataType } from './InputDataModal'; const InputModel = dynamic(() => import('./InputDataModal')); const SelectFileModel = dynamic(() => import('./SelectFileModal')); const SelectUrlModel = dynamic(() => import('./SelectUrlModal')); -const SelectJsonModel = dynamic(() => import('./SelectJsonModal')); +const SelectCsvModal = dynamic(() => import('./SelectCsvModal')); const ModelDataCard = ({ model }: { model: ModelSchema }) => { - const { Loading } = useLoading(); + const { Loading, setIsLoading } = useLoading(); const { data: modelDataList, @@ -70,9 +70,9 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => { onClose: onCloseSelectUrlModal } = useDisclosure(); const { - isOpen: isOpenSelectJsonModal, - onOpen: onOpenSelectJsonModal, - onClose: onCloseSelectJsonModal + isOpen: isOpenSelectCsvModal, + onOpen: onOpenSelectCsvModal, + onClose: onCloseSelectCsvModal } = useDisclosure(); const { data: splitDataLen, refetch } = useQuery(['getModelSplitDataList'], () => @@ -91,18 +91,18 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => { const { mutate: onclickExport, isLoading: isLoadingExport } = useMutation({ mutationFn: () => getExportDataList(model._id), onSuccess(res) { - // 导出为文件 - const blob = new Blob([res], { type: 'application/json;charset=utf-8' }); - - // 创建下载链接 - const downloadLink = document.createElement('a'); - downloadLink.href = window.URL.createObjectURL(blob); - downloadLink.download = `data.json`; - - // 添加链接到页面并触发下载 - document.body.appendChild(downloadLink); - downloadLink.click(); - document.body.removeChild(downloadLink); + try { + console.log(res); + setIsLoading(true); + fileDownload({ + text: res, + type: 'text/csv', + filename: 'data.csv' + }); + } catch (error) { + error; + } + setIsLoading(false); } }); @@ -110,7 +110,7 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => { <> - 模型数据: {total}组{' '} + 模型数据: {total}组 (测试版本) @@ -128,7 +128,7 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => { mr={2} size={'sm'} isLoading={isLoadingExport} - title={'v2.3之前版本的数据无法导出'} + title={'换行数据导出时,会进行格式转换'} onClick={() => onclickExport()} > 导出 @@ -148,9 +148,9 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => { > 手动输入 - 文件QA拆分 - 网站内容QA拆分 - JSON导入 + 文本内容 QA 拆分 + 网站内容 QA 拆分 + csv 问答对导入 @@ -248,10 +248,10 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => { onSuccess={refetchData} /> )} - {isOpenSelectJsonModal && ( - )} diff --git a/src/pages/model/detail/components/SelectJsonModal.tsx b/src/pages/model/detail/components/SelectCsvModal.tsx similarity index 68% rename from src/pages/model/detail/components/SelectJsonModal.tsx rename to src/pages/model/detail/components/SelectCsvModal.tsx index 3088c242d..7473d3035 100644 --- a/src/pages/model/detail/components/SelectJsonModal.tsx +++ b/src/pages/model/detail/components/SelectCsvModal.tsx @@ -13,10 +13,14 @@ import { import { useToast } from '@/hooks/useToast'; import { useSelectFile } from '@/hooks/useSelectFile'; import { useConfirm } from '@/hooks/useConfirm'; -import { readTxtContent } from '@/utils/tools'; +import { readCsvContent } from '@/utils/file'; import { useMutation } from '@tanstack/react-query'; import { postModelDataJsonData } from '@/api/model'; import Markdown from '@/components/Markdown'; +import { useMarkdown } from '@/hooks/useMarkdown'; +import { fileDownload } from '@/utils/file'; + +const csvTemplate = `question,answer\n"什么是 laf","laf 是一个云函数开发平台……"\n"什么是 sealos","Sealos 是以 kubernetes 为内核的云操作系统发行版,可以……"`; const SelectJsonModal = ({ onClose, @@ -29,7 +33,7 @@ const SelectJsonModal = ({ }) => { const [selecting, setSelecting] = useState(false); const { toast } = useToast(); - const { File, onOpen } = useSelectFile({ fileType: '.json', multiple: true }); + const { File, onOpen } = useSelectFile({ fileType: '.csv', multiple: true }); const [fileData, setFileData] = useState< { prompt: string; completion: string; vector?: number[] }[] >([]); @@ -41,21 +45,12 @@ const SelectJsonModal = ({ async (e: File[]) => { setSelecting(true); try { - const jsonData = ( - await Promise.all(e.map((item) => readTxtContent(item).then((text) => JSON.parse(text)))) - ).flat(); - // check 文件类型 - for (let i = 0; i < jsonData.length; i++) { - if (!jsonData[i]?.prompt || !jsonData[i]?.completion) { - throw new Error('缺少 prompt 或 completion'); - } - } - - setFileData(jsonData); + const data = await Promise.all(e.map((item) => readCsvContent(item))); + console.log(data); } catch (error: any) { console.log(error); toast({ - title: error?.message || 'JSON文件格式有误', + title: error?.message || 'csv 文件格式有误', status: 'error' }); } @@ -84,34 +79,36 @@ const SelectJsonModal = ({ } }); + const { data: intro } = useMarkdown({ url: '/csvSelect.md' }); + return ( - JSON数据集 + csv 问答对导入 - - + + + + fileDownload({ + text: csvTemplate, + type: 'text/csv', + filename: 'template.csv' + }) + } + > + 点击下载csv模板 + 一共 {fileData.length} 组数据 diff --git a/src/pages/model/detail/components/SelectFileModal.tsx b/src/pages/model/detail/components/SelectFileModal.tsx index c21face6a..a68885c87 100644 --- a/src/pages/model/detail/components/SelectFileModal.tsx +++ b/src/pages/model/detail/components/SelectFileModal.tsx @@ -16,7 +16,7 @@ import { useToast } from '@/hooks/useToast'; import { useSelectFile } from '@/hooks/useSelectFile'; import { encode } from 'gpt-token-utils'; import { useConfirm } from '@/hooks/useConfirm'; -import { readTxtContent, readPdfContent, readDocContent } from '@/utils/tools'; +import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file'; import { useMutation } from '@tanstack/react-query'; import { postModelDataSplitData } from '@/api/model'; import { formatPrice } from '@/utils/user'; diff --git a/src/utils/file.ts b/src/utils/file.ts new file mode 100644 index 000000000..427486102 --- /dev/null +++ b/src/utils/file.ts @@ -0,0 +1,136 @@ +import mammoth from 'mammoth'; + +/** + * 读取 txt 文件内容 + */ +export const readTxtContent = (file: File) => { + return new Promise((resolve: (_: string) => void, reject) => { + try { + const reader = new FileReader(); + reader.onload = () => { + resolve(reader.result as string); + }; + reader.onerror = (err) => { + console.log('error txt read:', err); + reject('读取 txt 文件失败'); + }; + reader.readAsText(file); + } catch (error) { + reject('浏览器不支持文件内容读取'); + } + }); +}; + +/** + * 读取 pdf 内容 + */ +export const readPdfContent = (file: File) => + new Promise((resolve, reject) => { + try { + const pdfjsLib = window['pdfjs-dist/build/pdf']; + pdfjsLib.workerSrc = '/js/pdf.worker.js'; + + const readPDFPage = async (doc: any, pageNo: number) => { + const page = await doc.getPage(pageNo); + const tokenizedText = await page.getTextContent(); + const pageText = tokenizedText.items.map((token: any) => token.str).join(' '); + return pageText; + }; + + let reader = new FileReader(); + reader.readAsArrayBuffer(file); + reader.onload = async (event) => { + if (!event?.target?.result) return reject('解析 PDF 失败'); + try { + const doc = await pdfjsLib.getDocument(event.target.result).promise; + const pageTextPromises = []; + for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) { + pageTextPromises.push(readPDFPage(doc, pageNo)); + } + const pageTexts = await Promise.all(pageTextPromises); + resolve(pageTexts.join('\n')); + } catch (err) { + console.log(err, 'pdfjs error'); + reject('解析 PDF 失败'); + } + }; + reader.onerror = (err) => { + console.log(err, 'reader error'); + reject('解析 PDF 失败'); + }; + } catch (error) { + reject('浏览器不支持文件内容读取'); + } + }); + +/** + * 读取doc + */ +export const readDocContent = (file: File) => + new Promise((resolve, reject) => { + try { + const reader = new FileReader(); + reader.readAsArrayBuffer(file); + reader.onload = async ({ target }) => { + if (!target?.result) return reject('读取 doc 文件失败'); + try { + const res = await mammoth.extractRawText({ + arrayBuffer: target.result as ArrayBuffer + }); + resolve(res?.value); + } catch (error) { + reject('读取 doc 文件失败, 请转换成 PDF'); + } + }; + reader.onerror = (err) => { + console.log('error doc read:', err); + + reject('读取 doc 文件失败'); + }; + } catch (error) { + reject('浏览器不支持文件内容读取'); + } + }); + +/** + * 读取csv + */ +export const readCsvContent = async (file: File) => { + try { + const textArr = (await readTxtContent(file)).split('\n'); + const header = textArr.shift()?.split(','); + if (!header) { + throw new Error('csv 格式错误'); + } + // 拆分每一行数据 + const data = []; + } catch (error) { + return Promise.reject('解析 csv 文件失败'); + } +}; + +/** + * file download + */ +export const fileDownload = ({ + text, + type, + filename +}: { + text: string; + type: string; + filename: string; +}) => { + // 导出为文件 + const blob = new Blob([text], { type: `${type};charset=utf-8` }); + + // 创建下载链接 + const downloadLink = document.createElement('a'); + downloadLink.href = window.URL.createObjectURL(blob); + downloadLink.download = filename; + + // 添加链接到页面并触发下载 + document.body.appendChild(downloadLink); + downloadLink.click(); + document.body.removeChild(downloadLink); +}; diff --git a/src/utils/tools.ts b/src/utils/tools.ts index 82fae3b6e..05589cd49 100644 --- a/src/utils/tools.ts +++ b/src/utils/tools.ts @@ -1,6 +1,5 @@ import crypto from 'crypto'; import { useToast } from '@/hooks/useToast'; -import mammoth from 'mammoth'; /** * copy text data @@ -34,11 +33,17 @@ export const useCopyData = () => { }; }; +/** + * 密码加密 + */ export const createHashPassword = (text: string) => { const hash = crypto.createHash('sha256').update(text).digest('hex'); return hash; }; +/** + * 对象转成 query 字符串 + */ export const Obj2Query = (obj: Record) => { const queryParams = new URLSearchParams(); for (const key in obj) { @@ -47,86 +52,6 @@ export const Obj2Query = (obj: Record) => { return queryParams.toString(); }; -/** - * 读取 txt 文件内容 - */ -export const readTxtContent = (file: File) => { - return new Promise((resolve: (_: string) => void, reject) => { - const reader = new FileReader(); - reader.onload = () => { - resolve(reader.result as string); - }; - reader.onerror = (err) => { - console.log('error txt read:', err); - reject('读取 txt 文件失败'); - }; - reader.readAsText(file); - }); -}; - -/** - * 读取 pdf 内容 - */ -export const readPdfContent = (file: File) => - new Promise((resolve, reject) => { - const pdfjsLib = window['pdfjs-dist/build/pdf']; - pdfjsLib.workerSrc = '/js/pdf.worker.js'; - - const readPDFPage = async (doc: any, pageNo: number) => { - const page = await doc.getPage(pageNo); - const tokenizedText = await page.getTextContent(); - const pageText = tokenizedText.items.map((token: any) => token.str).join(' '); - return pageText; - }; - - let reader = new FileReader(); - reader.readAsArrayBuffer(file); - reader.onload = async (event) => { - if (!event?.target?.result) return reject('解析 PDF 失败'); - try { - const doc = await pdfjsLib.getDocument(event.target.result).promise; - const pageTextPromises = []; - for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) { - pageTextPromises.push(readPDFPage(doc, pageNo)); - } - const pageTexts = await Promise.all(pageTextPromises); - resolve(pageTexts.join('\n')); - } catch (err) { - console.log(err, 'pdfjs error'); - reject('解析 PDF 失败'); - } - }; - reader.onerror = (err) => { - console.log(err, 'reader error'); - reject('解析 PDF 失败'); - }; - }); - -/** - * 读取doc - */ -export const readDocContent = (file: File) => - new Promise((resolve, reject) => { - const reader = new FileReader(); - reader.readAsArrayBuffer(file); - reader.onload = async ({ target }) => { - if (!target?.result) return reject('读取 doc 文件失败'); - try { - const res = await mammoth.extractRawText({ - arrayBuffer: target.result as ArrayBuffer - }); - resolve(res?.value); - } catch (error) { - reject('读取 doc 文件失败, 请转换成 PDF'); - } - }; - reader.onerror = (err) => { - console.log('error doc read:', err); - - reject('读取 doc 文件失败'); - }; - }); - /** * 向量转成 float32 buffer 格式 */ @@ -138,11 +63,18 @@ export const vectorToBuffer = (vector: number[]) => { return buffer; }; -export function formatVector(vector: number[]) { +export const formatVector = (vector: number[]) => { let formattedVector = vector.slice(0, 1536); // 截取前1536个元素 if (vector.length > 1536) { formattedVector = formattedVector.concat(Array(1536 - formattedVector.length).fill(0)); // 在后面添加0 } return formattedVector; -} +}; + +/** + * 字符串清理,替换换行符号 + */ +export const clearStrLineBreak = (str: string) => { + return str.replace(/\n/g, '\n').replace(/\n/g, '\\n').trim(); +};