mirror of
https://github.com/labring/FastGPT.git
synced 2025-12-25 20:02:47 +00:00
* perf: system toolset & mcp (#5200) * feat: support system toolset * fix: type * fix: system tool config * chore: mcptool config migrate * refactor: mcp toolset * fix: fe type error * fix: type error * fix: show version * chore: support extract tool's secretInputConfig out of inputs * chore: compatible with old version mcp * chore: adjust * deps: update dependency @fastgpt-skd/plugin * fix: version * fix: some bug (#5316) * chore: compatible with old version mcp * fix: version * fix: compatible bug * fix: mcp object params * fix: type error * chore: update test cases * chore: remove log * fix: toolset node name * optimize app logs sort (#5310) * log keys config modal * multiple select * api * fontsize * code * chatid * fix build * fix * fix component * change name * log keys config * fix * delete unused * fix * perf: log code * perf: send auth code modal enter press * fix log (#5328) * perf: mcp toolset comment * perf: log ui * remove log (#5347) * doc * fix: action * remove log * fix: Table Optimization (#5319) * feat: table test: 1 * feat: table test: 2 * feat: table test: 3 * feat: table test: 4 * feat: table test : 5 把maxSize改回chunkSize * feat: table test : 6 都删了,只看maxSize * feat: table test : 7 恢复初始,接下来删除标签功能 * feat: table test : 8 删除标签功能 * feat: table test : 9 删除标签功能成功 * feat: table test : 10 继续调试,修改trainingStates * feat: table test : 11 修改第一步 * feat: table test : 12 修改第二步 * feat: table test : 13 修改了HtmlTable2Md * feat: table test : 14 修改表头分块规则 * feat: table test : 15 前面表格分的太细了 * feat: table test : 16 改着改着表头又不加了 * feat: table test : 17 用CUSTOM_SPLIT_SIGN不行,重新改 * feat: table test : 18 表头仍然还会多加,但现在分块搞的合理了终于 * feat: table test : 19 还是需要搞好表头问题,先保存一下调试情况 * feat: table test : 20 调试结束,看一下replace有没有问题,没问题就pr * feat: table test : 21 先把注释删了 * feat: table test : 21 注释replace都改了,下面切main分支看看情况 * feat: table test : 22 修改旧文件 * feat: table test : 23 修改测试文件 * feat: table test : 24 xlsx表格处理 * feat: table test : 25 刚才没保存先com了 * feat: table test : 26 fix * feat: table test : 27 
先com一版调试 * feat: table test : 28 试试放format2csv里 * feat: table test : 29 xlsx解决 * feat: table test : 30 tablesplit解决 * feat: table test : 31 * feat: table test : 32 * perf: table split * perf: mcp old version compatibility (#5342) * fix: system-tool secret inputs * fix: rewrite runtime node i18n for system tool * perf: mcp old version compatibility * fix: splitPluginId * fix: old mcp toolId * fix: filter secret key * feat: support system toolset activation * chore: remove log * perf: mcp update * perf: rewrite toolset * fix:delete variable id (#5335) * perf: variable update * fix: multiple select ui * perf: model config move to plugin * fix: var conflit * perf: variable checker * Avoid empty number * update doc time * fix: test * fix: mcp object * update count app * update count app --------- Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com> Co-authored-by: heheer <heheer@sealos.io> Co-authored-by: heheer <zhiyu44@qq.com> Co-authored-by: colnii <1286949794@qq.com> Co-authored-by: dreamer6680 <1468683855@qq.com>
235 lines
7.2 KiB
TypeScript
235 lines
7.2 KiB
TypeScript
import { batchRun, delay } from '@fastgpt/global/common/system/utils';
|
|
import { addLog } from '../../common/system/log';
|
|
import { htmlTable2Md } from '@fastgpt/global/common/string/markdown';
|
|
import axios, { type Method } from 'axios';
|
|
import { getNanoid } from '@fastgpt/global/common/string/tools';
|
|
import { getErrText } from '@fastgpt/global/common/error/utils';
|
|
import { type ImageType } from '../../worker/readFile/type';
|
|
import { getImageBase64 } from '../../common/file/image/utils';
|
|
|
|
// Standard response envelope returned by every Doc2x API endpoint.
type ApiResponseDataType<T = any> = {
  // Status code string; 'ok' / 'success' indicate a successful call.
  code: string;
  // Optional human-readable message, used in error reporting when code is not ok.
  msg?: string;
  // Endpoint-specific payload.
  data: T;
};
|
|
|
|
export const useDoc2xServer = ({ apiKey }: { apiKey: string }) => {
|
|
// Init request
|
|
const instance = axios.create({
|
|
baseURL: 'https://v2.doc2x.noedgeai.com/api',
|
|
timeout: 60000,
|
|
headers: {
|
|
Authorization: `Bearer ${apiKey}`
|
|
}
|
|
});
|
|
// Response check
|
|
const checkRes = (data: ApiResponseDataType) => {
|
|
if (data === undefined) {
|
|
addLog.info('[Doc2x] Server data is empty');
|
|
return Promise.reject('服务器异常');
|
|
}
|
|
return data;
|
|
};
|
|
const responseError = (err: any) => {
|
|
if (!err) {
|
|
return Promise.reject({ message: '[Doc2x] Unknown error' });
|
|
}
|
|
if (typeof err === 'string') {
|
|
return Promise.reject({ message: `[Doc2x] ${err}` });
|
|
}
|
|
if (typeof err.message === 'string') {
|
|
return Promise.reject({ message: `[Doc2x] ${err.message}` });
|
|
}
|
|
if (typeof err.data === 'string') {
|
|
return Promise.reject({ message: `[Doc2x] ${err.data}` });
|
|
}
|
|
if (err?.response?.data) {
|
|
return Promise.reject({ message: `[Doc2x] ${getErrText(err?.response?.data)}` });
|
|
}
|
|
|
|
addLog.error('[Doc2x] Unknown error', err);
|
|
return Promise.reject({ message: `[Doc2x] ${getErrText(err)}` });
|
|
};
|
|
const request = <T>(url: string, data: any, method: Method): Promise<ApiResponseDataType<T>> => {
|
|
// Remove empty data
|
|
for (const key in data) {
|
|
if (data[key] === undefined) {
|
|
delete data[key];
|
|
}
|
|
}
|
|
|
|
return instance
|
|
.request({
|
|
url,
|
|
method,
|
|
data: ['POST', 'PUT'].includes(method) ? data : undefined,
|
|
params: !['POST', 'PUT'].includes(method) ? data : undefined
|
|
})
|
|
.then((res) => checkRes(res.data))
|
|
.catch((err) => responseError(err));
|
|
};
|
|
|
|
const parsePDF = async (fileBuffer: Buffer) => {
|
|
addLog.debug('[Doc2x] PDF parse start');
|
|
const startTime = Date.now();
|
|
|
|
// 1. Get pre-upload URL first
|
|
const {
|
|
code,
|
|
msg,
|
|
data: preupload_data
|
|
} = await request<{ uid: string; url: string }>('/v2/parse/preupload', null, 'POST');
|
|
if (!['ok', 'success'].includes(code)) {
|
|
return Promise.reject(`[Doc2x] Failed to get pre-upload URL: ${msg}`);
|
|
}
|
|
const upload_url = preupload_data.url;
|
|
const uid = preupload_data.uid;
|
|
|
|
// 2. Upload file to pre-signed URL with binary stream
|
|
const blob = new Blob([fileBuffer], { type: 'application/pdf' });
|
|
const response = await axios
|
|
.put(upload_url, blob, {
|
|
headers: {
|
|
'Content-Type': 'application/pdf'
|
|
}
|
|
})
|
|
.catch((error) => {
|
|
return Promise.reject(`[Doc2x] Failed to upload file: ${getErrText(error)}`);
|
|
});
|
|
if (response.status !== 200) {
|
|
return Promise.reject(
|
|
`[Doc2x] Upload failed with status ${response.status}: ${response.statusText}`
|
|
);
|
|
}
|
|
addLog.debug(`[Doc2x] Uploaded file success, uid: ${uid}`);
|
|
|
|
await delay(5000);
|
|
|
|
// 3. Get the result by uid
|
|
const checkResult = async () => {
|
|
// 10 minutes
|
|
let retry = 120;
|
|
|
|
while (retry > 0) {
|
|
try {
|
|
const {
|
|
code,
|
|
data: result_data,
|
|
msg
|
|
} = await request<{
|
|
progress: number;
|
|
status: 'processing' | 'failed' | 'success';
|
|
result: {
|
|
pages: {
|
|
md: string;
|
|
}[];
|
|
};
|
|
}>(`/v2/parse/status?uid=${uid}`, null, 'GET');
|
|
|
|
// Error
|
|
if (!['ok', 'success'].includes(code)) {
|
|
return Promise.reject(`[Doc2x] Failed to get result (uid: ${uid}): ${msg}`);
|
|
}
|
|
|
|
// Process
|
|
if (['ready', 'processing'].includes(result_data.status)) {
|
|
addLog.debug(`[Doc2x] Waiting for the result, uid: ${uid}`);
|
|
await delay(5000);
|
|
}
|
|
|
|
// Finifsh
|
|
if (result_data.status === 'success') {
|
|
const cleanedText = result_data.result.pages
|
|
.map((page) => page.md)
|
|
.join('')
|
|
.replace(/\\[\(\)]/g, '$')
|
|
.replace(/\\[\[\]]/g, '$$')
|
|
.replace(/<img\s+src="([^"]+)"(?:\s*\?[^>]*)?(?:\s*\/>|>)/g, '')
|
|
.replace(/<!-- Media -->/g, '')
|
|
.replace(/<!-- Footnote -->/g, '')
|
|
.replace(/<!-- Meanless:[\s\S]*?-->/g, '')
|
|
.replace(/<!-- figureText:[\s\S]*?-->/g, '')
|
|
.replace(/\$(.+?)\s+\\tag\{(.+?)\}\$/g, '$$$1 \\qquad \\qquad ($2)$$')
|
|
.replace(/\\text\{([^}]*?)(\b\w+)_(\w+\b)([^}]*?)\}/g, '\\text{$1$2\\_$3$4}');
|
|
const remainingTags = cleanedText.match(/<!--[\s\S]*?-->/g);
|
|
if (remainingTags) {
|
|
addLog.warn(`[Doc2x] Remaining dirty tags after cleaning:`, {
|
|
count: remainingTags.length,
|
|
tags: remainingTags.slice(0, 3)
|
|
});
|
|
}
|
|
return {
|
|
text: cleanedText,
|
|
pages: result_data.result.pages.length
|
|
};
|
|
}
|
|
} catch (error) {
|
|
// Just network error
|
|
addLog.warn(`[Doc2x] Get result error`, { error });
|
|
await delay(500);
|
|
}
|
|
|
|
retry--;
|
|
}
|
|
return Promise.reject(`[Doc2x] Failed to get result (uid: ${uid}): Process timeout`);
|
|
};
|
|
|
|
const { text, pages } = await checkResult();
|
|
|
|
//  => 
|
|
const parseTextImage = async (text: string) => {
|
|
// Extract image links and convert to base64
|
|
const imageList: { id: string; url: string }[] = [];
|
|
let processedText = text.replace(/!\[.*?\]\((http[^)]+)\)/g, (match, url) => {
|
|
const id = `IMAGE_${getNanoid()}_IMAGE`;
|
|
imageList.push({
|
|
id,
|
|
url
|
|
});
|
|
return ``;
|
|
});
|
|
|
|
// Get base64 from image url
|
|
let resultImageList: ImageType[] = [];
|
|
await batchRun(
|
|
imageList,
|
|
async (item) => {
|
|
try {
|
|
const { base64, mime } = await getImageBase64(item.url);
|
|
resultImageList.push({
|
|
uuid: item.id,
|
|
mime,
|
|
base64
|
|
});
|
|
} catch (error) {
|
|
processedText = processedText.replace(item.id, item.url);
|
|
addLog.warn(`[Doc2x] Failed to get image from ${item.url}: ${getErrText(error)}`);
|
|
}
|
|
},
|
|
5
|
|
);
|
|
|
|
return {
|
|
text: processedText,
|
|
imageList: resultImageList
|
|
};
|
|
};
|
|
const { text: formatText, imageList } = await parseTextImage(htmlTable2Md(text));
|
|
|
|
addLog.debug(`[Doc2x] PDF parse finished`, {
|
|
time: `${Math.round((Date.now() - startTime) / 1000)}s`,
|
|
pages
|
|
});
|
|
|
|
return {
|
|
pages,
|
|
text: formatText,
|
|
imageList
|
|
};
|
|
};
|
|
|
|
return {
|
|
parsePDF
|
|
};
|
|
};
|