FastGPT/packages/service/thirdProvider/doc2x/index.ts
Archer e25d7efb5b
feature: V4.11.1 (#5350)
* perf: system toolset & mcp (#5200)

* feat: support system toolset

* fix: type

* fix: system tool config

* chore: mcptool config migrate

* refactor: mcp toolset

* fix: fe type error

* fix: type error

* fix: show version

* chore: support extract tool's secretInputConfig out of inputs

* chore: compatible with old version mcp

* chore: adjust

* deps: update dependency @fastgpt-sdk/plugin

* fix: version

* fix: some bug (#5316)

* chore: compatible with old version mcp

* fix: version

* fix: compatible bug

* fix: mcp object params

* fix: type error

* chore: update test cases

* chore: remove log

* fix: toolset node name

* optimize app logs sort (#5310)

* log keys config modal

* multiple select

* api

* fontsize

* code

* chatid

* fix build

* fix

* fix component

* change name

* log keys config

* fix

* delete unused

* fix

* perf: log code

* perf: send auth code modal enter press

* fix log (#5328)

* perf: mcp toolset comment

* perf: log ui

* remove log (#5347)

* doc

* fix: action

* remove log

* fix: Table Optimization (#5319)

* feat: table test: 1

* feat: table test: 2

* feat: table test: 3

* feat: table test: 4

* feat: table test : 5 把maxSize改回chunkSize

* feat: table test : 6 都删了,只看maxSize

* feat: table test : 7 恢复初始,接下来删除标签功能

* feat: table test : 8 删除标签功能

* feat: table test : 9 删除标签功能成功

* feat: table test : 10 继续调试,修改trainingStates

* feat: table test : 11 修改第一步

* feat: table test : 12 修改第二步

* feat: table test : 13 修改了HtmlTable2Md

* feat: table test : 14 修改表头分块规则

* feat: table test : 15 前面表格分的太细了

* feat: table test : 16 改着改着表头又不加了

* feat: table test : 17 用CUSTOM_SPLIT_SIGN不行,重新改

* feat: table test : 18 表头仍然还会多加,但现在分块搞的合理了终于

* feat: table test : 19 还是需要搞好表头问题,先保存一下调试情况

* feat: table test : 20 调试结束,看一下replace有没有问题,没问题就pr

* feat: table test : 21 先把注释删了

* feat: table test : 21 注释replace都改了,下面切main分支看看情况

* feat: table test : 22 修改旧文件

* feat: table test : 23 修改测试文件

* feat: table test : 24 xlsx表格处理

* feat: table test : 25 刚才没保存先com了

* feat: table test : 26 fix

* feat: table test : 27 先com一版调试

* feat: table test : 28 试试放format2csv里

* feat: table test : 29 xlsx解决

* feat: table test : 30 tablesplit解决

* feat: table test : 31

* feat: table test : 32

* perf: table split

* perf: mcp old version compatibility (#5342)

* fix: system-tool secret inputs

* fix: rewrite runtime node i18n for system tool

* perf: mcp old version compatibility

* fix: splitPluginId

* fix: old mcp toolId

* fix: filter secret key

* feat: support system toolset activation

* chore: remove log

* perf: mcp update

* perf: rewrite toolset

* fix:delete variable id (#5335)

* perf: variable update

* fix: multiple select ui

* perf: model config move to plugin

* fix: var conflict

* perf: variable checker

* Avoid empty number

* update doc time

* fix: test

* fix: mcp object

* update count app

* update count app

---------

Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: heheer <zhiyu44@qq.com>
Co-authored-by: colnii <1286949794@qq.com>
Co-authored-by: dreamer6680 <1468683855@qq.com>
2025-08-01 16:08:20 +08:00

235 lines
7.2 KiB
TypeScript

import { batchRun, delay } from '@fastgpt/global/common/system/utils';
import { addLog } from '../../common/system/log';
import { htmlTable2Md } from '@fastgpt/global/common/string/markdown';
import axios, { type Method } from 'axios';
import { getNanoid } from '@fastgpt/global/common/string/tools';
import { getErrText } from '@fastgpt/global/common/error/utils';
import { type ImageType } from '../../worker/readFile/type';
import { getImageBase64 } from '../../common/file/image/utils';
/**
 * Envelope of a Doc2x API response as consumed in this file.
 *
 * @typeParam T - Shape of the `data` payload for a given endpoint.
 */
interface ApiResponseDataType<T = any> {
  /** Result code returned by the server. */
  code: string;
  /** Optional human-readable message accompanying `code`. */
  msg?: string;
  /** Endpoint-specific payload. */
  data: T;
}
/**
 * Build a Doc2x client bound to a single API key.
 *
 * @param apiKey - Sent as `Authorization: Bearer <apiKey>` on every request made
 *   through the internal axios instance.
 * @returns An object exposing `parsePDF`.
 */
export const useDoc2xServer = ({ apiKey }: { apiKey: string }) => {
  // Wait between two status polls (also used as the initial wait after upload).
  const POLL_INTERVAL_MS = 5000;
  // Short back-off after a status request threw (e.g. a network error).
  const POLL_ERROR_DELAY_MS = 500;
  // 120 polls * 5s => roughly 10 minutes when the file stays in processing.
  const MAX_POLL_TIMES = 120;
  // Response codes treated as success.
  const SUCCESS_CODES = ['ok', 'success'];

  // Init request
  const instance = axios.create({
    baseURL: 'https://v2.doc2x.noedgeai.com/api',
    timeout: 60000,
    headers: {
      Authorization: `Bearer ${apiKey}`
    }
  });

  // Response check: an undefined body is turned into a rejection.
  const checkRes = (data: ApiResponseDataType) => {
    if (data === undefined) {
      addLog.info('[Doc2x] Server data is empty');
      return Promise.reject('服务器异常');
    }
    return data;
  };

  /**
   * Normalise any thrown value into a rejection of shape `{ message: '[Doc2x] ...' }`.
   *
   * NOTE(review): errors that carry a string `message` are handled before the
   * `err.response.data` branch, so the response body is only used when no string
   * `message`/`data` exists. Confirm whether the server-provided text should win.
   */
  const responseError = (err: any) => {
    if (!err) {
      return Promise.reject({ message: '[Doc2x] Unknown error' });
    }
    if (typeof err === 'string') {
      return Promise.reject({ message: `[Doc2x] ${err}` });
    }
    if (typeof err.message === 'string') {
      return Promise.reject({ message: `[Doc2x] ${err.message}` });
    }
    if (typeof err.data === 'string') {
      return Promise.reject({ message: `[Doc2x] ${err.data}` });
    }
    if (err?.response?.data) {
      return Promise.reject({ message: `[Doc2x] ${getErrText(err?.response?.data)}` });
    }
    addLog.error('[Doc2x] Unknown error', err);
    return Promise.reject({ message: `[Doc2x] ${getErrText(err)}` });
  };

  /**
   * Send a request through the shared axios instance.
   *
   * `data` is sent as the body for POST/PUT and as query params otherwise.
   * Keys whose value is `undefined` are dropped on a shallow copy, so the
   * caller's object is never mutated.
   */
  const request = <T>(url: string, data: any, method: Method): Promise<ApiResponseDataType<T>> => {
    const isPlainObject =
      data !== null && typeof data === 'object' && Object.getPrototypeOf(data) === Object.prototype;

    // Remove empty data (on a copy)
    const payload = isPlainObject
      ? Object.fromEntries(Object.entries(data).filter(([, value]) => value !== undefined))
      : data;

    // `Method` also allows lowercase verbs, so compare case-insensitively.
    const hasBody = ['POST', 'PUT'].includes(method.toUpperCase());

    return instance
      .request({
        url,
        method,
        data: hasBody ? payload : undefined,
        params: hasBody ? undefined : payload
      })
      .then((res) => checkRes(res.data))
      .catch((err) => responseError(err));
  };

  /**
   * Convert the raw markdown returned by Doc2x into the cleaned form used downstream:
   * LaTeX delimiters become `$` / `$$`, `<img>` tags become markdown images,
   * known marker comments are stripped and `\tag{}` / `\text{a_b}` are rewritten.
   */
  const cleanMarkdown = (markdown: string) =>
    markdown
      // \( and \) => $
      .replace(/\\[\(\)]/g, () => '$')
      // \[ and \] => $$. A replacer function is required here: inside a replacement
      // *string*, `$$` is the escape sequence for a single `$`.
      .replace(/\\[\[\]]/g, () => '$$')
      .replace(/<img\s+src="([^"]+)"(?:\s*\?[^>]*)?(?:\s*\/>|>)/g, '![img]($1)')
      .replace(/<!-- Media -->/g, '')
      .replace(/<!-- Footnote -->/g, '')
      .replace(/<!-- Meanless:[\s\S]*?-->/g, '')
      .replace(/<!-- figureText:[\s\S]*?-->/g, '')
      // In this replacement string each `$$` yields one literal `$` around the groups.
      .replace(/\$(.+?)\s+\\tag\{(.+?)\}\$/g, '$$$1 \\qquad \\qquad ($2)$$')
      // Escape an underscore between two words inside \text{...}
      .replace(/\\text\{([^}]*?)(\b\w+)_(\w+\b)([^}]*?)\}/g, '\\text{$1$2\\_$3$4}');

  /**
   * Parse a PDF with Doc2x.
   *
   * Steps: request a pre-upload URL, PUT the file to it, poll the parse status
   * until it succeeds, then clean the markdown and inline remote images.
   *
   * @param fileBuffer - Raw PDF bytes.
   * @returns `pages` (page count), `text` (markdown, tables converted by
   *   `htmlTable2Md`) and `imageList` (downloaded images; the text references
   *   them by their `uuid` placeholder).
   * @throws Rejects with a `[Doc2x] ...` string (or `{ message }` from the request
   *   layer) when any step fails, the parse is reported as failed, or polling times out.
   */
  const parsePDF = async (fileBuffer: Buffer) => {
    addLog.debug('[Doc2x] PDF parse start');
    const startTime = Date.now();

    // 1. Get pre-upload URL first
    const {
      code,
      msg,
      data: preupload_data
    } = await request<{ uid: string; url: string }>('/v2/parse/preupload', null, 'POST');
    if (!SUCCESS_CODES.includes(code)) {
      return Promise.reject(`[Doc2x] Failed to get pre-upload URL: ${msg}`);
    }
    // Guard against a "successful" response without the fields we need.
    if (!preupload_data?.url || !preupload_data?.uid) {
      return Promise.reject('[Doc2x] Failed to get pre-upload URL: empty response data');
    }
    const upload_url = preupload_data.url;
    const uid = preupload_data.uid;

    // 2. Upload file to pre-signed URL with binary stream
    const blob = new Blob([fileBuffer], { type: 'application/pdf' });
    const response = await axios
      .put(upload_url, blob, {
        headers: {
          'Content-Type': 'application/pdf'
        }
      })
      .catch((error) => {
        return Promise.reject(`[Doc2x] Failed to upload file: ${getErrText(error)}`);
      });
    if (response.status !== 200) {
      return Promise.reject(
        `[Doc2x] Upload failed with status ${response.status}: ${response.statusText}`
      );
    }
    addLog.debug(`[Doc2x] Uploaded file success, uid: ${uid}`);

    await delay(POLL_INTERVAL_MS);

    // 3. Get the result by uid
    const checkResult = async () => {
      for (let retry = MAX_POLL_TIMES; retry > 0; retry--) {
        try {
          const {
            code,
            data: result_data,
            msg
          } = await request<{
            progress: number;
            status: 'ready' | 'processing' | 'failed' | 'success';
            // NOTE(review): assumed optional failure reason field — confirm against the Doc2x API.
            detail?: string;
            result: {
              pages: {
                md: string;
              }[];
            };
          }>(`/v2/parse/status?uid=${uid}`, null, 'GET');

          // Error. Returning a rejected promise (without await) bypasses the catch below.
          if (!SUCCESS_CODES.includes(code)) {
            return Promise.reject(`[Doc2x] Failed to get result (uid: ${uid}): ${msg}`);
          }

          // Failed: terminal state, stop polling immediately instead of burning all retries.
          if (result_data.status === 'failed') {
            return Promise.reject(
              `[Doc2x] Failed to parse file (uid: ${uid}): ${
                result_data.detail || msg || 'Unknown error'
              }`
            );
          }

          // Finish
          if (result_data.status === 'success') {
            const cleanedText = cleanMarkdown(
              result_data.result.pages.map((page) => page.md).join('')
            );

            const remainingTags = cleanedText.match(/<!--[\s\S]*?-->/g);
            if (remainingTags) {
              addLog.warn(`[Doc2x] Remaining dirty tags after cleaning:`, {
                count: remainingTags.length,
                tags: remainingTags.slice(0, 3)
              });
            }

            return {
              text: cleanedText,
              pages: result_data.result.pages.length
            };
          }

          // Process: any non-terminal status waits before the next poll,
          // so an unexpected status can never cause a tight request loop.
          addLog.debug(`[Doc2x] Waiting for the result, uid: ${uid}`);
          await delay(POLL_INTERVAL_MS);
        } catch (error) {
          // Thrown errors (e.g. network errors) are retried after a short back-off
          addLog.warn(`[Doc2x] Get result error`, { error });
          await delay(POLL_ERROR_DELAY_MS);
        }
      }
      return Promise.reject(`[Doc2x] Failed to get result (uid: ${uid}): Process timeout`);
    };

    const { text, pages } = await checkResult();

    // ![](url) => ![](base64)
    const parseTextImage = async (text: string) => {
      // Extract image links and replace each with a unique placeholder id
      const imageList: { id: string; url: string }[] = [];
      let processedText = text.replace(/!\[.*?\]\((http[^)]+)\)/g, (_match, url: string) => {
        const id = `IMAGE_${getNanoid()}_IMAGE`;
        imageList.push({
          id,
          url
        });
        return `![](${id})`;
      });

      // Get base64 from image url
      const resultImageList: ImageType[] = [];
      await batchRun(
        imageList,
        async (item) => {
          try {
            const { base64, mime } = await getImageBase64(item.url);
            resultImageList.push({
              uuid: item.id,
              mime,
              base64
            });
          } catch (error) {
            // Download failed: restore the original URL. A replacer function keeps
            // `$`-sequences inside the URL from being treated as replacement patterns.
            processedText = processedText.replace(item.id, () => item.url);
            addLog.warn(`[Doc2x] Failed to get image from ${item.url}: ${getErrText(error)}`);
          }
        },
        5
      );

      return {
        text: processedText,
        imageList: resultImageList
      };
    };

    const { text: formatText, imageList } = await parseTextImage(htmlTable2Md(text));

    addLog.debug(`[Doc2x] PDF parse finished`, {
      time: `${Math.round((Date.now() - startTime) / 1000)}s`,
      pages
    });

    return {
      pages,
      text: formatText,
      imageList
    };
  };

  return {
    parsePDF
  };
};