mirror of
https://github.com/labring/FastGPT.git
synced 2025-12-25 20:02:47 +00:00
Merge d73cf97b65 into 4fbe27f2df
This commit is contained in:
commit
5014c2e512
|
|
@ -16,6 +16,8 @@
|
|||
"url": "", // 自定义 PDF 解析服务地址
|
||||
"key": "", // 自定义 PDF 解析服务密钥
|
||||
"doc2xKey": "", // doc2x 服务密钥
|
||||
"textinAppId": "", // 合合信息 Textin 服务 App ID
|
||||
"textinSecretCode": "", // 合合信息 Textin 服务 Secret Code
|
||||
"price": 0 // PDF 解析服务价格
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -189,6 +189,8 @@ export type customPdfParseType = {
|
|||
url?: string;
|
||||
key?: string;
|
||||
doc2xKey?: string;
|
||||
textinAppId?: string;
|
||||
textinSecretCode?: string;
|
||||
price?: number;
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import { batchRun } from '@fastgpt/global/common/system/utils';
|
|||
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
|
||||
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
|
||||
import { useDoc2xServer } from '../../../thirdProvider/doc2x';
|
||||
import { useTextinServer } from '../../../thirdProvider/textin';
|
||||
import { readRawContentFromBuffer } from '../../../worker/function';
|
||||
import { uploadImage2S3Bucket } from '../../s3/utils';
|
||||
import { Mimes } from '../../s3/constants';
|
||||
|
|
@ -125,6 +126,30 @@ export const readS3FileContentByBuffer = async ({
|
|||
imageList
|
||||
};
|
||||
};
|
||||
// Textin api
|
||||
/**
 * Parse the PDF buffer with the Textin service.
 *
 * Reads the Textin credentials from `global.systemEnv.customPdfParse`; when
 * either the app id or the secret code is missing, the built-in system parser
 * is used instead. On success a PDF-parse usage record is created for the
 * parsed page count and the returned text is used for both `rawText` and
 * `formatText`.
 */
const parsePdfFromTextin = async (): Promise<ReadFileResponse> => {
  const textinAppId = global.systemEnv.customPdfParse?.textinAppId;
  const textinSecretCode = global.systemEnv.customPdfParse?.textinSecretCode;

  // Both credentials are required; otherwise fall back to the system parser.
  const hasCredentials = !!textinAppId && !!textinSecretCode;
  if (!hasCredentials) {
    return systemParse();
  }

  const textinServer = useTextinServer({
    appId: textinAppId,
    secretCode: textinSecretCode
  });
  const parsed = await textinServer.parsePDF(buffer);

  // Record usage for the parsed pages (not awaited).
  createPdfParseUsage({
    teamId,
    tmbId,
    pages: parsed.pages,
    usageId
  });

  return {
    rawText: parsed.text,
    formatText: parsed.text,
    imageList: parsed.imageList
  };
};
|
||||
// Doc2x api
|
||||
const parsePdfFromDoc2x = async (): Promise<ReadFileResponse> => {
|
||||
const doc2xKey = global.systemEnv.customPdfParse?.doc2xKey;
|
||||
|
|
@ -147,8 +172,17 @@ export const readS3FileContentByBuffer = async ({
|
|||
};
|
||||
// Custom read file service
|
||||
const pdfParseFn = async (): Promise<ReadFileResponse> => {
|
||||
console.log(
|
||||
'global.systemEnv.customPdfParse?.textinAppId',
|
||||
global.systemEnv.customPdfParse?.textinAppId
|
||||
);
|
||||
console.log(
|
||||
'global.systemEnv.customPdfParse?.doc2xKey',
|
||||
global.systemEnv.customPdfParse?.doc2xKey
|
||||
);
|
||||
if (!customPdfParse) return systemParse();
|
||||
if (global.systemEnv.customPdfParse?.url) return parsePdfFromCustomService();
|
||||
if (global.systemEnv.customPdfParse?.textinAppId) return parsePdfFromTextin();
|
||||
if (global.systemEnv.customPdfParse?.doc2xKey) return parsePdfFromDoc2x();
|
||||
|
||||
return systemParse();
|
||||
|
|
|
|||
|
|
@ -12,7 +12,9 @@ export const initFastGPTConfig = (config?: FastGPTConfigFileType) => {
|
|||
|
||||
// Special config computed
|
||||
config.feConfigs.showCustomPdfParse =
|
||||
!!config.systemEnv.customPdfParse?.url || !!config.systemEnv.customPdfParse?.doc2xKey;
|
||||
!!config.systemEnv.customPdfParse?.url ||
|
||||
!!config.systemEnv.customPdfParse?.textinAppId ||
|
||||
!!config.systemEnv.customPdfParse?.doc2xKey;
|
||||
config.feConfigs.customPdfParsePrice = config.systemEnv.customPdfParse?.price || 0;
|
||||
|
||||
global.feConfigs = config.feConfigs;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,99 @@
|
|||
import { addLog } from '../../common/system/log';
|
||||
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
|
||||
import axios from 'axios';
|
||||
import { getErrText } from '@fastgpt/global/common/error/utils';
|
||||
|
||||
/**
 * Create a small client for the Textin PDF-to-markdown API.
 *
 * @param appId - Value sent in the `x-ti-app-id` request header.
 * @param secretCode - Value sent in the `x-ti-secret-code` request header.
 * @returns An object exposing `parsePDF`.
 */
export const useTextinServer = ({ appId, secretCode }: { appId: string; secretCode: string }) => {
  // Init request
  const instance = axios.create({
    baseURL: 'https://api.textin.com/ai/service/v1',
    timeout: 300000,
    headers: {
      'x-ti-app-id': appId,
      'x-ti-secret-code': secretCode
    }
  });

  /**
   * Normalise any thrown value into a rejected promise carrying a
   * `{ message }` object whose text is prefixed with `[Textin]`.
   */
  const responseError = (err: any) => {
    if (!err) {
      return Promise.reject({ message: '[Textin] Unknown error' });
    }
    if (typeof err === 'string') {
      return Promise.reject({ message: `[Textin] ${err}` });
    }
    // Prefer the response body of a failed HTTP request when one is present.
    if (err?.response?.data) {
      return Promise.reject({
        message: `[Textin] ${getErrText(err?.response?.data)}`
      });
    }
    if (typeof err.message === 'string') {
      return Promise.reject({ message: `[Textin] ${err.message}` });
    }

    addLog.error('[Textin] Unknown error', err);
    return Promise.reject({ message: `[Textin] ${getErrText(err)}` });
  };

  /**
   * Upload a PDF to Textin and convert the returned markdown.
   *
   * @param fileBuffer - Raw PDF bytes, posted as `application/octet-stream`.
   * @returns `pages` (page count, at least 1), `text` and `imageList` as
   *   produced by `matchMdImg` from the returned markdown.
   * @throws Rejects with a `[Textin] ...` string when the API reports a
   *   non-200 `code` or returns no markdown; rejects with `{ message }` for
   *   errors thrown while performing the request or processing the result.
   */
  const parsePDF = async (fileBuffer: Buffer) => {
    addLog.debug('[Textin] PDF parse start');
    const startTime = Date.now();

    try {
      // Build request parameters (https://docs.textin.com/xparse/parse-quickstart#url%E5%8F%82%E6%95%B0%E8%AF%B4%E6%98%8E)
      const params = {
        get_image: 'objects', // Return the sub-images found within each page
        image_output_type: 'base64str', // Image objects are returned as base64 strings
        parse_mode: 'auto', // Auto mode: extract text directly from the PDF
        dpi: 144, // Coordinate baseline of 144 dpi
        markdown_details: 1, // Return the detail field (markdown element details)
        table_flavor: 'md', // Output tables using markdown syntax
        paratext_mode: 'none', // Do not include non-body content (headers, footers, etc.)
        page_details: 0, // Do not return the pages field
        remove_watermark: 1 // Remove watermarks
      };

      // Send request
      const { data } = await instance.post('/pdf_to_markdown', fileBuffer, {
        params,
        headers: { 'Content-Type': 'application/octet-stream' }
      });

      // Check response code.
      // The rejection is returned (not awaited/thrown), so it is not routed
      // through the catch block below and keeps its single `[Textin]` prefix.
      if (data.code !== 200) {
        return Promise.reject(
          `[Textin] API error: ${data.message || data.code || 'Unknown error'}`
        );
      }

      // Get markdown content
      const rawMarkdown = data.result?.markdown;
      if (!rawMarkdown) {
        return Promise.reject('[Textin] No markdown content in response');
      }

      // Split the markdown into text and an image list (reuses the shared helper).
      // Note: the raw markdown is intentionally not logged; it contains the
      // full document content and base64-encoded images.
      const { text, imageList } = matchMdImg(rawMarkdown);

      // Get page count; falls back to 1 when the response carries no usable count.
      const pages = data.result?.pages?.length || data.result?.total_page_number || 1;

      addLog.debug(`[Textin] PDF parse finished`, {
        time: `${Math.round((Date.now() - startTime) / 1000)}s`,
        pages
      });

      return {
        pages,
        text,
        imageList
      };
    } catch (error) {
      return responseError(error);
    }
  };

  return {
    parsePDF
  };
};
|
||||
|
|
@ -16,6 +16,8 @@
|
|||
"url": "", // 自定义 PDF 解析服务地址
|
||||
"key": "", // 自定义 PDF 解析服务密钥
|
||||
"doc2xKey": "", // doc2x 服务密钥
|
||||
"textinAppId": "", // 合合信息 Textin 服务 App ID
|
||||
"textinSecretCode": "", // 合合信息 Textin 服务 Secret Code
|
||||
"price": 0 // PDF 解析服务价格
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue