This commit is contained in:
YeYuheng 2025-12-25 09:28:51 +00:00 committed by GitHub
commit 5014c2e512
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 142 additions and 1 deletions

View File

@ -16,6 +16,8 @@
"url": "", // PDF
"key": "", // PDF
"doc2xKey": "", // doc2x
"textinAppId": "", // Textin App ID
"textinSecretCode": "", // Textin Secret Code
"price": 0 // PDF
}
}

View File

@ -189,6 +189,8 @@ export type customPdfParseType = {
url?: string;
key?: string;
doc2xKey?: string;
textinAppId?: string;
textinSecretCode?: string;
price?: number;
};

View File

@ -7,6 +7,7 @@ import { batchRun } from '@fastgpt/global/common/system/utils';
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
import { useDoc2xServer } from '../../../thirdProvider/doc2x';
import { useTextinServer } from '../../../thirdProvider/textin';
import { readRawContentFromBuffer } from '../../../worker/function';
import { uploadImage2S3Bucket } from '../../s3/utils';
import { Mimes } from '../../s3/constants';
@ -125,6 +126,30 @@ export const readS3FileContentByBuffer = async ({
imageList
};
};
// Textin api
/**
 * Parse the current file buffer with the Textin service.
 *
 * Reads the Textin credentials from `global.systemEnv.customPdfParse`; when
 * either one is missing, the built-in `systemParse` is used instead.
 * The parsed page count is reported through `createPdfParseUsage`.
 *
 * @returns The parsed markdown (used for both `rawText` and `formatText`)
 *          together with the extracted image list.
 */
const parsePdfFromTextin = async (): Promise<ReadFileResponse> => {
  const pdfParseEnv = global.systemEnv.customPdfParse;
  const appId = pdfParseEnv?.textinAppId;
  const secretCode = pdfParseEnv?.textinSecretCode;

  // Both credentials are required to call Textin.
  if (!(appId && secretCode)) return systemParse();

  const textinClient = useTextinServer({ appId, secretCode });
  const parsed = await textinClient.parsePDF(buffer);

  // Usage is recorded without awaiting the call.
  createPdfParseUsage({
    teamId,
    tmbId,
    pages: parsed.pages,
    usageId
  });

  return {
    rawText: parsed.text,
    formatText: parsed.text,
    imageList: parsed.imageList
  };
};
// Doc2x api
const parsePdfFromDoc2x = async (): Promise<ReadFileResponse> => {
const doc2xKey = global.systemEnv.customPdfParse?.doc2xKey;
@ -147,8 +172,17 @@ export const readS3FileContentByBuffer = async ({
};
// Custom read file service
const pdfParseFn = async (): Promise<ReadFileResponse> => {
// NOTE: the debug output of the configured Textin app id and doc2x key was
// removed here; credentials must never be written to the process logs.
if (!customPdfParse) return systemParse();
if (global.systemEnv.customPdfParse?.url) return parsePdfFromCustomService();
if (global.systemEnv.customPdfParse?.textinAppId) return parsePdfFromTextin();
if (global.systemEnv.customPdfParse?.doc2xKey) return parsePdfFromDoc2x();
return systemParse();

View File

@ -12,7 +12,9 @@ export const initFastGPTConfig = (config?: FastGPTConfigFileType) => {
// Special config computed
config.feConfigs.showCustomPdfParse =
!!config.systemEnv.customPdfParse?.url || !!config.systemEnv.customPdfParse?.doc2xKey;
!!config.systemEnv.customPdfParse?.url ||
!!config.systemEnv.customPdfParse?.textinAppId ||
!!config.systemEnv.customPdfParse?.doc2xKey;
config.feConfigs.customPdfParsePrice = config.systemEnv.customPdfParse?.price || 0;
global.feConfigs = config.feConfigs;

View File

@ -0,0 +1,99 @@
import { addLog } from '../../common/system/log';
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
import axios from 'axios';
import { getErrText } from '@fastgpt/global/common/error/utils';
/**
 * Create a small client for the Textin document-parsing HTTP API.
 *
 * @param appId - Textin application id, sent as the `x-ti-app-id` header.
 * @param secretCode - Textin secret code, sent as the `x-ti-secret-code` header.
 * @returns An object exposing `parsePDF`.
 */
export const useTextinServer = ({ appId, secretCode }: { appId: string; secretCode: string }) => {
  // Init request: every call shares the base URL, a 300000 ms timeout and the auth headers.
  const instance = axios.create({
    baseURL: 'https://api.textin.com/ai/service/v1',
    timeout: 300000,
    headers: {
      'x-ti-app-id': appId,
      'x-ti-secret-code': secretCode
    }
  });

  /**
   * Response error handler.
   * Converts anything thrown by the request into a rejected promise whose
   * reason is `{ message }`, with the message prefixed by `[Textin]`.
   */
  const responseError = (err: any) => {
    if (!err) {
      return Promise.reject({ message: '[Textin] Unknown error' });
    }
    if (typeof err === 'string') {
      return Promise.reject({ message: `[Textin] ${err}` });
    }
    // Prefer the response body returned by the server when there is one.
    if (err?.response?.data) {
      return Promise.reject({
        message: `[Textin] ${getErrText(err?.response?.data)}`
      });
    }
    if (typeof err.message === 'string') {
      return Promise.reject({ message: `[Textin] ${err.message}` });
    }

    addLog.error('[Textin] Unknown error', err);
    return Promise.reject({ message: `[Textin] ${getErrText(err)}` });
  };

  /**
   * Send a PDF to Textin's `/pdf_to_markdown` endpoint and return its content.
   *
   * @param fileBuffer - Raw bytes of the PDF, posted as `application/octet-stream`.
   * @returns `pages` (page count used for billing), `text` and `imageList`
   *          as produced by `matchMdImg` from the returned markdown.
   * Rejects with a `[Textin] ...` string when the API reports a non-200 code
   * or returns no markdown, and with `{ message }` when the request itself fails.
   */
  const parsePDF = async (fileBuffer: Buffer) => {
    addLog.debug('[Textin] PDF parse start');
    const startTime = Date.now();

    try {
      // Build request parameters (https://docs.textin.com/xparse/parse-quickstart#url%E5%8F%82%E6%95%B0%E8%AF%B4%E6%98%8E)
      const params = {
        get_image: 'objects', // Return the sub-images found inside each page
        image_output_type: 'base64str', // Return image objects as base64 strings
        parse_mode: 'auto', // Auto mode: extract the text of the PDF directly
        dpi: 144, // Coordinate reference: 144 dpi
        markdown_details: 1, // Return the detail field with markdown element details
        table_flavor: 'md', // Output tables in markdown syntax
        paratext_mode: 'none', // Do not include non-body content (headers, footers, etc.)
        page_details: 0, // Do not return the pages field
        remove_watermark: 1 // Remove watermarks
      };

      // Send request
      const { data } = await instance.post('/pdf_to_markdown', fileBuffer, {
        params,
        headers: { 'Content-Type': 'application/octet-stream' }
      });

      // Check response code.
      // The rejection is returned, not awaited, so it bypasses the catch below
      // and reaches the caller as this exact string.
      if (data.code !== 200) {
        return Promise.reject(
          `[Textin] API error: ${data.message || data.code || 'Unknown error'}`
        );
      }

      // Get markdown content
      const rawMarkdown = data.result?.markdown;
      if (!rawMarkdown) {
        return Promise.reject('[Textin] No markdown content in response');
      }

      // The raw markdown is deliberately not logged: it holds the whole
      // document, including inline base64 images.

      // Split the markdown into text and an image list (reuse existing utility function)
      const { text, imageList } = matchMdImg(rawMarkdown);

      // Get page count: `pages` is not requested (page_details: 0), so this
      // normally falls back to total_page_number, and to 1 as a last resort.
      const pages = data.result?.pages?.length || data.result?.total_page_number || 1;

      addLog.debug(`[Textin] PDF parse finished`, {
        time: `${Math.round((Date.now() - startTime) / 1000)}s`,
        pages
      });

      return {
        pages,
        text,
        imageList
      };
    } catch (error) {
      return responseError(error);
    }
  };

  return {
    parsePDF
  };
};

View File

@ -16,6 +16,8 @@
"url": "", // PDF
"key": "", // PDF
"doc2xKey": "", // doc2x
"textinAppId": "", // Textin App ID
"textinSecretCode": "", // Textin Secret Code
"price": 0 // PDF
}
}