diff --git a/document/public/deploy/config/config.json b/document/public/deploy/config/config.json index 5bfd98710..b99722471 100644 --- a/document/public/deploy/config/config.json +++ b/document/public/deploy/config/config.json @@ -16,6 +16,8 @@ "url": "", // 自定义 PDF 解析服务地址 "key": "", // 自定义 PDF 解析服务密钥 "doc2xKey": "", // doc2x 服务密钥 + "textinAppId": "", // 合合信息 Textin 服务 App ID + "textinSecretCode": "", // 合合信息 Textin 服务 Secret Code "price": 0 // PDF 解析服务价格 } } diff --git a/packages/global/common/system/types/index.d.ts b/packages/global/common/system/types/index.d.ts index 0ec10272f..8805e84d3 100644 --- a/packages/global/common/system/types/index.d.ts +++ b/packages/global/common/system/types/index.d.ts @@ -189,6 +189,8 @@ export type customPdfParseType = { url?: string; key?: string; doc2xKey?: string; + textinAppId?: string; + textinSecretCode?: string; price?: number; }; diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts index 5ec78e11c..954e7fe03 100644 --- a/packages/service/common/file/read/utils.ts +++ b/packages/service/common/file/read/utils.ts @@ -7,6 +7,7 @@ import { batchRun } from '@fastgpt/global/common/system/utils'; import { matchMdImg } from '@fastgpt/global/common/string/markdown'; import { createPdfParseUsage } from '../../../support/wallet/usage/controller'; import { useDoc2xServer } from '../../../thirdProvider/doc2x'; +import { useTextinServer } from '../../../thirdProvider/textin'; import { readRawContentFromBuffer } from '../../../worker/function'; import { uploadImage2S3Bucket } from '../../s3/utils'; import { Mimes } from '../../s3/constants'; @@ -125,6 +126,30 @@ export const readS3FileContentByBuffer = async ({ imageList }; }; + // Textin api + const parsePdfFromTextin = async (): Promise => { + const appId = global.systemEnv.customPdfParse?.textinAppId; + const secretCode = global.systemEnv.customPdfParse?.textinSecretCode; + if (!appId || !secretCode) return systemParse(); + + const { 
pages, text, imageList } = await useTextinServer({ + appId, + secretCode + }).parsePDF(buffer); + + createPdfParseUsage({ + teamId, + tmbId, + pages, + usageId + }); + + return { + rawText: text, + formatText: text, + imageList + }; + }; // Doc2x api const parsePdfFromDoc2x = async (): Promise => { const doc2xKey = global.systemEnv.customPdfParse?.doc2xKey; @@ -147,8 +172,17 @@ export const readS3FileContentByBuffer = async ({ }; // Custom read file service const pdfParseFn = async (): Promise => { + console.log( + 'global.systemEnv.customPdfParse?.textinAppId', + global.systemEnv.customPdfParse?.textinAppId + ); + console.log( + 'global.systemEnv.customPdfParse?.doc2xKey', + global.systemEnv.customPdfParse?.doc2xKey + ); if (!customPdfParse) return systemParse(); if (global.systemEnv.customPdfParse?.url) return parsePdfFromCustomService(); + if (global.systemEnv.customPdfParse?.textinAppId) return parsePdfFromTextin(); if (global.systemEnv.customPdfParse?.doc2xKey) return parsePdfFromDoc2x(); return systemParse(); diff --git a/packages/service/common/system/tools.ts b/packages/service/common/system/tools.ts index 0f5cff06f..8f5a9ffbb 100644 --- a/packages/service/common/system/tools.ts +++ b/packages/service/common/system/tools.ts @@ -12,7 +12,9 @@ export const initFastGPTConfig = (config?: FastGPTConfigFileType) => { // Special config computed config.feConfigs.showCustomPdfParse = - !!config.systemEnv.customPdfParse?.url || !!config.systemEnv.customPdfParse?.doc2xKey; + !!config.systemEnv.customPdfParse?.url || + !!config.systemEnv.customPdfParse?.textinAppId || + !!config.systemEnv.customPdfParse?.doc2xKey; config.feConfigs.customPdfParsePrice = config.systemEnv.customPdfParse?.price || 0; global.feConfigs = config.feConfigs; diff --git a/packages/service/thirdProvider/textin/index.ts b/packages/service/thirdProvider/textin/index.ts new file mode 100644 index 000000000..f04eaa559 --- /dev/null +++ b/packages/service/thirdProvider/textin/index.ts @@ -0,0 +1,99 @@ 
import { addLog } from '../../common/system/log';
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
import axios from 'axios';
import { getErrText } from '@fastgpt/global/common/error/utils';

/**
 * Creates a small client for the Textin "pdf_to_markdown" HTTP API.
 *
 * @param appId - Value sent in the `x-ti-app-id` request header.
 * @param secretCode - Value sent in the `x-ti-secret-code` request header.
 * @returns An object exposing `parsePDF`.
 */
export const useTextinServer = ({ appId, secretCode }: { appId: string; secretCode: string }) => {
  // Shared axios instance: credentials go in headers, 5 minute timeout.
  const instance = axios.create({
    baseURL: 'https://api.textin.com/ai/service/v1',
    timeout: 300000,
    headers: {
      'x-ti-app-id': appId,
      'x-ti-secret-code': secretCode
    }
  });

  /**
   * Normalizes any failure into a rejected promise of shape `{ message }`,
   * with the message prefixed by `[Textin]`.
   */
  const responseError = (err: any) => {
    if (!err) {
      return Promise.reject({ message: '[Textin] Unknown error' });
    }
    if (typeof err === 'string') {
      return Promise.reject({ message: `[Textin] ${err}` });
    }
    // HTTP-level failure: prefer the body returned by the server.
    if (err?.response?.data) {
      return Promise.reject({
        message: `[Textin] ${getErrText(err?.response?.data)}`
      });
    }
    if (typeof err.message === 'string') {
      return Promise.reject({ message: `[Textin] ${err.message}` });
    }

    addLog.error('[Textin] Unknown error', err);
    return Promise.reject({ message: `[Textin] ${getErrText(err)}` });
  };

  /**
   * Uploads a PDF buffer to Textin and returns the parsed markdown.
   *
   * @param fileBuffer - Raw PDF bytes, posted as `application/octet-stream`.
   * @returns `pages` (page count reported by the API, defaulting to 1) plus the
   *   `text` and `imageList` produced by `matchMdImg` from the returned markdown.
   * @throws Rejects with `{ message: string }` (prefixed `[Textin]`) on a transport
   *   error, a non-200 API code, or a response without markdown.
   */
  const parsePDF = async (fileBuffer: Buffer) => {
    addLog.debug('[Textin] PDF parse start');
    const startTime = Date.now();

    try {
      // Request parameters (https://docs.textin.com/xparse/parse-quickstart#url%E5%8F%82%E6%95%B0%E8%AF%B4%E6%98%8E)
      const params = {
        get_image: 'objects', // return the sub-images found inside each page
        image_output_type: 'base64str', // image objects are returned as base64 strings
        parse_mode: 'auto', // auto mode: extract the text directly from the PDF
        dpi: 144, // coordinates are based on 144 dpi
        markdown_details: 1, // return the detail field (markdown element details)
        table_flavor: 'md', // tables are emitted in markdown syntax
        paratext_mode: 'none', // hide non-body content (headers, footers, ...)
        page_details: 0, // do not return the pages field
        remove_watermark: 1 // strip watermarks
      };

      const { data } = await instance.post('/pdf_to_markdown', fileBuffer, {
        params,
        headers: { 'Content-Type': 'application/octet-stream' }
      });

      // Throw (instead of returning an un-awaited Promise.reject) so that every
      // failure goes through responseError and rejects with the same shape.
      if (data?.code !== 200) {
        throw new Error(`API error: ${data?.message || data?.code || 'Unknown error'}`);
      }

      const rawMarkdown = data.result?.markdown;
      if (!rawMarkdown) {
        throw new Error('No markdown content in response');
      }

      // Split the markdown into text and an image list with the shared helper.
      // The raw markdown is deliberately not logged: it is the full document content.
      const { text, imageList } = matchMdImg(rawMarkdown);

      // Page count: the pages array when present, else the reported total, else 1.
      const pages = data.result?.pages?.length || data.result?.total_page_number || 1;

      addLog.debug(`[Textin] PDF parse finished`, {
        time: `${Math.round((Date.now() - startTime) / 1000)}s`,
        pages
      });

      return {
        pages,
        text,
        imageList
      };
    } catch (error) {
      return responseError(error);
    }
  };

  return {
    parsePDF
  };
};