From 4d570ecd4f6ea79d0bf97ec9e84db1e5f7659e92 Mon Sep 17 00:00:00 2001 From: Menghuan1918 Date: Thu, 8 Aug 2024 18:56:05 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=8F=92=E4=BB=B6=EF=BC=9ADo?= =?UTF-8?q?c2X=E6=8F=92=E4=BB=B6=E9=80=82=E9=85=8D=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E5=8A=9F=E8=83=BD=20(#2284)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Change to download first and check contentType * Up to date * Some bug fix, still some bug with img * Update tool to read from file * improve formatting in PDF * Add tool of img file OCR , but meet some bug * Bug fix for parameter passing error. * Modification Introduction --- packages/plugins/register.ts | 2 + .../plugins/src/Doc2X/FileImg2text/index.ts | 172 +++++++ .../src/Doc2X/FileImg2text/template.json | 437 ++++++++++++++++++ .../plugins/src/Doc2X/FilePDF2text/index.ts | 165 +++++++ .../src/Doc2X/FilePDF2text/template.json | 389 ++++++++++++++++ .../plugins/src/Doc2X/URLImg2text/index.ts | 56 ++- .../src/Doc2X/URLImg2text/template.json | 4 +- .../plugins/src/Doc2X/URLPDF2text/index.ts | 30 +- .../src/Doc2X/URLPDF2text/template.json | 4 +- packages/plugins/src/Doc2X/template.json | 2 +- 10 files changed, 1230 insertions(+), 31 deletions(-) create mode 100644 packages/plugins/src/Doc2X/FileImg2text/index.ts create mode 100644 packages/plugins/src/Doc2X/FileImg2text/template.json create mode 100644 packages/plugins/src/Doc2X/FilePDF2text/index.ts create mode 100644 packages/plugins/src/Doc2X/FilePDF2text/template.json diff --git a/packages/plugins/register.ts b/packages/plugins/register.ts index 66e91f15b..648b23221 100644 --- a/packages/plugins/register.ts +++ b/packages/plugins/register.ts @@ -13,6 +13,8 @@ const staticPluginList = [ 'Doc2X', 'Doc2X/URLPDF2text', 'Doc2X/URLImg2text', + `Doc2X/FilePDF2text`, + `Doc2X/FileImg2text`, 'feishu' ]; // Run in worker thread (Have npm packages) diff --git 
a/packages/plugins/src/Doc2X/FileImg2text/index.ts b/packages/plugins/src/Doc2X/FileImg2text/index.ts
new file mode 100644
index 000000000..2789cdd92
--- /dev/null
+++ b/packages/plugins/src/Doc2X/FileImg2text/index.ts
@@ -0,0 +1,195 @@
+import { delay } from '@fastgpt/global/common/system/utils';
+import { addLog } from '@fastgpt/service/common/system/log';
+
+type Props = {
+  apikey: string;
+  files: Array<string>;
+  img_correction: boolean;
+  formula: boolean;
+};
+
+// Response type same as HTTP outputs
+type Response = Promise<{
+  result: string;
+  failreason: string;
+  success: boolean;
+}>;
+
+/**
+ * Rewrite LaTeX delimiters to dollar style: `\(` and `\)` become `$`,
+ * `\[` and `\]` become `$$`.
+ */
+const normalizeMathDelimiters = (markdown: string): string =>
+  markdown
+    .replace(/\\[\(\)]/g, () => '$')
+    // A replacer function is needed: in a replacement string, `$$` is an
+    // escape sequence that inserts only a single `$`.
+    .replace(/\\[\[\]]/g, () => '$$');
+
+/**
+ * Send each image in `files` to Doc2X and collect the recognized markdown.
+ *
+ * Files are handled one after another. When a file fails, the reason is
+ * appended to `failreason` and processing moves on to the next file.
+ *
+ * @param apikey - Doc2X credential. A value that does not start with `sk-` is
+ *   first exchanged for a token at `/api/token/refresh`.
+ * @param files - URLs the images are downloaded from.
+ * @param img_correction - Sent to Doc2X as the `img_correction` form field.
+ * @param formula - Sent to Doc2X as the `equation` form field.
+ * @returns `result` with one `---`-separated section per recognized file,
+ *   `failreason` with one section per failed file, and `success`, which is
+ *   true only when no file failed.
+ */
+const main = async ({ apikey, files, img_correction, formula }: Props): Response => {
+  // Check the apikey
+  if (!apikey) {
+    return {
+      result: '',
+      failreason: `API key is required`,
+      success: false
+    };
+  }
+
+  let real_api_key = apikey;
+  if (!apikey.startsWith('sk-')) {
+    const response = await fetch('https://api.doc2x.noedgeai.com/api/token/refresh', {
+      method: 'POST',
+      headers: {
+        Authorization: `Bearer ${apikey}`
+      }
+    });
+    if (response.status !== 200) {
+      return {
+        result: '',
+        failreason: `Get token failed: ${await response.text()}`,
+        success: false
+      };
+    }
+    const data = await response.json();
+    real_api_key = data.data.token;
+  }
+
+  // Keys starting with `sk-` use the v1 endpoints, all others the platform ones
+  const api_base = real_api_key.startsWith('sk-')
+    ? 'https://api.doc2x.noedgeai.com/api/v1'
+    : 'https://api.doc2x.noedgeai.com/api/platform';
+  const upload_url = `${api_base}/async/img`;
+  const auth_headers = { Authorization: `Bearer ${real_api_key}` };
+
+  let final_result = '';
+  let fail_reason = '';
+  let flag = false;
+  // Process each file one by one
+  for (const url of files) {
+    // Fetch the image and check its content type
+    let imageResponse;
+    try {
+      imageResponse = await fetch(url);
+    } catch (e) {
+      // Network-level failure: record it and keep going with the other files
+      fail_reason += `\n---\nFile:${url} \n\nFailed to fetch image from URL: ${e}\n\n`;
+      flag = true;
+      continue;
+    }
+    if (!imageResponse.ok) {
+      fail_reason += `\n---\nFile:${url} \n\nFailed to fetch image from URL\n\n`;
+      flag = true;
+      continue;
+    }
+
+    const contentType = imageResponse.headers.get('content-type');
+    const fileName = url.match(/read\?filename=([^&]+)/)?.[1] || 'unknown.png';
+    if (!contentType || !contentType.startsWith('image/')) {
+      fail_reason += `\n---\nFile:${url} \n\nThe provided URL does not point to an image: ${contentType}\n\n`;
+      flag = true;
+      continue;
+    }
+
+    const blob = await imageResponse.blob();
+    const formData = new FormData();
+    formData.append('file', blob, fileName);
+    formData.append('img_correction', img_correction ? '1' : '0');
+    formData.append('equation', formula ? '1' : '0');
+
+    // Upload the file; only a 429 (rate limit) response is retried
+    let uuid: string | undefined;
+    let upload_error = '';
+    const maxUploadAttempts = 3;
+    for (let attempt = 1; attempt <= maxUploadAttempts; attempt++) {
+      const upload_response = await fetch(upload_url, {
+        method: 'POST',
+        headers: auth_headers,
+        body: formData
+      });
+      if (upload_response.ok) {
+        const upload_data = await upload_response.json();
+        uuid = upload_data?.data?.uuid;
+        if (!uuid) {
+          upload_error = 'No uuid in upload response';
+        }
+        break;
+      }
+      // Rate limit, wait for 10s and retry at most 3 times
+      if (upload_response.status === 429 && attempt < maxUploadAttempts) {
+        await delay(10000);
+        continue;
+      }
+      upload_error = await upload_response.text();
+      break;
+    }
+    if (!uuid) {
+      // Without a uuid there is nothing to poll, so skip to the next file
+      fail_reason += `\n---\nFile:${fileName}\n\nFailed to upload file: ${upload_error}\n\n`;
+      flag = true;
+      continue;
+    }
+
+    // Get the result by uuid
+    const result_url = `${api_base}/async/status?uuid=${uuid}`;
+
+    let finished = false;
+    const maxAttempts = 100;
+    // Poll until a terminal status is reported, waiting 1s between attempts
+    for (let attempt = 0; attempt < maxAttempts; attempt++) {
+      const result_response = await fetch(result_url, { headers: auth_headers });
+      if (!result_response.ok) {
+        fail_reason += `\n---\nFile:${fileName}\n\nFailed to get result: ${await result_response.text()}\n\n`;
+        flag = true;
+        finished = true;
+        break;
+      }
+      const result_data = await result_response.json();
+      const status = result_data.data.status;
+      if (['ready', 'processing'].includes(status)) {
+        await delay(1000);
+        continue;
+      }
+
+      // Every other status is terminal
+      finished = true;
+      if (status === 'success') {
+        // Only the first page is used; a missing page yields a blank section
+        const md = result_data.data.result?.pages?.[0]?.md;
+        const text = typeof md === 'string' ? normalizeMathDelimiters(md) : ' ';
+        final_result += `\n---\nFile:${fileName}\n\n${text}\n\n`;
+      } else {
+        fail_reason += `\n---\nFile:${fileName}\n\nFailed to get result: ${status}\n\n`;
+        flag = true;
+      }
+      break;
+    }
+    if (!finished) {
+      fail_reason += `\n---\nFile:${fileName}\n\nTimeout waiting for result\n\n`;
+      flag = true;
+    }
+  }
+
+  return {
+    result: final_result,
+    failreason: fail_reason,
+    success: !flag
+  };
+};
+
+export default main;
diff --git a/packages/plugins/src/Doc2X/FileImg2text/template.json b/packages/plugins/src/Doc2X/FileImg2text/template.json new file mode 100644 index 000000000..9c877b832 --- /dev/null +++ b/packages/plugins/src/Doc2X/FileImg2text/template.json @@ -0,0 +1,437 @@ +{ + "author": "Menghuan1918", + "version": "488", + "name": "Doc2X 图像(文件)识别", + "avatar": "plugins/doc2x", + "intro": "将上传的图片文件发送至Doc2X进行解析,返回带LaTeX公式的markdown格式的文本", + "inputExplanationUrl": "https://fael3z0zfze.feishu.cn/wiki/Rkc5witXWiJoi5kORd2cofh6nDg?fromScene=spaceOverview", + "showStatus": true, + "weight": 10, + + "isTool": true, + "templateType": "tools", + + "workflow": { + "nodes": [ + { + "nodeId": "pluginInput", + "name": "自定义插件输入", + "intro": "可以配置插件需要哪些输入,利用这些输入来运行插件", + "avatar": "core/workflow/template/workflowStart", + "flowNodeType": "pluginInput", + "showStatus": false, + "position": { + "x": 362.9862638626885, + "y": 6.16353826540589 + }, + "version": "481", + "inputs": [ + { + "renderTypeList": ["input"], + "selectedTypeIndex": 0, +
"valueType": "string", + "canEdit": true, + "key": "apikey", + "label": "apikey", + "description": "Doc2X的验证密匙,对于个人用户可以从Doc2X官网 - 个人信息 - 身份令牌获得", + "required": true, + "toolDescription": "", + "defaultValue": "" + }, + { + "renderTypeList": ["reference"], + "selectedTypeIndex": 0, + "valueType": "arrayString", + "canEdit": true, + "key": "files", + "label": "files", + "description": "待处理图片文件", + "required": true, + "toolDescription": "待处理图片文件" + }, + { + "renderTypeList": ["switch"], + "selectedTypeIndex": 0, + "valueType": "boolean", + "canEdit": true, + "key": "img_correction", + "label": "img_correction", + "description": "是否启用图形矫正功能", + "required": true, + "toolDescription": "", + "defaultValue": false + }, + { + "renderTypeList": ["switch"], + "selectedTypeIndex": 0, + "valueType": "boolean", + "canEdit": true, + "key": "formula", + "label": "formula", + "description": "是否开启纯公式识别(仅适用于图片内容仅有公式时)", + "required": true, + "toolDescription": "", + "defaultValue": false + } + ], + "outputs": [ + { + "id": "apikey", + "valueType": "string", + "key": "apikey", + "label": "apikey", + "type": "hidden" + }, + { + "id": "url", + "valueType": "arrayString", + "key": "files", + "label": "files", + "type": "hidden" + }, + { + "id": "img_correction", + "valueType": "boolean", + "key": "img_correction", + "label": "img_correction", + "type": "hidden" + }, + { + "id": "formula", + "valueType": "boolean", + "key": "formula", + "label": "formula", + "type": "hidden" + } + ] + }, + { + "nodeId": "pluginOutput", + "name": "自定义插件输出", + "intro": "自定义配置外部输出,使用插件时,仅暴露自定义配置的输出", + "avatar": "core/workflow/template/pluginOutput", + "flowNodeType": "pluginOutput", + "showStatus": false, + "position": { + "x": 1661.4708279314577, + "y": 23.877720915480012 + }, + "version": "481", + "inputs": [ + { + "renderTypeList": ["reference"], + "valueType": "string", + "canEdit": true, + "key": "result", + "label": "result", + "description": "处理结果(或者是报错信息)", + "value": ["zHG5jJBkXmjB", 
"xWQuEf50F3mr"] + }, + { + "renderTypeList": ["reference"], + "valueType": "string", + "canEdit": true, + "key": "failreason", + "label": "failreason", + "description": "文件处理失败原因,由文件名以及报错组成,多个文件之间由横线分隔开", + "value": ["zHG5jJBkXmjB", "jbv4nVZvmFXm"] + }, + { + "renderTypeList": ["reference"], + "valueType": "boolean", + "canEdit": true, + "key": "success", + "label": "success", + "description": "是否全部文件都处理成功,如有没有处理成功的文件,失败原因将会输出在failreason中", + "value": ["zHG5jJBkXmjB", "k46cjNulVk5Y"] + } + ], + "outputs": [] + }, + { + "nodeId": "zHG5jJBkXmjB", + "name": "HTTP 请求", + "intro": "可以发出一个 HTTP 请求,实现更为复杂的操作(联网搜索、数据库查询等)", + "avatar": "core/workflow/template/httpRequest", + "flowNodeType": "httpRequest468", + "showStatus": true, + "position": { + "x": 1081.967607938733, + "y": -426.08028677656125 + }, + "version": "481", + "inputs": [ + { + "key": "system_addInputParam", + "renderTypeList": ["addInputParam"], + "valueType": "dynamic", + "label": "", + "required": false, + "description": "core.module.input.description.HTTP Dynamic Input", + "customInputConfig": { + "selectValueTypeList": [ + "string", + "number", + "boolean", + "object", + "arrayString", + "arrayNumber", + "arrayBoolean", + "arrayObject", + "any", + "chatHistory", + "datasetQuote", + "dynamic", + "selectApp", + "selectDataset" + ], + "showDescription": false, + "showDefaultValue": true + } + }, + { + "key": "system_httpMethod", + "renderTypeList": ["custom"], + "valueType": "string", + "label": "", + "value": "POST", + "required": true + }, + { + "key": "system_httpReqUrl", + "renderTypeList": ["hidden"], + "valueType": "string", + "label": "", + "description": "core.module.input.description.Http Request Url", + "placeholder": "https://api.ai.com/getInventory", + "required": false, + "value": "Doc2X/FileImg2text" + }, + { + "key": "system_httpHeader", + "renderTypeList": ["custom"], + "valueType": "any", + "value": [], + "label": "", + "description": "core.module.input.description.Http Request Header", + 
"placeholder": "core.module.input.description.Http Request Header", + "required": false + }, + { + "key": "system_httpParams", + "renderTypeList": ["hidden"], + "valueType": "any", + "value": [], + "label": "", + "required": false + }, + { + "key": "system_httpJsonBody", + "renderTypeList": ["hidden"], + "valueType": "any", + "value": "{\n \"apikey\": \"{{apikey}}\",\n \"files\": {{files}},\n \"img_correction\": {{img_correction}},\n \"formula\": {{formula}}\n}", + "label": "", + "required": false + }, + { + "renderTypeList": ["reference"], + "valueType": "string", + "canEdit": true, + "key": "apikey", + "label": "apikey", + "customInputConfig": { + "selectValueTypeList": [ + "string", + "number", + "boolean", + "object", + "arrayString", + "arrayNumber", + "arrayBoolean", + "arrayObject", + "any", + "chatHistory", + "datasetQuote", + "dynamic", + "selectApp", + "selectDataset" + ], + "showDescription": false, + "showDefaultValue": true + }, + "required": true, + "value": ["pluginInput", "apikey"] + }, + { + "renderTypeList": ["reference"], + "valueType": "arrayString", + "canEdit": true, + "key": "files", + "label": "files", + "customInputConfig": { + "selectValueTypeList": [ + "string", + "number", + "boolean", + "object", + "arrayString", + "arrayNumber", + "arrayBoolean", + "arrayObject", + "any", + "chatHistory", + "datasetQuote", + "dynamic", + "selectApp", + "selectDataset" + ], + "showDescription": false, + "showDefaultValue": true + }, + "required": true, + "value": ["pluginInput", "url"] + }, + { + "renderTypeList": ["reference"], + "valueType": "boolean", + "canEdit": true, + "key": "img_correction", + "label": "img_correction", + "customInputConfig": { + "selectValueTypeList": [ + "string", + "number", + "boolean", + "object", + "arrayString", + "arrayNumber", + "arrayBoolean", + "arrayObject", + "any", + "chatHistory", + "datasetQuote", + "dynamic", + "selectApp", + "selectDataset" + ], + "showDescription": false, + "showDefaultValue": true + }, + 
"required": true, + "value": ["pluginInput", "img_correction"] + }, + { + "renderTypeList": ["reference"], + "valueType": "boolean", + "canEdit": true, + "key": "formula", + "label": "formula", + "customInputConfig": { + "selectValueTypeList": [ + "string", + "number", + "boolean", + "object", + "arrayString", + "arrayNumber", + "arrayBoolean", + "arrayObject", + "any", + "chatHistory", + "datasetQuote", + "dynamic", + "selectApp", + "selectDataset" + ], + "showDescription": false, + "showDefaultValue": true + }, + "required": true, + "value": ["pluginInput", "formula"] + } + ], + "outputs": [ + { + "id": "error", + "key": "error", + "label": "请求错误", + "description": "HTTP请求错误信息,成功时返回空", + "valueType": "object", + "type": "static" + }, + { + "id": "httpRawResponse", + "key": "httpRawResponse", + "label": "原始响应", + "required": true, + "description": "HTTP请求的原始响应。只能接受字符串或JSON类型响应数据。", + "valueType": "any", + "type": "static" + }, + { + "id": "system_addOutputParam", + "key": "system_addOutputParam", + "type": "dynamic", + "valueType": "dynamic", + "label": "", + "customFieldConfig": { + "selectValueTypeList": [ + "string", + "number", + "boolean", + "object", + "arrayString", + "arrayNumber", + "arrayBoolean", + "arrayObject", + "any", + "chatHistory", + "datasetQuote", + "dynamic", + "selectApp", + "selectDataset" + ], + "showDescription": false, + "showDefaultValue": false + } + }, + { + "id": "xWQuEf50F3mr", + "valueType": "string", + "type": "dynamic", + "key": "result", + "label": "result" + }, + { + "id": "jbv4nVZvmFXm", + "valueType": "string", + "type": "dynamic", + "key": "failreason", + "label": "failreason" + }, + { + "id": "k46cjNulVk5Y", + "valueType": "boolean", + "type": "dynamic", + "key": "success", + "label": "success" + } + ] + } + ], + "edges": [ + { + "source": "pluginInput", + "target": "zHG5jJBkXmjB", + "sourceHandle": "pluginInput-source-right", + "targetHandle": "zHG5jJBkXmjB-target-left" + }, + { + "source": "zHG5jJBkXmjB", + "target": 
"pluginOutput", + "sourceHandle": "zHG5jJBkXmjB-source-right", + "targetHandle": "pluginOutput-target-left" + } + ] + } +}
diff --git a/packages/plugins/src/Doc2X/FilePDF2text/index.ts b/packages/plugins/src/Doc2X/FilePDF2text/index.ts
new file mode 100644
index 000000000..4b6695a5a
--- /dev/null
+++ b/packages/plugins/src/Doc2X/FilePDF2text/index.ts
@@ -0,0 +1,196 @@
+import { delay } from '@fastgpt/global/common/system/utils';
+import { addLog } from '@fastgpt/service/common/system/log';
+import { result } from 'lodash';
+
+type Props = {
+  apikey: string;
+  files: Array<string>;
+  ocr: boolean;
+};
+
+// Response type same as HTTP outputs
+type Response = Promise<{
+  result: string;
+  failreason: string;
+  success: boolean;
+}>;
+
+/**
+ * Rewrite LaTeX delimiters to dollar style: `\(` and `\)` become `$`,
+ * `\[` and `\]` become `$$`.
+ */
+const normalizeMathDelimiters = (markdown: string): string =>
+  markdown
+    .replace(/\\[\(\)]/g, () => '$')
+    // A replacer function is needed: in a replacement string, `$$` is an
+    // escape sequence that inserts only a single `$`.
+    .replace(/\\[\[\]]/g, () => '$$');
+
+/**
+ * Send each PDF in `files` to Doc2X and collect the recognized markdown.
+ *
+ * Files are handled one after another. When a file fails, the reason is
+ * appended to `failreason` and processing moves on to the next file.
+ *
+ * @param apikey - Doc2X credential. A value that does not start with `sk-` is
+ *   first exchanged for a token at `/api/token/refresh`.
+ * @param files - URLs the PDF files are downloaded from.
+ * @param ocr - Sent to Doc2X as the `ocr` form field.
+ * @returns `result` with one `---`-separated section per recognized file,
+ *   `failreason` with one section per failed file, and `success`, which is
+ *   true only when no file failed.
+ */
+const main = async ({ apikey, files, ocr }: Props): Response => {
+  // Check the apikey
+  if (!apikey) {
+    return {
+      result: '',
+      failreason: `API key is required`,
+      success: false
+    };
+  }
+
+  let real_api_key = apikey;
+  if (!apikey.startsWith('sk-')) {
+    const response = await fetch('https://api.doc2x.noedgeai.com/api/token/refresh', {
+      method: 'POST',
+      headers: {
+        Authorization: `Bearer ${apikey}`
+      }
+    });
+    if (response.status !== 200) {
+      return {
+        result: '',
+        failreason: `Get token failed: ${await response.text()}`,
+        success: false
+      };
+    }
+    const data = await response.json();
+    real_api_key = data.data.token;
+  }
+
+  // Keys starting with `sk-` use the v1 endpoints, all others the platform ones
+  const api_base = real_api_key.startsWith('sk-')
+    ? 'https://api.doc2x.noedgeai.com/api/v1'
+    : 'https://api.doc2x.noedgeai.com/api/platform';
+  const upload_url = `${api_base}/async/pdf`;
+  const auth_headers = { Authorization: `Bearer ${real_api_key}` };
+
+  let final_result = '';
+  let fail_reason = '';
+  let flag = false;
+  // Process each file one by one
+  for (const url of files) {
+    // Fetch the PDF and check its content type
+    let PDFResponse;
+    try {
+      PDFResponse = await fetch(url);
+    } catch (e) {
+      // Network-level failure: record it and keep going with the other files
+      fail_reason += `\n---\nFile:${url} \n\nFailed to fetch PDF from URL: ${e}\n\n`;
+      flag = true;
+      continue;
+    }
+    if (!PDFResponse.ok) {
+      fail_reason += `\n---\nFile:${url} \n\nFailed to fetch PDF from URL\n\n`;
+      flag = true;
+      continue;
+    }
+
+    const contentType = PDFResponse.headers.get('content-type');
+    const file_name = url.match(/read\?filename=([^&]+)/)?.[1] || 'unknown.pdf';
+    if (!contentType || !contentType.startsWith('application/pdf')) {
+      fail_reason += `\n---\nFile:${file_name}\n\nThe provided file does not point to a PDF: ${contentType}\n\n`;
+      flag = true;
+      continue;
+    }
+
+    const blob = await PDFResponse.blob();
+    const formData = new FormData();
+    formData.append('file', blob, file_name);
+    formData.append('ocr', ocr ? '1' : '0');
+
+    // Upload the file; only a 429 (rate limit) response is retried
+    let uuid: string | undefined;
+    let upload_error = '';
+    const maxUploadAttempts = 3;
+    for (let attempt = 1; attempt <= maxUploadAttempts; attempt++) {
+      const upload_response = await fetch(upload_url, {
+        method: 'POST',
+        headers: auth_headers,
+        body: formData
+      });
+      if (upload_response.ok) {
+        const upload_data = await upload_response.json();
+        uuid = upload_data?.data?.uuid;
+        if (!uuid) {
+          upload_error = 'No uuid in upload response';
+        }
+        break;
+      }
+      // Rate limit, wait for 10s and retry at most 3 times
+      if (upload_response.status === 429 && attempt < maxUploadAttempts) {
+        await delay(10000);
+        continue;
+      }
+      upload_error = await upload_response.text();
+      break;
+    }
+    if (!uuid) {
+      // Without a uuid there is nothing to poll, so skip to the next file
+      fail_reason += `\n---\nFile:${file_name}\n\nFailed to upload file: ${upload_error}\n\n`;
+      flag = true;
+      continue;
+    }
+
+    // Get the result by uuid
+    const result_url = `${api_base}/async/status?uuid=${uuid}`;
+
+    let finished = false;
+    const maxAttempts = 100;
+    // Poll until a terminal status is reported, waiting 1s between attempts
+    for (let attempt = 0; attempt < maxAttempts; attempt++) {
+      const result_response = await fetch(result_url, { headers: auth_headers });
+      if (!result_response.ok) {
+        fail_reason += `\n---\nFile:${file_name}\n\nFailed to get result: ${await result_response.text()}\n\n`;
+        flag = true;
+        finished = true;
+        break;
+      }
+      const result_data = await result_response.json();
+      const status = result_data.data.status;
+      if (['ready', 'processing'].includes(status)) {
+        await delay(1000);
+        continue;
+      }
+
+      // Every other status is terminal
+      finished = true;
+      if (status === 'success') {
+        // Join the markdown of all pages; a missing page list yields an empty section
+        const pages: Array<{ md?: string }> = result_data.data.result?.pages ?? [];
+        const markdown = normalizeMathDelimiters(pages.map((page) => page.md ?? '').join('\n'));
+        final_result += `\n---\nFile:${file_name}\n\n${markdown}\n\n`;
+      } else if (status === 'pages limit exceeded') {
+        fail_reason += `\n---\nFile:${file_name}\n\nPages limit exceeded\n\n`;
+        flag = true;
+      } else {
+        fail_reason += `\n---\nFile:${file_name}\n\nFailed to get result: ${status}\n\n`;
+        flag = true;
+      }
+      break;
+    }
+    if (!finished) {
+      fail_reason += `\n---\nFile:${file_name}\n\nTimeout after 100s for uuid ${uuid}\n\n`;
+      flag = true;
+    }
+  }
+
+  return {
+    result: final_result,
+    failreason: fail_reason,
+    success: !flag
+  };
+};
+
+export default main;
diff --git a/packages/plugins/src/Doc2X/FilePDF2text/template.json b/packages/plugins/src/Doc2X/FilePDF2text/template.json new file mode 100644 index 000000000..afb7ab800 --- /dev/null +++ b/packages/plugins/src/Doc2X/FilePDF2text/template.json @@ -0,0 +1,389 @@ +{ + "author": "Menghuan1918", + "version": "488", + "name": "Doc2X PDF文件(文件)识别", + "avatar": "plugins/doc2x", + "intro": "将上传的PDF文件发送至Doc2X进行解析,返回带LaTeX公式的markdown格式的文本", + "inputExplanationUrl": "https://fael3z0zfze.feishu.cn/wiki/Rkc5witXWiJoi5kORd2cofh6nDg?fromScene=spaceOverview", + "showStatus": true, + "weight": 10, + + "isTool": true, + "templateType": "tools", + + "workflow": { + "nodes": [ + { + "nodeId": "pluginInput", + "name": "自定义插件输入", + "intro": "可以配置插件需要哪些输入,利用这些输入来运行插件", + "avatar": "core/workflow/template/workflowStart", + "flowNodeType": "pluginInput", + "showStatus": false, + "position": { + "x": 388.243055058894, + "y": -75.09744210499466 + }, + "version": "481", +
"inputs": [ + { + "renderTypeList": ["input"], + "selectedTypeIndex": 0, + "valueType": "string", + "canEdit": true, + "key": "apikey", + "label": "apikey", + "description": "Doc2X的验证密匙,对于个人用户可以从Doc2X官网 - 个人信息 - 身份令牌获得", + "required": true, + "toolDescription": "", + "defaultValue": "" + }, + { + "renderTypeList": ["reference"], + "selectedTypeIndex": 0, + "valueType": "arrayString", + "canEdit": true, + "key": "files", + "label": "files", + "description": "待处理的PDF文件", + "required": true, + "toolDescription": "待处理的PDF文件" + }, + { + "renderTypeList": ["switch"], + "selectedTypeIndex": 0, + "valueType": "boolean", + "canEdit": true, + "key": "ocr", + "label": "ocr", + "description": "是否开启对PDF文件内图片的OCR识别,建议开启", + "required": true, + "toolDescription": "", + "defaultValue": true + } + ], + "outputs": [ + { + "id": "apikey", + "valueType": "string", + "key": "apikey", + "label": "apikey", + "type": "hidden" + }, + { + "id": "url", + "valueType": "arrayString", + "key": "files", + "label": "files", + "type": "hidden" + }, + { + "id": "formula", + "valueType": "boolean", + "key": "ocr", + "label": "ocr", + "type": "hidden" + } + ] + }, + { + "nodeId": "pluginOutput", + "name": "自定义插件输出", + "intro": "自定义配置外部输出,使用插件时,仅暴露自定义配置的输出", + "avatar": "core/workflow/template/pluginOutput", + "flowNodeType": "pluginOutput", + "showStatus": false, + "position": { + "x": 1649.7796447278438, + "y": -96.05331527115042 + }, + "version": "481", + "inputs": [ + { + "renderTypeList": ["reference"], + "valueType": "string", + "canEdit": true, + "key": "result", + "label": "result", + "description": "处理结果,由文件名以及文档内容组成,多个文件之间由横线分隔开", + "value": ["zHG5jJBkXmjB", "xWQuEf50F3mr"] + }, + { + "renderTypeList": ["reference"], + "valueType": "string", + "canEdit": true, + "key": "failreason", + "label": "failreason", + "description": "文件处理失败原因,由文件名以及报错组成,多个文件之间由横线分隔开", + "value": ["zHG5jJBkXmjB", "yDxzW5CFalGw"] + }, + { + "renderTypeList": ["reference"], + "valueType": "boolean", + "canEdit": true, + 
"key": "success", + "label": "success", + "description": "是否全部文件都处理成功,如有没有处理成功的文件,失败原因将会输出在failreason中", + "value": ["zHG5jJBkXmjB", "m6CJJj7GFud5"] + } + ], + "outputs": [] + }, + { + "nodeId": "zHG5jJBkXmjB", + "name": "HTTP 请求", + "intro": "可以发出一个 HTTP 请求,实现更为复杂的操作(联网搜索、数据库查询等)", + "avatar": "core/workflow/template/httpRequest", + "flowNodeType": "httpRequest468", + "showStatus": true, + "position": { + "x": 1077.7986740892777, + "y": -496.9521622173004 + }, + "version": "481", + "inputs": [ + { + "key": "system_addInputParam", + "renderTypeList": ["addInputParam"], + "valueType": "dynamic", + "label": "", + "required": false, + "description": "core.module.input.description.HTTP Dynamic Input", + "customInputConfig": { + "selectValueTypeList": [ + "string", + "number", + "boolean", + "object", + "arrayString", + "arrayNumber", + "arrayBoolean", + "arrayObject", + "any", + "chatHistory", + "datasetQuote", + "dynamic", + "selectApp", + "selectDataset" + ], + "showDescription": false, + "showDefaultValue": true + } + }, + { + "key": "system_httpMethod", + "renderTypeList": ["custom"], + "valueType": "string", + "label": "", + "value": "POST", + "required": true + }, + { + "key": "system_httpReqUrl", + "renderTypeList": ["hidden"], + "valueType": "string", + "label": "", + "description": "core.module.input.description.Http Request Url", + "placeholder": "https://api.ai.com/getInventory", + "required": false, + "value": "Doc2X/FilePDF2text" + }, + { + "key": "system_httpHeader", + "renderTypeList": ["custom"], + "valueType": "any", + "value": [], + "label": "", + "description": "core.module.input.description.Http Request Header", + "placeholder": "core.module.input.description.Http Request Header", + "required": false + }, + { + "key": "system_httpParams", + "renderTypeList": ["hidden"], + "valueType": "any", + "value": [], + "label": "", + "required": false + }, + { + "key": "system_httpJsonBody", + "renderTypeList": ["hidden"], + "valueType": "any", + "value": "{\n 
\"apikey\": \"{{apikey}}\",\n \"files\": {{files}},\n \"ocr\": {{ocr}}\n}", + "label": "", + "required": false + }, + { + "renderTypeList": ["reference"], + "valueType": "string", + "canEdit": true, + "key": "apikey", + "label": "apikey", + "customInputConfig": { + "selectValueTypeList": [ + "string", + "number", + "boolean", + "object", + "arrayString", + "arrayNumber", + "arrayBoolean", + "arrayObject", + "any", + "chatHistory", + "datasetQuote", + "dynamic", + "selectApp", + "selectDataset" + ], + "showDescription": false, + "showDefaultValue": true + }, + "required": true, + "value": ["pluginInput", "apikey"] + }, + { + "renderTypeList": ["reference"], + "valueType": "arrayString", + "canEdit": true, + "key": "files", + "label": "files", + "customInputConfig": { + "selectValueTypeList": [ + "string", + "number", + "boolean", + "object", + "arrayString", + "arrayNumber", + "arrayBoolean", + "arrayObject", + "any", + "chatHistory", + "datasetQuote", + "dynamic", + "selectApp", + "selectDataset" + ], + "showDescription": false, + "showDefaultValue": true + }, + "required": true, + "value": ["pluginInput", "url"] + }, + { + "renderTypeList": ["reference"], + "valueType": "boolean", + "canEdit": true, + "key": "ocr", + "label": "ocr", + "customInputConfig": { + "selectValueTypeList": [ + "string", + "number", + "boolean", + "object", + "arrayString", + "arrayNumber", + "arrayBoolean", + "arrayObject", + "any", + "chatHistory", + "datasetQuote", + "dynamic", + "selectApp", + "selectDataset" + ], + "showDescription": false, + "showDefaultValue": true + }, + "required": true, + "value": ["pluginInput", "formula"] + } + ], + "outputs": [ + { + "id": "error", + "key": "error", + "label": "请求错误", + "description": "HTTP请求错误信息,成功时返回空", + "valueType": "object", + "type": "static" + }, + { + "id": "httpRawResponse", + "key": "httpRawResponse", + "label": "原始响应", + "required": true, + "description": "HTTP请求的原始响应。只能接受字符串或JSON类型响应数据。", + "valueType": "any", + "type": "static" + 
}, + { + "id": "system_addOutputParam", + "key": "system_addOutputParam", + "type": "dynamic", + "valueType": "dynamic", + "label": "", + "customFieldConfig": { + "selectValueTypeList": [ + "string", + "number", + "boolean", + "object", + "arrayString", + "arrayNumber", + "arrayBoolean", + "arrayObject", + "any", + "chatHistory", + "datasetQuote", + "dynamic", + "selectApp", + "selectDataset" + ], + "showDescription": false, + "showDefaultValue": false + } + }, + { + "id": "xWQuEf50F3mr", + "valueType": "string", + "type": "dynamic", + "key": "result", + "label": "result" + }, + { + "id": "m6CJJj7GFud5", + "valueType": "boolean", + "type": "dynamic", + "key": "success", + "label": "success" + }, + { + "id": "yDxzW5CFalGw", + "valueType": "string", + "type": "dynamic", + "key": "failreason", + "label": "failreason" + } + ] + } + ], + "edges": [ + { + "source": "pluginInput", + "target": "zHG5jJBkXmjB", + "sourceHandle": "pluginInput-source-right", + "targetHandle": "zHG5jJBkXmjB-target-left" + }, + { + "source": "zHG5jJBkXmjB", + "target": "pluginOutput", + "sourceHandle": "zHG5jJBkXmjB-source-right", + "targetHandle": "pluginOutput-target-left" + } + ] + } +} diff --git a/packages/plugins/src/Doc2X/URLImg2text/index.ts b/packages/plugins/src/Doc2X/URLImg2text/index.ts index 60d356505..0f51e702a 100644 --- a/packages/plugins/src/Doc2X/URLImg2text/index.ts +++ b/packages/plugins/src/Doc2X/URLImg2text/index.ts @@ -8,7 +8,6 @@ type Props = { formula: boolean; }; -// Response type same as HTTP outputs type Response = Promise<{ result: string; success: boolean; @@ -41,36 +40,36 @@ const main = async ({ apikey, url, img_correction, formula }: Props): Response = real_api_key = data.data.token; } - //Get the image binary from the URL - const extension = url.split('.').pop()?.toLowerCase(); - const name = url.split('/').pop()?.split('.').shift(); - let mini = ''; - switch (extension) { - case 'jpg': - case 'jpeg': - mini = 'image/jpeg'; - break; - case 'png': - mini = 
'image/png'; - break; - default: - return { - result: `Not supported image format, only support jpg/jpeg/png`, - success: false - }; + let imageResponse; + // Fetch the image and check its content type + try { + imageResponse = await fetch(url); + } catch (e) { + return { + result: `Failed to fetch image from URL: ${url} with error: ${e}`, + success: false + }; } - const response = await fetch(url); - if (!response.ok) { + if (!imageResponse.ok) { return { result: `Failed to fetch image from URL: ${url}`, success: false }; } - const blob = await response.blob(); + const contentType = imageResponse.headers.get('content-type'); + if (!contentType || !contentType.startsWith('image/')) { + return { + result: `The provided URL does not point to an image: ${contentType}`, + success: false + }; + } + + const blob = await imageResponse.blob(); const formData = new FormData(); - formData.append('file', new Blob([blob], { type: mini }), name + '.' + extension); + const fileName = url.split('/').pop()?.split('?')[0] || 'image'; + formData.append('file', blob, fileName); formData.append('img_correction', img_correction ? '1' : '0'); formData.append('equation', formula ? 
'1' : '0'); @@ -135,8 +134,17 @@ const main = async ({ apikey, url, img_correction, formula }: Props): Response = success: false }; } else if (result_data.data.status === 'success') { - let result = result_data.data.result.pages[0].md; - result = result.replace(/\\[\(\)]/g, '$').replace(/\\[\[\]]/g, '$$'); + let result; + try { + result = result_data.data.result.pages[0].md; + result = result.replace(/\\[\(\)]/g, '$').replace(/\\[\[\]]/g, '$$'); + } catch { + // no pages + return { + result: '', + success: true + }; + } return { result: result, success: true diff --git a/packages/plugins/src/Doc2X/URLImg2text/template.json b/packages/plugins/src/Doc2X/URLImg2text/template.json index 0c1a5ef13..77bc4e23e 100644 --- a/packages/plugins/src/Doc2X/URLImg2text/template.json +++ b/packages/plugins/src/Doc2X/URLImg2text/template.json @@ -3,7 +3,7 @@ "version": "488", "name": "Doc2X 图像(URL)识别", "avatar": "plugins/doc2x", - "intro": "将传入的图片(URL)发送至Doc2X进行解析,返回带LaTeX公式的markdown格式的文本", + "intro": "从URL下载图片并发送至Doc2X进行解析,返回带LaTeX公式的markdown格式的文本", "inputExplanationUrl": "https://fael3z0zfze.feishu.cn/wiki/Rkc5witXWiJoi5kORd2cofh6nDg?fromScene=spaceOverview", "showStatus": true, "weight": 10, @@ -220,7 +220,7 @@ "key": "system_httpJsonBody", "renderTypeList": ["hidden"], "valueType": "any", - "value": "{\n \"apikey\": \"{{apikey}}\",\n \"url\": \"{{url}}\",\n \"img_correction\": \"{{img_correction}}\",\n \"formula\": \"{{img_correction}}\"\n}", + "value": "{\n \"apikey\": \"{{apikey}}\",\n \"url\": \"{{url}}\",\n \"img_correction\": {{img_correction}},\n \"formula\": {{formula}}\n}", "label": "", "required": false }, diff --git a/packages/plugins/src/Doc2X/URLPDF2text/index.ts b/packages/plugins/src/Doc2X/URLPDF2text/index.ts index 3d7c6cabc..79ea19fe7 100644 --- a/packages/plugins/src/Doc2X/URLPDF2text/index.ts +++ b/packages/plugins/src/Doc2X/URLPDF2text/index.ts @@ -40,9 +40,35 @@ const main = async ({ apikey, url, ocr }: Props): Response => { real_api_key = data.data.token; } 
- //Get the image binary from the URL + //Fetch the pdf and check its contene type + let PDFResponse; + try { + PDFResponse = await fetch(url); + } catch (e) { + return { + result: `Failed to fetch PDF from URL: ${url} with error: ${e}`, + success: false + }; + } + if (!PDFResponse.ok) { + return { + result: `Failed to fetch PDF from URL: ${url}`, + success: false + }; + } + + const contentType = PDFResponse.headers.get('content-type'); + if (!contentType || !contentType.startsWith('application/pdf')) { + return { + result: `The provided URL does not point to a PDF: ${contentType}`, + success: false + }; + } + + const blob = await PDFResponse.blob(); const formData = new FormData(); - formData.append('pdf_url', url); + const fileName = url.split('/').pop()?.split('?')[0] || 'pdf'; + formData.append('file', blob, fileName); formData.append('ocr', ocr ? '1' : '0'); let upload_url = 'https://api.doc2x.noedgeai.com/api/platform/async/pdf'; diff --git a/packages/plugins/src/Doc2X/URLPDF2text/template.json b/packages/plugins/src/Doc2X/URLPDF2text/template.json index 971846ae7..f3613a279 100644 --- a/packages/plugins/src/Doc2X/URLPDF2text/template.json +++ b/packages/plugins/src/Doc2X/URLPDF2text/template.json @@ -3,7 +3,7 @@ "version": "488", "name": "Doc2X PDF文件(URL)识别", "avatar": "plugins/doc2x", - "intro": "将传入的PDF文件(URL)发送至Doc2X进行解析,返回带LaTeX公式的markdown格式的文本", + "intro": "从URL下载PDF文件,并发送至Doc2X进行解析,返回带LaTeX公式的markdown格式的文本", "inputExplanationUrl": "https://fael3z0zfze.feishu.cn/wiki/Rkc5witXWiJoi5kORd2cofh6nDg?fromScene=spaceOverview", "showStatus": true, "weight": 10, @@ -201,7 +201,7 @@ "key": "system_httpJsonBody", "renderTypeList": ["hidden"], "valueType": "any", - "value": "{\n \"apikey\": \"{{apikey}}\",\n \"url\": \"{{url}}\",\n \"ocr\": \"{{ocr}}\"\n}", + "value": "{\n \"apikey\": \"{{apikey}}\",\n \"url\": \"{{url}}\",\n \"ocr\": {{ocr}}\n}", "label": "", "required": false }, diff --git a/packages/plugins/src/Doc2X/template.json 
b/packages/plugins/src/Doc2X/template.json index de2b17c30..52610d1bb 100644 --- a/packages/plugins/src/Doc2X/template.json +++ b/packages/plugins/src/Doc2X/template.json @@ -3,7 +3,7 @@ "version": "488", "name": "Doc2X服务", "avatar": "plugins/doc2x", - "intro": "传入的URL形式的图片或PDF文件发送至Doc2X进行解析,返回带LaTeX公式的markdown格式的文本。", + "intro": "将传入的图片或PDF文件发送至Doc2X进行解析,返回带LaTeX公式的markdown格式的文本。", "showStatus": true, "weight": 10,