mirror of
https://github.com/labring/FastGPT.git
synced 2025-12-25 20:02:47 +00:00
Some checks are pending
Document deploy / sync-images (push) Waiting to run
Document deploy / generate-timestamp (push) Blocked by required conditions
Document deploy / build-images (map[domain:https://fastgpt.cn suffix:cn]) (push) Blocked by required conditions
Document deploy / build-images (map[domain:https://fastgpt.io suffix:io]) (push) Blocked by required conditions
Document deploy / update-images (map[deployment:fastgpt-docs domain:https://fastgpt.cn kube_config:KUBE_CONFIG_CN suffix:cn]) (push) Blocked by required conditions
Document deploy / update-images (map[deployment:fastgpt-docs domain:https://fastgpt.io kube_config:KUBE_CONFIG_IO suffix:io]) (push) Blocked by required conditions
Build FastGPT images in Personal warehouse / get-vars (push) Waiting to run
Build FastGPT images in Personal warehouse / build-fastgpt-images (map[arch:amd64 runs-on:ubuntu-24.04]) (push) Blocked by required conditions
Build FastGPT images in Personal warehouse / build-fastgpt-images (map[arch:arm64 runs-on:ubuntu-24.04-arm]) (push) Blocked by required conditions
Build FastGPT images in Personal warehouse / release-fastgpt-images (push) Blocked by required conditions
* perf: html2md * perf: index * Add model log * update next version * log index
93 lines
2.7 KiB
TypeScript
93 lines
2.7 KiB
TypeScript
import TurndownService from 'turndown';
|
|
import { type ImageType } from '../readFile/type';
|
|
import { getNanoid } from '@fastgpt/global/common/string/tools';
|
|
import { simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
|
|
// @ts-ignore
|
|
const turndownPluginGfm = require('joplin-turndown-plugin-gfm');
|
|
|
|
/** Size limit for HTML handled by this module: 100k characters. */
const MAX_HTML_SIZE = 100_000;
|
|
|
|
/**
 * Replace inline base64 images in an HTML string with short placeholder ids.
 *
 * Every `src="data:<mime>;base64,<data>"` attribute is rewritten to
 * `src="IMAGE_<nanoid>_IMAGE"` and the extracted payload is collected, so the
 * (potentially very large) base64 text is not carried through later processing.
 *
 * @param htmlContent - HTML that may contain base64 data URIs in double-quoted `src` attributes.
 * @returns `processedHtml` (the rewritten HTML) and `images`, one
 *   `{ uuid, base64, mime }` entry per replaced attribute, in match order.
 */
const processBase64Images = (htmlContent: string) => {
  // Regex notes:
  // 1. The mime group excludes `;`, `,` and `"`, so it can never run past the
  //    media type. With a plain `[^;]+` a non-base64 data URI earlier in the
  //    document could be glued to a later `;base64,` and the markup in between
  //    would be swallowed by a single match.
  // 2. The payload group uses the exact base64 alphabet [A-Za-z0-9+/=]+ to
  //    avoid backtracking.
  // 3. Only the mime type and the base64 data are captured.
  const base64Regex = /src="data:([^;,"]+);base64,([A-Za-z0-9+/=]+)"/g;

  const images: ImageType[] = [];

  const processedHtml = htmlContent.replace(
    base64Regex,
    (_match, mime: string, base64Data: string) => {
      // The placeholder is what later stages see instead of the data URI.
      const uuid = `IMAGE_${getNanoid(12)}_IMAGE`;
      images.push({
        uuid,
        base64: base64Data,
        mime
      });
      return `src="${uuid}"`;
    }
  );

  return { processedHtml, images };
};
|
|
|
|
/**
 * Convert an HTML string to Markdown.
 *
 * Inline base64 images are first swapped for `IMAGE_<id>_IMAGE` placeholders
 * (see `processBase64Images`) and returned separately in `imageList`, so the
 * Markdown text does not carry the base64 payloads.
 *
 * @param html - The HTML to convert.
 * @returns
 *   - `rawText`: the Markdown produced by Turndown and passed through
 *     `simpleMarkdownText`. If the placeholder-substituted HTML is longer than
 *     `MAX_HTML_SIZE`, conversion is skipped and that HTML is returned as-is.
 *     On any conversion error this is an empty string.
 *   - `imageList`: the base64 images extracted from `html`, matching the
 *     placeholders left in `rawText`; empty on error.
 */
export const html2md = (
  html: string
): {
  rawText: string;
  imageList: ImageType[];
} => {
  const turndownService = new TurndownService({
    headingStyle: 'atx',
    bulletListMarker: '-',
    codeBlockStyle: 'fenced',
    fence: '```',
    emDelimiter: '_',
    strongDelimiter: '**',
    linkStyle: 'inlined',
    linkReferenceStyle: 'full'
  });

  try {
    // Base64 img to id, otherwise it will occupy memory when going to md
    const { processedHtml, images } = processBase64Images(html);

    // If the HTML is too large, skip the Markdown conversion and return the
    // placeholder-substituted HTML. The extracted images are returned with it,
    // otherwise the placeholders in the text could never be resolved.
    if (processedHtml.length > MAX_HTML_SIZE) {
      return { rawText: processedHtml, imageList: images };
    }

    // Drop these elements (and their content) from the output entirely.
    turndownService.remove(['i', 'script', 'iframe', 'style']);
    turndownService.use(turndownPluginGfm.gfm);

    // add custom handling for media tag
    turndownService.addRule('media', {
      filter: ['video', 'source', 'audio'],
      replacement: function (content, node) {
        const mediaNode = node as HTMLVideoElement | HTMLAudioElement | HTMLSourceElement;
        // Prefer the element's own src; fall back to its first nested <source>.
        const src = mediaNode.getAttribute('src');
        const sources = mediaNode.getElementsByTagName('source');
        const firstSourceSrc = sources.length > 0 ? sources[0].getAttribute('src') : null;
        const mediaSrc = src || firstSourceSrc;

        if (mediaSrc) {
          // Emit the media URL as a Markdown link whose text is the URL itself.
          return `[${mediaSrc}](${mediaSrc}) `;
        }

        return content;
      }
    });

    const md = turndownService.turndown(processedHtml);

    return {
      rawText: simpleMarkdownText(md),
      imageList: images
    };
  } catch (error) {
    // Best effort: a failed conversion yields an empty result instead of throwing.
    console.log('html 2 markdown error', error);
    return {
      rawText: '',
      imageList: []
    };
  }
};
|