FastGPT/test/cases/service/worker/htmlStr2Md.test.ts
Archer a499d05a02
Some checks are pending
Document deploy / sync-images (push) Waiting to run
Document deploy / generate-timestamp (push) Blocked by required conditions
Document deploy / build-images (map[domain:https://fastgpt.cn suffix:cn]) (push) Blocked by required conditions
Document deploy / build-images (map[domain:https://fastgpt.io suffix:io]) (push) Blocked by required conditions
Document deploy / update-images (map[deployment:fastgpt-docs domain:https://fastgpt.cn kube_config:KUBE_CONFIG_CN suffix:cn]) (push) Blocked by required conditions
Document deploy / update-images (map[deployment:fastgpt-docs domain:https://fastgpt.io kube_config:KUBE_CONFIG_IO suffix:io]) (push) Blocked by required conditions
Build FastGPT images in Personal warehouse / get-vars (push) Waiting to run
Build FastGPT images in Personal warehouse / build-fastgpt-images (map[arch:amd64 runs-on:ubuntu-24.04]) (push) Blocked by required conditions
Build FastGPT images in Personal warehouse / build-fastgpt-images (map[arch:arm64 runs-on:ubuntu-24.04-arm]) (push) Blocked by required conditions
Build FastGPT images in Personal warehouse / release-fastgpt-images (push) Blocked by required conditions
V4.14.0 features (#5850)
* feat: migrate chat files to s3 (#5802)

* feat: migrate chat files to s3

* feat: add delete jobs for deleting s3 files

* chore: improvements

* fix: lockfile

* fix: imports

* feat: add ttl for those uploaded files but not send yet

* feat: init bullmq worker

* fix: s3 key

* perf: s3 internal url

* remove env

* fix: re-sign a new url

* fix: re-sign a new url

* perf: s3 code

---------

Co-authored-by: archer <545436317@qq.com>

* update package

* feat: add more file type for uploading (#5807)

* fix: re-sign a new url

* wip: file selector

* feat: add more file type for uploading

* feat: migrate chat files to s3 (#5802)

* feat: migrate chat files to s3

* feat: add delete jobs for deleting s3 files

* chore: improvements

* fix: lockfile

* fix: imports

* feat: add ttl for those uploaded files but not send yet

* feat: init bullmq worker

* fix: s3 key

* perf: s3 internal url

* remove env

* fix: re-sign a new url

* fix: re-sign a new url

* perf: s3 code

---------

Co-authored-by: archer <545436317@qq.com>

* fix: limit minmax available file upload number

* perf: file select modal code

* fix: fileselect refresh

* fix: ts

---------

Co-authored-by: archer <545436317@qq.com>

* bugfix: chat page (#5809)

* fix: upload avatar

* fix: chat page username display issue and setting button visibility

* doc

* Markdown match base64 performance

* feat: improve global variables(time, file, dataset) (#5804)

* feat: improve global variables(time, file, dataset)

* feat: optimize code

* perf: time variables code

* fix: model, file

* fix: hide file upload

* fix: ts

* hide dataset select

---------

Co-authored-by: archer <545436317@qq.com>

* perf: insert training queue

* perf: s3 upload error i18n

* fix: share page s3

* fix: timeselector ui error

* var update node

* Timepicker ui

* feat: plugin support password

* fix: password disabled UX

* fix: button size

* fix: no model cache for chat page (#5820)

* rename function

* fix: workflow bug

* fix: interactive loop

* fix test

* perf: common textarea no richtext

* move system plugin config (#5803) (#5813)

* move system plugin config (#5803)

* move system plugin config

* extract tag bar

* filter

* tool detail temp

* marketplace

* params

* fix

* type

* search

* tags render

* status

* ui

* code

* connect to backend (#5815)

* feat: marketplace apis & type definitions (#5817)

* chore: marketplace init

* chore: marketplace list api type

* chore: detail api

* marketplace & import

* feat: marketplace ui (#5826)

* temp

* marketplace

* import

* feat: detail return readme

* chore: cache data expire 10 mins

* chore: update docs

* feat: marketplace ui

---------

Co-authored-by: heheer <zhiyu44@qq.com>

* feat: marketplace (#5830)

* temp

* marketplace

* chore: tool list tag filter

* chore: adjust

---------

Co-authored-by: heheer <zhiyu44@qq.com>

* tool detail drawer

* remove tag filter

* fix

* fix

* fix build

* update pnpm-lock

* fix type

* perf code

* marketplace router

* fix build

* navbar icon

* fix ui

* fix init

* docs: marketplace/plugin (#5832)

* temp

* marketplace

* docs(plugin): system tool docs

---------

Co-authored-by: heheer <zhiyu44@qq.com>

* default url

* feat: i18n/ docker build (#5833)

* chore: docker build

* feat: i18n selector

* fix

* fix

* fix: i18n parse

* fix: i18n parse

---------

Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
Co-authored-by: heheer <zhiyu44@qq.com>

* marketplace url

* update action

* market place code

* market place code

* title

* fix: nextconfig

* fix: copilot review

* Remove bypassable regex-based XSS sanitization from marketplace search (#5835)

* Initial plan

* Remove problematic regex-based XSS sanitization from search inputs

Co-authored-by: c121914yu <50446880+c121914yu@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: c121914yu <50446880+c121914yu@users.noreply.github.com>

* feat: tool tag openapi

* api check

* fix: tsc

* fix: ts

* fix: lock

* sdk version

* ts

* sdk version

* remove invalid tip

* perf: export data add timezone

* perf: admin plugin api move

* perf: tool code

* move tag code

* perf: marketplace and team plugin code

* remove workflow invalid request

* rename global tool code

* rename global tool code

* rename api

* fix some bugs (#5841)

* fix some bugs

* fix

* perf: Tag filter

* fix: ts

* fix: ts

---------

Co-authored-by: archer <545436317@qq.com>

* perf: Concat function

* fix: workflow snapshot push

* fix: ts type

* fix: login to config/*

* fix: ts

* fix: model avatar (#5848)

* fix: model avatar

* fix: ts

* fix: avatar migration to s3

* update lock

* fix: avatar redirect

---------

Co-authored-by: archer <545436317@qq.com>

* fix tool detail (#5847)

* fix tool detail

* init script

* fix build

* perf: plugin detail modal

* change tooltags to tags

* fix icon

---------

Co-authored-by: archer <545436317@qq.com>

* fix tag filter scroll (#5852)

* fix create app plugin & import info (#5853)

* tag size

* rename toolkit

* download url

* import plugin status (#5854)

* init doc

* fix: init shell

---------

Co-authored-by: 伍闲犬 <whoeverimf5@gmail.com>
Co-authored-by: Zeng Qingwen <143274079+fishwww-ww@users.noreply.github.com>
Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
Co-authored-by: heheer <zhiyu44@qq.com>
Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com>
2025-11-04 16:58:12 +08:00

318 lines
10 KiB
TypeScript

import { describe, it, expect } from 'vitest';
import { html2md } from '@fastgpt/service/worker/htmlStr2Md/utils';
describe('html2md 性能和功能测试', () => {
// Performance baselines: upper bounds in milliseconds that the timing tests
// in this suite compare their measured durations against.
const PERFORMANCE_THRESHOLDS = {
  // Small documents should finish within 100 ms.
  smallHtml: 100,
  // Medium-sized documents should finish within 500 ms.
  mediumHtml: 500,
  // A large base64 image should finish within 2 s (after optimization).
  largeBase64: 2000
} as const;
describe('功能正确性', () => {
  // Plain markup: bold text is expected as `**…**` and no images are reported.
  it('应该正确处理简单的 HTML', () => {
    const { rawText, imageList } = html2md('<p>Hello <strong>World</strong></p>');

    expect(rawText).toContain('Hello');
    expect(rawText).toContain('**World**');
    expect(imageList).toHaveLength(0);
  });

  // A single inline data-URI image must surface in `imageList` with its
  // payload, MIME type and an `IMAGE_<alphanumerics>_IMAGE` placeholder id.
  it('应该正确提取 base64 图片', () => {
    const payload =
      'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==';
    const { imageList } = html2md('<img src="data:image/png;base64,' + payload + '" alt="test">');

    expect(imageList).toHaveLength(1);
    const extracted = imageList[0];
    expect(extracted.base64).toBe(payload);
    expect(extracted.mime).toBe('image/png');
    expect(extracted.uuid).toMatch(/^IMAGE_[a-zA-Z0-9]+_IMAGE$/);
  });

  // Three inline images with different MIME types must be returned in
  // document order.
  it('应该处理多个 base64 图片', () => {
    const payload = 'iVBORw0KGgo=';
    const mimeTypes = ['image/png', 'image/jpeg', 'image/gif'];
    const imgTags = mimeTypes.map((mime) => `<img src="data:${mime};base64,${payload}">`);
    const { imageList } = html2md(`\n${imgTags.join('\n')}\n`);

    expect(imageList).toHaveLength(3);
    expect(imageList.map(({ mime }) => mime)).toEqual(mimeTypes);
  });

  // Table cells must survive the conversion and the output must use pipes.
  it('应该正确处理表格', () => {
    const tableHtml = [
      '',
      '<table>',
      '<tr><td>Cell 1</td><td>Cell 2</td></tr>',
      '<tr><td>Cell 3</td><td>Cell 4</td></tr>',
      '</table>',
      ''
    ].join('\n');
    const { rawText } = html2md(tableHtml);

    expect(rawText).toContain('Cell 1');
    expect(rawText).toContain('Cell 2');
    expect(rawText).toContain('|'); // Markdown table syntax
  });

  // Script and style bodies must not leak into the converted text.
  it('应该移除 script 和 style 标签', () => {
    const pageHtml = [
      '',
      '<p>Visible content</p>',
      "<script>alert('should be removed')</script>",
      '<style>body { color: red; }</style>',
      ''
    ].join('\n');
    const { rawText } = html2md(pageHtml);

    expect(rawText).toContain('Visible content');
    expect(rawText).not.toContain('alert');
    expect(rawText).not.toContain('color: red');
  });

  // The source URL of a <video> element must appear in the output text.
  it('应该处理视频标签', () => {
    const videoUrl = 'https://example.com/video.mp4';
    const { rawText } = html2md(`<video src="${videoUrl}"></video>`);

    expect(rawText).toContain(videoUrl);
  });
});
describe('性能测试', () => {
  // Converts `html` once and returns the conversion result together with the
  // elapsed time in milliseconds. `performance.now()` is used instead of
  // `Date.now()` because it is monotonic and sub-millisecond, so a wall-clock
  // adjustment during the run cannot skew (or negate) the measurement. It also
  // matches the timer already used by the batch-stability test in this file.
  const timeConversion = (html: string) => {
    const startedAt = performance.now();
    const result = html2md(html);
    return { result, elapsedMs: performance.now() - startedAt };
  };

  // One paragraph holding 1000 repetitions of a short phrase.
  it('小型 HTML 文档性能(~10KB)', () => {
    const html = '<p>' + 'Hello World '.repeat(1000) + '</p>';
    const { result, elapsedMs } = timeConversion(html);

    expect(result.rawText).toContain('Hello World');
    expect(elapsedMs).toBeLessThan(PERFORMANCE_THRESHOLDS.smallHtml);
  });

  // 5000 sibling paragraphs inside a single <div>.
  it('中等大小 HTML 文档性能(~50KB)', () => {
    const html = '<div>' + '<p>Content </p>'.repeat(5000) + '</div>';
    const { result, elapsedMs } = timeConversion(html);

    expect(result.rawText).toContain('Content');
    expect(elapsedMs).toBeLessThan(PERFORMANCE_THRESHOLDS.mediumHtml);
  });

  // A single inline image carrying about 1 MB of base64 payload; the payload
  // must be returned unchanged.
  it('大型 base64 图片性能(~1MB)', () => {
    const base64Data = 'A'.repeat(1024 * 1024);
    const html = `<img src="data:image/png;base64,${base64Data}">`;
    const { result, elapsedMs } = timeConversion(html);

    expect(result.imageList).toHaveLength(1);
    expect(result.imageList[0].base64).toBe(base64Data);
    expect(elapsedMs).toBeLessThan(PERFORMANCE_THRESHOLDS.largeBase64);
  });

  // Three inline images of about 500 KB each; the time budget is twice the
  // single-image threshold.
  it('多个大型 base64 图片性能', () => {
    const base64Data = 'B'.repeat(500 * 1024);
    const imgTags = ['image/png', 'image/jpeg', 'image/gif'].map(
      (mime) => `<img src="data:${mime};base64,${base64Data}">`
    );
    const html = `\n${imgTags.join('\n')}\n`;
    const { result, elapsedMs } = timeConversion(html);

    expect(result.imageList).toHaveLength(3);
    expect(elapsedMs).toBeLessThan(PERFORMANCE_THRESHOLDS.largeBase64 * 2);
  });

  // 50 levels of <div><table><tr><td> wrapped around a single text node.
  it('深度嵌套 HTML 性能', () => {
    const depth = 50;
    const html =
      '<div><table><tr><td>'.repeat(depth) +
      'Deep content' +
      '</td></tr></table></div>'.repeat(depth);
    const { result, elapsedMs } = timeConversion(html);

    expect(result.rawText).toContain('Deep content');
    expect(elapsedMs).toBeLessThan(1000); // should finish within 1 second
  });
});
describe('防御性功能', () => {
  // Input one character longer than 100 * 1000 is expected to come back
  // verbatim, with no images extracted.
  it('应该拒绝超大 HTML 文档', () => {
    const oversizedLength = 100 * 1000 + 1;
    const oversized = 'x'.repeat(oversizedLength);
    const { rawText, imageList } = html2md(oversized);

    expect(rawText).toBe(oversized);
    expect(imageList).toHaveLength(0);
  });

  // An empty string in gives an empty string and an empty image list out.
  it('应该处理空 HTML', () => {
    const { rawText, imageList } = html2md('');

    expect(rawText).toBe('');
    expect(imageList).toHaveLength(0);
  });

  // Unclosed tags must not crash the converter; it should still extract as
  // much content as it can.
  it('应该处理无效的 HTML', () => {
    const { rawText } = html2md('<div><p>Unclosed tags');

    expect(rawText).toBeTruthy();
  });

  // HTML entities are decoded, so the escaped script tags show up literally
  // in the output text.
  it('应该处理包含特殊字符的 HTML', () => {
    const { rawText } = html2md('<p>&lt;script&gt;alert("xss")&lt;/script&gt;</p>');

    for (const decodedTag of ['<script>', '</script>']) {
      expect(rawText).toContain(decodedTag);
    }
  });
});
describe('边界情况', () => {
  // Whitespace-only input collapses to an empty result.
  it('应该处理只包含空白的 HTML', () => {
    const { rawText, imageList } = html2md(' \n\n\t ');

    expect(rawText).toBe('');
    expect(imageList).toHaveLength(0);
  });

  // CJK text, an emoji and right-to-left text must all pass through intact.
  it('应该处理包含 Unicode 字符的 HTML', () => {
    const { rawText } = html2md('<p>你好世界 🌍 مرحبا</p>');

    expect(rawText).toContain('你好世界');
    expect(rawText).toContain('🌍');
    expect(rawText).toContain('مرحبا');
  });

  // Only the data-URI image is extracted; the remote image URL stays in the
  // text.
  it('应该正确处理混合的 base64 和普通图片', () => {
    const payload = 'iVBORw0KGgo=';
    const remoteUrl = 'https://example.com/image.jpg';
    const mixedHtml = [
      '',
      `<img src="data:image/png;base64,${payload}">`,
      `<img src="${remoteUrl}">`,
      ''
    ].join('\n');
    const { rawText, imageList } = html2md(mixedHtml);

    expect(imageList).toHaveLength(1); // only the base64 image is extracted
    expect(rawText).toContain(remoteUrl); // the plain URL stays in the text
  });

  // The same data-URI image appears twice, separated by a paragraph.
  it('应该去重重复的图片', () => {
    const payload = 'iVBORw0KGgo=';
    const imgTag = `<img src="data:image/png;base64,${payload}">`;
    const { imageList } = html2md(['', imgTag, '<p>Some text</p>', imgTag, ''].join('\n'));

    // Note (from the original author): the current implementation generates a
    // new UUID for every base64 image; real deduplication would need extra
    // logic. The assertion therefore only requires at least one image.
    expect(imageList.length).toBeGreaterThanOrEqual(1);
  });
});
describe('正则表达式优化验证', () => {
  // Every character of the standard base64 alphabet, plus '=' padding, must be
  // captured as one intact payload.
  it('优化后的正则应该正确匹配合法的 base64', () => {
    const validBase64 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=';
    const html = `<img src="data:image/png;base64,${validBase64}">`;
    const result = html2md(html);

    expect(result.imageList).toHaveLength(1);
    expect(result.imageList[0].base64).toBe(validBase64);
  });

  // A payload containing characters outside the base64 alphabet must not
  // break the conversion.
  it('优化后的正则应该处理非法的 base64 字符', () => {
    // '@' and '#' are not valid base64 characters.
    const invalidBase64 = 'ABC@123#XYZ';
    const html = `<img src="data:image/png;base64,${invalidBase64}">`;
    const result = html2md(html);

    // Note (from the original author): matchMdImg may still extract this image
    // from the Markdown because it uses the looser pattern [^)]+ . The number
    // of extracted images is therefore left unconstrained here. The previous
    // `length >= 0` check could never fail, so this asserts that a real array
    // was returned instead.
    expect(Array.isArray(result.imageList)).toBe(true);
    expect(result.rawText).toBeTruthy();
  });

  // Trailing '=' characters belong to the payload and must be kept.
  it('应该处理 base64 末尾的填充字符', () => {
    const base64WithPadding = 'iVBORw0KGgo==';
    const html = `<img src="data:image/png;base64,${base64WithPadding}">`;
    const result = html2md(html);

    expect(result.imageList).toHaveLength(1);
    expect(result.imageList[0].base64).toBe(base64WithPadding);
  });
});
describe('实例复用验证', () => {
  // Instance identity is not observable through `html2md`, so this only checks
  // that two consecutive calls both succeed and each return their own content.
  it('多次调用应该使用相同的 TurndownService 实例', () => {
    const first = html2md('<p>Test 1</p>');
    const second = html2md('<p>Test 2</p>');

    expect(first.rawText).toContain('Test 1');
    expect(second.rawText).toContain('Test 2');
  });

  // Converts ten tiny documents and checks that the per-call latency is both
  // low and reasonably uniform.
  it('批量转换性能应该稳定', () => {
    const htmlTemplates = Array.from({ length: 10 }, (_, i) => `<p>Content ${i}</p>`);

    // Unmeasured warm-up call: keeps one-time costs (e.g. JIT compilation or
    // lazy initialisation) out of the samples. With only 10 samples, a single
    // slow outlier can push the standard deviation up to 3x the mean, which
    // would trip the 2x stability bound below for reasons unrelated to
    // steady-state behaviour.
    html2md('<p>warm-up</p>');

    const durations = htmlTemplates.map((html) => {
      const start = performance.now();
      html2md(html);
      return performance.now() - start;
    });

    // Mean latency across the batch.
    const avgDuration = durations.reduce((a, b) => a + b, 0) / durations.length;

    // Every call should be fast; the bound is deliberately relaxed to 100 ms.
    expect(avgDuration).toBeLessThan(100);

    // Stability: the standard deviation should not be too large. It is only
    // checked when the mean is above zero, because a zero mean would make the
    // relative bound unsatisfiable.
    if (avgDuration > 0) {
      const variance =
        durations.reduce((sum, d) => sum + (d - avgDuration) ** 2, 0) / durations.length;
      const stdDev = Math.sqrt(variance);

      // Allow up to 200% of the mean, since test environments can be noisy.
      expect(stdDev).toBeLessThan(avgDuration * 2.0);
    }
  });
});
});