fix: 处理PDF中出现 \0 字符报 Null characters are not allowed

--bug=1048190 --user=刘瑞斌 【知识库】- 上传PDF文档 报错  ,关联issue #1468 https://www.tapd.cn/57709429/s/1611070
This commit is contained in:
CaptainB 2024-11-18 12:42:42 +08:00 committed by 刘瑞斌
parent 4dd497ea26
commit e1df4b2857

View File

@@ -104,6 +104,9 @@ class PdfSplitHandle(BaseSplitHandle):
content += page_content
# Null characters are not allowed.
content = content.replace('\0', '')
elapsed_time = time.time() - start_time
max_kb.debug(
f"File: {file.name}, Page: {page_num + 1}, Time : {elapsed_time: .3f}s, content-length: {len(page_content)}")
@@ -156,6 +159,10 @@ class PdfSplitHandle(BaseSplitHandle):
text = text[:idx]
chapter_text += text # 提取文本
# Null characters are not allowed.
chapter_text = chapter_text.replace('\0', '')
# 限制章节内容长度
if 0 < limit < len(chapter_text):
split_text = PdfSplitHandle.split_text(chapter_text, limit)
@@ -228,6 +235,9 @@ class PdfSplitHandle(BaseSplitHandle):
text = text[:idx]
chapter_text += text
# Null characters are not allowed.
chapter_text = chapter_text.replace('\0', '')
# 限制章节内容长度
if 0 < limit < len(chapter_text):
split_text = PdfSplitHandle.split_text(chapter_text, limit)