mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
fix: 处理PDF中出现 \0 字符报 Null characters are not allowed
--bug=1048190 --user=刘瑞斌 【知识库】- 上传PDF文档 报错 ,关联issue #1468 https://www.tapd.cn/57709429/s/1611070
This commit is contained in:
parent
4dd497ea26
commit
e1df4b2857
|
|
@@ -104,6 +104,9 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
|
||||
content += page_content
|
||||
|
||||
# Null characters are not allowed.
|
||||
content = content.replace('\0', '')
|
||||
|
||||
elapsed_time = time.time() - start_time
|
||||
max_kb.debug(
|
||||
f"File: {file.name}, Page: {page_num + 1}, Time : {elapsed_time: .3f}s, content-length: {len(page_content)}")
|
||||
|
|
@@ -156,6 +159,10 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
text = text[:idx]
|
||||
|
||||
chapter_text += text # 提取文本
|
||||
|
||||
# Null characters are not allowed.
|
||||
chapter_text = chapter_text.replace('\0', '')
|
||||
|
||||
# 限制章节内容长度
|
||||
if 0 < limit < len(chapter_text):
|
||||
split_text = PdfSplitHandle.split_text(chapter_text, limit)
|
||||
|
|
@@ -228,6 +235,9 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
text = text[:idx]
|
||||
chapter_text += text
|
||||
|
||||
# Null characters are not allowed.
|
||||
chapter_text = chapter_text.replace('\0', '')
|
||||
|
||||
# 限制章节内容长度
|
||||
if 0 < limit < len(chapter_text):
|
||||
split_text = PdfSplitHandle.split_text(chapter_text, limit)
|
||||
|
|
|
|||
Loading…
Reference in New Issue