mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
chore: 解析错误时输出错误原因
This commit is contained in:
parent
ec4fe833b1
commit
2a87af6172
|
|
@ -28,9 +28,9 @@ default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
|||
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*"),
|
||||
re.compile("(?<!\n)\n\n+")]
|
||||
|
||||
|
||||
max_kb = logging.getLogger("max_kb")
|
||||
|
||||
|
||||
class PdfSplitHandle(BaseSplitHandle):
|
||||
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
||||
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
|
||||
|
|
@ -60,6 +60,13 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
|
||||
loader = PyPDFLoader(page_num_pdf, extract_images=True)
|
||||
page_content = "\n" + loader.load()[0].page_content
|
||||
except NotImplementedError as e:
|
||||
# Unsupported file format: re-raise and abort immediately
|
||||
raise e
|
||||
except BaseException as e:
|
||||
# If a page fails, move on to the next page so that one bad page does not make parsing of the whole file fail
|
||||
max_kb.error(f"File: {file.name}, Page: {page_num + 1}, error: {e}")
|
||||
continue
|
||||
finally:
|
||||
os.remove(page_num_pdf)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue