fix: handle string type for limit and improve error logging in pdf_split_handle

--bug=1057493 --user=刘瑞斌 【知识库】上传文档,使用高级分段报错 https://www.tapd.cn/62980211/s/1720110
This commit is contained in:
CaptainB 2025-06-30 12:40:11 +08:00
parent 049c0e0bb0
commit 82a2203be6
14 changed files with 63 additions and 24 deletions

View File

@ -8,10 +8,12 @@
"""
import csv
import io
import traceback
from charset_normalizer import detect
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
from common.utils.logger import maxkb_logger
def read_csv_standard(file_path):
@ -56,4 +58,5 @@ class CsvParseQAHandle(BaseParseQAHandle):
'problem_list': problem_list})
return [{'name': file.name, 'paragraphs': paragraph_list}]
except Exception as e:
maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
return [{'name': file.name, 'paragraphs': []}]

View File

@ -6,10 +6,12 @@
@date2024/5/21 14:59
@desc:
"""
import traceback
import xlrd
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
from common.utils.logger import maxkb_logger
def handle_sheet(file_name, sheet):
@ -58,4 +60,5 @@ class XlsParseQAHandle(BaseParseQAHandle):
sheet.name, sheet) for sheet
in worksheets] if row is not None]
except Exception as e:
maxkb_logger.error(f"Error processing XLS file {file.name}: {e}, {traceback.format_exc()}")
return [{'name': file.name, 'paragraphs': []}]

View File

@ -7,11 +7,13 @@
@desc:
"""
import io
import traceback
import openpyxl
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
from common.handle.impl.common_handle import xlsx_embed_cells_images
from common.utils.logger import maxkb_logger
def handle_sheet(file_name, sheet, image_dict):
@ -69,4 +71,5 @@ class XlsxParseQAHandle(BaseParseQAHandle):
sheet.title, sheet, image_dict) for sheet
in worksheets] if row is not None]
except Exception as e:
maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
return [{'name': file.name, 'paragraphs': []}]

View File

@ -1,5 +1,6 @@
# coding=utf-8
import logging
import traceback
from charset_normalizer import detect
@ -19,7 +20,7 @@ class CsvParseTableHandle(BaseParseTableHandle):
try:
content = buffer.decode(detect(buffer)['encoding'])
except BaseException as e:
maxkb_logger.error(f'csv split handle error: {e}')
maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
return [{'name': file.name, 'paragraphs': []}]
csv_model = content.split('\n')

View File

@ -1,5 +1,6 @@
# coding=utf-8
import logging
import traceback
import xlrd
@ -55,7 +56,7 @@ class XlsParseTableHandle(BaseParseTableHandle):
result.append({'name': sheet.name, 'paragraphs': paragraphs})
except BaseException as e:
maxkb_logger.error(f'excel split handle error: {e}')
maxkb_logger.error(f"Error processing XLS file {file.name}: {e}, {traceback.format_exc()}")
return [{'name': file.name, 'paragraphs': []}]
return result

View File

@ -1,6 +1,7 @@
# coding=utf-8
import io
import logging
import traceback
from openpyxl import load_workbook
@ -73,7 +74,7 @@ class XlsxParseTableHandle(BaseParseTableHandle):
result.append({'name': sheetname, 'paragraphs': paragraphs})
except BaseException as e:
maxkb_logger.error(f'excel split handle error: {e}')
maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
return [{'name': file.name, 'paragraphs': []}]
return result

View File

@ -9,11 +9,13 @@
import csv
import io
import os
import traceback
from typing import List
from charset_normalizer import detect
from common.handle.base_split_handle import BaseSplitHandle
from common.utils.logger import maxkb_logger
def post_cell(cell_value):
@ -60,6 +62,7 @@ class CsvSplitHandle(BaseSplitHandle):
paragraphs.append({'content': result_item_content, 'title': ''})
return result
except Exception as e:
maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
return result
def get_content(self, file, save_image):

View File

@ -7,7 +7,6 @@
@desc:
"""
import io
import logging
import os
import re
import traceback
@ -155,7 +154,7 @@ class DocSplitHandle(BaseSplitHandle):
return title
except Exception as e:
traceback.print_exc()
maxkb_logger.error(f"Error processing DOC file: {e}, {traceback.format_exc()}")
return paragraph.text
return get_paragraph_txt(paragraph, doc, images_list, get_image_id)
@ -207,12 +206,15 @@ class DocSplitHandle(BaseSplitHandle):
else:
split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
except BaseException as e:
traceback.print_exception(e)
return {'name': file_name,
'content': []}
return {'name': file_name,
'content': split_model.parse(content)
}
maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
return {
'name': file_name,
'content': []
}
return {
'name': file_name,
'content': split_model.parse(content)
}
def support(self, file, get_buffer):
file_name: str = file.name.lower()

View File

@ -15,6 +15,7 @@ from charset_normalizer import detect
from html2text import html2text
from common.handle.base_split_handle import BaseSplitHandle
from common.utils.logger import maxkb_logger
from common.utils.split_model import SplitModel
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
@ -55,11 +56,15 @@ class HTMLSplitHandle(BaseSplitHandle):
content = buffer.decode(encoding)
content = html2text(content)
except BaseException as e:
return {'name': file.name,
'content': []}
return {'name': file.name,
'content': split_model.parse(content)
}
maxkb_logger.error(f"Error processing HTML file {file.name}: {e}, {traceback.format_exc()}")
return {
'name': file.name, 'content': []
}
return {
'name': file.name,
'content': split_model.parse(content)
}
def get_content(self, file, save_image):
buffer = file.read()

View File

@ -6,7 +6,6 @@
@date2024/3/27 18:19
@desc:
"""
import logging
import os
import re
import tempfile
@ -31,7 +30,6 @@ default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
re.compile("(?<!\n)\n\n+")]
def check_links_in_pdf(doc):
for page_number in range(len(doc)):
page = doc[page_number]
@ -54,6 +52,8 @@ class PdfSplitHandle(BaseSplitHandle):
pdf_document = fitz.open(temp_file_path)
try:
if type(limit) is str:
limit = int(limit)
# 处理有目录的pdf
result = self.handle_toc(pdf_document, limit)
if result is not None:
@ -72,17 +72,20 @@ class PdfSplitHandle(BaseSplitHandle):
else:
split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
except BaseException as e:
maxkb_logger.error(f"File: {file.name}, error: {e}")
return {'name': file.name,
'content': []}
maxkb_logger.error(f"File: {file.name}, error: {e}, {traceback.format_exc()}")
return {
'name': file.name,
'content': []
}
finally:
pdf_document.close()
# 处理完后可以删除临时文件
os.remove(temp_file_path)
return {'name': file.name,
'content': split_model.parse(content)
}
return {
'name': file.name,
'content': split_model.parse(content)
}
@staticmethod
def handle_pdf_content(file, pdf_document):

View File

@ -13,6 +13,7 @@ from typing import List
from charset_normalizer import detect
from common.handle.base_split_handle import BaseSplitHandle
from common.utils.logger import maxkb_logger
from common.utils.split_model import SplitModel
default_pattern_list = [
@ -47,6 +48,7 @@ class TextSplitHandle(BaseSplitHandle):
try:
content = buffer.decode(detect(buffer)['encoding'])
except BaseException as e:
maxkb_logger.error(f"Error processing TEXT file {file.name}: {e}, {traceback.format_exc()}")
return {'name': file.name, 'content': []}
return {'name': file.name, 'content': split_model.parse(content)}

View File

@ -6,11 +6,13 @@
@date2024/5/21 14:59
@desc:
"""
import traceback
from typing import List
import xlrd
from common.handle.base_split_handle import BaseSplitHandle
from common.utils.logger import maxkb_logger
def post_cell(cell_value):
@ -58,6 +60,8 @@ class XlsSplitHandle(BaseSplitHandle):
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
buffer = get_buffer(file)
try:
if type(limit) is str:
limit = int(limit)
workbook = xlrd.open_workbook(file_contents=buffer)
worksheets = workbook.sheets()
worksheets_size = len(worksheets)
@ -67,6 +71,7 @@ class XlsSplitHandle(BaseSplitHandle):
sheet.name, sheet, limit) for sheet
in worksheets] if row is not None]
except Exception as e:
maxkb_logger.error(f"Error processing XLS file {file.name}: {e}, {traceback.format_exc()}")
return [{'name': file.name, 'content': []}]
def get_content(self, file, save_image):

View File

@ -7,12 +7,14 @@
@desc:
"""
import io
import traceback
from typing import List
import openpyxl
from common.handle.base_split_handle import BaseSplitHandle
from common.handle.impl.common_handle import xlsx_embed_cells_images
from common.utils.logger import maxkb_logger
def post_cell(image_dict, cell_value):
@ -64,6 +66,8 @@ class XlsxSplitHandle(BaseSplitHandle):
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
buffer = get_buffer(file)
try:
if type(limit) is str:
limit = int(limit)
workbook = openpyxl.load_workbook(io.BytesIO(buffer))
try:
image_dict: dict = xlsx_embed_cells_images(io.BytesIO(buffer))
@ -80,6 +84,7 @@ class XlsxSplitHandle(BaseSplitHandle):
sheet.title, sheet, image_dict, limit) for sheet
in worksheets] if row is not None]
except Exception as e:
maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
return [{'name': file.name, 'content': []}]
def get_content(self, file, save_image):

View File

@ -119,6 +119,8 @@ def filter_image_file(result_list: list, image_list):
class ZipSplitHandle(BaseSplitHandle):
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
if type(limit) is str:
limit = int(limit)
buffer = get_buffer(file)
bytes_io = io.BytesIO(buffer)
result = []