mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
fix: handle string type for limit and improve error logging in pdf_split_handle
--bug=1057493 --user=刘瑞斌 【知识库】上传文档,使用高级分段报错 https://www.tapd.cn/62980211/s/1720110
This commit is contained in:
parent
049c0e0bb0
commit
82a2203be6
|
|
@ -8,10 +8,12 @@
|
|||
"""
|
||||
import csv
|
||||
import io
|
||||
import traceback
|
||||
|
||||
from charset_normalizer import detect
|
||||
|
||||
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
|
||||
from common.utils.logger import maxkb_logger
|
||||
|
||||
|
||||
def read_csv_standard(file_path):
|
||||
|
|
@ -56,4 +58,5 @@ class CsvParseQAHandle(BaseParseQAHandle):
|
|||
'problem_list': problem_list})
|
||||
return [{'name': file.name, 'paragraphs': paragraph_list}]
|
||||
except Exception as e:
|
||||
maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
|
||||
return [{'name': file.name, 'paragraphs': []}]
|
||||
|
|
|
|||
|
|
@ -6,10 +6,12 @@
|
|||
@date:2024/5/21 14:59
|
||||
@desc:
|
||||
"""
|
||||
import traceback
|
||||
|
||||
import xlrd
|
||||
|
||||
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
|
||||
from common.utils.logger import maxkb_logger
|
||||
|
||||
|
||||
def handle_sheet(file_name, sheet):
|
||||
|
|
@ -58,4 +60,5 @@ class XlsParseQAHandle(BaseParseQAHandle):
|
|||
sheet.name, sheet) for sheet
|
||||
in worksheets] if row is not None]
|
||||
except Exception as e:
|
||||
maxkb_logger.error(f"Error processing XLS file {file.name}: {e}, {traceback.format_exc()}")
|
||||
return [{'name': file.name, 'paragraphs': []}]
|
||||
|
|
|
|||
|
|
@ -7,11 +7,13 @@
|
|||
@desc:
|
||||
"""
|
||||
import io
|
||||
import traceback
|
||||
|
||||
import openpyxl
|
||||
|
||||
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
|
||||
from common.handle.impl.common_handle import xlsx_embed_cells_images
|
||||
from common.utils.logger import maxkb_logger
|
||||
|
||||
|
||||
def handle_sheet(file_name, sheet, image_dict):
|
||||
|
|
@ -69,4 +71,5 @@ class XlsxParseQAHandle(BaseParseQAHandle):
|
|||
sheet.title, sheet, image_dict) for sheet
|
||||
in worksheets] if row is not None]
|
||||
except Exception as e:
|
||||
maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
|
||||
return [{'name': file.name, 'paragraphs': []}]
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
# coding=utf-8
|
||||
import logging
|
||||
import traceback
|
||||
|
||||
from charset_normalizer import detect
|
||||
|
||||
|
|
@ -19,7 +20,7 @@ class CsvParseTableHandle(BaseParseTableHandle):
|
|||
try:
|
||||
content = buffer.decode(detect(buffer)['encoding'])
|
||||
except BaseException as e:
|
||||
maxkb_logger.error(f'csv split handle error: {e}')
|
||||
maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
|
||||
return [{'name': file.name, 'paragraphs': []}]
|
||||
|
||||
csv_model = content.split('\n')
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
# coding=utf-8
|
||||
import logging
|
||||
import traceback
|
||||
|
||||
import xlrd
|
||||
|
||||
|
|
@ -55,7 +56,7 @@ class XlsParseTableHandle(BaseParseTableHandle):
|
|||
result.append({'name': sheet.name, 'paragraphs': paragraphs})
|
||||
|
||||
except BaseException as e:
|
||||
maxkb_logger.error(f'excel split handle error: {e}')
|
||||
maxkb_logger.error(f"Error processing XLS file {file.name}: {e}, {traceback.format_exc()}")
|
||||
return [{'name': file.name, 'paragraphs': []}]
|
||||
return result
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
# coding=utf-8
|
||||
import io
|
||||
import logging
|
||||
import traceback
|
||||
|
||||
from openpyxl import load_workbook
|
||||
|
||||
|
|
@ -73,7 +74,7 @@ class XlsxParseTableHandle(BaseParseTableHandle):
|
|||
result.append({'name': sheetname, 'paragraphs': paragraphs})
|
||||
|
||||
except BaseException as e:
|
||||
maxkb_logger.error(f'excel split handle error: {e}')
|
||||
maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
|
||||
return [{'name': file.name, 'paragraphs': []}]
|
||||
return result
|
||||
|
||||
|
|
|
|||
|
|
@ -9,11 +9,13 @@
|
|||
import csv
|
||||
import io
|
||||
import os
|
||||
import traceback
|
||||
from typing import List
|
||||
|
||||
from charset_normalizer import detect
|
||||
|
||||
from common.handle.base_split_handle import BaseSplitHandle
|
||||
from common.utils.logger import maxkb_logger
|
||||
|
||||
|
||||
def post_cell(cell_value):
|
||||
|
|
@ -60,6 +62,7 @@ class CsvSplitHandle(BaseSplitHandle):
|
|||
paragraphs.append({'content': result_item_content, 'title': ''})
|
||||
return result
|
||||
except Exception as e:
|
||||
maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
|
||||
return result
|
||||
|
||||
def get_content(self, file, save_image):
|
||||
|
|
|
|||
|
|
@ -7,7 +7,6 @@
|
|||
@desc:
|
||||
"""
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import traceback
|
||||
|
|
@ -155,7 +154,7 @@ class DocSplitHandle(BaseSplitHandle):
|
|||
return title
|
||||
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
maxkb_logger.error(f"Error processing DOC file: {e}, {traceback.format_exc()}")
|
||||
return paragraph.text
|
||||
return get_paragraph_txt(paragraph, doc, images_list, get_image_id)
|
||||
|
||||
|
|
@ -207,12 +206,15 @@ class DocSplitHandle(BaseSplitHandle):
|
|||
else:
|
||||
split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
|
||||
except BaseException as e:
|
||||
traceback.print_exception(e)
|
||||
return {'name': file_name,
|
||||
'content': []}
|
||||
return {'name': file_name,
|
||||
'content': split_model.parse(content)
|
||||
}
|
||||
maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
|
||||
return {
|
||||
'name': file_name,
|
||||
'content': []
|
||||
}
|
||||
return {
|
||||
'name': file_name,
|
||||
'content': split_model.parse(content)
|
||||
}
|
||||
|
||||
def support(self, file, get_buffer):
|
||||
file_name: str = file.name.lower()
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ from charset_normalizer import detect
|
|||
from html2text import html2text
|
||||
|
||||
from common.handle.base_split_handle import BaseSplitHandle
|
||||
from common.utils.logger import maxkb_logger
|
||||
from common.utils.split_model import SplitModel
|
||||
|
||||
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
||||
|
|
@ -55,11 +56,15 @@ class HTMLSplitHandle(BaseSplitHandle):
|
|||
content = buffer.decode(encoding)
|
||||
content = html2text(content)
|
||||
except BaseException as e:
|
||||
return {'name': file.name,
|
||||
'content': []}
|
||||
return {'name': file.name,
|
||||
'content': split_model.parse(content)
|
||||
}
|
||||
maxkb_logger.error(f"Error processing HTML file {file.name}: {e}, {traceback.format_exc()}")
|
||||
|
||||
return {
|
||||
'name': file.name, 'content': []
|
||||
}
|
||||
return {
|
||||
'name': file.name,
|
||||
'content': split_model.parse(content)
|
||||
}
|
||||
|
||||
def get_content(self, file, save_image):
|
||||
buffer = file.read()
|
||||
|
|
|
|||
|
|
@ -6,7 +6,6 @@
|
|||
@date:2024/3/27 18:19
|
||||
@desc:
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
|
@ -31,7 +30,6 @@ default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
|||
re.compile("(?<!\n)\n\n+")]
|
||||
|
||||
|
||||
|
||||
def check_links_in_pdf(doc):
|
||||
for page_number in range(len(doc)):
|
||||
page = doc[page_number]
|
||||
|
|
@ -54,6 +52,8 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
|
||||
pdf_document = fitz.open(temp_file_path)
|
||||
try:
|
||||
if type(limit) is str:
|
||||
limit = int(limit)
|
||||
# 处理有目录的pdf
|
||||
result = self.handle_toc(pdf_document, limit)
|
||||
if result is not None:
|
||||
|
|
@ -72,17 +72,20 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
else:
|
||||
split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
|
||||
except BaseException as e:
|
||||
maxkb_logger.error(f"File: {file.name}, error: {e}")
|
||||
return {'name': file.name,
|
||||
'content': []}
|
||||
maxkb_logger.error(f"File: {file.name}, error: {e}, {traceback.format_exc()}")
|
||||
return {
|
||||
'name': file.name,
|
||||
'content': []
|
||||
}
|
||||
finally:
|
||||
pdf_document.close()
|
||||
# 处理完后可以删除临时文件
|
||||
os.remove(temp_file_path)
|
||||
|
||||
return {'name': file.name,
|
||||
'content': split_model.parse(content)
|
||||
}
|
||||
return {
|
||||
'name': file.name,
|
||||
'content': split_model.parse(content)
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def handle_pdf_content(file, pdf_document):
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ from typing import List
|
|||
from charset_normalizer import detect
|
||||
|
||||
from common.handle.base_split_handle import BaseSplitHandle
|
||||
from common.utils.logger import maxkb_logger
|
||||
from common.utils.split_model import SplitModel
|
||||
|
||||
default_pattern_list = [
|
||||
|
|
@ -47,6 +48,7 @@ class TextSplitHandle(BaseSplitHandle):
|
|||
try:
|
||||
content = buffer.decode(detect(buffer)['encoding'])
|
||||
except BaseException as e:
|
||||
maxkb_logger.error(f"Error processing TEXT file {file.name}: {e}, {traceback.format_exc()}")
|
||||
return {'name': file.name, 'content': []}
|
||||
return {'name': file.name, 'content': split_model.parse(content)}
|
||||
|
||||
|
|
|
|||
|
|
@ -6,11 +6,13 @@
|
|||
@date:2024/5/21 14:59
|
||||
@desc:
|
||||
"""
|
||||
import traceback
|
||||
from typing import List
|
||||
|
||||
import xlrd
|
||||
|
||||
from common.handle.base_split_handle import BaseSplitHandle
|
||||
from common.utils.logger import maxkb_logger
|
||||
|
||||
|
||||
def post_cell(cell_value):
|
||||
|
|
@ -58,6 +60,8 @@ class XlsSplitHandle(BaseSplitHandle):
|
|||
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
||||
buffer = get_buffer(file)
|
||||
try:
|
||||
if type(limit) is str:
|
||||
limit = int(limit)
|
||||
workbook = xlrd.open_workbook(file_contents=buffer)
|
||||
worksheets = workbook.sheets()
|
||||
worksheets_size = len(worksheets)
|
||||
|
|
@ -67,6 +71,7 @@ class XlsSplitHandle(BaseSplitHandle):
|
|||
sheet.name, sheet, limit) for sheet
|
||||
in worksheets] if row is not None]
|
||||
except Exception as e:
|
||||
maxkb_logger.error(f"Error processing XLS file {file.name}: {e}, {traceback.format_exc()}")
|
||||
return [{'name': file.name, 'content': []}]
|
||||
|
||||
def get_content(self, file, save_image):
|
||||
|
|
|
|||
|
|
@ -7,12 +7,14 @@
|
|||
@desc:
|
||||
"""
|
||||
import io
|
||||
import traceback
|
||||
from typing import List
|
||||
|
||||
import openpyxl
|
||||
|
||||
from common.handle.base_split_handle import BaseSplitHandle
|
||||
from common.handle.impl.common_handle import xlsx_embed_cells_images
|
||||
from common.utils.logger import maxkb_logger
|
||||
|
||||
|
||||
def post_cell(image_dict, cell_value):
|
||||
|
|
@ -64,6 +66,8 @@ class XlsxSplitHandle(BaseSplitHandle):
|
|||
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
||||
buffer = get_buffer(file)
|
||||
try:
|
||||
if type(limit) is str:
|
||||
limit = int(limit)
|
||||
workbook = openpyxl.load_workbook(io.BytesIO(buffer))
|
||||
try:
|
||||
image_dict: dict = xlsx_embed_cells_images(io.BytesIO(buffer))
|
||||
|
|
@ -80,6 +84,7 @@ class XlsxSplitHandle(BaseSplitHandle):
|
|||
sheet.title, sheet, image_dict, limit) for sheet
|
||||
in worksheets] if row is not None]
|
||||
except Exception as e:
|
||||
maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
|
||||
return [{'name': file.name, 'content': []}]
|
||||
|
||||
def get_content(self, file, save_image):
|
||||
|
|
|
|||
|
|
@ -119,6 +119,8 @@ def filter_image_file(result_list: list, image_list):
|
|||
|
||||
class ZipSplitHandle(BaseSplitHandle):
|
||||
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
||||
if type(limit) is str:
|
||||
limit = int(limit)
|
||||
buffer = get_buffer(file)
|
||||
bytes_io = io.BytesIO(buffer)
|
||||
result = []
|
||||
|
|
|
|||
Loading…
Reference in New Issue