MaxKB/apps/common/handle/impl/csv_split_handle.py
wxg0103 b90995d3aa fix: defect of incorrect document names after importing CSV and docx files into the knowledge base
--bug=1052039 --user=王孝刚 【知识库】-压缩文件中包含csv、docx文件时,导入到知识库后,文档名称包含文件夹名称 https://www.tapd.cn/57709429/s/1651752
2025-02-08 16:00:57 +08:00

73 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
"""
@project: maxkb
@Author
@file csv_parse_qa_handle.py
@date2024/5/21 14:59
@desc:
"""
import csv
import io
import os
from typing import List
from charset_normalizer import detect
from common.handle.base_split_handle import BaseSplitHandle
def post_cell(cell_value):
return cell_value.replace('\n', '<br>').replace('|', '&#124;')
def row_to_md(row):
return '| ' + ' | '.join(
[post_cell(cell) if cell is not None else '' for cell in row]) + ' |\n'
class CsvSplitHandle(BaseSplitHandle):
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
buffer = get_buffer(file)
paragraphs = []
file_name = os.path.basename(file.name)
result = {'name': file_name, 'content': paragraphs}
try:
reader = csv.reader(io.TextIOWrapper(io.BytesIO(buffer), encoding=detect(buffer)['encoding']))
try:
title_row_list = reader.__next__()
title_md_content = row_to_md(title_row_list)
title_md_content += '| ' + ' | '.join(
['---' if cell is not None else '' for cell in title_row_list]) + ' |\n'
except Exception as e:
return result
if len(title_row_list) == 0:
return result
result_item_content = ''
for row in reader:
next_md_content = row_to_md(row)
next_md_content_len = len(next_md_content)
result_item_content_len = len(result_item_content)
if len(result_item_content) == 0:
result_item_content += title_md_content
result_item_content += next_md_content
else:
if result_item_content_len + next_md_content_len < limit:
result_item_content += next_md_content
else:
paragraphs.append({'content': result_item_content, 'title': ''})
result_item_content = next_md_content
if len(result_item_content) > 0:
paragraphs.append({'content': result_item_content, 'title': ''})
return result
except Exception as e:
return result
def get_content(self, file, save_image):
pass
def support(self, file, get_buffer):
file_name: str = file.name.lower()
if file_name.endswith(".csv"):
return True
return False