feat: add initial implementations of various file handling classes for CSV, XLS, and XLSX formats

This commit is contained in:
CaptainB 2025-04-30 14:14:41 +08:00
parent c8ce7e28d8
commit 48297d81e5
36 changed files with 2427 additions and 1 deletions

View File

@ -218,6 +218,14 @@ class PermissionConstants(Enum):
RoleConstants.USER])
KNOWLEDGE_DELETE = Permission(group=Group.KNOWLEDGE, operate=Operate.DELETE, role_list=[RoleConstants.ADMIN,
RoleConstants.USER])
# Fix copy-paste defect: every DOCUMENT_* permission was created with
# Operate.DELETE; each entry must carry its own operation so permission
# checks can distinguish read/create/edit from delete.
DOCUMENT_READ = Permission(group=Group.KNOWLEDGE, operate=Operate.READ, role_list=[RoleConstants.ADMIN,
                                                                                   RoleConstants.USER])
DOCUMENT_CREATE = Permission(group=Group.KNOWLEDGE, operate=Operate.CREATE, role_list=[RoleConstants.ADMIN,
                                                                                       RoleConstants.USER])
DOCUMENT_EDIT = Permission(group=Group.KNOWLEDGE, operate=Operate.EDIT, role_list=[RoleConstants.ADMIN,
                                                                                   RoleConstants.USER])
DOCUMENT_DELETE = Permission(group=Group.KNOWLEDGE, operate=Operate.DELETE, role_list=[RoleConstants.ADMIN,
                                                                                       RoleConstants.USER])
def get_workspace_application_permission(self):
return lambda r, kwargs: Permission(group=self.value.group, operate=self.value.operate,

View File

@ -0,0 +1,8 @@
# coding=utf-8
"""
@project: qabot
@Author
@file __init__.py.py
@date2023/9/6 10:09
@desc:
"""

View File

@ -0,0 +1,52 @@
# coding=utf-8
"""
@project: maxkb
@Author
@file base_parse_qa_handle.py
@date2024/5/21 14:56
@desc:
"""
from abc import ABC, abstractmethod
def get_row_value(row, title_row_index_dict, field):
    """Return the cell of *row* in the column mapped to *field*.

    Returns None when the field has no mapped column or the row is too
    short to contain that column.
    """
    col = title_row_index_dict.get(field)
    if col is not None and col < len(row):
        return row[col]
    return None
def get_title_row_index_dict(title_row_list):
    """Map the logical QA fields ('title', 'content', 'problem_list') to column indexes.

    Positional defaults are assigned from the column count, then overridden by
    any recognized header text ('分段标题' / '分段内容' / '问题').

    :param title_row_list: the header row of the sheet/CSV
    :return: dict of field name -> column index
    """
    title_row_index_dict = {}
    if len(title_row_list) == 1:
        # single column: it can only hold the content
        title_row_index_dict['content'] = 0
    elif len(title_row_list) == 2:
        # Bug fix: this branch previously repeated `== 1`, so two-column
        # sheets fell into the else-branch and got an out-of-range
        # 'problem_list' column.
        title_row_index_dict['title'] = 0
        title_row_index_dict['content'] = 1
    else:
        title_row_index_dict['title'] = 0
        title_row_index_dict['content'] = 1
        title_row_index_dict['problem_list'] = 2
    # Recognized header text wins over the positional defaults.
    for index, title_row in enumerate(title_row_list):
        if title_row is None:
            title_row = ''
        if title_row.startswith('分段标题'):
            title_row_index_dict['title'] = index
        if title_row.startswith('分段内容'):
            title_row_index_dict['content'] = index
        if title_row.startswith('问题'):
            title_row_index_dict['problem_list'] = index
    return title_row_index_dict
class BaseParseQAHandle(ABC):
    """Interface for QA-file parsers.

    Implementations declare whether they can parse a given file and turn it
    into QA paragraph records.
    """

    @abstractmethod
    def support(self, file, get_buffer):
        """Return True when this handler can parse *file*."""
        ...

    @abstractmethod
    def handle(self, file, get_buffer, save_image):
        """Parse *file* into QA paragraphs; persist extracted images via *save_image*."""
        ...

View File

@ -0,0 +1,23 @@
# coding=utf-8
"""
@project: maxkb
@Author
@file base_parse_qa_handle.py
@date2024/5/21 14:56
@desc:
"""
from abc import ABC, abstractmethod
class BaseParseTableHandle(ABC):
    """Interface for table-file parsers (xls/xlsx/csv style sources)."""

    @abstractmethod
    def support(self, file, get_buffer):
        """Return True when this handler can parse *file*."""
        ...

    @abstractmethod
    def handle(self, file, get_buffer, save_image):
        """Parse *file* into paragraph records; persist extracted images via *save_image*."""
        ...

    @abstractmethod
    def get_content(self, file, save_image):
        """Return the file's full textual content without splitting."""
        ...

View File

@ -0,0 +1,24 @@
# coding=utf-8
"""
@project: maxkb
@Author
@file base_split_handle.py
@date2024/3/27 18:13
@desc:
"""
from abc import ABC, abstractmethod
from typing import List
class BaseSplitHandle(ABC):
    """Interface for document splitters that break an uploaded file into paragraphs."""

    @abstractmethod
    def support(self, file, get_buffer):
        """Return True when this handler can split *file*."""
        ...

    @abstractmethod
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """Split *file* into paragraphs of at most *limit* characters using *pattern_list*."""
        ...

    @abstractmethod
    def get_content(self, file, save_image):
        """Return the file's full textual content without splitting."""
        ...

View File

@ -0,0 +1,30 @@
# coding=utf-8
"""
@project: MaxKB
@Author
@file base_to_response.py
@date2024/9/6 16:04
@desc:
"""
from abc import ABC, abstractmethod
from rest_framework import status
class BaseToResponse(ABC):
    """Contract for serializing chat output, either as one blocking response
    or as a sequence of server-sent-event stream chunks."""

    @abstractmethod
    def to_block_response(self, chat_id, chat_record_id, content, is_end, completion_tokens,
                          prompt_tokens, other_params: dict = None,
                          _status=status.HTTP_200_OK):
        """Build the one-shot (non-streaming) response body."""
        ...

    @abstractmethod
    def to_stream_chunk_response(self, chat_id, chat_record_id, node_id, up_node_id_list, content, is_end,
                                 completion_tokens,
                                 prompt_tokens, other_params: dict = None):
        """Build one streaming chunk of the response."""
        ...

    @staticmethod
    def format_stream_chunk(response_str):
        """Frame *response_str* as an SSE 'data:' line terminated by a blank line."""
        return ''.join(('data: ', response_str, '\n\n'))

View File

@ -0,0 +1,94 @@
# coding=utf-8
"""
@project: qabot
@Author
@file handle_exception.py
@date2023/9/5 19:29
@desc:
"""
import logging
import traceback
from rest_framework.exceptions import ValidationError, ErrorDetail, APIException
from rest_framework.views import exception_handler
from common.exception.app_exception import AppApiException
from django.utils.translation import gettext_lazy as _
from common.result import result
def to_result(key, args, parent_key=None):
    """
    Convert a validation-exception payload into a unified Result.
    :param key: the field key being validated
    :param args: the validation exception's args
    :param parent_key: parent field key, used to build a dotted key path
    :return: API response object
    """
    # Pick the first entry under `key` that is either an ErrorDetail or a
    # non-empty dict; when args is empty, substitute a generic unknown error.
    error_detail = list(filter(
        lambda d: True if isinstance(d, ErrorDetail) else True if isinstance(d, dict) and len(
            d.keys()) > 0 else False,
        (args[0] if len(args) > 0 else {key: [ErrorDetail(_('Unknown exception'), code='unknown')]}).get(key)))[0]
    # Nested dict: recurse one level down, accumulating the dotted parent path.
    if isinstance(error_detail, dict):
        return list(map(lambda k: to_result(k, args=[error_detail],
                                            parent_key=key if parent_key is None else parent_key + '.' + key),
                        error_detail.keys() if len(error_detail) > 0 else []))[0]
    # Non-integer error codes fall back to HTTP-500-style code 500.
    # NOTE(review): the required-field message below opens with '】' but has no
    # matching '【' before the key — looks like a typo; confirm intended text.
    return result.Result(500 if isinstance(error_detail.code, str) else error_detail.code,
                         message=f"{key if parent_key is None else parent_key + '.' + key}】为必填参数" if str(
                             error_detail) == "This field is required." else error_detail)
def validation_error_to_result(exc: ValidationError):
    """
    Convert a DRF validation exception into a unified error response.
    :param exc: the validation exception
    :return: API response object
    """
    try:
        detail = find_err_detail(exc.detail)
        message = exc.detail if detail is None else detail
        return result.error(str(message))
    except Exception:
        # Fall back to the raw detail when the search itself fails.
        return result.error(str(exc.detail))
def find_err_detail(exc_detail):
    """Depth-first search of a DRF error structure for the first ErrorDetail.

    Returns None when no ErrorDetail can be found.
    """
    if isinstance(exc_detail, ErrorDetail):
        return exc_detail
    if isinstance(exc_detail, dict):
        for value in exc_detail.values():
            # Descend into the first list/dict value, mirroring DRF's nesting.
            if isinstance(value, list):
                return find_err_detail(value)
            if isinstance(value, ErrorDetail):
                return value
            if isinstance(value, dict) and value:
                return find_err_detail(value)
    if isinstance(exc_detail, list):
        for item in exc_detail:
            found = find_err_detail(item)
            if found is not None:
                return found
    return None
def handle_exception(exc, context):
    """Project-wide DRF exception hook mapping known exception families onto
    the unified Result body."""
    exc_type = exc.__class__
    # Let DRF build its standard error response first.
    response = exception_handler(exc, context)
    # Custom handling for the exception families we know about.
    if issubclass(exc_type, ValidationError):
        return validation_error_to_result(exc)
    if issubclass(exc_type, AppApiException):
        return result.Result(exc.code, exc.message, response_status=exc.status_code)
    if issubclass(exc_type, APIException):
        return result.error(exc.detail)
    if response is None:
        # Unknown exception: log the traceback and wrap the message.
        logging.getLogger("max_kb_error").error(f'{str(exc)}:{traceback.format_exc()}')
        return result.error(str(exc))
    return response

View File

@ -0,0 +1,8 @@
# coding=utf-8
"""
@project: qabot
@Author
@file __init__.py.py
@date2023/9/6 10:09
@desc:
"""

View File

@ -0,0 +1,116 @@
# coding=utf-8
"""
@project: MaxKB
@Author
@file tools.py
@date2024/9/11 16:41
@desc:
"""
import io
import uuid_utils.compat as uuid
from functools import reduce
from io import BytesIO
from xml.etree.ElementTree import fromstring
from zipfile import ZipFile
from PIL import Image as PILImage
from openpyxl.drawing.image import Image as openpyxl_Image
from openpyxl.packaging.relationship import get_rels_path, get_dependents
from openpyxl.xml.constants import SHEET_DRAWING_NS, REL_NS, SHEET_MAIN_NS
from common.handle.base_parse_qa_handle import get_title_row_index_dict, get_row_value
from knowledge.models import File
def parse_element(element) -> {}:
    """Map picture names (cNvPr) to their relationship ids (r:embed) from a
    parsed cellimages.xml tree."""
    mapping = {}
    ns = "{%s}" % SHEET_DRAWING_NS
    for node in level_order_traversal(element, ns + "nvPicPr"):
        name = rel_id = ""
        for child in node:
            if child.tag == ns + "nvPicPr":
                name = child[0].attrib["name"]
            elif child.tag == ns + "blipFill":
                rel_id = child[0].attrib["{%s}embed" % REL_NS]
        if name:
            mapping[name] = rel_id
    return mapping
def parse_element_sheet_xml(element) -> []:
    """Collect the formula texts (<f> elements) referenced by a worksheet XML tree."""
    formula_tag = "{%s}" % SHEET_MAIN_NS + "f"
    formulas = []
    for node in level_order_traversal(element, formula_tag):
        formulas.extend(child.text for child in node if child.tag == formula_tag)
    return formulas
def level_order_traversal(root, flag: str) -> []:
    """Breadth-first search collecting nodes that have a direct child tagged *flag*.

    Matched nodes are not descended into; all other nodes contribute their
    children to the queue.
    """
    matches = []
    pending = [root]
    while pending:
        current = pending.pop(0)
        if any(child.tag == flag for child in current):
            matches.append(current)
        else:
            pending.extend(current)
    return matches
def handle_images(deps, archive: ZipFile) -> []:
    """Build openpyxl Image objects for each dependency readable from *archive*.

    Unreadable entries are reported and skipped.
    """
    images = []
    if not PILImage:  # Pillow not installed: drop all images
        return images
    for dep in deps:
        try:
            img = openpyxl_Image(BytesIO(archive.read(dep.target)))
        except Exception as e:
            print(e)
            continue
        img.embed = dep.id        # relationship id (rId) of the file
        img.target = dep.target   # path of the member inside the archive
        images.append(img)
    return images
def xlsx_embed_cells_images(buffer) -> {}:
    """Extract cell-embedded images (WPS-style xl/cellimages.xml) from an xlsx.

    Returns a dict mapping the cell formula text (prefixed with '=') to a
    knowledge File record holding the JPEG bytes.
    """
    archive = ZipFile(buffer)
    # Resolve the relationships of cellimages.xml into Image objects.
    deps = get_dependents(archive, get_rels_path("xl/cellimages.xml"))
    image_rel = handle_images(deps=deps, archive=archive)
    # Worksheets and the image ids (formula texts) referenced in each.
    sheet_list = {}
    for item in archive.namelist():
        if not item.startswith('xl/worksheets/sheet'):
            continue
        # 'xl/worksheets/sheet3.xml' -> '3'
        key = item.split('/')[-1].split('.')[0].split('sheet')[-1]
        sheet_list[key] = parse_element_sheet_xml(fromstring(archive.read(item)))
    # cNvPr name -> relationship id, then resolve ids to Image objects.
    cell_images_xml = parse_element(fromstring(archive.read("xl/cellimages.xml")))
    cell_images_rel = {}
    for image in image_rel:
        cell_images_rel[image.embed] = image
    for cnv, embed in cell_images_xml.items():
        cell_images_xml[cnv] = cell_images_rel.get(embed)
    result = {}
    for key, img in cell_images_xml.items():
        # Find the worksheet formula texts that reference this image name.
        image_excel_id_list = [_xl for _xl in
                               reduce(lambda x, y: [*x, *y], [sheet for sheet_id, sheet in sheet_list.items()], []) if
                               key in _xl]
        if len(image_excel_id_list) > 0:
            image_excel_id = image_excel_id_list[-1]
            # NOTE(review): `f` is never closed; consider a with-statement.
            f = archive.open(img.target)
            img_byte = io.BytesIO()
            # Re-encode as RGB JPEG regardless of source format.
            im = PILImage.open(f).convert('RGB')
            im.save(img_byte, format='JPEG')
            image = File(id=uuid.uuid7(), file_name=img.path, meta={'debug': False, 'content': img_byte.getvalue()})
            result['=' + image_excel_id] = image
    archive.close()
    return result

View File

@ -0,0 +1,72 @@
# coding=utf-8
"""
@project: maxkb
@Author
@file csv_parse_qa_handle.py
@date2024/5/21 14:59
@desc:
"""
import csv
import io
import os
from typing import List
from charset_normalizer import detect
from common.handle.base_split_handle import BaseSplitHandle
def post_cell(cell_value):
    """Escape a CSV cell for use inside a markdown table cell:
    newlines become <br> and pipes become the &#124; entity."""
    return cell_value.translate(str.maketrans({'\n': '<br>', '|': '&#124;'}))


def row_to_md(row):
    """Render one CSV row as a single markdown table line (None cells become empty)."""
    cells = ['' if cell is None else post_cell(cell) for cell in row]
    return '| ' + ' | '.join(cells) + ' |\n'
class CsvSplitHandle(BaseSplitHandle):
    """Split a CSV file into markdown-table paragraphs of at most *limit* characters.

    Each paragraph repeats the header row so every chunk is a valid table.
    """

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """Split *file*; returns {'name': file name, 'content': [paragraph dicts]}.

        Best-effort: any parse failure yields the (possibly partial) result
        rather than raising.
        """
        buffer = get_buffer(file)
        paragraphs = []
        file_name = os.path.basename(file.name)
        result = {'name': file_name, 'content': paragraphs}
        try:
            reader = csv.reader(io.TextIOWrapper(io.BytesIO(buffer), encoding=detect(buffer)['encoding']))
            try:
                # Idiom fix: use next(reader) instead of reader.__next__().
                title_row_list = next(reader)
                title_md_content = row_to_md(title_row_list)
                # Markdown separator row under the header.
                title_md_content += '| ' + ' | '.join(
                    ['---' if cell is not None else '' for cell in title_row_list]) + ' |\n'
            except Exception:
                # Empty or undecodable file: return the empty result.
                return result
            if len(title_row_list) == 0:
                return result
            result_item_content = ''
            for row in reader:
                next_md_content = row_to_md(row)
                if not result_item_content:
                    # Start a new paragraph with the header block.
                    result_item_content = title_md_content + next_md_content
                elif len(result_item_content) + len(next_md_content) < limit:
                    result_item_content += next_md_content
                else:
                    # Paragraph full: flush it and start the next one.
                    paragraphs.append({'content': result_item_content, 'title': ''})
                    result_item_content = title_md_content + next_md_content
            if result_item_content:
                paragraphs.append({'content': result_item_content, 'title': ''})
            return result
        except Exception:
            return result

    def get_content(self, file, save_image):
        # Full-content extraction is not implemented for CSV.
        pass

    def support(self, file, get_buffer):
        """True when the file has a .csv extension (case-insensitive)."""
        file_name: str = file.name.lower()
        return file_name.endswith(".csv")

View File

@ -0,0 +1,235 @@
# coding=utf-8
"""
@project: maxkb
@Author
@file text_split_handle.py
@date2024/3/27 18:19
@desc:
"""
import io
import os
import re
import traceback
import uuid_utils.compat as uuid
from functools import reduce
from typing import List
from docx import Document, ImagePart
from docx.oxml import ns
from docx.table import Table
from docx.text.paragraph import Paragraph
from common.handle.base_split_handle import BaseSplitHandle
from common.utils.split_model import SplitModel
from knowledge.models import File
from django.utils.translation import gettext_lazy as _
# Markdown heading patterns (levels 1-6) used when the caller supplies no
# custom split patterns; each matches a heading at line start or after '\n'.
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
                        re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
                        re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
# Legacy (pre-2007) Word documents embed images via the VML namespace.
old_docx_nsmap = {'v': 'urn:schemas-microsoft-com:vml'}
# python-docx's namespace map extended with the legacy VML prefix.
combine_nsmap = {**ns.nsmap, **old_docx_nsmap}
def image_to_mode(image, doc: Document, images_list, get_image_id):
    """Resolve one extracted image reference to a markdown image link.

    Registers the image bytes in *images_list* (once per uuid) and returns
    '![](/api/image/<uuid>)', or None when the part is not an image.
    """
    image_ids = image['get_image_id_handle'](image.get('image'))
    for img_id in image_ids:  # relationship ids of the picture
        part = doc.part.related_parts[img_id]  # look the actual part up by id
        if isinstance(part, ImagePart):
            image_uuid = get_image_id(img_id)
            if len([i for i in images_list if i.id == image_uuid]) == 0:
                image = File(id=image_uuid, file_name=part.filename, meta={'debug': False, 'content': part.blob})
                images_list.append(image)
            return f'![](/api/image/{image_uuid})'
        # NOTE(review): returning here means only the FIRST relationship id is
        # ever inspected — confirm that is intentional.
        return None
    return None
def get_paragraph_element_images(paragraph_element, doc: Document, images_list, get_image_id):
    """Collect DrawingML (<pic:pic>) and legacy VML (<w:pict>) images under a
    paragraph element, pairing each with the xpath handler that yields its
    relationship ids."""
    xpath_specs = [(".//pic:pic", lambda img: img.xpath('.//a:blip/@r:embed')),
                   (".//w:pict", lambda img: img.xpath('.//v:imagedata/@r:id', namespaces=combine_nsmap))]
    found = []
    for xpath, id_handle in xpath_specs:
        try:
            for image in (paragraph_element.xpath(xpath) or []):
                found.append({'image': image, 'get_image_id_handle': id_handle})
        except Exception:
            # Some elements reject these xpaths; treat as "no images here".
            pass
    return found
def images_to_string(images, doc: Document, images_list, get_image_id):
    """Concatenate the markdown links of all resolvable images into one string."""
    rendered = [image_to_mode(image, doc, images_list, get_image_id) for image in images]
    return "".join(md for md in rendered if md is not None)
def get_paragraph_element_txt(paragraph_element, doc: Document, images_list, get_image_id):
    """Render one paragraph child element: markdown image links if it holds
    images, otherwise its plain text (empty string on error)."""
    try:
        images = get_paragraph_element_images(paragraph_element, doc, images_list, get_image_id)
        if len(images) > 0:
            return images_to_string(images, doc, images_list, get_image_id)
        if paragraph_element.text is not None:
            return paragraph_element.text
        return ""
    except Exception as e:
        print(e)
        return ""
def get_paragraph_txt(paragraph: Paragraph, doc: Document, images_list, get_image_id):
    """Text of a whole paragraph: each child element rendered in order."""
    try:
        parts = [get_paragraph_element_txt(child, doc, images_list, get_image_id) for child in paragraph._element]
        return "".join(parts)
    except Exception:
        return ""
def get_cell_text(cell, doc: Document, images_list, get_image_id):
    """Flatten a table cell to one line of text; newlines become </br> so the
    markdown table row stays intact (empty string on error)."""
    try:
        text = "".join(
            get_paragraph_txt(paragraph, doc, images_list, get_image_id) for paragraph in cell.paragraphs)
        return text.replace("\n", '</br>')
    except Exception:
        return ""
def get_image_id_func():
    """Return a memoizing id generator: the same source image id always maps
    to the same uuid7, so duplicate images are stored once."""
    cache = {}

    def get_image_id(image_id):
        if image_id not in cache:
            cache[image_id] = uuid.uuid7()
        return cache[image_id]

    return get_image_id
# Font-size ranges (pt, [min, max)) used to infer a heading level when no
# heading style is set: index 0 -> level 1, index 1 -> level 2.
title_font_list = [
    [36, 100],
    [30, 36]
]
def get_title_level(paragraph: Paragraph):
    """Return the heading level (1-based) of *paragraph*, or None for body text.

    Detection order: named heading styles first ('Heading N', localized
    Chinese '标题'/'TOC 标题' styles), then a font-size heuristic for
    single-run paragraphs.
    """
    try:
        if paragraph.style is not None:
            psn = paragraph.style.name
            if psn.startswith('Heading') or psn.startswith('TOC 标题') or psn.startswith('标题'):
                # Strip the style-name prefix; the remaining digits are the level.
                return int(psn.replace("Heading ", '').replace('TOC 标题', '').replace('标题',
                                                                                     ''))
        # Fallback: a single large-font run is treated as a heading.
        if len(paragraph.runs) == 1:
            font_size = paragraph.runs[0].font.size
            pt = font_size.pt
            if pt >= 30:
                for _value, index in zip(title_font_list, range(len(title_font_list))):
                    if pt >= _value[0] and pt < _value[1]:
                        return index + 1
    except Exception as e:
        # Any style/font access failure simply means "not a heading".
        pass
    return None
class DocSplitHandle(BaseSplitHandle):
    """Split .doc/.docx documents into titled markdown paragraphs, extracting
    embedded images as File records."""

    @staticmethod
    def paragraph_to_md(paragraph: Paragraph, doc: Document, images_list, get_image_id):
        """Render a paragraph: headings become '#'-prefixed markdown titles
        (with any images appended); other paragraphs become plain text."""
        try:
            title_level = get_title_level(paragraph)
            if title_level is not None:
                title = "".join(["#" for i in range(title_level)]) + " " + paragraph.text
                images = reduce(lambda x, y: [*x, *y],
                                [get_paragraph_element_images(e, doc, images_list, get_image_id) for e in
                                 paragraph._element],
                                [])
                if len(images) > 0:
                    # Heading with images: keep the title only when it has text.
                    return title + '\n' + images_to_string(images, doc, images_list, get_image_id) if len(
                        paragraph.text) > 0 else images_to_string(images, doc, images_list, get_image_id)
                return title
        except Exception as e:
            traceback.print_exc()
            return paragraph.text
        return get_paragraph_txt(paragraph, doc, images_list, get_image_id)

    @staticmethod
    def table_to_md(table, doc: Document, images_list, get_image_id):
        """Convert a docx table into a markdown table (header, separator, body)."""
        rows = table.rows
        # Build the markdown-format table.
        md_table = '| ' + ' | '.join(
            [get_cell_text(cell, doc, images_list, get_image_id) for cell in rows[0].cells]) + ' |\n'
        md_table += '| ' + ' | '.join(['---' for i in range(len(rows[0].cells))]) + ' |\n'
        for row in rows[1:]:
            md_table += '| ' + ' | '.join(
                [get_cell_text(cell, doc, images_list, get_image_id) for cell in row.cells]) + ' |\n'
        return md_table

    def to_md(self, doc, images_list, get_image_id):
        """Render the whole document body (tables and paragraphs, in document
        order) as one markdown string."""
        elements = []
        for element in doc.element.body:
            tag = str(element.tag)
            if tag.endswith('tbl'):
                # Handle a table.
                table = Table(element, doc)
                elements.append(table)
            elif tag.endswith('p'):
                # Handle a paragraph.
                paragraph = Paragraph(element, doc)
                elements.append(paragraph)
        return "\n".join(
            [self.paragraph_to_md(element, doc, images_list, get_image_id) if isinstance(element,
                                                                                         Paragraph) else self.table_to_md(
                element,
                doc,
                images_list, get_image_id)
             for element
             in elements])

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """Split the word document into paragraphs, persisting extracted
        images via *save_image*; returns {'name', 'content'}."""
        file_name = os.path.basename(file.name)
        try:
            image_list = []
            buffer = get_buffer(file)
            doc = Document(io.BytesIO(buffer))
            content = self.to_md(doc, image_list, get_image_id_func())
            if len(image_list) > 0:
                save_image(image_list)
            if pattern_list is not None and len(pattern_list) > 0:
                split_model = SplitModel(pattern_list, with_filter, limit)
            else:
                split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        except BaseException as e:
            traceback.print_exception(e)
            return {'name': file_name,
                    'content': []}
        return {'name': file_name,
                'content': split_model.parse(content)
                }

    def support(self, file, get_buffer):
        """True for .doc/.docx files (the upper-case branches are redundant
        since the name is lowered first)."""
        file_name: str = file.name.lower()
        if file_name.endswith(".docx") or file_name.endswith(".doc") or file_name.endswith(
                ".DOC") or file_name.endswith(".DOCX"):
            return True
        return False

    def get_content(self, file, save_image):
        """Return the full markdown content; image links are rewritten from
        /api/image/ to /api/file/ in this path."""
        try:
            image_list = []
            buffer = file.read()
            doc = Document(io.BytesIO(buffer))
            content = self.to_md(doc, image_list, get_image_id_func())
            if len(image_list) > 0:
                content = content.replace('/api/image/', '/api/file/')
                save_image(image_list)
            return content
        except BaseException as e:
            traceback.print_exception(e)
            return f'{e}'

View File

@ -0,0 +1,73 @@
# coding=utf-8
"""
@project: maxkb
@Author
@file html_split_handle.py
@date2024/5/23 10:58
@desc:
"""
import re
import traceback
from typing import List
from bs4 import BeautifulSoup
from charset_normalizer import detect
from html2text import html2text
from common.handle.base_split_handle import BaseSplitHandle
from common.utils.split_model import SplitModel
# Markdown heading patterns (levels 1-6) applied to the html2text output when
# the caller supplies no custom split patterns.
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
                        re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
                        re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
def get_encoding(buffer):
    """Pick the text encoding of an HTML byte buffer.

    Prefers a charset declared in a <meta charset=...> tag; falls back to
    byte-level detection when none is declared.
    """
    soup = BeautifulSoup(buffer, "html.parser")
    for meta in soup.find_all('meta'):
        if meta.attrs is not None and 'charset' in meta.attrs:
            return meta.attrs.get('charset')
    return detect(buffer)['encoding']
class HTMLSplitHandle(BaseSplitHandle):
    """Split an HTML document by converting it to markdown and sectioning the
    result on heading patterns."""

    def support(self, file, get_buffer):
        """True for .html files (the name is lowered, so the check is case-insensitive)."""
        lowered: str = file.name.lower()
        return lowered.endswith(".html") or lowered.endswith(".HTML")

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """Split *file*; returns {'name', 'content'} with an empty content
        list when decoding/conversion fails."""
        buffer = get_buffer(file)
        if pattern_list is not None and len(pattern_list) > 0:
            split_model = SplitModel(pattern_list, with_filter, limit)
        else:
            split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        try:
            content = html2text(buffer.decode(get_encoding(buffer)))
        except BaseException:
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)
                }

    def get_content(self, file, save_image):
        """Return the full document converted to markdown text, or the error
        text on failure."""
        buffer = file.read()
        try:
            return html2text(buffer.decode(get_encoding(buffer)))
        except BaseException as e:
            traceback.print_exception(e)
            return f'{e}'

View File

@ -0,0 +1,339 @@
# coding=utf-8
"""
@project: maxkb
@Author
@file text_split_handle.py
@date2024/3/27 18:19
@desc:
"""
import logging
import os
import re
import tempfile
import time
import traceback
from typing import List
import fitz
from langchain_community.document_loaders import PyPDFLoader
from common.handle.base_split_handle import BaseSplitHandle
from common.utils.split_model import SplitModel
from django.utils.translation import gettext_lazy as _
# Markdown heading patterns (levels 1-6) plus a blank-line splitter, used when
# the caller supplies no custom split patterns.
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
                        re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
                        re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*"),
                        re.compile("(?<!\n)\n\n+")]
# Project logger for split diagnostics.
max_kb = logging.getLogger("max_kb")
def check_links_in_pdf(doc):
    """Return True when any page of *doc* carries an internal link (kind == 1)."""
    for page_index in range(len(doc)):
        for link in doc[page_index].get_links():
            if link['kind'] == 1:
                return True
    return False
class PdfSplitHandle(BaseSplitHandle):
    """Split PDF files into paragraphs using, in priority order: the PDF
    outline (TOC), internal links on a TOC page, or plain pattern-based
    text splitting."""

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """Split the uploaded PDF; returns {'name', 'content'} and an empty
        content list on any processing failure."""
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            # Save the upload to a temporary file so fitz can open it by path.
            for chunk in file.chunks():
                temp_file.write(chunk)
            # Path of the temporary file.
            temp_file_path = temp_file.name
            pdf_document = fitz.open(temp_file_path)
            try:
                # PDFs that have a table of contents.
                result = self.handle_toc(pdf_document, limit)
                if result is not None:
                    return {'name': file.name, 'content': result}
                # PDFs with no TOC but with internal links.
                result = self.handle_links(pdf_document, pattern_list, with_filter, limit)
                if result is not None and len(result) > 0:
                    return {'name': file.name, 'content': result}
                # PDFs with neither: plain text extraction + pattern split.
                content = self.handle_pdf_content(file, pdf_document)
                if pattern_list is not None and len(pattern_list) > 0:
                    split_model = SplitModel(pattern_list, with_filter, limit)
                else:
                    split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
            except BaseException as e:
                max_kb.error(f"File: {file.name}, error: {e}")
                return {'name': file.name,
                        'content': []}
            finally:
                pdf_document.close()
                # The temporary file can be removed once processing is done.
                os.remove(temp_file_path)
            return {'name': file.name,
                    'content': split_model.parse(content)
                    }

    @staticmethod
    def handle_pdf_content(file, pdf_document):
        """Extract text page by page; pages with no text layer are exported
        as a single-page PDF and OCR'd via PyPDFLoader(extract_images=True)."""
        content = ""
        for page_num in range(len(pdf_document)):
            start_time = time.time()
            page = pdf_document.load_page(page_num)
            text = page.get_text()
            if text and text.strip():  # the page has a text layer
                page_content = text
            else:
                try:
                    new_doc = fitz.open()
                    new_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
                    page_num_pdf = tempfile.gettempdir() + f"/{file.name}_{page_num}.pdf"
                    new_doc.save(page_num_pdf)
                    new_doc.close()
                    loader = PyPDFLoader(page_num_pdf, extract_images=True)
                    page_content = "\n" + loader.load()[0].page_content
                except NotImplementedError as e:
                    # Unsupported format: abort the whole file immediately.
                    raise e
                except BaseException as e:
                    # Skip the failing page so one bad page does not fail the
                    # whole file; continue with the next page.
                    max_kb.error(f"File: {file.name}, Page: {page_num + 1}, error: {e}")
                    continue
                finally:
                    # NOTE(review): page_num_pdf may be unbound here when
                    # fitz.open()/insert_pdf fails before it is assigned — confirm.
                    os.remove(page_num_pdf)
            content += page_content
            # Null characters are not allowed.
            content = content.replace('\0', '')
            elapsed_time = time.time() - start_time
            max_kb.debug(
                f"File: {file.name}, Page: {page_num + 1}, Time : {elapsed_time: .3f}s, content-length: {len(page_content)}")
        return content

    @staticmethod
    def handle_toc(doc, limit):
        """Split the document by its outline (bookmarks); returns None when
        the document has no TOC."""
        # Find the table of contents.
        toc = doc.get_toc()
        if toc is None or len(toc) == 0:
            return None
        # Array holding the chapter contents.
        chapters = []
        # Walk the TOC and extract text chapter by chapter.
        for i, entry in enumerate(toc):
            level, title, start_page = entry
            start_page -= 1  # PyMuPDF pages are 0-based; bookmark pages are 1-based
            chapter_title = title
            # Determine the end page; the last chapter runs to the document end.
            if i + 1 < len(toc):
                end_page = toc[i + 1][2] - 1
            else:
                end_page = doc.page_count - 1
            # Strip ordinal markers from the title.
            title = PdfSplitHandle.handle_chapter_title(title)
            # Extract the chapter's text.
            chapter_text = ""
            for page_num in range(start_page, end_page + 1):
                page = doc.load_page(page_num)  # load the page
                text = page.get_text("text")
                text = re.sub(r'(?<!。)\n+', '', text)
                text = re.sub(r'(?<!.)\n+', '', text)
                # print(f'title: {title}')
                idx = text.find(title)
                if idx > -1:
                    # Drop everything up to and including this chapter's title.
                    text = text[idx + len(title):]
                if i + 1 < len(toc):
                    l, next_title, next_start_page = toc[i + 1]
                    next_title = PdfSplitHandle.handle_chapter_title(next_title)
                    # print(f'next_title: {next_title}')
                    idx = text.find(next_title)
                    if idx > -1:
                        # Cut at the start of the next chapter's title.
                        text = text[:idx]
                chapter_text += text  # accumulate the chapter text
            # Null characters are not allowed.
            chapter_text = chapter_text.replace('\0', '')
            # Cap the title length.
            real_chapter_title = chapter_title[:256]
            # Cap the chapter content length.
            if 0 < limit < len(chapter_text):
                split_text = PdfSplitHandle.split_text(chapter_text, limit)
                for text in split_text:
                    chapters.append({"title": real_chapter_title, "content": text})
            else:
                chapters.append({"title": real_chapter_title, "content": chapter_text if chapter_text else real_chapter_title})
        # Chapter contents are stored together with their titles.
        return chapters

    @staticmethod
    def handle_links(doc, pattern_list, with_filter, limit):
        """Split by internal links (a linked TOC page); returns None when the
        document carries no internal links."""
        # Check whether the document contains internal links.
        if not check_links_in_pdf(doc):
            return
        # Array holding the chapter contents.
        chapters = []
        toc_start_page = -1
        page_content = ""
        handle_pre_toc = True
        # Walk every page looking for pages that carry TOC links.
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            links = page.get_links()
            # Record the first page carrying links as the TOC start page.
            if len(links) > 0:
                toc_start_page = page_num
            if toc_start_page < 0:
                # Accumulate pre-TOC text (candidate preface content).
                page_content += page.get_text('text')
            # Check this page for internal links (pointing inside the document).
            for num in range(len(links)):
                link = links[num]
                if link['kind'] == 1:  # 'kind' == 1 means an internal link
                    # Page the link points to.
                    dest_page = link['page']
                    rect = link['from']  # rectangle of the link on the page
                    # If the TOC covers the preface already, skip manual preface handling.
                    if dest_page < toc_start_page:
                        handle_pre_toc = False
                    # Text under the link rectangle becomes the chapter title.
                    link_title = page.get_text("text", clip=rect).strip().split("\n")[0].replace('.', '').strip()
                    # print(f'link_title: {link_title}')
                    # The target page starts the chapter.
                    start_page = dest_page
                    end_page = dest_page
                    # The next link, if any, bounds this chapter.
                    next_link = links[num + 1] if num + 1 < len(links) else None
                    next_link_title = None
                    if next_link is not None and next_link['kind'] == 1:
                        rect = next_link['from']
                        next_link_title = page.get_text("text", clip=rect).strip() \
                            .split("\n")[0].replace('.', '').strip()
                        # print(f'next_link_title: {next_link_title}')
                        end_page = next_link['page']
                    # Extract the chapter's text.
                    chapter_text = ""
                    for p_num in range(start_page, end_page + 1):
                        p = doc.load_page(p_num)
                        text = p.get_text("text")
                        text = re.sub(r'(?<!。)\n+', '', text)
                        text = re.sub(r'(?<!.)\n+', '', text)
                        # print(f'\n{text}\n')
                        idx = text.find(link_title)
                        if idx > -1:
                            text = text[idx + len(link_title):]
                        if next_link_title is not None:
                            idx = text.find(next_link_title)
                            if idx > -1:
                                text = text[:idx]
                        chapter_text += text
                    # Null characters are not allowed.
                    chapter_text = chapter_text.replace('\0', '')
                    # Cap the chapter content length.
                    if 0 < limit < len(chapter_text):
                        split_text = PdfSplitHandle.split_text(chapter_text, limit)
                        for text in split_text:
                            chapters.append({"title": link_title, "content": text})
                    else:
                        # Store the chapter record.
                        chapters.append({"title": link_title, "content": chapter_text})
        # The TOC has no preface entry: handle the preface manually.
        if handle_pre_toc:
            pre_toc = []
            lines = page_content.strip().split('\n')
            try:
                for line in lines:
                    if re.match(r'^前\s*言', line):
                        pre_toc.append({'title': line, 'content': ''})
                    else:
                        pre_toc[-1]['content'] += line
                for i in range(len(pre_toc)):
                    pre_toc[i]['content'] = re.sub(r'(?<!。)\n+', '', pre_toc[i]['content'])
                    pre_toc[i]['content'] = re.sub(r'(?<!.)\n+', '', pre_toc[i]['content'])
            except BaseException as e:
                # No preface heading found: split the pre-TOC text as ordinary text.
                max_kb.error(_('This document has no preface and is treated as ordinary text: {e}').format(e=e))
                if pattern_list is not None and len(pattern_list) > 0:
                    split_model = SplitModel(pattern_list, with_filter, limit)
                else:
                    split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
                # Insert the part that precedes the TOC.
                page_content = re.sub(r'(?<!。)\n+', '', page_content)
                page_content = re.sub(r'(?<!.)\n+', '', page_content)
                page_content = page_content.strip()
                pre_toc = split_model.parse(page_content)
            chapters = pre_toc + chapters
        return chapters

    @staticmethod
    def split_text(text, length):
        """Greedily chunk *text* into pieces of roughly *length* characters,
        preferring to cut just after a period."""
        segments = []
        current_segment = ""
        for char in text:
            current_segment += char
            if len(current_segment) >= length:
                # Look for the nearest period.
                last_period_index = current_segment.rfind('.')
                if last_period_index != -1:
                    segments.append(current_segment[:last_period_index + 1])
                    current_segment = current_segment[last_period_index + 1:]  # carry the remainder forward
                else:
                    segments.append(current_segment)
                    current_segment = ""
        # Handle whatever is left over.
        if current_segment:
            segments.append(current_segment)
        return segments

    @staticmethod
    def handle_chapter_title(title):
        """Strip Chinese ordinal prefixes (e.g. '一、', '第三章') from a chapter title."""
        title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title)
        title = re.sub(r'第[一二三四五六七八九十]章\s*', '', title)
        return title

    def support(self, file, get_buffer):
        """True for .pdf files (the second branch is redundant after lower())."""
        file_name: str = file.name.lower()
        if file_name.endswith(".pdf") or file_name.endswith(".PDF"):
            return True
        return False

    def get_content(self, file, save_image):
        """Extract the PDF's full text without splitting; returns the error
        text on failure."""
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            # Save the upload to a temporary file.
            temp_file.write(file.read())
            # Path of the temporary file.
            temp_file_path = temp_file.name
            pdf_document = fitz.open(temp_file_path)
            try:
                return self.handle_pdf_content(file, pdf_document)
            except BaseException as e:
                traceback.print_exception(e)
                return f'{e}'

View File

@ -0,0 +1,8 @@
# coding=utf-8
"""
@project: qabot
@Author
@file __init__.py.py
@date2023/9/6 10:09
@desc:
"""

View File

@ -0,0 +1,59 @@
# coding=utf-8
"""
@project: maxkb
@Author
@file csv_parse_qa_handle.py
@date2024/5/21 14:59
@desc:
"""
import csv
import io
from charset_normalizer import detect
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
def read_csv_standard(file_path):
    """Read a CSV file into a list of rows (each row a list of strings).

    Opens the file with newline='' as the csv module requires; without it,
    newlines embedded in quoted fields are mangled by universal-newline
    translation.
    """
    with open(file_path, 'r', newline='') as file:
        return [row for row in csv.reader(file)]
class CsvParseQAHandle(BaseParseQAHandle):
    """Parse a QA-format CSV: a header row mapping columns to
    title/content/problem_list, followed by one paragraph per row."""

    def support(self, file, get_buffer):
        """True when the file has a .csv extension (case-insensitive)."""
        file_name: str = file.name.lower()
        return file_name.endswith(".csv")

    def handle(self, file, get_buffer, save_image):
        """Parse *file* into [{'name', 'paragraphs'}]; best-effort — parse
        failures yield an empty paragraph list instead of raising."""
        buffer = get_buffer(file)
        empty_result = [{'name': file.name, 'paragraphs': []}]
        try:
            reader = csv.reader(io.TextIOWrapper(io.BytesIO(buffer), encoding=detect(buffer)['encoding']))
            try:
                # Idiom fix: use next(reader) instead of reader.__next__().
                title_row_list = next(reader)
            except Exception:
                return empty_result
            if len(title_row_list) == 0:
                return empty_result
            title_row_index_dict = get_title_row_index_dict(title_row_list)
            paragraph_list = []
            for row in reader:
                content = get_row_value(row, title_row_index_dict, 'content')
                if content is None:
                    # A row without content cannot form a paragraph.
                    continue
                problem = get_row_value(row, title_row_index_dict, 'problem_list')
                problem = str(problem) if problem is not None else ''
                # One problem per line, each capped at 255 characters.
                problem_list = [{'content': p[0:255]} for p in problem.split('\n') if len(p.strip()) > 0]
                title = get_row_value(row, title_row_index_dict, 'title')
                title = str(title) if title is not None else ''
                paragraph_list.append({'title': title[0:255],
                                       'content': content[0:102400],
                                       'problem_list': problem_list})
            return [{'name': file.name, 'paragraphs': paragraph_list}]
        except Exception:
            return empty_result

View File

@ -0,0 +1,61 @@
# coding=utf-8
"""
@project: maxkb
@Author
@file xls_parse_qa_handle.py
@date2024/5/21 14:59
@desc:
"""
import xlrd
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
def handle_sheet(file_name, sheet):
    """Convert one xlrd worksheet into a QA document dict.

    The first row maps columns (title/content/problem_list); each later row
    becomes one paragraph.
    """
    row_iter = iter([sheet.row_values(i) for i in range(sheet.nrows)])
    try:
        header = next(row_iter)
    except Exception:
        return {'name': file_name, 'paragraphs': []}
    if not header:
        return {'name': file_name, 'paragraphs': []}
    column_index = get_title_row_index_dict(header)
    paragraphs = []
    for values in row_iter:
        content = get_row_value(values, column_index, 'content')
        if content is None:
            continue
        raw_problem = get_row_value(values, column_index, 'problem_list')
        problem_text = '' if raw_problem is None else str(raw_problem)
        problems = [{'content': line[:255]} for line in problem_text.split('\n') if line.strip()]
        raw_title = get_row_value(values, column_index, 'title')
        title_text = '' if raw_title is None else str(raw_title)
        paragraphs.append({
            'title': title_text[:255],
            'content': str(content)[:102400],
            'problem_list': problems,
        })
    return {'name': file_name, 'paragraphs': paragraphs}
class XlsParseQAHandle(BaseParseQAHandle):
    """QA parser for legacy .xls workbooks (via xlrd)."""

    def support(self, file, get_buffer):
        lowered = file.name.lower()
        buffer = get_buffer(file)
        # Require both the extension and a recognizable xls signature.
        return bool(lowered.endswith(".xls") and xlrd.inspect_format(content=buffer))

    def handle(self, file, get_buffer, save_image):
        buffer = get_buffer(file)
        try:
            workbook = xlrd.open_workbook(file_contents=buffer)
            sheets = workbook.sheets()
            single_sheet = len(sheets) == 1
            results = []
            for sheet in sheets:
                # A lone default 'Sheet1' inherits the upload's file name.
                doc_name = file.name if single_sheet and sheet.name == 'Sheet1' else sheet.name
                parsed = handle_sheet(doc_name, sheet)
                if parsed is not None:
                    results.append(parsed)
            return results
        except Exception:
            return [{'name': file.name, 'paragraphs': []}]

View File

@ -0,0 +1,72 @@
# coding=utf-8
"""
@project: maxkb
@Author
@file xlsx_parse_qa_handle.py
@date2024/5/21 14:59
@desc:
"""
import io
import openpyxl
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
from common.handle.impl.common_handle import xlsx_embed_cells_images
def handle_sheet(file_name, sheet, image_dict):
    """Convert one openpyxl worksheet into a QA document dict.

    The first row maps columns to title/content/problem_list; each later row
    becomes one paragraph. A cell whose full string value is a key of
    *image_dict* is replaced by a markdown image link.
    """
    rows = sheet.rows
    try:
        title_row_list = next(rows)
        # Header cells -> raw values for column mapping.
        title_row_list = [row.value for row in title_row_list]
    except Exception as e:
        # No header row: empty worksheet.
        return {'name': file_name, 'paragraphs': []}
    if len(title_row_list) == 0:
        return {'name': file_name, 'paragraphs': []}
    title_row_index_dict = get_title_row_index_dict(title_row_list)
    paragraph_list = []
    for row in rows:
        content = get_row_value(row, title_row_index_dict, 'content')
        # Rows without a content cell cannot form a paragraph.
        if content is None or content.value is None:
            continue
        problem = get_row_value(row, title_row_index_dict, 'problem_list')
        problem = str(problem.value) if problem is not None and problem.value is not None else ''
        # One problem per line, truncated to the 255-char problem limit.
        problem_list = [{'content': p[0:255]} for p in problem.split('\n') if len(p.strip()) > 0]
        title = get_row_value(row, title_row_index_dict, 'title')
        title = str(title.value) if title is not None and title.value is not None else ''
        content = str(content.value)
        # Embedded-image cells become markdown links to the stored image.
        image = image_dict.get(content, None)
        if image is not None:
            content = f'![](/api/image/{image.id})'
        paragraph_list.append({'title': title[0:255],
                               'content': content[0:102400],
                               'problem_list': problem_list})
    return {'name': file_name, 'paragraphs': paragraph_list}
class XlsxParseQAHandle(BaseParseQAHandle):
    """QA parser for .xlsx workbooks; embedded cell images are saved and linked."""

    def support(self, file, get_buffer):
        return file.name.lower().endswith(".xlsx")

    def handle(self, file, get_buffer, save_image):
        buffer = get_buffer(file)
        try:
            workbook = openpyxl.load_workbook(io.BytesIO(buffer))
            try:
                image_dict: dict = xlsx_embed_cells_images(io.BytesIO(buffer))
                save_image([item for item in image_dict.values()])
            except Exception:
                # Image extraction is best-effort; fall back to plain cells.
                image_dict = {}
            sheets = workbook.worksheets
            single_sheet = len(sheets) == 1
            results = []
            for sheet in sheets:
                # A lone default 'Sheet1' inherits the upload's file name.
                doc_name = file.name if single_sheet and sheet.title == 'Sheet1' else sheet.title
                parsed = handle_sheet(doc_name, sheet, image_dict)
                if parsed is not None:
                    results.append(parsed)
            return results
        except Exception:
            return [{'name': file.name, 'paragraphs': []}]

View File

@ -0,0 +1,161 @@
# coding=utf-8
"""
@project: maxkb
@Author
@file text_split_handle.py
@date2024/3/27 18:19
@desc:
"""
import io
import os
import re
import uuid_utils.compat as uuid
import zipfile
from typing import List
from urllib.parse import urljoin
from django.db.models import QuerySet
from common.handle.base_parse_qa_handle import BaseParseQAHandle
from common.handle.impl.qa.csv_parse_qa_handle import CsvParseQAHandle
from common.handle.impl.qa.xls_parse_qa_handle import XlsParseQAHandle
from common.handle.impl.qa.xlsx_parse_qa_handle import XlsxParseQAHandle
from common.utils.common import parse_md_image
from knowledge.models import File
from django.utils.translation import gettext_lazy as _
class FileBufferHandle:
    """Caches the raw bytes of an uploaded file so it is read at most once."""
    buffer = None

    def get_buffer(self, file):
        # Lazily read once; all later calls return the cached bytes.
        if self.buffer is not None:
            return self.buffer
        self.buffer = file.read()
        return self.buffer
# QA handlers tried in order; the first whose support() accepts the file wins.
split_handles = [
    XlsParseQAHandle(),
    XlsxParseQAHandle(),
    CsvParseQAHandle()
]
def file_to_paragraph(file, save_inner_image):
    """
    Convert a file into a list of paragraphs.

    @param file: uploaded file object
    @param save_inner_image: callback used to persist images found in the file
    @return: {
        name: file name,
        paragraphs: paragraph list
    }
    @raise Exception: when no registered handler supports the file format
    """
    # Fresh byte-buffer cache per call; all handlers share the same bytes.
    get_buffer = FileBufferHandle().get_buffer
    for split_handle in split_handles:
        if split_handle.support(file, get_buffer):
            return split_handle.handle(file, get_buffer, save_inner_image)
    raise Exception(_("Unsupported file format"))
def is_valid_uuid(uuid_str: str):
    """Return True when *uuid_str* parses as a UUID, False otherwise."""
    try:
        uuid.UUID(uuid_str)
        return True
    except ValueError:
        return False
def get_image_list(result_list: list, zip_files: List[str]):
    """
    Collect markdown image references from parsed paragraphs.

    Image links whose target exists inside the zip archive are recorded; the
    link is rewritten to /api/image/<uuid> unless it already points at a valid
    /api/file/ or /api/image/ id.

    @param result_list: parsed documents, each with a 'paragraphs' list
    @param zip_files: file names contained in the zip archive
    @return: list of {'source_file': path-in-zip, 'image_id': uuid string}
    """
    image_file_list = []
    for result in result_list:
        for p in result.get('paragraphs', []):
            content: str = p.get('content', '')
            for image in parse_md_image(content):
                # Raw string avoids invalid-escape warnings for \( \).
                search = re.search(r"\(.*\)", image)
                if not search:
                    continue
                new_image_id = str(uuid.uuid7())
                source_image_path = search.group().replace('(', '').replace(')', '')
                # Resolve the (possibly relative) link against the document path.
                image_path = urljoin(result.get('name'), '.' + source_image_path if source_image_path.startswith(
                    '/') else source_image_path)
                # 'in' is the idiomatic form of __contains__()
                if image_path not in zip_files:
                    continue
                if image_path.startswith('api/file/') or image_path.startswith('api/image/'):
                    image_id = image_path.replace('api/file/', '').replace('api/image/', '')
                    if is_valid_uuid(image_id):
                        # Already a valid backend id: keep the link untouched.
                        image_file_list.append({'source_file': image_path,
                                                'image_id': image_id})
                        continue
                # Assign a fresh id and rewrite the markdown link to it.
                image_file_list.append({'source_file': image_path,
                                        'image_id': new_image_id})
                p['content'] = content = content.replace(source_image_path, f'/api/image/{new_image_id}')
    return image_file_list
def filter_image_file(result_list: list, image_list):
    """Drop parsed results that are themselves images already captured in *image_list*."""
    # Set membership is O(1) per lookup (was an O(n) list __contains__ call per result).
    image_source_file_set = {image.get('source_file') for image in image_list}
    return [r for r in result_list if r.get('name', '') not in image_source_file_set]
class ZipParseQAHandle(BaseParseQAHandle):
    """QA parser for zip archives: parses each member file and collects its images."""

    def handle(self, file, get_buffer, save_image):
        buffer = get_buffer(file)
        bytes_io = io.BytesIO(buffer)
        result = []
        # Open the zip archive
        with zipfile.ZipFile(bytes_io, 'r') as zip_ref:
            # List the member names inside the archive
            files = zip_ref.namelist()
            # Parse each member file
            # NOTE(review): the loop variable shadows the 'file' parameter below.
            for file in files:
                # Skip directories and macOS-specific metadata entries
                if file.endswith('/') or file.startswith('__MACOSX'):
                    continue
                with zip_ref.open(file) as f:
                    # Best-effort: members with unsupported formats are skipped silently
                    try:
                        value = file_to_paragraph(f, save_image)
                        if isinstance(value, list):
                            result = [*result, *value]
                        else:
                            result.append(value)
                    except Exception:
                        pass
            # Extract referenced images, then drop image files from the results
            image_list = get_image_list(result, files)
            result = filter_image_file(result, image_list)
            image_mode_list = []
            for image in image_list:
                with zip_ref.open(image.get('source_file')) as f:
                    i = File(
                        id=image.get('image_id'),
                        file_name=os.path.basename(image.get('source_file')),
                        # meta['content'] holds the raw image bytes
                        meta={'debug': False, 'content': f.read()}
                    )
                    image_mode_list.append(i)
            save_image(image_mode_list)
            return result

    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        # NOTE(review): the .ZIP branch is unreachable after lower(); kept as-is (doc-only change).
        if file_name.endswith(".zip") or file_name.endswith(".ZIP"):
            return True
        return False

View File

@ -0,0 +1,8 @@
# coding=utf-8
"""
@project: qabot
@Author
@file __init__.py.py
@date2023/9/6 10:09
@desc:
"""

View File

@ -0,0 +1,52 @@
# coding=utf-8
"""
@project: MaxKB
@Author
@file openai_to_response.py
@date2024/9/6 16:08
@desc:
"""
import datetime
from django.http import JsonResponse
from openai.types import CompletionUsage
from openai.types.chat import ChatCompletionChunk, ChatCompletionMessage, ChatCompletion
from openai.types.chat.chat_completion import Choice as BlockChoice
from openai.types.chat.chat_completion_chunk import Choice, ChoiceDelta
from rest_framework import status
from common.handle.base_to_response import BaseToResponse
class OpenaiToResponse(BaseToResponse):
    """Render chat results in the OpenAI chat-completion wire format."""

    def to_block_response(self, chat_id, chat_record_id, content, is_end, completion_tokens, prompt_tokens,
                          other_params: dict = None,
                          _status=status.HTTP_200_OK):
        """Build a non-streaming chat.completion JSON response."""
        if other_params is None:
            other_params = {}
        data = ChatCompletion(
            id=chat_record_id,
            choices=[BlockChoice(finish_reason='stop', index=0, chat_id=chat_id,
                                 answer_list=other_params.get('answer_list', ""),
                                 message=ChatCompletionMessage(role='assistant', content=content))],
            # 'created' must be a Unix timestamp per the OpenAI API spec;
            # .second previously yielded only the seconds-of-minute (0-59).
            created=int(datetime.datetime.now().timestamp()),
            model='',
            object='chat.completion',
            usage=CompletionUsage(completion_tokens=completion_tokens,
                                  prompt_tokens=prompt_tokens,
                                  total_tokens=completion_tokens + prompt_tokens)
        ).dict()
        return JsonResponse(data=data, status=_status)

    def to_stream_chunk_response(self, chat_id, chat_record_id, node_id, up_node_id_list, content, is_end,
                                 completion_tokens,
                                 prompt_tokens, other_params: dict = None):
        """Build one streaming chunk in chat.completion.chunk format."""
        if other_params is None:
            other_params = {}
        chunk = ChatCompletionChunk(
            id=chat_record_id,
            model='',
            object='chat.completion.chunk',
            # Unix timestamp, same fix as above.
            created=int(datetime.datetime.now().timestamp()),
            choices=[Choice(delta=ChoiceDelta(content=content,
                                              reasoning_content=other_params.get('reasoning_content', ""),
                                              chat_id=chat_id),
                            finish_reason='stop' if is_end else None,
                            index=0)],
            usage=CompletionUsage(completion_tokens=completion_tokens,
                                  prompt_tokens=prompt_tokens,
                                  total_tokens=completion_tokens + prompt_tokens)
        ).json()
        return super().format_stream_chunk(chunk)

View File

@ -0,0 +1,41 @@
# coding=utf-8
"""
@project: MaxKB
@Author
@file system_to_response.py
@date2024/9/6 18:03
@desc:
"""
import json
from rest_framework import status
from common.handle.base_to_response import BaseToResponse
from common.result import result
class SystemToResponse(BaseToResponse):
    """Render chat results in MaxKB's native (system) response format."""

    def to_block_response(self, chat_id, chat_record_id, content, is_end, completion_tokens,
                          prompt_tokens, other_params: dict = None,
                          _status=status.HTTP_200_OK):
        """Build the non-streaming JSON response; other_params are merged into the payload."""
        if other_params is None:
            other_params = {}
        # The HTTP status is echoed in the payload 'code' field.
        return result.success({'chat_id': str(chat_id), 'id': str(chat_record_id), 'operate': True,
                               'content': content, 'is_end': is_end, **other_params,
                               'completion_tokens': completion_tokens, 'prompt_tokens': prompt_tokens},
                              response_status=_status,
                              code=_status)

    def to_stream_chunk_response(self, chat_id, chat_record_id, node_id, up_node_id_list, content, is_end,
                                 completion_tokens,
                                 prompt_tokens, other_params: dict = None):
        """Build one SSE chunk (JSON string) in the native format."""
        if other_params is None:
            other_params = {}
        chunk = json.dumps({'chat_id': str(chat_id), 'chat_record_id': str(chat_record_id), 'operate': True,
                            'content': content, 'node_id': node_id, 'up_node_id_list': up_node_id_list,
                            'is_end': is_end,
                            'usage': {'completion_tokens': completion_tokens,
                                      'prompt_tokens': prompt_tokens,
                                      'total_tokens': completion_tokens + prompt_tokens},
                            **other_params})
        return super().format_stream_chunk(chunk)

View File

@ -0,0 +1,8 @@
# coding=utf-8
"""
@project: qabot
@Author
@file __init__.py.py
@date2023/9/6 10:09
@desc:
"""

View File

@ -0,0 +1,44 @@
# coding=utf-8
import logging
from charset_normalizer import detect
from common.handle.base_parse_table_handle import BaseParseTableHandle
max_kb = logging.getLogger("max_kb")
class CsvSplitHandle(BaseParseTableHandle):
    """Split a CSV file into per-row paragraphs of 'header:value; ...' lines."""

    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        return file_name.endswith(".csv")

    def handle(self, file, get_buffer, save_image):
        """Turn each data row into one paragraph; the first row provides the headers."""
        import csv
        import io
        buffer = get_buffer(file)
        try:
            content = buffer.decode(detect(buffer)['encoding'])
        except BaseException as e:
            max_kb.error(f'csv split handle error: {e}')
            return [{'name': file.name, 'paragraphs': []}]
        # csv.reader copes with quoted fields (embedded commas/newlines) that a
        # plain str.split(',') would corrupt.
        rows = list(csv.reader(io.StringIO(content)))
        paragraphs = []
        if rows:
            title = rows[0]
            for row in rows[1:]:
                if not row:
                    continue
                line = '; '.join([f'{key}:{value}' for key, value in zip(title, row)])
                paragraphs.append({'title': '', 'content': line})
        return [{'name': file.name, 'paragraphs': paragraphs}]

    def get_content(self, file, save_image):
        """Return the raw decoded text, or an 'error: ...' string on decode failure."""
        buffer = file.read()
        try:
            return buffer.decode(detect(buffer)['encoding'])
        except BaseException as e:
            max_kb.error(f'csv split handle error: {e}')
            return f'error: {e}'

View File

@ -0,0 +1,94 @@
# coding=utf-8
import logging
import xlrd
from common.handle.base_parse_table_handle import BaseParseTableHandle
max_kb = logging.getLogger("max_kb")
class XlsSplitHandle(BaseParseTableHandle):
    """Table parser for legacy .xls files: one paragraph per data row."""

    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        buffer = get_buffer(file)
        if file_name.endswith(".xls") and xlrd.inspect_format(content=buffer):
            return True
        return False

    def handle(self, file, get_buffer, save_image):
        """
        Convert every sheet into paragraphs of 'header: value; ...' lines.
        Merged cells contribute their anchor value to every covered cell.
        """
        buffer = get_buffer(file)
        try:
            wb = xlrd.open_workbook(file_contents=buffer, formatting_info=True)
            result = []
            sheets = wb.sheets()
            for sheet in sheets:
                # Merge ranges as (row_lo, row_hi, col_lo, col_hi), hi exclusive.
                # (removed leftover debug print of merged_cells)
                merged_cells = sheet.merged_cells
                data = []
                paragraphs = []
                # First row provides the column headers.
                headers = [sheet.cell_value(0, col_idx) for col_idx in range(sheet.ncols)]
                # Walk the data rows (skip the header row).
                for row_idx in range(1, sheet.nrows):
                    row_data = {}
                    for col_idx in range(sheet.ncols):
                        cell_value = sheet.cell_value(row_idx, col_idx)
                        # Empty cell inside a merge range: use the anchor cell's value.
                        if cell_value == "":
                            for (rlo, rhi, clo, chi) in merged_cells:
                                if rlo <= row_idx < rhi and clo <= col_idx < chi:
                                    cell_value = sheet.cell_value(rlo, clo)
                                    break
                        # Header as key, cell value as value.
                        row_data[headers[col_idx]] = cell_value
                    data.append(row_data)
                for row in data:
                    row_output = "; ".join([f"{key}: {value}" for key, value in row.items()])
                    paragraphs.append({'title': '', 'content': row_output})
                result.append({'name': sheet.name, 'paragraphs': paragraphs})
        except BaseException as e:
            max_kb.error(f'excel split handle error: {e}')
            return [{'name': file.name, 'paragraphs': []}]
        return result

    def get_content(self, file, save_image):
        """Render the workbook as markdown tables (one per non-empty sheet)."""
        try:
            workbook = xlrd.open_workbook(file_contents=file.read(), formatting_info=True)
            sheets = workbook.sheets()
            md_tables = ''
            for sheet in sheets:
                # Skip blank sheets.
                if sheet.nrows == 0 or sheet.ncols == 0:
                    continue
                # Header row and data rows.
                headers = sheet.row_values(0)
                data = [sheet.row_values(row_idx) for row_idx in range(1, sheet.nrows)]
                # Build the markdown table.
                md_table = '| ' + ' | '.join(headers) + ' |\n'
                md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
                for row in data:
                    # Keep in-cell line breaks by converting them to <br>.
                    md_table += '| ' + ' | '.join(
                        [str(cell)
                         .replace('\r\n', '<br>')
                         .replace('\n', '<br>')
                         if cell else '' for cell in row]) + ' |\n'
                md_tables += md_table + '\n\n'
            return md_tables
        except Exception as e:
            max_kb.error(f'excel split handle error: {e}')
            return f'error: {e}'

View File

@ -0,0 +1,118 @@
# coding=utf-8
import io
import logging
from openpyxl import load_workbook
from common.handle.base_parse_table_handle import BaseParseTableHandle
from common.handle.impl.common_handle import xlsx_embed_cells_images
max_kb = logging.getLogger("max_kb")
class XlsxSplitHandle(BaseParseTableHandle):
    """Table parser for .xlsx files: one paragraph per data row."""

    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        if file_name.endswith('.xlsx'):
            return True
        return False

    def fill_merged_cells(self, sheet, image_dict):
        """Return the sheet as a list of {header: value} dicts.

        Merged cells contribute their anchor value to every covered cell, and
        a cell whose value is a key of *image_dict* becomes a markdown image link.
        """
        data = []
        # First row provides the headers; blank headers get a unique placeholder
        # (idx+1 spaces) so later columns do not overwrite each other.
        headers = []
        for idx, cell in enumerate(sheet[1]):
            if cell.value is None:
                headers.append(' ' * (idx + 1))
            else:
                headers.append(cell.value)
        # Walk the data rows (min_row=2 skips the header).
        for row in sheet.iter_rows(min_row=2, values_only=False):
            row_data = {}
            for col_idx, cell in enumerate(row):
                cell_value = cell.value
                # Empty cell inside a merge range: use the range anchor's value.
                if cell_value is None:
                    for merged_range in sheet.merged_cells.ranges:
                        if cell.coordinate in merged_range:
                            cell_value = sheet[merged_range.min_row][merged_range.min_col - 1].value
                            break
                image = image_dict.get(cell_value, None)
                if image is not None:
                    cell_value = f'![](/api/image/{image.id})'
                # Header as key, cell value as value.
                row_data[headers[col_idx]] = cell_value
            data.append(row_data)
        return data

    def handle(self, file, get_buffer, save_image):
        """Convert every sheet into paragraphs of 'header: value; ...' lines."""
        buffer = get_buffer(file)
        try:
            wb = load_workbook(io.BytesIO(buffer))
            try:
                image_dict: dict = xlsx_embed_cells_images(io.BytesIO(buffer))
                save_image([item for item in image_dict.values()])
            except Exception as e:
                # Image extraction is best-effort; fall back to plain cells.
                image_dict = {}
            result = []
            for sheetname in wb.sheetnames:
                paragraphs = []
                ws = wb[sheetname]
                data = self.fill_merged_cells(ws, image_dict)
                for row in data:
                    row_output = "; ".join([f"{key}: {value}" for key, value in row.items()])
                    paragraphs.append({'title': '', 'content': row_output})
                result.append({'name': sheetname, 'paragraphs': paragraphs})
        except BaseException as e:
            max_kb.error(f'excel split handle error: {e}')
            return [{'name': file.name, 'paragraphs': []}]
        return result

    def get_content(self, file, save_image):
        """Render the workbook as markdown tables; image links are rewritten to /api/file/."""
        try:
            workbook = load_workbook(file)
            try:
                image_dict: dict = xlsx_embed_cells_images(file)
                if len(image_dict) > 0:
                    save_image(image_dict.values())
            except Exception as e:
                # Log through the module logger instead of printing to stdout.
                max_kb.error(f'xlsx image extract error: {e}')
                image_dict = {}
            md_tables = ''
            # Iterate all sheets (workbook.active is the fallback for a falsy name).
            for sheetname in workbook.sheetnames:
                sheet = workbook[sheetname] if sheetname else workbook.active
                rows = self.fill_merged_cells(sheet, image_dict)
                if len(rows) == 0:
                    continue
                # Headers come from the first parsed row's keys.
                headers = [f"{key}" for key, value in rows[0].items()]
                md_table = '| ' + ' | '.join(headers) + ' |\n'
                md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
                for row in rows:
                    r = [f'{value}' for key, value in row.items()]
                    md_table += '| ' + ' | '.join(
                        [str(cell).replace('\n', '<br>') if cell is not None else '' for cell in r]) + ' |\n'
                md_tables += md_table + '\n\n'
            md_tables = md_tables.replace('/api/image/', '/api/file/')
            return md_tables
        except Exception as e:
            max_kb.error(f'excel split handle error: {e}')
            return f'error: {e}'

View File

@ -0,0 +1,60 @@
# coding=utf-8
"""
@project: maxkb
@Author
@file text_split_handle.py
@date2024/3/27 18:19
@desc:
"""
import re
import traceback
from typing import List
from charset_normalizer import detect
from common.handle.base_split_handle import BaseSplitHandle
from common.utils.split_model import SplitModel
# Default split boundaries: markdown headings h1..h6. Each pattern matches a
# heading at the start of the text or right after a newline; the negative
# look-around excludes deeper heading levels (e.g. '##' does not match '###').
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
                        re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
                        re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
class TextSplitHandle(BaseSplitHandle):
    """Fallback splitter for markdown and plain-text files."""

    def support(self, file, get_buffer):
        buffer = get_buffer(file)
        file_name: str = file.name.lower()
        # file_name is already lower-cased, so one tuple check covers
        # .md/.MD/.txt/.TXT (the upper-case branches were unreachable).
        if file_name.endswith(('.md', '.txt')):
            return True
        # Otherwise accept anything that looks like confidently-detected text.
        result = detect(buffer)
        if result['encoding'] is not None and result['confidence'] is not None and result['encoding'] != 'ascii' and \
                result['confidence'] > 0.5:
            return True
        return False

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """Split the decoded text with the given patterns (or the heading defaults)."""
        buffer = get_buffer(file)
        if pattern_list is not None and len(pattern_list) > 0:
            split_model = SplitModel(pattern_list, with_filter, limit)
        else:
            split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        try:
            content = buffer.decode(detect(buffer)['encoding'])
        except BaseException:
            # Undecodable content yields an empty document rather than an error.
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)
                }

    def get_content(self, file, save_image):
        """Return the raw decoded text, or the error text on decode failure."""
        buffer = file.read()
        try:
            return buffer.decode(detect(buffer)['encoding'])
        except BaseException as e:
            traceback.print_exception(e)
            return f'{e}'

View File

@ -0,0 +1,80 @@
# coding=utf-8
"""
@project: maxkb
@Author
@file xls_parse_qa_handle.py
@date2024/5/21 14:59
@desc:
"""
from typing import List
import xlrd
from common.handle.base_split_handle import BaseSplitHandle
def post_cell(cell_value):
    """Escape a cell for markdown tables: line breaks -> <br>, '|' -> &#124;."""
    escaped = cell_value.replace('\r\n', '<br>')
    escaped = escaped.replace('\n', '<br>')
    return escaped.replace('|', '&#124;')
def row_to_md(row):
    """Render one worksheet row as a single markdown table line."""
    cells = [post_cell(str(cell)) if cell is not None else '' for cell in row]
    return '| ' + ' | '.join(cells) + ' |\n'
def handle_sheet(file_name, sheet, limit: int):
    """Split one xlrd worksheet into markdown-table chunks of at most *limit* chars.

    Every chunk repeats the header + separator rows so each paragraph is a
    self-contained markdown table.
    """
    data_rows = iter([sheet.row_values(i) for i in range(sheet.nrows)])
    paragraphs = []
    result = {'name': file_name, 'content': paragraphs}
    try:
        header = next(data_rows)
        header_md = row_to_md(header)
        header_md += '| ' + ' | '.join(['---' if cell is not None else '' for cell in header]) + ' |\n'
    except Exception:
        return result
    if not header:
        return result
    chunk = ''
    for values in data_rows:
        row_md = row_to_md(values)
        if not chunk:
            # Every chunk starts with the header + separator rows.
            chunk = header_md + row_md
        elif len(chunk) + len(row_md) < limit:
            chunk += row_md
        else:
            paragraphs.append({'content': chunk, 'title': ''})
            chunk = header_md + row_md
    if chunk:
        paragraphs.append({'content': chunk, 'title': ''})
    return result
class XlsSplitHandle(BaseSplitHandle):
    """Splitter for legacy .xls workbooks: each sheet becomes one document."""

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        buffer = get_buffer(file)
        try:
            workbook = xlrd.open_workbook(file_contents=buffer)
            sheets = workbook.sheets()
            single_sheet = len(sheets) == 1
            results = []
            for sheet in sheets:
                # A lone default 'Sheet1' inherits the upload's file name.
                doc_name = file.name if single_sheet and sheet.name == 'Sheet1' else sheet.name
                parsed = handle_sheet(doc_name, sheet, limit)
                if parsed is not None:
                    results.append(parsed)
            return results
        except Exception:
            return [{'name': file.name, 'content': []}]

    def get_content(self, file, save_image):
        # No plain-text rendering for xls in this handler; callers receive None.
        pass

    def support(self, file, get_buffer):
        lowered = file.name.lower()
        buffer = get_buffer(file)
        return bool(lowered.endswith(".xls") and xlrd.inspect_format(content=buffer))

View File

@ -0,0 +1,92 @@
# coding=utf-8
"""
@project: maxkb
@Author
@file xlsx_parse_qa_handle.py
@date2024/5/21 14:59
@desc:
"""
import io
from typing import List
import openpyxl
from common.handle.base_split_handle import BaseSplitHandle
from common.handle.impl.common_handle import xlsx_embed_cells_images
def post_cell(image_dict, cell_value):
    """Render a cell: image cells become markdown links, text is table-escaped."""
    image = image_dict.get(cell_value)
    if image is not None:
        return f'![](/api/image/{image.id})'
    return cell_value.replace('\n', '<br>').replace('|', '&#124;')
def row_to_md(row, image_dict):
    """Render one openpyxl row as a single markdown table line."""
    cells = []
    for cell in row:
        if cell is None:
            cells.append('')
        else:
            cells.append(post_cell(image_dict, str(cell.value if cell.value is not None else '')))
    return '| ' + ' | '.join(cells) + ' |\n'
def handle_sheet(file_name, sheet, image_dict, limit: int):
    """Split one openpyxl worksheet into markdown-table chunks of at most *limit* chars.

    Every chunk repeats the header + separator rows so each paragraph is a
    self-contained markdown table.
    """
    data_rows = sheet.rows
    paragraphs = []
    result = {'name': file_name, 'content': paragraphs}
    try:
        header = next(data_rows)
        header_md = row_to_md(header, image_dict)
        header_md += '| ' + ' | '.join(['---' if cell is not None else '' for cell in header]) + ' |\n'
    except Exception:
        return result
    if not header:
        return result
    chunk = ''
    for values in data_rows:
        row_md = row_to_md(values, image_dict)
        if not chunk:
            # Every chunk starts with the header + separator rows.
            chunk = header_md + row_md
        elif len(chunk) + len(row_md) < limit:
            chunk += row_md
        else:
            paragraphs.append({'content': chunk, 'title': ''})
            chunk = header_md + row_md
    if chunk:
        paragraphs.append({'content': chunk, 'title': ''})
    return result
class XlsxSplitHandle(BaseSplitHandle):
    """Splitter for .xlsx workbooks; embedded cell images are saved and linked."""

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        buffer = get_buffer(file)
        try:
            workbook = openpyxl.load_workbook(io.BytesIO(buffer))
            try:
                image_dict: dict = xlsx_embed_cells_images(io.BytesIO(buffer))
                save_image([item for item in image_dict.values()])
            except Exception:
                # Image extraction is best-effort; fall back to plain cells.
                image_dict = {}
            sheets = workbook.worksheets
            single_sheet = len(sheets) == 1
            results = []
            for sheet in sheets:
                # A lone default 'Sheet1' inherits the upload's file name.
                doc_name = file.name if single_sheet and sheet.title == 'Sheet1' else sheet.title
                parsed = handle_sheet(doc_name, sheet, image_dict, limit)
                if parsed is not None:
                    results.append(parsed)
            return results
        except Exception:
            return [{'name': file.name, 'content': []}]

    def get_content(self, file, save_image):
        # No plain-text rendering for xlsx in this handler; callers receive None.
        pass

    def support(self, file, get_buffer):
        return file.name.lower().endswith(".xlsx")

View File

@ -0,0 +1,164 @@
# coding=utf-8
"""
@project: maxkb
@Author
@file text_split_handle.py
@date2024/3/27 18:19
@desc:
"""
import io
import os
import re
import zipfile
from typing import List
from urllib.parse import urljoin
import uuid_utils.compat as uuid
from charset_normalizer import detect
from django.utils.translation import gettext_lazy as _
from common.handle.base_split_handle import BaseSplitHandle
from common.handle.impl.csv_split_handle import CsvSplitHandle
from common.handle.impl.doc_split_handle import DocSplitHandle
from common.handle.impl.html_split_handle import HTMLSplitHandle
from common.handle.impl.pdf_split_handle import PdfSplitHandle
from common.handle.impl.text_split_handle import TextSplitHandle
from common.handle.impl.xls_split_handle import XlsSplitHandle
from common.handle.impl.xlsx_split_handle import XlsxSplitHandle
from common.utils.common import parse_md_image
from knowledge.models import File
class FileBufferHandle:
    """Reads an uploaded file once and serves the cached bytes afterwards."""
    buffer = None

    def get_buffer(self, file):
        # The first call reads and caches; later calls reuse the cache.
        cached = self.buffer
        if cached is None:
            cached = file.read()
            self.buffer = cached
        return cached
# Fallback handler: plain-text splitting accepts nearly any decodable file.
default_split_handle = TextSplitHandle()
# Handlers tried in order; the first whose support() accepts the file wins.
split_handles = [
    HTMLSplitHandle(),
    DocSplitHandle(),
    PdfSplitHandle(),
    XlsxSplitHandle(),
    XlsSplitHandle(),
    CsvSplitHandle(),
    default_split_handle
]
def file_to_paragraph(file, pattern_list: List, with_filter: bool, limit: int, save_inner_image):
    """Dispatch *file* to the first supporting split handler.

    @param pattern_list: custom split regex list (may be empty for defaults)
    @param with_filter: whether special characters are stripped while splitting
    @param limit: maximum characters per paragraph chunk
    @param save_inner_image: callback used to persist images found in the file
    @raise Exception: when no registered handler supports the file format
    """
    # Fresh byte-buffer cache per call; all handlers share the same bytes.
    get_buffer = FileBufferHandle().get_buffer
    for split_handle in split_handles:
        if split_handle.support(file, get_buffer):
            return split_handle.handle(file, pattern_list, with_filter, limit, get_buffer, save_inner_image)
    raise Exception(_('Unsupported file format'))
def is_valid_uuid(uuid_str: str):
    """Check whether *uuid_str* is a parseable UUID."""
    try:
        uuid.UUID(uuid_str)
    except ValueError:
        return False
    else:
        return True
def get_image_list(result_list: list, zip_files: List[str]):
    """
    Collect markdown image references from split paragraphs.

    Image links whose target exists inside the zip archive are recorded; the
    link is rewritten to /api/image/<uuid> unless it already points at a valid
    /api/file/ or /api/image/ id.

    @param result_list: split documents, each with a 'content' paragraph list
    @param zip_files: file names contained in the zip archive
    @return: list of {'source_file': path-in-zip, 'image_id': uuid string}
    """
    image_file_list = []
    for result in result_list:
        for p in result.get('content', []):
            content: str = p.get('content', '')
            for image in parse_md_image(content):
                # Raw string avoids invalid-escape warnings for \( \).
                search = re.search(r"\(.*\)", image)
                if not search:
                    continue
                new_image_id = str(uuid.uuid7())
                source_image_path = search.group().replace('(', '').replace(')', '')
                # Drop any markdown title text that follows the path.
                source_image_path = source_image_path.strip().split(" ")[0]
                # Resolve the (possibly relative) link against the document path.
                image_path = urljoin(result.get('name'), '.' + source_image_path if source_image_path.startswith(
                    '/') else source_image_path)
                # 'in' is the idiomatic form of __contains__()
                if image_path not in zip_files:
                    continue
                if image_path.startswith('api/file/') or image_path.startswith('api/image/'):
                    image_id = image_path.replace('api/file/', '').replace('api/image/', '')
                    if is_valid_uuid(image_id):
                        # Already a valid backend id: keep the link untouched.
                        image_file_list.append({'source_file': image_path,
                                                'image_id': image_id})
                        continue
                # Assign a fresh id and rewrite the markdown link to it.
                image_file_list.append({'source_file': image_path,
                                        'image_id': new_image_id})
                p['content'] = content = content.replace(source_image_path, f'/api/image/{new_image_id}')
    return image_file_list
def get_file_name(file_name):
    """Repair zip member names decoded with the legacy cp437 default.

    Re-encodes the name back to cp437 bytes, detects the real charset and
    decodes with it; on any failure the original name is returned unchanged.
    """
    try:
        file_name_code = file_name.encode('cp437')
        charset = detect(file_name_code)['encoding']
        return file_name_code.decode(charset)
    except Exception as e:
        # Name was not cp437-mangled (or detection failed): keep it as-is.
        return file_name
def filter_image_file(result_list: list, image_list):
    """Drop split results that are themselves images already captured in *image_list*."""
    # Set membership is O(1) per lookup (was an O(n) list __contains__ call per result).
    image_source_file_set = {image.get('source_file') for image in image_list}
    return [r for r in result_list if r.get('name', '') not in image_source_file_set]
class ZipSplitHandle(BaseSplitHandle):
    """Splitter for zip archives: splits each member file and collects its images."""

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        buffer = get_buffer(file)
        bytes_io = io.BytesIO(buffer)
        result = []
        # Open the zip archive
        with zipfile.ZipFile(bytes_io, 'r') as zip_ref:
            # List the member names inside the archive
            files = zip_ref.namelist()
            # Split each member file
            # NOTE(review): the loop variable shadows the 'file' parameter below.
            for file in files:
                # Skip directories and macOS-specific metadata entries
                if file.endswith('/') or file.startswith('__MACOSX'):
                    continue
                with zip_ref.open(file) as f:
                    # Best-effort: members with unsupported formats are skipped silently
                    try:
                        # Repair names mangled by the zip cp437 default encoding
                        f.name = get_file_name(f.name)
                        value = file_to_paragraph(f, pattern_list, with_filter, limit, save_image)
                        if isinstance(value, list):
                            result = [*result, *value]
                        else:
                            result.append(value)
                    except Exception:
                        pass
            # Extract referenced images, then drop image files from the results
            image_list = get_image_list(result, files)
            result = filter_image_file(result, image_list)
            image_mode_list = []
            for image in image_list:
                with zip_ref.open(image.get('source_file')) as f:
                    i = File(
                        id=image.get('image_id'),
                        # NOTE(review): the sibling QA zip handler passes file_name= here;
                        # confirm which keyword the File model actually expects.
                        image_name=os.path.basename(image.get('source_file')),
                        meta={'debug': False, 'content': f.read()}  # content is the raw image bytes
                    )
                    image_mode_list.append(i)
            save_image(image_mode_list)
            return result

    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        # NOTE(review): the .ZIP branch is unreachable after lower(); kept as-is (doc-only change).
        if file_name.endswith(".zip") or file_name.endswith(".ZIP"):
            return True
        return False

    def get_content(self, file, save_image):
        # Zip archives have no single text representation.
        return ""

View File

@ -257,3 +257,9 @@ def post(post_function):
return inner
def parse_md_image(content: str):
    """Return every markdown image tag (``![alt](target)``) found in *content*."""
    # Raw string avoids invalid-escape-sequence warnings for \[ \] \( \).
    matches = re.finditer(r"!\[.*?\]\(.*?\)", content)
    image_list = [match.group() for match in matches]
    return image_list

View File

@ -32,3 +32,46 @@ class DocumentCreateAPI(APIMixin):
@staticmethod
def get_response():
return DocumentCreateResponse
class DocumentSplitAPI(APIMixin):
    """OpenAPI parameter schema for the document split endpoint."""

    @staticmethod
    def get_parameters():
        # Descriptions are user-facing API-doc strings and intentionally
        # left in Chinese (runtime data, not comments).
        return [
            OpenApiParameter(
                name="workspace_id",
                description="工作空间id",
                type=OpenApiTypes.STR,
                location='path',
                required=True,
            ),
            OpenApiParameter(
                name="file",
                description="文件",
                type=OpenApiTypes.BINARY,
                location='query',
                required=False,
            ),
            OpenApiParameter(
                name="limit",
                description="分段长度",
                type=OpenApiTypes.INT,
                location='query',
                required=False,
            ),
            OpenApiParameter(
                name="patterns",
                description="分段正则列表",
                type=OpenApiTypes.STR,
                location='query',
                required=False,
            ),
            OpenApiParameter(
                name="with_filter",
                description="是否清除特殊字符",
                type=OpenApiTypes.BOOL,
                location='query',
                required=False,
            ),
        ]

View File

@ -13,14 +13,34 @@ from rest_framework import serializers
from common.db.search import native_search
from common.event import ListenerManagement
from common.exception.app_exception import AppApiException
from common.handle.impl.csv_split_handle import CsvSplitHandle
from common.handle.impl.doc_split_handle import DocSplitHandle
from common.handle.impl.html_split_handle import HTMLSplitHandle
from common.handle.impl.pdf_split_handle import PdfSplitHandle
from common.handle.impl.text_split_handle import TextSplitHandle
from common.handle.impl.xls_split_handle import XlsSplitHandle
from common.handle.impl.xlsx_split_handle import XlsxSplitHandle
from common.handle.impl.zip_split_handle import ZipSplitHandle
from common.utils.common import post, get_file_content
from knowledge.models import Knowledge, Paragraph, Problem, Document, KnowledgeType, ProblemParagraphMapping, State, \
TaskType
TaskType, File
from knowledge.serializers.common import ProblemParagraphManage
from knowledge.serializers.paragraph import ParagraphSerializers, ParagraphInstanceSerializer
from knowledge.task import embedding_by_document
from maxkb.const import PROJECT_DIR
# Fallback handler used when no specialized handle claims the uploaded file.
default_split_handle = TextSplitHandle()
# Registered split handles, probed in order: the first one whose support()
# accepts the file performs the split, so the generic text handler must
# stay last as the catch-all.
split_handles = [
    HTMLSplitHandle(),
    DocSplitHandle(),
    PdfSplitHandle(),
    XlsxSplitHandle(),
    XlsSplitHandle(),
    CsvSplitHandle(),
    ZipSplitHandle(),
    default_split_handle
]
class DocumentInstanceSerializer(serializers.Serializer):
name = serializers.CharField(required=True, label=_('document name'), max_length=128, min_length=1)
@ -34,6 +54,17 @@ class DocumentCreateRequest(serializers.Serializer):
documents = DocumentInstanceSerializer(required=False, many=True)
class DocumentSplitRequest(serializers.Serializer):
    """Request payload for splitting uploaded files into paragraphs."""
    # Uploaded files to split.
    file = serializers.ListField(label=_('file list'), required=True)
    # Maximum characters per paragraph chunk.
    limit = serializers.IntegerField(label=_('limit'), required=False)
    # Regular expressions used as split boundaries.
    patterns = serializers.ListField(
        label=_('patterns'),
        required=False,
        child=serializers.CharField(required=True, label=_('patterns')),
    )
    # Whether to strip special characters during splitting.
    with_filter = serializers.BooleanField(label=_('Auto Clean'), required=False)
class DocumentSerializers(serializers.Serializer):
class Operate(serializers.Serializer):
document_id = serializers.UUIDField(required=True, label=_('document id'))
@ -177,3 +208,67 @@ class DocumentSerializers(serializers.Serializer):
document_model,
instance.get('paragraphs') if 'paragraphs' in instance else []
)
class Split(serializers.Serializer):
    """Validates an upload payload and splits each file into paragraph results."""
    workspace_id = serializers.CharField(required=True, label=_('workspace id'))
    knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id'))

    def is_valid(self, *, raise_exception=True):
        """Run field validation, then enforce the 100MB cap per uploaded file."""
        super().is_valid(raise_exception=True)
        # 'file' is not a declared field on this serializer, so it may be
        # absent from self.data; guard against None before iterating.
        for f in (self.data.get('file') or []):
            if f.size > 1024 * 1024 * 100:
                raise AppApiException(500, _(
                    'The maximum size of the uploaded file cannot exceed {}MB'
                ).format(100))

    def parse(self, instance):
        """Split every uploaded file in *instance* and return the flattened
        list of per-file paragraph results."""
        self.is_valid(raise_exception=True)
        # DRF serializers validate the `data=` keyword argument; passing the
        # payload positionally would bind it as the model instance, making
        # .is_valid() fail with an assertion error.
        DocumentSplitRequest(data=instance).is_valid(raise_exception=True)
        file_list = instance.get("file")
        # Flatten the per-file paragraph lists into a single list.
        return [
            paragraph
            for f in file_list
            for paragraph in self.file_to_paragraph(
                f,
                instance.get("patterns", None),
                instance.get("with_filter", None),
                instance.get("limit", 4096),
            )
        ]

    def save_image(self, image_list):
        """Persist images extracted during splitting that are not already stored."""
        if image_list is None or len(image_list) == 0:
            return
        exist_image_list = [str(i.get('id')) for i in
                            QuerySet(File).filter(id__in=[i.id for i in image_list]).values('id')]
        # Keep only new images, then de-duplicate by id within this batch.
        save_image_list = [image for image in image_list if str(image.id) not in exist_image_list]
        save_image_list = list({img.id: img for img in save_image_list}.values())
        for file in save_image_list:
            # Raw bytes travel in meta['content']; pop them so only metadata
            # remains on the model row before saving.
            file_bytes = file.meta.pop('content')
            file.workspace_id = self.data.get('workspace_id')
            file.meta['knowledge_id'] = self.data.get('knowledge_id')
            file.save(file_bytes)

    @staticmethod
    def _as_list(result):
        """Normalize a split handle's result to a list of paragraph results."""
        return result if isinstance(result, list) else [result]

    def file_to_paragraph(self, file, pattern_list: List, with_filter: bool, limit: int):
        """Split one file with the first registered handle that supports it.

        Falls back to the default text handle when no registered handle
        claims the file.
        """
        get_buffer = FileBufferHandle().get_buffer
        for split_handle in split_handles:
            if split_handle.support(file, get_buffer):
                return self._as_list(
                    split_handle.handle(file, pattern_list, with_filter, limit, get_buffer, self.save_image))
        return self._as_list(
            default_split_handle.handle(file, pattern_list, with_filter, limit, get_buffer, self.save_image))
class FileBufferHandle:
    """Lazily reads a file's bytes once and serves the cached copy thereafter."""
    buffer = None

    def get_buffer(self, file):
        # Read at most once per instance; later calls — even with a
        # different file object — return the first read's bytes.
        cached = self.buffer
        if cached is None:
            cached = file.read()
            self.buffer = cached
        return cached

View File

@ -8,5 +8,6 @@ urlpatterns = [
path('workspace/<str:workspace_id>/knowledge/base', views.KnowledgeBaseView.as_view()),
path('workspace/<str:workspace_id>/knowledge/web', views.KnowledgeWebView.as_view()),
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>', views.KnowledgeView.Operate.as_view()),
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/split', views.DocumentView.Split.as_view()),
path('workspace/<str:workspace_id>/knowledge/<int:current_page>/<int:page_size>', views.KnowledgeView.Page.as_view()),
]

View File

@ -1 +1,2 @@
from .knowledge import *
from .document import *

View File

@ -0,0 +1,70 @@
from django.utils.translation import gettext_lazy as _
from drf_spectacular.utils import extend_schema
from rest_framework.parsers import MultiPartParser
from rest_framework.request import Request
from rest_framework.views import APIView
from common.auth import TokenAuth
from common.auth.authentication import has_permissions
from common.constants.permission_constants import PermissionConstants, CompareConstants
from common.result import result
from knowledge.api.document import DocumentSplitAPI
from knowledge.api.knowledge import KnowledgeTreeReadAPI
from knowledge.serializers.document import DocumentSerializers
from knowledge.serializers.knowledge import KnowledgeSerializer
class DocumentView(APIView):
    """Document endpoints under a knowledge base."""
    authentication_classes = [TokenAuth]

    @extend_schema(
        methods=['GET'],
        description=_('Get document'),
        operation_id=_('Get document'),
        parameters=KnowledgeTreeReadAPI.get_parameters(),
        responses=KnowledgeTreeReadAPI.get_response(),
        tags=[_('Knowledge Base')]
    )
    @has_permissions(PermissionConstants.DOCUMENT_READ.get_workspace_permission())
    def get(self, request: Request, workspace_id: str):
        """List documents, filtered by the optional query parameters."""
        return result.success(KnowledgeSerializer.Query(
            data={
                'workspace_id': workspace_id,
                'folder_id': request.query_params.get('folder_id'),
                'name': request.query_params.get('name'),
                'desc': request.query_params.get("desc"),
                'user_id': request.query_params.get('user_id')
            }
        ).list())

    class Split(APIView):
        """Split uploaded files into paragraph previews without persisting them."""
        authentication_classes = [TokenAuth]
        parser_classes = [MultiPartParser]

        @extend_schema(
            methods=['POST'],
            description=_('Segmented document'),
            operation_id=_('Segmented document'),
            parameters=DocumentSplitAPI.get_parameters(),
            request=DocumentSplitAPI.get_request(),
            responses=DocumentSplitAPI.get_response(),
            tags=[_('Knowledge Base/Documentation')]
        )
        @has_permissions([
            PermissionConstants.DOCUMENT_CREATE.get_workspace_permission(),
            PermissionConstants.DOCUMENT_EDIT.get_workspace_permission(),
        ])
        def post(self, request: Request, workspace_id: str, knowledge_id: str):
            """Collect the multipart payload and delegate to the Split serializer."""
            split_data = {'file': request.FILES.getlist('file')}
            request_data = request.data
            # Only forward 'patterns' when a non-empty list was submitted.
            patterns = request_data.get('patterns')
            if 'patterns' in request_data and patterns is not None and len(patterns) > 0:
                split_data['patterns'] = request_data.getlist('patterns')
            if 'limit' in request_data:
                split_data['limit'] = request_data.get('limit')
            if 'with_filter' in request_data:
                split_data['with_filter'] = request_data.get('with_filter')
            return result.success(DocumentSerializers.Split(data={
                'workspace_id': workspace_id,
                'knowledge_id': knowledge_id,
            }).parse(split_data))

View File

@ -46,6 +46,12 @@ celery-once = "3.0.1"
beautifulsoup4 = "4.13.4"
html2text = "2025.4.15"
jieba = "0.42.1"
openpyxl = "3.1.5"
python-docx = "1.1.2"
xlrd = "2.0.1"
xlwt = "1.3.0"
pymupdf = "1.24.9"
pypdf = "4.3.1"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"