mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
refactor: reorganize file handling imports into a structured directory
This commit is contained in:
parent
2a5cd4ca14
commit
43bef216d5
|
|
@ -9,12 +9,12 @@
|
|||
import io
|
||||
import os
|
||||
import re
|
||||
import uuid_utils.compat as uuid
|
||||
import zipfile
|
||||
from typing import List
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from django.db.models import QuerySet
|
||||
import uuid_utils.compat as uuid
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
|
||||
from common.handle.base_parse_qa_handle import BaseParseQAHandle
|
||||
from common.handle.impl.qa.csv_parse_qa_handle import CsvParseQAHandle
|
||||
|
|
@ -22,7 +22,6 @@ from common.handle.impl.qa.xls_parse_qa_handle import XlsParseQAHandle
|
|||
from common.handle.impl.qa.xlsx_parse_qa_handle import XlsxParseQAHandle
|
||||
from common.utils.common import parse_md_image
|
||||
from knowledge.models import File
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
|
||||
|
||||
class FileBufferHandle:
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ class CsvSplitHandle(BaseParseTableHandle):
|
|||
return True
|
||||
return False
|
||||
|
||||
def handle(self, file, get_buffer,save_image):
|
||||
def handle(self, file, get_buffer, save_image):
|
||||
buffer = get_buffer(file)
|
||||
try:
|
||||
content = buffer.decode(detect(buffer)['encoding'])
|
||||
|
|
@ -41,4 +41,4 @@ class CsvSplitHandle(BaseParseTableHandle):
|
|||
return buffer.decode(detect(buffer)['encoding'])
|
||||
except BaseException as e:
|
||||
max_kb.error(f'csv split handle error: {e}')
|
||||
return f'error: {e}'
|
||||
return f'error: {e}'
|
||||
|
|
|
|||
|
|
@ -78,7 +78,6 @@ class XlsxSplitHandle(BaseParseTableHandle):
|
|||
return [{'name': file.name, 'paragraphs': []}]
|
||||
return result
|
||||
|
||||
|
||||
def get_content(self, file, save_image):
|
||||
try:
|
||||
# 加载 Excel 文件
|
||||
|
|
|
|||
|
|
@ -10,10 +10,10 @@ import io
|
|||
import os
|
||||
import re
|
||||
import traceback
|
||||
import uuid_utils.compat as uuid
|
||||
from functools import reduce
|
||||
from typing import List
|
||||
|
||||
import uuid_utils.compat as uuid
|
||||
from docx import Document, ImagePart
|
||||
from docx.oxml import ns
|
||||
from docx.table import Table
|
||||
|
|
@ -22,7 +22,6 @@ from docx.text.paragraph import Paragraph
|
|||
from common.handle.base_split_handle import BaseSplitHandle
|
||||
from common.utils.split_model import SplitModel
|
||||
from knowledge.models import File
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
|
||||
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
||||
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
|
||||
|
|
@ -70,4 +70,4 @@ class HTMLSplitHandle(BaseSplitHandle):
|
|||
return html2text(content)
|
||||
except BaseException as e:
|
||||
traceback.print_exception(e)
|
||||
return f'{e}'
|
||||
return f'{e}'
|
||||
|
|
@ -15,11 +15,11 @@ import traceback
|
|||
from typing import List
|
||||
|
||||
import fitz
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
from langchain_community.document_loaders import PyPDFLoader
|
||||
|
||||
from common.handle.base_split_handle import BaseSplitHandle
|
||||
from common.utils.split_model import SplitModel
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
|
||||
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
||||
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
|
||||
|
|
@ -42,6 +42,7 @@ def check_links_in_pdf(doc):
|
|||
return True
|
||||
return False
|
||||
|
||||
|
||||
class PdfSplitHandle(BaseSplitHandle):
|
||||
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
||||
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
|
||||
|
|
@ -181,7 +182,8 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
for text in split_text:
|
||||
chapters.append({"title": real_chapter_title, "content": text})
|
||||
else:
|
||||
chapters.append({"title": real_chapter_title, "content": chapter_text if chapter_text else real_chapter_title})
|
||||
chapters.append(
|
||||
{"title": real_chapter_title, "content": chapter_text if chapter_text else real_chapter_title})
|
||||
# 保存章节内容和章节标题
|
||||
return chapters
|
||||
|
||||
|
|
@ -336,4 +338,4 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
return self.handle_pdf_content(file, pdf_document)
|
||||
except BaseException as e:
|
||||
traceback.print_exception(e)
|
||||
return f'{e}'
|
||||
return f'{e}'
|
||||
|
|
@ -54,7 +54,7 @@ class TextSplitHandle(BaseSplitHandle):
|
|||
def get_content(self, file, save_image):
|
||||
buffer = file.read()
|
||||
try:
|
||||
return buffer.decode(detect(buffer)['encoding'])
|
||||
return buffer.decode(detect(buffer)['encoding'])
|
||||
except BaseException as e:
|
||||
traceback.print_exception(e)
|
||||
return f'{e}'
|
||||
return f'{e}'
|
||||
|
|
@ -18,13 +18,13 @@ from charset_normalizer import detect
|
|||
from django.utils.translation import gettext_lazy as _
|
||||
|
||||
from common.handle.base_split_handle import BaseSplitHandle
|
||||
from common.handle.impl.csv_split_handle import CsvSplitHandle
|
||||
from common.handle.impl.doc_split_handle import DocSplitHandle
|
||||
from common.handle.impl.html_split_handle import HTMLSplitHandle
|
||||
from common.handle.impl.pdf_split_handle import PdfSplitHandle
|
||||
from common.handle.impl.text_split_handle import TextSplitHandle
|
||||
from common.handle.impl.xls_split_handle import XlsSplitHandle
|
||||
from common.handle.impl.xlsx_split_handle import XlsxSplitHandle
|
||||
from common.handle.impl.text.csv_split_handle import CsvSplitHandle
|
||||
from common.handle.impl.text.doc_split_handle import DocSplitHandle
|
||||
from common.handle.impl.text.html_split_handle import HTMLSplitHandle
|
||||
from common.handle.impl.text.pdf_split_handle import PdfSplitHandle
|
||||
from common.handle.impl.text.text_split_handle import TextSplitHandle
|
||||
from common.handle.impl.text.xls_split_handle import XlsSplitHandle
|
||||
from common.handle.impl.text.xlsx_split_handle import XlsxSplitHandle
|
||||
from common.utils.common import parse_md_image
|
||||
from knowledge.models import File
|
||||
|
||||
|
|
@ -13,14 +13,14 @@ from rest_framework import serializers
|
|||
from common.db.search import native_search
|
||||
from common.event import ListenerManagement
|
||||
from common.exception.app_exception import AppApiException
|
||||
from common.handle.impl.csv_split_handle import CsvSplitHandle
|
||||
from common.handle.impl.doc_split_handle import DocSplitHandle
|
||||
from common.handle.impl.html_split_handle import HTMLSplitHandle
|
||||
from common.handle.impl.pdf_split_handle import PdfSplitHandle
|
||||
from common.handle.impl.text_split_handle import TextSplitHandle
|
||||
from common.handle.impl.xls_split_handle import XlsSplitHandle
|
||||
from common.handle.impl.xlsx_split_handle import XlsxSplitHandle
|
||||
from common.handle.impl.zip_split_handle import ZipSplitHandle
|
||||
from common.handle.impl.text.csv_split_handle import CsvSplitHandle
|
||||
from common.handle.impl.text.doc_split_handle import DocSplitHandle
|
||||
from common.handle.impl.text.html_split_handle import HTMLSplitHandle
|
||||
from common.handle.impl.text.pdf_split_handle import PdfSplitHandle
|
||||
from common.handle.impl.text.text_split_handle import TextSplitHandle
|
||||
from common.handle.impl.text.xls_split_handle import XlsSplitHandle
|
||||
from common.handle.impl.text.xlsx_split_handle import XlsxSplitHandle
|
||||
from common.handle.impl.text.zip_split_handle import ZipSplitHandle
|
||||
from common.utils.common import post, get_file_content
|
||||
from knowledge.models import Knowledge, Paragraph, Problem, Document, KnowledgeType, ProblemParagraphMapping, State, \
|
||||
TaskType, File
|
||||
|
|
|
|||
Loading…
Reference in New Issue