refactor: reorganize file handling imports into a structured directory

This commit is contained in:
CaptainB 2025-04-30 16:06:30 +08:00
parent 2a5cd4ca14
commit 43bef216d5
13 changed files with 28 additions and 29 deletions

View File

@@ -9,12 +9,12 @@
import io
import os
import re
import uuid_utils.compat as uuid
import zipfile
from typing import List
from urllib.parse import urljoin
from django.db.models import QuerySet
import uuid_utils.compat as uuid
from django.utils.translation import gettext_lazy as _
from common.handle.base_parse_qa_handle import BaseParseQAHandle
from common.handle.impl.qa.csv_parse_qa_handle import CsvParseQAHandle
@@ -22,7 +22,6 @@ from common.handle.impl.qa.xls_parse_qa_handle import XlsParseQAHandle
from common.handle.impl.qa.xlsx_parse_qa_handle import XlsxParseQAHandle
from common.utils.common import parse_md_image
from knowledge.models import File
from django.utils.translation import gettext_lazy as _
class FileBufferHandle:

View File

@@ -15,7 +15,7 @@ class CsvSplitHandle(BaseParseTableHandle):
return True
return False
def handle(self, file, get_buffer,save_image):
def handle(self, file, get_buffer, save_image):
buffer = get_buffer(file)
try:
content = buffer.decode(detect(buffer)['encoding'])
@@ -41,4 +41,4 @@ class CsvSplitHandle(BaseParseTableHandle):
return buffer.decode(detect(buffer)['encoding'])
except BaseException as e:
max_kb.error(f'csv split handle error: {e}')
return f'error: {e}'
return f'error: {e}'

View File

@@ -78,7 +78,6 @@ class XlsxSplitHandle(BaseParseTableHandle):
return [{'name': file.name, 'paragraphs': []}]
return result
def get_content(self, file, save_image):
try:
# 加载 Excel 文件

View File

View File

@@ -10,10 +10,10 @@ import io
import os
import re
import traceback
import uuid_utils.compat as uuid
from functools import reduce
from typing import List
import uuid_utils.compat as uuid
from docx import Document, ImagePart
from docx.oxml import ns
from docx.table import Table
@@ -22,7 +22,6 @@ from docx.text.paragraph import Paragraph
from common.handle.base_split_handle import BaseSplitHandle
from common.utils.split_model import SplitModel
from knowledge.models import File
from django.utils.translation import gettext_lazy as _
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),

View File

@@ -70,4 +70,4 @@ class HTMLSplitHandle(BaseSplitHandle):
return html2text(content)
except BaseException as e:
traceback.print_exception(e)
return f'{e}'
return f'{e}'

View File

@@ -15,11 +15,11 @@ import traceback
from typing import List
import fitz
from django.utils.translation import gettext_lazy as _
from langchain_community.document_loaders import PyPDFLoader
from common.handle.base_split_handle import BaseSplitHandle
from common.utils.split_model import SplitModel
from django.utils.translation import gettext_lazy as _
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
@@ -42,6 +42,7 @@ def check_links_in_pdf(doc):
return True
return False
class PdfSplitHandle(BaseSplitHandle):
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
@@ -181,7 +182,8 @@ class PdfSplitHandle(BaseSplitHandle):
for text in split_text:
chapters.append({"title": real_chapter_title, "content": text})
else:
chapters.append({"title": real_chapter_title, "content": chapter_text if chapter_text else real_chapter_title})
chapters.append(
{"title": real_chapter_title, "content": chapter_text if chapter_text else real_chapter_title})
# 保存章节内容和章节标题
return chapters
@@ -336,4 +338,4 @@ class PdfSplitHandle(BaseSplitHandle):
return self.handle_pdf_content(file, pdf_document)
except BaseException as e:
traceback.print_exception(e)
return f'{e}'
return f'{e}'

View File

@@ -54,7 +54,7 @@ class TextSplitHandle(BaseSplitHandle):
def get_content(self, file, save_image):
buffer = file.read()
try:
return buffer.decode(detect(buffer)['encoding'])
return buffer.decode(detect(buffer)['encoding'])
except BaseException as e:
traceback.print_exception(e)
return f'{e}'
return f'{e}'

View File

@@ -18,13 +18,13 @@ from charset_normalizer import detect
from django.utils.translation import gettext_lazy as _
from common.handle.base_split_handle import BaseSplitHandle
from common.handle.impl.csv_split_handle import CsvSplitHandle
from common.handle.impl.doc_split_handle import DocSplitHandle
from common.handle.impl.html_split_handle import HTMLSplitHandle
from common.handle.impl.pdf_split_handle import PdfSplitHandle
from common.handle.impl.text_split_handle import TextSplitHandle
from common.handle.impl.xls_split_handle import XlsSplitHandle
from common.handle.impl.xlsx_split_handle import XlsxSplitHandle
from common.handle.impl.text.csv_split_handle import CsvSplitHandle
from common.handle.impl.text.doc_split_handle import DocSplitHandle
from common.handle.impl.text.html_split_handle import HTMLSplitHandle
from common.handle.impl.text.pdf_split_handle import PdfSplitHandle
from common.handle.impl.text.text_split_handle import TextSplitHandle
from common.handle.impl.text.xls_split_handle import XlsSplitHandle
from common.handle.impl.text.xlsx_split_handle import XlsxSplitHandle
from common.utils.common import parse_md_image
from knowledge.models import File

View File

@@ -13,14 +13,14 @@ from rest_framework import serializers
from common.db.search import native_search
from common.event import ListenerManagement
from common.exception.app_exception import AppApiException
from common.handle.impl.csv_split_handle import CsvSplitHandle
from common.handle.impl.doc_split_handle import DocSplitHandle
from common.handle.impl.html_split_handle import HTMLSplitHandle
from common.handle.impl.pdf_split_handle import PdfSplitHandle
from common.handle.impl.text_split_handle import TextSplitHandle
from common.handle.impl.xls_split_handle import XlsSplitHandle
from common.handle.impl.xlsx_split_handle import XlsxSplitHandle
from common.handle.impl.zip_split_handle import ZipSplitHandle
from common.handle.impl.text.csv_split_handle import CsvSplitHandle
from common.handle.impl.text.doc_split_handle import DocSplitHandle
from common.handle.impl.text.html_split_handle import HTMLSplitHandle
from common.handle.impl.text.pdf_split_handle import PdfSplitHandle
from common.handle.impl.text.text_split_handle import TextSplitHandle
from common.handle.impl.text.xls_split_handle import XlsSplitHandle
from common.handle.impl.text.xlsx_split_handle import XlsxSplitHandle
from common.handle.impl.text.zip_split_handle import ZipSplitHandle
from common.utils.common import post, get_file_content
from knowledge.models import Knowledge, Paragraph, Problem, Document, KnowledgeType, ProblemParagraphMapping, State, \
TaskType, File