fix: 修改分段正则,优化分段逻辑

This commit is contained in:
shaohuzhang1 2024-04-09 18:05:50 +08:00
parent b18da6b21e
commit 765c79ed9d
5 changed files with 55 additions and 28 deletions

View File

@ -15,10 +15,12 @@ from docx import Document
from common.handle.base_split_handle import BaseSplitHandle
from common.util.split_model import SplitModel
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<!#)## (?!#).*'),
re.compile("(?<!#)### (?!#).*"),
re.compile("(?<!#)#### (?!#).*"), re.compile("(?<!#)##### (?!#).*"),
re.compile("(?<!#)###### (?!#).*"), re.compile("(?<!\n)\n\n+")]
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
class DocSplitHandle(BaseSplitHandle):

View File

@ -14,10 +14,13 @@ import fitz
from common.handle.base_split_handle import BaseSplitHandle
from common.util.split_model import SplitModel
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<!#)## (?!#).*'),
re.compile("(?<!#)### (?!#).*"),
re.compile("(?<!#)#### (?!#).*"), re.compile("(?<!#)##### (?!#).*"),
re.compile("(?<!#)###### (?!#).*"), re.compile("(?<!\n)\n\n+")]
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*"),
re.compile("(?<!\n)\n\n+")]
def number_to_text(pdf_document, page_number):

View File

@ -14,10 +14,12 @@ from charset_normalizer import detect
from common.handle.base_split_handle import BaseSplitHandle
from common.util.split_model import SplitModel
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<!#)## (?!#).*'),
re.compile("(?<!#)### (?!#).*"),
re.compile("(?<!#)#### (?!#).*"), re.compile("(?<!#)##### (?!#).*"),
re.compile("(?<!#)###### (?!#).*"), re.compile("(?<!\n)\n\n+")]
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
class TextSplitHandle(BaseSplitHandle):

View File

@ -295,7 +295,7 @@ class SplitModel:
"""
if len(self.content_level_pattern) == index:
return
level_content_list = parse_title_level(text, self.content_level_pattern, index)
level_content_list = parse_title_level(text, self.content_level_pattern, 0)
cursor = 0
for i in range(len(level_content_list)):
block, cursor = get_level_block(text, level_content_list, i, cursor)
@ -313,10 +313,15 @@ class SplitModel:
if end_index == 0:
return level_content_list
other_content = text[0:end_index]
if len(other_content.strip()) > 0:
level_content_list = [*level_content_list, *list(
map(lambda row: to_tree_obj(row, 'block'),
post_handler_paragraph(other_content, with_filter=self.with_filter, limit=self.limit)))]
children = self.parse_to_tree(text=other_content,
index=index)
if len(children) > 0:
level_content_list = [*level_content_list, *children]
else:
if len(other_content.strip()) > 0:
level_content_list = [*level_content_list, *list(
map(lambda row: to_tree_obj(row, 'block'),
post_handler_paragraph(other_content, with_filter=self.with_filter, limit=self.limit)))]
else:
if len(text.strip()) > 0:
level_content_list = [*level_content_list, *list(
@ -330,15 +335,16 @@ class SplitModel:
:param text: 文本数据
:return: 解析后数据 {content:段落数据,keywords:[段落关键词],parent_chain:['段落父级链路']}
"""
result_tree = self.parse_to_tree(text.replace('\r', '\n'), 0)
text = text.replace('\r', '\n')
result_tree = self.parse_to_tree(text, 0)
result = result_tree_to_paragraph(result_tree, [], [])
# 过滤段落内容不为空字符串的数据
result = [item for item in result if 'content' in item and len(item.get('content').strip()) > 0]
return [self.post_reset_paragraph(item) for item in result]
return [item for item in [self.post_reset_paragraph(row) for row in result] if
'content' in item and len(item.get('content').strip()) > 0]
def post_reset_paragraph(self, paragraph: Dict):
result = self.filter_title_special_characters(paragraph)
result = self.sub_title(result)
result = self.content_is_null(result)
return result
@staticmethod
@ -349,6 +355,15 @@ class SplitModel:
return {**paragraph, 'title': title[0:255], 'content': title[255:len(title)] + paragraph.get('content')}
return paragraph
@staticmethod
def content_is_null(paragraph: Dict):
if 'title' in paragraph:
title = paragraph.get('title')
content = paragraph.get('content')
if (content is None or len(content.strip()) == 0) and (title is not None and len(title) > 0):
return {'title': '', 'content': title}
return paragraph
@staticmethod
def filter_title_special_characters(paragraph: Dict):
title = paragraph.get('title') if 'title' in paragraph else ''
@ -361,9 +376,12 @@ class SplitModel:
title_special_characters_list = ['#', '\n', '\r', '\\s']
default_split_pattern = {
'md': [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<!#)## (?!#).*'), re.compile("(?<!#)### (?!#).*"),
re.compile("(?<!#)#### (?!#).*"), re.compile("(?<!#)##### (?!#).*"),
re.compile("(?<!#)###### (?!#).*"), re.compile("(?<!\n)\n\n+")],
'md': [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")],
'default': [re.compile("(?<!\n)\n\n+")]
}

View File

@ -506,10 +506,12 @@ class DocumentSerializers(ApiMixin, serializers.Serializer):
class SplitPattern(ApiMixin, serializers.Serializer):
@staticmethod
def list():
return [{'key': "#", 'value': '(?<=^)# .*|(?<=\\n)# .*'}, {'key': '##', 'value': '(?<!#)## (?!#).*'},
{'key': '###', 'value': "(?<!#)### (?!#).*"}, {'key': '####', 'value': "(?<!#)#### (?!#).*"},
{'key': '#####', 'value': "(?<!#)##### (?!#).*"},
{'key': '######', 'value': "(?<!#)###### (?!#).*"},
return [{'key': "#", 'value': '(?<=^)# .*|(?<=\\n)# .*'},
{'key': '##', 'value': '(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'},
{'key': '###', 'value': "(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"},
{'key': '####', 'value': "(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"},
{'key': '#####', 'value': "(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"},
{'key': '######', 'value': "(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*"},
{'key': '-', 'value': '(?<! )- .*'},
{'key': '空格', 'value': '(?<!\\s)\\s(?!\\s)'},
{'key': '分号', 'value': '(?<!)(?!)'}, {'key': '逗号', 'value': '(?<!)(?!)'},