mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-27 20:42:52 +00:00
feat: refine regex patterns in text_split_handle for improved comment detection
--bug=1057526 --user=刘瑞斌 【知识库】markdown文件导入知识库,分段详情中代码块展示异常 https://www.tapd.cn/62980211/s/1719131
This commit is contained in:
parent
56fe631ed6
commit
e24a2001c5
|
|
@@ -15,12 +15,14 @@ from charset_normalizer import detect
|
|||
from common.handle.base_split_handle import BaseSplitHandle
|
||||
from common.utils.split_model import SplitModel
|
||||
|
||||
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
||||
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
|
||||
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
|
||||
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
|
||||
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
|
||||
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
|
||||
default_pattern_list = [
|
||||
re.compile('(?<=^)# (?!-\\*- coding:).*|(?<=\\n)# (?!-\\*- coding:).*'),
|
||||
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
|
||||
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
|
||||
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
|
||||
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
|
||||
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")
|
||||
]
|
||||
|
||||
|
||||
class TextSplitHandle(BaseSplitHandle):
|
||||
|
|
@@ -45,11 +47,8 @@ class TextSplitHandle(BaseSplitHandle):
|
|||
try:
|
||||
content = buffer.decode(detect(buffer)['encoding'])
|
||||
except BaseException as e:
|
||||
return {'name': file.name,
|
||||
'content': []}
|
||||
return {'name': file.name,
|
||||
'content': split_model.parse(content)
|
||||
}
|
||||
return {'name': file.name, 'content': []}
|
||||
return {'name': file.name, 'content': split_model.parse(content)}
|
||||
|
||||
def get_content(self, file, save_image):
|
||||
buffer = file.read()
|
||||
|
|
|
|||
Loading…
Reference in New Issue