MaxKB/apps/dataset/task/tools.py
CaptainB bb6fd01200
Some checks are pending
sync2gitee / repo-sync (push) Waiting to run
Typos Check / Spell Check with Typos (push) Waiting to run
feat: 生成关联问题
--story=1016225 --user=刘瑞斌 【北区】在现有的分段基础上,加个 「生成问题」 ,支持调用大模型 生成问题并关联到分段。 https://www.tapd.cn/57709429/s/1589572
2024-10-09 17:46:33 +08:00

84 lines
3.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
"""
@project: MaxKB
@Author
@file tools.py
@date2024/8/20 21:48
@desc:
"""
import logging
import re
import traceback
from common.util.fork import ChildLink, Fork
from common.util.split_model import get_split_model
from dataset.models import Type, Document, Status
max_kb_error = logging.getLogger("max_kb_error")
max_kb = logging.getLogger("max_kb")
def get_save_handler(dataset_id, selector):
    """Build a callback that saves a crawled child page as a new dataset document.

    :param dataset_id: id of the dataset the created document belongs to
    :param selector: CSS selector used during the crawl; stored in the document meta
    :return: handler(child_link, response) callable for the fork crawler
    """
    # Imported lazily to avoid a circular import with the serializers module.
    from dataset.serializers.document_serializers import DocumentSerializers

    def handler(child_link: ChildLink, response: Fork.Response):
        if response.status == 200:
            try:
                # Prefer the link's anchor text as the document name; fall back to its URL.
                document_name = child_link.tag.text if child_link.tag is not None and len(
                    child_link.tag.text.strip()) > 0 else child_link.url
                paragraphs = get_split_model('web.md').parse(response.content)
                DocumentSerializers.Create(data={'dataset_id': dataset_id}).save(
                    {'name': document_name, 'paragraphs': paragraphs,
                     'meta': {'source_url': child_link.url, 'selector': selector},
                     'type': Type.web}, with_valid=True)
            except Exception as e:
                # Use the module-level logger (defined at top of file) instead of
                # re-resolving logging.getLogger("max_kb_error") on every failure.
                max_kb_error.error(f'{str(e)}:{traceback.format_exc()}')

    return handler
def get_sync_web_document_handler(dataset_id):
    """Build a callback that syncs a fetched web page into the dataset.

    On HTTP 200 the page is split into paragraphs and inserted as a document;
    on any other status an error-state Document record is saved so the failure
    is visible to the user.

    :param dataset_id: id of the dataset the document belongs to
    :return: handler(source_url, selector, response) callable
    """
    # Imported lazily to avoid a circular import with the serializers module.
    from dataset.serializers.document_serializers import DocumentSerializers

    def handler(source_url: str, selector, response: Fork.Response):
        if response.status == 200:
            try:
                paragraphs = get_split_model('web.md').parse(response.content)
                # Insert the document; name is truncated to the 128-char column limit.
                DocumentSerializers.Create(data={'dataset_id': dataset_id}).save(
                    {'name': source_url[0:128], 'paragraphs': paragraphs,
                     'meta': {'source_url': source_url, 'selector': selector},
                     'type': Type.web}, with_valid=True)
            except Exception as e:
                # Use the module-level logger instead of re-resolving it each call.
                max_kb_error.error(f'{str(e)}:{traceback.format_exc()}')
        else:
            # Fetch failed: record an empty document in error state so the
            # user can see (and retry) the failed URL.
            Document(name=source_url[0:128],
                     dataset_id=dataset_id,
                     meta={'source_url': source_url, 'selector': selector},
                     type=Type.web,
                     char_length=0,
                     status=Status.error).save()

    return handler
def save_problem(dataset_id, document_id, paragraph_id, problem):
    """Extract a generated question from model output and link it to a paragraph.

    The raw ``problem`` string is expected to contain the question wrapped in
    ``<question>...</question>`` tags, possibly prefixed with a list number
    (e.g. ``"1. <question>...</question>"``). If no tagged question is found,
    nothing is saved.

    :param dataset_id: id of the dataset containing the paragraph
    :param document_id: id of the document containing the paragraph
    :param paragraph_id: id of the paragraph to associate the question with
    :param problem: raw model output holding one generated question
    """
    # Imported lazily to avoid a circular import with the serializers module.
    from dataset.serializers.paragraph_serializers import ParagraphSerializers

    # Strip a leading ordinal like "1. " that models often prepend.
    problem = re.sub(r"^\d+\.\s*", "", problem)
    # Keep only the text inside the <question> tags; anything else is discarded.
    match = re.search(r"<question>(.*?)</question>", problem)
    problem = match.group(1) if match else None
    if problem is None or len(problem) == 0:
        return
    try:
        ParagraphSerializers.Problem(
            data={"dataset_id": dataset_id, 'document_id': document_id,
                  'paragraph_id': paragraph_id}).save(instance={"content": problem}, with_valid=True)
    except Exception as e:
        max_kb_error.error(f'关联问题失败: {e}')