From 9f889d8806ebfc973d697ae1fd97540353cb7df6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=B8=8D=E5=81=9A=E4=BA=86=E7=9D=A1=E5=A4=A7=E8=A7=89?=
 <64798754+stakeswky@users.noreply.github.com>
Date: Thu, 9 Nov 2023 11:52:53 +0800
Subject: [PATCH] Create Python API (#457)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 更新镜像

* 更新镜像信息

* 更新镜像信息

* Create openai_api.py

* Create requirements.txt

* Create README.md

* 添加python接口

* Delete python directory

* Create README.md

* Create Python API

* 文件结构化

* 文件结构化
---
 python/api/README.md              | 54 ++++++++++++++++++
 python/api/api.py                 | 74 ++++++++++++++++++++++++
 python/api/dockerfile             | 20 +++++++
 python/api/main.py                | 48 ++++++++++++++++
 python/api/requirements.txt       | 14 +++++
 python/api/services/fetch.py      | 90 ++++++++++++++++++++++++++++++
 python/api/services/office2txt.py | 93 +++++++++++++++++++++++++++++++
 python/api/test/fetch_test.py     | 25 +++++++++
 python/api/test/office_test.py    | 49 ++++++++++++++++
 9 files changed, 467 insertions(+)
 create mode 100644 python/api/README.md
 create mode 100644 python/api/api.py
 create mode 100644 python/api/dockerfile
 create mode 100644 python/api/main.py
 create mode 100644 python/api/requirements.txt
 create mode 100644 python/api/services/fetch.py
 create mode 100644 python/api/services/office2txt.py
 create mode 100644 python/api/test/fetch_test.py
 create mode 100644 python/api/test/office_test.py

diff --git a/python/api/README.md b/python/api/README.md
new file mode 100644
index 000000000..0c1e0434e
--- /dev/null
+++ b/python/api/README.md
@@ -0,0 +1,54 @@
+# FastGPT-python-API
+作者：stakeswky。有问题请这样联系我：stakeswky@gmail.com
+## 1. 项目简介
+该API以python为技术栈，为fastgpt提供了一个简单易用的接口，方便fastgpt处理各种任务。该API的主要功能包括：
+1. Word & PDF 图文提取
+在现有的文件读取中，fastgpt只能读取文件中的文字，而无法读取图片。该API可以提取word和pdf中的文字，并通过百度OCR识别其中图片里的文字，方便fastgpt进行处理。
+
+2. 网页递归获取
+该API可以获取指定页面的内容，并递归抓取该页面中链接所指向页面的内容。请注意：该功能目前仅支持静态页面，动态页面的内容可能无法获取。
+
+3. （研发中。。）
+
+## 2. 安装方法
+### 必要的知识
+会使用Google  
+python的基本用法  
+docker的基本用法  
+百度OCR-API的文档：https://ai.baidu.com/ai-doc/OCR/Ek3h7xypm
+
+### 2.1 源码安装
+该API依赖于python3.8，请确保您的python版本符合要求。
+```shell
+pip install -r requirements.txt
+```
+设置环境变量（百度OCR的凭据）：APP_ID、API_KEY、SECRET_KEY
+
+然后运行：
+```shell
+python main.py
+```
+启动！
+
+### 2.2 Docker安装
+一把梭拉现成的镜像，直接拉下来用就行了。
+```shell
+docker pull registry.cn-hangzhou.aliyuncs.com/fastgpt_docker/fastgpt_python_api:1.0
+```
+然后运行,三个环境变量记得配置成自己的：
+```shell
+docker run -d -p 6010:6010 -e APP_ID=<your_app_id> -e API_KEY=<your_api_key> -e SECRET_KEY=<your_secret_key> registry.cn-hangzhou.aliyuncs.com/fastgpt_docker/fastgpt_python_api:1.0
+```
+
+或者你也可以自己打镜像
+```shell
+docker build -t fastgpt-python-api .
+```
+然后运行：
+```shell
+docker run -d -p 6010:6010 -e APP_ID=<your_app_id> -e API_KEY=<your_api_key> -e SECRET_KEY=<your_secret_key> fastgpt-python-api
+```
+## 3. 使用方法
+test 目录下附带了两个测试案例：office_test.py（word文档的图文提取）和 fetch_test.py（网页递归获取），可参照它们来调用接口。
+
+
diff --git a/python/api/api.py b/python/api/api.py
new file mode 100644
index 000000000..f9aad4eca
--- /dev/null
+++ b/python/api/api.py
@@ -0,0 +1,74 @@
+import os
+from fastapi import FastAPI, File, UploadFile
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+from services.office2txt import office_to_txt
+from typing import List
+from fastapi import HTTPException
+from services.fetch import get_summary
+import aiofiles
+import queue
+import uuid
+
+
+# Request model for /generate_summary/: the start URL and the crawl depth.
+class SummaryRequest(BaseModel):
+    url: str
+    level: int  # process_summary rejects negative values with HTTP 400
+
+# Response model: one entry per summarized page.
+class SummaryResponse(BaseModel):
+    url: str
+    title: str
+    summary: str
+
+class ExtractedText(BaseModel):  # response model: the text extracted from an uploaded file
+    text: str
+
+
+async def process_file(file: UploadFile):
+    """Save the upload to a uniquely named temp file, extract its text, then delete the file.
+
+    Returns {"text": ...} on success, or a JSONResponse carrying an "error"
+    message with status 400 (unsupported extension) or 500 (processing failure).
+    """
+    file_ext = os.path.splitext(file.filename or "")[1].lower()  # tolerate a missing filename
+    if file_ext not in ['.docx', '.pdf', '.doc', '.txt']:
+        return JSONResponse(content={"error": "Unsupported file format"}, status_code=400)
+    # A random name keeps concurrent uploads from overwriting each other on disk.
+    unique_filename = f"{uuid.uuid4()}{file_ext}"
+    try:
+        # Stream the upload to disk in 64 KiB chunks rather than 1 KiB at a time.
+        async with aiofiles.open(unique_filename, "wb") as out_file:
+            while True:
+                contents = await file.read(65536)
+                if not contents:
+                    break
+                await out_file.write(contents)
+        # NOTE: office_to_txt is a plain synchronous call, so the event loop waits on it.
+        extracted_text = office_to_txt(unique_filename)
+        return {"text": extracted_text}
+    except Exception as e:
+        return JSONResponse(content={"error": str(e)}, status_code=500)
+    finally:
+        # Always remove the temporary file, whether extraction succeeded or not.
+        if os.path.exists(unique_filename):
+            os.remove(unique_filename)
+
+
+
+async def process_summary(request):
+    """Crawl request.url to depth request.level and return a list of SummaryResponse; errors become HTTP 400/500."""
+    if request.level < 0:
+        raise HTTPException(status_code=400, detail="Level must be non-negative.")
+    try:
+        # get_summary is called synchronously (not awaited) and yields (url, title, summary) tuples.
+        summaries = get_summary(request.url, request.level)
+        # Print the raw tuples, then convert them into the list of response models.
+        print(summaries)
+        return [SummaryResponse(url=url, title=title, summary=summary) for url, title, summary in summaries]
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+
diff --git a/python/api/dockerfile b/python/api/dockerfile
new file mode 100644
index 000000000..8d6fa54fb
--- /dev/null
+++ b/python/api/dockerfile
@@ -0,0 +1,20 @@
+# Use the official Python runtime as the base image
+FROM python:3.8
+
+# Set the working directory
+WORKDIR /app
+
+# Copy the current directory into /app in the image
+ADD . /app
+
+# pdf2image shells out to poppler's pdftoppm, so install poppler-utils for PDF support
+RUN apt-get update && apt-get install -y --no-install-recommends poppler-utils && rm -rf /var/lib/apt/lists/*
+# Upgrade pip, install the Python dependencies, then fetch the NLTK punkt tokenizer data
+RUN pip install --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple pip
+RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --no-cache-dir -r requirements.txt
+RUN python -m nltk.downloader punkt
+
+# Port the service listens on
+EXPOSE 6010
+# Start the API server (main.py) when the container runs
+CMD ["python", "main.py"]
diff --git a/python/api/main.py b/python/api/main.py
new file mode 100644
index 000000000..bd6133431
--- /dev/null
+++ b/python/api/main.py
@@ -0,0 +1,48 @@
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi import FastAPI, File, UploadFile
+import queue
+from typing import List
+from api import SummaryRequest, SummaryResponse, ExtractedText,process_file,process_summary
+import uvicorn
+
+app = FastAPI()
+# CORS: accept any origin, method and header, with credentials allowed.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Module-level FIFO queue; both endpoints below put their input on it and take an item straight back.
+q = queue.Queue()
+
+# Endpoint: accept an uploaded file and return the text extracted from it.
+@app.post("/extract_text/", response_model=ExtractedText)
+async def extract_text(file: UploadFile = File(...)):
+    # Put the upload on the shared FIFO queue ...
+    q.put(file)
+    # ... and take an item straight back off it (there is no await between put and get).
+    file = q.get()
+    result = await process_file(file)
+    # Mark the queued item as done.
+    q.task_done()
+    # Return the processing result.
+    return result
+
+# Endpoint: accept a summary request and return one summary per crawled page.
+@app.post("/generate_summary/", response_model=List[SummaryResponse])
+async def generate_summary(request: SummaryRequest):
+    # Put the request on the shared FIFO queue ...
+    q.put(request)
+    # ... and take an item straight back off it (there is no await between put and get).
+    request = q.get()
+    result = await process_summary(request)
+    # Mark the queued item as done; skipped if process_summary raised.
+    q.task_done()
+    # Return the processing result.
+    return result
+
+if __name__ == "__main__":  # run the server only when executed as a script
+    uvicorn.run(app, host="0.0.0.0", port=6010)  # bind to all interfaces on port 6010
\ No newline at end of file
diff --git a/python/api/requirements.txt b/python/api/requirements.txt
new file mode 100644
index 000000000..0b7a34f06
--- /dev/null
+++ b/python/api/requirements.txt
@@ -0,0 +1,14 @@
+aiofiles==23.2.1
+baidu_aip==4.16.12
+beautifulsoup4==4.11.1
+fastapi==0.104.1
+nltk==3.8.1
+pdf2image==1.16.3
+pydantic==1.10.7
+PyPDF2==3.0.1
+python_docx==0.8.11
+python_pptx==0.6.21
+Requests==2.31.0
+uvicorn==0.24.0
+chardet==5.2.0
+python-multipart==0.0.6
\ No newline at end of file
diff --git a/python/api/services/fetch.py b/python/api/services/fetch.py
new file mode 100644
index 000000000..92df68d77
--- /dev/null
+++ b/python/api/services/fetch.py
@@ -0,0 +1,90 @@
+import requests
+import bs4
+import nltk
+from urllib.parse import urljoin
+from time import sleep
+import time
+import math
+
+# Timestamp taken once, when this module is imported (not once per request).
+start_time = time.time()
+
+# Intended maximum run time in seconds; no active code in this module reads it.
+max_run_time = 20
+
+class SimpleIDFCalculator:
+    """Running document-frequency table used to compute smoothed IDF weights."""
+    def __init__(self):
+        self.doc_freq = {}  # token -> number of added documents containing it
+        self.num_docs = 0  # number of documents added so far
+
+    def add_document(self, doc):
+        """Tokenize *doc* and count each distinct token once for this document."""
+        self.num_docs += 1
+        for word in set(nltk.word_tokenize(doc)):
+            self.doc_freq[word] = self.doc_freq.get(word, 0) + 1
+
+    def idf(self, word):
+        """Return ln((1 + N) / (1 + df)) + 1, which is >= 1 and defined even when N == 0."""
+        # The unsmoothed ln(N / (1 + df)) is negative for any token present in every
+        # document (always the case for a single page) and raises ValueError when N == 0.
+        return math.log((1 + self.num_docs) / (1 + self.doc_freq.get(word, 0))) + 1
+
+
+
+def get_summary(url, level):
+    """Crawl from *url*, following links for *level* levels (1 = the start page only); return (url, title, summary) tuples."""
+    result = []  # filled in place by helper()
+    visited = set()  # URLs already attempted during this crawl
+    idf_calculator = SimpleIDFCalculator()  # shared across every page of this crawl
+    helper(url, level, result, visited, idf_calculator)
+    return result
+
+def helper(url, level, result, visited, idf_calculator):
+    """Depth-first crawl: summarize *url*, then recurse into its links with level - 1.
+
+    Appends one (url, title, summary) tuple per fetched page to *result* and records
+    each URL in *visited*. Returns None; any exception is caught and printed.
+    """
+    from collections import Counter  # local imports keep the module import block untouched
+    from urllib.parse import urldefrag
+    if level == 0 or url in visited or not url.startswith("http"):
+        return
+    visited.add(url)
+    try:
+        response = requests.get(url, timeout=10)  # without a timeout a stalled server hangs the crawl
+        if response.status_code != 200:
+            return
+        soup = bs4.BeautifulSoup(response.text, "html.parser")
+        # .string is None when <title> is empty or has nested tags; fall back so title is always a str.
+        title = (soup.title.string if soup.title else None) or 'No Title'
+        text = soup.get_text().strip()
+        idf_calculator.add_document(text)
+        sentences = nltk.sent_tokenize(text)
+        words = nltk.word_tokenize(text)
+        counts = Counter(words)  # one pass instead of a words.count() scan per token
+
+        scores = {}
+        for sentence in sentences:
+            for word in nltk.word_tokenize(sentence):
+                tf = counts[word] / len(words)
+                idf = idf_calculator.idf(word)
+                scores[sentence] = scores.get(sentence, 0) + (tf * idf)
+
+        # The summary is the ten highest-scoring sentences, best first.
+        summary = " ".join(sorted(scores, key=scores.get, reverse=True)[:10])
+        result.append((url, title, summary))
+
+        sleep(1)  # Simple delay to prevent aggressive crawling
+        for link in soup.find_all("a"):
+            href = link.get("href")
+            if href:
+                # Resolve relative links and drop #fragments so one page is not fetched twice.
+                next_url = urldefrag(urljoin(url, href)).url
+                helper(next_url, level - 1, result, visited, idf_calculator)
+    except Exception as e:
+        print(f"Error processing {url}: {e}")
+
+# # 主程序部分，仅作为函数调用示例：
+# summary = get_summary('https://zhihu.com', 2)
+# print(summary)
diff --git a/python/api/services/office2txt.py b/python/api/services/office2txt.py
new file mode 100644
index 000000000..5d9b0ae7f
--- /dev/null
+++ b/python/api/services/office2txt.py
@@ -0,0 +1,93 @@
+import os
+import docx
+from aip import AipOcr
+from io import BytesIO
+from PyPDF2 import PdfReader
+from pdf2image import convert_from_path
+
+
+# Baidu OCR API credentials, read from the environment; 'xxx' is only a placeholder default.
+APP_ID = os.environ.get('APP_ID','xxx')
+API_KEY = os.environ.get('API_KEY','xxx')
+SECRET_KEY = os.environ.get('SECRET_KEY','xxx')
+
+
+# Single module-level OCR client, created at import time and used by ocr_image().
+client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
+
+def ocr_image(image_data):  # image bytes -> recognized text, one OCR result per line
+    result = client.basicGeneral(image_data)
+    text = ''
+    if 'words_result' in result:  # without this key the function returns ''
+        for item in result['words_result']:
+            text += item['words'] + '\n'
+    return text
+
+def process_pdf(file_path):  # returns one string: per page, the extracted text followed by OCR text
+    pdf = PdfReader(file_path)
+    num_pages = len(pdf.pages)
+    text = ''
+    for page_num in range(num_pages):
+        page = pdf.pages[page_num]
+        text += f'--------------------------------------------\n'  # page header: separator, file name, page number
+        text += f'文档名：{os.path.basename(file_path)}\n'
+        text += f'页数：{page_num + 1}\n'
+        text += f'该页内容：\n'
+        text += page.extract_text() + '\n'  # text PyPDF2 extracts from the page
+        images = convert_from_path(file_path, first_page=page_num + 1, last_page=page_num + 1)  # render only this page (1-based)
+        for image in images:
+            image_data = BytesIO()
+            image.save(image_data, format='PNG')  # encode the rendered page as PNG bytes
+            image_data = image_data.getvalue()
+            ocr_text = ocr_image(image_data)
+            if ocr_text:  # add the OCR section only when text was recognized
+                text += f'图片文字：\n'
+                text += ocr_text + '\n'
+        text += '--------------------------------------------\n'
+    return text
+
+def process_doc(file_path):
+    """Return the paragraph text of a .docx file followed by the OCR text of its inline pictures."""
+    doc = docx.Document(file_path)
+    text = ''
+    page_num = 1
+    for paragraph in doc.paragraphs:
+        if paragraph.text.strip() == '':  # crude heuristic: an empty paragraph counts as a page break
+            page_num += 1
+        else:
+            text += f'--------------------------------------------\n'
+            text += f'文档名：{os.path.basename(file_path)}\n'
+            text += f'页数：{page_num}\n'
+            text += f'该页内容：\n'
+            text += paragraph.text + '\n'
+    # Kept outside the paragraph loop so each picture is sent to OCR exactly once.
+    for shape in doc.inline_shapes:
+        if shape.type == docx.enum.shape.WD_INLINE_SHAPE.PICTURE:
+            blip_id = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
+            image_part = doc.part.related_parts[blip_id]
+            image_data = image_part.blob
+            ocr_text = ocr_image(image_data)
+            if ocr_text:
+                text += f'图片文字：\n'
+                text += ocr_text + '\n'
+    return text
+
+def process_txt(file_path):
+    """Return the whole content of the UTF-8 text file at *file_path*."""
+    with open(file_path, encoding='utf-8') as source:
+        return source.read()
+
+def office_to_txt(file_path):
+    """Extract text from *file_path*, choosing the extractor by file extension.
+
+    Raises ValueError('Unsupported file format') for any other extension.
+    """
+    extension = os.path.splitext(file_path)[1].lower()
+    if extension in ('.docx', '.doc'):
+        return process_doc(file_path)
+    if extension == '.pdf':
+        return process_pdf(file_path)
+    if extension == '.txt':
+        return process_txt(file_path)
+    raise ValueError('Unsupported file format')
+    
diff --git a/python/api/test/fetch_test.py b/python/api/test/fetch_test.py
new file mode 100644
index 000000000..86361b5e3
--- /dev/null
+++ b/python/api/test/fetch_test.py
@@ -0,0 +1,25 @@
+import requests
+
+# URL of the summary endpoint
+api_url = "http://127.0.0.1:6010/generate_summary/"
+
+# Request payload: start URL and crawl depth
+data = {
+    "url": "https://bing.com",
+    "level": 1
+}
+
+# Send the POST request
+response = requests.post(api_url, json=data)
+
+# Check the response status
+if response.status_code == 200:
+    # Success: print every returned summary
+    summaries = response.json()
+    for summary in summaries:
+        print(f"URL: {summary['url']}")
+        print(f"Title: {summary['title']}")
+        print(f"Summary: {summary['summary']}\n")
+else:
+    # Failure: print the status code and the response body
+    print(f"Failed to generate summary with status code {response.status_code}: {response.text}")
diff --git a/python/api/test/office_test.py b/python/api/test/office_test.py
new file mode 100644
index 000000000..1a7d67f15
--- /dev/null
+++ b/python/api/test/office_test.py
@@ -0,0 +1,49 @@
+import requests
+import pytest
+from docx import Document
+import os
+from tempfile import NamedTemporaryFile
+from docx.shared import Inches
+
+image_path = os.path.join(os.path.dirname(__file__), "test.png")  # picture expected next to this test file
+
+def create_test_docx_with_image():
+    """Create a .docx holding one paragraph and the test.png picture; return its path."""
+    # Reserve a unique .docx path; delete=False keeps the file once the handle is closed.
+    temp_file = NamedTemporaryFile(delete=False, suffix='.docx')
+    # Close the handle before saving: on Windows a named temporary file cannot be
+    # opened a second time while it is still open.
+    temp_file.close()
+    # Build the document: one paragraph followed by one picture.
+    doc = Document()
+    doc.add_paragraph("This is a test document with an image.")
+    # image_path must point at an existing picture; it is inserted 1.25 inches wide.
+    doc.add_picture(image_path, width=Inches(1.25))
+    # Write the document to the reserved path.
+    doc.save(temp_file.name)
+    # The caller deletes the file when it is done with it.
+    return temp_file.name
+
+# Create and upload several Word documents; return the list of response objects.
+def get_responses():
+    responses = []
+    # Create and send 10 documents
+    for _ in range(10):
+        test_file_path = create_test_docx_with_image()
+        with open(test_file_path, "rb") as f:
+            files = {"file": (os.path.basename(test_file_path), f, "application/vnd.openxmlformats-officedocument.wordprocessingml.document")}
+            response = requests.post("http://127.0.0.1:6010/extract_text/", files=files)
+            responses.append(response)
+        # Delete the temporary document once it has been uploaded
+        os.unlink(test_file_path)
+    return responses
+
+# Check every response; get_responses() runs when this decorator is evaluated, i.e. at import time.
+@pytest.mark.parametrize("response", get_responses())
+def test_response(response):
+    # The status code must be 200
+    assert response.status_code == 200
+    # The content type must be application/json
+    assert "application/json" in response.headers["Content-Type"]
+    # The JSON body must contain the "text" field
+    assert "text" in response.json()