实践-并发调用qwen-openai格式http请求
·
1、用 ms-swift 启动模型服务(提供 OpenAI 格式的接口)
2、用 Python 的 requests 库实现 HTTP 调用
3、用线程池(concurrent.futures)并发地同时发起多个请求
import requests
class Qwen:
    """Connection settings for the Qwen2.5-72B-Instruct chat endpoint."""

    # Sent as the Bearer token; 'EMPTY' is a placeholder value
    # (presumably the local server does not validate it -- confirm).
    api_key = 'EMPTY'
    # Chat-completions URL on localhost, port 8001.
    base_url = "http://127.0.0.1:8001/v1/chat/completions"
    # Value placed in the request body's "model" field.
    name = 'qwen2_5-72b-instruct'
class Llama:
    """Connection settings for the Llama3-70B-Instruct chat endpoint."""

    # Sent as the Bearer token; 'EMPTY' is a placeholder value
    # (presumably the local server does not validate it -- confirm).
    api_key = 'EMPTY'
    # Chat-completions URL on localhost, port 8001.
    base_url = "http://127.0.0.1:8001/v1/chat/completions"
    # Value placed in the request body's "model" field.
    name = 'llama3-70b-instruct'
def chat_with_gpt(api_key, messages, model="gpt-3.5-turbo", temperature=0.3, stream=False, url="https://api.openai.com/v1/chat/completions", timeout=None):
    """
    Send one chat-completions request to an OpenAI-format HTTP endpoint.

    Args:
        api_key (str): Key sent in the ``Authorization: Bearer`` header.
        messages (list): Message list, e.g.
            ``[{"role": "user", "content": "your message"}]``.
        model (str): Model name placed in the request body.
            Defaults to "gpt-3.5-turbo".
        temperature (float): Sampling temperature. Defaults to 0.3.
        stream (bool): Whether to request streamed output. Defaults to False.
        url (str): Endpoint URL. Defaults to
            "https://api.openai.com/v1/chat/completions".
        timeout (float | tuple | None): Forwarded to ``requests.post``.
            The default ``None`` keeps the previous behaviour of waiting
            without a limit.

    Returns:
        If ``stream`` is False, the decoded JSON body of the response.
        If ``stream`` is True, a generator yielding the response line by line.
    """
    import json

    # Request headers.
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    # Request body.
    data = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "stream": stream
    }

    # Debug output. The Authorization value is masked so the API key is
    # never written to stdout or captured logs.
    printable_headers = dict(headers, Authorization="Bearer ***")
    print('header', json.dumps(printable_headers, ensure_ascii=False))
    print('data', json.dumps(data, ensure_ascii=False))

    if stream:
        # Streaming request: hand the open response to the line generator.
        response = requests.post(url, headers=headers, json=data, stream=True, timeout=timeout)
        return _handle_stream_response(response)

    # Non-streaming request: return the parsed JSON body.
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    return response.json()
def _handle_stream_response(response):
    """
    Yield the non-empty lines of a streamed HTTP response as text.

    Args:
        response: A ``requests.Response``-like object exposing
            ``iter_lines()`` and ``close()``.

    Yields:
        str: Each non-empty line, decoded as UTF-8.
    """
    try:
        for raw_line in response.iter_lines():
            # Empty lines carry no payload; skip them.
            if raw_line:
                yield raw_line.decode("utf-8")
    finally:
        # Release the connection whether the stream was fully consumed,
        # the consumer stopped early, or decoding raised.
        response.close()
# Single-turn prompt; referenced by the disabled sequential experiment below.
message = [{"role": "user", "content": "你是谁"}]
import time
# Disabled sequential baseline: time one request to Qwen, then one to Llama,
# one after the other, printing the elapsed seconds for each.
# t1 = time.time()
# r1 = chat_with_gpt(Qwen.api_key, message, model=Qwen.name,url=Qwen.base_url)
# t2 = time.time()
# print(t2-t1)
#
# t1 = time.time()
# r2 = chat_with_gpt(Llama.api_key, message, model=Llama.name,url=Llama.base_url)
# t2 = time.time()
# print(t2-t1)
# print(r2)
#print(r2)
def chat_with_model(model_class, message):
    """
    Send one request to the given model and time it.

    Helper for the concurrent benchmark: it is the callable submitted to the
    thread pool.

    Args:
        model_class: A settings class (such as ``Qwen`` or ``Llama``)
            exposing ``api_key``, ``name`` and ``base_url``.
        message (list): Message list, e.g.
            ``[{"role": "user", "content": "your message"}]``.

    Returns:
        tuple: ``(response, seconds)`` where ``response`` is whatever
        ``chat_with_gpt`` returned and ``seconds`` is the elapsed time
        rounded to 4 decimal places.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    response = chat_with_gpt(model_class.api_key, message, model=model_class.name, url=model_class.base_url)
    elapsed = time.perf_counter() - started
    print(f"Time taken for {model_class.name}: {elapsed} seconds")
    return response, round(elapsed, 4)
import concurrent.futures
# Prompts to send; each entry is one complete message list. Edit individual
# entries to benchmark different prompts.
messages = [
    [{"role": "user", "content": "你是谁"}],
    [{"role": "user", "content": "你是谁"}],
    [{"role": "user", "content": "你是谁"}],
    [{"role": "user", "content": "你是谁"}],
    [{"role": "user", "content": "你是谁"}],
    [{"role": "user", "content": "你是谁"}],
    [{"role": "user", "content": "你是谁"}],
    [{"role": "user", "content": "你是谁"}],
    [{"role": "user", "content": "你是谁"}],
    [{"role": "user", "content": "你是谁"}],
]
# Models to benchmark.
models = [Llama]
# Per-request elapsed seconds, appended in completion order.
durations = []

# One task per (model, prompt) pair.
tasks = [(model, message) for model in models for message in messages]

# Size the pool to the number of tasks so every request is in flight at the
# same time; the default pool size depends on the CPU count and may be smaller.
# max(1, ...) keeps the executor constructible when there are no tasks.
with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, len(tasks))) as executor:
    # Map each future back to the (model, prompt) pair that produced it.
    futures = {
        executor.submit(chat_with_model, model, message): (model, message)
        for model, message in tasks
    }
    # Report results as soon as each request finishes.
    for future in concurrent.futures.as_completed(futures):
        model, message = futures[future]
        try:
            response, duration = future.result()
        except Exception as e:
            # A failed request is reported and skipped so the rest still run.
            print(f"Error occurred with {model.name} and message {message}: {e}")
        else:
            durations.append(duration)
            print(f"Response from {model.name} with message {message}: {response}")

# Durations (seconds) noted from earlier runs, apparently one list per model:
# llama
#[1.487, 5.2435, 6.6109, 7.9721, 9.3312, 10.737, 15.1935, 16.5562, 17.9155, 19.2751]
# qwen
#[8.5853, 18.9962, 29.1533, 37.4593, 45.8992, 54.3415, 62.7779, 72.2762, 82.2203, 92.1694]
print(durations)
更多推荐


所有评论(0)