A simple script for benchmarking DeepSeek throughput. It uses DeepSeek-R1-Distill-Qwen-32B here and can run on a single 24 GB RTX 4090 (the 32B weights do not fit entirely in 24 GB, so expect some CPU offload); adjust the settings to your hardware. In principle it works with any model: just change the model name as needed. The key points are that pip uses a domestic (China) mirror and the model is downloaded from Alibaba's ModelScope, so downloads are hassle-free.
At the end it produces a txt report and an HTML report.
The prerequisites are Python and python-venv; if you prefer not to use venv, you can tweak the script yourself.
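To run it, here is a minimal usage sketch (assuming the script below is saved as benchmark.sh; the filename is only an example):

chmod +x benchmark.sh
./benchmark.sh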
 

#!/bin/bash
set -e
# Configuration
MODEL_REPO="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
BATCH_SIZES=(1 2 4 10 20 30 50)  # the 32B model is VRAM-hungry, so keep batch sizes small
SEQ_LENGTHS=(256 512)
WARMUP_STEPS=3
MEASURE_STEPS=10
VENV_NAME="deepseek_test"
REPORT_FILE="benchmark_report.txt"
HTML_REPORT_FILE="benchmark_report.html"

# Create the virtual environment
echo "Creating Python virtual environment..."
python3 -m venv $VENV_NAME
source $VENV_NAME/bin/activate

# Configure a domestic (China) pip mirror
echo "Configuring domestic pip mirror..."
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn

# Install dependencies
echo "Installing dependencies..."
pip install torch transformers modelscope accelerate
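# Optional sanity check: confirm the installed torch build can see the GPU
python3 -c "import torch; print('CUDA available:', torch.cuda.is_available())"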

# Benchmark function
run_benchmark() {
    local batch_size=$1
    local seq_length=$2
    
    echo -e "\n测试配置: batch_size=${batch_size}, seq_length=${seq_length}"
    
    python3 - <<EOF
import torch
from modelscope import AutoModelForCausalLM, AutoTokenizer
import time

model_name = "${MODEL_REPO}"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and tokenizer
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
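# Note: device_map="auto" (via accelerate) spreads the weights across the available
# GPUs and offloads layers to CPU RAM when VRAM is insufficient, at the cost of speed.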

# Explicitly set the pad token
if tokenizer.eos_token is None:
    eos_token = tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id)
    tokenizer.add_special_tokens({'pad_token': eos_token})
else:
    tokenizer.pad_token = tokenizer.eos_token

# Prepare inputs
input_text = "测试" * ${seq_length}
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
input_ids = inputs.input_ids.repeat(${batch_size}, 1).to(device)
attention_mask = inputs.attention_mask.repeat(${batch_size}, 1).to(device)
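# Note: the prompt is the string "测试" repeated ${seq_length} times, so after tokenization
# the input length in tokens is only roughly proportional to ${seq_length}, not exactly equal.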

# Warm-up
print("Warming up...")
for _ in range(${WARMUP_STEPS}):
    _ = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=10, min_new_tokens=10, pad_token_id=tokenizer.pad_token_id)

# Timed run
print("Running benchmark...")
if device == "cuda":
    torch.cuda.synchronize()
start_time = time.time()
for _ in range(${MEASURE_STEPS}):
    _ = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=10, min_new_tokens=10, pad_token_id=tokenizer.pad_token_id)
if device == "cuda":
    torch.cuda.synchronize()
elapsed = time.time() - start_time

# Compute metrics (each generate call produces exactly 10 new tokens per sequence)
avg_latency = elapsed / ${MEASURE_STEPS}
tokens_per_sec = (${batch_size} * 10) / avg_latency

print(f"平均延迟: {avg_latency:.3f}s")
print(f"吞吐量: {tokens_per_sec:.2f} tokens/s")
print(f"显存占用: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB")
EOF
}

# Main flow
echo "$MODEL_REPO benchmark started" > $REPORT_FILE
echo "GPU info:" >> $REPORT_FILE
nvidia-smi --query-gpu=name,memory.total --format=csv >> $REPORT_FILE

echo "<html><body><h1>DeepSeek-R1-Distill-Qwen-32B性能测试报告</h1>" > $HTML_REPORT_FILE
echo "<p>GPU信息:</p>" >> $HTML_REPORT_FILE
nvidia-smi --query-gpu=name,memory.total --format=csv | sed 's/^/<p>/' | sed 's/$/<\/p>/' >> $HTML_REPORT_FILE

for bs in "${BATCH_SIZES[@]}"; do
    for seq in "${SEQ_LENGTHS[@]}"; do
        echo -e "\nTest config: batch_size=${bs}, seq_length=${seq}" >> $REPORT_FILE
        echo "<h2>Test config: batch_size=${bs}, seq_length=${seq}</h2>" >> $HTML_REPORT_FILE
        run_benchmark $bs $seq | tee -a $REPORT_FILE | sed 's/^/<p>/' | sed 's/$/<\/p>/' >> $HTML_REPORT_FILE
    done
done

echo "</body></html>" >> $HTML_REPORT_FILE

deactivate
echo "测试完成"