大模型(LLM)高性能网络综述(二) - 测试实验 NVMe-oF / NCCL / MPI
主要是补充了NVMe-oF,NCCL,MPI 三个组件的测试调试示例
·
承接前文博客
三、相关的技术栈测试调试
3.3 NVMe-oF 示例
a) 基础环境准备

### 基础环境
sudo apt update -y
sudo apt install -y nvme-cli
b) 代码准备
config.sh
#!/bin/bash
# Shared configuration sourced by the target-side (te1) and initiator-side
# (te2) NVMe-oF test scripts.
# --- Target (te1) Configuration ---
TARGET_IP="172.16.40.93" # RDMA IP address of te1
TARGET_PORT="4420" # Default NVMe/RDMA port
TARGET_NQN="nqn.2024-04.com.example:te1.target1" # Target NQN (must match the target-side setup)
TARGET_BACKING_FILE="/var/lib/nvmet/disk1.img" # Backing file path on the target
TARGET_BACKING_SIZE_MB=1024 # Backing file size (MB)
# --- Initiator (te2) Device/Mount Configuration ---
# NOTE(review): INITIATOR_NQN may also be set here; if unset, the setup
# script tries to fetch it from te2 over SSH.
REMOTE_NVME_DEV_ON_INITIATOR="/dev/nvme1n1" # Expected device name on te2
MOUNT_POINT="/mnt/nvmeof_rdma_test" # Mount point on te2
te1 初始化 01_setup_target.sh
#!/bin/bash
# Configure this host (te1) as an NVMe-oF RDMA target that exposes a
# file-backed loop device as namespace 1 of $TARGET_NQN on configfs port 1.
# Requires root (sudo), the nvmet/nvmet-rdma kernel modules, and the
# variables from ../config.sh (TARGET_IP, TARGET_PORT, TARGET_NQN, ...).
SCRIPT_DIR=$(dirname "$(realpath "$0")")
source "$SCRIPT_DIR/../config.sh" || { echo "Error: config.sh not found!"; exit 1; }
echo "=== Setting up NVMe-oF Target ($TARGET_NQN) ==="
# 1. Load modules
echo "[1/6] Loading kernel modules..."
sudo modprobe nvmet || { echo "Failed to load nvmet"; exit 1; }
sudo modprobe nvmet-rdma || { echo "Failed to load nvmet-rdma"; exit 1; }
sudo modprobe configfs
# 2. Create backing file and loop device
echo "[2/6] Creating backing file ($TARGET_BACKING_FILE, ${TARGET_BACKING_SIZE_MB}MB)..."
sudo mkdir -p "$(dirname "$TARGET_BACKING_FILE")"
if [ ! -f "$TARGET_BACKING_FILE" ]; then
    sudo dd if=/dev/zero of="$TARGET_BACKING_FILE" bs=1M count="$TARGET_BACKING_SIZE_MB" status=progress || { echo "Failed to create backing file"; exit 1; }
else
    echo "Backing file already exists."
fi
# Detach a stale loop device only when one is actually attached. The
# original ran `losetup -d $(...)` unconditionally, which invoked
# `losetup -d` with an empty argument when nothing was attached.
OLD_DEV=$(losetup -j "$TARGET_BACKING_FILE" | cut -d: -f1)
if [ -n "$OLD_DEV" ]; then
    sudo losetup -d "$OLD_DEV" 2>/dev/null
fi
BACKING_DEV=$(sudo losetup --find --show "$TARGET_BACKING_FILE")
if [ -z "$BACKING_DEV" ]; then echo "Error: Could not set up loop device."; exit 1; fi
echo "Using backing device: $BACKING_DEV"
# Print an error, detach the loop device, and exit. The original only
# cleaned up the loop device on the missing-NQN path; every later
# failure leaked it.
fail() {
    echo "$1"
    sudo losetup -d "$BACKING_DEV" 2>/dev/null
    exit 1
}
# 3. Get Initiator NQN (Ensure te2 allows SSH key auth or manually set INITIATOR_NQN in config.sh)
echo "[3/6] Determining Initiator NQN..."
if [ -z "$INITIATOR_NQN" ]; then
    echo "Attempting to fetch Initiator NQN from te2 via SSH..."
    INITIATOR_NQN=$(ssh te2 cat /etc/nvme/hostnqn 2>/dev/null)
fi
if [ -z "$INITIATOR_NQN" ]; then
    fail "Error: Could not automatically get Initiator NQN. Please set INITIATOR_NQN in config.sh"
fi
echo "Will allow Initiator NQN: $INITIATOR_NQN"
# 4. Configure using configfs
echo "[4/6] Configuring target via configfs..."
CONFIGFS_MNT="/sys/kernel/config"
if ! mountpoint -q "$CONFIGFS_MNT"; then sudo mount -t configfs none "$CONFIGFS_MNT"; fi
SUBSYS="$CONFIGFS_MNT/nvmet/subsystems/$TARGET_NQN"
PORT="$CONFIGFS_MNT/nvmet/ports/1"
# Clean previous config for this NQN/Port in reverse dependency order
# (link first, then port, namespace, allowed host, subsystem).
sudo unlink "$PORT/subsystems/$TARGET_NQN" 2>/dev/null
sudo rmdir "$PORT" 2>/dev/null
sudo rmdir "$SUBSYS/namespaces/1" 2>/dev/null
sudo rmdir "$SUBSYS/allowed_hosts/$INITIATOR_NQN" 2>/dev/null
sudo rmdir "$SUBSYS" 2>/dev/null
# Create Subsystem
sudo mkdir "$SUBSYS" || fail "Failed to create subsystem"
# NOTE(review): allow-any-host is for testing only. For production set
# this to 0 and symlink $INITIATOR_NQN under $SUBSYS/allowed_hosts instead.
echo "Temporarily allowing any host for testing..."
sudo sh -c "echo 1 > $SUBSYS/attr_allow_any_host" || fail "Failed to set attr_allow_any_host=1"
# Create Namespace 1 backed by the loop device
sudo mkdir "$SUBSYS/namespaces/1" || fail "Failed to create namespace 1"
sudo sh -c "echo -n $BACKING_DEV > $SUBSYS/namespaces/1/device_path" || fail "Failed to set device path"
sudo sh -c "echo 1 > $SUBSYS/namespaces/1/enable" || fail "Failed to enable namespace 1"
# Create RDMA Port
sudo mkdir "$PORT" || fail "Failed to create port 1"
sudo sh -c "echo $TARGET_IP > $PORT/addr_traddr" || fail "Failed to set traddr"
sudo sh -c "echo rdma > $PORT/addr_trtype" || fail "Failed to set trtype"
sudo sh -c "echo $TARGET_PORT > $PORT/addr_trsvcid" || fail "Failed to set trsvcid"
sudo sh -c "echo ipv4 > $PORT/addr_adrfam" || fail "Failed to set adrfam"
# Link Subsystem to Port (makes the subsystem reachable on the port)
echo "[5/6] Linking subsystem to port..."
sudo ln -s "$SUBSYS" "$PORT/subsystems/$TARGET_NQN" || fail "Failed to link subsystem to port"
echo "[6/6] Checking firewall/security group..."
echo "IMPORTANT: Ensure firewall/security group on te1 allows INBOUND traffic on TCP/RDMA port $TARGET_PORT from te2."
echo "=== Target Setup Complete ==="
sudo nvme list
lsblk
te1 释放清理 02_teardown_target.sh
#!/bin/bash
# Tear down the NVMe-oF target configuration created by 01_setup_target.sh:
# unlink the subsystem from the port, remove the port, namespace and
# subsystem configfs entries, then detach the backing loop device.
# Safe to re-run: each step only warns when the entry is already gone.
SCRIPT_DIR=$(dirname "$(realpath "$0")")
source "$SCRIPT_DIR/../config.sh" || { echo "Error: config.sh not found!"; exit 1; }
echo "=== Tearing down NVMe-oF Target ($TARGET_NQN) ==="
CONFIGFS_MNT="/sys/kernel/config"
BACKING_DEV=$(losetup -j "$TARGET_BACKING_FILE" | cut -d: -f1) # Find associated loop device
# 1. Unlink Subsystem from Port
# (must happen first: configfs refuses to rmdir entries that are still
# referenced by the port symlink)
echo "[1/4] Unlinking subsystem from port..."
sudo unlink $CONFIGFS_MNT/nvmet/ports/1/subsystems/$TARGET_NQN 2>/dev/null || echo "Warning: Could not unlink subsystem, already unlinked or path incorrect."
# 2. Remove Port
echo "[2/4] Removing port..."
sudo rmdir $CONFIGFS_MNT/nvmet/ports/1 2>/dev/null || echo "Warning: Could not remove port 1, already removed or path incorrect."
# 3. Disable/Remove Namespace and Subsystem
echo "[3/4] Removing subsystem and namespace..."
# Disable the namespace before removing its directory
sudo sh -c "echo 0 > $CONFIGFS_MNT/nvmet/subsystems/$TARGET_NQN/namespaces/1/enable" 2>/dev/null
sudo rmdir $CONFIGFS_MNT/nvmet/subsystems/$TARGET_NQN/namespaces/1 2>/dev/null || echo "Warning: Could not remove namespace 1."
# Remove allowed host if it exists
# NOTE(review): the setup script used attr_allow_any_host=1 and never
# created this entry, so this is a no-op unless allowed_hosts was
# configured manually.
if [ ! -z "$INITIATOR_NQN" ]; then
sudo rmdir $CONFIGFS_MNT/nvmet/subsystems/$TARGET_NQN/allowed_hosts/$INITIATOR_NQN 2>/dev/null
fi
sudo rmdir $CONFIGFS_MNT/nvmet/subsystems/$TARGET_NQN 2>/dev/null || echo "Warning: Could not remove subsystem."
# 4. Detach loop device (only if one was found and is a block device)
echo "[4/4] Detaching loop device..."
if [ ! -z "$BACKING_DEV" ] && [ -b "$BACKING_DEV" ]; then
sudo losetup -d "$BACKING_DEV" || echo "Warning: Failed to detach $BACKING_DEV."
else
echo "Loop device $BACKING_DEV not found or invalid."
fi
# Optional: Remove backing file
# echo "Optional: Remove backing file '$TARGET_BACKING_FILE' manually if desired."
# sudo rm "$TARGET_BACKING_FILE"
echo "=== Teardown Complete ==="
te2 连接 target (11_connect_target.sh)
#!/bin/bash
# Connect the initiator (te2) to the NVMe-oF RDMA target defined in
# config.sh and verify the remote namespace shows up as a block device.
SCRIPT_DIR=$(dirname "$(realpath "$0")")
source "$SCRIPT_DIR/../config.sh" || { echo "Error: config.sh not found!"; exit 1; }
echo "=== Connecting to NVMe-oF Target ($TARGET_NQN) ==="
# 1. Load the host-side RDMA transport module
echo "[1/4] Loading nvme-rdma module..."
sudo modprobe nvme-rdma || { echo "Failed to load nvme-rdma"; exit 1; }
# 2. Ensure host NQN file exists (generate one on first run)
echo "[2/4] Checking initiator NQN file..."
if [ ! -f /etc/nvme/hostnqn ]; then
    echo "Host NQN file not found, generating..."
    # Create /etc/nvme first (tee cannot create the directory), and check
    # gen-hostnqn separately — the original pipeline's `|| { ... }` only
    # checked tee's exit status, so a gen-hostnqn failure went unnoticed.
    sudo mkdir -p /etc/nvme
    HOSTNQN=$(sudo nvme gen-hostnqn) || { echo "Failed to generate host NQN"; exit 1; }
    echo "$HOSTNQN" | sudo tee /etc/nvme/hostnqn || { echo "Failed to write host NQN"; exit 1; }
fi
echo "Using Initiator NQN: $(cat /etc/nvme/hostnqn)"
# 3. Discover Target (informational: connect can still succeed without it)
echo "[3/4] Discovering target $TARGET_IP:$TARGET_PORT (RDMA)..."
if ! sudo nvme discover -t rdma -a "$TARGET_IP" -s "$TARGET_PORT"; then
    echo "Warning: Discover command failed. Connection might still work if NQN is correct."
fi
# Check if target NQN appears in discover output (optional check)
# sudo nvme discover -t rdma -a $TARGET_IP -s $TARGET_PORT | grep -q $TARGET_NQN || echo "Warning: Target NQN not found in discovery."
# 4. Connect to Target
echo "[4/4] Connecting to $TARGET_NQN at $TARGET_IP:$TARGET_PORT (RDMA)..."
if ! sudo nvme connect -t rdma -a "$TARGET_IP" -s "$TARGET_PORT" -n "$TARGET_NQN"; then
    echo "Error: Failed to connect to target. Check target status, network, firewall, and NQNs."
    echo "Also, the previous RDMA issues might be preventing connection."
    exit 1
fi
# Verify the expected block device appeared (give udev a moment)
echo "--- Verifying Connection ---"
sleep 2
sudo nvme list
lsblk | grep nvme
if ! lsblk | grep -q "$(basename "$REMOTE_NVME_DEV_ON_INITIATOR")"; then
    echo "Error: Expected device $REMOTE_NVME_DEV_ON_INITIATOR did not appear."
    exit 1
fi
echo "=== Connection Successful ==="
sudo nvme list
lsblk
te2 测试 (12_test_io.sh)
#!/bin/bash
# Exercise the remote NVMe-oF namespace from the initiator (te2):
# format it (first run only), mount it, write/read back a test file,
# compare contents, then unmount.
SCRIPT_DIR=$(dirname "$(realpath "$0")")
source "$SCRIPT_DIR/../config.sh" || { echo "Error: config.sh not found!"; exit 1; }
echo "=== Testing I/O on Remote NVMe Device ($REMOTE_NVME_DEV_ON_INITIATOR) ==="
# 1. Check if device exists
if [ ! -b "$REMOTE_NVME_DEV_ON_INITIATOR" ]; then
    echo "Error: Remote NVMe device $REMOTE_NVME_DEV_ON_INITIATOR not found! Is it connected?"
    exit 1
fi
echo "[1/4] Device $REMOTE_NVME_DEV_ON_INITIATOR found."
# 2. Format only when no filesystem is present. The original ran
# mkfs.ext4 unconditionally (despite its "if not already formatted"
# comment), wiping the device on every run and risking an interactive
# mkfs prompt when a filesystem signature already existed.
echo "[2/4] Formatting $REMOTE_NVME_DEV_ON_INITIATOR (ext4)..."
if sudo blkid "$REMOTE_NVME_DEV_ON_INITIATOR" >/dev/null 2>&1; then
    echo "Device already contains a filesystem; skipping mkfs."
else
    sudo mkfs.ext4 "$REMOTE_NVME_DEV_ON_INITIATOR" || { echo "Error: mkfs failed."; exit 1; }
fi
# 3. Mount
echo "[3/4] Mounting $REMOTE_NVME_DEV_ON_INITIATOR to $MOUNT_POINT..."
sudo mkdir -p "$MOUNT_POINT"
sudo mount "$REMOTE_NVME_DEV_ON_INITIATOR" "$MOUNT_POINT" || { echo "Error: mount failed."; exit 1; }
# 4. Basic I/O Test: write a uniquely-named file, read it back, compare.
echo "[4/4] Performing basic read/write test..."
TEST_FILE="$MOUNT_POINT/test_$(date +%s).txt"
TEST_STRING="NVMe-oF RDMA test successful at $(date)"
echo "$TEST_STRING" | sudo tee "$TEST_FILE" > /dev/null || { echo "Error: write failed."; sudo umount "$MOUNT_POINT"; exit 1; }
READ_BACK=$(sudo cat "$TEST_FILE") || { echo "Error: read failed."; sudo umount "$MOUNT_POINT"; exit 1; }
if [ "$READ_BACK" == "$TEST_STRING" ]; then
    echo "Read/Write content matched: OK"
else
    echo "Error: Read/Write content mismatch!"
    echo "Expected: $TEST_STRING"
    echo "Got: $READ_BACK"
    sudo umount "$MOUNT_POINT"
    exit 1
fi
sudo rm "$TEST_FILE"
# Unmount and remove the (now empty) mount point
sudo umount "$MOUNT_POINT" || { echo "Warning: umount failed."; exit 1; }
sudo rmdir "$MOUNT_POINT" 2>/dev/null # Remove if empty
# Optional fio benchmark against the RAW device — note this would destroy
# the filesystem created above; use only for benchmarking.
# if command -v fio &> /dev/null; then
#     sudo fio --name=randwrite --rw=randwrite --bs=4k --iodepth=32 --size=256M \
#         --ioengine=libaio --direct=1 --filename=$REMOTE_NVME_DEV_ON_INITIATOR \
#         --numjobs=1 --runtime=30 --group_reporting --output-format=terse
# else
#     echo "fio not installed. Skipping fio test."
# fi
echo "=== I/O Test Completed Successfully ==="
te2 释放资源 (13_disconnect_target.sh)
#!/bin/bash
# Disconnect the initiator (te2) from the NVMe-oF target and verify that
# the remote namespace device has disappeared.
SCRIPT_DIR=$(dirname "$(realpath "$0")")
source "$SCRIPT_DIR/../config.sh" || { echo "Error: config.sh not found!"; exit 1; }
echo "=== Disconnecting from NVMe-oF Target ($TARGET_NQN) ==="
# Refuse to run without a target NQN from config.sh
[ -n "$TARGET_NQN" ] || { echo "Error: TARGET_NQN is not set in config.sh!"; exit 1; }
# 1. Disconnect by NQN. (If the device node is known, an alternative is:
#    sudo nvme disconnect -d "$(basename $REMOTE_NVME_DEV_ON_INITIATOR)")
echo "[1/2] Sending disconnect command for NQN: $TARGET_NQN..."
sudo nvme disconnect -n "$TARGET_NQN" \
    || echo "Warning: Disconnect command failed. Maybe already disconnected or device busy?"
# 2. Confirm the block device is gone
echo "[2/2] Verifying disconnect..."
sleep 1
sudo nvme list
lsblk | grep nvme
DEV_NAME=$(basename "$REMOTE_NVME_DEV_ON_INITIATOR")
if lsblk | grep -q "$DEV_NAME"; then
    echo "Warning: Device $REMOTE_NVME_DEV_ON_INITIATOR still appears after disconnect."
fi
echo "=== Disconnect Process Finished ==="
c) nvme 磁盘空间分配

d) 跨节点读写测试

e) 释放资源


3.4 NCCL 示例
服务器申请:
te1 带两块 A10

te2 带一块 A10

wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt update
sudo apt install libnccl2 libnccl-dev
ls /usr/include/nccl.h # 头文件路径
ls /usr/lib/x86_64-linux-gnu/libnccl* # 库文件路径
git clone https://github.com/NVIDIA/nccl-tests.git
cd nccl-tests
cd ~/nccl-tests
make clean
make CUDA_HOME=/usr/local/cuda-12.1 all


a) 单机多卡测试
(在 te1,2 卡 上运行)
1) 测试 AllReduce 性能
# 进入 nccl-tests 目录
cd ~/nccl-tests
./build/all_reduce_perf -b 8 -e 1G -f 2 -g 2

2) 测试 Broadcast 性能
./build/broadcast_perf -b 8 -e 1G -f 2 -g 2
3) 测试 AllGather 性能
./build/all_gather_perf -b 8 -e 1G -f 2 -g 2
b) 跨节点多卡测试
(使用 te1 和 te2)
apt install lam-runtime mpich openmpi-bin slurm-wlm-torque

#hostfile
te1 slots=2
te2 slots=1
mpirun --allow-run-as-root \
-np 3 --hostfile hostfile \
-x NCCL_DEBUG=INFO \
-mca pml ob1 -mca btl tcp,self \
-mca btl_tcp_if_include eth0 \
./build/all_reduce_perf -b 8 -e 1G -f 2 -g 1
问题:

3.5 MPI 示例
sudo apt update
sudo apt install -y openmpi-bin libopenmpi-dev
a) MPI "Hello World"
hello_mpi.c
/* hello_mpi.c — minimal MPI "hello world": every rank reports its rank,
 * the world size, and the host it runs on. Build: mpicc hello_mpi.c -o
 * hello_mpi; run under mpirun. */
#include <mpi.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    int size = 0;
    int rank = 0;
    char host[MPI_MAX_PROCESSOR_NAME];
    int host_len = 0;

    /* Bring up the MPI runtime. */
    MPI_Init(&argc, &argv);

    /* Total number of ranks and this rank's id within MPI_COMM_WORLD
     * (ids run from 0 to size-1). */
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Name of the node this rank is running on. */
    MPI_Get_processor_name(host, &host_len);

    printf("Hello from processor %s, rank %d out of %d processors\n",
           host, rank, size);

    /* Shut the MPI runtime down. */
    MPI_Finalize();
    return 0;
}
(llm) root@te1:~/mpi# mpicc hello_mpi.c -o hello_mpi

1) MPI "Hello World" (te1单节点)
(llm) root@te1:~/mpi# mpirun --allow-run-as-root -np 4 ./hello_mpi

2) MPI "Hello World" (te1和 te2 跨节点)
## 同步代码到所有节点 相同目录
rsync -avz ~/mpi/. te2:~/mpi
(llm) root@te1:~/mpi# mpirun --allow-run-as-root \
-np 4 --hostfile hostfile ./hello_mpi

b) 集体通信 - MPI_Reduce
reduce_mpi.c
/* reduce_mpi.c — demonstrates MPI_Reduce: each rank contributes its own
 * rank number and only the root (rank 0) receives the sum. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank = 0, size = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Each rank contributes its own rank id as the data. */
    int my_value = rank;
    int total = 0; /* receive buffer — meaningful only on the root */
    printf("Process %d contributing %d\n", rank, my_value);

    /* MPI_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm).
     * Collective call: every rank in the communicator must participate. */
    MPI_Reduce(&my_value, &total, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    /* Only rank 0 holds a valid reduction result. */
    if (rank == 0) {
        /* Expected: 0 + 1 + 2 + ... + (size-1) */
        int expected = size * (size - 1) / 2;
        printf("Process 0 (root) received reduced sum: %d (Expected: %d)\n", total, expected);
    }

    MPI_Finalize();
    return 0;
}
mpicc reduce_mpi.c -o reduce_mpi
1) 单节点 te1 运行
mpirun --allow-run-as-root -np 4 ./reduce_mpi

2) 跨节点 te1 ,te2运行
mpirun --allow-run-as-root -np 4 --hostfile hostfile ./reduce_mpi

c) 全局规约-MPI_Allreduce
allreduce_mpi.c
/* allreduce_mpi.c — demonstrates MPI_Allreduce: each rank contributes its
 * own rank number and EVERY rank receives the global sum (unlike
 * MPI_Reduce, where only the root does). */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank = 0, size = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int my_value = rank; /* each rank contributes its own rank id */
    int total = 0;       /* every rank receives the final result here */
    printf("Process %d contributing data %d\n", rank, my_value);

    /* MPI_Allreduce(sendbuf, recvbuf, count, datatype, op, comm). */
    MPI_Allreduce(&my_value, &total, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    /* All ranks should print the same sum: 0 + 1 + ... + (size-1). */
    int expected = size * (size - 1) / 2;
    printf("Process %d received reduced sum %d via Allreduce (Expected: %d)\n",
           rank, total, expected);

    MPI_Finalize();
    return 0;
}
mpicc allreduce_mpi.c -o allreduce_mpi
rsync -avz ~/mpi/. te2:~/mpi
1) 单节点 te1 运行
mpirun --allow-run-as-root -np 4 ./allreduce_mpi

2) 跨节点 te1 ,te2运行
mpirun --allow-run-as-root -np 4 --hostfile hostfile ./allreduce_mpi

火山引擎开发者社区是火山引擎打造的AI技术生态平台,聚焦Agent与大模型开发,提供豆包系列模型(图像/视频/视觉)、智能分析与会话工具,并配套评测集、动手实验室及行业案例库。社区通过技术沙龙、挑战赛等活动促进开发者成长,新用户可领50万Tokens权益,助力构建智能应用。
更多推荐
所有评论(0)