Install
- Install slurm-wlm
sudo apt install slurm-wlm slurm-wlm-doc -y
# check version
slurmd -V
- Configuration
Use the online Configuration Tool - Easy Version, or serve the local copy shipped with the docs:
cd /usr/share/doc/slurmctld
chmod +r slurm-wlm-configurator.html
python3 -m http.server
Open http://<ip>:8000 and fill in:
- ClusterName: the name of the cluster
- SlurmUser: root is fine
- Compute Machines: the output of slurmd -C
Click Submit, copy the generated content, and write it to /etc/slurm/slurm.conf (a minimal sketch of the resulting file is shown at the end of this section).
- Create directories
sudo mkdir /var/spool/slurmd
sudo mkdir /var/spool/slurmctld
Write the following to /etc/slurm/cgroup.conf:
CgroupAutomount=yes
ConstrainCores=no
ConstrainRAMSpace=no
- Start the services
sudo systemctl start slurmd
sudo systemctl start slurmctld
- Add compute nodes
- Install Slurm on the new node following the steps above
- Copy /etc/munge/munge.key to the new node and restart the munge service:
sudo systemctl restart munge
- Start slurmd:
sudo systemctl start slurmd
- Check status
# check nodes
sinfo --long
# check status of running job steps (a job id is required)
sstat -j <job_id>
# execute /bin/hostname on three nodes (-N3)
srun -N3 -l /bin/hostname
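For reference, the slurm.conf produced in the configuration step above looks roughly like the sketch below for a single-machine setup; the hostname, CPU and memory numbers, and partition name are placeholders, and the NodeName line should come from the real output of slurmd -C on the compute machine.
# /etc/slurm/slurm.conf - minimal single-node sketch (all values are placeholders)
ClusterName=mycluster
SlurmctldHost=node-1
SlurmUser=root
StateSaveLocation=/var/spool/slurmctld
SlurmdSpoolDir=/var/spool/slurmd
ProctrackType=proctrack/cgroup
TaskPlugin=task/cgroup
SchedulerType=sched/backfill
SelectType=select/cons_tres
# the NodeName line is essentially what `slurmd -C` prints
NodeName=node-1 CPUs=8 RealMemory=16000 State=UNKNOWN
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP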
Admin
Configuration
# show the running config (loaded from /etc/slurm/slurm.conf)
scontrol show config
# reload config
scontrol reconfigure
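To check a single parameter without scrolling through the full dump, grep the output (the parameter name below is just an example):
scontrol show config | grep -i selecttype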
View logs
sudo cat /var/log/slurm/slurmctld.log
Check node status
sinfo
sinfo -l # --long
sinfo -v # --verbose
sinfo -s # --summarize
sinfo --states idle # show idle nodes
sinfo --dead # show dead nodes
scontrol show node
scontrol show node <node_name>
Manage node state
# drain a node (take it out of Slurm's scheduling pool); a reason must be given
scontrol update nodename=node-5 state=drain reason="maintenance"
# undrain a node that is ready to be used
scontrol update nodename=node-5 state=idle
# find nodes that are drained due to "Kill task failed"
sinfo -R | grep "Kill task failed"
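To bring a node that went down back into service and clear its recorded reason in one step, state=resume can be used as well; a quick sketch:
# return a down/drained node to service and clear the recorded reason
scontrol update nodename=node-5 state=resume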
View partitions
scontrol show partition
scontrol show partition <partition_name>
# or
sinfo --partition <partition_name>
View each user's fair-share usage (share)
sshare --all
Usage
srun
Run a job interactively with srun (blocks and streams output to the terminal):
srun --partition=<partition_name> --nodes=1 --time=02:00:00 <command>
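A common variant is to request a pseudo-terminal and drop straight into a shell on the allocated node; the partition name and resource numbers below are placeholders:
srun --partition=<partition_name> --nodes=1 --ntasks=1 --cpus-per-task=4 --time=02:00:00 --pty bash -i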
salloc
Interactively allocate resources and start a shell session; commands can then be run directly on the allocated resources:
salloc --nodes=1 --cpus-per-task=4 --mem=8G
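After salloc returns, the shell runs within the allocation: launch work with srun, and exit the shell to release the resources. A minimal sketch:
salloc --nodes=1 --cpus-per-task=4 --mem=8G
srun hostname   # runs on the allocated node
exit            # releases the allocation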
sbatch
First create an sbatch script, e.g. my_job.sh:
#!/bin/bash
#SBATCH --job-name=my_job
#SBATCH --partition=debug
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
# Your script goes here
sleep 30
echo "hello"
Submit the job
sbatch my_job.sh
Check job status
squeue --job <job_id>
# show my jobs
squeue -u `whoami`
# show all jobs in a partition
squeue --partition=dev
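squeue output can be customized with -o/--format; the field list below is just one example (job id, partition, name, state, elapsed time, node count, reason/nodelist):
squeue -u `whoami` -o "%.10i %.9P %.20j %.2t %.10M %.6D %R"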
View job accounting information
# accounting
sacct -j <job_id> --long
sacct -u `whoami`
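--format narrows the output to specific accounting fields; the fields below are standard sacct field names:
sacct -j <job_id> --format=JobID,JobName,Partition,State,Elapsed,MaxRSS,AllocCPUS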
Cancel a job
scancel <job_id>
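scancel can also match jobs by user or by name, for example:
# cancel all of my jobs
scancel -u `whoami`
# cancel jobs by job name
scancel --name=my_job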
Common parameters
#SBATCH --job-name=alignment # job name
#SBATCH --nodes=4 # request 4 nodes
#SBATCH --mem=1024 # memory allocated per node, in MB
#SBATCH --gpus-per-node=2 # request 2 GPUs per node
#SBATCH --cpus-per-task=4 # request 4 vCPUs per task
#SBATCH --time="days-hours" # job time limit
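Put together, a GPU job header using these parameters might look like the sketch below; the partition name, all resource numbers, and mycommand are placeholders for your own values:
#!/bin/bash
#SBATCH --job-name=alignment
#SBATCH --partition=gpu          # placeholder partition
#SBATCH --nodes=4
#SBATCH --gpus-per-node=2
#SBATCH --cpus-per-task=4
#SBATCH --mem=1024               # MB per node
#SBATCH --time=1-12:00:00        # 1 day 12 hours
srun mycommand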
Multi-node jobs
4 nodes with 4 vCPUs each, running the following 4 tasks in parallel (one per node):
#!/bin/bash
#SBATCH --job-name=multithreadedtasks
#SBATCH --nodes=4
#SBATCH --ntasks=4
#SBATCH --cpus-per-task=4
# Your script goes here
srun --ntasks=1 mycommand1 --threads 4 &
srun --ntasks=1 mycommand2 --threads 4 &
srun --ntasks=1 mycommand3 --threads 4 &
srun --ntasks=1 mycommand4 --threads 4 &
wait  # the trailing & launches the job steps in parallel; wait for all of them
MPI jobs
A multi-node job with 16 tasks (vCPUs) in total, automatically spread across nodes:
#!/bin/bash
#SBATCH --job-name=simplempi
#SBATCH --ntasks=16
# Your script goes here
mpirun myscript
Specify two nodes, with 8 tasks (vCPUs) per node:
#!/bin/bash
#SBATCH --job-name=nodempi
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=8
# Your script goes here
mpirun myscript
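Depending on how the MPI library was built, srun itself can act as the launcher instead of mpirun; the pmix plugin below is only an example and must actually be available in your Slurm build:
# list the MPI plugins this Slurm build supports
srun --mpi=list
# launch the MPI program with srun instead of mpirun
srun --mpi=pmix myscript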
Example
#!/bin/bash
# this is a 2 node slurm job example, you will most likely need to adapt --cpus-per-task and --partition
#SBATCH --job-name=example-job
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=96
#SBATCH --gres=gpu:8
#SBATCH --time=0:10:00
#SBATCH --exclusive
#SBATCH --partition=xyz-cluster
#SBATCH --output=%x-%j.out
set -x -e
# CHANGE HERE THE CONDA ENV AND ANY STARTUP SCRIPTS
source /path/to/start-xxx-user # if you have something to preload before the job
conda activate stas-xxx # if you have conda env to activate
echo "START TIME: $(date)"
# CHANGE TO CUMULATIVELY LOG OUTPUTS
LOG_PATH="main_log.txt"
GPUS_PER_NODE=8
NNODES=$SLURM_NNODES
# so processes know who to talk to
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
# OTHER LAUNCHERS CAN BE USED HERE
export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--role `hostname -s`: \
--tee 3 \
"
# CHANGE HERE THE SCRIPT AND WHATEVER ARGS IT NEEDS
CMD="\
torch-distributed-gpu-test.py \
"
echo $CMD
# hide duplicated errors using this hack - will be properly fixed in pt-1.12
# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
# force crashing on nccl issues like hanging broadcast
export NCCL_ASYNC_ERROR_HANDLING=1
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=COLL
# export NCCL_SOCKET_NTHREADS=1
# export NCCL_NSOCKS_PERTHREAD=1
# export CUDA_LAUNCH_BLOCKING=1
# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
--wait=60 \
--kill-on-bad-exit=1 \
"
# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee -a $LOG_PATH
echo "END TIME: $(date)"
Run Container
NCCL
Test whether NCCL works across the nodes:
#!/bin/bash
#SBATCH --job-name=test-nodes-nccl
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=96 # number of cores per tasks
#SBATCH --gres=gpu:8 # number of gpus
#SBATCH --time 0:05:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=%x-%j.out # output file name
#SBATCH --partition=prod
# site-specific environment setup - adapt this to your cluster
source $six_ALL_CCFRWORK/start-prod
NNODES=2
GPUS_PER_NODE=4
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
export LAUNCHER="python -u -m torch.distributed.launch \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT \
"
export SCRIPT=test-nodes-nccl.py
cat << EOT > $SCRIPT
#!/usr/bin/env python
import torch.distributed as dist
import torch
import socket
import os
import fcntl
def printflock(*msgs):
    """ print """
    with open(__file__, "r") as fh:
        fcntl.flock(fh, fcntl.LOCK_EX)
        try:
            print(*msgs)
        finally:
            fcntl.flock(fh, fcntl.LOCK_UN)
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)
dist.init_process_group("nccl")
header = f"{socket.gethostname()}-{local_rank}"
try:
    dist.barrier()
    printflock(f"{header}: NCCL {torch.cuda.nccl.version()} is OK")
except:
    printflock(f"{header}: NCCL {torch.cuda.nccl.version()} is broken")
    raise
EOT
echo $LAUNCHER --node_rank $SLURM_PROCID $SCRIPT
srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $SCRIPT'
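Assuming the script above is saved as test-nodes-nccl.slurm, submit it with sbatch and check the %x-%j.out file: when NCCL is healthy each rank prints a "<hostname>-<local_rank>: NCCL <version> is OK" line, while a hang or an "is broken" line points at the problematic node.
sbatch test-nodes-nccl.slurm
grep OK test-nodes-nccl-<job_id>.out   # expect one line per rank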