Install

Quick Start Administrator Guide

Installing the Slurm job scheduler from scratch (Slurm on Ubuntu)

Github - slurm_ubuntu_gpu_cluster

  1. Install slurm-wlm
sudo apt install slurm-wlm slurm-wlm-doc -y

# check version
slurmd -V
  2. Configure

Configure online with the Configuration Tool - Easy Version, or serve the local copy:

cd /usr/share/doc/slurmctld
sudo chmod +r slurm-wlm-configurator.html
python3 -m http.server

Open http://<ip>:8000 and fill in:

  • ClusterName: the name of the cluster
  • SlurmUser: root is fine
  • Compute Machines: paste the output of slurmd -C

Click Submit, then copy the generated content into /etc/slurm/slurm.conf.
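
For orientation, a minimal sketch of what the generated file might look like; the node name, CPU, and memory figures below are placeholders and should be taken from your own slurmd -C output:

ClusterName=mycluster
SlurmctldHost=node-1
SlurmUser=root
SlurmdSpoolDir=/var/spool/slurmd
StateSaveLocation=/var/spool/slurmctld
ProctrackType=proctrack/cgroup
TaskPlugin=task/affinity,task/cgroup
SchedulerType=sched/backfill
SelectType=select/cons_tres
# Compute Machines: paste your own `slurmd -C` line here (placeholder values shown)
NodeName=node-1 CPUs=32 Sockets=1 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=64000 State=UNKNOWN
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP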

  3. Create spool directories
sudo mkdir /var/spool/slurmd
sudo mkdir /var/spool/slurmctld
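
If SlurmUser is not set to root, the state directory must also be writable by that user; a sketch assuming a dedicated user named slurm:

# StateSaveLocation must be writable by SlurmUser
sudo chown slurm: /var/spool/slurmctld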
  4. Configure cgroup (fixes: error: cgroup namespace ‘freezer’ not mounted. aborting)

Write the following to /etc/slurm/cgroup.conf:

CgroupAutomount=yes
ConstrainCores=no
ConstrainRAMSpace=no
  5. Start the services
sudo systemctl start slurmd
sudo systemctl start slurmctld
  6. Add compute nodes
  • Install Slurm on the new node following the steps above
  • Copy /etc/munge/munge.key from the controller to the new node, then restart the munge service: sudo systemctl restart munge (see the sketch after this list)
  • Start slurmd: sudo systemctl start slurmd
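
A sketch of the key copy, assuming root ssh access from the controller to the new node (node-5 is a placeholder hostname):

# on the controller
sudo scp /etc/munge/munge.key node-5:/etc/munge/munge.key
# on the new node: fix ownership and permissions, then restart munge
sudo chown munge:munge /etc/munge/munge.key
sudo chmod 400 /etc/munge/munge.key
sudo systemctl restart munge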
  7. Check status
# check nodes
sinfo --long

# check status of a running job's steps (needs a job id)
sstat -j <job_id>

# execute /bin/hostname on three nodes (-N3)
srun -N3 -l /bin/hostname

Admin

Configuration

# show config of `/etc/slurm/slurm.conf`
scontrol show config

# reload config
scontrol reconfigure

View logs

sudo cat /var/log/slurm/slurmctld.log

Check node status

sinfo
sinfo -l # --long
sinfo -v # --verbose
sinfo -s # --summarize
sinfo --states idle # show idle nodes
sinfo --dead # show dead nodes

scontrol show node
scontrol show node <node_name>

Manage node state

# remove a node from Slurm's pool
scontrol update nodename=node-5 state=drain

# undrain a node that is ready to be used
scontrol update nodename=node-5 state=idle

# find nodes that are drained due to "Kill task failed"
sinfo -R | grep "Kill task failed"

View partitions

scontrol show partition
scontrol show partition <partition_name>
# or
sinfo --partition <partition_name>

View each user's fair-share usage

sshare --all

Usage

A simple Slurm guide for beginners

srun

Run an interactive job with srun:

srun --partition=<partition_name> --nodes=1 --time=02:00:00 <command>

salloc

Interactively allocate resources and start a shell session; you can then run commands directly on the allocated resources.

salloc --nodes=1 --cpus-per-task=4 --mem=8G
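
salloc opens a shell on the submitting node with the job environment set; steps launched with srun from that shell run on the allocated resources. A quick check, once the allocation is granted:

# inside the salloc session
srun hostname    # runs on the allocated node
exit             # release the allocation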

sbatch

First create an sbatch script, e.g. my_job.sh:

#!/bin/bash

#SBATCH --job-name=my_job
#SBATCH --partition=debug
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1

# Your script goes here
sleep 30
echo "hello"

Submit the job:

sbatch my_job.sh

Check job status:

squeue --job <job_id>

# show my jobs
squeue -u `whoami`

# show all jobs in a partition
squeue --partition=dev

View job accounting information:

# accounting
sacct -j <job_id> --long
sacct -u `whoami`

Cancel a job:

scancel <job_id>
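
A few other commonly used forms of scancel:

# cancel all of my jobs
scancel -u $(whoami)
# cancel jobs by name
scancel --name=my_job
# cancel only my pending jobs
scancel --state=PENDING -u $(whoami)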

Common parameters

#SBATCH --job-name=alignment      # job name
#SBATCH --nodes=4                 # request 4 nodes
#SBATCH --mem=1024                # memory per node, in MB
#SBATCH --gpus-per-node=2         # request 2 GPUs per node
#SBATCH --cpus-per-task=4         # request 4 vCPUs per task
#SBATCH --time="days-hours"       # wall-time limit

Multi-node jobs

Four nodes, 4 vCPUs per task, running the following four tasks in parallel (one per node):

#!/bin/bash

#SBATCH --job-name=multithreadedtasks
#SBATCH --nodes=4
#SBATCH --ntasks=4
#SBATCH --cpus-per-task=4

# Your script goes here
srun --ntasks=1 mycommand1 --threads 4 &
srun --ntasks=1 mycommand2 --threads 4 &
srun --ntasks=1 mycommand3 --threads 4 &
srun --ntasks=1 mycommand4 --threads 4 &
wait   # launch the four steps in parallel and wait for all of them

MPI jobs

A multi-node job with 16 tasks (vCPUs) in total, automatically distributed across nodes:

#!/bin/bash

#SBATCH --job-name=simplempi
#SBATCH --ntasks=16

# Your script goes here
mpirun myscript

Pin the job to two nodes, with 8 tasks (vCPUs) per node:

#!/bin/bash

#SBATCH --job-name=nodempi
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=8

# Your script goes here
mpirun myscript

Example

ml-engineering - orchestration/slurm/example.slurm

#!/bin/bash

# this is a 2 node slurm job example, you will most likely need to adapt --cpus-per-task and --partition

#SBATCH --job-name=example-job
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=96
#SBATCH --gres=gpu:8
#SBATCH --time=0:10:00
#SBATCH --exclusive
#SBATCH --partition=xyz-cluster
#SBATCH --output=%x-%j.out


set -x -e

# CHANGE HERE THE CONDA ENV AND ANY STARTUP SCRIPTS
source /path/to/start-xxx-user # if you have something to preload before the job
conda activate stas-xxx        # if you have conda env to activate

echo "START TIME: $(date)"

# CHANGE TO CUMULATIVELY LOG OUTPUTS
LOG_PATH="main_log.txt"

GPUS_PER_NODE=8
NNODES=$SLURM_NNODES

# so processes know who to talk to
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000

# OTHER LAUNCHERS CAN BE USED HERE
export LAUNCHER="python -u -m torch.distributed.run \
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
    --rdzv_backend c10d \
    --max_restarts 0 \
    --role `hostname -s`: \
    --tee 3 \
    "

# CHANGE HERE THE SCRIPT AND WHATEVER ARGS IT NEEDS
CMD="\
torch-distributed-gpu-test.py \
"

echo $CMD

# hide duplicated errors using this hack - will be properly fixed in pt-1.12
# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json

# force crashing on nccl issues like hanging broadcast
export NCCL_ASYNC_ERROR_HANDLING=1
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=COLL
# export NCCL_SOCKET_NTHREADS=1
# export NCCL_NSOCKS_PERTHREAD=1
# export CUDA_LAUNCH_BLOCKING=1

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
    --wait=60 \
    --kill-on-bad-exit=1 \
    "

# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee -a $LOG_PATH

echo "END TIME: $(date)"

Run Container

Containers Guide
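
The guide above covers Slurm's container support in detail. As one illustration, if the cluster has NVIDIA's pyxis/enroot plugin installed (an assumption; it is not part of stock Slurm), a containerized step can be launched straight from srun:

# assumes the pyxis plugin, which adds --container-image to srun; the image is a placeholder
srun --container-image=ubuntu:22.04 cat /etc/os-release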

NCCL

Test whether NCCL works across the nodes:

#!/bin/bash
#SBATCH --job-name=test-nodes-nccl
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=96           # number of cores per tasks
#SBATCH --gres=gpu:8                 # number of gpus
#SBATCH --time 0:05:00               # maximum execution time (HH:MM:SS)
#SBATCH --output=%x-%j.out           # output file name
#SBATCH --partition=prod

source $six_ALL_CCFRWORK/start-prod

NNODES=2

GPUS_PER_NODE=4
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000

export LAUNCHER="python -u -m torch.distributed.launch \
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT \
    "

export SCRIPT=test-nodes-nccl.py

cat << EOT > $SCRIPT
#!/usr/bin/env python
import torch.distributed as dist
import torch
import socket
import os
import fcntl

def printflock(*msgs):
    """ print """
    with open(__file__, "r") as fh:
        fcntl.flock(fh, fcntl.LOCK_EX)
        try:
            print(*msgs)
        finally:
            fcntl.flock(fh, fcntl.LOCK_UN)

local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)
dist.init_process_group("nccl")
header = f"{socket.gethostname()}-{local_rank}"
try:
    dist.barrier()
    printflock(f"{header}: NCCL {torch.cuda.nccl.version()} is OK")
except:
    printflock(f"{header}: NCCL {torch.cuda.nccl.version()} is broken")
    raise
EOT

echo $LAUNCHER --node_rank $SLURM_PROCID $SCRIPT

srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $SCRIPT'