
Testing PySlurm with Docker


Create a Dockerfile

Create a Dockerfile based on a CentOS 7 image:

FROM centos:7
MAINTAINER "Giovanni Torres"

RUN groupadd -r slurm && useradd -r -g slurm slurm

RUN yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
RUN yum -y install wget bzip2 perl gcc vim-enhanced git make munge munge-devel \
    supervisor python-devel python-pip
RUN pip install Cython nose

ENV SLURM_VERSION 16.05.6
ENV SLURM_DOWNLOAD_MD5 0c7911e52443e9f5ad1fc381085ec183
ENV SLURM_DOWNLOAD_URL http://www.schedmd.com/download/latest/slurm-"$SLURM_VERSION".tar.bz2

RUN set -x \
    && wget -O slurm.tar.bz2 "$SLURM_DOWNLOAD_URL" \
    && echo "$SLURM_DOWNLOAD_MD5" slurm.tar.bz2 | md5sum -c - \
    && mkdir /usr/local/src/slurm \
    && tar jxf slurm.tar.bz2 -C /usr/local/src/slurm --strip-components=1 \
    && rm slurm.tar.bz2 \
    && cd /usr/local/src/slurm \
    && ./configure --enable-debug --enable-front-end --prefix=/usr --sysconfdir=/etc/slurm \
    && make install \
    && install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \
    && install -D -m644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \
    && install -D -m644 etc/slurm.epilog.clean /etc/slurm/slurm.epilog.clean \
    && install -D -m644 etc/slurmdbd.conf.example /etc/slurm/slurmdbd.conf.example \
    && cd \
    && rm -rf /usr/local/src/slurm \
    && mkdir /etc/sysconfig/slurm \
    && mkdir /var/spool/slurmd \
    && chown slurm:slurm /var/spool/slurmd \
    && mkdir /var/run/slurmd \
    && chown slurm:slurm /var/run/slurmd \
    && mkdir /var/lib/slurmd \
    && chown slurm:slurm /var/lib/slurmd \
    && /sbin/create-munge-key

COPY slurm.conf /etc/slurm/slurm.conf

COPY supervisord.conf /etc/
ENTRYPOINT /usr/bin/supervisord -c /etc/supervisord.conf && /bin/bash

Create a slurm.conf

Create a slurm.conf with a few nodes that will all run on the same node for testing:

# slurm.conf
#
# See the slurm.conf man page for more information.
#
ClusterName=linux
ControlMachine=ernie
#ControlAddr=
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/lib/slurmd
SlurmdSpoolDir=/var/spool/slurmd
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
SlurmdPidFile=/var/run/slurmd/slurmd.pid
ProctrackType=proctrack/pgid
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=0
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType=select/cons_res
SelectTypeParameters=CR_CPU_Memory
FastSchedule=1
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=3
#SlurmctldLogFile=
SlurmdDebug=3
#SlurmdLogFile=
JobCompType=jobcomp/none
#JobCompLoc=
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
#AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageHost=
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
# COMPUTE NODES
NodeName=c[1-10] NodeHostName=localhost NodeAddr=127.0.0.1 RealMemory=1000
#
# PARTITIONS
PartitionName=normal Default=yes Nodes=c[1-5] Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=1 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP
PartitionName=debug Nodes=c[6-10] Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=1 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP

Create a supervisord configuration file

This configuration uses supervisord as the process manager inside the Docker container; it is responsible for starting slurmd, slurmctld, and munged:

[unix_http_server]
file=/var/run/supervisor/supervisor.sock

[supervisord]
logfile=/var/log/supervisor/supervisord.log
logfile_maxbytes=5MB
logfile_backups=10
loglevel=info
pidfile=/var/run/supervisord.pid
nodaemon=false

[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface

[supervisorctl]
serverurl=unix:///var/run/supervisor/supervisor.sock

[program:munged]
user=munge
command=/usr/sbin/munged -F
autostart=true
autorestart=false
startsecs=5
startretries=2
exitcodes=0,1,2
stdout_logfile=/var/log/supervisor/munged.log
stdout_logfile_maxbytes=1MB
stdout_logfile_backups=5
stderr_logfile=/var/log/supervisor/munged.log
stderr_logfile_maxbytes=1MB
stderr_logfile_backups=5

[program:slurmctld]
user=root
command=/usr/sbin/slurmctld -D -vvvvv
autostart=true
autorestart=false
startsecs=5
startretries=2
exitcodes=0,1,2
stdout_logfile=/var/log/supervisor/slurmctld.log
stdout_logfile_maxbytes=1MB
stdout_logfile_backups=5
stderr_logfile=/var/log/supervisor/slurmctld.log
stderr_logfile_maxbytes=1MB
stderr_logfile_backups=5

[program:slurmd]
user=root
command=/usr/sbin/slurmd -D -vvvvv
autostart=true
autorestart=false
startsecs=5
startretries=2
exitcodes=0,1,2
stdout_logfile=/var/log/supervisor/slurmd.log
stdout_logfile_maxbytes=1MB
stdout_logfile_backups=5
stderr_logfile=/var/log/supervisor/slurmd.log
stderr_logfile_maxbytes=1MB
stderr_logfile_backups=5

Build the Slurm container

Before building the container, be sure to edit the following ENV values in the Dockerfile to match your desired version of Slurm:

ENV SLURM_VERSION 16.05.6
ENV SLURM_DOWNLOAD_MD5 0c7911e52443e9f5ad1fc381085ec183

You can get the MD5 hashes from https://www.schedmd.com/downloads.php.

Put all three files above in a directory. From that directory, run the following to build the container:

docker build -t slurm-16.05.6-1 .

Run the container

Notice that in slurm.conf, ControlMachine is set to ernie. Run the container with the following command to set the matching hostname; otherwise, slurmctld will fail to start due to a hostname mismatch:

docker run -it -h ernie slurm-16.05.6-1

This should take you right to a bash shell inside the container:

[root@ernie /]# sinfo
PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
normal*      up 5-00:00:00      5   idle c[1-5]
[root@ernie /]# scontrol show node c1
NodeName=c1 CoresPerSocket=1
   CPUAlloc=0 CPUErr=0 CPUTot=1 CPULoad=N/A
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=(null)
   NodeAddr=127.0.0.1 NodeHostName=localhost Version=(null)
   RealMemory=1000 AllocMem=0 FreeMem=N/A Sockets=1 Boards=1
   State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   BootTime=2016-10-23T22:22:09 SlurmdStartTime=2016-11-05T16:38:10
   CapWatts=n/a
   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s

Build PySlurm

Slurm is already installed and its libraries are in the default system locations (/usr/lib/slurm), so PySlurm will find them without needing explicit library or include paths.

Clone the repo:

git clone https://github.com/PySlurm/pyslurm.git

Build and install PySlurm:

cd pyslurm
python setup.py build
python setup.py install
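
As a quick sanity check after the install, you can query the cluster from Python. This is a minimal sketch, assuming the pyslurm.slurm_api_version() and pyslurm.node() calls provided by this PySlurm release:

import pyslurm

# Print the Slurm API version PySlurm was built against
print(pyslurm.slurm_api_version())

# List the nodes defined in slurm.conf (c1 through c10)
nodes = pyslurm.node().get()
print(sorted(nodes.keys()))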

Run a test job:

sbatch --wrap="sleep 1000" --partition=normal -N 1
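
To see the submitted job from Python, you can list the job queue with PySlurm. This is a sketch assuming the pyslurm.job() interface from this release; the exact dictionary keys may vary between versions:

import pyslurm

# The job dictionary returned by PySlurm is keyed by job ID
jobs = pyslurm.job().get()
for job_id, info in jobs.items():
    print("%s %s %s" % (job_id, info.get("job_state"), info.get("partition")))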

Change directories and run tests:

cd && nosetests -v /pyslurm/tests

At this point, some of the tests will fail because certain attributes, such as Arch and OS, are not populated inside the container, whereas they would be on bare metal or in a virtual machine. You may comment out these tests and re-run nosetests.
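
For reference, you can inspect these attributes directly from Python. This is only an illustrative sketch assuming the pyslurm.node() interface; the key names (arch, os) are an assumption and may differ between releases:

import pyslurm

# Inside the container these attributes are typically empty/None,
# which is why the corresponding tests fail
c1 = pyslurm.node().get()["c1"]
print("%s %s" % (c1.get("arch"), c1.get("os")))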