Skip to content

Commit

Permalink
Migrate Slurm to use rockylinux (#650)
Browse files Browse the repository at this point in the history
* Migrate Slurm to use rockylinux

* Revert resource changes
  • Loading branch information
jacobtomlinson authored Aug 5, 2024
1 parent 37143f8 commit 9d4c371
Show file tree
Hide file tree
Showing 4 changed files with 211 additions and 6 deletions.
112 changes: 108 additions & 4 deletions ci/slurm/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,104 @@
FROM giovtorres/slurm-docker-cluster
# SPDX-FileCopyrightText: Copyright (c) 2019 Giovanni Torres
# SPDX-License-Identifier: BSD 3-Clause License
# SPDX-URL: https://github.com/giovtorres/slurm-docker-cluster/blob/65e3a83098eae4ae6f0dcf656e814129d279b7c9/LICENSE

RUN yum install -y iproute
# This is a modified version of the original Dockerfile
# https://github.com/giovtorres/slurm-docker-cluster/blob/65e3a83098eae4ae6f0dcf656e814129d279b7c9/Dockerfile
# Previously this image used "FROM giovtorres/slurm-docker-cluster:latest".
# That project is still maintained, but its image is no longer pushed to Docker Hub,
# so the Dockerfile is vendored here.


# Base image for the whole Slurm CI image.
FROM rockylinux:8

# Image metadata. The source/title/maintainer values refer to the upstream
# slurm-docker-cluster project that this Dockerfile is derived from.
LABEL org.opencontainers.image.source="https://github.com/giovtorres/slurm-docker-cluster" \
org.opencontainers.image.title="slurm-docker-cluster" \
org.opencontainers.image.description="Slurm Docker cluster on Rocky Linux 8" \
org.label-schema.docker.cmd="docker-compose up -d" \
maintainer="Giovanni Torres"

# Install the OS packages needed to build and run Slurm in this image:
# compiler toolchain, git/wget for fetching sources, MUNGE, MariaDB,
# Python 3 and a few interactive utilities.
# The "powertools" repository is enabled before the main install step
# (NOTE(review): presumably required for some of the -devel packages; confirm).
# Package caches are removed in the same layer so they never reach the image.
# On EL8 `yum` is provided by dnf, whose cache lives in /var/cache/dnf, so both
# cache directories are cleared.
RUN set -ex \
    && yum makecache \
    && yum -y update \
    && yum -y install dnf-plugins-core \
    && yum config-manager --set-enabled powertools \
    && yum -y install \
        iproute \
        wget \
        bzip2 \
        perl \
        gcc \
        gcc-c++ \
        git \
        gnupg \
        make \
        munge \
        munge-devel \
        python3-devel \
        python3-pip \
        python3 \
        mariadb-server \
        mariadb-devel \
        psmisc \
        bash-completion \
        vim-enhanced \
        http-parser-devel \
        json-c-devel \
    && yum clean all \
    && rm -rf /var/cache/yum /var/cache/dnf

# Select /usr/bin/python3 as the target of the "python" alternative.
RUN alternatives --set python /usr/bin/python3

# Install Cython and nose with the system pip.
# --no-cache-dir keeps pip's download/wheel cache out of the image layer.
RUN pip3 install --no-cache-dir Cython nose

# gosu release to install; the entrypoint script uses gosu to start daemons
# as the munge and slurm users.
ARG GOSU_VERSION=1.11

# Download the gosu binary and its detached signature, verify the signature
# with the release key inside a throw-away GnuPG home, then install the binary.
# gpg can leave background daemons (gpg-agent / dirmngr) running with files
# open inside GNUPGHOME; they are stopped before the directory is deleted so
# the cleanup cannot race with them. The final `gosu nobody true` is a smoke
# test that the installed binary actually works.
RUN set -ex \
    && wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" \
    && wget -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64.asc" \
    && export GNUPGHOME="$(mktemp -d)" \
    && gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys B42F6819007F00F88E364FD4036A9C25BF357DD4 \
    && gpg --batch --verify /usr/local/bin/gosu.asc /usr/local/bin/gosu \
    && { command -v gpgconf > /dev/null && gpgconf --kill all || :; } \
    && rm -rf "${GNUPGHOME}" /usr/local/bin/gosu.asc \
    && chmod +x /usr/local/bin/gosu \
    && gosu nobody true

# Tag of the SchedMD/slurm repository to build.
ARG SLURM_TAG=slurm-21-08-6-1

# Build Slurm from source and lay out its runtime files, all in one layer:
#   1. shallow-clone the requested tag, configure it and run `make install`;
#   2. install the example configs and the shell completion script, then
#      delete the source tree;
#   3. create the slurm system group and user (gid/uid 990);
#   4. create the config, spool, run, state, log and data directories and the
#      empty state files under /var/lib/slurmd;
#   5. give the /var/*/slurm* paths to the slurm user and generate the MUNGE key.
RUN set -x \
    && git clone --branch ${SLURM_TAG} --single-branch --depth 1 https://github.com/SchedMD/slurm.git \
    && pushd slurm \
    && ./configure \
        --enable-debug \
        --prefix=/usr \
        --sysconfdir=/etc/slurm \
        --with-mysql_config=/usr/bin \
        --libdir=/usr/lib64 \
    && make install \
    && install -D -m 644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \
    && install -D -m 644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \
    && install -D -m 644 etc/slurmdbd.conf.example /etc/slurm/slurmdbd.conf.example \
    && install -D -m 644 contribs/slurm_completion_help/slurm_completion.sh /etc/profile.d/slurm_completion.sh \
    && popd \
    && rm -rf slurm \
    && groupadd --system --gid 990 slurm \
    && useradd --system --gid slurm --uid 990 slurm \
    && mkdir \
        /etc/sysconfig/slurm \
        /var/spool/slurmd \
        /var/run/slurmd \
        /var/run/slurmdbd \
        /var/lib/slurmd \
        /var/log/slurm \
        /data \
    && touch \
        /var/lib/slurmd/node_state \
        /var/lib/slurmd/front_end_state \
        /var/lib/slurmd/job_state \
        /var/lib/slurmd/resv_state \
        /var/lib/slurmd/trigger_state \
        /var/lib/slurmd/assoc_mgr_state \
        /var/lib/slurmd/assoc_usage \
        /var/lib/slurmd/qos_usage \
        /var/lib/slurmd/fed_mgr_state \
    && chown -R slurm:slurm /var/*/slurm* \
    && /sbin/create-munge-key

# Default argument handed to the entrypoint when the container is started
# without an explicit command.
CMD ["slurmdbd"]

RUN curl -o miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash miniconda.sh -f -b -p /opt/anaconda && \
Expand All @@ -12,6 +110,12 @@ COPY environment.yml .
# Create the conda environment described by the copied environment.yml.
RUN conda env create --file environment.yml
# From here on, shell-form instructions are executed through
# `conda run -n dask-jobqueue /bin/bash -c`, i.e. inside that environment.
SHELL ["conda", "run", "-n", "dask-jobqueue", "/bin/bash", "-c"]

# Locale for processes in the container. Written in key=value form; the
# space-separated `ENV key value` syntax is deprecated.
ENV LC_ALL=en_US.UTF-8

# Cluster configuration baked into the image.
COPY slurm.conf /etc/slurm/slurm.conf
COPY slurmdbd.conf /etc/slurm/slurmdbd.conf
# slurmdbd.conf carries the database password (StoragePass), so it is handed
# to the slurm user and made readable by its owner only.
RUN set -x \
&& chown slurm:slurm /etc/slurm/slurmdbd.conf \
&& chmod 600 /etc/slurm/slurmdbd.conf
# The entrypoint script starts munged and then the Slurm daemon named by its
# first argument (slurmdbd, slurmctld or slurmd); anything else is exec'd as is.
COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]

# Locale for processes in the container. Written in key=value form; the
# space-separated `ENV key value` syntax is deprecated.
ENV LC_ALL=en_US.UTF-8
64 changes: 64 additions & 0 deletions ci/slurm/docker-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/bin/bash
set -e

# slurmdbd role: start munged, wait until the database server named in
# slurmdbd.conf answers a trivial query, then replace this shell with the
# Slurm database daemon running in the foreground as the slurm user.
if [ "$1" = "slurmdbd" ]
then
    echo "---> Starting the MUNGE Authentication service (munged) ..."
    gosu munge /usr/sbin/munged

    echo "---> Starting the Slurm Database Daemon (slurmdbd) ..."

    {
        # slurmdbd.conf is made of KEY=value lines, so sourcing it as shell
        # defines StorageHost, StorageUser and StoragePass as variables.
        . /etc/slurm/slurmdbd.conf
        # The expansions are quoted so that values containing whitespace or
        # glob characters reach mysql as single, unmodified arguments.
        # Redirection order matters: stderr is first pointed at the script's
        # stdout and only then is stdout discarded, so the query result is
        # dropped while connection errors still appear in the container log.
        until echo "SELECT 1" | mysql -h "$StorageHost" -u"$StorageUser" -p"$StoragePass" 2>&1 > /dev/null
        do
            echo "-- Waiting for database to become active ..."
            sleep 2
        done
    }
    echo "-- Database is now active ..."

    exec gosu slurm /usr/sbin/slurmdbd -Dvvv
fi

# slurmctld role: start munged, block until slurmdbd accepts TCP connections
# on port 6819, then replace this shell with the controller daemon running in
# the foreground as the slurm user.
if [[ $1 == "slurmctld" ]]; then
    echo "---> Starting the MUNGE Authentication service (munged) ..."
    gosu munge /usr/sbin/munged

    echo "---> Waiting for slurmdbd to become active before starting slurmctld ..."

    # Opening bash's /dev/tcp pseudo-path only succeeds once the port accepts
    # connections; the error printed on failure is silenced.
    until 2>/dev/null >/dev/tcp/slurmdbd/6819; do
        echo "-- slurmdbd is not available. Sleeping ..."
        sleep 2
    done
    echo "-- slurmdbd is now active ..."

    echo "---> Starting the Slurm Controller Daemon (slurmctld) ..."
    # -i is passed unless the version string reported by slurmctld matches 17.02.
    ctld_flags=(-i -Dvvv)
    if /usr/sbin/slurmctld -V | grep -q '17.02'; then
        ctld_flags=(-Dvvv)
    fi
    exec gosu slurm /usr/sbin/slurmctld "${ctld_flags[@]}"
fi

# slurmd role: start munged, block until slurmctld accepts TCP connections on
# port 6817, then replace this shell with the node daemon in the foreground.
if [[ $1 == "slurmd" ]]; then
    echo "---> Starting the MUNGE Authentication service (munged) ..."
    gosu munge /usr/sbin/munged

    echo "---> Waiting for slurmctld to become active before starting slurmd..."

    # Opening bash's /dev/tcp pseudo-path only succeeds once the port accepts
    # connections; the error printed on failure is silenced.
    until 2>/dev/null >/dev/tcp/slurmctld/6817; do
        echo "-- slurmctld is not available. Sleeping ..."
        sleep 2
    done
    echo "-- slurmctld is now active ..."

    echo "---> Starting the Slurm Node Daemon (slurmd) ..."
    # Unlike the other roles, slurmd is exec'd directly rather than through gosu.
    exec /usr/sbin/slurmd -Dvvv
fi

# No known role matched: replace this shell with whatever command was given,
# passing every argument through unchanged.
exec "$@"
4 changes: 2 additions & 2 deletions ci/slurm/slurm.conf
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
SlurmdPidFile=/var/run/slurmd/slurmd.pid
ProctrackType=proctrack/linuxproc
#PluginDir=
CacheGroups=0
#CacheGroups=0
#FirstJobId=
ReturnToService=0
#MaxJobCount=
Expand Down Expand Up @@ -83,7 +83,7 @@ JobAcctGatherFrequency=30
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=slurmdbd
AccountingStoragePort=6819
AccountingStorageLoc=slurm_acct_db
#AccountingStorageLoc=slurm_acct_db
#AccountingStoragePass=
#AccountingStorageUser=
#
Expand Down
37 changes: 37 additions & 0 deletions ci/slurm/slurmdbd.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#
# Example slurmdbd.conf file.
#
# See the slurmdbd.conf man page for more information.
#
# Archive info
#ArchiveJobs=yes
#ArchiveDir="/tmp"
#ArchiveSteps=yes
#ArchiveScript=
#JobPurge=12
#StepPurge=1
#
# Authentication info
AuthType=auth/munge
#AuthInfo=/var/run/munge/munge.socket.2
#
# slurmDBD info
DbdAddr=slurmdbd
DbdHost=slurmdbd
#DbdPort=6819
SlurmUser=slurm
#MessageTimeout=300
DebugLevel=4
#DefaultQOS=normal,standby
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurmdbd/slurmdbd.pid
#PluginDir=/usr/lib/slurm
#PrivateData=accounts,users,usage,jobs
#TrackWCKey=yes
#
# Database info
# NOTE: docker-entrypoint.sh sources this file as shell to read StorageHost,
# StorageUser and StoragePass before starting slurmdbd, so keep the active
# settings as plain KEY=value lines with no spaces around "=".
StorageType=accounting_storage/mysql
StorageHost=mysql
StorageUser=slurm
StoragePass=password
#StorageLoc=slurm_acct_db

0 comments on commit 9d4c371

Please sign in to comment.