diff --git a/ci/slurm/Dockerfile b/ci/slurm/Dockerfile
index 4415681a..fb8e1b44 100644
--- a/ci/slurm/Dockerfile
+++ b/ci/slurm/Dockerfile
@@ -1,6 +1,104 @@
-FROM giovtorres/slurm-docker-cluster
+# SPDX-FileCopyrightText: Copyright (c) 2019 Giovanni Torres
+# SPDX-License-Identifier: BSD 3-Clause License
+# SPDX-URL: https://github.com/giovtorres/slurm-docker-cluster/blob/65e3a83098eae4ae6f0dcf656e814129d279b7c9/LICENSE
 
-RUN yum install -y iproute
+# This is a modified copy of the upstream Dockerfile:
+# https://github.com/giovtorres/slurm-docker-cluster/blob/65e3a83098eae4ae6f0dcf656e814129d279b7c9/Dockerfile
+# Previously this image used "FROM giovtorres/slurm-docker-cluster:latest".
+# That project is still maintained, but its image is no longer pushed to Docker Hub,
+# so its build steps are vendored here.
+
+
+FROM rockylinux:8
+
+LABEL org.opencontainers.image.source="https://github.com/giovtorres/slurm-docker-cluster" \
+      org.opencontainers.image.title="slurm-docker-cluster" \
+      org.opencontainers.image.description="Slurm Docker cluster on Rocky Linux 8" \
+      org.label-schema.docker.cmd="docker-compose up -d" \
+      maintainer="Giovanni Torres"
+# Refresh metadata, enable the "powertools" repo, then install build tools and service packages.
+RUN set -ex \
+    && yum makecache \
+    && yum -y update \
+    && yum -y install dnf-plugins-core \
+    && yum config-manager --set-enabled powertools \
+    && yum -y install \
+       iproute \
+       wget \
+       bzip2 \
+       perl \
+       gcc \
+       gcc-c++ \
+       git \
+       gnupg \
+       make \
+       munge \
+       munge-devel \
+       python3-devel \
+       python3-pip \
+       python3 \
+       mariadb-server \
+       mariadb-devel \
+       psmisc \
+       bash-completion \
+       vim-enhanced \
+       http-parser-devel \
+       json-c-devel \
+    && yum clean all \
+    && rm -rf /var/cache/dnf /var/cache/yum
+# Point the "python" alternative at python3.
+RUN alternatives --set python /usr/bin/python3
+
+RUN pip3 install --no-cache-dir Cython nose
+
+ARG GOSU_VERSION=1.11
+# Fetch gosu, verify its detached GPG signature, then smoke-test it ("gosu nobody true").
+RUN set -ex \
+    && wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" \
+    && wget -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64.asc" \
+    && export GNUPGHOME="$(mktemp -d)" \
+    && gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys B42F6819007F00F88E364FD4036A9C25BF357DD4 \
+    && gpg --batch --verify /usr/local/bin/gosu.asc /usr/local/bin/gosu \
+    && rm -rf "${GNUPGHOME}" /usr/local/bin/gosu.asc \
+    && chmod +x /usr/local/bin/gosu \
+    && gosu nobody true
+
+ARG SLURM_TAG=slurm-21-08-6-1
+# Build and install Slurm from the tagged source, then create the slurm user, directories, state files and munge key.
+RUN set -x \
+    && git clone -b "${SLURM_TAG}" --single-branch --depth=1 https://github.com/SchedMD/slurm.git \
+    && pushd slurm \
+    && ./configure --enable-debug --prefix=/usr --sysconfdir=/etc/slurm \
+        --with-mysql_config=/usr/bin --libdir=/usr/lib64 \
+    && make install \
+    && install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \
+    && install -D -m644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \
+    && install -D -m644 etc/slurmdbd.conf.example /etc/slurm/slurmdbd.conf.example \
+    && install -D -m644 contribs/slurm_completion_help/slurm_completion.sh /etc/profile.d/slurm_completion.sh \
+    && popd \
+    && rm -rf slurm \
+    && groupadd -r --gid=990 slurm \
+    && useradd -r -g slurm --uid=990 slurm \
+    && mkdir /etc/sysconfig/slurm \
+        /var/spool/slurmd \
+        /var/run/slurmd \
+        /var/run/slurmdbd \
+        /var/lib/slurmd \
+        /var/log/slurm \
+        /data \
+    && touch /var/lib/slurmd/node_state \
+        /var/lib/slurmd/front_end_state \
+        /var/lib/slurmd/job_state \
+        /var/lib/slurmd/resv_state \
+        /var/lib/slurmd/trigger_state \
+        /var/lib/slurmd/assoc_mgr_state \
+        /var/lib/slurmd/assoc_usage \
+        /var/lib/slurmd/qos_usage \
+        /var/lib/slurmd/fed_mgr_state \
+    && chown -R slurm:slurm /var/*/slurm* \
+    && /sbin/create-munge-key
+
+CMD ["slurmdbd"]
 
 RUN curl -o miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
     bash miniconda.sh -f -b -p /opt/anaconda && \
@@ -12,6 +110,12 @@ COPY environment.yml .
 RUN conda env create --file environment.yml
 SHELL ["conda", "run", "-n", "dask-jobqueue", "/bin/bash", "-c"]
 
-ENV LC_ALL en_US.UTF-8
-
 COPY slurm.conf /etc/slurm/slurm.conf
+COPY slurmdbd.conf /etc/slurm/slurmdbd.conf
+RUN set -x \
+    && chown slurm:slurm /etc/slurm/slurmdbd.conf \
+    && chmod 600 /etc/slurm/slurmdbd.conf
+COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
+ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
+
+ENV LC_ALL=en_US.UTF-8
diff --git a/ci/slurm/docker-entrypoint.sh b/ci/slurm/docker-entrypoint.sh
new file mode 100755
index 00000000..2d933db9
--- /dev/null
+++ b/ci/slurm/docker-entrypoint.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+set -e
+# Dispatch on "$1" (slurmdbd, slurmctld or slurmd); any other command line is exec'd unchanged.
+if [ "$1" = "slurmdbd" ]
+then
+    echo "---> Starting the MUNGE Authentication service (munged) ..."
+    gosu munge /usr/sbin/munged
+
+    echo "---> Starting the Slurm Database Daemon (slurmdbd) ..."
+    # slurmdbd.conf is sourced as shell to obtain StorageHost, StorageUser and StoragePass.
+    {
+        . /etc/slurm/slurmdbd.conf
+        until echo "SELECT 1" | mysql -h "$StorageHost" -u"$StorageUser" -p"$StoragePass" > /dev/null 2>&1
+        do
+            echo "-- Waiting for database to become active ..."
+            sleep 2
+        done
+    }
+    echo "-- Database is now active ..."
+
+    exec gosu slurm /usr/sbin/slurmdbd -Dvvv
+fi
+
+if [ "$1" = "slurmctld" ]
+then
+    echo "---> Starting the MUNGE Authentication service (munged) ..."
+    gosu munge /usr/sbin/munged
+
+    echo "---> Waiting for slurmdbd to become active before starting slurmctld ..."
+    # Bash /dev/tcp redirection: succeeds only once a TCP connection to slurmdbd:6819 can be opened.
+    until 2>/dev/null >/dev/tcp/slurmdbd/6819
+    do
+        echo "-- slurmdbd is not available.  Sleeping ..."
+        sleep 2
+    done
+    echo "-- slurmdbd is now active ..."
+
+    echo "---> Starting the Slurm Controller Daemon (slurmctld) ..."
+    if /usr/sbin/slurmctld -V | grep -qF '17.02' ; then
+        exec gosu slurm /usr/sbin/slurmctld -Dvvv
+    else
+        exec gosu slurm /usr/sbin/slurmctld -i -Dvvv
+    fi
+fi
+
+if [ "$1" = "slurmd" ]
+then
+    echo "---> Starting the MUNGE Authentication service (munged) ..."
+    gosu munge /usr/sbin/munged
+
+    echo "---> Waiting for slurmctld to become active before starting slurmd..."
+    # Bash /dev/tcp redirection: succeeds only once a TCP connection to slurmctld:6817 can be opened.
+    until 2>/dev/null >/dev/tcp/slurmctld/6817
+    do
+        echo "-- slurmctld is not available.  Sleeping ..."
+        sleep 2
+    done
+    echo "-- slurmctld is now active ..."
+
+    echo "---> Starting the Slurm Node Daemon (slurmd) ..."
+    exec /usr/sbin/slurmd -Dvvv
+fi
+
+exec "$@"
diff --git a/ci/slurm/slurm.conf b/ci/slurm/slurm.conf
index 0aad9f1b..5b4cdd8c 100644
--- a/ci/slurm/slurm.conf
+++ b/ci/slurm/slurm.conf
@@ -23,7 +23,7 @@ SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
 SlurmdPidFile=/var/run/slurmd/slurmd.pid
 ProctrackType=proctrack/linuxproc
 #PluginDir=
-CacheGroups=0
+#CacheGroups=0
 #FirstJobId=
 ReturnToService=0
 #MaxJobCount=
@@ -83,12 +83,12 @@ JobAcctGatherFrequency=30
 AccountingStorageType=accounting_storage/slurmdbd
 AccountingStorageHost=slurmdbd
 AccountingStoragePort=6819
-AccountingStorageLoc=slurm_acct_db
+#AccountingStorageLoc=slurm_acct_db
 #AccountingStoragePass=
 #AccountingStorageUser=
 #
 # COMPUTE NODES
-NodeName=c[1-2] RealMemory=4096 CPUs=2 State=UNKNOWN
+NodeName=c[1-2] RealMemory=1000 State=UNKNOWN
 #
 # PARTITIONS
-PartitionName=normal Default=yes Nodes=c[1-2] Priority=50 DefMemPerCPU=2048 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP
+PartitionName=normal Default=yes Nodes=c[1-2] Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP
diff --git a/ci/slurm/slurmdbd.conf b/ci/slurm/slurmdbd.conf
new file mode 100644
index 00000000..69d7e1cf
--- /dev/null
+++ b/ci/slurm/slurmdbd.conf
@@ -0,0 +1,37 @@
+#
+# Example slurmdbd.conf file.
+# NOTE: docker-entrypoint.sh sources this file as shell; keep it to plain KEY=value lines.
+# See the slurmdbd.conf man page for more information.
+#
+# Archive info
+#ArchiveJobs=yes
+#ArchiveDir="/tmp"
+#ArchiveSteps=yes
+#ArchiveScript=
+#JobPurge=12
+#StepPurge=1
+#
+# Authentication info
+AuthType=auth/munge
+#AuthInfo=/var/run/munge/munge.socket.2
+#
+# slurmDBD info
+DbdAddr=slurmdbd
+DbdHost=slurmdbd
+#DbdPort=6819
+SlurmUser=slurm
+#MessageTimeout=300
+DebugLevel=4
+#DefaultQOS=normal,standby
+LogFile=/var/log/slurm/slurmdbd.log
+PidFile=/var/run/slurmdbd/slurmdbd.pid
+#PluginDir=/usr/lib/slurm
+#PrivateData=accounts,users,usage,jobs
+#TrackWCKey=yes
+#
+# Database info
+StorageType=accounting_storage/mysql
+StorageHost=mysql
+StorageUser=slurm
+StoragePass=password
+#StorageLoc=slurm_acct_db