From 9d9834105bc41c5b62f0dc43af49003eb5a972d0 Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Tue, 18 May 2021 10:41:36 -0700 Subject: [PATCH 01/20] Introduce a common script library, config for env vars, and inject these into all scripts --- config.example/env.sh | 6 ++++++ scripts/airgap/build_offline_cache.sh | 4 ++++ scripts/common.sh | 23 +++++++++++++++++++++++ scripts/generic/install_docker.sh | 5 +++++ scripts/k8s/debug.sh | 10 ++++++++++ scripts/k8s/deploy_dashboard_user.sh | 5 +++++ scripts/k8s/deploy_ingress.sh | 3 +++ scripts/k8s/deploy_kubeflow.sh | 3 +++ scripts/k8s/deploy_loadbalancer.sh | 3 +++ scripts/k8s/deploy_monitoring.sh | 3 +++ scripts/k8s/deploy_rook.sh | 5 ++++- scripts/k8s/install_helm.sh | 5 +++++ scripts/k8s/setup_remote_k8s.sh | 5 +++++ scripts/k8s/verify_gpu.sh | 5 +++++ scripts/nginx-docker-cache/gen-ca.sh | 5 +++++ 15 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 config.example/env.sh create mode 100644 scripts/common.sh diff --git a/config.example/env.sh b/config.example/env.sh new file mode 100644 index 000000000..c33799baf --- /dev/null +++ b/config.example/env.sh @@ -0,0 +1,6 @@ +# This file acts as a location to override the default configurations of deepops/scripts/* +# Many of the scripts in this directory define global variables and set reasonable defaults +# Global variables (in all caps) that are defined here will be automatically sourced and used in all scripts +# See deepops/scripts/common.sh for implementation details + +DEEPOPS_VERSION="development-branch" diff --git a/scripts/airgap/build_offline_cache.sh b/scripts/airgap/build_offline_cache.sh index 00428a9fa..5ed169ac8 100755 --- a/scripts/airgap/build_offline_cache.sh +++ b/scripts/airgap/build_offline_cache.sh @@ -4,6 +4,10 @@ set -ex SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ROOT_DIR="${SCRIPT_DIR}/../.." 
DEEPOPS_CONFIG_DIR="${DEEPOPS_CONFIG_DIR:-${ROOT_DIR}/config.example}" + +# Source common libraries and env variables +source ${ROOT_DIR}/scripts/common.sh + DEST_DIR="/tmp/deepops" TARBALL="/tmp/deepops-archive.tar" DEEPOPS_BUILD_TARBALL="${DEEPOPS_BUILD_TARBALL:-1}" diff --git a/scripts/common.sh b/scripts/common.sh new file mode 100644 index 000000000..024f0b690 --- /dev/null +++ b/scripts/common.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# This is a common set of libraries, configuration override, helper functions, and debug output +# This file should be sourced at the top of all scripts and primarily does 3 things +# 1. Will source the env.sh file to allow override variables be version controlled in ./config +# 2. Will print out some standard debug for each script, to ease debugging +# 3. Will provide a common set of libraries, directory names, etc. + + +# Determine the path to the configuration directory and verify it exists +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/.." +DEEPOPS_CONFIG_DIR=${DEEPOPS_CONFIG_DIR:-"${ROOT_DIR}/config"} +if [ ! -d "${DEEPOPS_CONFIG_DIR}" ]; then + # Because this is a widely used script, we warn here instead of throwing an error + echo "WARNING: Can't find configuration in ${DEEPOPS_CONFIG_DIR}" + echo "WARNING: Please set DEEPOPS_CONFIG_DIR env variable to point to config location" +else + # Source the configuration environment variable overrides + source ${DEEPOPS_CONFIG_DIR}/env.sh +fi + +# Print out base debug +echo "Starting '${0}'; DeepOps version '${DEEPOPS_VERSION}'" diff --git a/scripts/generic/install_docker.sh b/scripts/generic/install_docker.sh index 930726d4c..53d5a704b 100755 --- a/scripts/generic/install_docker.sh +++ b/scripts/generic/install_docker.sh @@ -1,5 +1,10 @@ #!/usr/bin/env bash +# Source common libraries and env variables +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/../.." 
+source ${ROOT_DIR}/scripts/common.sh + DOCKER_COMPOSE_URL="${DOCKER_COMPOSE_URL:-https://github.com/docker/compose/releases/download/1.23.2/docker-compose-$(uname -s)-$(uname -m)}" type docker >/dev/null 2>&1 diff --git a/scripts/k8s/debug.sh b/scripts/k8s/debug.sh index b34f32415..bb0078f99 100755 --- a/scripts/k8s/debug.sh +++ b/scripts/k8s/debug.sh @@ -1,4 +1,14 @@ #!/usr/bin/env bash +# This script is meant as a quick best-effort debug log-bundler tool +# Running this will create a quick tarball with most of the information needed to debug a cluster + + +# Source common libraries and env variables +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/../.." +source ${ROOT_DIR}/scripts/common.sh + + timestamp=$(date +%s) logdir=config/log_${timestamp} mkdir ${logdir} diff --git a/scripts/k8s/deploy_dashboard_user.sh b/scripts/k8s/deploy_dashboard_user.sh index b4e92784d..7c82685fe 100755 --- a/scripts/k8s/deploy_dashboard_user.sh +++ b/scripts/k8s/deploy_dashboard_user.sh @@ -1,5 +1,10 @@ #!/usr/bin/env bash +# Source common libraries and env variables +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/../.." +source ${ROOT_DIR}/scripts/common.sh + # Make the dashboard a NodePort kubectl patch svc -n kube-system kubernetes-dashboard -p '{"spec": {"type": "NodePort", "ports": [{"nodePort": 31443, "port": 443}] }}' diff --git a/scripts/k8s/deploy_ingress.sh b/scripts/k8s/deploy_ingress.sh index 76b04e857..4b6a75f05 100755 --- a/scripts/k8s/deploy_ingress.sh +++ b/scripts/k8s/deploy_ingress.sh @@ -5,6 +5,9 @@ set -x SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ROOT_DIR="${SCRIPT_DIR}/../.." 
+# Source common libraries and env variables +source ${ROOT_DIR}/scripts/common.sh + HELM_CHARTS_REPO_INGRESS="${HELM_CHARTS_REPO_INGRESS:-https://kubernetes.github.io/ingress-nginx}" HELM_INGRESS_CHART_VERSION="${HELM_INGRESS_CHART_VERSION:-3.5.1}" # HELM_INGRESS_CONFIG, defaults below based on presence of metallb diff --git a/scripts/k8s/deploy_kubeflow.sh b/scripts/k8s/deploy_kubeflow.sh index 31b59122d..887c0dcdd 100755 --- a/scripts/k8s/deploy_kubeflow.sh +++ b/scripts/k8s/deploy_kubeflow.sh @@ -5,6 +5,9 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ROOT_DIR="${SCRIPT_DIR}/../.." CONFIG_DIR="${ROOT_DIR}/config" +# Source common libraries and env variables +source ${ROOT_DIR}/scripts/common.sh + # Specify credentials for the default user. # TODO: Dynamically sed/hash these value into the CONFIG, these are currently not used export KUBEFLOW_USER_EMAIL="${KUBEFLOW_USER_EMAIL:-admin@kubeflow.org}" diff --git a/scripts/k8s/deploy_loadbalancer.sh b/scripts/k8s/deploy_loadbalancer.sh index 65e309fd5..e2dada441 100755 --- a/scripts/k8s/deploy_loadbalancer.sh +++ b/scripts/k8s/deploy_loadbalancer.sh @@ -5,6 +5,9 @@ set -x SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ROOT_DIR="${SCRIPT_DIR}/../.." +# Source common libraries and env variables +source ${ROOT_DIR}/scripts/common.sh + # Allow overriding config dir to look in DEEPOPS_CONFIG_DIR=${DEEPOPS_CONFIG_DIR:-"${ROOT_DIR}/config"} if [ ! -d "${DEEPOPS_CONFIG_DIR}" ]; then diff --git a/scripts/k8s/deploy_monitoring.sh b/scripts/k8s/deploy_monitoring.sh index d32a93ef7..3fe620b54 100755 --- a/scripts/k8s/deploy_monitoring.sh +++ b/scripts/k8s/deploy_monitoring.sh @@ -11,6 +11,9 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ROOT_DIR="${SCRIPT_DIR}/../.." 
cd "${ROOT_DIR}" || exit 1 +# Source common libraries and env variables +source ${ROOT_DIR}/scripts/common.sh + # Allow overriding config dir to look in DEEPOPS_CONFIG_DIR=${DEEPOPS_CONFIG_DIR:-"${ROOT_DIR}/config"} diff --git a/scripts/k8s/deploy_rook.sh b/scripts/k8s/deploy_rook.sh index 56394804f..9a746863f 100755 --- a/scripts/k8s/deploy_rook.sh +++ b/scripts/k8s/deploy_rook.sh @@ -8,8 +8,11 @@ # Get absolute path for script and root SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ROOT_DIR="${SCRIPT_DIR}/../.." -CHART_VERSION="1.22.1" +# Source common libraries and env variables +source ${ROOT_DIR}/scripts/common.sh + +CHART_VERSION="1.22.1" HELM_ROOK_CHART_REPO="${HELM_ROOK_CHART_REPO:-https://charts.rook.io/release}" HELM_ROOK_CHART_VERSION="${HELM_ROOK_CHART_VERSION:-v1.1.1}" diff --git a/scripts/k8s/install_helm.sh b/scripts/k8s/install_helm.sh index 0366ecf07..fad5f356e 100755 --- a/scripts/k8s/install_helm.sh +++ b/scripts/k8s/install_helm.sh @@ -2,6 +2,11 @@ set -x +# Source common libraries and env variables +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/../.." +source ${ROOT_DIR}/scripts/common.sh + HELM_INSTALL_DIR=/usr/local/bin HELM_INSTALL_SCRIPT_URL="${HELM_INSTALL_SCRIPT_URL:-https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3}" HELM_MINIMUM_VERSION=v3.4.1+gc4e7485 diff --git a/scripts/k8s/setup_remote_k8s.sh b/scripts/k8s/setup_remote_k8s.sh index 8115fc856..70462b874 100755 --- a/scripts/k8s/setup_remote_k8s.sh +++ b/scripts/k8s/setup_remote_k8s.sh @@ -1,5 +1,10 @@ #!/usr/bin/env bash +# Source common libraries and env variables +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/../.." 
+source ${ROOT_DIR}/scripts/common.sh + KUBECTL_BINARY_URL="${KUBECTL_BINARY_URL:-https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl}" # Install dependencies diff --git a/scripts/k8s/verify_gpu.sh b/scripts/k8s/verify_gpu.sh index e284af019..a417733b0 100755 --- a/scripts/k8s/verify_gpu.sh +++ b/scripts/k8s/verify_gpu.sh @@ -4,6 +4,11 @@ # Check the output and verify the number of nodes and GPUs is as expected # TODO: This script should be wrapped by Ansible to verify that the output of nvidia-smi on each node matches K8S +# Source common libraries and env variables +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/../.." +source ${ROOT_DIR}/scripts/common.sh + export KFCTL=${KFCTL:-~/kfctl} export CLUSTER_VERIFY_NS=${CLUSTER_VERIFY_NS:-cluster-gpu-verify} export CLUSTER_VERIFY_EXPECTED_PODS=${CLUSTER_VERIFY_EXPECTED_PODS:-} diff --git a/scripts/nginx-docker-cache/gen-ca.sh b/scripts/nginx-docker-cache/gen-ca.sh index da2ad9e7d..3da38f783 100755 --- a/scripts/nginx-docker-cache/gen-ca.sh +++ b/scripts/nginx-docker-cache/gen-ca.sh @@ -1,5 +1,10 @@ #!/usr/bin/env bash +# Source common libraries and env variables +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/../.." 
+source ${ROOT_DIR}/scripts/common.sh + CA_CRT_OUTFILE="${CA_CRT_OUTFILE:-/tmp/ca.crt}" CA_KEY_OUTFILE="${CA_KEY_OUTFILE:-/tmp/ca.key}" From 371188e9e9a80d8c1e1e96097555994a4d6ca79b Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Fri, 28 May 2021 00:34:57 +0000 Subject: [PATCH 02/20] Document new env.sh config file --- docs/deepops/configuration.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/deepops/configuration.md b/docs/deepops/configuration.md index 69b0069f6..3a0fbc0d6 100644 --- a/docs/deepops/configuration.md +++ b/docs/deepops/configuration.md @@ -12,6 +12,7 @@ In particular, this directory includes: - `config/group_vars/all.yml`: An Ansible [variables file](https://docs.ansible.com/ansible/latest/user_guide/playbooks_variables.html) that contains variables we expect to work for all hosts - `config/group_vars/k8s-cluster.yml`: Variables specific to deploying Kubernetes clusters - `config/group_vars/slurm-cluster.yml`: Variables specific to deploying Slurm clusters +- `config/env.sh`: Global variables that override default variable values for all `sh` files in `scripts/*`. It's expected that most DeepOps deployments will make changes to these files! The inventory file will be different for every cluster; From 6129e424723b396ac46425ed69238f629597728c Mon Sep 17 00:00:00 2001 From: Adam DeConinck Date: Mon, 13 Dec 2021 20:24:25 +0000 Subject: [PATCH 03/20] Update slurm es logging playbook for log4shell Summary ------- - Update Ansible Galaxy requirements to use a different set of roles (due to the old ones not working) - Update logging.yml playbook to make use of the new Galaxy roles - Add mitigations for CVE-2021-44228 as documented in https://discuss.elastic.co/t/apache-log4j2-remote-code-execution-rce-vulnerability-cve-2021-44228-esa-2021-31/291476 Note that mitigations are applied for ElasticSearch and Logstash. Kibana and Filebeat are confirmed to not be impacted. 
Test plan --------- Successful execution of the logging.yml playbook --- .../files/cve_2021_44228.options | 1 + playbooks/slurm-cluster/logging.yml | 54 ++++++++++++++++--- .../slurm-cluster/templates/filebeat.conf | 12 +++++ roles/requirements.yml | 21 ++++---- 4 files changed, 71 insertions(+), 17 deletions(-) create mode 100644 playbooks/slurm-cluster/files/cve_2021_44228.options create mode 100644 playbooks/slurm-cluster/templates/filebeat.conf diff --git a/playbooks/slurm-cluster/files/cve_2021_44228.options b/playbooks/slurm-cluster/files/cve_2021_44228.options new file mode 100644 index 000000000..5af9281fc --- /dev/null +++ b/playbooks/slurm-cluster/files/cve_2021_44228.options @@ -0,0 +1 @@ +-Dlog4j2.formatMsgNoLookups=true diff --git a/playbooks/slurm-cluster/logging.yml b/playbooks/slurm-cluster/logging.yml index 6a6f7609f..fb7aede41 100644 --- a/playbooks/slurm-cluster/logging.yml +++ b/playbooks/slurm-cluster/logging.yml @@ -3,21 +3,59 @@ become: true vars: elasticsearch_network_host: 0.0.0.0 - logstash_listen_port_beats: 5000 roles: - - geerlingguy.java - - geerlingguy.elasticsearch - - geerlingguy.logstash - - geerlingguy.kibana + - robertdebock.java + - robertdebock.elastic_repo + - robertdebock.elasticsearch + - robertdebock.logstash + - robertdebock.kibana + +- hosts: slurm-master[0] + become: true + vars: + filebeat_port: "5000" + tasks: + - name: configure logstash to accept logs from filebeat + template: + src: "filebeat.conf" + dest: "/etc/logstash/conf.d/filebeat.conf" + owner: "root" + group: "root" + mode: "0644" + +# Mitigation for CVE-2021-44228 impacting Log4j2 +# https://discuss.elastic.co/t/apache-log4j2-remote-code-execution-rce-vulnerability-cve-2021-44228-esa-2021-31/291476 +- hosts: slurm-master[0] + become: yes tasks: - - name: fix bug in logstash role - command: /usr/share/logstash/bin/logstash-plugin install logstash-filter-multiline + - name: configure elasticsearch to mitigate CVE-2021-44228 + copy: + src: 
"cve_2021_44228.options" + dest: "/etc/elasticsearch/jvm.options.d/cve_2021_44228.options" + owner: "root" + group: "root" + mode: "0644" + notify: + - restart-elasticsearch + - name: configure logstash to mitigate CVE-2021-44228 + shell: zip -q -d /usr/share/logstash/logstash-core/lib/jars/log4j-core-2.* org/apache/logging/log4j/core/lookup/JndiLookup.class + notify: + - restart-logstash + handlers: + - name: restart-elasticsearch + service: + name: elasticsearch + state: restarted + - name: restart-logstash + service: + name: logstash + state: restarted - hosts: slurm-cluster become: true vars: filebeat_create_config: true - filebeat_prospectors: + filebeat_inputs: - input_type: log paths: - "/var/log/*.log" diff --git a/playbooks/slurm-cluster/templates/filebeat.conf b/playbooks/slurm-cluster/templates/filebeat.conf new file mode 100644 index 000000000..53860ed4f --- /dev/null +++ b/playbooks/slurm-cluster/templates/filebeat.conf @@ -0,0 +1,12 @@ +input { + beats { + port => {{ filebeat_port }} + } +} + +output { + elasticsearch { + hosts => ["http://localhost:9200"] + index => "%{[@metadata][beat]}-%{[@metadata][version]}" + } +} diff --git a/roles/requirements.yml b/roles/requirements.yml index da9b0c789..798d7f8f7 100644 --- a/roles/requirements.yml +++ b/roles/requirements.yml @@ -36,19 +36,22 @@ roles: version: "v0.5.0" - src: geerlingguy.filebeat - version: "2.0.1" + version: "3.3.0" -- src: geerlingguy.logstash - version: "4.0.0" +- src: robertdebock.java + version: "4.1.1" -- src: geerlingguy.elasticsearch - version: "3.0.1" +- src: robertdebock.elastic_repo + version: "1.0.3" -- src: geerlingguy.java - version: "1.9.5" +- src: robertdebock.logstash + version: "1.1.1" -- src: geerlingguy.kibana - version: "3.2.1" +- src: robertdebock.elasticsearch + version: "1.1.3" + +- src: robertdebock.kibana + version: "1.2.4" - src: https://github.com/DeepOps/ansible-maas.git name: ansible-maas From 774ffe0ea7ee27a45fe7323f921dba89dacf106d Mon Sep 17 00:00:00 2001 
From: Douglas Holt Date: Thu, 6 Jan 2022 16:32:12 -0700 Subject: [PATCH 04/20] update Kubespray to v2.18.0 --- submodules/kubespray | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/kubespray b/submodules/kubespray index bcf695913..92f25bf26 160000 --- a/submodules/kubespray +++ b/submodules/kubespray @@ -1 +1 @@ -Subproject commit bcf695913f5332c0acf08b206cc055c9482664d9 +Subproject commit 92f25bf267ffd3393f6caffa588169d3a44a799c From 7c78b92adbb4a1266e94a1515651ab7663aa908c Mon Sep 17 00:00:00 2001 From: Adam DeConinck Date: Fri, 7 Jan 2022 21:26:29 +0000 Subject: [PATCH 05/20] ensure apt cache is updated when running playbook --- playbooks/slurm-cluster/logging.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/playbooks/slurm-cluster/logging.yml b/playbooks/slurm-cluster/logging.yml index fb7aede41..2e3125206 100644 --- a/playbooks/slurm-cluster/logging.yml +++ b/playbooks/slurm-cluster/logging.yml @@ -3,6 +3,11 @@ become: true vars: elasticsearch_network_host: 0.0.0.0 + pre_tasks: + - name: debian - ensure apt cache updated + apt: + update_cache: true + when: ansible_os_family == "Debian" roles: - robertdebock.java - robertdebock.elastic_repo From 8452b02f67c5b7f6cf05ab139c32eaed05d985f4 Mon Sep 17 00:00:00 2001 From: Adam DeConinck Date: Fri, 7 Jan 2022 21:26:50 +0000 Subject: [PATCH 06/20] only run mitigation when offending class is present --- playbooks/slurm-cluster/logging.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/playbooks/slurm-cluster/logging.yml b/playbooks/slurm-cluster/logging.yml index 2e3125206..e9a229cb2 100644 --- a/playbooks/slurm-cluster/logging.yml +++ b/playbooks/slurm-cluster/logging.yml @@ -42,10 +42,16 @@ mode: "0644" notify: - restart-elasticsearch + - name: check for relevant class in logstash + shell: unzip -l /usr/share/logstash/logstash-core/lib/jars/log4j-core-2.* | grep JndiLookup.class + register: logstash_jndi + changed_when: logstash_jndi.rc == 0 + failed_when: 
logstash_jndi.rc == 2 - name: configure logstash to mitigate CVE-2021-44228 shell: zip -q -d /usr/share/logstash/logstash-core/lib/jars/log4j-core-2.* org/apache/logging/log4j/core/lookup/JndiLookup.class notify: - restart-logstash + when: logstash_jndi.changed handlers: - name: restart-elasticsearch service: From 8540c4f9356fddb8ae83b4d9a9eed09687c8bfea Mon Sep 17 00:00:00 2001 From: Adam DeConinck Date: Tue, 11 Jan 2022 21:50:37 +0000 Subject: [PATCH 07/20] add manual stop of logstash during initial run --- playbooks/slurm-cluster/logging.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/playbooks/slurm-cluster/logging.yml b/playbooks/slurm-cluster/logging.yml index e9a229cb2..bb56a77c4 100644 --- a/playbooks/slurm-cluster/logging.yml +++ b/playbooks/slurm-cluster/logging.yml @@ -52,6 +52,13 @@ notify: - restart-logstash when: logstash_jndi.changed + - name: manually stop logstash as restart is not consistently working later + service: + name: logstash + state: stopped + notify: + - restart-logstash + when: logstash_jndi.changed handlers: - name: restart-elasticsearch service: From cd425ffe0c62ad194f99c5e21395a4ba1d7614f6 Mon Sep 17 00:00:00 2001 From: Atsushi Nukariya Date: Tue, 11 Jan 2022 23:56:29 +0000 Subject: [PATCH 08/20] Add proxy config to standalone container registry Signed-off-by: Atsushi Nukariya --- roles/standalone-container-registry/defaults/main.yml | 3 +++ roles/standalone-container-registry/tasks/main.yml | 3 +++ 2 files changed, 6 insertions(+) diff --git a/roles/standalone-container-registry/defaults/main.yml b/roles/standalone-container-registry/defaults/main.yml index 77a56b62d..9aaf7c4da 100644 --- a/roles/standalone-container-registry/defaults/main.yml +++ b/roles/standalone-container-registry/defaults/main.yml @@ -19,3 +19,6 @@ standalone_container_registry_cache_enable: false standalone_container_registry_cache_upstream: "https://registry-1.docker.io" # standalone_container_registry_cache_username: # 
standalone_container_registry_cache_password: + +# standalone_container_registry_http_proxy: +# standalone_container_registry_https_proxy: diff --git a/roles/standalone-container-registry/tasks/main.yml b/roles/standalone-container-registry/tasks/main.yml index 1a76370ac..5a0cb0843 100644 --- a/roles/standalone-container-registry/tasks/main.yml +++ b/roles/standalone-container-registry/tasks/main.yml @@ -53,5 +53,8 @@ network_mode: host restart: yes restart_policy: unless-stopped + env: + http_proxy: "{{ standalone_container_registry_http_proxy }}" + https_proxy: "{{ standalone_container_registry_https_proxy }}" volumes: - "{{ standalone_container_registry_config_dir }}/config.yml:/etc/docker/registry/config.yml" From eac5d3cf033b36b047a8a7d5370362dbb1d001c6 Mon Sep 17 00:00:00 2001 From: Atsushi Nukariya Date: Wed, 12 Jan 2022 00:00:55 +0000 Subject: [PATCH 09/20] Stop systemd-resolved on Ubuntu 20.04 Signed-off-by: Atsushi Nukariya --- roles/dns-config/tasks/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/roles/dns-config/tasks/main.yml b/roles/dns-config/tasks/main.yml index 639e3f3cc..58b7b7dfd 100644 --- a/roles/dns-config/tasks/main.yml +++ b/roles/dns-config/tasks/main.yml @@ -16,12 +16,12 @@ - systemd-resolved when: ansible_distribution == 'Ubuntu' and ansible_distribution_major_version == '16' -- name: disable services (bionic) +- name: disable services (bionic, focal) service: name: systemd-resolved state: stopped enabled: no - when: ansible_distribution == 'Ubuntu' and ansible_distribution_major_version == '18' + when: ansible_distribution == 'Ubuntu' and (ansible_distribution_major_version in ['18', '20']) - name: install /etc/resolv.conf template: From 87116ea86ebe3a0388479e5e4f46c07035f2695a Mon Sep 17 00:00:00 2001 From: Adam DeConinck Date: Thu, 9 Dec 2021 21:58:07 +0000 Subject: [PATCH 10/20] Add Molecule testing for Singularity, plus infra for more roles Summary ------- - Fixes the ability to install Singularity on 
EL8 by enabling the necessary repos - Move the pre-execution logic for the Singularity Galaxy role into a new DeepOps role, singularity_wrapper - Modify the playbook to use singularity_wrapper - Add Molecule tests to singularity_wrapper role - Add a Github Action configuration to execute this and other Molecule tests of DeepOps roles - Add documentation to `docs/deepops/testing.md` to describe these tests and provide instructions on adding more Test plan --------- Clean execution of the Github action for this role Future work ----------- This change adds a lot of infrastructure that should make it easier to add testing to additional roles. Future changes should enable testing of more of the roles in DeepOps, but I didn't want to get *too* carried away in this one. :wink: --- .github/workflows/molecule.yml | 33 ++++++++ docs/deepops/testing.md | 76 ++++++++++++++++++- playbooks/container/singularity.yml | 7 +- roles/requirements.yml | 6 +- roles/singularity_wrapper/.yamllint | 33 ++++++++ roles/singularity_wrapper/defaults/main.yml | 10 +++ roles/singularity_wrapper/meta/main.yml | 9 +++ .../molecule/default/converge.yml | 7 ++ .../molecule/default/molecule.yml | 26 +++++++ .../molecule/default/verify.yml | 10 +++ roles/singularity_wrapper/tasks/main.yml | 35 +++++++++ 11 files changed, 241 insertions(+), 11 deletions(-) create mode 100644 .github/workflows/molecule.yml create mode 100644 roles/singularity_wrapper/.yamllint create mode 100644 roles/singularity_wrapper/defaults/main.yml create mode 100644 roles/singularity_wrapper/meta/main.yml create mode 100644 roles/singularity_wrapper/molecule/default/converge.yml create mode 100644 roles/singularity_wrapper/molecule/default/molecule.yml create mode 100644 roles/singularity_wrapper/molecule/default/verify.yml create mode 100644 roles/singularity_wrapper/tasks/main.yml diff --git a/.github/workflows/molecule.yml b/.github/workflows/molecule.yml new file mode 100644 index 000000000..f4165f104 --- /dev/null +++ 
b/.github/workflows/molecule.yml @@ -0,0 +1,33 @@ +--- +name: test ansible roles with molecule +on: + - push + - pull_request +jobs: + build: + runs-on: ubuntu-20.04 + strategy: + max-parallel: 4 + matrix: + deepops-role: + - singularity_wrapper + steps: + - name: check out repo + uses: actions/checkout@v2 + with: + path: "${{ github.repository }}" + - name: set up python + uses: actions/setup-python@v2 + with: + python-version: "3.9" + - name: install dependencies + run: | + python3 -m pip install --upgrade pip + python3 -m pip install molecule[docker] docker ansible + - name: run molecule test + run: | + cd "${{ github.repository }}/roles" + ansible-galaxy role install --force -r ./requirements.yml + ansible-galaxy collection install --force -r ./requirements.yml + cd "${{ matrix.deepops-role }}" + molecule test diff --git a/docs/deepops/testing.md b/docs/deepops/testing.md index c09f26bd7..26c2486db 100644 --- a/docs/deepops/testing.md +++ b/docs/deepops/testing.md @@ -1,12 +1,13 @@ # DeepOps Testing, CI/CD, and Validation -## DeepOps Continuous Integration Testing + +## DeepOps end-to-end testing The DeepOps project leverages a private Jenkins server to run continuous integration tests. Testing is done using the [virtual](../../virtual) deployment mechanism. Several Vagrant VMs are created, the cluster is deployed, tests are executed, and then the VMs are destroyed. The goal of the DeepOps CI is to prevent bugs from being introduced into the code base and to identify when changes in 3rd party platforms have occurred or impacted the DeepOps deployment mechanisms. In general, K8s and Slurm deployment issues are detected and resolved with urgency. Many components of DeepOps are 3rd party open source tools that may silently fail or suddenly change without notice. The team will make a best-effort to resolve these issues and include regression tests, however there may be times where a fix is unavailable. 
Historically, this has been an issue with Rook-Ceph and Kubeflow, and those GitHub communities are best equipped to help with resolutions. -### Testing Methodi +### Testing Method DeepOps CI contains two types of automated tests: @@ -63,6 +64,77 @@ A short description of the nightly testing is outlined below. The full suit of t | MIG configuration | | | | No testing support +## DeepOps Ansible role testing + +A subset of the Ansible roles in DeepOps have tests defined using [Ansible Molecule](https://molecule.readthedocs.io/en/latest/). +This testing mechanism allows the roles to be tested individually, providing additional test signal to identify issues which do not appear in the end-to-end tests. +These tests are run automatically for each pull request using [Github Actions](https://github.com/NVIDIA/deepops/actions). + +Molecule testing runs the Ansible role in question inside a Docker container. +As such, not all roles will be easy to test with this mechanism. +Roles which mostly involve installing software, configuring services, or executing scripts should generally be possible to test. +Roles which rely on the presence of specific hardware (such as GPUs), which reboot the nodes they act on, or which make changes to kernel configuration are going to be harder to test with Molecule. + +### Defining Molecule tests for a new role + +To add Molecule tests to a new role, the following procedure can be used. + +1. Ensure you have Docker installed in your development environment + +2. Install Ansible Molecule in your development environment + +``` +$ python3 -m pip install "molecule[docker,lint]" +``` + +3. Initialize Molecule in your new role + +``` +$ cd deepops/roles/ +$ molecule init scenario -r --driver docker +``` + +4. In the file `molecule/default/molecule.yml`, define the list of platforms to be tested. +DeepOps currently supports operating systems based on Ubuntu 18.04, Ubuntu 20.04, EL7, and EL8. 
+To test these stacks, the following `platforms` stanza can be used. + +``` +platforms: + - name: ubuntu-1804 + image: geerlingguy/docker-ubuntu1804-ansible + pre_build_image: true + - name: ubuntu-2004 + image: geerlingguy/docker-ubuntu2004-ansible + pre_build_image: true + - name: centos-7 + image: geerlingguy/docker-centos7-ansible + pre_build_image: true + - name: centos-8 + image: geerlingguy/docker-centos8-ansible + pre_build_image: true +``` + +5. If you haven't already, define your role's metadata in the file `meta/main.yml`. +A sample `meta/main.yml` is shown here: + +``` +galaxy_info: + role_name: + namespace: deepops + author: DeepOps Team + company: NVIDIA + description: + license: 3-Clause BSD + min_ansible_version: 2.9 +``` + +6. Once this is done, verify that your role executes successfully in the Molecule environment by running `molecule test`. If you run into any issues, consult the [Molecule documentation](https://molecule.readthedocs.io/en/latest/index.html) for help resolving them. + +7. (optional) In addition to testing successful execution, you can add additional tests which will be run after your role completes in a file `molecule/default/verify.yml`. This is an Ansible playbook that will run in the same environment as your playbook ran. For a simple example of such a verify playbook, see the [Enroot role](https://github.com/NVIDIA/ansible-role-enroot/blob/master/molecule/default/verify.yml). + +8. Once you're confident that your new tests are all passing, add your role to the `deepops-role` section in the `.github/workflows/molecule.yml` file. + + ## DeepOps Deployment Validation The Slurm and Kubernetes deployment guides both document cluster verification steps. These should be run during the installation process to validate a GPU workload can be executed on the cluster. 
diff --git a/playbooks/container/singularity.yml b/playbooks/container/singularity.yml index 16ca5b9ab..74208898f 100644 --- a/playbooks/container/singularity.yml +++ b/playbooks/container/singularity.yml @@ -1,10 +1,5 @@ --- - hosts: all become: yes - pre_tasks: - - name: create a folder for go - file: - path: "{{ golang_install_dir }}" - recurse: yes roles: - - lecorguille.singularity + - singularity_wrapper diff --git a/roles/requirements.yml b/roles/requirements.yml index da9b0c789..44ba03d8b 100644 --- a/roles/requirements.yml +++ b/roles/requirements.yml @@ -61,8 +61,8 @@ roles: - src: https://github.com/OSC/ood-ansible.git version: 'v2.0.3' +- src: abims_sbr.singularity + version: 3.7.1-1 + - src: gantsign.golang version: 2.4.0 - -- src: lecorguille.singularity - version: 1.2.0 diff --git a/roles/singularity_wrapper/.yamllint b/roles/singularity_wrapper/.yamllint new file mode 100644 index 000000000..882767605 --- /dev/null +++ b/roles/singularity_wrapper/.yamllint @@ -0,0 +1,33 @@ +--- +# Based on ansible-lint config +extends: default + +rules: + braces: + max-spaces-inside: 1 + level: error + brackets: + max-spaces-inside: 1 + level: error + colons: + max-spaces-after: -1 + level: error + commas: + max-spaces-after: -1 + level: error + comments: disable + comments-indentation: disable + document-start: disable + empty-lines: + max: 3 + level: error + hyphens: + level: error + indentation: disable + key-duplicates: enable + line-length: disable + new-line-at-end-of-file: disable + new-lines: + type: unix + trailing-spaces: disable + truthy: disable diff --git a/roles/singularity_wrapper/defaults/main.yml b/roles/singularity_wrapper/defaults/main.yml new file mode 100644 index 000000000..5be75a6a4 --- /dev/null +++ b/roles/singularity_wrapper/defaults/main.yml @@ -0,0 +1,10 @@ +--- +# vars for lecorguille.singularity +singularity_version: "3.7.3" +singularity_conf_path: "/etc/singularity/singularity.conf" +bind_paths: [] + +# vars for gantsign.golang 
+golang_version: "1.14.4" +golang_install_dir: "/opt/go/{{ golang_version }}" +golang_gopath: "/opt/go/packages" diff --git a/roles/singularity_wrapper/meta/main.yml b/roles/singularity_wrapper/meta/main.yml new file mode 100644 index 000000000..9fbd94944 --- /dev/null +++ b/roles/singularity_wrapper/meta/main.yml @@ -0,0 +1,9 @@ +--- +galaxy_info: + role_name: singularity_wrapper + namespace: deepops + author: DeepOps Team + company: NVIDIA + description: Wrap abims_sbr.singularity role + license: 3-Clause BSD + min_ansible_version: 2.9 diff --git a/roles/singularity_wrapper/molecule/default/converge.yml b/roles/singularity_wrapper/molecule/default/converge.yml new file mode 100644 index 000000000..c0295f669 --- /dev/null +++ b/roles/singularity_wrapper/molecule/default/converge.yml @@ -0,0 +1,7 @@ +--- +- name: Converge + hosts: all + tasks: + - name: "Include singularity_wrapper" + include_role: + name: "singularity_wrapper" diff --git a/roles/singularity_wrapper/molecule/default/molecule.yml b/roles/singularity_wrapper/molecule/default/molecule.yml new file mode 100644 index 000000000..2962ff2e7 --- /dev/null +++ b/roles/singularity_wrapper/molecule/default/molecule.yml @@ -0,0 +1,26 @@ +--- +dependency: + name: galaxy + options: + requirements-file: requirements.yml +driver: + name: docker +platforms: + - name: ubuntu-1804 + image: geerlingguy/docker-ubuntu1804-ansible + pre_build_image: true + - name: ubuntu-2004 + image: geerlingguy/docker-ubuntu2004-ansible + pre_build_image: true + - name: centos-7 + image: geerlingguy/docker-centos7-ansible + pre_build_image: true + - name: centos-8 + image: geerlingguy/docker-centos8-ansible + pre_build_image: true +provisioner: + name: ansible + ansible_args: + - -vv +verifier: + name: ansible diff --git a/roles/singularity_wrapper/molecule/default/verify.yml b/roles/singularity_wrapper/molecule/default/verify.yml new file mode 100644 index 000000000..79044cd06 --- /dev/null +++ 
b/roles/singularity_wrapper/molecule/default/verify.yml @@ -0,0 +1,10 @@ +--- +# This is an example playbook to execute Ansible tests. + +- name: Verify + hosts: all + gather_facts: false + tasks: + - name: Example assertion + assert: + that: true diff --git a/roles/singularity_wrapper/tasks/main.yml b/roles/singularity_wrapper/tasks/main.yml new file mode 100644 index 000000000..a0675c73e --- /dev/null +++ b/roles/singularity_wrapper/tasks/main.yml @@ -0,0 +1,35 @@ +--- +- name: centos 8 - ensure powertools installed + block: + - name: ensure prereq packages installed + yum: + name: "dnf-plugins-core" + state: "present" + - name: enable powertools + command: "yum config-manager --set-enabled powertools" + register: enable_powertools + changed_when: enable_powertools.rc != 0 + when: (ansible_distribution == "CentOS") and (ansible_distribution_major_version == "8") + +- name: rhel 8 - ensure CRB repository is enabled + rhsm_repository: + name: "codeready-builder-for-rhel-8-x86_64-rpms" + when: (ansible_distribution == "Red Hat Enterprise Linux") and (ansible_distribution_major_version == "8") + +- name: debian - ensure apt cache is up to date + apt: + update_cache: yes + when: ansible_os_family == "Debian" + +- name: create a folder for go + file: + path: "{{ golang_install_dir }}" + recurse: yes + +- name: install golang explicitly + include_role: + name: gantsign.golang + +- name: install singularity + include_role: + name: abims_sbr.singularity From 7d80b583ed241e003c03287fee3911d374d88b42 Mon Sep 17 00:00:00 2001 From: Adam DeConinck Date: Fri, 7 Jan 2022 15:34:49 +0000 Subject: [PATCH 11/20] Fix setup.sh to deal with includes in requirements Annoyingly, includes in requirements.yml seem to be relative to the cwd of ansible-galaxy, rather than relative to the requirements file. https://github.com/ansible/ansible/issues/46385 Made changes to setup.sh in order to ensure we can install requirements even if we use includes. 
--- scripts/setup.sh | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/scripts/setup.sh b/scripts/setup.sh index e711c4f74..b1f8fbcbf 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -7,10 +7,14 @@ # Can be run standalone with: curl -sL git.io/deepops | bash # or: curl -sL git.io/deepops | bash -s -- 19.07 +# Determine current directory and root directory +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/.." + # Configuration ANSIBLE_VERSION="${ANSIBLE_VERSION:-2.9.21}" # Ansible version to install ANSIBLE_TOO_NEW="${ANSIBLE_TOO_NEW:-2.10.0}" # Ansible version too new -CONFIG_DIR="${CONFIG_DIR:-./config}" # Default configuration directory location +CONFIG_DIR="${CONFIG_DIR:-${ROOT_DIR}/config}" # Default configuration directory location DEEPOPS_TAG="${1:-master}" # DeepOps branch to set up JINJA2_VERSION="${JINJA2_VERSION:-2.11.1}" # Jinja2 required version PIP="${PIP:-pip3}" # Pip binary to use @@ -21,10 +25,6 @@ VENV_DIR="${VENV_DIR:-/opt/deepops/env}" # Path to python virtual environ # Set distro-specific variables . /etc/os-release - -SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -ROOT_DIR="${SCRIPT_DIR}/.." - DEPS_DEB=(git virtualenv python3-virtualenv sshpass wget) DEPS_EL7=(git libselinux-python3 python-virtualenv python3-virtualenv sshpass wget) DEPS_EL8=(git python3-libselinux python3-virtualenv sshpass wget) @@ -146,10 +146,25 @@ fi # Install Ansible Galaxy roles if command -v ansible-galaxy &> /dev/null ; then echo "Updating Ansible Galaxy roles..." 
- as_user ansible-galaxy collection install --force -r "${ROOT_DIR}/roles/requirements.yml" >/dev/null - as_user ansible-galaxy role install --force -r "${ROOT_DIR}/roles/requirements.yml" >/dev/null - as_user ansible-galaxy collection install --force -i -r "${ROOT_DIR}/config/requirements.yml" >/dev/null - as_user ansible-galaxy role install --force -i -r "${ROOT_DIR}/config/requirements.yml" >/dev/null + initial_dir="$(pwd)" + roles_path="${ROOT_DIR}/roles/galaxy" + collections_path="${ROOT_DIR}/collections" + + # First, install requirements from role requirements. + # Note: due to a known issue in ansible-galaxy, this works best when the + # cwd is the same as the directory where the file is located. + # https://github.com/ansible/ansible/issues/46385 + cd "${ROOT_DIR}/roles" + as_user ansible-galaxy collection install -p "${collections_path}" --force -r "requirements.yml" >/dev/null + as_user ansible-galaxy role install -p "${roles_path}" --force -r "requirements.yml" >/dev/null + + # Install any user-defined config requirements + if [ -d "${CONFIG_DIR}" ]; then + cd "${CONFIG_DIR}" + as_user ansible-galaxy collection install -p "${collections_path}" --force -i -r "requirements.yml" >/dev/null + as_user ansible-galaxy role install -p "${roles_path}" --force -i -r "requirements.yml" >/dev/null + fi + cd "${initial_dir}" else echo "ERROR: Unable to install Ansible Galaxy roles, 'ansible-galaxy' command not found" fi From 123ae2922468a4c1c9e353ecbeaa604b7cd50344 Mon Sep 17 00:00:00 2001 From: Adam DeConinck Date: Fri, 7 Jan 2022 18:04:30 +0000 Subject: [PATCH 12/20] Add basic verification test Just check that we actually install Singularity --- .../molecule/default/verify.yml | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/roles/singularity_wrapper/molecule/default/verify.yml b/roles/singularity_wrapper/molecule/default/verify.yml index 79044cd06..b5afb1c0d 100644 --- a/roles/singularity_wrapper/molecule/default/verify.yml +++ 
b/roles/singularity_wrapper/molecule/default/verify.yml @@ -1,10 +1,13 @@ --- -# This is an example playbook to execute Ansible tests. - -- name: Verify +- name: verify hosts: all - gather_facts: false tasks: - - name: Example assertion + - name: check for path to singularity + command: which singularity + register: which_singularity + changed_when: which_singularity.rc != 0 + + - name: verify path to singularity assert: - that: true + that: + - "'/usr/local/bin/singularity' in which_singularity.stdout" From d9e964c2c38dab4149c1e3f0561326cebbaf0165 Mon Sep 17 00:00:00 2001 From: Douglas Holt Date: Fri, 14 Jan 2022 12:21:41 -0800 Subject: [PATCH 13/20] Update software versions --- config.example/group_vars/all.yml | 2 +- config.example/group_vars/slurm-cluster.yml | 8 ++++---- roles/nvidia-cuda/defaults/main.yml | 2 +- roles/nvidia-hpc-sdk/defaults/main.yml | 8 ++++---- roles/slurm/defaults/main.yml | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/config.example/group_vars/all.yml b/config.example/group_vars/all.yml index b5ba175da..5d92a730d 100644 --- a/config.example/group_vars/all.yml +++ b/config.example/group_vars/all.yml @@ -122,7 +122,7 @@ sftp_chroot: false ################################################################################ # NVIDIA GPU configuration # Playbook: nvidia-cuda -cuda_version: cuda-toolkit-11-4 +cuda_version: cuda-toolkit-11-5 # DGX-specific vars may be used to target specific models, # because available versions for DGX may differ from the generic repo diff --git a/config.example/group_vars/slurm-cluster.yml b/config.example/group_vars/slurm-cluster.yml index 04f69c815..7f380e74d 100644 --- a/config.example/group_vars/slurm-cluster.yml +++ b/config.example/group_vars/slurm-cluster.yml @@ -3,7 +3,7 @@ ################################################################################ # Slurm job scheduler configuration # Playbook: slurm, slurm-cluster, slurm-perf, slurm-perf-cluster, slurm-validation 
-slurm_version: 21.08.1 +slurm_version: 21.08.5 slurm_install_prefix: /usr/local pmix_install_prefix: /opt/deepops/pmix hwloc_install_prefix: /opt/deepops/hwloc @@ -117,9 +117,9 @@ sm_install_host: "slurm-master[0]" slurm_install_hpcsdk: true # Select the version of HPC SDK to download -hpcsdk_major_version: "21" -hpcsdk_minor_version: "9" -hpcsdk_file_cuda: "11.4" +hpcsdk_major_version: "22" +hpcsdk_minor_version: "1" +hpcsdk_file_cuda: "11.5" hpcsdk_arch: "x86_64" # In a Slurm cluster, default to setting up HPC SDK as modules rather than in diff --git a/roles/nvidia-cuda/defaults/main.yml b/roles/nvidia-cuda/defaults/main.yml index 84cbe2b78..628eb5675 100644 --- a/roles/nvidia-cuda/defaults/main.yml +++ b/roles/nvidia-cuda/defaults/main.yml @@ -1,6 +1,6 @@ --- # 'cuda' is the generic package and will pull the latest version -cuda_version: "cuda-toolkit-11-3" +cuda_version: "cuda-toolkit-11-5" # DGX-specific vars may be used to target specific models, # because available versions for DGX may differ from the generic repo diff --git a/roles/nvidia-hpc-sdk/defaults/main.yml b/roles/nvidia-hpc-sdk/defaults/main.yml index 98f27b794..b54736936 100644 --- a/roles/nvidia-hpc-sdk/defaults/main.yml +++ b/roles/nvidia-hpc-sdk/defaults/main.yml @@ -15,15 +15,15 @@ # See https://developer.nvidia.com/nvidia-hpc-sdk-downloads for more detail on available downloads. # Version strings used to construct download URL -hpcsdk_major_version: "21" -hpcsdk_minor_version: "9" -hpcsdk_file_cuda: "11.4" +hpcsdk_major_version: "22" +hpcsdk_minor_version: "1" +hpcsdk_file_cuda: "11.5" hpcsdk_arch: "x86_64" # We need to specify the default CUDA toolkit to use during installation. # This should usually be the latest CUDA included in the HPC SDK you are # installing. -hpcsdk_default_cuda: "11.4" +hpcsdk_default_cuda: "11.5" # Add HPC SDK modules to the MODULEPATH? 
hpcsdk_install_as_modules: false diff --git a/roles/slurm/defaults/main.yml b/roles/slurm/defaults/main.yml index 49efe6330..79dc46b53 100644 --- a/roles/slurm/defaults/main.yml +++ b/roles/slurm/defaults/main.yml @@ -6,7 +6,7 @@ hwloc_build_dir: /opt/deepops/build/hwloc pmix_build_dir: /opt/deepops/build/pmix slurm_workflow_build: yes -slurm_version: 21.08.0 +slurm_version: 21.08.5 slurm_src_url: "https://download.schedmd.com/slurm/slurm-{{ slurm_version }}.tar.bz2" slurm_build_make_clean: no slurm_build_dir_cleanup: no From 407f5bc81da50cb4ba27b1ad8e2f385acc8a746b Mon Sep 17 00:00:00 2001 From: Douglas Holt Date: Fri, 14 Jan 2022 12:25:43 -0800 Subject: [PATCH 14/20] Update latest release tag --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 65abe99dd..ed3e0944f 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Check out the [video tutorial](https://drive.google.com/file/d/1RNLQYlgJqE8JMv0n ## Releases -Latest release: [DeepOps 21.09 Release](https://github.com/NVIDIA/deepops/releases/tag/21.09) +Latest release: [DeepOps 22.01 Release](https://github.com/NVIDIA/deepops/releases/tag/22.01) It is recommended to use the latest release branch for stable code (linked above). All development takes place on the master branch, which is generally [functional](docs/deepops/testing.md) but may change significantly between releases. 
From d93d48951f77ef1a2bc86e35e8dad7767532241d Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Fri, 14 Jan 2022 12:34:48 -0800 Subject: [PATCH 15/20] clarify use of env.sh --- config.example/env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.example/env.sh b/config.example/env.sh index c33799baf..288e585db 100644 --- a/config.example/env.sh +++ b/config.example/env.sh @@ -3,4 +3,4 @@ # Global variables (in all caps) that are defined here will be automatically sourced and used in all scripts # See deepops/scripts/common.sh for implementation details -DEEPOPS_VERSION="development-branch" +DEEPOPS_EXAMPLE_VAR="" From e741e5b6493527083177820f8e2bb3369cb28720 Mon Sep 17 00:00:00 2001 From: Atsushi Nukariya Date: Sun, 16 Jan 2022 22:56:13 +0000 Subject: [PATCH 16/20] Address review comment Signed-off-by: Atsushi Nukariya --- roles/standalone-container-registry/defaults/main.yml | 3 --- roles/standalone-container-registry/tasks/main.yml | 4 +--- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/roles/standalone-container-registry/defaults/main.yml b/roles/standalone-container-registry/defaults/main.yml index 9aaf7c4da..77a56b62d 100644 --- a/roles/standalone-container-registry/defaults/main.yml +++ b/roles/standalone-container-registry/defaults/main.yml @@ -19,6 +19,3 @@ standalone_container_registry_cache_enable: false standalone_container_registry_cache_upstream: "https://registry-1.docker.io" # standalone_container_registry_cache_username: # standalone_container_registry_cache_password: - -# standalone_container_registry_http_proxy: -# standalone_container_registry_https_proxy: diff --git a/roles/standalone-container-registry/tasks/main.yml b/roles/standalone-container-registry/tasks/main.yml index 5a0cb0843..e462581a9 100644 --- a/roles/standalone-container-registry/tasks/main.yml +++ b/roles/standalone-container-registry/tasks/main.yml @@ -53,8 +53,6 @@ network_mode: host restart: yes restart_policy: unless-stopped - env: - 
http_proxy: "{{ standalone_container_registry_http_proxy }}" - https_proxy: "{{ standalone_container_registry_https_proxy }}" + env: "{{ proxy_env if proxy_env is defined else {} }}" volumes: - "{{ standalone_container_registry_config_dir }}/config.yml:/etc/docker/registry/config.yml" From 52e6f401d57f75a5dc3a43c1b9906c6659d89b56 Mon Sep 17 00:00:00 2001 From: Douglas Holt Date: Tue, 18 Jan 2022 12:04:56 -0800 Subject: [PATCH 17/20] Bump default helm version and don't override kubespray helm version --- config.example/group_vars/k8s-cluster.yml | 3 --- scripts/k8s/install_helm.sh | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/config.example/group_vars/k8s-cluster.yml b/config.example/group_vars/k8s-cluster.yml index 1fe90c349..2560e1d2d 100644 --- a/config.example/group_vars/k8s-cluster.yml +++ b/config.example/group_vars/k8s-cluster.yml @@ -36,9 +36,6 @@ dashboard_image_repo: "kubernetesui/dashboard" dashboard_metrics_scrape_tagr: "v1.0.4" dashboard_metrics_scraper_repo: "kubernetesui/metrics-scraper" -# Override the Helm version installed by Kubespray -helm_version: "v3.5.4" - # Ensure hosts file generation only runs across k8s cluster hosts_add_ansible_managed_hosts_groups: ["k8s-cluster"] diff --git a/scripts/k8s/install_helm.sh b/scripts/k8s/install_helm.sh index 6906578ed..57a0ee04b 100755 --- a/scripts/k8s/install_helm.sh +++ b/scripts/k8s/install_helm.sh @@ -41,7 +41,7 @@ if [ "${HELM_MINIMUM_VERSION}" != "${helm_min_installed}" ]; then chmod +x /var/tmp/get_helm.sh #sed -i 's/sudo//g' /var/tmp/get_helm.sh mkdir -p ${HELM_INSTALL_DIR} - HELM_INSTALL_DIR=${HELM_INSTALL_DIR} DESIRED_VERSION=v3.5.4 /var/tmp/get_helm.sh # Should match: config/group_vars/k8s-cluster.yml:helm_version: + HELM_INSTALL_DIR=${HELM_INSTALL_DIR} DESIRED_VERSION=v3.7.1 /var/tmp/get_helm.sh # Should match Kubespray's helm_version (no longer overridden in config/group_vars/k8s-cluster.yml) fi # Display the helm version for better debug From 86725b0ef0ce495e0218e665b23e37ef57fa12a1 Mon Sep 17 00:00:00
2001 From: Adam DeConinck Date: Tue, 18 Jan 2022 20:17:29 +0000 Subject: [PATCH 18/20] we don't need to support includes anymore, so move back to repo root --- scripts/setup.sh | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/scripts/setup.sh b/scripts/setup.sh index b1f8fbcbf..ca7cd6b25 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -150,13 +150,9 @@ if command -v ansible-galaxy &> /dev/null ; then roles_path="${ROOT_DIR}/roles/galaxy" collections_path="${ROOT_DIR}/collections" - # First, install requirements from role requirements. - # Note: due to a known issue in ansible-galaxy, this works best when the - # cwd is the same as the directory where the file is located. - # https://github.com/ansible/ansible/issues/46385 - cd "${ROOT_DIR}/roles" - as_user ansible-galaxy collection install -p "${collections_path}" --force -r "requirements.yml" >/dev/null - as_user ansible-galaxy role install -p "${roles_path}" --force -r "requirements.yml" >/dev/null + cd "${ROOT_DIR}" + as_user ansible-galaxy collection install -p "${collections_path}" --force -r "roles/requirements.yml" >/dev/null + as_user ansible-galaxy role install -p "${roles_path}" --force -r "roles/requirements.yml" >/dev/null # Install any user-defined config requirements if [ -d "${CONFIG_DIR}" ]; then From e4991dda7c0885359608a2aff431c37ff84133e7 Mon Sep 17 00:00:00 2001 From: Adam DeConinck Date: Tue, 18 Jan 2022 20:17:49 +0000 Subject: [PATCH 19/20] only run against config requirements.yml if file exists --- scripts/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/setup.sh b/scripts/setup.sh index ca7cd6b25..d65e1792e 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -155,7 +155,7 @@ if command -v ansible-galaxy &> /dev/null ; then as_user ansible-galaxy role install -p "${roles_path}" --force -r "roles/requirements.yml" >/dev/null # Install any user-defined config requirements - if [ -d "${CONFIG_DIR}" ]; then + if [ -d
"${CONFIG_DIR}" ] && [ -f "${CONFIG_DIR}/requirements.yml" ] ; then cd "${CONFIG_DIR}" as_user ansible-galaxy collection install -p "${collections_path}" --force -i -r "requirements.yml" >/dev/null as_user ansible-galaxy role install -p "${roles_path}" --force -i -r "requirements.yml" >/dev/null From aaedef8f12177119d5fccf48b13227ffceedb120 Mon Sep 17 00:00:00 2001 From: Douglas Holt Date: Wed, 19 Jan 2022 11:32:57 -0800 Subject: [PATCH 20/20] Downgrade Kubespray to v2.17.1 due to containerd issue --- submodules/kubespray | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/kubespray b/submodules/kubespray index 92f25bf26..eeeca4a1d 160000 --- a/submodules/kubespray +++ b/submodules/kubespray @@ -1 +1 @@ -Subproject commit 92f25bf267ffd3393f6caffa588169d3a44a799c +Subproject commit eeeca4a1d0334efebcf732d08bffc7e10240fc9c