diff --git a/.github/workflows/molecule.yml b/.github/workflows/molecule.yml new file mode 100644 index 000000000..f4165f104 --- /dev/null +++ b/.github/workflows/molecule.yml @@ -0,0 +1,33 @@ +--- +name: test ansible roles with molecule +on: + - push + - pull_request +jobs: + build: + runs-on: ubuntu-20.04 + strategy: + max-parallel: 4 + matrix: + deepops-role: + - singularity_wrapper + steps: + - name: check out repo + uses: actions/checkout@v2 + with: + path: "${{ github.repository }}" + - name: set up python + uses: actions/setup-python@v2 + with: + python-version: "3.9" + - name: install dependencies + run: | + python3 -m pip install --upgrade pip + python3 -m pip install molecule[docker] docker ansible + - name: run molecule test + run: | + cd "${{ github.repository }}/roles" + ansible-galaxy role install --force -r ./requirements.yml + ansible-galaxy collection install --force -r ./requirements.yml + cd "${{ matrix.deepops-role }}" + molecule test diff --git a/README.md b/README.md index 65abe99dd..ed3e0944f 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Check out the [video tutorial](https://drive.google.com/file/d/1RNLQYlgJqE8JMv0n ## Releases -Latest release: [DeepOps 21.09 Release](https://github.com/NVIDIA/deepops/releases/tag/21.09) +Latest release: [DeepOps 22.01 Release](https://github.com/NVIDIA/deepops/releases/tag/22.01) It is recommended to use the latest release branch for stable code (linked above). All development takes place on the master branch, which is generally [functional](docs/deepops/testing.md) but may change significantly between releases. 
diff --git a/config.example/env.sh b/config.example/env.sh new file mode 100644 index 000000000..288e585db --- /dev/null +++ b/config.example/env.sh @@ -0,0 +1,6 @@ +# This file acts as a location to override the default configurations of deepops/scripts/* +# Many of the scripts in this directory define global variables and set reasonable defaults +# Global variables (in all caps) that are defined here will be automatically sourced and used in all scripts +# See deepops/scripts/common.sh for implementation details + +DEEPOPS_EXAMPLE_VAR="" diff --git a/config.example/group_vars/all.yml b/config.example/group_vars/all.yml index b5ba175da..5d92a730d 100644 --- a/config.example/group_vars/all.yml +++ b/config.example/group_vars/all.yml @@ -122,7 +122,7 @@ sftp_chroot: false ################################################################################ # NVIDIA GPU configuration # Playbook: nvidia-cuda -cuda_version: cuda-toolkit-11-4 +cuda_version: cuda-toolkit-11-5 # DGX-specific vars may be used to target specific models, # because available versions for DGX may differ from the generic repo diff --git a/config.example/group_vars/k8s-cluster.yml b/config.example/group_vars/k8s-cluster.yml index 1fe90c349..2560e1d2d 100644 --- a/config.example/group_vars/k8s-cluster.yml +++ b/config.example/group_vars/k8s-cluster.yml @@ -36,9 +36,6 @@ dashboard_image_repo: "kubernetesui/dashboard" dashboard_metrics_scrape_tagr: "v1.0.4" dashboard_metrics_scraper_repo: "kubernetesui/metrics-scraper" -# Override the Helm version installed by Kubespray -helm_version: "v3.5.4" - # Ensure hosts file generation only runs across k8s cluster hosts_add_ansible_managed_hosts_groups: ["k8s-cluster"] diff --git a/config.example/group_vars/slurm-cluster.yml b/config.example/group_vars/slurm-cluster.yml index 04f69c815..7f380e74d 100644 --- a/config.example/group_vars/slurm-cluster.yml +++ b/config.example/group_vars/slurm-cluster.yml @@ -3,7 +3,7 @@ 
################################################################################ # Slurm job scheduler configuration # Playbook: slurm, slurm-cluster, slurm-perf, slurm-perf-cluster, slurm-validation -slurm_version: 21.08.1 +slurm_version: 21.08.5 slurm_install_prefix: /usr/local pmix_install_prefix: /opt/deepops/pmix hwloc_install_prefix: /opt/deepops/hwloc @@ -117,9 +117,9 @@ sm_install_host: "slurm-master[0]" slurm_install_hpcsdk: true # Select the version of HPC SDK to download -hpcsdk_major_version: "21" -hpcsdk_minor_version: "9" -hpcsdk_file_cuda: "11.4" +hpcsdk_major_version: "22" +hpcsdk_minor_version: "1" +hpcsdk_file_cuda: "11.5" hpcsdk_arch: "x86_64" # In a Slurm cluster, default to setting up HPC SDK as modules rather than in diff --git a/docs/deepops/configuration.md b/docs/deepops/configuration.md index ca3d556fd..c669d94f1 100644 --- a/docs/deepops/configuration.md +++ b/docs/deepops/configuration.md @@ -12,6 +12,7 @@ In particular, this directory includes: - `config/group_vars/all.yml`: An Ansible [variables file](https://docs.ansible.com/ansible/latest/user_guide/playbooks_variables.html) that contains variables we expect to work for all hosts - `config/group_vars/k8s-cluster.yml`: Variables specific to deploying Kubernetes clusters - `config/group_vars/slurm-cluster.yml`: Variables specific to deploying Slurm clusters +- `config/env.sh`: Global variables that override default variable values for all `sh` files in `scripts/*`. - `config/requirements.yml`: An Ansible Galaxy [requirements file](https://docs.ansible.com/ansible/latest/galaxy/user_guide.html#installing-roles-and-collections-from-the-same-requirements-yml-file) that contains a list of custom Collections and Roles to install. Collections and Roles required by DeepOps are stored in a separate `roles/requirements.yml` file, which should not be modified. It's expected that most DeepOps deployments will make changes to these files! 
diff --git a/docs/deepops/testing.md b/docs/deepops/testing.md index c09f26bd7..26c2486db 100644 --- a/docs/deepops/testing.md +++ b/docs/deepops/testing.md @@ -1,12 +1,13 @@ # DeepOps Testing, CI/CD, and Validation -## DeepOps Continuous Integration Testing + +## DeepOps end-to-end testing The DeepOps project leverages a private Jenkins server to run continuous integration tests. Testing is done using the [virtual](../../virtual) deployment mechanism. Several Vagrant VMs are created, the cluster is deployed, tests are executed, and then the VMs are destroyed. The goal of the DeepOps CI is to prevent bugs from being introduced into the code base and to identify when changes in 3rd party platforms have occurred or impacted the DeepOps deployment mechanisms. In general, K8s and Slurm deployment issues are detected and resolved with urgency. Many components of DeepOps are 3rd party open source tools that may silently fail or suddenly change without notice. The team will make a best-effort to resolve these issues and include regression tests, however there may be times where a fix is unavailable. Historically, this has been an issue with Rook-Ceph and Kubeflow, and those GitHub communities are best equipped to help with resolutions. -### Testing Methodi +### Testing Method DeepOps CI contains two types of automated tests: @@ -63,6 +64,77 @@ A short description of the nightly testing is outlined below. The full suit of t | MIG configuration | | | | No testing support +## DeepOps Ansible role testing + +A subset of the Ansible roles in DeepOps have tests defined using [Ansible Molecule](https://molecule.readthedocs.io/en/latest/). +This testing mechanism allows the roles to be tested individually, providing additional test signal to identify issues which do not appear in the end-to-end tests. +These tests are run automatically for each pull request using [Github Actions](https://github.com/NVIDIA/deepops/actions). 
+ +Molecule testing runs the Ansible role in question inside a Docker container. +As such, not all roles will be easy to test with this mechanism. +Roles which mostly involve installing software, configuring services, or executing scripts should generally be possible to test. +Roles which rely on the presence of specific hardware (such as GPUs), which reboot the nodes they act on, or which make changes to kernel configuration are going to be harder to test with Molecule. + +### Defining Molecule tests for a new role + +To add Molecule tests to a new role, the following procedure can be used. + +1. Ensure you have Docker installed in your development environment + +2. Install Ansible Molecule in your development environment + +``` +$ python3 -m pip install "molecule[docker,lint]" +``` + +3. Initialize Molecule in your new role + +``` +$ cd deepops/roles/<your-role> +$ molecule init scenario -r <your-role> --driver docker +``` + +4. In the file `molecule/default/molecule.yml`, define the list of platforms to be tested. +DeepOps currently supports operating systems based on Ubuntu 18.04, Ubuntu 20.04, EL7, and EL8. +To test these stacks, the following `platforms` stanza can be used. + +``` +platforms: + - name: ubuntu-1804 + image: geerlingguy/docker-ubuntu1804-ansible + pre_build_image: true + - name: ubuntu-2004 + image: geerlingguy/docker-ubuntu2004-ansible + pre_build_image: true + - name: centos-7 + image: geerlingguy/docker-centos7-ansible + pre_build_image: true + - name: centos-8 + image: geerlingguy/docker-centos8-ansible + pre_build_image: true +``` + +5. If you haven't already, define your role's metadata in the file `meta/main.yml`. +A sample `meta/main.yml` is shown here: + +``` +galaxy_info: + role_name: <your-role> + namespace: deepops + author: DeepOps Team + company: NVIDIA + description: <your-description> + license: 3-Clause BSD + min_ansible_version: 2.9 +``` + +6. Once this is done, verify that your role executes successfully in the Molecule environment by running `molecule test`. 
If you run into any issues, consult the [Molecule documentation](https://molecule.readthedocs.io/en/latest/index.html) for help resolving them. + +7. (optional) In addition to testing successful execution, you can add additional tests which will be run after your role completes in a file `molecule/default/verify.yml`. This is an Ansible playbook that will run in the same environment as your playbook ran. For a simple example of such a verify playbook, see the [Enroot role](https://github.com/NVIDIA/ansible-role-enroot/blob/master/molecule/default/verify.yml). + +8. Once you're confident that your new tests are all passing, add your role to the `deepops-role` section in the `.github/workflows/molecule.yml` file. + + ## DeepOps Deployment Validation The Slurm and Kubernetes deployment guides both document cluster verification steps. These should be run during the installation process to validate a GPU workload can be executed on the cluster. diff --git a/playbooks/container/singularity.yml b/playbooks/container/singularity.yml index 16ca5b9ab..74208898f 100644 --- a/playbooks/container/singularity.yml +++ b/playbooks/container/singularity.yml @@ -1,10 +1,5 @@ --- - hosts: all become: yes - pre_tasks: - - name: create a folder for go - file: - path: "{{ golang_install_dir }}" - recurse: yes roles: - - lecorguille.singularity + - singularity_wrapper diff --git a/playbooks/slurm-cluster/files/cve_2021_44228.options b/playbooks/slurm-cluster/files/cve_2021_44228.options new file mode 100644 index 000000000..5af9281fc --- /dev/null +++ b/playbooks/slurm-cluster/files/cve_2021_44228.options @@ -0,0 +1 @@ +-Dlog4j2.formatMsgNoLookups=true diff --git a/playbooks/slurm-cluster/logging.yml b/playbooks/slurm-cluster/logging.yml index 6a6f7609f..bb56a77c4 100644 --- a/playbooks/slurm-cluster/logging.yml +++ b/playbooks/slurm-cluster/logging.yml @@ -3,21 +3,77 @@ become: true vars: elasticsearch_network_host: 0.0.0.0 - logstash_listen_port_beats: 5000 + pre_tasks: + - name: 
debian - ensure apt cache updated + apt: + update_cache: true + when: ansible_os_family == "Debian" roles: - - geerlingguy.java - - geerlingguy.elasticsearch - - geerlingguy.logstash - - geerlingguy.kibana + - robertdebock.java + - robertdebock.elastic_repo + - robertdebock.elasticsearch + - robertdebock.logstash + - robertdebock.kibana + +- hosts: slurm-master[0] + become: true + vars: + filebeat_port: "5000" + tasks: + - name: configure logstash to accept logs from filebeat + template: + src: "filebeat.conf" + dest: "/etc/logstash/conf.d/filebeat.conf" + owner: "root" + group: "root" + mode: "0644" + +# Mitigation for CVE-2021-44228 impacting Log4j2 +# https://discuss.elastic.co/t/apache-log4j2-remote-code-execution-rce-vulnerability-cve-2021-44228-esa-2021-31/291476 +- hosts: slurm-master[0] + become: yes tasks: - - name: fix bug in logstash role - command: /usr/share/logstash/bin/logstash-plugin install logstash-filter-multiline + - name: configure elasticsearch to mitigate CVE-2021-44228 + copy: + src: "cve_2021_44228.options" + dest: "/etc/elasticsearch/jvm.options.d/cve_2021_44228.options" + owner: "root" + group: "root" + mode: "0644" + notify: + - restart-elasticsearch + - name: check for relevant class in logstash + shell: unzip -l /usr/share/logstash/logstash-core/lib/jars/log4j-core-2.* | grep JndiLookup.class + register: logstash_jndi + changed_when: logstash_jndi.rc == 0 + failed_when: logstash_jndi.rc == 2 + - name: configure logstash to mitigate CVE-2021-44228 + shell: zip -q -d /usr/share/logstash/logstash-core/lib/jars/log4j-core-2.* org/apache/logging/log4j/core/lookup/JndiLookup.class + notify: + - restart-logstash + when: logstash_jndi.changed + - name: manually stop logstash as restart is not consistently working later + service: + name: logstash + state: stopped + notify: + - restart-logstash + when: logstash_jndi.changed + handlers: + - name: restart-elasticsearch + service: + name: elasticsearch + state: restarted + - name: restart-logstash 
+ service: + name: logstash + state: restarted - hosts: slurm-cluster become: true vars: filebeat_create_config: true - filebeat_prospectors: + filebeat_inputs: - input_type: log paths: - "/var/log/*.log" diff --git a/playbooks/slurm-cluster/templates/filebeat.conf b/playbooks/slurm-cluster/templates/filebeat.conf new file mode 100644 index 000000000..53860ed4f --- /dev/null +++ b/playbooks/slurm-cluster/templates/filebeat.conf @@ -0,0 +1,12 @@ +input { + beats { + port => {{ filebeat_port }} + } +} + +output { + elasticsearch { + hosts => ["http://localhost:9200"] + index => "%{[@metadata][beat]}-%{[@metadata][version]}" + } +} diff --git a/roles/dns-config/tasks/main.yml b/roles/dns-config/tasks/main.yml index 639e3f3cc..58b7b7dfd 100644 --- a/roles/dns-config/tasks/main.yml +++ b/roles/dns-config/tasks/main.yml @@ -16,12 +16,12 @@ - systemd-resolved when: ansible_distribution == 'Ubuntu' and ansible_distribution_major_version == '16' -- name: disable services (bionic) +- name: disable services (bionic, focal) service: name: systemd-resolved state: stopped enabled: no - when: ansible_distribution == 'Ubuntu' and ansible_distribution_major_version == '18' + when: ansible_distribution == 'Ubuntu' and (ansible_distribution_major_version in ['18', '20']) - name: install /etc/resolv.conf template: diff --git a/roles/nvidia-cuda/defaults/main.yml b/roles/nvidia-cuda/defaults/main.yml index 84cbe2b78..628eb5675 100644 --- a/roles/nvidia-cuda/defaults/main.yml +++ b/roles/nvidia-cuda/defaults/main.yml @@ -1,6 +1,6 @@ --- # 'cuda' is the generic package and will pull the latest version -cuda_version: "cuda-toolkit-11-3" +cuda_version: "cuda-toolkit-11-5" # DGX-specific vars may be used to target specific models, # because available versions for DGX may differ from the generic repo diff --git a/roles/nvidia-hpc-sdk/defaults/main.yml b/roles/nvidia-hpc-sdk/defaults/main.yml index 98f27b794..b54736936 100644 --- a/roles/nvidia-hpc-sdk/defaults/main.yml +++ 
b/roles/nvidia-hpc-sdk/defaults/main.yml @@ -15,15 +15,15 @@ # See https://developer.nvidia.com/nvidia-hpc-sdk-downloads for more detail on available downloads. # Version strings used to construct download URL -hpcsdk_major_version: "21" -hpcsdk_minor_version: "9" -hpcsdk_file_cuda: "11.4" +hpcsdk_major_version: "22" +hpcsdk_minor_version: "1" +hpcsdk_file_cuda: "11.5" hpcsdk_arch: "x86_64" # We need to specify the default CUDA toolkit to use during installation. # This should usually be the latest CUDA included in the HPC SDK you are # installing. -hpcsdk_default_cuda: "11.4" +hpcsdk_default_cuda: "11.5" # Add HPC SDK modules to the MODULEPATH? hpcsdk_install_as_modules: false diff --git a/roles/requirements.yml b/roles/requirements.yml index da9b0c789..b205d5917 100644 --- a/roles/requirements.yml +++ b/roles/requirements.yml @@ -36,19 +36,22 @@ roles: version: "v0.5.0" - src: geerlingguy.filebeat - version: "2.0.1" + version: "3.3.0" -- src: geerlingguy.logstash - version: "4.0.0" +- src: robertdebock.java + version: "4.1.1" -- src: geerlingguy.elasticsearch - version: "3.0.1" +- src: robertdebock.elastic_repo + version: "1.0.3" -- src: geerlingguy.java - version: "1.9.5" +- src: robertdebock.logstash + version: "1.1.1" -- src: geerlingguy.kibana - version: "3.2.1" +- src: robertdebock.elasticsearch + version: "1.1.3" + +- src: robertdebock.kibana + version: "1.2.4" - src: https://github.com/DeepOps/ansible-maas.git name: ansible-maas @@ -61,8 +64,8 @@ roles: - src: https://github.com/OSC/ood-ansible.git version: 'v2.0.3' +- src: abims_sbr.singularity + version: 3.7.1-1 + - src: gantsign.golang version: 2.4.0 - -- src: lecorguille.singularity - version: 1.2.0 diff --git a/roles/singularity_wrapper/.yamllint b/roles/singularity_wrapper/.yamllint new file mode 100644 index 000000000..882767605 --- /dev/null +++ b/roles/singularity_wrapper/.yamllint @@ -0,0 +1,33 @@ +--- +# Based on ansible-lint config +extends: default + +rules: + braces: + max-spaces-inside: 1 + 
level: error + brackets: + max-spaces-inside: 1 + level: error + colons: + max-spaces-after: -1 + level: error + commas: + max-spaces-after: -1 + level: error + comments: disable + comments-indentation: disable + document-start: disable + empty-lines: + max: 3 + level: error + hyphens: + level: error + indentation: disable + key-duplicates: enable + line-length: disable + new-line-at-end-of-file: disable + new-lines: + type: unix + trailing-spaces: disable + truthy: disable diff --git a/roles/singularity_wrapper/defaults/main.yml b/roles/singularity_wrapper/defaults/main.yml new file mode 100644 index 000000000..5be75a6a4 --- /dev/null +++ b/roles/singularity_wrapper/defaults/main.yml @@ -0,0 +1,10 @@ +--- +# vars for abims_sbr.singularity +singularity_version: "3.7.3" +singularity_conf_path: "/etc/singularity/singularity.conf" +bind_paths: [] + +# vars for gantsign.golang +golang_version: "1.14.4" +golang_install_dir: "/opt/go/{{ golang_version }}" +golang_gopath: "/opt/go/packages" diff --git a/roles/singularity_wrapper/meta/main.yml b/roles/singularity_wrapper/meta/main.yml new file mode 100644 index 000000000..9fbd94944 --- /dev/null +++ b/roles/singularity_wrapper/meta/main.yml @@ -0,0 +1,9 @@ +--- +galaxy_info: + role_name: singularity_wrapper + namespace: deepops + author: DeepOps Team + company: NVIDIA + description: Wrap abims_sbr.singularity role + license: 3-Clause BSD + min_ansible_version: 2.9 diff --git a/roles/singularity_wrapper/molecule/default/converge.yml b/roles/singularity_wrapper/molecule/default/converge.yml new file mode 100644 index 000000000..c0295f669 --- /dev/null +++ b/roles/singularity_wrapper/molecule/default/converge.yml @@ -0,0 +1,7 @@ +--- +- name: Converge + hosts: all + tasks: + - name: "Include singularity_wrapper" + include_role: + name: "singularity_wrapper" diff --git a/roles/singularity_wrapper/molecule/default/molecule.yml b/roles/singularity_wrapper/molecule/default/molecule.yml new file mode 100644 index 
000000000..2962ff2e7 --- /dev/null +++ b/roles/singularity_wrapper/molecule/default/molecule.yml @@ -0,0 +1,26 @@ +--- +dependency: + name: galaxy + options: + requirements-file: requirements.yml +driver: + name: docker +platforms: + - name: ubuntu-1804 + image: geerlingguy/docker-ubuntu1804-ansible + pre_build_image: true + - name: ubuntu-2004 + image: geerlingguy/docker-ubuntu2004-ansible + pre_build_image: true + - name: centos-7 + image: geerlingguy/docker-centos7-ansible + pre_build_image: true + - name: centos-8 + image: geerlingguy/docker-centos8-ansible + pre_build_image: true +provisioner: + name: ansible + ansible_args: + - -vv +verifier: + name: ansible diff --git a/roles/singularity_wrapper/molecule/default/verify.yml b/roles/singularity_wrapper/molecule/default/verify.yml new file mode 100644 index 000000000..b5afb1c0d --- /dev/null +++ b/roles/singularity_wrapper/molecule/default/verify.yml @@ -0,0 +1,13 @@ +--- +- name: verify + hosts: all + tasks: + - name: check for path to singularity + command: which singularity + register: which_singularity + changed_when: which_singularity.rc != 0 + + - name: verify path to singularity + assert: + that: + - "'/usr/local/bin/singularity' in which_singularity.stdout" diff --git a/roles/singularity_wrapper/tasks/main.yml b/roles/singularity_wrapper/tasks/main.yml new file mode 100644 index 000000000..a0675c73e --- /dev/null +++ b/roles/singularity_wrapper/tasks/main.yml @@ -0,0 +1,35 @@ +--- +- name: centos 8 - ensure powertools installed + block: + - name: ensure prereq packages installed + yum: + name: "dnf-plugins-core" + state: "present" + - name: enable powertools + command: "yum config-manager --set-enabled powertools" + register: enable_powertools + changed_when: enable_powertools.rc != 0 + when: (ansible_distribution == "CentOS") and (ansible_distribution_major_version == "8") + +- name: rhel 8 - ensure CRB repository is enabled + rhsm_repository: + name: "codeready-builder-for-rhel-8-x86_64-rpms" + when: 
(ansible_distribution in ["RedHat", "Red Hat Enterprise Linux"]) and (ansible_distribution_major_version == "8") + +- name: debian - ensure apt cache is up to date + apt: + update_cache: yes + when: ansible_os_family == "Debian" + +- name: create a folder for go + file: + path: "{{ golang_install_dir }}" + recurse: yes + +- name: install golang explicitly + include_role: + name: gantsign.golang + +- name: install singularity + include_role: + name: abims_sbr.singularity diff --git a/roles/slurm/defaults/main.yml b/roles/slurm/defaults/main.yml index 49efe6330..79dc46b53 100644 --- a/roles/slurm/defaults/main.yml +++ b/roles/slurm/defaults/main.yml @@ -6,7 +6,7 @@ hwloc_build_dir: /opt/deepops/build/hwloc pmix_build_dir: /opt/deepops/build/pmix slurm_workflow_build: yes -slurm_version: 21.08.0 +slurm_version: 21.08.5 slurm_src_url: "https://download.schedmd.com/slurm/slurm-{{ slurm_version }}.tar.bz2" slurm_build_make_clean: no slurm_build_dir_cleanup: no diff --git a/roles/standalone-container-registry/tasks/main.yml b/roles/standalone-container-registry/tasks/main.yml index 1a76370ac..e462581a9 100644 --- a/roles/standalone-container-registry/tasks/main.yml +++ b/roles/standalone-container-registry/tasks/main.yml @@ -53,5 +53,6 @@ network_mode: host restart: yes restart_policy: unless-stopped + env: "{{ proxy_env if proxy_env is defined else {} }}" volumes: - "{{ standalone_container_registry_config_dir }}/config.yml:/etc/docker/registry/config.yml" diff --git a/scripts/airgap/build_offline_cache.sh b/scripts/airgap/build_offline_cache.sh index 00428a9fa..5ed169ac8 100755 --- a/scripts/airgap/build_offline_cache.sh +++ b/scripts/airgap/build_offline_cache.sh @@ -4,6 +4,10 @@ set -ex SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ROOT_DIR="${SCRIPT_DIR}/../.." 
DEEPOPS_CONFIG_DIR="${DEEPOPS_CONFIG_DIR:-${ROOT_DIR}/config.example}" + +# Source common libraries and env variables +source ${ROOT_DIR}/scripts/common.sh + DEST_DIR="/tmp/deepops" TARBALL="/tmp/deepops-archive.tar" DEEPOPS_BUILD_TARBALL="${DEEPOPS_BUILD_TARBALL:-1}" diff --git a/scripts/common.sh b/scripts/common.sh new file mode 100644 index 000000000..024f0b690 --- /dev/null +++ b/scripts/common.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# This is a common set of libraries, configuration override, helper functions, and debug output +# This file should be sourced at the top of all scripts and primarily does 3 things +# 1. Will source the env.sh file to allow override variables be version controlled in ./config +# 2. Will print out some standard debug for each script, to ease debugging +# 3. Will provide a common set of libraries, directory names, etc. + + +# Determine the path to the configuration directory and verify it exists +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/.." +DEEPOPS_CONFIG_DIR=${DEEPOPS_CONFIG_DIR:-"${ROOT_DIR}/config"} +if [ ! -d "${DEEPOPS_CONFIG_DIR}" ]; then + # Because this is a widely used script, we warn here instead of throwing an error + echo "WARNING: Can't find configuration in ${DEEPOPS_CONFIG_DIR}" + echo "WARNING: Please set DEEPOPS_CONFIG_DIR env variable to point to config location" +elif [ -f "${DEEPOPS_CONFIG_DIR}/env.sh" ]; then + # Source the configuration environment variable overrides; env.sh is optional so a config dir without it does not abort callers running under 'set -e' + source "${DEEPOPS_CONFIG_DIR}/env.sh" +fi + +# Print out base debug +echo "Starting '${0}'; DeepOps version '${DEEPOPS_VERSION:-unknown}'" diff --git a/scripts/generic/install_docker.sh b/scripts/generic/install_docker.sh index 930726d4c..53d5a704b 100755 --- a/scripts/generic/install_docker.sh +++ b/scripts/generic/install_docker.sh @@ -1,5 +1,10 @@ #!/usr/bin/env bash +# Source common libraries and env variables +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/../.." 
+source ${ROOT_DIR}/scripts/common.sh + DOCKER_COMPOSE_URL="${DOCKER_COMPOSE_URL:-https://github.com/docker/compose/releases/download/1.23.2/docker-compose-$(uname -s)-$(uname -m)}" type docker >/dev/null 2>&1 diff --git a/scripts/k8s/debug.sh b/scripts/k8s/debug.sh index ff8a2af66..8a9102368 100755 --- a/scripts/k8s/debug.sh +++ b/scripts/k8s/debug.sh @@ -7,6 +7,11 @@ # Requirements for this script are a working "kubectl" and ideally a working "helm" # Optionally, a working "ansible" with a config/inventory file that has kubernetes node defined in a kube-node group +# Source common libraries and env variables +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/../.." +source ${ROOT_DIR}/scripts/common.sh + timestamp=$(date +%s) logdir=config/log_${timestamp} mkdir -p ${logdir} @@ -35,6 +40,8 @@ kubectl get nodes > ${logdir}/get-nodes.log kubectl describe nodes > ${logdir}/describe-nodes.log kubectl get storageclass > ${logdir}/get-storageclass.log kubectl get events -A > ${logdir}/get-events.log +kubectl get svc -A > ${logdir}/get-svc.log + # Kubectl / GPU Operator (Generic for any Kubernetes cluster) kubectl get pvc -A > ${logdir}/get-pvc.log for pod in $(kubectl get pods -n gpu-operator-resources | grep nvidia-device-plugin | awk '{print $1}'); do diff --git a/scripts/k8s/deploy_dashboard_user.sh b/scripts/k8s/deploy_dashboard_user.sh index b4e92784d..7c82685fe 100755 --- a/scripts/k8s/deploy_dashboard_user.sh +++ b/scripts/k8s/deploy_dashboard_user.sh @@ -1,5 +1,10 @@ #!/usr/bin/env bash +# Source common libraries and env variables +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/../.." 
+source ${ROOT_DIR}/scripts/common.sh + # Make the dashboard a NodePort kubectl patch svc -n kube-system kubernetes-dashboard -p '{"spec": {"type": "NodePort", "ports": [{"nodePort": 31443, "port": 443}] }}' diff --git a/scripts/k8s/deploy_ingress.sh b/scripts/k8s/deploy_ingress.sh index 8c4fc3f2c..8b3691245 100755 --- a/scripts/k8s/deploy_ingress.sh +++ b/scripts/k8s/deploy_ingress.sh @@ -5,6 +5,9 @@ set -x SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ROOT_DIR="${SCRIPT_DIR}/../.." +# Source common libraries and env variables +source ${ROOT_DIR}/scripts/common.sh + HELM_CHARTS_REPO_INGRESS="${HELM_CHARTS_REPO_INGRESS:-https://kubernetes.github.io/ingress-nginx}" HELM_INGRESS_CHART_VERSION="${HELM_INGRESS_CHART_VERSION:-3.5.1}" # HELM_INGRESS_CONFIG, defaults below based on presence of metallb diff --git a/scripts/k8s/deploy_kubeflow.sh b/scripts/k8s/deploy_kubeflow.sh index 31b59122d..887c0dcdd 100755 --- a/scripts/k8s/deploy_kubeflow.sh +++ b/scripts/k8s/deploy_kubeflow.sh @@ -5,6 +5,9 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ROOT_DIR="${SCRIPT_DIR}/../.." CONFIG_DIR="${ROOT_DIR}/config" +# Source common libraries and env variables +source ${ROOT_DIR}/scripts/common.sh + # Specify credentials for the default user. # TODO: Dynamically sed/hash these value into the CONFIG, these are currently not used export KUBEFLOW_USER_EMAIL="${KUBEFLOW_USER_EMAIL:-admin@kubeflow.org}" diff --git a/scripts/k8s/deploy_loadbalancer.sh b/scripts/k8s/deploy_loadbalancer.sh index 65e309fd5..e2dada441 100755 --- a/scripts/k8s/deploy_loadbalancer.sh +++ b/scripts/k8s/deploy_loadbalancer.sh @@ -5,6 +5,9 @@ set -x SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ROOT_DIR="${SCRIPT_DIR}/../.." +# Source common libraries and env variables +source ${ROOT_DIR}/scripts/common.sh + # Allow overriding config dir to look in DEEPOPS_CONFIG_DIR=${DEEPOPS_CONFIG_DIR:-"${ROOT_DIR}/config"} if [ ! 
-d "${DEEPOPS_CONFIG_DIR}" ]; then diff --git a/scripts/k8s/deploy_monitoring.sh b/scripts/k8s/deploy_monitoring.sh index aab0906d7..d793ef8d9 100755 --- a/scripts/k8s/deploy_monitoring.sh +++ b/scripts/k8s/deploy_monitoring.sh @@ -11,6 +11,9 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ROOT_DIR="${SCRIPT_DIR}/../.." cd "${ROOT_DIR}" || exit 1 +# Source common libraries and env variables +source ${ROOT_DIR}/scripts/common.sh + # Allow overriding config dir to look in DEEPOPS_CONFIG_DIR=${DEEPOPS_CONFIG_DIR:-"${ROOT_DIR}/config"} diff --git a/scripts/k8s/deploy_rook.sh b/scripts/k8s/deploy_rook.sh index 56394804f..9a746863f 100755 --- a/scripts/k8s/deploy_rook.sh +++ b/scripts/k8s/deploy_rook.sh @@ -8,8 +8,11 @@ # Get absolute path for script and root SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ROOT_DIR="${SCRIPT_DIR}/../.." -CHART_VERSION="1.22.1" +# Source common libraries and env variables +source ${ROOT_DIR}/scripts/common.sh + +CHART_VERSION="1.22.1" HELM_ROOK_CHART_REPO="${HELM_ROOK_CHART_REPO:-https://charts.rook.io/release}" HELM_ROOK_CHART_VERSION="${HELM_ROOK_CHART_VERSION:-v1.1.1}" diff --git a/scripts/k8s/install_helm.sh b/scripts/k8s/install_helm.sh index 6906578ed..3df866146 100755 --- a/scripts/k8s/install_helm.sh +++ b/scripts/k8s/install_helm.sh @@ -2,6 +2,11 @@ set -x +# Source common libraries and env variables +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/../.." 
+source ${ROOT_DIR}/scripts/common.sh + HELM_INSTALL_DIR=/usr/local/bin HELM_INSTALL_SCRIPT_URL="${HELM_INSTALL_SCRIPT_URL:-https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3}" HELM_MINIMUM_VERSION=v3.4.1+gc4e7485 @@ -41,7 +46,7 @@ if [ "${HELM_MINIMUM_VERSION}" != "${helm_min_installed}" ]; then chmod +x /var/tmp/get_helm.sh #sed -i 's/sudo//g' /var/tmp/get_helm.sh mkdir -p ${HELM_INSTALL_DIR} - HELM_INSTALL_DIR=${HELM_INSTALL_DIR} DESIRED_VERSION=v3.5.4 /var/tmp/get_helm.sh # Should match: config/group_vars/k8s-cluster.yml:helm_version: + HELM_INSTALL_DIR=${HELM_INSTALL_DIR} DESIRED_VERSION=v3.7.1 /var/tmp/get_helm.sh # Should match: config/group_vars/k8s-cluster.yml:helm_version: fi # Display the helm version for better debug diff --git a/scripts/k8s/setup_remote_k8s.sh b/scripts/k8s/setup_remote_k8s.sh index 8115fc856..70462b874 100755 --- a/scripts/k8s/setup_remote_k8s.sh +++ b/scripts/k8s/setup_remote_k8s.sh @@ -1,5 +1,10 @@ #!/usr/bin/env bash +# Source common libraries and env variables +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/../.." +source ${ROOT_DIR}/scripts/common.sh + KUBECTL_BINARY_URL="${KUBECTL_BINARY_URL:-https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl}" # Install dependencies diff --git a/scripts/k8s/verify_gpu.sh b/scripts/k8s/verify_gpu.sh index e284af019..a417733b0 100755 --- a/scripts/k8s/verify_gpu.sh +++ b/scripts/k8s/verify_gpu.sh @@ -4,6 +4,11 @@ # Check the output and verify the number of nodes and GPUs is as expected # TODO: This script should be wrapped by Ansible to verify that the output of nvidia-smi on each node matches K8S +# Source common libraries and env variables +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/../.." 
+source "${ROOT_DIR}/scripts/common.sh" + export KFCTL=${KFCTL:-~/kfctl} export CLUSTER_VERIFY_NS=${CLUSTER_VERIFY_NS:-cluster-gpu-verify} export CLUSTER_VERIFY_EXPECTED_PODS=${CLUSTER_VERIFY_EXPECTED_PODS:-} diff --git a/scripts/nginx-docker-cache/gen-ca.sh b/scripts/nginx-docker-cache/gen-ca.sh index da2ad9e7d..3da38f783 100755 --- a/scripts/nginx-docker-cache/gen-ca.sh +++ b/scripts/nginx-docker-cache/gen-ca.sh @@ -1,5 +1,10 @@ #!/usr/bin/env bash +# Source common libraries and env variables +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/../.." +source "${ROOT_DIR}/scripts/common.sh" + CA_CRT_OUTFILE="${CA_CRT_OUTFILE:-/tmp/ca.crt}" CA_KEY_OUTFILE="${CA_KEY_OUTFILE:-/tmp/ca.key}" diff --git a/scripts/setup.sh b/scripts/setup.sh index 291907797..c28166e85 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -7,10 +7,14 @@ # Can be run standalone with: curl -sL git.io/deepops | bash # or: curl -sL git.io/deepops | bash -s -- 19.07 +# Determine current directory and root directory +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +ROOT_DIR="${SCRIPT_DIR}/.." + # Configuration ANSIBLE_VERSION="${ANSIBLE_VERSION:-2.9.27}" # Ansible version to install ANSIBLE_TOO_NEW="${ANSIBLE_TOO_NEW:-2.10.0}" # Ansible version too new -CONFIG_DIR="${CONFIG_DIR:-./config}" # Default configuration directory location +CONFIG_DIR="${CONFIG_DIR:-${ROOT_DIR}/config}" # Default configuration directory location DEEPOPS_TAG="${1:-master}" # DeepOps branch to set up JINJA2_VERSION="${JINJA2_VERSION:-2.11.1}" # Jinja2 required version PIP="${PIP:-pip3}" # Pip binary to use @@ -21,10 +25,6 @@ VENV_DIR="${VENV_DIR:-/opt/deepops/env}" # Path to python virtual environ # Set distro-specific variables . /etc/os-release - -SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -ROOT_DIR="${SCRIPT_DIR}/.." 
- DEPS_DEB=(git virtualenv python3-virtualenv sshpass wget) DEPS_EL7=(git libselinux-python3 python-virtualenv python3-virtualenv sshpass wget) DEPS_EL8=(git python3-libselinux python3-virtualenv sshpass wget) @@ -146,10 +146,21 @@ fi # Install Ansible Galaxy roles if command -v ansible-galaxy &> /dev/null ; then echo "Updating Ansible Galaxy roles..." - as_user ansible-galaxy collection install --force -r "${ROOT_DIR}/roles/requirements.yml" >/dev/null - as_user ansible-galaxy role install --force -r "${ROOT_DIR}/roles/requirements.yml" >/dev/null - as_user ansible-galaxy collection install --force -i -r "${ROOT_DIR}/config/requirements.yml" >/dev/null - as_user ansible-galaxy role install --force -i -r "${ROOT_DIR}/config/requirements.yml" >/dev/null + initial_dir="$(pwd)" + roles_path="${ROOT_DIR}/roles/galaxy" + collections_path="${ROOT_DIR}/collections" + + cd "${ROOT_DIR}" + as_user ansible-galaxy collection install -p "${collections_path}" --force -r "roles/requirements.yml" >/dev/null + as_user ansible-galaxy role install -p "${roles_path}" --force -r "roles/requirements.yml" >/dev/null + + # Install any user-defined config requirements + if [ -d "${CONFIG_DIR}" ] && [ -f "${CONFIG_DIR}/requirements.yml" ] ; then + cd "${CONFIG_DIR}" + as_user ansible-galaxy collection install -p "${collections_path}" --force -i -r "requirements.yml" >/dev/null + as_user ansible-galaxy role install -p "${roles_path}" --force -i -r "requirements.yml" >/dev/null + fi + cd "${initial_dir}" else echo "ERROR: Unable to install Ansible Galaxy roles, 'ansible-galaxy' command not found" fi diff --git a/submodules/kubespray b/submodules/kubespray index bcf695913..eeeca4a1d 160000 --- a/submodules/kubespray +++ b/submodules/kubespray @@ -1 +1 @@ -Subproject commit bcf695913f5332c0acf08b206cc055c9482664d9 +Subproject commit eeeca4a1d0334efebcf732d08bffc7e10240fc9c