From d4ab28d7b62fa24f17b3918e5e28c41747f4fc7e Mon Sep 17 00:00:00 2001 From: Boris Glimcher Date: Tue, 16 Jan 2024 04:23:59 +0200 Subject: [PATCH] feat: Install Nvidia DOCA on the servers post provisioning Signed-off-by: Boris Glimcher --- .../InstallingProvisionTool/index.rst | 2 +- .../installprovisiontool.rst | 12 ++ .../provisionparams.rst | 4 + .../provisionprereqs.rst | 4 +- .../InstallationGuides/addinganewnode.rst | 1 + .../reprovisioningthecluster.rst | 1 + .../SupportMatrix/omniainstalledsoftware.rst | 2 + docs/source/Roles/Network/index.rst | 12 +- docs/source/Tables/bmc.csv | 5 + docs/source/Tables/mapping.csv | 5 + docs/source/Tables/snmpwalk.csv | 5 + docs/source/Tables/switch-based.csv | 5 + input/network_config.yml | 10 ++ input/provision_config.yml | 8 ++ network/network.yml | 10 ++ .../nvidia_doca/tasks/install_doca_leap.yml | 18 +++ .../nvidia_doca/tasks/install_doca_redhat.yml | 53 ++++++++ network/roles/nvidia_doca/tasks/main.yml | 41 ++++++ .../nvidia_doca/tasks/pre-requisites.yml | 31 +++++ .../roles/nvidia_doca/tasks/validations.yml | 57 +++++++++ network/roles/nvidia_doca/vars/main.yml | 26 ++++ prereq.sh | 2 +- provision/README.rst | 2 +- .../tasks/validate_postscripts_path.yml | 27 +++- .../roles/provision_validation/vars/main.yml | 3 + .../postscripts/configure_postscripts.yml | 13 ++ .../vars/postscripts.yml | 1 + .../files/postscripts/omnia_doca | 19 +++ .../tasks/postscripts/configure_doca.yml | 118 ++++++++++++++++++ .../tasks/postscripts/main.yml | 4 + .../xcat_repo_manipulate/vars/postscripts.yml | 9 ++ 31 files changed, 504 insertions(+), 6 deletions(-) create mode 100644 network/roles/nvidia_doca/tasks/install_doca_leap.yml create mode 100644 network/roles/nvidia_doca/tasks/install_doca_redhat.yml create mode 100644 network/roles/nvidia_doca/tasks/main.yml create mode 100644 network/roles/nvidia_doca/tasks/pre-requisites.yml create mode 100644 network/roles/nvidia_doca/tasks/validations.yml create mode 100644 
network/roles/nvidia_doca/vars/main.yml create mode 100644 provision/roles/xcat_repo_manipulate/files/postscripts/omnia_doca create mode 100644 provision/roles/xcat_repo_manipulate/tasks/postscripts/configure_doca.yml diff --git a/docs/source/InstallationGuides/InstallingProvisionTool/index.rst b/docs/source/InstallationGuides/InstallingProvisionTool/index.rst index 7fccfac68..3797f32ba 100644 --- a/docs/source/InstallationGuides/InstallingProvisionTool/index.rst +++ b/docs/source/InstallationGuides/InstallingProvisionTool/index.rst @@ -13,7 +13,7 @@ This playbook achieves the following tasks: * Configures a docker registry to pull images from the internet and store them locally - * Optionally installs OFED and CUDA + * Optionally installs OFED, DOCA and CUDA .. toctree:: diff --git a/docs/source/InstallationGuides/InstallingProvisionTool/installprovisiontool.rst b/docs/source/InstallationGuides/InstallingProvisionTool/installprovisiontool.rst index b299dc11b..19af4adda 100644 --- a/docs/source/InstallationGuides/InstallingProvisionTool/installprovisiontool.rst +++ b/docs/source/InstallationGuides/InstallingProvisionTool/installprovisiontool.rst @@ -25,6 +25,18 @@ Optional configurations managed by the provision tool * CUDA requires an additional reboot while being installed. While this is taken care of by Omnia, users are required to wait an additional few minutes when running the provision tool with CUDA installation for the target nodes to come up. +**Installing DOCA** + + **Using the provision tool** + + * If ``nvidia_doca_path`` is provided in ``input/provision_config.yml`` and Nvidia DPUs are available on the target nodes, DOCA packages will be deployed post provisioning without user intervention. + + **Using the Network playbook** + + * DOCA can also be installed using `network.yml <../../Roles/Network/index.html>`_ after provisioning the servers (Assuming the provision tool did not install DOCA packages). + + .. 
note:: The DOCA package can be downloaded from `here `_ . + **Installing OFED** **Using the provision tool** diff --git a/docs/source/InstallationGuides/InstallingProvisionTool/provisionparams.rst b/docs/source/InstallationGuides/InstallingProvisionTool/provisionparams.rst index 9552199ea..b94bbfdb8 100644 --- a/docs/source/InstallationGuides/InstallingProvisionTool/provisionparams.rst +++ b/docs/source/InstallationGuides/InstallingProvisionTool/provisionparams.rst @@ -194,6 +194,10 @@ Fill in all provision-specific parameters in ``input/provision_config.yml`` | ``string`` | | | Optional | | +----------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| nvidia_doca_path | Absolute path to local copy of .rpm file containing DOCA packages. The doca rpm can be downloaded from https://developer.nvidia.com/networking/doca. DOCA will be installed post provisioning without any user intervention. Eg: nvidia_doca_path: "/root/doca-host-repo-rhel86-2.5.0-0.0.1.2.5.0108.1.el8.23.10.1.1.9.0.x86_64.rpm" | +| ``string`` | | +| Optional | | ++----------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ .. 
note:: diff --git a/docs/source/InstallationGuides/InstallingProvisionTool/provisionprereqs.rst b/docs/source/InstallationGuides/InstallingProvisionTool/provisionprereqs.rst index c8f181ec0..d83299009 100644 --- a/docs/source/InstallationGuides/InstallingProvisionTool/provisionprereqs.rst +++ b/docs/source/InstallationGuides/InstallingProvisionTool/provisionprereqs.rst @@ -54,12 +54,14 @@ Note the compatibility between cluster OS and control plane OS below: .. [1] Ensure that control planes running RHEL have an active subscription or are configured to access local repositories. The following repositories should be enabled on the control plane: **AppStream**, **Code Ready Builder (CRB)**, **BaseOS**. For RHEL control planes running 8.5 and below, ensure that sshpass is additionally available to install or download to the control plane (from any local repository). -* To **optionally** set up CUDA and OFED using the provisioning tool, download the required repositories to the control plane from here to deploy on the target nodes: +* To **optionally** set up CUDA, DOCA and OFED using the provisioning tool, download the required repositories to the control plane from here to deploy on the target nodes: 1. `For NVIDIA GPUs: `_: CUDA is a parallel computing platform and application programming interface that allows software to use certain types of graphics processing units for general purpose processing, an approach called general-purpose computing on GPUs. 2. `For Mellanox `_: OFED (OpenFabrics Enterprise Distribution) is open-source software for RDMA and kernel bypass applications. OFED can be used in business, research and scientific environments that require highly efficient networks, storage connectivity and parallel computing. + 3. `For NVIDIA DPUs: `_: DOCA is ... + * Ensure that all connection names under the network manager match their corresponding device names. 
To verify network connection names: :: diff --git a/docs/source/InstallationGuides/addinganewnode.rst b/docs/source/InstallationGuides/addinganewnode.rst index 57b7ab4b9..34be5a8e6 100644 --- a/docs/source/InstallationGuides/addinganewnode.rst +++ b/docs/source/InstallationGuides/addinganewnode.rst @@ -8,6 +8,7 @@ While adding a new node to the cluster, users can modify the following: - The operating system - CUDA - OFED + - DOCA A new node can be added using the following ways: diff --git a/docs/source/InstallationGuides/reprovisioningthecluster.rst b/docs/source/InstallationGuides/reprovisioningthecluster.rst index e3af26b75..cae70a73e 100644 --- a/docs/source/InstallationGuides/reprovisioningthecluster.rst +++ b/docs/source/InstallationGuides/reprovisioningthecluster.rst @@ -6,6 +6,7 @@ In the event that an existing Omnia cluster needs a different OS version or a fr - The operating system - CUDA - OFED + - DOCA Omnia can re-provision the cluster by running the following command: :: diff --git a/docs/source/Overview/SupportMatrix/omniainstalledsoftware.rst b/docs/source/Overview/SupportMatrix/omniainstalledsoftware.rst index 1ba159efd..28bc566c4 100644 --- a/docs/source/Overview/SupportMatrix/omniainstalledsoftware.rst +++ b/docs/source/Overview/SupportMatrix/omniainstalledsoftware.rst @@ -126,6 +126,8 @@ Software Installed by Omnia +------------------------------------+------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | MLNX-OFED | BSD License | MLNX_OFED is an NVIDIA tested and packaged version of OFED that supports two interconnect types using the same RDMA (remote DMA) and kernel bypass APIs called OFED verbs – InfiniBand and Ethernet. 
| +------------------------------------+------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| NVIDIA DOCA | NVIDIA License | The NVIDIA® DOCA® is the key to unlocking the potential of the NVIDIA® BlueField® networking platform to offload, accelerate, and isolate data center workloads. | ++------------------------------------+------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ansible pylibssh | LGPL 2.1 | Python bindings to client functionality of libssh specific to Ansible use case. | +------------------------------------+------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | perl-DBD-Pg | GNU General Public License v3 | DBD::Pg - PostgreSQL database driver for the DBI module | diff --git a/docs/source/Roles/Network/index.rst b/docs/source/Roles/Network/index.rst index 66aff73d4..8f0adb00d 100644 --- a/docs/source/Roles/Network/index.rst +++ b/docs/source/Roles/Network/index.rst @@ -16,7 +16,9 @@ Some of the network features Omnia offers are: 2. Infiniband switch configuration -To install OFED drivers, enter all required parameters in ``input/network_config.yml``: +3. 
Nvidia DOCA + +To install OFED and DOCA drivers, enter all required parameters in ``input/network_config.yml``: +------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -37,6 +39,14 @@ To install OFED drivers, enter all required parameters in ``input/network_config | | * ``false`` <- Default | | | * ``true`` | +------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| nvidia_doca_offline_path | Absolute path to local copy of rpm file containing DOCA package. The package can be downloaded from https://developer.nvidia.com/networking/doca/. | +| [optional] | | +| ``string`` | | ++------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| nvidia_doca_version | Indicates the version of DOCA to be downloaded. If ``nvidia_doca_offline_path`` is not given, declaring this variable is mandatory. | +| [optional] | | +| ``string`` | **Default value**: 2.5.0-0.0.1.23.10.1.1.9.0 | ++------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ To run the script: :: diff --git a/docs/source/Tables/bmc.csv b/docs/source/Tables/bmc.csv index 41b85a1a2..feac55ac1 100644 --- a/docs/source/Tables/bmc.csv +++ b/docs/source/Tables/bmc.csv @@ -283,6 +283,11 @@ Optional",Absolute path to a local copy of the .iso file containing Mellanox OF ``string`` Optional","Absolute path to local copy of .rpm file containing CUDA packages. 
The cuda rpm can be downloaded from https://developer.nvidia.com/cuda-downloads. CUDA will be installed post provisioning without any user intervention. Eg: cuda_toolkit_path: ""/root/cuda-repo-rhel8-12-0-local-12.0.0_525.60.13-1.x86_64.rpm""" +"**nvidia_doca_path** + +``string`` + +Optional","Absolute path to local copy of .rpm file containing DOCA packages. The doca rpm can be downloaded from https://developer.nvidia.com/networking/doca. DOCA will be installed post provisioning without any user intervention. Eg: nvidia_doca_path: ""/root/doca-host-repo-rhel86-2.5.0-0.0.1.2.5.0108.1.el8.23.10.1.1.9.0.x86_64.rpm""" "**apptainer_support** ``boolean`` [1]_ diff --git a/docs/source/Tables/mapping.csv b/docs/source/Tables/mapping.csv index f45ad0e68..43cac7e2a 100644 --- a/docs/source/Tables/mapping.csv +++ b/docs/source/Tables/mapping.csv @@ -258,6 +258,11 @@ Optional",Absolute path to a local copy of the .iso file containing Mellanox OF ``string`` Optional","Absolute path to local copy of .rpm file containing CUDA packages. The cuda rpm can be downloaded from https://developer.nvidia.com/cuda-downloads. CUDA will be installed post provisioning without any user intervention. Eg: cuda_toolkit_path: ""/root/cuda-repo-rhel8-12-0-local-12.0.0_525.60.13-1.x86_64.rpm""" +"**nvidia_doca_path** + +``string`` + +Optional","Absolute path to local copy of .rpm file containing DOCA packages. The doca rpm can be downloaded from https://developer.nvidia.com/networking/doca. DOCA will be installed post provisioning without any user intervention. 
Eg: nvidia_doca_path: ""/root/doca-host-repo-rhel86-2.5.0-0.0.1.2.5.0108.1.el8.23.10.1.1.9.0.x86_64.rpm""" "**apptainer_support** ``boolean`` [1]_ diff --git a/docs/source/Tables/snmpwalk.csv b/docs/source/Tables/snmpwalk.csv index 40656ce33..9888fe872 100644 --- a/docs/source/Tables/snmpwalk.csv +++ b/docs/source/Tables/snmpwalk.csv @@ -265,6 +265,11 @@ Optional",Absolute path to a local copy of the .iso file containing Mellanox OF ``string`` Optional","Absolute path to local copy of .rpm file containing CUDA packages. The cuda rpm can be downloaded from https://developer.nvidia.com/cuda-downloads. CUDA will be installed post provisioning without any user intervention. Eg: cuda_toolkit_path: ""/root/cuda-repo-rhel8-12-0-local-12.0.0_525.60.13-1.x86_64.rpm""" +"**nvidia_doca_path** + +``string`` + +Optional","Absolute path to local copy of .rpm file containing DOCA packages. The doca rpm can be downloaded from https://developer.nvidia.com/networking/doca. DOCA will be installed post provisioning without any user intervention. Eg: nvidia_doca_path: ""/root/doca-host-repo-rhel86-2.5.0-0.0.1.2.5.0108.1.el8.23.10.1.1.9.0.x86_64.rpm""" "**apptainer_support** ``boolean`` [1]_ diff --git a/docs/source/Tables/switch-based.csv b/docs/source/Tables/switch-based.csv index d7167ecc3..d973f37a1 100644 --- a/docs/source/Tables/switch-based.csv +++ b/docs/source/Tables/switch-based.csv @@ -299,6 +299,11 @@ Optional",Absolute path to a local copy of the .iso file containing Mellanox OF ``string`` Optional","Absolute path to local copy of .rpm file containing CUDA packages. The cuda rpm can be downloaded from https://developer.nvidia.com/cuda-downloads. CUDA will be installed post provisioning without any user intervention. Eg: cuda_toolkit_path: ""/root/cuda-repo-rhel8-12-0-local-12.0.0_525.60.13-1.x86_64.rpm""" +"**nvidia_doca_path** + +``string`` + +Optional","Absolute path to local copy of .rpm file containing DOCA packages. 
The doca rpm can be downloaded from https://developer.nvidia.com/networking/doca. DOCA will be installed post provisioning without any user intervention. Eg: nvidia_doca_path: ""/root/doca-host-repo-rhel86-2.5.0-0.0.1.2.5.0108.1.el8.23.10.1.1.9.0.x86_64.rpm""" "**apptainer_support** ``boolean`` [1]_ diff --git a/input/network_config.yml b/input/network_config.yml index 48671c1de..94eaee60e 100644 --- a/input/network_config.yml +++ b/input/network_config.yml @@ -33,3 +33,13 @@ mlnx_ofed_version: 5.4-2.4.1.3 # Mandatory variable # Default value: true mlnx_ofed_add_kernel_support: true + +# Absolute path to local copy of .rpm file containing DOCA package. +# The package can be downloaded from https://developer.nvidia.com/networking/doca/ +# Optional variable. +nvidia_doca_offline_path: "" + +# If nvidia_doca_offline_path is not given, declaring this variable is mandatory. +# The DOCA package is downloaded as per version mentioned in this variable. +# Default value: 2.5.0-0.0.1.23.10.1.1.9.0 +nvidia_doca_version: 2.5.0-0.0.1.23.10.1.1.9.0 diff --git a/input/provision_config.yml b/input/provision_config.yml index 2d9efc063..eb5b35983 100644 --- a/input/provision_config.yml +++ b/input/provision_config.yml @@ -256,6 +256,14 @@ mlnx_ofed_path: "" # cuda_toolkit_path: "/root/cuda-repo-rhel8-12-0-local-12.0.0_525.60.13-1.x86_64.rpm" cuda_toolkit_path: "" +#### Optional, discovery_mechanism: mapping or switch_based or bmc or snmpwalk +# Absolute path to local copy of .rpm file containing DOCA packages. 
+# The doca rpm can be downloaded from https://developer.nvidia.com/networking/doca +# DOCA will be installed post provisioning without any user intervention requirement +# Example: +# nvidia_doca_path: "/root/doca-host-repo-rhel86-2.5.0-0.0.1.2.5.0108.1.el8.23.10.1.1.9.0.x86_64.rpm" +nvidia_doca_path: "" + #### Mandatory, discovery_mechanism: mapping or switch_based or bmc or snmpwalk # apptainer will be installed on the cluster to enable execution of HPC benchmarks in a containeraized environment. # If apptainer_support: false, apptainer will not be installed on the cluster diff --git a/network/network.yml b/network/network.yml index 4b2443575..93e74be54 100644 --- a/network/network.yml +++ b/network/network.yml @@ -39,6 +39,16 @@ name: mlnx_ofed tasks_from: validations.yml +- name: Validate input parameters for nvidia_doca + hosts: localhost + connection: local + gather_facts: true + tasks: + - name: Validate variables from network_config.yml + ansible.builtin.include_role: + name: nvidia_doca + tasks_from: validations.yml + - name: Check nodes having Infiniband Support hosts: all tasks: diff --git a/network/roles/nvidia_doca/tasks/install_doca_leap.yml b/network/roles/nvidia_doca/tasks/install_doca_leap.yml new file mode 100644 index 000000000..371aaa239 --- /dev/null +++ b/network/roles/nvidia_doca/tasks/install_doca_leap.yml @@ -0,0 +1,18 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Install doca using offline path + ansible.builtin.debug: + msg: "Doca Installation on leap is not yet supported." diff --git a/network/roles/nvidia_doca/tasks/install_doca_redhat.yml b/network/roles/nvidia_doca/tasks/install_doca_redhat.yml new file mode 100644 index 000000000..c78639a16 --- /dev/null +++ b/network/roles/nvidia_doca/tasks/install_doca_redhat.yml @@ -0,0 +1,53 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Install doca using offline path + block: + - name: Install packages from doca rpm file + ansible.builtin.yum: + name: "{{ doca_filepath }}" + state: present + disable_gpg_check: true + + when: hostvars['localhost']['local_installer'] + +- name: Install latest doca using network way + block: + - name: Set Redhat distro + ansible.builtin.set_fact: + distro: "{{ 'rhel' + ansible_distribution_major_version }}" + + - name: Add doca repository + ansible.builtin.command: dnf config-manager --add-repo "{{ doca_repo_url }}" + changed_when: false + + when: not hostvars['localhost']['local_installer'] + +- name: Clean cache + ansible.builtin.command: yum clean all + changed_when: false + +- name: Install DOCA SDK + ansible.builtin.package: + name: "{{ item }}" + state: present + with_items: + - doca-runtime + - doca-tools + notify: + - Reboot node + +- name: Flush handler to reboot the node + ansible.builtin.meta: flush_handlers diff --git a/network/roles/nvidia_doca/tasks/main.yml b/network/roles/nvidia_doca/tasks/main.yml new file mode 100644 index 000000000..d57daaa91 --- /dev/null +++ b/network/roles/nvidia_doca/tasks/main.yml @@ -0,0 +1,41 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Check if nvidia bluefield dpu is present on compute node + ansible.builtin.include_tasks: pre-requisites.yml + +- name: Install doca when node has nvidia dpu installed + block: + + - name: Include vars file of inventory role + ansible.builtin.include_vars: "{{ role_path }}/../../../input/network_config.yml" + + # - name: Check status of doca installation + # ansible.builtin.command: yum info doca + # changed_when: false + # failed_when: false + # register: doca_status + + - name: Install doca on redhat / rocky nodes + ansible.builtin.include_tasks: install_doca_redhat.yml + when: os_supported_rhel in ansible_facts['distribution'] | lower or + os_supported_rocky in ansible_facts['distribution'] | lower + + - name: Install doca on leap nodes + ansible.builtin.include_tasks: install_doca_leap.yml + when: os_supported_leap in ansible_facts['distribution'] | lower + + when: + - doca_node_status diff --git a/network/roles/nvidia_doca/tasks/pre-requisites.yml b/network/roles/nvidia_doca/tasks/pre-requisites.yml new file mode 100644 index 000000000..eba326445 --- /dev/null +++ b/network/roles/nvidia_doca/tasks/pre-requisites.yml @@ -0,0 +1,31 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Initialise doca node status + ansible.builtin.set_fact: + doca_node_status: false + +- name: Check bluefield dpu card status + ansible.builtin.shell: > + set -o pipefail && \ + lspci | grep -i bluefield + changed_when: false + register: lspci_status + failed_when: false + +- name: Update doca node status + ansible.builtin.set_fact: + doca_node_status: true + when: "'bluefield' in lspci_status.stdout | lower" diff --git a/network/roles/nvidia_doca/tasks/validations.yml b/network/roles/nvidia_doca/tasks/validations.yml new file mode 100644 index 000000000..01f15d290 --- /dev/null +++ b/network/roles/nvidia_doca/tasks/validations.yml @@ -0,0 +1,57 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Include input for DOCA + ansible.builtin.include_vars: "{{ role_path }}/../../../input/network_config.yml" + +- name: Set doca installation way to network + ansible.builtin.set_fact: + local_installer: false + when: nvidia_doca_offline_path | default("", true) | length == 0 + +- name: Set doca installation way to local + ansible.builtin.set_fact: + local_installer: true + when: nvidia_doca_offline_path | default("", true) | length > 0 + +- name: Validate nvidia_doca_version + ansible.builtin.assert: + that: nvidia_doca_version == "latest" + fail_msg: "{{ nvidia_doca_mandatory }}" + when: not local_installer + +- name: Check if correct nvidia_doca file exists + block: + - name: Verify if nvidia_doca offline path is given + ansible.builtin.assert: + that: nvidia_doca_offline_path | length > 4 + fail_msg: "{{ nvidia_doca_mandatory }}" + + - name: Check if file is .rpm file + ansible.builtin.assert: + that: "'.rpm' in nvidia_doca_offline_path" + fail_msg: "{{ nvidia_doca_file_type }}" + + - name: Check that nvidia_doca .rpm file exists at mentioned path + ansible.builtin.stat: + path: "{{ nvidia_doca_offline_path }}" + register: stat_result + + - name: Fail if toolkit file doesn't exist + ansible.builtin.fail: + msg: "{{ fail_nvidia_doca + nvidia_doca_offline_path }}" + when: not stat_result.stat.exists + + when: local_installer diff --git a/network/roles/nvidia_doca/vars/main.yml b/network/roles/nvidia_doca/vars/main.yml new file mode 100644 index 000000000..20d6d28ec --- /dev/null +++ b/network/roles/nvidia_doca/vars/main.yml @@ -0,0 +1,26 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# Usage: main.yml +os_supported_leap: "leap" +os_supported_rocky: "rocky" +os_supported_rhel: "redhat" + +doca_repo_url: "https://linux.mellanox.com/public/repo/doca/latest/{{ distro }}/x86_64" + +# Usage: validations.yml +nvidia_doca_mandatory: Giving nvidia_doca_path is mandatory if nvidia_doca_version is not latest. +nvidia_doca_file_type: Please give location of appropriate .rpm installer file downloaded from https://developer.nvidia.com/networking/doca +fail_nvidia_doca: "No file could be found at the location: " diff --git a/prereq.sh b/prereq.sh index c5fc590d3..84e73ccae 100755 --- a/prereq.sh +++ b/prereq.sh @@ -50,7 +50,7 @@ echo "" echo "" echo "Download the ISO file required to provision in the control plane." echo "" -echo "Download OFED ISO and CUDA RPM file to install OFED and CUDA during provisioning." +echo "Download OFED ISO, DOCA and CUDA RPM file to install OFED, DOCA and CUDA during provisioning." echo "" echo "Please configure all the NICs and set the hostname for the control plane in the format hostname.domain_name. Eg: controlplane.omnia.test" echo "" diff --git a/provision/README.rst b/provision/README.rst index dd44d6f27..8f0c0e510 100644 --- a/provision/README.rst +++ b/provision/README.rst @@ -16,6 +16,6 @@ This playbook achieves the following tasks: * Configures a docker registry to pull images from the internet and store them locally - * Optionally installs OFED and CUDA + * Optionally installs OFED, DOCA and CUDA `Click here `_ for more information on ``provision.yml``. 
\ No newline at end of file
diff --git a/provision/roles/provision_validation/tasks/validate_postscripts_path.yml b/provision/roles/provision_validation/tasks/validate_postscripts_path.yml
index da21f50c9..d206e1cfc 100644
--- a/provision/roles/provision_validation/tasks/validate_postscripts_path.yml
+++ b/provision/roles/provision_validation/tasks/validate_postscripts_path.yml
@@ -13,10 +13,11 @@
 # limitations under the License.
 ---
 
-- name: Initialize ofed_config_status and cuda_config_status
+- name: Initialize ofed_config_status, doca_config_status and cuda_config_status
   ansible.builtin.set_fact:
     ofed_config_status: false
     cuda_config_status: false
+    doca_config_status: false
 
 - name: Set ofed_config_status to true
   ansible.builtin.set_fact:
@@ -75,3 +76,27 @@
   when:
     - cuda_config_status
     - not verify_cuda_path.stat.exists
+
+- name: Set doca_config_status to true
+  ansible.builtin.set_fact:
+    doca_config_status: true
+  when: nvidia_doca_path | default("", true) | length > 1
+
+- name: Warning - waiting for {{ warning_wait_time }} seconds
+  ansible.builtin.pause:
+    seconds: "{{ warning_wait_time }}"
+    prompt: "{{ doca_rpm_empty_msg }}"
+  when: not doca_config_status
+
+- name: Verify the nvidia_doca_path
+  ansible.builtin.stat:
+    path: "{{ nvidia_doca_path }}"
+  register: verify_doca_path
+  when: doca_config_status
+
+- name: Assert nvidia_doca_path location
+  ansible.builtin.fail:
+    msg: "{{ nvidia_doca_path_missing_msg }}"
+  when:
+    - doca_config_status
+    - not verify_doca_path.stat.exists
diff --git a/provision/roles/provision_validation/vars/main.yml b/provision/roles/provision_validation/vars/main.yml
index 2eba85416..1e68d6357 100644
--- a/provision/roles/provision_validation/vars/main.yml
+++ b/provision/roles/provision_validation/vars/main.yml
@@ -211,6 +211,9 @@ ofed_rhel_check: "rhel{{ provision_os_version }}"
 cuda_rpm_empty_msg: "[WARNING] cuda_toolkit_path variable empty in provision_config.yml. CUDA won't be installed during provisioning."
 cuda_toolkit_path_missing_msg: "Failed. Incorrect cuda_toolkit_path: {{ cuda_toolkit_path }} provided.
 Make sure CUDA toolkit rpm file is present in the provided cuda_toolkit_path variable in provision_config.yml."
+doca_rpm_empty_msg: "[WARNING] nvidia_doca_path variable empty in provision_config.yml. DOCA won't be installed during provisioning."
+nvidia_doca_path_missing_msg: "Failed. Incorrect nvidia_doca_path: {{ nvidia_doca_path }} provided.
+Make sure DOCA rpm file is present in the provided nvidia_doca_path variable in provision_config.yml."
 
 # Usage: validate_repo_path.yml
 update_repos_success_msg: "Validated update_repos"
diff --git a/provision/roles/xcat_discovery_provision/tasks/postscripts/configure_postscripts.yml b/provision/roles/xcat_discovery_provision/tasks/postscripts/configure_postscripts.yml
index 50ca914ba..8b03912fd 100644
--- a/provision/roles/xcat_discovery_provision/tasks/postscripts/configure_postscripts.yml
+++ b/provision/roles/xcat_discovery_provision/tasks/postscripts/configure_postscripts.yml
@@ -56,3 +56,16 @@
   when:
     - cuda_config_status
     - cuda_repo_stat.stat.exists
+
+- name: Verify DOCA repo created
+  ansible.builtin.stat:
+    path: "{{ doca_core_path }}"
+  register: doca_repo_stat
+  when: doca_config_status
+
+- name: Configure postscripts for DOCA
+  ansible.builtin.command: chdef all -p postscripts=omnia_doca
+  changed_when: true
+  when:
+    - doca_config_status
+    - doca_repo_stat.stat.exists
diff --git a/provision/roles/xcat_discovery_provision/vars/postscripts.yml b/provision/roles/xcat_discovery_provision/vars/postscripts.yml
index c5af34df7..93628a470 100644
--- a/provision/roles/xcat_discovery_provision/vars/postscripts.yml
+++ b/provision/roles/xcat_discovery_provision/vars/postscripts.yml
@@ -20,6 +20,7 @@ bmc_postscripts_path:
 # Usage: configure_postscripts.yml
 mlnx_ofed_repo: /install/ofed
 cuda_core_path: /install/cuda/x86_64/cuda-core
+doca_core_path: /install/doca/x86_64/doca-core
 hostname_postscripts_path:
   - { src: "{{ role_path }}/files/postscripts/omnia_hostname", dest: "/install/postscripts/omnia_hostname", mode: "755" }
 
diff --git a/provision/roles/xcat_repo_manipulate/files/postscripts/omnia_doca b/provision/roles/xcat_repo_manipulate/files/postscripts/omnia_doca
new file mode 100644
index 000000000..fcea23623
--- /dev/null
+++ b/provision/roles/xcat_repo_manipulate/files/postscripts/omnia_doca
@@ -0,0 +1,25 @@
+#!/bin/bash
+################################################################################################################
+# omnia_doca:
+#   Install DOCA on all the cluster nodes using DOCA rpm file provided
+#
+#################################################################################################################
+echo "--------------------------" >> /var/log/xcat/xcat.log
+echo "Checking for NVIDIA DPU cards" >> /var/log/xcat/xcat.log
+# lspci names the device "BlueField", so the match has to be case-insensitive.
+if lspci | grep -qi bluefield
+then
+  echo "Starting DOCA installation" >> /var/log/xcat/xcat.log
+  if dnf -y install doca-runtime doca-tools
+  then
+    echo "DOCA installation completed" >> /var/log/xcat/xcat.log
+  else
+    echo "DOCA installation failed" >> /var/log/xcat/xcat.log
+  fi
+  echo "-----------------------------" >> /var/log/xcat/xcat.log
+  # Reboot only after every log line has been written.
+  reboot
+else
+  echo "NVIDIA DPU cards not found" >> /var/log/xcat/xcat.log
+  echo "-----------------------------" >> /var/log/xcat/xcat.log
+fi
diff --git a/provision/roles/xcat_repo_manipulate/tasks/postscripts/configure_doca.yml b/provision/roles/xcat_repo_manipulate/tasks/postscripts/configure_doca.yml
new file mode 100644
index 000000000..549e80025
--- /dev/null
+++ b/provision/roles/xcat_repo_manipulate/tasks/postscripts/configure_doca.yml
@@ -0,0 +1,126 @@
+# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Unpacks the DOCA rpm into doca-core/doca-deps repos, copies the omnia_doca postscript and adds the repos to the osimage.
+
+- name: Delete doca repo folders
+  ansible.builtin.file:
+    path: "{{ item }}"
+    state: absent
+  with_items:
+    - "{{ doca_tmp_path }}"
+    - "{{ doca_core_path }}"
+    - "{{ doca_deps_path }}"
+
+- name: Create doca repo folders
+  ansible.builtin.file:
+    path: "{{ item }}"
+    state: directory
+    mode: "{{ repo_permission }}"
+  with_items:
+    - "{{ doca_tmp_path }}"
+    - "{{ doca_core_path }}"
+    - "{{ doca_deps_path }}"
+
+- name: Try extracting doca rpms
+  block:
+    - name: Extract doca rpms
+      ansible.builtin.shell: >
+        set -o pipefail && \
+        cd {{ doca_tmp_path }} && rpm2cpio {{ nvidia_doca_path }} | cpio -i -d
+      changed_when: true
+  rescue:
+    - name: Invalid doca rpm file
+      ansible.builtin.fail:
+        msg: "{{ invalid_doca_rpm_fail_msg }}"
+
+- name: Find doca rpm folder
+  ansible.builtin.find:
+    paths: "{{ doca_tmp_path }}/var/"
+    file_type: directory
+  register: doca_rpm_dir
+
+# An rpm that unpacks nothing under var/ would otherwise break on files[0] in the next task.
+- name: Fail when no doca rpm folder was extracted
+  ansible.builtin.fail:
+    msg: "{{ invalid_doca_rpm_fail_msg }}"
+  when: doca_rpm_dir.matched == 0
+
+- name: Find doca rpm files
+  ansible.builtin.find:
+    paths: "{{ doca_rpm_dir.files[0].path }}"
+    patterns: "*.rpm"
+  register: doca_rpm_file
+
+- name: Copy doca rpm files to doca core repo
+  ansible.builtin.copy:
+    src: "{{ item.path }}"
+    dest: "{{ doca_core_path }}"
+    mode: preserve
+  with_items: "{{ doca_rpm_file.files }}"
+
+- name: Download epel 8 package list starts with d
+  ansible.builtin.get_url:
+    url: "{{ dkms_url }}"
+    dest: "{{ dkms_tmp_file }}"
+    mode: "{{ dkms_file_permission }}"
+  register: dkms_repo_result
+  until: dkms_repo_result is not failed
+  retries: "{{ download_retries_count }}"
+
+# Take the dkms rpm file name from the link text in the downloaded listing and append it to dkms_url.
+- name: Fetch dkms file name
+  ansible.builtin.shell: >
+    set -o pipefail && \
+    echo {{ dkms_url }}`cat {{ dkms_tmp_file }} | grep dkms | grep -o -P '(?<=rpm">).*(?=</a>)'`
+  changed_when: false
+  register: fetch_dkms_url
+
+- name: Download dkms
+  ansible.builtin.get_url:
+    url: "{{ fetch_dkms_url.stdout }}"
+    dest: "{{ doca_deps_path }}"
+    mode: "{{ dkms_file_permission }}"
+  register: dkms_rpm_result
+  until: dkms_rpm_result is not failed
+  retries: "{{ download_retries_count }}"
+
+- name: Create doca-core and doca-deps repos
+  ansible.builtin.command: createrepo {{ item }}
+  changed_when: true
+  with_items:
+    - "{{ doca_core_path }}"
+    - "{{ doca_deps_path }}"
+
+- name: Copy DOCA script to postscripts
+  ansible.builtin.copy:
+    src: "{{ item.src }}"
+    dest: "{{ item.dest }}"
+    mode: "{{ item.mode }}"
+  with_items: "{{ doca_postscripts_path }}"
+
+- name: Add doca path in environment variable in postscripts
+  ansible.builtin.lineinfile:
+    path: "{{ xcat_rhel8_post_script }}"
+    regexp: 'EOF'
+    line: 'echo "PATH=$PATH:/opt/dell/srvadmin/sbin:/usr/local/sbin:/usr/local/bin/:/usr/local/doca/bin" >> /etc/bashrc'
+
+- name: Delete dkms.html file
+  ansible.builtin.file:
+    path: "{{ dkms_tmp_file }}"
+    state: absent
+
+- name: Update osimage with doca repository
+  ansible.builtin.command: chdef -t osimage -o {{ provision_os_image }} -p pkgdir={{ doca_core_path }},{{ doca_deps_path }}
+  changed_when: true
diff --git a/provision/roles/xcat_repo_manipulate/tasks/postscripts/main.yml b/provision/roles/xcat_repo_manipulate/tasks/postscripts/main.yml
index 013ab6998..ae4bc0514 100644
--- a/provision/roles/xcat_repo_manipulate/tasks/postscripts/main.yml
+++ b/provision/roles/xcat_repo_manipulate/tasks/postscripts/main.yml
@@ -39,3 +39,7 @@
 - name: Configure CUDA postscripts
   ansible.builtin.include_tasks: configure_cuda.yml
   when: cuda_config_status
+
+- name: Configure DOCA postscripts
+  ansible.builtin.include_tasks: configure_doca.yml
+  when: doca_config_status
diff --git a/provision/roles/xcat_repo_manipulate/vars/postscripts.yml b/provision/roles/xcat_repo_manipulate/vars/postscripts.yml
index 087973055..0a17a541a 100644
--- a/provision/roles/xcat_repo_manipulate/vars/postscripts.yml
+++ b/provision/roles/xcat_repo_manipulate/vars/postscripts.yml
@@ -40,3 +40,12 @@ Make sure cuda rpm file is downloaded completely."
 cuda_postscripts_path:
   - { src: "{{ role_path }}/files/postscripts/omnia_cuda", dest: "/install/postscripts/omnia_cuda", mode: "755" }
 download_retries_count: 10
+
+# Usage: configure_doca.yml
+doca_tmp_path: /tmp/doca
+doca_core_path: /install/doca/x86_64/doca-core
+doca_deps_path: /install/doca/x86_64/doca-deps
+invalid_doca_rpm_fail_msg: "Failed. Invalid nvidia_doca_path: {{ nvidia_doca_path }} provided in provision_config.yml.
+Make sure doca rpm file is downloaded completely."
+doca_postscripts_path:
+  - { src: "{{ role_path }}/files/postscripts/omnia_doca", dest: "/install/postscripts/omnia_doca", mode: "755" }