Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merging release 1.7 to main #2412

Merged
merged 1 commit into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
2 changes: 2 additions & 0 deletions .ansible-lint
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# ansible-lint rules that are intentionally not enforced for this repository.
skip_list:
  - var-naming[no-role-prefix]
3 changes: 2 additions & 1 deletion .metadata/omnia_version
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
omnia_version: 1.6.1
omnia_version: 1.7
omnia_installation_path: ""
35 changes: 23 additions & 12 deletions accelerator/accelerator.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -13,34 +13,34 @@
# limitations under the License.
---

- name: Check if virtual environment is active
ansible.builtin.import_playbook: ../utils/check_venv.yml
when: not ( check_venv_executed | default(false) | bool )

- name: Update Inventory with ansible_host information
ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml

- name: Validate accelerator inputs
hosts: localhost
gather_facts: true
connection: local
roles:
- accelerator_validation
tags: amd, nvidia
tags: amd, nvidia, intel

- name: Update Repositories/Registries on nodes
ansible.builtin.import_playbook: ../utils/update_user_repo.yml
when: not ( hostvars['127.0.0.1']['update_user_repo_executed'] | default(false) | bool )

# - name: Validate repo file and subscription
# hosts: all
# gather_facts: true
# roles:
# - repo_validation
# tags: amd, nvidia

- name: Gather Cluster Facts
hosts: all
hosts: slurm_control_node, slurm_node, kube_control_plane, kube_node, auth_server, login, etcd
gather_facts: true
roles:
- common
tags: amd, nvidia
tags: amd, nvidia, intel

- name: Perform GPU driver and ROCm installation for AMD Accelerators
hosts: all
hosts: slurm_control_node, slurm_node, kube_control_plane, kube_node, auth_server, login, etcd
gather_facts: true
any_errors_fatal: true
roles:
Expand All @@ -66,3 +66,14 @@
# - name: Reboot node
# ansible.builtin.reboot:
# tags: nvidia

# Applies the `intel` role to the cluster groups listed below.
# The play carries only the `intel` tag.
- name: Install Intel Gaudi drivers on nodes
  hosts:
    - slurm_control_node
    - slurm_node
    - kube_control_plane
    - kube_node
    - auth_server
    - login
    - etcd
  tags:
    - intel
  any_errors_fatal: true
  gather_facts: true
  roles:
    - intel

# Chains the shared performance-profile playbook after the accelerator plays.
- name: Import playbook to set performance profile on nodes
  ansible.builtin.import_playbook: ../utils/performance_profile/performance_profile.yml
4 changes: 3 additions & 1 deletion accelerator/ansible.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@ log_path = /var/log/omnia/accelerator.log
host_key_checking = false
forks = 5
timeout = 180
collections_path = $VIRTUAL_ENV
executable = /bin/bash

[persistent_connection]
command_timeout = 180
connect_timeout = 180

[ssh_connection]
retries = 3
ssh_args = -o ControlMaster=auto -o ControlPersist=180
ssh_args = -o ControlMaster=auto -o ControlPersist=180
25 changes: 23 additions & 2 deletions accelerator/roles/accelerator_validation/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,33 @@
# limitations under the License.
---

- name: Saving distribution of os
- name: Saving distribution and version of OS
ansible.builtin.set_fact:
control_plane_os: "{{ ansible_distribution | lower }}"
oim_os: "{{ ansible_distribution | lower }}"
oim_os_version: "{{ ansible_distribution_version | lower }}"

- name: Include local_repo variables
ansible.builtin.include_tasks: include_local_repo_config.yml

# Run the per-vendor validation task files. Each one sets a *_config_status
# fact (amdgpu_config_status / intel_gaudi_config_status) that the tasks
# further down in this file read back through hostvars['localhost'].
# The task names now describe the file being included; both were previously
# labelled "Check xcat installation status".
- name: Validate AMD GPU and ROCm inputs
  ansible.builtin.include_tasks: validate_amd.yml

- name: Validate Intel Gaudi inputs
  ansible.builtin.include_tasks: validate_intel_gaudi.yml

# Print the two validation results so the outcome is visible in the run log.
- name: Debug intel_gaudi_config_status
  ansible.builtin.debug:
    msg: >-
      intel_gaudi_config_status is {{ hostvars['localhost']['intel_gaudi_config_status'] }}

- name: Debug amdgpu_config_status
  ansible.builtin.debug:
    msg: >-
      amdgpu_config_status is {{ hostvars['localhost']['amdgpu_config_status'] }}

# Abort when both facts are defined and both evaluate to false.
# Entries of a `when` list are AND-ed together, one condition per line.
- name: Check if both intel_gaudi_config_status and amdgpu_config_status are false
  when:
    - hostvars['localhost']['intel_gaudi_config_status'] is defined
    - hostvars['localhost']['amdgpu_config_status'] is defined
    - not hostvars['localhost']['intel_gaudi_config_status'] | bool
    - not hostvars['localhost']['amdgpu_config_status'] | bool
  ansible.builtin.fail:
    msg: "{{ driver_not_found_msg }}"
79 changes: 50 additions & 29 deletions accelerator/roles/accelerator_validation/tasks/validate_amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
file: "{{ software_config_json_file }}"
name: user_config

- name: Include vars for {{ control_plane_os }}
ansible.builtin.include_vars: "{{ role_path }}/vars/{{ control_plane_os }}.yml"
- name: Include vars for {{ oim_os }}
ansible.builtin.include_vars: "{{ role_path }}/vars/{{ oim_os }}.yml"

- name: Get amdgpu status
ansible.builtin.set_fact:
Expand All @@ -47,10 +47,11 @@
loop_control:
loop_var: item

- name: Failed, AMDGPU ROCm software stack not present in software_config.json
ansible.builtin.fail:
msg: "{{ amdgpu_input_fail_msg }}"
when: not amdgpu_input_status
- name: Check if the rocm offline repo exists
ansible.builtin.stat:
path: "{{ offline_rocm_directory }}/rocm/"
register: check_rocm_repo
when: rocm_input_status

- name: Set amdgpu_config_status
when: amdgpu_input_status
Expand All @@ -63,28 +64,28 @@
ansible.builtin.set_fact:
amdgpu_directory: "{{ offline_rocm_directory }}/amdgpu/{{ amdgpu_version }}/"

- name: Check amdgpu_version exists or not
- name: Check amdgpu version directory exists or not
ansible.builtin.stat:
path: "{{ amdgpu_directory }}"
register: check_amdgpu_dir
failed_when: not check_amdgpu_dir.stat.exists

- name: Set amdgpu_config_status to true
- name: Set amdgpu_config_status based on directory existence
ansible.builtin.set_fact:
amdgpu_config_status: true
when: check_amdgpu_dir.stat.exists
amdgpu_config_status: "{{ check_amdgpu_dir.stat.exists | ternary(true, false) }}"
rescue:
- name: Failed, amdgpu directory repo not found
ansible.builtin.fail:
msg: "{{ amdgpu_repo_fail_msg }}"
when: not check_amdgpu_dir.stat.exists
- name: Log an error message
ansible.builtin.debug:
msg: " {{ amdgpu_fail_msg }} "

- name: Failed, amdgpu version not found
ansible.builtin.fail:
msg: "{{ amdgpu_version_fail_msg }}"
- name: Set amdgpu_config_status to false
ansible.builtin.set_fact:
amdgpu_config_status: false

- name: Set rocm_config_status
when: rocm_input_status
when:
- rocm_input_status
- user_config.repo_config == 'always' or user_config.repo_config == 'partial'
- check_rocm_repo.stat.exists
block:
- name: Fetch rocm_version
ansible.builtin.set_fact:
Expand All @@ -98,18 +99,38 @@
ansible.builtin.stat:
path: "{{ rocm_directory }}"
register: check_rocm_dir
failed_when: not check_rocm_dir.stat.exists

- name: Set rocm_config_status based on directory existence
ansible.builtin.set_fact:
rocm_config_status: "{{ check_rocm_dir.stat.exists | ternary(true, false) }}"

rescue:
- name: Log an error message
ansible.builtin.debug:
msg: " {{ amdgpu_fail_msg }} "

- name: Set rocm_config_status to false
ansible.builtin.set_fact:
rocm_config_status: false

- name: Set rocm_config_status
when:
- rocm_input_status
- user_config.repo_config == 'never' or user_config.repo_config == 'partial'
- not check_rocm_repo.stat.exists
block:
- name: Fetch rocm_version
ansible.builtin.set_fact:
rocm_version: "{{ user_config.amdgpu | selectattr('name', 'equalto', 'rocm') | map(attribute='version') | first }}"

- name: Set rocm_config_status to true
ansible.builtin.set_fact:
rocm_config_status: true
when: check_rocm_dir.stat.exists
rescue:
- name: Failed, rocm directory repo not found
ansible.builtin.fail:
msg: "{{ rocm_repo_fail_msg }}"
when: not check_rocm_dir.stat.exists

- name: Failed, rocm version not found
ansible.builtin.fail:
msg: "{{ rocm_version_fail_msg }}"
- name: Log an error message
ansible.builtin.debug:
msg: " {{ amdgpu_fail_msg }} "

- name: Set rocm_config_status to false
ansible.builtin.set_fact:
rocm_config_status: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Copyright 2024 Intel Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Initialise all four Intel Gaudi / Habana flags to false; later tasks in this
# file flip individual flags to true when their checks pass.
- name: Set default intel gaudi status
  ansible.builtin.set_fact:
    intel_gaudi_input_status: false
    intel_gaudi_config_status: false
    habana_input_status: false
    habana_config_status: false

# Read the software configuration into the `user_config` namespace.
- name: Load software_config.json
  ansible.builtin.include_vars:
    name: user_config
    file: "{{ software_config_json_file }}"

# Pull in the role's vars file that matches the OIM operating system.
- name: Include vars for {{ oim_os }}
  ansible.builtin.include_vars:
    file: "{{ role_path }}/vars/{{ oim_os }}.yml"

# Mark intel_gaudi_input_status true when any entry of user_config.softwares
# has a name containing 'intelgaudi'. A missing list is treated as empty.
- name: Get Intel Gaudi status
  loop: "{{ user_config.softwares | default([]) }}"
  loop_control:
    loop_var: item
  when: "'intelgaudi' in item.name"
  ansible.builtin.set_fact:
    intel_gaudi_input_status: true

# Mark habana_input_status true when intel_gaudi_input_status is set and any
# entry of user_config.gaudi has a name containing 'intel'.
- name: Get habana status only if intel gaudi is present gaudi_status is true
  loop: "{{ user_config.gaudi | default([]) }}"
  loop_control:
    loop_var: item
  when:
    - intel_gaudi_input_status
    - "'intel' in item.name"
  ansible.builtin.set_fact:
    habana_input_status: true

# Build the offline Intel Gaudi directory path for the version named in
# software_config.json and set intel_gaudi_config_status to true when that
# directory exists. Runs only when intel_gaudi_input_status is true.
- name: Set intel_gaudi_config_status
  when: intel_gaudi_input_status
  block:
    - name: Fetch intelgaudi_version
      ansible.builtin.set_fact:
        intelgaudi_version: "{{ user_config.softwares | selectattr('name', 'equalto', 'intelgaudi') | map(attribute='version') | first }}"

    # Task name corrected: this task sets the directory, not the version.
    - name: Set intelgaudi_directory
      ansible.builtin.set_fact:
        intelgaudi_directory: "{{ offline_intelgaudi_directory }}/intelgaudi/{{ intelgaudi_version }}/"

    - name: Set gaudi_directory
      ansible.builtin.set_fact:
        gaudi_directory: "{{ intelgaudi_directory }}"

    - name: Check gaudi_directory exists or not
      ansible.builtin.stat:
        path: "{{ gaudi_directory }}"
      register: check_gaudi_dir

    - name: Set intel_gaudi_config_status to true
      ansible.builtin.set_fact:
        intel_gaudi_config_status: true
      when: check_gaudi_dir.stat.exists

  rescue:
    # The rescue runs when any task above fails, which may happen before the
    # stat task has registered check_gaudi_dir. Guard the lookup so the rescue
    # itself cannot fail on an undefined variable.
    - name: Intel Gaudi not found
      ansible.builtin.debug:
        msg: "{{ intel_gaudi_repo_fail_msg }}"
      when: check_gaudi_dir is not defined or not (check_gaudi_dir.stat.exists | default(false))

# Search offline_gaudi_directory for files matching gaudi_search_pattern and
# set habana_config_status to true when at least one is found.
# Gated on habana_input_status: the previous gate, habana_config_status, is
# initialised to false and only ever set to true inside this block, so the
# block could never run.
- name: Set habana_config_status
  when: habana_input_status
  block:
    - name: Check driver packages inside offline_gaudi_directory
      ansible.builtin.find:
        paths: "{{ offline_gaudi_directory }}"
        patterns: "{{ gaudi_search_pattern }}"
      register: check_driver_packages

    - name: Set habana_config_status to true
      ansible.builtin.set_fact:
        habana_config_status: true
      when: check_driver_packages.matched > 0

  rescue:
    # check_driver_packages may be missing or incomplete if the find task
    # itself failed; default to 0 matches so the message is still shown.
    - name: Intel Gaudi driver packages not found
      ansible.builtin.debug:
        msg: "{{ intel_gaudi_repo_fail_msg }}"
      when: (check_driver_packages.matched | default(0)) == 0
9 changes: 9 additions & 0 deletions accelerator/roles/accelerator_validation/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,16 @@ amdgpu_version_fail_msg: "Failed, software_config.json does not have the version
amdgpu_repo_fail_msg: "Failed, local_repo.yml is not executed for downloading AMDGPU packages."
rocm_version_fail_msg: "Failed, software_config.json does not have the version for ROCM."
rocm_repo_fail_msg: "Failed, local_repo.yml is not executed for downloading ROCM packages."
amdgpu_fail_msg: "An error occurred while setting the rocm_config_status."

# Usage: include_local_repo_config.yml
local_repo_config_file: "{{ role_path }}/../../../input/local_repo_config.yml"
local_repo_config_syntax_fail_msg: "Failed. Syntax errors present in local_repo_config.yml. Fix errors and re-run playbook again."

# Usage: validate_intel_gaudi.yml
intel_gaudi_input_fail_msg: "Failed, software_config.json does not have the intelgaudi software stack."
intel_gaudi_repo_fail_msg: "Failed, local_repo.yml is not executed for downloading Intel Gaudi driver packages."

# Usage: main.yml
# Message for the fail task that fires when neither accelerator stack was
# validated. Written as a folded, stripped scalar (>-) so the value is a single
# line with no embedded double quotes and no trailing newline.
driver_not_found_msg: >-
  Please ensure that either 'intelgaudi' or 'amdgpu' is included in
  'software_config.json' and then run 'accelerator.yml' to install GPU drivers.
5 changes: 5 additions & 0 deletions accelerator/roles/accelerator_validation/vars/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,8 @@

# Usage: validate_amd.yml
offline_rocm_directory: "{{ repo_store_path }}/cluster/apt"

# Usage: validate_intel_gaudi.yml
offline_intelgaudi_directory: "{{ repo_store_path }}/cluster/apt"
offline_gaudi_directory: "{{ repo_store_path }}/cluster/{{ oim_os }}/{{ oim_os_version }}/deb"
gaudi_search_pattern: "habanalabs*.deb"
Loading
Loading